mercury-parser/src/resource/index.js

import 'babel-polyfill'
import cheerio from 'cheerio'

import { fetchResource } from './utils'
import {
  normalizeMetaTags,
  convertLazyLoadedImages,
  clean,
} from './utils/dom'

const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param preparedResponse: If set, use as the response rather than
  //                          attempting to fetch it ourselves. Expected to
  //                          have the same { body, response } shape that
  //                          fetchResource returns.
  // :param parseNon2xx: If true, attempt to parse non-200 level
  //                     resources. Default is false.
  create: async function(url, preparedResponse, parseNon2xx = false) {
    let result

    if (preparedResponse) {
      result = preparedResponse
    } else {
      result = await fetchResource(url)
    }

    return this.generateDoc(result)
  },

  // Turn a fetched { body, response } pair into a cheerio document:
  // verify the content looks like text, parse it, then normalize meta
  // tags, convert lazy-loaded images, and clean the markup.
  generateDoc({ body: content, response }) {
    const { 'content-type': contentType } = response.headers

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    // (a rough sketch of such a check is at the bottom of this file)
    if (!contentType.includes('html') &&
        !contentType.includes('text')) {
      throw new Error(`Content does not appear to be text.`)
    }

    let $ = cheerio.load(content, { normalizeWhitespace: true })

    if ($.root().children().length === 0) {
      throw new Error(`No children, likely a bad parse.`)
    }

    $ = normalizeMetaTags($)
    $ = convertLazyLoadedImages($)
    $ = clean($)

    return $
  },
}

export default Resource
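
// Example usage (an illustrative sketch, not part of the module). The URL
// and markup below are made up; the prepared-response object mirrors the
// { body, response } shape that fetchResource returns and generateDoc
// destructures above.
//
//   import Resource from './resource'
//
//   const run = async () => {
//     // Fetch a live page and get back a cheerio instance:
//     const $ = await Resource.create('https://example.com/article')
//     console.log($('title').text())
//
//     // Or skip the network fetch entirely by passing a prepared response:
//     const $doc = await Resource.create('https://example.com/article', {
//       body: '<html><head><title>Hi</title></head><body><p>Hello</p></body></html>',
//       response: { headers: { 'content-type': 'text/html' } },
//     })
//     console.log($doc('p').text())
//   }
//
// The commented-out Python below appears to be the original Iris/Readability
// resource implementation that this module is ported from, kept here for
// reference.
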
// Iris: Human-friendly content extraction.
// import logging
// import lxml
// import re
// import requests
// import socket
//
// from django.conf import settings
// from lxml.etree import XPathEvaluator
// from lxml.html.clean import Cleaner
// from urlparse import urlparse
//
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
// from utils.dom.attribmap import AttribMap
// from utils.statsd import stats
// from utils.text import is_text
// from utils.html import get_charset_from_html, strip_content_encodings
//
// from . import exceptions
//
// logger = logging.getLogger(__name__)
//
// # Hosts that are allowed to use embeds and iframes. We should be very
// # restrictive with this and only include top-tier video sites.
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
//
// # The number of seconds to attempt to fetch a resource before timing out.
// FETCH_TIMEOUT = 10
//
// cleaner = Cleaner(
//     style=True,
//     page_structure=False,
//     meta=False,
//     add_nofollow=False,  # done by hand
//     remove_unknown_tags=False,
//     links=False,
//     host_whitelist=host_whitelist)
//
//
// class Resource(object):
//     """ A Resource is a wrapper class for an HTTP resource. Provides
//     functionality to fetch a resource as well as a handful of shortcut
//     methods to run xpath efficiently on HTML, etc.
//
//     Uses requests and lxml internally for fetching and querying.
//     """
//
//     def __init__(self, url, parse_non_2xx=False, response=None):
//         """ Create a Resource.
//
//         :param url: The URL for the document we should retrieve.
//         :param parse_non_2xx: If True, attempt to parse non-200 level
//                               resources. If False, raise a RetrievalFailed
//                               based exception. Default is False.
//         :param response: If not None, use as the response rather than
//                          attempting to fetch it ourselves. Expects a
//                          requests.models.Response object.
//         """
//         self.url = url
//         self.parse_non_2xx = parse_non_2xx
//
//         if response:
//             self.response = response
//         else:
//             self.response = self._fetch_resource()
//
//     def __unicode__(self):
//         return u'<Resource ({0})>'.format(self.url)
//
//     def __repr__(self):
//         return "<Resource ({0})>".format(self.url)
//
//     @classmethod
//     def fabricate(kls, url, content, headers=None):
//         """ Given a URL and some content, create a fake Resource that looks
//         as though it has already fetched the content. Useful for using
//         Resource objects without having to do a GET.
//         """
//
//         if type(content) != unicode:
//             raise TypeError("Provided content must be unicode.")
//
//         if headers is None:
//             headers = {}
//
//         try:
//             utf8_content = content.encode('utf-8', 'strict')
//         except UnicodeDecodeError:
//             logger.warning("Unable to encode content for url %s. Content "
//                            "should be unicode and encodeable at this point.")
//             utf8_content = content.encode('utf-8', 'replace')
//
//         mocked_response_dict = {
//             "cookies": {},
//             "_content": utf8_content,
//             "headers": dict({
//                 "content-length": len(content),
//                 "accept-ranges": "bytes",
//                 "vary": "Accept-Encoding,Cookie",
//                 "server": "Apache/2.2.21",
//                 "content-type": "text/html; charset=UTF-8"
//             }, **headers),
//             "url": url,
//             "status_code": 200,
//             "_content_consumed": False,
//             "request": None,
//             "raw": None,
//             "error": None,
//             "config": {
//                 "decode_unicode": True,
//                 "pool_connections": 10,
//                 "verbose": None,
//                 "keep_alive": True,
//                 "max_retries": 0,
//                 "base_headers": {
//                     "Accept-Encoding": "identity, deflate, compress, gzip",
//                     "Accept": "|)}>#*",
//                     "User-Agent": "python-requests/0.8.1"
//                 },
//                 "pool_maxsize": 10,
//                 "safe_mode": False,
//                 "max_redirects": 30
//             },
//             "history": []
//         }
//         mocked_response = requests.Response()
//         for k, v in mocked_response_dict.items():
//             setattr(mocked_response, k, v)
//
//         return Resource(
//             url=url,
//             response=mocked_response
//         )
//
//
//     @property
//     def url(self):
//         return self._url
//
//     @url.setter
//     def url(self, value):
//         parsed_url = urlparse(value)
//         if parsed_url.scheme not in ('http', 'https'):
//             raise ValueError("Resource only allows HTTP and HTTPS urls.")
//
//         if not parsed_url.netloc:
//             raise ValueError("Relative URLs are not allowed.")
//
//         self._url = value
//
//     _parsed_url = None
//     @property
//     def parsed_url(self):
//         if self._parsed_url is None:
//             self._parsed_url = urlparse(self.url)
//         return self._parsed_url
//
//     @property
//     def status_code(self):
//         return self.response.status_code
//
//     _content = None
//     @property
//     def content(self):
//         """Return the content for a resource. Always returns unicode.
//
//         """
//         if self._content is None:
//             # Requests that come in without content-type encoding headers will
//             # default to iso-8859-1, which could be wrong
//             if (self.response.encoding and
//                     self.response.encoding.lower() == 'iso-8859-1'):
//                 # Dont send unicode, because it could have been decoded wrong
//                 # by an incorrect content-type guess.
//                 encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
//
//                 if encoding != self.response.encoding:
//                     # First, try to use the encoding we found in the markup
//                     try:
//                         self._content = self.response.content.decode(encoding)
//                     except (LookupError, UnicodeDecodeError):
//                         stats.increment(
//                             'iris.resource.encoding.encoding_mismatch')
//                         # That encoding might be wrong though, so if it is, use
//                         # the one it reported since they could have the wrong
//                         # one set in the markup. eg. sending the content over
//                         # as iso but declaring it to be utf-8 like gq.com does.
//                         # We may also end up with an invalid encoding type, at
//                         # which point we should also just use the request
//                         # encoding and replace silently.
//                         self._content = self.response.content.decode(
//                             self.response.encoding, 'replace')
//                 else:
//                     # If the encoding guess was right, just use the unicode
//                     self._content = self.response.text
//
//             else:
//                 # Otherwise we trust the encoding
//                 self._content = self.response.text
//
//         return self._content
//
//     @property
//     def content_type(self):
//         return self.response.headers.get('content-type', '')
//
//     @property
//     def is_html(self):
//         if 'html' in self.content_type:
//             return True
//
//         # Otherwise, just try parsing it and see if it succeeds
//         try:
//             return (self.doc is not None)
//         except:
//             return False
//
//     @property
//     def is_plaintext(self):
//         if 'text/plain' in self.content_type:
//             return True
//
//         return False
//
//     @property
//     def is_image(self):
//         if 'image' in self.content_type:
//             return True
//
//         return False
//
//     @property
//     def is_pdf(self):
//         if 'pdf' in self.content_type:
//             return True
//
//         return False
//
//     _lxml_doc = None
//     @property
//     def doc(self):
//         if self._lxml_doc is None:
//             self._generate_lxml_doc()
//
//         return self._lxml_doc
//
//     _docxp = None
//     @property
//     def docxp(self):
//         """ Generate an XPath Evaluator for this doc. """
//         if self._docxp is None:
//             self._docxp = XPathEvaluator(self.doc)
//
//         return self._docxp
//
//     _redocxp = None
//     @property
//     def redocxp(self):
//         """ Generate an XPath Evaluator for this doc, that includes the RE
//         namespace for regular expression matching.
//
//         """
//         if self._redocxp is None:
//             _rens = {'re': 'http://exslt.org/regular-expressions'}
//             self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
//
//         return self._redocxp
//
//     def _generate_lxml_doc(self):
//         # First check if we have a text based resource
//         if (not 'html' in self.content_type and
//                 not 'text' in self.content_type and
//                 not is_text(self.content[:512])):
//             raise ValueError("Content does not appear to be text.")
//
//         # Remove useless carriage returns which get parsed as &#13; otherwise
//         content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
//
//         # Dont pass any content encodings into lxml, it is dumb about them
//         content = strip_content_encodings(content)
//
//         self._lxml_doc = lxml.html.fromstring(content)
//
//         if len(self._lxml_doc.getchildren()) == 0:
//             stats.increment('iris.resource.encoding.no_children')
//             raise ValueError("No children, likely a bad parse.")
//
//         # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
//         # in an extra html tag. This screws up a whole bunch of things in
//         # the parsing process. If this is the case, reset the doc to the
//         # ACTUAL root of the doc.
//         # Sample cases:
//         # * Strange Doctype causing issues: http://bit.ly/IATz0B
//         # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
//         # Also check for a body inside of our internal HTML tag, to determine
//         # that it's not just a junk HTML tag sibling at the bottom of the
//         # doc or something.
//         internal_html_tag = self._lxml_doc.find('html')
//         if (internal_html_tag is not None and
//                 len(internal_html_tag.xpath('.//body')) > 0):
//             self._lxml_doc = internal_html_tag
//
//         self._normalize_meta_tags()
//
//         self._lxml_doc.make_links_absolute(self.url)
//
//         # Convert any lazy loaded images into normal images before clean_html
//         # which will strip all other attributes
//         self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
//
//         # Clean the doc of anything malicious.
//         self._lxml_doc = cleaner.clean_html(self._lxml_doc)
//
//         # Manually nofollow links so that we don't clobber rel author
//         # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
//         for a in self.docxp('//a'):
//             if a.attrib.get('rel', None):
//                 rel_attribs = set(a.attrib['rel'].split())
//                 rel_attribs.add('nofollow')
//                 a.attrib['rel'] = ' '.join(rel_attribs)
//             else:
//                 a.attrib['rel'] = 'nofollow'
//
//         # Re-relativize anchor links
//         anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
//                              self.url.replace("'", "%27"))
//         for link in self.docxp(anchor_link_xpath):
//             link.attrib['href'] = link.attrib['href'].replace(self.url, '')
//
//     _attrib_map = None
//     @property
//     def attrib_map(self):
//         """ Create an AttribMap object for fast checking of class/id existence
//         in the document. Used in association with extract_by_selector.
//
//         """
//         if self._attrib_map is None:
//             self._attrib_map = AttribMap(self.doc)
//
//         return self._attrib_map
//
//     def extract_by_selector(self, selector):
//         " Shortcut to run extract_by_selector on our doc with our AttribMap. "
//         return ebs(self.doc, selector, self.attrib_map, self.docxp)
//
//
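// Note: the TODO in generateDoc near the top of this file references
// readability's is_text helper (also called in _generate_lxml_doc above).
// The sketch below is only a rough approximation of that kind of check,
// not the original implementation: it treats a sample as text when very
// few of its characters are control bytes. The 10% threshold is an
// arbitrary assumption.
//
//   function isText(sample) {
//     if (!sample) return false
//     // Count control characters other than newline, carriage return, and tab
//     const controlChars = sample.split('').filter(c => {
//       const code = c.charCodeAt(0)
//       return code < 32 && c !== '\n' && c !== '\r' && c !== '\t'
//     }).length
//     return controlChars / sample.length < 0.1
//   }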