|
|
|
import 'babel-polyfill'
|
|
|
|
|
|
|
|
import cheerio from 'cheerio'
|
|
|
|
|
|
|
|
import { fetchResource } from './utils'
|
|
|
|
import {
|
|
|
|
normalizeMetaTags,
|
|
|
|
convertLazyLoadedImages,
|
|
|
|
clean,
|
|
|
|
} from './utils/dom'
|
|
|
|
|
|
|
|
const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param preparedResponse: If set, use as the response body rather
  //                          than attempting to fetch it ourselves.
  //                          Expects a string.
  // :param parseNon2xx: If true, attempt to parse non-200 level
  //                     resources. Default is false.
  //                     NOTE: accepted for interface compatibility but
  //                     not consulted anywhere in this object yet.
  //
  // Resolves with whatever generateDoc returns for the fetched (or
  // prepared) result; rejects if fetching or generateDoc throws.
  async create(url, preparedResponse, parseNon2xx = false) {
    let result

    if (preparedResponse) {
      // Wrap the caller-supplied body in a stand-in for a successful
      // HTML response so generateDoc can treat both paths uniformly.
      const validResponse = {
        statusMessage: 'OK',
        statusCode: 200,
        headers: {
          'content-type': 'text/html',
          // Fixed placeholder; not derived from the body's real length.
          'content-length': 500,
        },
      }

      result = { body: preparedResponse, response: validResponse }
    } else {
      result = await fetchResource(url)
    }

    return this.generateDoc(result)
  },

  // Parse a fetched result into a cheerio document and run the
  // normalization / cleaning passes over it.
  //
  // :param body: The raw markup to parse.
  // :param response: Response object; only response.headers is read.
  //
  // Returns the cheerio instance after normalizeMetaTags,
  // convertLazyLoadedImages and clean have been applied, in that order.
  // Throws an Error if the content type does not mention html/text, or
  // if the parsed document root has no children.
  generateDoc({ body: content, response }) {
    // Default to '' so a response without a content-type header is
    // rejected with the intended error below instead of a TypeError
    // from calling .includes on undefined.
    const { 'content-type': contentType = '' } = response.headers

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    if (!contentType.includes('html') &&
        !contentType.includes('text')) {
      throw new Error(`Content does not appear to be text.`)
    }

    let $ = cheerio.load(content, { normalizeWhitespace: true })

    if ($.root().children().length === 0) {
      throw new Error(`No children, likely a bad parse.`)
    }

    $ = normalizeMetaTags($)
    $ = convertLazyLoadedImages($)
    $ = clean($)

    return $
  },
}
|
|
|
|
|
|
|
|
export default Resource
|
|
|
|
// def __init__(self, url, parse_non_2xx=False, response=None):
|
|
|
|
// """ Create a Resource.
|
|
|
|
//
|
|
|
|
// :param url: The URL for the document we should retrieve.
|
|
|
|
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
|
|
|
// resources. If False, raise a RetrievalFailed
|
|
|
|
// based exception. Default is False.
|
|
|
|
// :param response: If not None, use as the response rather than
|
|
|
|
// attempting to fetch it ourselves. Expects a
|
|
|
|
// requests.models.Response object.
|
|
|
|
// """
|
|
|
|
// self.url = url
|
|
|
|
// self.parse_non_2xx = parse_non_2xx
|
|
|
|
//
|
|
|
|
// if response:
|
|
|
|
// self.response = response
|
|
|
|
// else:
|
|
|
|
// self.response = self._fetch_resource()
|
|
|
|
|
|
|
|
// Iris: Human-friendly content extraction.
|
|
|
|
|
|
|
|
// import logging
|
|
|
|
// import lxml
|
|
|
|
// import re
|
|
|
|
// import requests
|
|
|
|
// import socket
|
|
|
|
//
|
|
|
|
// from django.conf import settings
|
|
|
|
// from lxml.etree import XPathEvaluator
|
|
|
|
// from lxml.html.clean import Cleaner
|
|
|
|
// from urlparse import urlparse
|
|
|
|
//
|
|
|
|
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
|
|
|
|
// from utils.dom.attribmap import AttribMap
|
|
|
|
// from utils.statsd import stats
|
|
|
|
// from utils.text import is_text
|
|
|
|
// from utils.html import get_charset_from_html, strip_content_encodings
|
|
|
|
//
|
|
|
|
// from . import exceptions
|
|
|
|
//
|
|
|
|
// logger = logging.getLogger(__name__)
|
|
|
|
//
|
|
|
|
// # Hosts that are allowed to use embeds and iframes. We should be very
|
|
|
|
// # restrictive with this and only include top-tier video sites.
|
|
|
|
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
|
|
|
|
//
|
|
|
|
// # The number of seconds to attempt to fetch a resource before timing out.
|
|
|
|
// FETCH_TIMEOUT = 10
|
|
|
|
//
|
|
|
|
// cleaner = Cleaner(
|
|
|
|
// style=True,
|
|
|
|
// page_structure=False,
|
|
|
|
// meta=False,
|
|
|
|
// add_nofollow=False, # done by hand
|
|
|
|
// remove_unknown_tags=False,
|
|
|
|
// links=False,
|
|
|
|
// host_whitelist=host_whitelist)
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// class Resource(object):
|
|
|
|
// """ A Resource is a wrapper class for an HTTP resource. Provides
|
|
|
|
// functionality to fetch a resource as well as a handful of shortcut
|
|
|
|
// methods to run xpath efficiently on HTML, etc.
|
|
|
|
//
|
|
|
|
// Uses requests and lxml internally for fetching and querying.
|
|
|
|
// """
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// def __init__(self, url, parse_non_2xx=False, response=None):
|
|
|
|
// """ Create a Resource.
|
|
|
|
//
|
|
|
|
// :param url: The URL for the document we should retrieve.
|
|
|
|
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
|
|
|
// resources. If False, raise a RetrievalFailed
|
|
|
|
// based exception. Default is False.
|
|
|
|
// :param response: If not None, use as the response rather than
|
|
|
|
// attempting to fetch it ourselves. Expects a
|
|
|
|
// requests.models.Response object.
|
|
|
|
// """
|
|
|
|
// self.url = url
|
|
|
|
// self.parse_non_2xx = parse_non_2xx
|
|
|
|
//
|
|
|
|
// if response:
|
|
|
|
// self.response = response
|
|
|
|
// else:
|
|
|
|
// self.response = self._fetch_resource()
|
|
|
|
//
|
|
|
|
// def __unicode__(self):
|
|
|
|
// return u'<Resource ({0})>'.format(self.url)
|
|
|
|
//
|
|
|
|
// def __repr__(self):
|
|
|
|
// return "<Resource ({0})>".format(self.url)
|
|
|
|
//
|
|
|
|
// @classmethod
|
|
|
|
// def fabricate(kls, url, content, headers=None):
|
|
|
|
// """ Given a URL and some content, create a fake Resource that looks
|
|
|
|
// as though it has already fetched the content. Useful for using
|
|
|
|
// Resource objects without having to do a GET.
|
|
|
|
// """
|
|
|
|
//
|
|
|
|
// if type(content) != unicode:
|
|
|
|
// raise TypeError("Provided content must be unicode.")
|
|
|
|
//
|
|
|
|
// if headers is None:
|
|
|
|
// headers = {}
|
|
|
|
//
|
|
|
|
// try:
|
|
|
|
// utf8_content = content.encode('utf-8', 'strict')
|
|
|
|
// except UnicodeDecodeError:
|
|
|
|
// logger.warning("Unable to encode content for url %s. Content "
|
|
|
|
// "should be unicode and encodeable at this point.")
|
|
|
|
// utf8_content = content.encode('utf-8', 'replace')
|
|
|
|
//
|
|
|
|
// mocked_response_dict = {
|
|
|
|
// "cookies": {},
|
|
|
|
// "_content": utf8_content,
|
|
|
|
// "headers": dict({
|
|
|
|
// "content-length": len(content),
|
|
|
|
// "accept-ranges": "bytes",
|
|
|
|
// "vary": "Accept-Encoding,Cookie",
|
|
|
|
// "server": "Apache/2.2.21",
|
|
|
|
// "content-type": "text/html; charset=UTF-8"
|
|
|
|
// }, **headers),
|
|
|
|
// "url": url,
|
|
|
|
// "status_code": 200,
|
|
|
|
// "_content_consumed": False,
|
|
|
|
// "request": None,
|
|
|
|
// "raw": None,
|
|
|
|
// "error": None,
|
|
|
|
// "config": {
|
|
|
|
// "decode_unicode": True,
|
|
|
|
// "pool_connections": 10,
|
|
|
|
// "verbose": None,
|
|
|
|
// "keep_alive": True,
|
|
|
|
// "max_retries": 0,
|
|
|
|
// "base_headers": {
|
|
|
|
// "Accept-Encoding": "identity, deflate, compress, gzip",
|
|
|
|
// "Accept": "*/*",
|
|
|
|
// "User-Agent": "python-requests/0.8.1"
|
|
|
|
// },
|
|
|
|
// "pool_maxsize": 10,
|
|
|
|
// "safe_mode": False,
|
|
|
|
// "max_redirects": 30
|
|
|
|
// },
|
|
|
|
// "history": []
|
|
|
|
// }
|
|
|
|
// mocked_response = requests.Response()
|
|
|
|
// for k, v in mocked_response_dict.items():
|
|
|
|
// setattr(mocked_response, k, v)
|
|
|
|
//
|
|
|
|
// return Resource(
|
|
|
|
// url = url,
|
|
|
|
// response = mocked_response
|
|
|
|
// )
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def url(self):
|
|
|
|
// return self._url
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// @url.setter
|
|
|
|
// def url(self, value):
|
|
|
|
// parsed_url = urlparse(value)
|
|
|
|
// if parsed_url.scheme not in ('http', 'https'):
|
|
|
|
// raise ValueError("Resource only allows HTTP and HTTPS urls.")
|
|
|
|
//
|
|
|
|
// if not parsed_url.netloc:
|
|
|
|
// raise ValueError("Relative URLs are not allowed.")
|
|
|
|
//
|
|
|
|
// self._url = value
|
|
|
|
//
|
|
|
|
// _parsed_url = None
|
|
|
|
// @property
|
|
|
|
// def parsed_url(self):
|
|
|
|
// if self._parsed_url is None:
|
|
|
|
// self._parsed_url = urlparse(self.url)
|
|
|
|
// return self._parsed_url
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def status_code(self):
|
|
|
|
// return self.response.status_code
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// _content = None
|
|
|
|
// @property
|
|
|
|
// def content(self):
|
|
|
|
// """Return the content for a resource. Always returns unicode.
|
|
|
|
//
|
|
|
|
// """
|
|
|
|
// if self._content is None:
|
|
|
|
// # Requests that come in without content-type encoding headers will
|
|
|
|
// # default to iso-8859-1, which could be wrong
|
|
|
|
// if (self.response.encoding and
|
|
|
|
// self.response.encoding.lower() == 'iso-8859-1'):
|
|
|
|
// # Dont send unicode, because it could have been decoded wrong
|
|
|
|
// # by an incorrect content-type guess.
|
|
|
|
// encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
|
|
|
|
//
|
|
|
|
// if encoding != self.response.encoding:
|
|
|
|
// # First, try to use the encoding we found in the markup
|
|
|
|
// try:
|
|
|
|
// self._content = self.response.content.decode(encoding)
|
|
|
|
// except (LookupError, UnicodeDecodeError):
|
|
|
|
// stats.increment(
|
|
|
|
// 'iris.resource.encoding.encoding_mismatch')
|
|
|
|
// # That encoding might be wrong though, so if it is, use
|
|
|
|
// # the one it reported since they could have the wrong
|
|
|
|
// # one set in the markup. eg. sending the content over
|
|
|
|
// # as iso but declaring it to be utf-8 like gq.com does.
|
|
|
|
// # We may also end up with an invalid encoding type, at
|
|
|
|
// # which point we should also just use the request
|
|
|
|
// # encoding and replace silently.
|
|
|
|
// self._content = self.response.content.decode(
|
|
|
|
// self.response.encoding, 'replace')
|
|
|
|
// else:
|
|
|
|
// # If the encoding guess was right, just use the unicode
|
|
|
|
// self._content = self.response.text
|
|
|
|
//
|
|
|
|
// else:
|
|
|
|
// # Otherwise we trust the encoding
|
|
|
|
// self._content = self.response.text
|
|
|
|
//
|
|
|
|
// return self._content
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def content_type(self):
|
|
|
|
// return self.response.headers.get('content-type', '')
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def is_html(self):
|
|
|
|
// if 'html' in self.content_type:
|
|
|
|
// return True
|
|
|
|
//
|
|
|
|
// # Otherwise, just try parsing it and see if it succeeds
|
|
|
|
// try:
|
|
|
|
// return (self.doc is not None)
|
|
|
|
// except:
|
|
|
|
// return False
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def is_plaintext(self):
|
|
|
|
// if 'text/plain' in self.content_type:
|
|
|
|
// return True
|
|
|
|
//
|
|
|
|
// return False
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def is_image(self):
|
|
|
|
// if 'image' in self.content_type:
|
|
|
|
// return True
|
|
|
|
//
|
|
|
|
// return False
|
|
|
|
//
|
|
|
|
// @property
|
|
|
|
// def is_pdf(self):
|
|
|
|
// if 'pdf' in self.content_type:
|
|
|
|
// return True
|
|
|
|
//
|
|
|
|
// return False
|
|
|
|
//
|
|
|
|
// _lxml_doc = None
|
|
|
|
// @property
|
|
|
|
// def doc(self):
|
|
|
|
// if self._lxml_doc is None:
|
|
|
|
// self._generate_lxml_doc()
|
|
|
|
//
|
|
|
|
// return self._lxml_doc
|
|
|
|
//
|
|
|
|
// _docxp = None
|
|
|
|
// @property
|
|
|
|
// def docxp(self):
|
|
|
|
// """ Generate an XPath Evaluator for this doc. """
|
|
|
|
// if self._docxp is None:
|
|
|
|
// self._docxp = XPathEvaluator(self.doc)
|
|
|
|
//
|
|
|
|
// return self._docxp
|
|
|
|
//
|
|
|
|
// _redocxp = None
|
|
|
|
// @property
|
|
|
|
// def redocxp(self):
|
|
|
|
// """ Generate an XPath Evaluator for this doc, that includes the RE
|
|
|
|
// namespace for regular expression matching.
|
|
|
|
//
|
|
|
|
// """
|
|
|
|
// if self._redocxp is None:
|
|
|
|
// _rens = {'re':'http://exslt.org/regular-expressions'}
|
|
|
|
// self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
|
|
|
|
//
|
|
|
|
// return self._redocxp
|
|
|
|
//
|
|
|
|
// def _generate_lxml_doc(self):
|
|
|
|
// # First check if we have a text based resource
|
|
|
|
// if (not 'html' in self.content_type and
|
|
|
|
// not 'text' in self.content_type and
|
|
|
|
// not is_text(self.content[:512])):
|
|
|
|
// raise ValueError("Content does not appear to be text.")
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// # Remove useless carriage returns which get parsed as otherwise
|
|
|
|
// content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
|
|
|
|
//
|
|
|
|
// # Dont pass any content encodings into lxml, it is dumb about them
|
|
|
|
// content = strip_content_encodings(content)
|
|
|
|
//
|
|
|
|
// self._lxml_doc = lxml.html.fromstring(content)
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// if len(self._lxml_doc.getchildren()) == 0:
|
|
|
|
// stats.increment('iris.resource.encoding.no_children')
|
|
|
|
// raise ValueError("No children, likely a bad parse.")
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
|
|
|
|
// # in an extra html tag. This screws up a whole bunch of things in
|
|
|
|
// # the parsing process. If this is the case, reset the doc to the
|
|
|
|
// # ACTUAL root of the doc.
|
|
|
|
// # Sample cases:
|
|
|
|
// # * Strange Doctype causing issues: http://bit.ly/IATz0B
|
|
|
|
// # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
|
|
|
|
// # Also check for a body inside of our internal HTML tag, to determine
|
|
|
|
// # that it's not just a junk HTML tag sibling at the bottom of the
|
|
|
|
// # doc or something.
|
|
|
|
// internal_html_tag = self._lxml_doc.find('html')
|
|
|
|
// if (internal_html_tag is not None and
|
|
|
|
// len(internal_html_tag.xpath('.//body')) > 0):
|
|
|
|
// self._lxml_doc = internal_html_tag
|
|
|
|
//
|
|
|
|
// self._normalize_meta_tags()
|
|
|
|
//
|
|
|
|
// self._lxml_doc.make_links_absolute(self.url)
|
|
|
|
//
|
|
|
|
// # Convert any lazy loaded images into normal images before clean_html
|
|
|
|
// # which will strip all other attributes
|
|
|
|
// self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
|
|
|
|
//
|
|
|
|
// # Clean the doc of anything malicious.
|
|
|
|
// self._lxml_doc = cleaner.clean_html(self._lxml_doc)
|
|
|
|
//
|
|
|
|
// # Manually nofollow links so that we don't clobber rel author
|
|
|
|
// # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
|
|
|
|
// for a in self.docxp('//a'):
|
|
|
|
// if a.attrib.get('rel', None):
|
|
|
|
// rel_attribs = set(a.attrib['rel'].split())
|
|
|
|
// rel_attribs.add('nofollow')
|
|
|
|
// a.attrib['rel'] = ' '.join(rel_attribs)
|
|
|
|
// else:
|
|
|
|
// a.attrib['rel'] = 'nofollow'
|
|
|
|
//
|
|
|
|
// # Re-relativize anchor links
|
|
|
|
// anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
|
|
|
|
// self.url.replace("'", "%27"))
|
|
|
|
// for link in self.docxp(anchor_link_xpath):
|
|
|
|
// link.attrib['href'] = link.attrib['href'].replace(self.url, '')
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// _attrib_map = None
|
|
|
|
// @property
|
|
|
|
// def attrib_map(self):
|
|
|
|
// """ Create an AttribMap object for fast checking of class/id existence
|
|
|
|
// in the document. Used in association with extract_by_selector.
|
|
|
|
//
|
|
|
|
// """
|
|
|
|
// if self._attrib_map is None:
|
|
|
|
// self._attrib_map = AttribMap(self.doc)
|
|
|
|
//
|
|
|
|
// return self._attrib_map
|
|
|
|
//
|
|
|
|
//
|
|
|
|
// def extract_by_selector(self, selector):
|
|
|
|
// " Shortcut to run extract_by_selector on our doc with our AttribMap. "
|
|
|
|
// return ebs(self.doc, selector, self.attrib_map, self.docxp)
|
|
|
|
//
|
|
|
|
//
|