feat: resource fetches content from a URL and prepares for parsing
Squashed commit of the following: commit 7ba2d2b36d175f5ccbc02f918322ea0dd44bf2c1 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 17:55:10 2016 -0400 feat: resource fetches content from a URL and prepares for parsing commit 0abdfa49eed5b363169070dac6d65d0a5818c918 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 17:54:07 2016 -0400 fix: this was messing up double Esses ('ss', as in class => cla) commit 9dc65a99631e3a68267a68b2b4629c4be8f61546 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:58:57 2016 -0400 fix: test suite working w/new dirs commit 993dc33a5229bfa22ea998e3c4fe105be9d91c21 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:49:39 2016 -0400 feat: convertLazyLoadedImages puts img urls in the src commit e7fb105443dd16d036e460ad21fbcb47191f475b Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:30:43 2016 -0400 feat: makeLinksAbsolute to fully qualify urls commit dbd665078af854efe84bbbfe9b55acd02e1a652f Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 13:38:33 2016 -0400 feat: fetchResource to fetch a url and validate the response commit 42d3937c8f0f8df693996c2edee93625f13dced7 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 10:25:34 2016 -0400 feat: normalizing meta tags
parent
bc97156718
commit
8da2425e59
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env node
// CLI entry point: parse the URL given as the first argument and print
// the extracted content to stdout.
// Usage: ./cli.js <url>

var Iris = require('./dist/bundle')

var url = process.argv[2]

// The original assigned the promise to an unused `result` variable and
// had no rejection handler, so failures died as unhandled rejections.
Iris.parse(url).then(function(result) {
  console.log(result.content)
}).catch(function(err) {
  console.error(err)
  process.exit(1)
})
|
@ -1,10 +1,19 @@
|
||||
// Iris: Human-friendly content extraction.
//
// The original revision had two `export default` statements and imported
// GenericExtractor twice from different paths — both are SyntaxErrors in
// an ES module. It also carried unused imports (fs, fetchResource) and
// commented-out scratch code; all removed here.

import Resource from './resource'
import GenericExtractor from './extractor/generic'

const Iris = {
  // Fetch the resource at `url`, normalize it into a cheerio document,
  // and run the generic extractor over it.
  //
  // :param url: The URL for the document to parse.
  // :returns: Promise resolving to the extractor's result object.
  parse: async function(url) {
    const $ = await Resource.create(url)
    return GenericExtractor.parse(url, null, $)
  },
}

export default Iris
|
||||
|
@ -0,0 +1,12 @@
|
||||
import Iris from './index'

describe('Iris', function() {
  describe('parse(url)', function() {
    // End-to-end fetch + parse over the network can be slow.
    this.timeout(1000000)

    it('does the whole thing', async function() {
      const url = 'http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220'
      const result = await Iris.parse(url)

      console.log(result)
    })
  })
})
|
@ -0,0 +1,430 @@
|
||||
import 'babel-polyfill'

import cheerio from 'cheerio'

import { fetchResource } from './utils'
import {
  normalizeMetaTags,
  convertLazyLoadedImages,
  clean,
} from './utils/dom'

const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param parseNon2xx: If true, attempt to parse non-200 level
  //                     resources. Default is false.
  //                     TODO: currently accepted but not forwarded to
  //                     fetchResource/validateResponse — confirm wiring.
  // :param preparedResponse: If set, use as the response rather than
  //                          attempting to fetch it ourselves. Expects a
  //                          string.
  create: async function(url, parseNon2xx=false, preparedResponse) {
    let result

    if (preparedResponse) {
      // The parameter was documented but ignored in the original
      // implementation. Fabricate a minimal successful response around
      // the supplied body so the rest of the pipeline can treat it
      // exactly like a fetched resource.
      const validResponse = {
        statusMessage: 'OK',
        statusCode: 200,
        headers: {
          'content-type': 'text/html',
          'content-length': 500,
        },
      }
      result = { body: preparedResponse, response: validResponse }
    } else {
      result = await fetchResource(url)
    }

    return this.generateDoc(result)
  },

  // Turn a { body, response } pair into a normalized cheerio document.
  // Throws if the response does not look like text or parses to an
  // empty document.
  generateDoc({ body: content, response }) {
    const { 'content-type': contentType } = response.headers

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    if (!contentType.includes('html') &&
        !contentType.includes('text')) {
      throw new Error('Content does not appear to be text.')
    }

    let $ = cheerio.load(content, { normalizeWhitespace: true })

    if ($.root().children().length === 0) {
      throw new Error('No children, likely a bad parse.')
    }

    $ = normalizeMetaTags($)
    $ = convertLazyLoadedImages($)
    $ = clean($)

    return $
  }
}

export default Resource
|
||||
// def __init__(self, url, parse_non_2xx=False, response=None):
|
||||
// """ Create a Resource.
|
||||
//
|
||||
// :param url: The URL for the document we should retrieve.
|
||||
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
||||
// resources. If False, raise a RetrievalFailed
|
||||
// based exception. Default is False.
|
||||
// :param response: If not None, use as the response rather than
|
||||
// attempting to fetch it ourselves. Expects a
|
||||
// requests.models.Response object.
|
||||
// """
|
||||
// self.url = url
|
||||
// self.parse_non_2xx = parse_non_2xx
|
||||
//
|
||||
// if response:
|
||||
// self.response = response
|
||||
// else:
|
||||
// self.response = self._fetch_resource()
|
||||
|
||||
// Iris: Human-friendly content extraction.
|
||||
|
||||
// import logging
|
||||
// import lxml
|
||||
// import re
|
||||
// import requests
|
||||
// import socket
|
||||
//
|
||||
// from django.conf import settings
|
||||
// from lxml.etree import XPathEvaluator
|
||||
// from lxml.html.clean import Cleaner
|
||||
// from urlparse import urlparse
|
||||
//
|
||||
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
|
||||
// from utils.dom.attribmap import AttribMap
|
||||
// from utils.statsd import stats
|
||||
// from utils.text import is_text
|
||||
// from utils.html import get_charset_from_html, strip_content_encodings
|
||||
//
|
||||
// from . import exceptions
|
||||
//
|
||||
// logger = logging.getLogger(__name__)
|
||||
//
|
||||
// # Hosts that are allowed to use embeds and iframes. We should be very
|
||||
// # restrictive with this and only include top-tier video sites.
|
||||
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
|
||||
//
|
||||
// # The number of seconds to attempt to fetch a resource before timing out.
|
||||
// FETCH_TIMEOUT = 10
|
||||
//
|
||||
// cleaner = Cleaner(
|
||||
// style=True,
|
||||
// page_structure=False,
|
||||
// meta=False,
|
||||
// add_nofollow=False, # done by hand
|
||||
// remove_unknown_tags=False,
|
||||
// links=False,
|
||||
// host_whitelist=host_whitelist)
|
||||
//
|
||||
//
|
||||
//
|
||||
// class Resource(object):
|
||||
// """ A Resource is a wrapper class for an HTTP resource. Provides
|
||||
// functionality to fetch a resource as well as a handful of shortcut
|
||||
// methods to run xpath efficiently on HTML, etc.
|
||||
//
|
||||
// Uses requests and lxml internally for fetching and querying.
|
||||
// """
|
||||
//
|
||||
//
|
||||
// def __init__(self, url, parse_non_2xx=False, response=None):
|
||||
// """ Create a Resource.
|
||||
//
|
||||
// :param url: The URL for the document we should retrieve.
|
||||
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
||||
// resources. If False, raise a RetrievalFailed
|
||||
// based exception. Default is False.
|
||||
// :param response: If not None, use as the response rather than
|
||||
// attempting to fetch it ourselves. Expects a
|
||||
// requests.models.Response object.
|
||||
// """
|
||||
// self.url = url
|
||||
// self.parse_non_2xx = parse_non_2xx
|
||||
//
|
||||
// if response:
|
||||
// self.response = response
|
||||
// else:
|
||||
// self.response = self._fetch_resource()
|
||||
//
|
||||
// def __unicode__(self):
|
||||
// return u'<Resource ({0})>'.format(self.url)
|
||||
//
|
||||
// def __repr__(self):
|
||||
// return "<Resource ({0})>".format(self.url)
|
||||
//
|
||||
// @classmethod
|
||||
// def fabricate(kls, url, content, headers=None):
|
||||
// """ Given a URL and some content, create a fake Resource that looks
|
||||
// as though it has already fetched the content. Useful for using
|
||||
// Resource objects without having to do a GET.
|
||||
// """
|
||||
//
|
||||
// if type(content) != unicode:
|
||||
// raise TypeError("Provided content must be unicode.")
|
||||
//
|
||||
// if headers is None:
|
||||
// headers = {}
|
||||
//
|
||||
// try:
|
||||
// utf8_content = content.encode('utf-8', 'strict')
|
||||
// except UnicodeDecodeError:
|
||||
// logger.warning("Unable to encode content for url %s. Content "
|
||||
// "should be unicode and encodeable at this point.")
|
||||
// utf8_content = content.encode('utf-8', 'replace')
|
||||
//
|
||||
// mocked_response_dict = {
|
||||
// "cookies": {},
|
||||
// "_content": utf8_content,
|
||||
// "headers": dict({
|
||||
// "content-length": len(content),
|
||||
// "accept-ranges": "bytes",
|
||||
// "vary": "Accept-Encoding,Cookie",
|
||||
// "server": "Apache/2.2.21",
|
||||
// "content-type": "text/html; charset=UTF-8"
|
||||
// }, **headers),
|
||||
// "url": url,
|
||||
// "status_code": 200,
|
||||
// "_content_consumed": False,
|
||||
// "request": None,
|
||||
// "raw": None,
|
||||
// "error": None,
|
||||
// "config": {
|
||||
// "decode_unicode": True,
|
||||
// "pool_connections": 10,
|
||||
// "verbose": None,
|
||||
// "keep_alive": True,
|
||||
// "max_retries": 0,
|
||||
// "base_headers": {
|
||||
// "Accept-Encoding": "identity, deflate, compress, gzip",
|
||||
// "Accept": "|)}>#*",
|
||||
// "User-Agent": "python-requests/0.8.1"
|
||||
// },
|
||||
// "pool_maxsize": 10,
|
||||
// "safe_mode": False,
|
||||
// "max_redirects": 30
|
||||
// },
|
||||
// "history": []
|
||||
// }
|
||||
// mocked_response = requests.Response()
|
||||
// for k, v in mocked_response_dict.items():
|
||||
// setattr(mocked_response, k, v)
|
||||
//
|
||||
// return Resource(
|
||||
// url = url,
|
||||
// response = mocked_response
|
||||
// )
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def url(self):
|
||||
// return self._url
|
||||
//
|
||||
//
|
||||
// @url.setter
|
||||
// def url(self, value):
|
||||
// parsed_url = urlparse(value)
|
||||
// if parsed_url.scheme not in ('http', 'https'):
|
||||
// raise ValueError("Resource only allows HTTP and HTTPS urls.")
|
||||
//
|
||||
// if not parsed_url.netloc:
|
||||
// raise ValueError("Relative URLs are not allowed.")
|
||||
//
|
||||
// self._url = value
|
||||
//
|
||||
// _parsed_url = None
|
||||
// @property
|
||||
// def parsed_url(self):
|
||||
// if self._parsed_url is None:
|
||||
// self._parsed_url = urlparse(self.url)
|
||||
// return self._parsed_url
|
||||
//
|
||||
// @property
|
||||
// def status_code(self):
|
||||
// return self.response.status_code
|
||||
//
|
||||
//
|
||||
// _content = None
|
||||
// @property
|
||||
// def content(self):
|
||||
// """Return the content for a resource. Always returns unicode.
|
||||
//
|
||||
// """
|
||||
// if self._content is None:
|
||||
// # Requests that come in without content-type encoding headers will
|
||||
// # default to iso-8859-1, which could be wrong
|
||||
// if (self.response.encoding and
|
||||
// self.response.encoding.lower() == 'iso-8859-1'):
|
||||
// # Dont send unicode, because it could have been decoded wrong
|
||||
// # by an incorrect content-type guess.
|
||||
// encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
|
||||
//
|
||||
// if encoding != self.response.encoding:
|
||||
// # First, try to use the encoding we found in the markup
|
||||
// try:
|
||||
// self._content = self.response.content.decode(encoding)
|
||||
// except (LookupError, UnicodeDecodeError):
|
||||
// stats.increment(
|
||||
// 'iris.resource.encoding.encoding_mismatch')
|
||||
// # That encoding might be wrong though, so if it is, use
|
||||
// # the one it reported since they could have the wrong
|
||||
// # one set in the markup. eg. sending the content over
|
||||
// # as iso but declaring it to be utf-8 like gq.com does.
|
||||
// # We may also end up with an invalid encoding type, at
|
||||
// # which point we should also just use the request
|
||||
// # encoding and replace silently.
|
||||
// self._content = self.response.content.decode(
|
||||
// self.response.encoding, 'replace')
|
||||
// else:
|
||||
// # If the encoding guess was right, just use the unicode
|
||||
// self._content = self.response.text
|
||||
//
|
||||
// else:
|
||||
// # Otherwise we trust the encoding
|
||||
// self._content = self.response.text
|
||||
//
|
||||
// return self._content
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def content_type(self):
|
||||
// return self.response.headers.get('content-type', '')
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def is_html(self):
|
||||
// if 'html' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// # Otherwise, just try parsing it and see if it succeeds
|
||||
// try:
|
||||
// return (self.doc is not None)
|
||||
// except:
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_plaintext(self):
|
||||
// if 'text/plain' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_image(self):
|
||||
// if 'image' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_pdf(self):
|
||||
// if 'pdf' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// _lxml_doc = None
|
||||
// @property
|
||||
// def doc(self):
|
||||
// if self._lxml_doc is None:
|
||||
// self._generate_lxml_doc()
|
||||
//
|
||||
// return self._lxml_doc
|
||||
//
|
||||
// _docxp = None
|
||||
// @property
|
||||
// def docxp(self):
|
||||
// """ Generate an XPath Evaluator for this doc. """
|
||||
// if self._docxp is None:
|
||||
// self._docxp = XPathEvaluator(self.doc)
|
||||
//
|
||||
// return self._docxp
|
||||
//
|
||||
// _redocxp = None
|
||||
// @property
|
||||
// def redocxp(self):
|
||||
// """ Generate an XPath Evaluator for this doc, that includes the RE
|
||||
// namespace for regular expression matching.
|
||||
//
|
||||
// """
|
||||
// if self._redocxp is None:
|
||||
// _rens = {'re':'http://exslt.org/regular-expressions'}
|
||||
// self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
|
||||
//
|
||||
// return self._redocxp
|
||||
//
|
||||
// def _generate_lxml_doc(self):
|
||||
// # First check if we have a text based resource
|
||||
// if (not 'html' in self.content_type and
|
||||
// not 'text' in self.content_type and
|
||||
// not is_text(self.content[:512])):
|
||||
// raise ValueError("Content does not appear to be text.")
|
||||
//
|
||||
//
|
||||
// # Remove useless carriage returns which get parsed as otherwise
|
||||
// content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
|
||||
//
|
||||
// # Dont pass any content encodings into lxml, it is dumb about them
|
||||
// content = strip_content_encodings(content)
|
||||
//
|
||||
// self._lxml_doc = lxml.html.fromstring(content)
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// if len(self._lxml_doc.getchildren()) == 0:
|
||||
// stats.increment('iris.resource.encoding.no_children')
|
||||
// raise ValueError("No children, likely a bad parse.")
|
||||
//
|
||||
//
|
||||
// # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
|
||||
// # in an extra html tag. This screws up a whole bunch of things in
|
||||
// # the parsing process. If this is the case, reset the doc to the
|
||||
// # ACTUAL root of the doc.
|
||||
// # Sample cases:
|
||||
// # * Strange Doctype causing issues: http://bit.ly/IATz0B
|
||||
// # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
|
||||
// # Also check for a body inside of our internal HTML tag, to determine
|
||||
// # that it's not just a junk HTML tag sibling at the bottom of the
|
||||
// # doc or something.
|
||||
// internal_html_tag = self._lxml_doc.find('html')
|
||||
// if (internal_html_tag is not None and
|
||||
// len(internal_html_tag.xpath('.//body')) > 0):
|
||||
// self._lxml_doc = internal_html_tag
|
||||
//
|
||||
// self._normalize_meta_tags()
|
||||
//
|
||||
// self._lxml_doc.make_links_absolute(self.url)
|
||||
//
|
||||
// # Convert any lazy loaded images into normal images before clean_html
|
||||
// # which will strip all other attributes
|
||||
// self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
|
||||
//
|
||||
// # Clean the doc of anything malicious.
|
||||
// self._lxml_doc = cleaner.clean_html(self._lxml_doc)
|
||||
//
|
||||
// # Manually nofollow links so that we don't clobber rel author
|
||||
// # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
|
||||
// for a in self.docxp('//a'):
|
||||
// if a.attrib.get('rel', None):
|
||||
// rel_attribs = set(a.attrib['rel'].split())
|
||||
// rel_attribs.add('nofollow')
|
||||
// a.attrib['rel'] = ' '.join(rel_attribs)
|
||||
// else:
|
||||
// a.attrib['rel'] = 'nofollow'
|
||||
//
|
||||
// # Re-relativize anchor links
|
||||
// anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
|
||||
// self.url.replace("'", "%27"))
|
||||
// for link in self.docxp(anchor_link_xpath):
|
||||
// link.attrib['href'] = link.attrib['href'].replace(self.url, '')
|
||||
//
|
||||
//
|
||||
// _attrib_map = None
|
||||
// @property
|
||||
// def attrib_map(self):
|
||||
// """ Create an AttribMap object for fast checking of class/id existence
|
||||
// in the document. Used in association with extract_by_selector.
|
||||
//
|
||||
// """
|
||||
// if self._attrib_map is None:
|
||||
// self._attrib_map = AttribMap(self.doc)
|
||||
//
|
||||
// return self._attrib_map
|
||||
//
|
||||
//
|
||||
// def extract_by_selector(self, selector):
|
||||
// " Shortcut to run extract_by_selector on our doc with our AttribMap. "
|
||||
// return ebs(self.doc, selector, self.attrib_map, self.docxp)
|
||||
//
|
||||
//
|
@ -0,0 +1,58 @@
|
||||
import assert from 'assert'

import Resource from './index'

// Build a minimal response object advertising the given content type.
function responseWithType(contentType) {
  return { headers: { "content-type": contentType } }
}

describe('Resource', () => {
  describe('create(url)', function() {
    this.timeout(3000)

    it('fetches the page and returns a cheerio object', async () => {
      const $ = await Resource.create('http://theconcourse.deadspin.com/1786177057')

      console.log($.html())
    })
  })

  describe('generateDoc({ body, response })', () => {
    it('returns a cheerio object if valid', () => {
      const body = `<div><p>Hi</p></div>`
      const $ = Resource.generateDoc({ body, response: responseWithType("text/html") })

      assert.equal($.html(), body)
    })

    it('throws an error if the content is not text', () => {
      assert.throws(
        () => Resource.generateDoc({ body: '', response: responseWithType("foo") }),
        /content does not appear to be text/i
      )
    })

    it('throws an error if the content has no children', () => {
      assert.throws(
        () => Resource.generateDoc({ body: ``, response: responseWithType("html") }),
        /no children/i
      )
    })
  })
})
|
@ -0,0 +1,36 @@
|
||||
// HTTP headers sent with every fetch.
export const REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
}

// The number of milliseconds to attempt to fetch a resource before timing out.
export const FETCH_TIMEOUT = 10000

// Content types that we do not extract content from
const BAD_CONTENT_TYPES = [
  'audio/mpeg',
  'image/gif',
  'image/jpeg',
  'image/jpg',
]

// Matches exactly one of the types above, case-insensitively.
export const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i')

// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
export const MAX_CONTENT_LENGTH = 5242880

// Turn the global proxy on or off
// Proxying is not currently enabled in Python source
// so not implementing logic in port.
export const PROXY_DOMAINS = false

const PROXY_URL = 'http://38.98.105.139:33333'

export const REQUESTS_PROXIES = {
  'http': PROXY_URL,
  'https': PROXY_URL,
}

export const DOMAINS_TO_PROXY = [
  'nih.gov',
  'gutenberg.org',
]
|
@ -0,0 +1,17 @@
|
||||
import { TAGS_TO_REMOVE } from './constants'

// Strip unwanted elements (TAGS_TO_REMOVE) and every HTML comment
// from the document. Returns the same cheerio instance.
export default function clean($) {
  $(TAGS_TO_REMOVE).remove()

  return cleanComments($)
}

// Predicate: true when the node is an HTML comment.
function isComment(index, node) {
  return node.type === 'comment'
}

// Drop every comment node anywhere in the document.
function cleanComments($) {
  $.root().find('*').contents().filter(isComment).remove()

  return $
}
|
@ -0,0 +1,27 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import clean from './clean'

// Clean a markup snippet and return the resulting HTML.
function cleaned(html) {
  return clean(cheerio.load(html)).html()
}

describe('clean($)', () => {
  it('removes script elements', () => {
    assert.equal(cleaned(`<div><script>alert('hi')</script></div>`), '<div></div>')
  })

  it('removes style elements', () => {
    assert.equal(cleaned(`<div><style>foo: {color: red;}</style></div>`), '<div></div>')
  })

  it('removes comments', () => {
    assert.equal(cleaned(`<div>HI <!-- This is a comment --></div>`), '<div>HI </div>')
  })
})
|
@ -0,0 +1,8 @@
|
||||
export const IS_LINK = new RegExp('https?://', 'i')
// NOTE: inside a string literal '\.' collapses to '.', which matches ANY
// character — the original regex therefore treated strings like "foopng"
// as image urls. The dot must be double-escaped.
export const IS_IMAGE = new RegExp('\\.(png|gif|jpe?g)', 'i')

// Selector list of elements removed outright during cleaning.
export const TAGS_TO_REMOVE = [
  'script',
  'style',
  'form',
].join(',')
|
@ -0,0 +1,26 @@
|
||||
import 'babel-polyfill'

import {
  IS_LINK,
  IS_IMAGE,
} from './constants'

// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
export default function convertLazyLoadedImages($) {
  $('img').each((_, img) => {
    for (const attrName of Reflect.ownKeys(img.attribs)) {
      if (attrName === 'src') continue

      const candidate = img.attribs[attrName]

      // Any non-src attribute holding an absolute image url is treated
      // as the real source.
      if (IS_LINK.test(candidate) && IS_IMAGE.test(candidate)) {
        $(img).attr('src', candidate)
      }
    }
  })

  return $
}
|
@ -0,0 +1,44 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import convertLazyLoadedImages from './convert-lazy-loaded-images'

// Run the converter over a snippet and return the resulting markup.
function convert(html) {
  return convertLazyLoadedImages(cheerio.load(html)).html()
}

describe('convertLazyLoadedImages($)', () => {
  it('moves image links to src if placed in another attribute', () => {
    assert.equal(
      convert(`<img data-src="http://example.com/foo.jpg">`),
      `<img data-src="http://example.com/foo.jpg" src="http://example.com/foo.jpg">`
    )
  })

  it('does nothing when value is not a link', () => {
    // This is far from perfect, since a relative url could
    // be perfectly correct.
    assert.equal(convert(`<img data-src="foo.jpg">`), `<img data-src="foo.jpg">`)
  })

  it('does nothing when value is not an image', () => {
    assert.equal(convert(`<img data-src="http://example.com">`), `<img data-src="http://example.com">`)
  })

  it('does not change a correct img with src', () => {
    assert.equal(convert(`<img src="http://example.com/foo.jpg">`), `<img src="http://example.com/foo.jpg">`)
  })
})
|
@ -0,0 +1,4 @@
|
||||
export { default as normalizeMetaTags } from './normalize-meta-tags'
|
||||
export { default as makeLinksAbsolute } from './make-links-absolute'
|
||||
export { default as convertLazyLoadedImages } from './convert-lazy-loaded-images'
|
||||
export { default as clean } from './clean'
|
@ -0,0 +1,13 @@
|
||||
import URL from 'url'

// Rewrite every href and src attribute in the document to a fully
// qualified URL, resolved against the page's own url.
export default function makeLinksAbsolute($, url) {
  for (const attr of ['href', 'src']) {
    absolutize($, url, attr)
  }

  return $
}

// Resolve the given attribute against `url` on every element carrying it.
function absolutize($, url, attr) {
  $(`[${attr}]`).each((_, node) => {
    const current = $(node).attr(attr)
    $(node).attr(attr, URL.resolve(url, current))
  })
}
|
@ -0,0 +1,42 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import makeLinksAbsolute from './make-links-absolute'

// Apply makeLinksAbsolute to a snippet and return the resulting markup.
function absolutized(html, url) {
  return makeLinksAbsolute(cheerio.load(html), url).html()
}

describe('makeLinksAbsolute($)', () => {
  it('makes relative #hrefs absolute', () => {
    assert.equal(
      absolutized(`<a href="#foo">bar</a>`, 'http://example.com'),
      `<a href="http://example.com/#foo">bar</a>`
    )
  })

  it('makes relative ./relative paths absolute', () => {
    assert.equal(
      absolutized(`<a href="foo/bar">bar</a>`, 'http://example.com/baz/bat'),
      `<a href="http://example.com/baz/foo/bar">bar</a>`
    )
  })

  it('makes relative /root/paths absolute', () => {
    assert.equal(
      absolutized(`<a href="/foo/bar">bar</a>`, 'http://example.com/baz/bat'),
      `<a href="http://example.com/foo/bar">bar</a>`
    )
  })

  it('makes relative srcs absolute', () => {
    assert.equal(
      absolutized(`<img src="#foo">`, 'http://example.com'),
      `<img src="http://example.com/#foo">`
    )
  })
})
|
@ -0,0 +1,24 @@
|
||||
// For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
export default function normalizeMetaTags($) {
  $ = convertMetaProp($, 'content', 'value')
  return convertMetaProp($, 'property', 'name')
}

// Copy the `from` attribute of every matching meta tag into `to`,
// then drop the original attribute.
function convertMetaProp($, from, to) {
  $(`meta[${from}]`).each((_, node) => {
    const $node = $(node)

    $node.attr(to, $node.attr(from))
    $node.removeAttr(from)
  })

  return $
}
|
@ -0,0 +1,28 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import normalizeMetaTags from './normalize-meta-tags'

// Normalize a snippet's meta tags and return the resulting markup.
function normalized(html) {
  return normalizeMetaTags(cheerio.load(html)).html()
}

describe('normalizeMetaTags($)', () => {
  it('replaces "content" attributes with "value"', () => {
    assert.equal(
      normalized(`<html><meta name="foo" content="bar"></html>`),
      `<html><meta name="foo" value="bar"></html>`
    )
  })

  it('replaces "property" attributes with "name"', () => {
    assert.equal(
      normalized(`<html><meta property="foo" value="bar"></html>`),
      `<html><meta value="bar" name="foo"></html>`
    )
  })
})
|
@ -0,0 +1,96 @@
|
||||
import 'babel-polyfill'

import URL from 'url'
import request from 'request'

import {
  REQUEST_HEADERS,
  FETCH_TIMEOUT,
  BAD_CONTENT_TYPES_RE,
  MAX_CONTENT_LENGTH,
} from './constants'

// Set our response attribute to the result of fetching our URL.
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.

// Fetch the resource at `url` and return `{ body, response }`.
// Rejects if the request itself fails or if the response fails
// validation (bad status, disallowed content type, oversized body).
export default async function fetchResource(url) {
  const parsedUrl = URL.parse(url)

  const options = {
    url: parsedUrl,
    headers: REQUEST_HEADERS,
    timeout: FETCH_TIMEOUT,
  }

  const { response, body } = await get(options)

  // The original wrapped this in try/catch and `return e`, which
  // resolved the promise with the Error object as if it were a result —
  // callers then destructured it and crashed confusingly. Let validation
  // errors propagate so the promise rejects properly.
  validateResponse(response)
  return { body, response }
}
|
||||
|
||||
// Promisified wrapper around request(); resolves with { body, response }
// on success and rejects with the transport error otherwise.
function get(options) {
  return new Promise((resolve, reject) => {
    request(options, (err, response, body) => {
      if (err) {
        reject(err)
        return
      }

      resolve({ body, response })
    })
  })
}
|
||||
|
||||
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
//
// :param response: The response object returned by the transport.
// :param parseNon2xx: If true, allow non-2xx responses through.
// :returns: true when the response is worth processing.
// :throws: Error when the fetch failed, the status is unacceptable,
//          the content type is disallowed, or the body is too large.
export function validateResponse(response, parseNon2xx=false) {
  // No status code at all means the fetch itself never completed.
  if (!response.statusCode) {
    throw new Error(
      `Unable to fetch content. Original exception was ${response.error}`
    )
  }

  // Gate on the numeric status code rather than the reason phrase: the
  // original compared statusMessage against "OK", which wrongly rejected
  // valid 2xx responses such as 201/204/206 (and HTTP/2 responses, which
  // carry no reason phrase at all).
  const is2xx = response.statusCode >= 200 && response.statusCode < 300
  if (!is2xx && !parseNon2xx) {
    throw new Error(
      `Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.`
    )
  }

  const {
    "content-type": contentType,
    "content-length": contentLength
  } = response.headers

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error(
      `Content-type for this resource was ${contentType} and is not allowed.`
    )
  }

  // Check that the content length is below maximum
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error(
      `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`
    )
  }

  return true
}
|
||||
|
||||
// Grabs the last two pieces of the URL and joins them back together
// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'
export function baseDomain({ host }) {
  const pieces = host.split('.')

  return pieces.slice(-2).join('.')
}
|
@ -0,0 +1,110 @@
|
||||
import assert from 'assert'
import URL from 'url'

import {
  default as fetchResource,
  baseDomain,
  validateResponse,
} from './fetch-resource'
import { MAX_CONTENT_LENGTH } from './constants'

// Build a well-formed 200 response whose headers can be overridden per test.
function okResponse(headerOverrides) {
  return {
    statusMessage: "OK",
    statusCode: 200,
    headers: Object.assign({
      "content-type": 'text/html',
      "content-length": 500,
    }, headerOverrides),
  }
}

describe('fetchResource(url)', () => {
  it('fetches domains', async () => {
    const url = 'http://theconcourse.deadspin.com/1786177057'
    const { body, response } = await fetchResource(url)

    assert.equal(typeof body, 'string')
  })
})

describe('validateResponse(response)', () => {
  it('validates a response object', () => {
    assert.equal(validateResponse(okResponse()), true)
  })

  it('throws an error if there is no status code', () => {
    assert.throws(
      () => validateResponse({}),
      /unable to fetch content/i
    )
  })

  it('throws an error if response code is not 2xx', () => {
    assert.throws(
      () => validateResponse({ statusCode: 500 }),
      /instructed to reject non-2xx/i
    )
  })

  it('throws an error if response has bad content-type', () => {
    assert.throws(
      () => validateResponse(okResponse({ "content-type": 'image/gif' })),
      /content-type for this resource/i
    )
  })

  it('throws an error if response length is > max', () => {
    assert.throws(
      () => validateResponse(okResponse({ "content-length": MAX_CONTENT_LENGTH + 1 })),
      /Content for this resource was too large/i
    )
  })
})

describe('baseDomain(parsedUrl)', () => {
  it('returns the base domain, excluding subdomain', () => {
    const parsedUrl = URL.parse('https://www.npmjs.com/package/request#streaming')

    assert.equal(baseDomain(parsedUrl), 'npmjs.com')
  })

  it('returns the base domain as is if no subdomain', () => {
    const parsedUrl = URL.parse('https://npmjs.com/package/request#streaming')

    assert.equal(baseDomain(parsedUrl), 'npmjs.com')
  })
})
|
@ -0,0 +1 @@
|
||||
export { default as fetchResource } from './fetch-resource'
|
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Run the mocha test suite over every *.test.js file under src/.

TEST_FILES=$(find src -name "*.test.js")

mocha --compilers js:babel-register $TEST_FILES --require babel-polyfill
|
Loading…
Reference in New Issue