feat: resource fetches content from a URL and prepares for parsing

Squashed commit of the following:

commit 7ba2d2b36d175f5ccbc02f918322ea0dd44bf2c1
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 17:55:10 2016 -0400

    feat: resource fetches content from a URL and prepares for parsing

commit 0abdfa49eed5b363169070dac6d65d0a5818c918
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 17:54:07 2016 -0400

    fix: this was messing up double Esses ('ss', as in class => cla)

commit 9dc65a99631e3a68267a68b2b4629c4be8f61546
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 14:58:57 2016 -0400

    fix: test suite working w/new dirs

commit 993dc33a5229bfa22ea998e3c4fe105be9d91c21
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 14:49:39 2016 -0400

    feat: convertLazyLoadedImages puts img urls in the src

commit e7fb105443dd16d036e460ad21fbcb47191f475b
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 14:30:43 2016 -0400

    feat: makeLinksAbsolute to fully qualify urls

commit dbd665078af854efe84bbbfe9b55acd02e1a652f
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 13:38:33 2016 -0400

    feat: fetchResource to fetch a url and validate the response

commit 42d3937c8f0f8df693996c2edee93625f13dced7
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 6 10:25:34 2016 -0400

    feat: normalizing meta tags

@ -2,6 +2,7 @@
"presets": ["es2015"],
"plugins": [
"transform-es2015-destructuring",
"transform-object-rest-spread"
"transform-object-rest-spread",
"transform-async-to-generator"
]
}
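For context (not part of the diff): the newly added transform-async-to-generator plugin presumably supports the async/await code introduced later in this commit (Resource.create, fetchResource, Iris.parse) by compiling async functions down to generators, so the bundle does not depend on native async support. A minimal sketch of the style of code it enables:

// Hypothetical example; Resource.create is added later in this commit.
async function load(url) {
  const $ = await Resource.create(url)  // await works once the plugin is in place
  return $.html()
}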

.gitignore

@ -2,3 +2,4 @@ node_modules
build
bundle.js
npm-debug.log
dist

@ -1,4 +1,6 @@
TODO:
- run makeLinksAbsolute on extracted content before returning
- remove logic for fetching meta attrs with custom props
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
- extractNextPageUrl
- Rename all cleaners from cleanThing to clean

@ -6,12 +6,13 @@
"scripts": {
"start": "node ./build",
"build": "rollup -c",
"test": "mocha --compilers js:babel-register --recursive src/**/*.test.js"
"test": "./test-runner"
},
"author": "",
"license": "ISC",
"devDependencies": {
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0",
"babel-polyfill": "^6.13.0",
@ -27,6 +28,7 @@
"dependencies": {
"cheerio": "^0.20.0",
"moment": "^2.14.1",
"request-promise": "^4.1.1",
"valid-url": "^1.0.9",
"wuzzy": "^0.1.2"
}

@ -0,0 +1,7 @@
#!/usr/bin/env node
var Iris = require('./dist/bundle')
var url = process.argv[2]
var result = Iris.parse(url).then(function(result) {
console.log(result.content)
})

@ -9,5 +9,6 @@ export default {
babel(babelrc()),
],
format: 'cjs',
dest: 'dist/bundle.js' // equivalent to --output
dest: 'dist/bundle.js', // equivalent to --output
sourceMap: true,
}

@ -24,13 +24,14 @@ export default function extractBestNode($, opts) {
// TODO Do I need this? AP
// let $root = $.root().clone()
if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($)
}
$ = convertToParagraphs($)
$ = scoreContent($, opts.weightNodes)
const topCandidate = findTopCandidate($)
const $topCandidate = findTopCandidate($)
return topCandidate
return $topCandidate
}

@ -17,8 +17,6 @@ const GenericContentExtractor = {
parse($, html, title='', opts={}) {
opts = { ...this.defaultOpts, ...opts }
// TODO: Title is used to clean headers.
// Should be passed from title extraction.
return this.extract($, html, opts, title)
},

@ -8,31 +8,32 @@ import {
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) {
let candidate, topScore = 0
let $candidate, topScore = 0
$('*[score]').each((index, node) => {
const $node = $(node)
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
return
}
const score = getScore($(node))
const score = getScore($node)
if (score > topScore) {
topScore = score
candidate = node
$candidate = $node
}
})
// If we don't have a candidate, return the body
// or whatever the first element is
if (!candidate) {
if (!$candidate) {
return $('body') || $('*').first()
}
candidate = mergeSiblings(candidate, topScore, $)
$candidate = mergeSiblings($candidate, topScore, $)
return $(candidate)
return $candidate
}
// Now that we have a top_candidate, look through the siblings of
@ -40,28 +41,29 @@ export default function findTopCandidate($) {
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
export function mergeSiblings(candidate, topScore, $) {
if (!$(candidate).parent().length) {
return candidate
export function mergeSiblings($candidate, topScore, $) {
if (!$candidate.parent().length) {
return $candidate
}
const siblingScoreThreshold = Math.max(10, topScore * 0.2)
let wrappingDiv = $('<div></div>')
$(candidate).parent().children().each((index, child) => {
$candidate.parent().children().each((index, child) => {
const $child = $(child)
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
return
}
const childScore = getScore($(child))
const childScore = getScore($child)
if (childScore) {
if (child === candidate) {
wrappingDiv.append(child)
if ($child === $candidate) {
wrappingDiv.append($child)
} else {
let contentBonus = 0
// extract to scoreLinkDensity() TODO
const density = linkDensity($(child))
const density = linkDensity($child)
// If sibling has a very low link density,
// give it a small bonus
@ -77,23 +79,23 @@ export function mergeSiblings(candidate, topScore, $) {
// If sibling node has the same class as
// candidate, give it a bonus
if ($(child).attr('class') === $(candidate).attr('class')) {
if ($child.attr('class') === $candidate.attr('class')) {
contentBonus = contentBonus + topScore * .2
}
const newScore = getScore($(child)) + contentBonus
const newScore = getScore($child) + contentBonus
if (newScore >= siblingScoreThreshold) {
return wrappingDiv.append(child)
return wrappingDiv.append($child)
} else if (child.tagName === 'p') {
const childContentLength = textLength($(child).text())
const childContentLength = textLength($child.text())
if (childContentLength > 80 && density < .25) {
return wrappingDiv.append(child)
return wrappingDiv.append($child)
} else if (childContentLength <= 80 && density === 0 &&
hasSentenceEnd(childContent)) {
return wrappingDiv.append(child)
return wrappingDiv.append($child)
}
}
}

@ -1,4 +1,4 @@
const NORMALIZE_RE = new RegExp('\s{2,}')
const NORMALIZE_RE = /\s{2,}/
export default function normalizeSpaces(text) {
return text.replace(NORMALIZE_RE, ' ').trim()
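Note on the change above (not part of the diff): in the old line, the pattern is built from a string literal, and '\s' inside a JS string is just 's', so the compiled regex was s{2,} and matched runs of the letter "s". That is the "double Esses" bug called out in the squashed commits: normalizeSpaces turned 'class' into 'cla'. A minimal sketch of the difference:

// Hypothetical illustration of the old vs. new pattern:
new RegExp('\s{2,}').source                        // 's{2,}'  -- backslash lost in the string literal
/\s{2,}/.source                                    // '\s{2,}' -- matches two or more whitespace characters
'class'.replace(new RegExp('\s{2,}'), ' ').trim()  // 'cla'    -- the old bug ('ss' collapsed)
'class'.replace(/\s{2,}/, ' ').trim()              // 'class'  -- fixed
'too   many spaces'.replace(/\s{2,}/, ' ')         // 'too many spaces'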

@ -8,8 +8,7 @@ import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
parse: (url, html) => {
let $
parse: (url, html, $) => {
if (html) {
$ = cheerio.load(html)
} else {

@ -1,10 +1,19 @@
import fs from 'fs'
import GenericExtractor from './extractor/generic/index.js'
import Resource from './resource'
import GenericExtractor from './extractor/generic'
// const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
// const url = 'http://wired.com'
// const result = GenericExtractor.parse(url, html)
// console.log(result)
import fetchResource from './resource/utils/fetch-resource'
// export default fetchResource
export default GenericExtractor
// export { default as GenericExtractor } from './extractor/generic/index.js'
const Iris = {
parse: async function(url) {
const $ = await Resource.create(url)
const result = GenericExtractor.parse(url, null, $)
return result
}
}
export default Iris

@ -0,0 +1,12 @@
import Iris from './index'
describe('Iris', function() {
describe('parse(url)', function() {
this.timeout(1000000)
it('does the whole thing', async function() {
const result = await Iris.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220')
console.log(result)
})
})
})

@ -0,0 +1,430 @@
import 'babel-polyfill'
import cheerio from 'cheerio'
import { fetchResource } from './utils'
import {
normalizeMetaTags,
convertLazyLoadedImages,
clean,
} from './utils/dom'
const Resource = {
// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param parseNon2xx: If true, attempt to parse non-200 level
// resources. Default is false.
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
create: async function(url, parseNon2xx=false, preparedResponse) {
const result = await fetchResource(url)
return this.generateDoc(result)
},
generateDoc({ body: content, response }) {
const { "content-type": contentType } = response.headers
// TODO: Implement is_text function from
// https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
if (!contentType.includes('html') &&
!contentType.includes('text')) {
throw new Error(`Content does not appear to be text.`)
}
let $ = cheerio.load(content, { normalizeWhitespace: true })
if ($.root().children().length === 0) {
throw new Error(`No children, likely a bad parse.`)
}
$ = normalizeMetaTags($)
$ = convertLazyLoadedImages($)
$ = clean($)
return $
}
}
export default Resource
// def __init__(self, url, parse_non_2xx=False, response=None):
// """ Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param parse_non_2xx: If True, attempt to parse non-200 level
// resources. If False, raise a RetrievalFailed
// based exception. Default is False.
// :param response: If not None, use as the response rather than
// attempting to fetch it ourselves. Expects a
// requests.models.Response object.
// """
// self.url = url
// self.parse_non_2xx = parse_non_2xx
//
// if response:
// self.response = response
// else:
// self.response = self._fetch_resource()
// Iris: Human-friendly content extraction.
// import logging
// import lxml
// import re
// import requests
// import socket
//
// from django.conf import settings
// from lxml.etree import XPathEvaluator
// from lxml.html.clean import Cleaner
// from urlparse import urlparse
//
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
// from utils.dom.attribmap import AttribMap
// from utils.statsd import stats
// from utils.text import is_text
// from utils.html import get_charset_from_html, strip_content_encodings
//
// from . import exceptions
//
// logger = logging.getLogger(__name__)
//
// # Hosts that are allowed to use embeds and iframes. We should be very
// # restrictive with this and only include top-tier video sites.
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
//
// # The number of seconds to attempt to fetch a resource before timing out.
// FETCH_TIMEOUT = 10
//
// cleaner = Cleaner(
// style=True,
// page_structure=False,
// meta=False,
// add_nofollow=False, # done by hand
// remove_unknown_tags=False,
// links=False,
// host_whitelist=host_whitelist)
//
//
//
// class Resource(object):
// """ A Resource is a wrapper class for an HTTP resource. Provides
// functionality to fetch a resource as well as a handful of shortcut
// methods to run xpath efficiently on HTML, etc.
//
// Uses requests and lxml internally for fetching and querying.
// """
//
//
// def __init__(self, url, parse_non_2xx=False, response=None):
// """ Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param parse_non_2xx: If True, attempt to parse non-200 level
// resources. If False, raise a RetrievalFailed
// based exception. Default is False.
// :param response: If not None, use as the response rather than
// attempting to fetch it ourselves. Expects a
// requests.models.Response object.
// """
// self.url = url
// self.parse_non_2xx = parse_non_2xx
//
// if response:
// self.response = response
// else:
// self.response = self._fetch_resource()
//
// def __unicode__(self):
// return u'<Resource ({0})>'.format(self.url)
//
// def __repr__(self):
// return "<Resource ({0})>".format(self.url)
//
// @classmethod
// def fabricate(kls, url, content, headers=None):
// """ Given a URL and some content, create a fake Resource that looks
// as though it has already fetched the content. Useful for using
// Resource objects without having to do a GET.
// """
//
// if type(content) != unicode:
// raise TypeError("Provided content must be unicode.")
//
// if headers is None:
// headers = {}
//
// try:
// utf8_content = content.encode('utf-8', 'strict')
// except UnicodeDecodeError:
// logger.warning("Unable to encode content for url %s. Content "
// "should be unicode and encodeable at this point.")
// utf8_content = content.encode('utf-8', 'replace')
//
// mocked_response_dict = {
// "cookies": {},
// "_content": utf8_content,
// "headers": dict({
// "content-length": len(content),
// "accept-ranges": "bytes",
// "vary": "Accept-Encoding,Cookie",
// "server": "Apache/2.2.21",
// "content-type": "text/html; charset=UTF-8"
// }, **headers),
// "url": url,
// "status_code": 200,
// "_content_consumed": False,
// "request": None,
// "raw": None,
// "error": None,
// "config": {
// "decode_unicode": True,
// "pool_connections": 10,
// "verbose": None,
// "keep_alive": True,
// "max_retries": 0,
// "base_headers": {
// "Accept-Encoding": "identity, deflate, compress, gzip",
// "Accept": "|)}>#*",
// "User-Agent": "python-requests/0.8.1"
// },
// "pool_maxsize": 10,
// "safe_mode": False,
// "max_redirects": 30
// },
// "history": []
// }
// mocked_response = requests.Response()
// for k, v in mocked_response_dict.items():
// setattr(mocked_response, k, v)
//
// return Resource(
// url = url,
// response = mocked_response
// )
//
//
// @property
// def url(self):
// return self._url
//
//
// @url.setter
// def url(self, value):
// parsed_url = urlparse(value)
// if parsed_url.scheme not in ('http', 'https'):
// raise ValueError("Resource only allows HTTP and HTTPS urls.")
//
// if not parsed_url.netloc:
// raise ValueError("Relative URLs are not allowed.")
//
// self._url = value
//
// _parsed_url = None
// @property
// def parsed_url(self):
// if self._parsed_url is None:
// self._parsed_url = urlparse(self.url)
// return self._parsed_url
//
// @property
// def status_code(self):
// return self.response.status_code
//
//
// _content = None
// @property
// def content(self):
// """Return the content for a resource. Always returns unicode.
//
// """
// if self._content is None:
// # Requests that come in without content-type encoding headers will
// # default to iso-8859-1, which could be wrong
// if (self.response.encoding and
// self.response.encoding.lower() == 'iso-8859-1'):
// # Dont send unicode, because it could have been decoded wrong
// # by an incorrect content-type guess.
// encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
//
// if encoding != self.response.encoding:
// # First, try to use the encoding we found in the markup
// try:
// self._content = self.response.content.decode(encoding)
// except (LookupError, UnicodeDecodeError):
// stats.increment(
// 'iris.resource.encoding.encoding_mismatch')
// # That encoding might be wrong though, so if it is, use
// # the one it reported since they could have the wrong
// # one set in the markup. eg. sending the content over
// # as iso but declaring it to be utf-8 like gq.com does.
// # We may also end up with an invalid encoding type, at
// # which point we should also just use the request
// # encoding and replace silently.
// self._content = self.response.content.decode(
// self.response.encoding, 'replace')
// else:
// # If the encoding guess was right, just use the unicode
// self._content = self.response.text
//
// else:
// # Otherwise we trust the encoding
// self._content = self.response.text
//
// return self._content
//
//
// @property
// def content_type(self):
// return self.response.headers.get('content-type', '')
//
//
// @property
// def is_html(self):
// if 'html' in self.content_type:
// return True
//
// # Otherwise, just try parsing it and see if it succeeds
// try:
// return (self.doc is not None)
// except:
// return False
//
// @property
// def is_plaintext(self):
// if 'text/plain' in self.content_type:
// return True
//
// return False
//
// @property
// def is_image(self):
// if 'image' in self.content_type:
// return True
//
// return False
//
// @property
// def is_pdf(self):
// if 'pdf' in self.content_type:
// return True
//
// return False
//
// _lxml_doc = None
// @property
// def doc(self):
// if self._lxml_doc is None:
// self._generate_lxml_doc()
//
// return self._lxml_doc
//
// _docxp = None
// @property
// def docxp(self):
// """ Generate an XPath Evaluator for this doc. """
// if self._docxp is None:
// self._docxp = XPathEvaluator(self.doc)
//
// return self._docxp
//
// _redocxp = None
// @property
// def redocxp(self):
// """ Generate an XPath Evaluator for this doc, that includes the RE
// namespace for regular expression matching.
//
// """
// if self._redocxp is None:
// _rens = {'re':'http://exslt.org/regular-expressions'}
// self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
//
// return self._redocxp
//
// def _generate_lxml_doc(self):
// # First check if we have a text based resource
// if (not 'html' in self.content_type and
// not 'text' in self.content_type and
// not is_text(self.content[:512])):
// raise ValueError("Content does not appear to be text.")
//
//
// # Remove useless carriage returns which get parsed as &#13; otherwise
// content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
//
// # Dont pass any content encodings into lxml, it is dumb about them
// content = strip_content_encodings(content)
//
// self._lxml_doc = lxml.html.fromstring(content)
//
//
//
//
// if len(self._lxml_doc.getchildren()) == 0:
// stats.increment('iris.resource.encoding.no_children')
// raise ValueError("No children, likely a bad parse.")
//
//
// # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
// # in an extra html tag. This screws up a whole bunch of things in
// # the parsing process. If this is the case, reset the doc to the
// # ACTUAL root of the doc.
// # Sample cases:
// # * Strange Doctype causing issues: http://bit.ly/IATz0B
// # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
// # Also check for a body inside of our internal HTML tag, to determine
// # that it's not just a junk HTML tag sibling at the bottom of the
// # doc or something.
// internal_html_tag = self._lxml_doc.find('html')
// if (internal_html_tag is not None and
// len(internal_html_tag.xpath('.//body')) > 0):
// self._lxml_doc = internal_html_tag
//
// self._normalize_meta_tags()
//
// self._lxml_doc.make_links_absolute(self.url)
//
// # Convert any lazy loaded images into normal images before clean_html
// # which will strip all other attributes
// self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
//
// # Clean the doc of anything malicious.
// self._lxml_doc = cleaner.clean_html(self._lxml_doc)
//
// # Manually nofollow links so that we don't clobber rel author
// # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
// for a in self.docxp('//a'):
// if a.attrib.get('rel', None):
// rel_attribs = set(a.attrib['rel'].split())
// rel_attribs.add('nofollow')
// a.attrib['rel'] = ' '.join(rel_attribs)
// else:
// a.attrib['rel'] = 'nofollow'
//
// # Re-relativize anchor links
// anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
// self.url.replace("'", "%27"))
// for link in self.docxp(anchor_link_xpath):
// link.attrib['href'] = link.attrib['href'].replace(self.url, '')
//
//
// _attrib_map = None
// @property
// def attrib_map(self):
// """ Create an AttribMap object for fast checking of class/id existence
// in the document. Used in association with extract_by_selector.
//
// """
// if self._attrib_map is None:
// self._attrib_map = AttribMap(self.doc)
//
// return self._attrib_map
//
//
// def extract_by_selector(self, selector):
// " Shortcut to run extract_by_selector on our doc with our AttribMap. "
// return ebs(self.doc, selector, self.attrib_map, self.docxp)
//
//

@ -0,0 +1,58 @@
import assert from 'assert'
import Resource from './index'
describe('Resource', () => {
describe('create(url)', function() {
this.timeout(3000)
it('fetches the page and returns a cheerio object', async () => {
const url = 'http://theconcourse.deadspin.com/1786177057'
const $ = await Resource.create(url)
console.log($.html())
})
})
describe('generateDoc({ body, response })', () => {
it('returns a cheerio object if valid', () => {
const response = { headers: { "content-type": "text/html" } }
const body = `<div><p>Hi</p></div>`
const $ = Resource.generateDoc({ body, response })
assert.equal($.html(), body)
})
it('throws an error if the content is not text', () => {
const response = {
headers: {
"content-type": "foo"
}
}
const body = ''
assert.throws(
() => {
Resource.generateDoc({ body, response })
},
/content does not appear to be text/i
)
})
it('throws an error if the content has no children', () => {
const response = {
headers: {
"content-type": "html"
}
}
const body = ``
assert.throws(
() => {
Resource.generateDoc({ body, response })
},
/no children/i
)
})
})
})

@ -0,0 +1,36 @@
export const REQUEST_HEADERS = {
'User-Agent': 'Readability - http://readability.com/about/'
}
// The number of milliseconds to attempt to fetch a resource before timing out.
export const FETCH_TIMEOUT = 10000
// Content types that we do not extract content from
const BAD_CONTENT_TYPES = [
'audio/mpeg',
'image/gif',
'image/jpeg',
'image/jpg',
]
export const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i')
// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
export const MAX_CONTENT_LENGTH = 5242880
// Turn the global proxy on or off
// Proxying is not currently enabled in Python source
// so not implementing logic in port.
export const PROXY_DOMAINS = false
export const REQUESTS_PROXIES = {
'http': 'http://38.98.105.139:33333',
'https': 'http://38.98.105.139:33333',
}
export const DOMAINS_TO_PROXY = [
'nih.gov',
'gutenberg.org',
]

@ -0,0 +1,17 @@
import { TAGS_TO_REMOVE } from './constants'
export default function clean($) {
$(TAGS_TO_REMOVE).remove()
$ = cleanComments($)
return $
}
function isComment(index, node) {
return node.type === 'comment'
}
function cleanComments($) {
$.root().find('*').contents().filter(isComment).remove()
return $
}

@ -0,0 +1,27 @@
import assert from 'assert'
import cheerio from 'cheerio'
import clean from './clean'
describe('clean($)', () => {
it('removes script elements', () => {
const html = `<div><script>alert('hi')</script></div>`
const $ = cheerio.load(html)
assert.equal(clean($).html(), '<div></div>')
})
it('removes style elements', () => {
const html = `<div><style>foo: {color: red;}</style></div>`
const $ = cheerio.load(html)
assert.equal(clean($).html(), '<div></div>')
})
it('removes comments', () => {
const html = `<div>HI <!-- This is a comment --></div>`
const $ = cheerio.load(html)
assert.equal(clean($).html(), '<div>HI </div>')
})
})

@ -0,0 +1,8 @@
export const IS_LINK = new RegExp('https?://', 'i')
export const IS_IMAGE = new RegExp('\.(png|gif|jpe?g)', 'i')
export const TAGS_TO_REMOVE = [
'script',
'style',
'form',
].join(',')

@ -0,0 +1,26 @@
import 'babel-polyfill'
import {
IS_LINK,
IS_IMAGE,
} from './constants'
// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
export default function convertLazyLoadedImages($) {
$('img').each((_, img) => {
Reflect.ownKeys(img.attribs).forEach((attr) => {
const value = img.attribs[attr]
if (attr !== 'src' && IS_LINK.test(value) &&
IS_IMAGE.test(value)) {
$(img).attr('src', value)
}
})
})
return $
}

@ -0,0 +1,44 @@
import assert from 'assert'
import cheerio from 'cheerio'
import convertLazyLoadedImages from './convert-lazy-loaded-images'
describe('convertLazyLoadedImages($)', () => {
it('moves image links to src if placed in another attribute', () => {
const html = `<img data-src="http://example.com/foo.jpg">`
const $ = cheerio.load(html)
const result = convertLazyLoadedImages($).html()
assert.equal(result, `<img data-src="http://example.com/foo.jpg" src="http://example.com/foo.jpg">`)
})
it('does nothing when value is not a link', () => {
// This is far from perfect, since a relative url could
// be perfectly correct.
const html = `<img data-src="foo.jpg">`
const $ = cheerio.load(html)
const result = convertLazyLoadedImages($).html()
assert.equal(result, `<img data-src="foo.jpg">`)
})
it('does nothing when value is not an image', () => {
const html = `<img data-src="http://example.com">`
const $ = cheerio.load(html)
const result = convertLazyLoadedImages($).html()
assert.equal(result, `<img data-src="http://example.com">`)
})
it('does not change a correct img with src', () => {
const html = `<img src="http://example.com/foo.jpg">`
const $ = cheerio.load(html)
const result = convertLazyLoadedImages($).html()
assert.equal(result, `<img src="http://example.com/foo.jpg">`)
})
})

@ -0,0 +1,4 @@
export { default as normalizeMetaTags } from './normalize-meta-tags'
export { default as makeLinksAbsolute } from './make-links-absolute'
export { default as convertLazyLoadedImages } from './convert-lazy-loaded-images'
export { default as clean } from './clean'

@ -0,0 +1,13 @@
import URL from 'url'
export default function makeLinksAbsolute($, url) {
['href', 'src'].forEach(attr => absolutize($, url, attr))
return $
}
function absolutize($, url, attr) {
$(`[${attr}]`).each((_, node) => {
const $node = $(node)
$node.attr(attr, URL.resolve(url, $node.attr(attr)))
})
}

@ -0,0 +1,42 @@
import assert from 'assert'
import cheerio from 'cheerio'
import makeLinksAbsolute from './make-links-absolute'
describe('makeLinksAbsolute($)', () => {
it('makes relative #hrefs absolute', () => {
const html = `<a href="#foo">bar</a>`
const $ = cheerio.load(html)
const result = makeLinksAbsolute($, 'http://example.com').html()
assert.equal(result, `<a href="http://example.com/#foo">bar</a>`)
})
it('makes relative ./relative paths absolute', () => {
const html = `<a href="foo/bar">bar</a>`
const $ = cheerio.load(html)
const result = makeLinksAbsolute($, 'http://example.com/baz/bat').html()
assert.equal(result, `<a href="http://example.com/baz/foo/bar">bar</a>`)
})
it('makes relative /root/paths absolute', () => {
const html = `<a href="/foo/bar">bar</a>`
const $ = cheerio.load(html)
const result = makeLinksAbsolute($, 'http://example.com/baz/bat').html()
assert.equal(result, `<a href="http://example.com/foo/bar">bar</a>`)
})
it('makes relative srcs absolute', () => {
const html = `<img src="#foo">`
const $ = cheerio.load(html)
const result = makeLinksAbsolute($, 'http://example.com').html()
assert.equal(result, `<img src="http://example.com/#foo">`)
})
})

@ -0,0 +1,24 @@
// For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
export default function normalizeMetaTags($) {
$ = convertMetaProp($, 'content', 'value')
$ = convertMetaProp($, 'property', 'name')
return $
}
function convertMetaProp($, from, to) {
$(`meta[${from}]`).each((_, node) => {
const $node = $(node)
const value = $node.attr(from)
$node.attr(to, value)
$node.removeAttr(from)
})
return $
}

@ -0,0 +1,28 @@
import assert from 'assert'
import cheerio from 'cheerio'
import normalizeMetaTags from './normalize-meta-tags'
describe('normalizeMetaTags($)', () => {
it('replaces "content" attributes with "value"', () => {
const html = `<html><meta name="foo" content="bar"></html>`
const test = `<html><meta name="foo" value="bar"></html>`
const $ = cheerio.load(html)
const result = normalizeMetaTags($).html()
assert.equal(result, test)
})
it('replaces "property" attributes with "name"', () => {
const html = `<html><meta property="foo" value="bar"></html>`
const test = `<html><meta value="bar" name="foo"></html>`
const $ = cheerio.load(html)
const result = normalizeMetaTags($).html()
assert.equal(result, test)
})
})

@ -0,0 +1,96 @@
import 'babel-polyfill'
import URL from 'url'
import request from 'request'
import {
REQUEST_HEADERS,
FETCH_TIMEOUT,
BAD_CONTENT_TYPES_RE,
MAX_CONTENT_LENGTH,
} from './constants'
// Set our response attribute to the result of fetching our URL.
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
export default async function fetchResource(url) {
const parsedUrl = URL.parse(url)
const options = {
url: parsedUrl,
headers: REQUEST_HEADERS,
timeout: FETCH_TIMEOUT,
}
const { response, body } = await get(options)
try {
validateResponse(response)
return { body, response }
} catch(e) {
return e
}
}
function get(options){
return new Promise(function(resolve, reject){
request(options, function(err, response, body){
if(err){
reject(err)
} else {
resolve({ body, response })
}
})
})
}
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
export function validateResponse(response, parseNon2xx=false) {
// Check if we got a valid status code
if (response.statusMessage !== "OK") {
if (!response.statusCode) {
throw new Error(
`Unable to fetch content. Original exception was ${response.error}`
)
} else if (!parseNon2xx) {
throw new Error(
`Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.`
)
}
}
const {
"content-type": contentType,
"content-length": contentLength
} = response.headers
// Check that the content is not in BAD_CONTENT_TYPES
if (BAD_CONTENT_TYPES_RE.test(contentType)) {
throw new Error(
`Content-type for this resource was ${contentType} and is not allowed.`
)
}
// Check that the content length is below maximum
if (contentLength > MAX_CONTENT_LENGTH) {
throw new Error(
`Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`
)
}
return true
}
// Grabs the last two pieces of the URL and joins them back together
// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'
export function baseDomain({ host }) {
return host.split('.').slice(-2).join('.')
}

@ -0,0 +1,110 @@
import assert from 'assert'
import URL from 'url'
import {
default as fetchResource,
baseDomain,
validateResponse,
} from './fetch-resource'
import { MAX_CONTENT_LENGTH } from './constants'
describe('fetchResource(url)', () => {
it('fetches domains', async () => {
const url = 'http://theconcourse.deadspin.com/1786177057'
const { body, response } = await fetchResource(url)
assert.equal(typeof body, 'string')
})
})
describe('validateResponse(response)', () => {
it('validates a response object', () => {
const validResponse = {
statusMessage: "OK",
statusCode: 200,
headers: {
"content-type": 'text/html',
"content-length": 500,
}
}
assert.equal(validateResponse(validResponse), true)
})
it('throws an error if there is no status code', () => {
const invalidResponse = {
}
assert.throws(
() => {
validateResponse(invalidResponse)
},
/unable to fetch content/i
)
})
it('throws an error if response code is not 2xx', () => {
const invalidResponse = {
statusCode: 500,
}
assert.throws(
() => {
validateResponse(invalidResponse)
},
/instructed to reject non-2xx/i
)
})
it('throws an error if response has bad content-type', () => {
const invalidResponse = {
statusMessage: "OK",
statusCode: 200,
headers: {
"content-type": 'image/gif',
"content-length": 500,
}
}
assert.throws(
() => {
validateResponse(invalidResponse)
},
/content-type for this resource/i
)
})
it('throws an error if response length is > max', () => {
const invalidResponse = {
statusMessage: "OK",
statusCode: 200,
headers: {
"content-type": 'text/html',
"content-length": MAX_CONTENT_LENGTH + 1,
}
}
assert.throws(
() => {
validateResponse(invalidResponse)
},
/Content for this resource was too large/i
)
})
})
describe('baseDomain(parsedUrl)', () => {
it('returns the base domain, excluding subdomain', () => {
const url = 'https://www.npmjs.com/package/request#streaming'
const parsedUrl = URL.parse(url)
assert.equal(baseDomain(parsedUrl), 'npmjs.com')
})
it('returns the base domain as is if no subdomain', () => {
const url = 'https://npmjs.com/package/request#streaming'
const parsedUrl = URL.parse(url)
assert.equal(baseDomain(parsedUrl), 'npmjs.com')
})
})

@ -0,0 +1 @@
export { default as fetchResource } from './fetch-resource'

@ -0,0 +1,4 @@
#!/bin/bash
# Runs the mocha tests
mocha --compilers js:babel-register $(find src -name "*.test.js") --require babel-polyfill