feat: resource fetches content from a URL and prepares for parsing
Squashed commit of the following: commit 7ba2d2b36d175f5ccbc02f918322ea0dd44bf2c1 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 17:55:10 2016 -0400 feat: resource fetches content from a URL and prepares for parsing commit 0abdfa49eed5b363169070dac6d65d0a5818c918 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 17:54:07 2016 -0400 fix: this was messing up double Esses ('ss', as in class => cla) commit 9dc65a99631e3a68267a68b2b4629c4be8f61546 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:58:57 2016 -0400 fix: test suite working w/new dirs commit 993dc33a5229bfa22ea998e3c4fe105be9d91c21 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:49:39 2016 -0400 feat: convertLazyLoadedImages puts img urls in the src commit e7fb105443dd16d036e460ad21fbcb47191f475b Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 14:30:43 2016 -0400 feat: makeLinksAbsolute to fully qualify urls commit dbd665078af854efe84bbbfe9b55acd02e1a652f Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 13:38:33 2016 -0400 feat: fetchResource to fetch a url and validate the response commit 42d3937c8f0f8df693996c2edee93625f13dced7 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 6 10:25:34 2016 -0400 feat: normalizing meta tags
parent
bc97156718
commit
8da2425e59
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env node
// CLI entry point: parse the URL given as the first argument and print
// the extracted content to stdout.
// Usage: ./cli.js <url>

var Iris = require('./dist/bundle')

var url = process.argv[2]

// The original assigned the promise to an unused `result` variable and
// had no rejection handler, so failures died as unhandled rejections.
Iris.parse(url).then(function(result) {
  console.log(result.content)
}).catch(function(err) {
  console.error(err)
  process.exit(1)
})
|
@ -1,10 +1,19 @@
|
||||
// Iris: Human-friendly content extraction.
//
// The original revision had two `export default` statements and imported
// GenericExtractor twice from different paths — both are SyntaxErrors in
// an ES module. It also carried unused imports (fs, fetchResource) and
// commented-out scratch code; all removed here.

import Resource from './resource'
import GenericExtractor from './extractor/generic'

const Iris = {
  // Fetch the resource at `url`, normalize it into a cheerio document,
  // and run the generic extractor over it.
  //
  // :param url: The URL for the document to parse.
  // :returns: Promise resolving to the extractor's result object.
  parse: async function(url) {
    const $ = await Resource.create(url)
    return GenericExtractor.parse(url, null, $)
  },
}

export default Iris
|
||||
|
@ -0,0 +1,12 @@
|
||||
import Iris from './index'

describe('Iris', function() {
  describe('parse(url)', function() {
    // End-to-end fetch + parse over the network can be slow.
    this.timeout(1000000)

    it('does the whole thing', async function() {
      const url = 'http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220'
      const result = await Iris.parse(url)

      console.log(result)
    })
  })
})
|
@ -0,0 +1,430 @@
|
||||
import 'babel-polyfill'

import cheerio from 'cheerio'

import { fetchResource } from './utils'
import {
  normalizeMetaTags,
  convertLazyLoadedImages,
  clean,
} from './utils/dom'

const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param parseNon2xx: If true, attempt to parse non-200 level
  //                     resources. Default is false.
  //                     TODO: currently accepted but not forwarded to
  //                     fetchResource/validateResponse — confirm wiring.
  // :param preparedResponse: If set, use as the response rather than
  //                          attempting to fetch it ourselves. Expects a
  //                          string.
  create: async function(url, parseNon2xx=false, preparedResponse) {
    let result

    if (preparedResponse) {
      // The parameter was documented but ignored in the original
      // implementation. Fabricate a minimal successful response around
      // the supplied body so the rest of the pipeline can treat it
      // exactly like a fetched resource.
      const validResponse = {
        statusMessage: 'OK',
        statusCode: 200,
        headers: {
          'content-type': 'text/html',
          'content-length': 500,
        },
      }
      result = { body: preparedResponse, response: validResponse }
    } else {
      result = await fetchResource(url)
    }

    return this.generateDoc(result)
  },

  // Turn a { body, response } pair into a normalized cheerio document.
  // Throws if the response does not look like text or parses to an
  // empty document.
  generateDoc({ body: content, response }) {
    const { 'content-type': contentType } = response.headers

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    if (!contentType.includes('html') &&
        !contentType.includes('text')) {
      throw new Error('Content does not appear to be text.')
    }

    let $ = cheerio.load(content, { normalizeWhitespace: true })

    if ($.root().children().length === 0) {
      throw new Error('No children, likely a bad parse.')
    }

    $ = normalizeMetaTags($)
    $ = convertLazyLoadedImages($)
    $ = clean($)

    return $
  }
}

export default Resource
|
||||
// def __init__(self, url, parse_non_2xx=False, response=None):
|
||||
// """ Create a Resource.
|
||||
//
|
||||
// :param url: The URL for the document we should retrieve.
|
||||
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
||||
// resources. If False, raise a RetrievalFailed
|
||||
// based exception. Default is False.
|
||||
// :param response: If not None, use as the response rather than
|
||||
// attempting to fetch it ourselves. Expects a
|
||||
// requests.models.Response object.
|
||||
// """
|
||||
// self.url = url
|
||||
// self.parse_non_2xx = parse_non_2xx
|
||||
//
|
||||
// if response:
|
||||
// self.response = response
|
||||
// else:
|
||||
// self.response = self._fetch_resource()
|
||||
|
||||
// Iris: Human-friendly content extraction.
|
||||
|
||||
// import logging
|
||||
// import lxml
|
||||
// import re
|
||||
// import requests
|
||||
// import socket
|
||||
//
|
||||
// from django.conf import settings
|
||||
// from lxml.etree import XPathEvaluator
|
||||
// from lxml.html.clean import Cleaner
|
||||
// from urlparse import urlparse
|
||||
//
|
||||
// from utils.dom import extract_by_selector as ebs, convert_lazy_loaded_images
|
||||
// from utils.dom.attribmap import AttribMap
|
||||
// from utils.statsd import stats
|
||||
// from utils.text import is_text
|
||||
// from utils.html import get_charset_from_html, strip_content_encodings
|
||||
//
|
||||
// from . import exceptions
|
||||
//
|
||||
// logger = logging.getLogger(__name__)
|
||||
//
|
||||
// # Hosts that are allowed to use embeds and iframes. We should be very
|
||||
// # restrictive with this and only include top-tier video sites.
|
||||
// host_whitelist = ['www.youtube.com', 'www.vimeo.com']
|
||||
//
|
||||
// # The number of seconds to attempt to fetch a resource before timing out.
|
||||
// FETCH_TIMEOUT = 10
|
||||
//
|
||||
// cleaner = Cleaner(
|
||||
// style=True,
|
||||
// page_structure=False,
|
||||
// meta=False,
|
||||
// add_nofollow=False, # done by hand
|
||||
// remove_unknown_tags=False,
|
||||
// links=False,
|
||||
// host_whitelist=host_whitelist)
|
||||
//
|
||||
//
|
||||
//
|
||||
// class Resource(object):
|
||||
// """ A Resource is a wrapper class for an HTTP resource. Provides
|
||||
// functionality to fetch a resource as well as a handful of shortcut
|
||||
// methods to run xpath efficiently on HTML, etc.
|
||||
//
|
||||
// Uses requests and lxml internally for fetching and querying.
|
||||
// """
|
||||
//
|
||||
//
|
||||
// def __init__(self, url, parse_non_2xx=False, response=None):
|
||||
// """ Create a Resource.
|
||||
//
|
||||
// :param url: The URL for the document we should retrieve.
|
||||
// :param parse_non_2xx: If True, attempt to parse non-200 level
|
||||
// resources. If False, raise a RetrievalFailed
|
||||
// based exception. Default is False.
|
||||
// :param response: If not None, use as the response rather than
|
||||
// attempting to fetch it ourselves. Expects a
|
||||
// requests.models.Response object.
|
||||
// """
|
||||
// self.url = url
|
||||
// self.parse_non_2xx = parse_non_2xx
|
||||
//
|
||||
// if response:
|
||||
// self.response = response
|
||||
// else:
|
||||
// self.response = self._fetch_resource()
|
||||
//
|
||||
// def __unicode__(self):
|
||||
// return u'<Resource ({0})>'.format(self.url)
|
||||
//
|
||||
// def __repr__(self):
|
||||
// return "<Resource ({0})>".format(self.url)
|
||||
//
|
||||
// @classmethod
|
||||
// def fabricate(kls, url, content, headers=None):
|
||||
// """ Given a URL and some content, create a fake Resource that looks
|
||||
// as though it has already fetched the content. Useful for using
|
||||
// Resource objects without having to do a GET.
|
||||
// """
|
||||
//
|
||||
// if type(content) != unicode:
|
||||
// raise TypeError("Provided content must be unicode.")
|
||||
//
|
||||
// if headers is None:
|
||||
// headers = {}
|
||||
//
|
||||
// try:
|
||||
// utf8_content = content.encode('utf-8', 'strict')
|
||||
// except UnicodeDecodeError:
|
||||
// logger.warning("Unable to encode content for url %s. Content "
|
||||
// "should be unicode and encodeable at this point.")
|
||||
// utf8_content = content.encode('utf-8', 'replace')
|
||||
//
|
||||
// mocked_response_dict = {
|
||||
// "cookies": {},
|
||||
// "_content": utf8_content,
|
||||
// "headers": dict({
|
||||
// "content-length": len(content),
|
||||
// "accept-ranges": "bytes",
|
||||
// "vary": "Accept-Encoding,Cookie",
|
||||
// "server": "Apache/2.2.21",
|
||||
// "content-type": "text/html; charset=UTF-8"
|
||||
// }, **headers),
|
||||
// "url": url,
|
||||
// "status_code": 200,
|
||||
// "_content_consumed": False,
|
||||
// "request": None,
|
||||
// "raw": None,
|
||||
// "error": None,
|
||||
// "config": {
|
||||
// "decode_unicode": True,
|
||||
// "pool_connections": 10,
|
||||
// "verbose": None,
|
||||
// "keep_alive": True,
|
||||
// "max_retries": 0,
|
||||
// "base_headers": {
|
||||
// "Accept-Encoding": "identity, deflate, compress, gzip",
|
||||
// "Accept": "|)}>#*",
|
||||
// "User-Agent": "python-requests/0.8.1"
|
||||
// },
|
||||
// "pool_maxsize": 10,
|
||||
// "safe_mode": False,
|
||||
// "max_redirects": 30
|
||||
// },
|
||||
// "history": []
|
||||
// }
|
||||
// mocked_response = requests.Response()
|
||||
// for k, v in mocked_response_dict.items():
|
||||
// setattr(mocked_response, k, v)
|
||||
//
|
||||
// return Resource(
|
||||
// url = url,
|
||||
// response = mocked_response
|
||||
// )
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def url(self):
|
||||
// return self._url
|
||||
//
|
||||
//
|
||||
// @url.setter
|
||||
// def url(self, value):
|
||||
// parsed_url = urlparse(value)
|
||||
// if parsed_url.scheme not in ('http', 'https'):
|
||||
// raise ValueError("Resource only allows HTTP and HTTPS urls.")
|
||||
//
|
||||
// if not parsed_url.netloc:
|
||||
// raise ValueError("Relative URLs are not allowed.")
|
||||
//
|
||||
// self._url = value
|
||||
//
|
||||
// _parsed_url = None
|
||||
// @property
|
||||
// def parsed_url(self):
|
||||
// if self._parsed_url is None:
|
||||
// self._parsed_url = urlparse(self.url)
|
||||
// return self._parsed_url
|
||||
//
|
||||
// @property
|
||||
// def status_code(self):
|
||||
// return self.response.status_code
|
||||
//
|
||||
//
|
||||
// _content = None
|
||||
// @property
|
||||
// def content(self):
|
||||
// """Return the content for a resource. Always returns unicode.
|
||||
//
|
||||
// """
|
||||
// if self._content is None:
|
||||
// # Requests that come in without content-type encoding headers will
|
||||
// # default to iso-8859-1, which could be wrong
|
||||
// if (self.response.encoding and
|
||||
// self.response.encoding.lower() == 'iso-8859-1'):
|
||||
// # Dont send unicode, because it could have been decoded wrong
|
||||
// # by an incorrect content-type guess.
|
||||
// encoding = get_charset_from_html(self.response.content) or 'iso-8859-1'
|
||||
//
|
||||
// if encoding != self.response.encoding:
|
||||
// # First, try to use the encoding we found in the markup
|
||||
// try:
|
||||
// self._content = self.response.content.decode(encoding)
|
||||
// except (LookupError, UnicodeDecodeError):
|
||||
// stats.increment(
|
||||
// 'iris.resource.encoding.encoding_mismatch')
|
||||
// # That encoding might be wrong though, so if it is, use
|
||||
// # the one it reported since they could have the wrong
|
||||
// # one set in the markup. eg. sending the content over
|
||||
// # as iso but declaring it to be utf-8 like gq.com does.
|
||||
// # We may also end up with an invalid encoding type, at
|
||||
// # which point we should also just use the request
|
||||
// # encoding and replace silently.
|
||||
// self._content = self.response.content.decode(
|
||||
// self.response.encoding, 'replace')
|
||||
// else:
|
||||
// # If the encoding guess was right, just use the unicode
|
||||
// self._content = self.response.text
|
||||
//
|
||||
// else:
|
||||
// # Otherwise we trust the encoding
|
||||
// self._content = self.response.text
|
||||
//
|
||||
// return self._content
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def content_type(self):
|
||||
// return self.response.headers.get('content-type', '')
|
||||
//
|
||||
//
|
||||
// @property
|
||||
// def is_html(self):
|
||||
// if 'html' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// # Otherwise, just try parsing it and see if it succeeds
|
||||
// try:
|
||||
// return (self.doc is not None)
|
||||
// except:
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_plaintext(self):
|
||||
// if 'text/plain' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_image(self):
|
||||
// if 'image' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// @property
|
||||
// def is_pdf(self):
|
||||
// if 'pdf' in self.content_type:
|
||||
// return True
|
||||
//
|
||||
// return False
|
||||
//
|
||||
// _lxml_doc = None
|
||||
// @property
|
||||
// def doc(self):
|
||||
// if self._lxml_doc is None:
|
||||
// self._generate_lxml_doc()
|
||||
//
|
||||
// return self._lxml_doc
|
||||
//
|
||||
// _docxp = None
|
||||
// @property
|
||||
// def docxp(self):
|
||||
// """ Generate an XPath Evaluator for this doc. """
|
||||
// if self._docxp is None:
|
||||
// self._docxp = XPathEvaluator(self.doc)
|
||||
//
|
||||
// return self._docxp
|
||||
//
|
||||
// _redocxp = None
|
||||
// @property
|
||||
// def redocxp(self):
|
||||
// """ Generate an XPath Evaluator for this doc, that includes the RE
|
||||
// namespace for regular expression matching.
|
||||
//
|
||||
// """
|
||||
// if self._redocxp is None:
|
||||
// _rens = {'re':'http://exslt.org/regular-expressions'}
|
||||
// self._redocxp = XPathEvaluator(self.doc, namespaces=_rens)
|
||||
//
|
||||
// return self._redocxp
|
||||
//
|
||||
// def _generate_lxml_doc(self):
|
||||
// # First check if we have a text based resource
|
||||
// if (not 'html' in self.content_type and
|
||||
// not 'text' in self.content_type and
|
||||
// not is_text(self.content[:512])):
|
||||
// raise ValueError("Content does not appear to be text.")
|
||||
//
|
||||
//
|
||||
// # Remove useless carriage returns which get parsed as otherwise
|
||||
// content = re.sub(r'(\n\r|\r\n)', '\n', self.content)
|
||||
//
|
||||
// # Dont pass any content encodings into lxml, it is dumb about them
|
||||
// content = strip_content_encodings(content)
|
||||
//
|
||||
// self._lxml_doc = lxml.html.fromstring(content)
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// if len(self._lxml_doc.getchildren()) == 0:
|
||||
// stats.increment('iris.resource.encoding.no_children')
|
||||
// raise ValueError("No children, likely a bad parse.")
|
||||
//
|
||||
//
|
||||
// # Sometimes, lxml (or BeautifulSoup) will wrap the whole document
|
||||
// # in an extra html tag. This screws up a whole bunch of things in
|
||||
// # the parsing process. If this is the case, reset the doc to the
|
||||
// # ACTUAL root of the doc.
|
||||
// # Sample cases:
|
||||
// # * Strange Doctype causing issues: http://bit.ly/IATz0B
|
||||
// # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o
|
||||
// # Also check for a body inside of our internal HTML tag, to determine
|
||||
// # that it's not just a junk HTML tag sibling at the bottom of the
|
||||
// # doc or something.
|
||||
// internal_html_tag = self._lxml_doc.find('html')
|
||||
// if (internal_html_tag is not None and
|
||||
// len(internal_html_tag.xpath('.//body')) > 0):
|
||||
// self._lxml_doc = internal_html_tag
|
||||
//
|
||||
// self._normalize_meta_tags()
|
||||
//
|
||||
// self._lxml_doc.make_links_absolute(self.url)
|
||||
//
|
||||
// # Convert any lazy loaded images into normal images before clean_html
|
||||
// # which will strip all other attributes
|
||||
// self._lxml_doc = convert_lazy_loaded_images(self._lxml_doc)
|
||||
//
|
||||
// # Clean the doc of anything malicious.
|
||||
// self._lxml_doc = cleaner.clean_html(self._lxml_doc)
|
||||
//
|
||||
// # Manually nofollow links so that we don't clobber rel author
|
||||
// # Workaround for https://bugs.launchpad.net/lxml/+bug/971754
|
||||
// for a in self.docxp('//a'):
|
||||
// if a.attrib.get('rel', None):
|
||||
// rel_attribs = set(a.attrib['rel'].split())
|
||||
// rel_attribs.add('nofollow')
|
||||
// a.attrib['rel'] = ' '.join(rel_attribs)
|
||||
// else:
|
||||
// a.attrib['rel'] = 'nofollow'
|
||||
//
|
||||
// # Re-relativize anchor links
|
||||
// anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" %
|
||||
// self.url.replace("'", "%27"))
|
||||
// for link in self.docxp(anchor_link_xpath):
|
||||
// link.attrib['href'] = link.attrib['href'].replace(self.url, '')
|
||||
//
|
||||
//
|
||||
// _attrib_map = None
|
||||
// @property
|
||||
// def attrib_map(self):
|
||||
// """ Create an AttribMap object for fast checking of class/id existence
|
||||
// in the document. Used in association with extract_by_selector.
|
||||
//
|
||||
// """
|
||||
// if self._attrib_map is None:
|
||||
// self._attrib_map = AttribMap(self.doc)
|
||||
//
|
||||
// return self._attrib_map
|
||||
//
|
||||
//
|
||||
// def extract_by_selector(self, selector):
|
||||
// " Shortcut to run extract_by_selector on our doc with our AttribMap. "
|
||||
// return ebs(self.doc, selector, self.attrib_map, self.docxp)
|
||||
//
|
||||
//
|
@ -0,0 +1,58 @@
|
||||
import assert from 'assert'

import Resource from './index'

// Build a minimal response object advertising the given content type.
function responseWithType(contentType) {
  return { headers: { "content-type": contentType } }
}

describe('Resource', () => {
  describe('create(url)', function() {
    this.timeout(3000)

    it('fetches the page and returns a cheerio object', async () => {
      const $ = await Resource.create('http://theconcourse.deadspin.com/1786177057')

      console.log($.html())
    })
  })

  describe('generateDoc({ body, response })', () => {
    it('returns a cheerio object if valid', () => {
      const body = `<div><p>Hi</p></div>`
      const $ = Resource.generateDoc({ body, response: responseWithType("text/html") })

      assert.equal($.html(), body)
    })

    it('throws an error if the content is not text', () => {
      assert.throws(
        () => Resource.generateDoc({ body: '', response: responseWithType("foo") }),
        /content does not appear to be text/i
      )
    })

    it('throws an error if the content has no children', () => {
      assert.throws(
        () => Resource.generateDoc({ body: ``, response: responseWithType("html") }),
        /no children/i
      )
    })
  })
})
|
@ -0,0 +1,36 @@
|
||||
// HTTP headers sent with every fetch.
export const REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
}

// The number of milliseconds to attempt to fetch a resource before timing out.
export const FETCH_TIMEOUT = 10000

// Content types that we do not extract content from
const BAD_CONTENT_TYPES = [
  'audio/mpeg',
  'image/gif',
  'image/jpeg',
  'image/jpg',
]

// Matches exactly one of the types above, case-insensitively.
export const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i')

// Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
export const MAX_CONTENT_LENGTH = 5242880

// Turn the global proxy on or off
// Proxying is not currently enabled in Python source
// so not implementing logic in port.
export const PROXY_DOMAINS = false

const PROXY_URL = 'http://38.98.105.139:33333'

export const REQUESTS_PROXIES = {
  'http': PROXY_URL,
  'https': PROXY_URL,
}

export const DOMAINS_TO_PROXY = [
  'nih.gov',
  'gutenberg.org',
]
|
@ -0,0 +1,17 @@
|
||||
import { TAGS_TO_REMOVE } from './constants'

// Strip unwanted elements (TAGS_TO_REMOVE) and every HTML comment
// from the document. Returns the same cheerio instance.
export default function clean($) {
  $(TAGS_TO_REMOVE).remove()

  return cleanComments($)
}

// Predicate: true when the node is an HTML comment.
function isComment(index, node) {
  return node.type === 'comment'
}

// Drop every comment node anywhere in the document.
function cleanComments($) {
  $.root().find('*').contents().filter(isComment).remove()

  return $
}
|
@ -0,0 +1,27 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import clean from './clean'

// Clean a markup snippet and return the resulting HTML.
function cleaned(html) {
  return clean(cheerio.load(html)).html()
}

describe('clean($)', () => {
  it('removes script elements', () => {
    assert.equal(cleaned(`<div><script>alert('hi')</script></div>`), '<div></div>')
  })

  it('removes style elements', () => {
    assert.equal(cleaned(`<div><style>foo: {color: red;}</style></div>`), '<div></div>')
  })

  it('removes comments', () => {
    assert.equal(cleaned(`<div>HI <!-- This is a comment --></div>`), '<div>HI </div>')
  })
})
|
@ -0,0 +1,8 @@
|
||||
export const IS_LINK = new RegExp('https?://', 'i')
// NOTE: inside a string literal '\.' collapses to '.', which matches ANY
// character — the original regex therefore treated strings like "foopng"
// as image urls. The dot must be double-escaped.
export const IS_IMAGE = new RegExp('\\.(png|gif|jpe?g)', 'i')

// Selector list of elements removed outright during cleaning.
export const TAGS_TO_REMOVE = [
  'script',
  'style',
  'form',
].join(',')
|
@ -0,0 +1,26 @@
|
||||
import 'babel-polyfill'

import {
  IS_LINK,
  IS_IMAGE,
} from './constants'

// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
export default function convertLazyLoadedImages($) {
  $('img').each((_, img) => {
    for (const attrName of Reflect.ownKeys(img.attribs)) {
      if (attrName === 'src') continue

      const candidate = img.attribs[attrName]

      // Any non-src attribute holding an absolute image url is treated
      // as the real source.
      if (IS_LINK.test(candidate) && IS_IMAGE.test(candidate)) {
        $(img).attr('src', candidate)
      }
    }
  })

  return $
}
|
@ -0,0 +1,44 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import convertLazyLoadedImages from './convert-lazy-loaded-images'

// Run the converter over a snippet and return the resulting markup.
function convert(html) {
  return convertLazyLoadedImages(cheerio.load(html)).html()
}

describe('convertLazyLoadedImages($)', () => {
  it('moves image links to src if placed in another attribute', () => {
    assert.equal(
      convert(`<img data-src="http://example.com/foo.jpg">`),
      `<img data-src="http://example.com/foo.jpg" src="http://example.com/foo.jpg">`
    )
  })

  it('does nothing when value is not a link', () => {
    // This is far from perfect, since a relative url could
    // be perfectly correct.
    assert.equal(convert(`<img data-src="foo.jpg">`), `<img data-src="foo.jpg">`)
  })

  it('does nothing when value is not an image', () => {
    assert.equal(convert(`<img data-src="http://example.com">`), `<img data-src="http://example.com">`)
  })

  it('does not change a correct img with src', () => {
    assert.equal(convert(`<img src="http://example.com/foo.jpg">`), `<img src="http://example.com/foo.jpg">`)
  })
})
|
@ -0,0 +1,4 @@
|
||||
export { default as normalizeMetaTags } from './normalize-meta-tags'
|
||||
export { default as makeLinksAbsolute } from './make-links-absolute'
|
||||
export { default as convertLazyLoadedImages } from './convert-lazy-loaded-images'
|
||||
export { default as clean } from './clean'
|
@ -0,0 +1,13 @@
|
||||
import URL from 'url'

// Rewrite every href and src attribute in the document to a fully
// qualified URL, resolved against the page's own url.
export default function makeLinksAbsolute($, url) {
  for (const attr of ['href', 'src']) {
    absolutize($, url, attr)
  }

  return $
}

// Resolve the given attribute against `url` on every element carrying it.
function absolutize($, url, attr) {
  $(`[${attr}]`).each((_, node) => {
    const current = $(node).attr(attr)
    $(node).attr(attr, URL.resolve(url, current))
  })
}
|
@ -0,0 +1,42 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import makeLinksAbsolute from './make-links-absolute'

// Apply makeLinksAbsolute to a snippet and return the resulting markup.
function absolutized(html, url) {
  return makeLinksAbsolute(cheerio.load(html), url).html()
}

describe('makeLinksAbsolute($)', () => {
  it('makes relative #hrefs absolute', () => {
    assert.equal(
      absolutized(`<a href="#foo">bar</a>`, 'http://example.com'),
      `<a href="http://example.com/#foo">bar</a>`
    )
  })

  it('makes relative ./relative paths absolute', () => {
    assert.equal(
      absolutized(`<a href="foo/bar">bar</a>`, 'http://example.com/baz/bat'),
      `<a href="http://example.com/baz/foo/bar">bar</a>`
    )
  })

  it('makes relative /root/paths absolute', () => {
    assert.equal(
      absolutized(`<a href="/foo/bar">bar</a>`, 'http://example.com/baz/bat'),
      `<a href="http://example.com/foo/bar">bar</a>`
    )
  })

  it('makes relative srcs absolute', () => {
    assert.equal(
      absolutized(`<img src="#foo">`, 'http://example.com'),
      `<img src="http://example.com/#foo">`
    )
  })
})
|
@ -0,0 +1,24 @@
|
||||
// For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
export default function normalizeMetaTags($) {
  $ = convertMetaProp($, 'content', 'value')
  return convertMetaProp($, 'property', 'name')
}

// Copy the `from` attribute of every matching meta tag into `to`,
// then drop the original attribute.
function convertMetaProp($, from, to) {
  $(`meta[${from}]`).each((_, node) => {
    const $node = $(node)

    $node.attr(to, $node.attr(from))
    $node.removeAttr(from)
  })

  return $
}
|
@ -0,0 +1,28 @@
|
||||
import assert from 'assert'
import cheerio from 'cheerio'

import normalizeMetaTags from './normalize-meta-tags'

// Normalize a snippet's meta tags and return the resulting markup.
function normalized(html) {
  return normalizeMetaTags(cheerio.load(html)).html()
}

describe('normalizeMetaTags($)', () => {
  it('replaces "content" attributes with "value"', () => {
    assert.equal(
      normalized(`<html><meta name="foo" content="bar"></html>`),
      `<html><meta name="foo" value="bar"></html>`
    )
  })

  it('replaces "property" attributes with "name"', () => {
    assert.equal(
      normalized(`<html><meta property="foo" value="bar"></html>`),
      `<html><meta value="bar" name="foo"></html>`
    )
  })
})
|
@ -0,0 +1,96 @@
|
||||
import 'babel-polyfill'

import URL from 'url'
import request from 'request'

import {
  REQUEST_HEADERS,
  FETCH_TIMEOUT,
  BAD_CONTENT_TYPES_RE,
  MAX_CONTENT_LENGTH,
} from './constants'

// Set our response attribute to the result of fetching our URL.
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.

// Fetch the resource at `url` and return `{ body, response }`.
// Rejects if the request itself fails or if the response fails
// validation (bad status, disallowed content type, oversized body).
export default async function fetchResource(url) {
  const parsedUrl = URL.parse(url)

  const options = {
    url: parsedUrl,
    headers: REQUEST_HEADERS,
    timeout: FETCH_TIMEOUT,
  }

  const { response, body } = await get(options)

  // The original wrapped this in try/catch and `return e`, which
  // resolved the promise with the Error object as if it were a result —
  // callers then destructured it and crashed confusingly. Let validation
  // errors propagate so the promise rejects properly.
  validateResponse(response)
  return { body, response }
}
|
||||
|
||||
// Promisified wrapper around request(); resolves with { body, response }
// on success and rejects with the transport error otherwise.
function get(options) {
  return new Promise((resolve, reject) => {
    request(options, (err, response, body) => {
      if (err) {
        reject(err)
        return
      }

      resolve({ body, response })
    })
  })
}
|
||||
|
||||
// Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
//
// :param response: The response object returned by the transport.
// :param parseNon2xx: If true, allow non-2xx responses through.
// :returns: true when the response is worth processing.
// :throws: Error when the fetch failed, the status is unacceptable,
//          the content type is disallowed, or the body is too large.
export function validateResponse(response, parseNon2xx=false) {
  // No status code at all means the fetch itself never completed.
  if (!response.statusCode) {
    throw new Error(
      `Unable to fetch content. Original exception was ${response.error}`
    )
  }

  // Gate on the numeric status code rather than the reason phrase: the
  // original compared statusMessage against "OK", which wrongly rejected
  // valid 2xx responses such as 201/204/206 (and HTTP/2 responses, which
  // carry no reason phrase at all).
  const is2xx = response.statusCode >= 200 && response.statusCode < 300
  if (!is2xx && !parseNon2xx) {
    throw new Error(
      `Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.`
    )
  }

  const {
    "content-type": contentType,
    "content-length": contentLength
  } = response.headers

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error(
      `Content-type for this resource was ${contentType} and is not allowed.`
    )
  }

  // Check that the content length is below maximum
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error(
      `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`
    )
  }

  return true
}
|
||||
|
||||
// Grabs the last two pieces of the URL and joins them back together
// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'
export function baseDomain({ host }) {
  const pieces = host.split('.')

  return pieces.slice(-2).join('.')
}
|
@ -0,0 +1,110 @@
|
||||
import assert from 'assert'
import URL from 'url'

import {
  default as fetchResource,
  baseDomain,
  validateResponse,
} from './fetch-resource'
import { MAX_CONTENT_LENGTH } from './constants'

// Build a well-formed 200 response whose headers can be overridden per test.
function okResponse(headerOverrides) {
  return {
    statusMessage: "OK",
    statusCode: 200,
    headers: Object.assign({
      "content-type": 'text/html',
      "content-length": 500,
    }, headerOverrides),
  }
}

describe('fetchResource(url)', () => {
  it('fetches domains', async () => {
    const url = 'http://theconcourse.deadspin.com/1786177057'
    const { body, response } = await fetchResource(url)

    assert.equal(typeof body, 'string')
  })
})

describe('validateResponse(response)', () => {
  it('validates a response object', () => {
    assert.equal(validateResponse(okResponse()), true)
  })

  it('throws an error if there is no status code', () => {
    assert.throws(
      () => validateResponse({}),
      /unable to fetch content/i
    )
  })

  it('throws an error if response code is not 2xx', () => {
    assert.throws(
      () => validateResponse({ statusCode: 500 }),
      /instructed to reject non-2xx/i
    )
  })

  it('throws an error if response has bad content-type', () => {
    assert.throws(
      () => validateResponse(okResponse({ "content-type": 'image/gif' })),
      /content-type for this resource/i
    )
  })

  it('throws an error if response length is > max', () => {
    assert.throws(
      () => validateResponse(okResponse({ "content-length": MAX_CONTENT_LENGTH + 1 })),
      /Content for this resource was too large/i
    )
  })
})

describe('baseDomain(parsedUrl)', () => {
  it('returns the base domain, excluding subdomain', () => {
    const parsedUrl = URL.parse('https://www.npmjs.com/package/request#streaming')

    assert.equal(baseDomain(parsedUrl), 'npmjs.com')
  })

  it('returns the base domain as is if no subdomain', () => {
    const parsedUrl = URL.parse('https://npmjs.com/package/request#streaming')

    assert.equal(baseDomain(parsedUrl), 'npmjs.com')
  })
})
|
@ -0,0 +1 @@
|
||||
export { default as fetchResource } from './fetch-resource'
|
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Run the mocha test suite over every *.test.js file under src/.

TEST_FILES=$(find src -name "*.test.js")

mocha --compilers js:babel-register $TEST_FILES --require babel-polyfill
|
Loading…
Reference in New Issue