@@ -38,6 +38,7 @@ var _defineProperty = _interopDefault(
var _parseFloat = _interopDefault (
require ( '@babel/runtime-corejs2/core-js/parse-float' )
) ;
var _Set = _interopDefault ( require ( '@babel/runtime-corejs2/core-js/set' ) ) ;
var _typeof = _interopDefault ( require ( '@babel/runtime-corejs2/helpers/typeof' ) ) ;
var _getIterator = _interopDefault (
require ( '@babel/runtime-corejs2/core-js/get-iterator' )
@@ -391,20 +392,28 @@ function _fetchResource() {
switch ( ( _context . prev = _context . next ) ) {
case 0 :
parsedUrl = parsedUrl || URL . parse ( encodeURI ( url ) ) ;
options = {
url : parsedUrl . href ,
headers : _objectSpread ( { } , REQUEST_HEADERS ) ,
timeout : FETCH_TIMEOUT ,
// Accept cookies
jar : true ,
// Set to null so the response returns as binary and body as buffer
// https://github.com/request/request#requestoptions-callback
encoding : null ,
// Accept and decode gzip
gzip : true ,
// Follow any redirect
followAllRedirects : true ,
} ;
options = _objectSpread (
{
url : parsedUrl . href ,
headers : _objectSpread ( { } , REQUEST_HEADERS ) ,
timeout : FETCH_TIMEOUT ,
// Accept cookies
jar : true ,
// Set to null so the response returns as binary and body as buffer
// https://github.com/request/request#requestoptions-callback
encoding : null ,
// Accept and decode gzip
gzip : true ,
// Follow any non-GET redirects
followAllRedirects : true ,
} ,
typeof window !== 'undefined'
? { }
: {
// Follow GET redirects; this option is for Node only
followRedirect : true ,
}
) ;
_context . next = 4 ;
return get ( options ) ;
@@ -803,8 +812,7 @@ function brsToPs$$1($) {
collapsing = true ;
$element . remove ( ) ;
} else if ( collapsing ) {
collapsing = false ; // $(element).replaceWith('<p />')
collapsing = false ;
paragraphize ( element , $ , true ) ;
}
} ) ;
@@ -899,7 +907,7 @@ function convertNodeTo$$1($node, $) {
return $ ;
}
var attrs = getAttrs ( node ) || { } ; // console.log(attrs)
var attrs = getAttrs ( node ) || { } ;
var attribString = _Reflect$ownKeys ( attrs )
. map ( function ( key ) {
@@ -1039,12 +1047,7 @@ function removeAllButWhitelist($article, $) {
$ ( '.' . concat ( KEEP_CLASS ) , $article ) . removeClass ( KEEP_CLASS ) ;
return $article ;
} // function removeAttrs(article, $) {
// REMOVE_ATTRS.forEach((attr) => {
// $(`[${attr}]`, article).removeAttr(attr);
// });
// }
// Remove attributes like style or align
} // Remove attributes like style or align
function cleanAttributes$$1 ( $article , $ ) {
// Grabbing the parent because at this point
@@ -1709,13 +1712,43 @@ function rewriteTopLevel$$1(article, $) {
}
function absolutize ( $ , rootUrl , attr , $content ) {
var baseUrl = $ ( 'base' ) . attr ( 'href' ) ;
$ ( '[' . concat ( attr , ']' ) , $content ) . each ( function ( _ , node ) {
var attrs = getAttrs ( node ) ;
var url = attrs [ attr ] ;
var absoluteUrl = URL . resolve ( baseUrl || rootUrl , url ) ;
setAttr ( node , attr , absoluteUrl ) ;
} ) ;
}
if ( url ) {
var absoluteUrl = URL . resolve ( rootUrl , url ) ;
setAttr ( node , attr , absoluteUrl ) ;
function absolutizeSet ( $ , rootUrl , $content ) {
$ ( '[srcset]' , $content ) . each ( function ( _ , node ) {
var attrs = getAttrs ( node ) ;
var urlSet = attrs . srcset ;
if ( urlSet ) {
// a comma should be considered part of the candidate URL unless preceded by a descriptor
// descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
// space characters inside the URL should be encoded (%20 or +)
var candidates = urlSet . match (
/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g
) ;
var absoluteCandidates = candidates . map ( function ( candidate ) {
// a candidate URL cannot start or end with a comma
// descriptors are separated from the URLs by unescaped whitespace
var parts = candidate
. trim ( )
. replace ( /,$/ , '' )
. split ( /\s+/ ) ;
parts [ 0 ] = URL . resolve ( rootUrl , parts [ 0 ] ) ;
return parts . join ( ' ' ) ;
} ) ;
var absoluteUrlSet = _toConsumableArray (
new _Set ( absoluteCandidates )
) . join ( ', ' ) ;
setAttr ( node , 'srcset' , absoluteUrlSet ) ;
}
} ) ;
}
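// Illustrative sketch only (hypothetical values, not lines from the bundle):
// how the candidate regex above splits and absolutizes a srcset value,
// assuming Node's built-in `url` module.
//   var URL = require('url');
//   var rootUrl = 'https://example.com/post/';
//   var urlSet = '/img/a.jpg 1x, /img/b.jpg 2x';
//   var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
//   // candidates -> ['/img/a.jpg 1x, ', '/img/b.jpg 2x']
//   var absolute = candidates.map(function(candidate) {
//     var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
//     parts[0] = URL.resolve(rootUrl, parts[0]);
//     return parts.join(' ');
//   });
//   // absolute -> ['https://example.com/img/a.jpg 1x', 'https://example.com/img/b.jpg 2x']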
@@ -1724,6 +1757,7 @@ function makeLinksAbsolute$$1($content, $, url) {
[ 'href' , 'src' ] . forEach ( function ( attr ) {
return absolutize ( $ , url , attr , $content ) ;
} ) ;
absolutizeSet ( $ , url , $content ) ;
return $content ;
}
@@ -2027,12 +2061,14 @@ var Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
create : function create ( url , preparedResponse , parsedUrl ) {
var _this = this ;
return _asyncToGenerator (
create : ( function ( ) {
var _create = _asyncToGenerator (
/*#__PURE__*/
_regeneratorRuntime . mark ( function _callee ( ) {
_regeneratorRuntime . mark ( function _callee (
url ,
preparedResponse ,
parsedUrl
) {
var result , validResponse ;
return _regeneratorRuntime . wrap (
function _callee$ ( _context ) {
@@ -2076,7 +2112,7 @@ var Resource = {
return _context . abrupt ( 'return' , result ) ;
case 11 :
return _context . abrupt ( 'return' , _this. generateDoc ( result ) ) ;
return _context . abrupt ( 'return' , this. generateDoc ( result ) ) ;
case 12 :
case 'end' :
@@ -2088,8 +2124,14 @@ var Resource = {
this
) ;
} )
) ( ) ;
} ,
) ;
function create ( _x , _x2 , _x3 ) {
return _create . apply ( this , arguments ) ;
}
return create ;
} ) ( ) ,
generateDoc : function generateDoc ( _ref ) {
var content = _ref . body ,
response = _ref . response ;
@@ -2301,16 +2343,7 @@ var NYTimesExtractor = {
selectors : [ 'div.g-blocks' , 'article#story' ] ,
transforms : {
'img.g-lazy' : function imgGLazy ( $node ) {
var src = $node . attr ( 'src' ) ; // const widths = $node.attr('data-widths')
// .slice(1)
// .slice(0, -1)
// .split(',');
// if (widths.length) {
// width = widths.slice(-1);
// } else {
// width = '900';
// }
var src = $node . attr ( 'src' ) ;
var width = 640 ;
src = src . replace ( '{{size}}' , width ) ;
$node . attr ( 'src' , src ) ;
@@ -2944,10 +2977,10 @@ var WwwWashingtonpostComExtractor = {
selectors : [ 'h1' , '#topper-headline-wrapper' ] ,
} ,
author : {
selectors : [ '.pb-byline'] ,
selectors : [ '.pb-author-name'] ,
} ,
date_published : {
selectors : [ [ '.pb-timestamp[itemprop="datePublished"]', 'content' ] ] ,
selectors : [ [ '.author-timestamp[itemprop="datePublished"]', 'content' ] ] ,
} ,
dek : {
selectors : [ ] ,
@@ -3002,12 +3035,7 @@ var WwwHuffingtonpostComExtractor = {
defaultCleaner : false ,
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms : {
// 'div.top-media': ($node) => {
// const $figure = $node.children('figure');
// $node.replaceWith($figure);
// },
} ,
transforms : { } ,
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
@@ -5065,10 +5093,7 @@ var WwwProspectmagazineCoUkExtractor = {
selectors : [ [ 'meta[name="og:image"]' , 'value' ] ] ,
} ,
content : {
selectors : [
// ['article.type-post div.post_content p'],
'article .post_content' ,
] ,
selectors : [ 'article .post_content' ] ,
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms : { } ,
@@ -5290,6 +5315,60 @@ var IciRadioCanadaCaExtractor = {
} ,
} ;
var WwwFortinetComExtractor = {
domain : 'www.fortinet.com' ,
title : {
selectors : [ 'h1' ] ,
} ,
author : {
selectors : [ '.b15-blog-meta__author' ] ,
} ,
date_published : {
selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] ] ,
} ,
lead_image_url : {
selectors : [ [ 'meta[name="og:image"]' , 'value' ] ] ,
} ,
content : {
selectors : [
'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12' ,
] ,
transforms : {
noscript : function noscript ( $node ) {
var $children = $node . children ( ) ;
if ( $children . length === 1 && $children . get ( 0 ) . tagName === 'img' ) {
return 'figure' ;
}
return null ;
} ,
} ,
} ,
} ;
var WwwFastcompanyComExtractor = {
domain : 'www.fastcompany.com' ,
title : {
selectors : [ 'h1' ] ,
} ,
author : {
selectors : [ '.post__by' ] ,
} ,
date_published : {
selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] ] ,
} ,
dek : {
selectors : [ '.post__deck' ] ,
} ,
lead_image_url : {
selectors : [ [ 'meta[name="og:image"]' , 'value' ] ] ,
} ,
content : {
selectors : [ '.post__article' ] ,
} ,
} ;
var CustomExtractors = /*#__PURE__*/ Object . freeze ( {
BloggerExtractor : BloggerExtractor ,
NYMagExtractor : NYMagExtractor ,
@@ -5382,6 +5461,8 @@ var CustomExtractors = /*#__PURE__*/ Object.freeze({
WwwFoolComExtractor : WwwFoolComExtractor ,
WwwSlateComExtractor : WwwSlateComExtractor ,
IciRadioCanadaCaExtractor : IciRadioCanadaCaExtractor ,
WwwFortinetComExtractor : WwwFortinetComExtractor ,
WwwFastcompanyComExtractor : WwwFastcompanyComExtractor ,
} ) ;
var Extractors = _Object$keys ( CustomExtractors ) . reduce ( function ( acc , key ) {
@@ -5390,8 +5471,7 @@ var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
} , { } ) ;
// CLEAN AUTHOR CONSTANTS
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i ; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i ; // CLEAN DEK CONSTANTS
var TEXT_LINK_RE = new RegExp ( 'http(s)?://' , 'i' ) ; // An ordered list of meta tag names that denote likely article deks.
@@ -5699,10 +5779,6 @@ var Cleaners = {
// Returns a cheerio object $
function extractBestNode ( $ , opts ) {
// clone the node so we can get back to our
// initial parsed state if needed
// TODO Do I need this? – AP
// let $root = $.root().clone()
if ( opts . stripUnlikelyCandidates ) {
$ = stripUnlikelyCandidates ( $ ) ;
}
@@ -5813,10 +5889,7 @@ var GenericContentExtractor = {
return null ;
}
return normalizeSpaces ( $ . html ( node ) ) ; // if return_type == "html":
// return normalize_spaces(node_to_html(node))
// else:
// return node
return normalizeSpaces ( $ . html ( node ) ) ;
} ,
} ;
@@ -5994,12 +6067,10 @@ var GenericAuthorExtractor = {
! ( _iteratorNormalCompletion = ( _step = _iterator . next ( ) ) . done ) ;
_iteratorNormalCompletion = true
) {
var _ref4 = _step . value ;
var _step$value = _slicedToArray ( _step . value , 2 ) ,
selector = _step$value [ 0 ] ,
regex = _step$value [ 1 ] ;
var _ref3 = _slicedToArray ( _ref4 , 2 ) ;
var selector = _ref3 [ 0 ] ;
var regex = _ref3 [ 1 ] ;
var node = $ ( selector ) ;
if ( node . length === 1 ) {
@@ -6078,11 +6149,8 @@ var DATE_PUBLISHED_SELECTORS = [
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)' ;
var DATE_PUBLISHED_URL_RES = [
// /2012/01/27/ but not /2012/01/293
new RegExp ( '/(20\\d{2}/\\d{2}/\\d{2})/' , 'i' ) , // 20120127 or 20120127T but not 2012012733 or 8201201733
// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
// 2012-01-27
new RegExp ( '(20\\d{2}-[01]\\d-[0-3]\\d)' , 'i' ) , // /2012/jan/27/
new RegExp ( '/(20\\d{2}/\\d{2}/\\d{2})/' , 'i' ) ,
new RegExp ( '(20\\d{2}-[01]\\d-[0-3]\\d)' , 'i' ) ,
new RegExp ( '/(20\\d{2}/' . concat ( abbrevMonthsStr , '/[0-3]\\d)/' ) , 'i' ) ,
] ;
@@ -6113,50 +6181,15 @@ var GenericDatePublishedExtractor = {
} ,
} ;
// import {
// DEK_META_TAGS,
// DEK_SELECTORS,
// DEK_URL_RES,
// } from './constants';
// import { cleanDek } from 'cleaners';
// import {
// extractFromMeta,
// extractFromSelectors,
// } from 'utils/dom';
// Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
var GenericDekExtractor = {
// extract({ $, content, metaCache }) {
extract : function extract ( ) {
return null ;
} ,
} ;
// # First, check to see if we have a matching meta tag that we can make
// # use of.
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
// if not dek:
// # Second, look through our CSS/XPath selectors. This may return
// # an HTML fragment.
// dek = self.extract_from_selectors('dek',
// constants.DEK_SELECTORS,
// text_only=False)
//
// if dek:
// # Make sure our dek isn't in the first few thousand characters
// # of the content, otherwise it's just the start of the article
// # and not a true dek.
// content = self.extract_content()
// content_chunk = normalize_spaces(strip_tags(content[:2000]))
// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
//
// # 80% or greater similarity means the dek was very similar to some
// # of the starting content, so we skip it.
// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
// return dek
//
// return None
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
@@ -6443,159 +6476,6 @@ var GenericLeadImageUrlExtractor = {
return null ;
} ,
} ;
// """
// # First, try to find the "best" image via the content.
// # We'd rather not have to fetch each image and check dimensions,
// # so try to do some analysis and determine them instead.
// content = self.extractor.extract_content(return_type="node")
// imgs = content.xpath('.//img')
// img_scores = defaultdict(int)
// logger.debug('Scoring %d images from content', len(imgs))
// for (i, img) in enumerate(imgs):
// img_score = 0
//
// if not 'src' in img.attrib:
// logger.debug('No src attribute found')
// continue
//
// try:
// parsed_img = urlparse(img.attrib['src'])
// img_path = parsed_img.path.lower()
// except ValueError:
// logger.debug('ValueError getting img path.')
// continue
// logger.debug('Image path is %s', img_path)
//
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Positive URL hints match. Adding 20.')
// img_score += 20
//
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Negative URL hints match. Subtracting 20.')
// img_score -= 20
//
// # Gifs are more often structure than photos
// if img_path.endswith('gif'):
// logger.debug('gif found. Subtracting 10.')
// img_score -= 10
//
// # JPGs are more often photographs
// if img_path.endswith('jpg'):
// logger.debug('jpg found. Adding 10.')
// img_score += 10
//
// # PNGs are neutral.
//
// # Alt attribute usually means non-presentational image.
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
// logger.debug('alt attribute found. Adding 5.')
// img_score += 5
//
// # Look through our parent and grandparent for figure-like
// # container elements, give a bonus if we find them
// parents = [img.getparent()]
// if parents[0] is not None and parents[0].getparent() is not None:
// parents.append(parents[0].getparent())
// for p in parents:
// if p.tag == 'figure':
// logger.debug('Parent with <figure> tag found. Adding 25.')
// img_score += 25
//
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
// if constants.PHOTO_HINTS_RE.search(p_sig):
// logger.debug('Photo hints regex match. Adding 15.')
// img_score += 15
//
// # Look at our immediate sibling and see if it looks like it's a
// # caption. Bonus if so.
// sibling = img.getnext()
// if sibling is not None:
// if sibling.tag == 'figcaption':
// img_score += 25
//
// sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower()
// if 'caption' in sib_sig:
// img_score += 15
//
// # Pull out width/height if they were set.
// img_width = None
// img_height = None
// if 'width' in img.attrib:
// try:
// img_width = float(img.get('width'))
// except ValueError:
// pass
// if 'height' in img.attrib:
// try:
// img_height = float(img.get('height'))
// except ValueError:
// pass
//
// # Penalty for skinny images
// if img_width and img_width <= 50:
// logger.debug('Skinny image found. Subtracting 50.')
// img_score -= 50
//
// # Penalty for short images
// if img_height and img_height <= 50:
// # Wide, short images are more common than narrow, tall ones
// logger.debug('Short image found. Subtracting 25.')
// img_score -= 25
//
// if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height
//
// if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100
// else:
// img_score += round(area/1000.0)
//
// # If the image is higher on the page than other images,
// # it gets a bonus. Penalty if lower.
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
// img_score += len(imgs)/2 - i
//
// # Use the raw src here because we munged img_path for case
// # insensitivity
// logger.debug('Final score is %d.', img_score)
// img_scores[img.attrib['src']] += img_score
//
// top_score = 0
// top_url = None
// for (url, score) in img_scores.items():
// if score > top_score:
// top_url = url
// top_score = score
//
// if top_score > 0:
// logger.debug('Using top score image from content. Score was %d', top_score)
// return top_url
//
//
// # If nothing else worked, check to see if there are any really
// # probable nodes in the doc, like <link rel="image_src" />.
// logger.debug('Trying to find lead image in probable nodes')
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
// nodes = self.resource.extract_by_selector(selector)
// for node in nodes:
// clean_value = None
// if node.attrib.get('src'):
// clean_value = self.clean(node.attrib['src'])
//
// if not clean_value and node.attrib.get('href'):
// clean_value = self.clean(node.attrib['href'])
//
// if not clean_value and node.attrib.get('value'):
// clean_value = self.clean(node.attrib['value'])
//
// if clean_value:
// logger.debug('Found lead image in probable nodes.')
// logger.debug('Node was: %s', node)
// return clean_value
//
// return None
function scoreSimilarity ( score , articleUrl , href ) {
// Do this last and only if we have a real candidate, because it's
@@ -7543,13 +7423,12 @@ function _collectAllPages() {
}
var Mercury = {
parse : function parse ( url , html ) {
var opts =
arguments . length > 2 && arguments [ 2 ] !== undefined ? arguments [ 2 ] : { } ;
return _asyncToGenerator (
parse : ( function ( ) {
var _parse = _asyncToGenerator (
/*#__PURE__*/
_regeneratorRuntime . mark ( function _callee ( ) {
var _opts$fetchAllPages ,
_regeneratorRuntime . mark ( function _callee ( url , html ) {
var opts ,
_opts$fetchAllPages ,
fetchAllPages ,
_opts$fallback ,
fallback ,
@ -7560,13 +7439,16 @@ var Mercury = {
result ,
_result ,
title ,
next_page_url ;
next_page_url ,
_args = arguments ;
return _regeneratorRuntime . wrap (
function _callee$ ( _context ) {
while ( 1 ) {
switch ( ( _context . prev = _context . next ) ) {
case 0 :
opts =
_args . length > 2 && _args [ 2 ] !== undefined ? _args [ 2 ] : { } ;
( _opts$fetchAllPages = opts . fetchAllPages ) ,
( fetchAllPages =
_opts$fetchAllPages === void 0
@@ -7587,29 +7469,29 @@ var Mercury = {
parsedUrl = URL . parse ( url ) ;
if ( validateUrl ( parsedUrl ) ) {
_context . next = 5 ;
_context . next = 6 ;
break ;
}
return _context . abrupt ( 'return' , Errors . badUrl ) ;
case 5 :
_context . next = 7 ;
case 6 :
_context . next = 8 ;
return Resource . create ( url , html , parsedUrl ) ;
case 7 :
case 8 :
$ = _context . sent ;
Extractor = getExtractor ( url , parsedUrl , $ ) ; // console.log(`Using extractor for ${Extractor.domain}`);
// If we found an error creating the resource, return that error
if ( ! $ . failed ) {
_context . next = 11 ;
_context . next = 12 ;
break ;
}
return _context . abrupt ( 'return' , $ ) ;
case 11 :
case 12 :
// if html still has not been set (i.e., url passed to Mercury.parse),
// set html from the response of Resource.create
if ( ! html ) {
@@ -7635,11 +7517,11 @@ var Mercury = {
( next_page_url = _result . next_page_url ) ; // Fetch more pages if next_page_url found
if ( ! ( fetchAllPages && next_page_url ) ) {
_context . next = 21 ;
_context . next = 22 ;
break ;
}
_context . next = 18 ;
_context . next = 19 ;
return collectAllPages ( {
Extractor : Extractor ,
next_page_url : next_page_url ,
@@ -7651,21 +7533,21 @@ var Mercury = {
url : url ,
} ) ;
case 18 :
case 19 :
result = _context . sent ;
_context . next = 22 ;
_context . next = 23 ;
break ;
case 21 :
case 22 :
result = _objectSpread ( { } , result , {
total_pages : 1 ,
rendered_pages : 1 ,
} ) ;
case 22 :
case 23 :
return _context . abrupt ( 'return' , result ) ;
case 23 :
case 24 :
case 'end' :
return _context . stop ( ) ;
}
@@ -7675,8 +7557,14 @@ var Mercury = {
this
) ;
} )
) ( ) ;
} ,
) ;
function parse ( _x , _x2 ) {
return _parse . apply ( this , arguments ) ;
}
return parse ;
} ) ( ) ,
browser : ! ! cheerio . browser ,
// A convenience method for getting a resource
// to work with, e.g., for custom extractor generator