// Source: mercury-parser/dist/mercury.js (bundled build, ~8226 lines, ~245 KiB, JavaScript).
// NOTE(review): repository-page chrome from the web scrape was converted to this
// comment so the file remains valid JavaScript.
'use strict';
// Unwrap a CommonJS/ES-module interop object: if the export carries a
// `default` property, return that; otherwise return the export as-is.
function _interopDefault (ex) {
  if (ex && typeof ex === 'object' && 'default' in ex) {
    return ex['default'];
  }
  return ex;
}
var _regeneratorRuntime = _interopDefault(require('@babel/runtime-corejs2/regenerator'));
var _objectSpread = _interopDefault(require('@babel/runtime-corejs2/helpers/objectSpread'));
var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/helpers/objectWithoutProperties'));
var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
var TurndownService = _interopDefault(require('turndown'));
var iconv = _interopDefault(require('iconv-lite'));
var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
var _Promise = _interopDefault(require('@babel/runtime-corejs2/core-js/promise'));
var request = _interopDefault(require('postman-request'));
var _Reflect$ownKeys = _interopDefault(require('@babel/runtime-corejs2/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('@babel/runtime-corejs2/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('@babel/runtime-corejs2/helpers/defineProperty'));
var _parseFloat = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-float'));
var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
var _Object$assign = _interopDefault(require('@babel/runtime-corejs2/core-js/object/assign'));
var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
var stringDirection = _interopDefault(require('string-direction'));
var validUrl = _interopDefault(require('valid-url'));
var moment = _interopDefault(require('moment-timezone'));
var parseFormat = _interopDefault(require('moment-parseformat'));
var wuzzy = _interopDefault(require('wuzzy'));
var difflib = _interopDefault(require('difflib'));
var _Array$from = _interopDefault(require('@babel/runtime-corejs2/core-js/array/from'));
var ellipsize = _interopDefault(require('ellipsize'));
var _Array$isArray = _interopDefault(require('@babel/runtime-corejs2/core-js/array/is-array'));
// Collapse runs of 2+ whitespace characters to a single space, except inside
// <pre>, <code> and <textarea> blocks (guarded by the negative lookahead),
// then trim leading/trailing whitespace.
var NORMALIZE_RE = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;
function normalizeSpaces(text) {
  var collapsed = text.replace(NORMALIZE_RE, ' ');
  return collapsed.trim();
}
// Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
//
// Fixed: the previous version scanned each URL twice (re.test via .find,
// then re.exec). Besides the redundant scan, a regex carrying the /g or /y
// flag keeps state in lastIndex, so the second pass could start mid-string
// and return the wrong group (or throw on a null exec result). Each regex
// is now executed exactly once, with lastIndex reset first.
function extractFromUrl(url, regexList) {
  for (var i = 0; i < regexList.length; i += 1) {
    var re = regexList[i];
    re.lastIndex = 0; // neutralize any stateful /g or /y flag
    var match = re.exec(url);
    if (match) {
      return match[1];
    }
  }
  return null;
}
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// NOTE(review): the regex itself captures up to three digits (so "pg=102"
// does match here); the "< 100" cutoff the comment implies is enforced
// downstream in pageNumFromUrl. "page:2" genuinely fails (needs '=' or '/').
// Group 6 holds the digits and is what pageNumFromUrl reads.
var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
// True when the string contains at least one ASCII letter.
var HAS_ALPHA_RE = /[a-z]/i;
// True when the string is ASCII letters only.
var IS_ALPHA_RE = /^[a-z]+$/i;
// True when the string is digits only.
var IS_DIGIT_RE = /^[0-9]+$/i;
// Pulls the charset name out of a Content-Type-style string.
var ENCODING_RE = /charset=([\w-]+)\b/;
// Fallback charset when none is declared or the declared one is unknown.
var DEFAULT_ENCODING = 'utf-8';
// Extract a page number from a URL via PAGE_IN_HREF_RE (capture group 6).
// Returns the number when it is below 100, otherwise null (large values
// are assumed not to be page numbers).
function pageNumFromUrl(url) {
  var found = PAGE_IN_HREF_RE.exec(url);
  if (found === null) {
    return null;
  }
  var num = _parseInt(found[6], 10);
  if (num < 100) {
    return num;
  }
  return null;
}
// Strip the #fragment (if any) and a single trailing slash from a URL.
function removeAnchor(url) {
  var withoutFragment = url.split('#')[0];
  return withoutFragment.replace(/\/$/, '');
}
// Decide whether a URL path segment should be kept when rebuilding the
// article base URL. `index` counts from the END of the path (segments are
// processed reversed by articleBaseUrl).
function isGoodSegment(segment, index, firstSegmentHasLetters) {
  var goodSegment = true; // If this is purely a number, and it's the first or second
  // url_segment, it's probably a page number. Remove it.
  // NOTE(review): despite the comment saying "Remove it", this branch sets
  // goodSegment = true (a no-op); actual removal of short numeric segments
  // happens in the third branch below, and only when the first segment had
  // no letters. Preserved as-is — changing it would alter URL cleaning.
  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
    goodSegment = true;
  } // If this is the first url_segment and it's just "index",
  // remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    goodSegment = false;
  } // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    goodSegment = false;
  }
  return goodSegment;
} // Take a URL, and return the article base of said URL. That is, no
// (continued) Take a URL and return its article base: the same URL with no
// pagination data in it. Useful for comparing to other links
// that might have pagination data within them.
//
// :param url: the URL string to clean
// :param parsed: optional pre-parsed result of URL.parse(url)
function articleBaseUrl(url, parsed) {
  var parsedUrl = parsed || URL.parse(url);
  var protocol = parsedUrl.protocol,
      host = parsedUrl.host,
      path = parsedUrl.path;
  var firstSegmentHasLetters = false;
  // Walk the path segments from last to first so that index 0 is the
  // trailing segment (where page numbers usually live).
  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
    var segment = rawSegment; // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var _segment$split = segment.split('.'),
          _segment$split2 = _slicedToArray(_segment$split, 2),
          possibleSegment = _segment$split2[0],
          fileExt = _segment$split2[1];
      // Only treat it as an extension when the part after the dot is
      // purely alphabetic (e.g. ".html", not ".2").
      if (IS_ALPHA_RE.test(fileExt)) {
        segment = possibleSegment;
      }
    } // If our first or second segment has anything looking like a page
    // number, remove it.
    if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
      segment = segment.replace(PAGE_IN_HREF_RE, '');
    } // If we're on the first segment, check to see if we have any
    // characters in it. The first segment is actually the last bit of
    // the URL, and this will be helpful to determine if we're on a URL
    // segment that looks like "/2/" for example.
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
    } // If it's not marked for deletion, push it to cleaned_segments.
    if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
      acc.push(segment);
    }
    return acc;
  }, []);
  // Re-reverse to restore original order before joining back into a URL.
  return "".concat(protocol, "//").concat(host).concat(cleanedSegments.reverse().join('/'));
}
// Given a string, return true if it appears to have an ending sentence
// within it — a period followed by a space or end-of-string — false otherwise.
//
// Fixed: the pattern was built as new RegExp('.( |$)'), where the unescaped
// '.' matches ANY character, so the test was true for virtually any text
// containing a non-terminal character. The period is now escaped, matching
// the documented intent.
var SENTENCE_END_RE = /\.( |$)/;
function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}
// Return the first `words` whitespace-separated tokens of `content`,
// joined by single spaces. Defaults to 10 words.
function excerptContent(content) {
  var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  var tokens = content.trim().split(/\s+/);
  var kept = tokens.slice(0, words);
  return kept.join(' ');
}
// used in our fetchResource function to
// ensure correctly encoded responses.
// Accepts either a raw charset name or a Content-Type-style string
// ("text/html; charset=iso-8859-1"); returns a charset iconv knows about,
// falling back to DEFAULT_ENCODING.
function getEncoding(str) {
  var candidate = str;
  var match = ENCODING_RE.exec(candidate);
  if (match !== null) {
    candidate = match[1];
  }
  return iconv.encodingExists(candidate) ? candidate : DEFAULT_ENCODING;
}
// Default request headers. In the browser build (cheerio.browser) the UA
// cannot be overridden, so send none; in Node, spoof a desktop Chrome UA.
var REQUEST_HEADERS = cheerio.browser ? {} : {
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}; // The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000; // Content types that we do not extract content from
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];
// Anchored, case-insensitive form of the list above.
var BAD_CONTENT_TYPES_RE = new RegExp("^(".concat(BAD_CONTENT_TYPES.join('|'), ")$"), 'i'); // Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
// NOTE(review): the original trailing comment here ("Turn the global proxy
// on or off") described code not present in this build.
var MAX_CONTENT_LENGTH = 5242880;
// Promisified wrapper around the callback-style `request` client.
// Resolves with { body, response }; rejects with the transport error.
function get(options) {
  return new _Promise(function (resolve, reject) {
    request(options, function (err, response, body) {
      if (err) {
        reject(err);
        return;
      }
      resolve({ body: body, response: response });
    });
  });
} // Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 or not.
// Validation here means that we haven't found reason to bail from
// further processing of this url.
//
// :param response: the HTTP response object from `get`
// :param parseNon200: when true, tolerate non-200 status codes
// :returns true when the response is worth processing
// :throws Error when the status, content type, or size disqualifies it
function validateResponse(response) {
  var parseNon200 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
  // Check if we got a valid status code
  // This isn't great, but I'm requiring a statusMessage to be set
  // before short circuiting b/c nock doesn't set it in tests
  // statusMessage only not set in nock response, in which case
  // I check statusCode, which is currently only 200 for OK responses
  // in tests
  // Precedence note: reads as (statusMessage set AND not 'OK') OR (code !== 200).
  if (response.statusMessage && response.statusMessage !== 'OK' || response.statusCode !== 200) {
    if (!response.statusCode) {
      throw new Error("Unable to fetch content. Original exception was ".concat(response.error));
    } else if (!parseNon200) {
      throw new Error("Resource returned a response status code of ".concat(response.statusCode, " and resource was instructed to reject non-200 status codes."));
    }
  }
  var _response$headers = response.headers,
      contentType = _response$headers['content-type'],
      contentLength = _response$headers['content-length']; // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error("Content-type for this resource was ".concat(contentType, " and is not allowed."));
  } // Check that the content length is below maximum
  // (header value is a string; JS coerces it numerically for the comparison)
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error("Content for this resource was too large. Maximum content length is ".concat(MAX_CONTENT_LENGTH, "."));
  }
  return true;
}
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
//
// Public async entry point; Babel trampoline that forwards all arguments
// (url, parsedUrl, and an optional third headers argument) to the
// generator-backed _fetchResource below.
function fetchResource(_x, _x2) {
  return _fetchResource.apply(this, arguments);
}
// Regenerator-compiled async implementation of fetchResource. Builds request
// options, performs the request via `get`, validates the response, and
// returns either { body, response } or { error: true, message } — it never
// throws to the caller; validation failures are caught and reported.
function _fetchResource() {
  _fetchResource = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(url, parsedUrl) {
    var headers,
        options,
        _ref2,
        response,
        body,
        _args = arguments;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            // Optional third argument: extra request headers (defaults to {}).
            headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
            parsedUrl = parsedUrl || URL.parse(encodeURI(url));
            options = _objectSpread({
              url: parsedUrl.href,
              headers: _objectSpread({}, REQUEST_HEADERS, headers),
              timeout: FETCH_TIMEOUT,
              // Accept cookies
              jar: true,
              // Set to null so the response returns as binary and body as buffer
              // https://github.com/request/request#requestoptions-callback
              encoding: null,
              // Accept and decode gzip
              gzip: true,
              // Follow any non-GET redirects
              followAllRedirects: true
            }, typeof window !== 'undefined' ? {} : {
              // Follow GET redirects; this option is for Node only
              followRedirect: true
            });
            _context.next = 5;
            return get(options);
          case 5:
            _ref2 = _context.sent;
            response = _ref2.response;
            body = _ref2.body;
            // try/catch region [8, 13]: validation errors become a soft
            // { error, message } return instead of a rejection.
            _context.prev = 8;
            validateResponse(response);
            return _context.abrupt("return", {
              body: body,
              response: response
            });
          case 13:
            _context.prev = 13;
            _context.t0 = _context["catch"](8);
            return _context.abrupt("return", {
              error: true,
              message: _context.t0.message
            });
          case 16:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this, [[8, 13]]);
  }));
  return _fetchResource.apply(this, arguments);
}
// For every <meta> tag carrying attribute `from`, copy its value onto
// attribute `to` and drop `from`. Mutates $ in place; returned for chaining.
function convertMetaProp($, from, to) {
  $("meta[".concat(from, "]")).each(function (_, node) {
    var $meta = $(node);
    $meta.attr(to, $meta.attr(from));
    $meta.removeAttr(from);
  });
  return $;
} // For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
function normalizeMetaTags($) {
$ = convertMetaProp($, 'content', 'value');
$ = convertMetaProp($, 'property', 'name');
return $;
}
// Spacer images to be removed
var SPACER_RE = new RegExp('transparent|spacer|blank', 'i'); // The class we will use to mark elements we want to keep
// but would normally remove
var KEEP_CLASS = 'mercury-parser-keep';
// Embeds (YouTube/Vimeo/Reddit iframes) that should survive cleaning.
var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
// Attributes allowed to remain on output elements; everything else is removed.
var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
// Container tags whose removal is decided by content scoring (cleanConditionally).
var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
var HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];
var HEADER_TAG_LIST = HEADER_TAGS.join(','); // // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_BLACKLIST = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', // 'form', // This is too generic, has too many false positives
'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools']; // A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
var UNLIKELY_CANDIDATES_WHITELIST = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow']; // A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(','); // A list of strings which, found in a node's className or id, suggest it
// is likely article content and should score positively.
// NOTE(review): the comment originally attached here ("tags that should be
// ignored when trying to find the top candidate") was a misplaced build
// artifact describing a different constant.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy']; // The above list, joined into a matching regular expression
var POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // A list of strings which, found in a node's className or id, suggest it
// is NOT article content (chrome, ads, metadata) and should score negatively.
// NOTE(review): the comment originally attached here ("Readability
// publisher-specific guidelines") was a misplaced build artifact.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget']; // The above list, joined into a matching regular expression
var NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // CSS selector used to detect WordPress-generated pages. Not always successful.
var IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'; // Matches pagination-ish words: page, paging, paginat(e/ion)...
// NOTE(review): the "Match a digit" / "next link" comments originally here
// were misplaced build artifacts belonging to other constants.
var PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');
// http://bit.ly/qneNIT
var BLOCK_LEVEL_TAGS = ['article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video'];
var BLOCK_LEVEL_TAGS_RE = new RegExp("^(".concat(BLOCK_LEVEL_TAGS.join('|'), ")$"), 'i'); // The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
var candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
var CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
var candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
var CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
function stripUnlikelyCandidates($) {
  // Loop through the provided document and remove any non-link nodes
  // that are unlikely candidates for article content.
  //
  // Links are ignored because there are very often links to content
  // that are identified as non-body-content, but may be inside
  // article-like content.
  //
  // :param $: a cheerio object to strip nodes from
  // :return $: the cleaned cheerio object
  $('*').not('a').each(function (index, candidate) {
    var $candidate = $(candidate);
    var classAttr = $candidate.attr('class');
    var idAttr = $candidate.attr('id');
    if (!idAttr && !classAttr) {
      return;
    }
    // Test class and id together; whitelist wins over blacklist.
    var haystack = "".concat(classAttr || '', " ").concat(idAttr || '');
    if (!CANDIDATES_WHITELIST.test(haystack) && CANDIDATES_BLACKLIST.test(haystack)) {
      $candidate.remove();
    }
  });
  return $;
}
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
// Given cheerio object, convert consecutive <br /> tags into
// <p /> tags instead. A run of 2+ <br />s is collapsed: all but the last
// are removed, and the last becomes the seed for a paragraph.
//
// :param $: A cheerio object
function brsToPs$$1($) {
  var collapsing = false;
  $('br').each(function (index, element) {
    var $element = $(element);
    var next = $element.next().get(0);
    var nextIsBr = Boolean(next) && next.tagName.toLowerCase() === 'br';
    if (nextIsBr) {
      // Mid-run: drop this <br /> and remember we're collapsing.
      collapsing = true;
      $element.remove();
    } else if (collapsing) {
      // Last <br /> of a run: turn it into a paragraph.
      collapsing = false;
      paragraphize(element, $, true);
    }
  });
  return $;
}
// Given a node, convert it into a <p /> element, and
// make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
function paragraphize(node, $) {
  var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
  var $node = $(node);
  if (br) {
    var sibling = node.nextSibling;
    var p = $('<p></p>'); // while the next node is text or not a block level element
    // append it to a new p node
    while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
      // Capture nextSibling BEFORE appendTo: appending detaches the node
      // and would lose its place in the original sibling chain.
      var _sibling = sibling,
          nextSibling = _sibling.nextSibling;
      $(sibling).appendTo(p);
      sibling = nextSibling;
    }
    $node.replaceWith(p);
    // NOTE(review): remove() after replaceWith() looks redundant but is
    // preserved — presumably a belt-and-braces detach; confirm before removing.
    $node.remove();
    return $;
  }
  // Non-br nodes are returned untouched in this build.
  return $;
}
// Convert <div /> tags with no block-level children into <p /> tags.
function convertDivs($) {
  $('div').each(function (index, div) {
    var $div = $(div);
    var hasBlockChildren = $div.children(DIV_TO_P_BLOCK_TAGS).length > 0;
    if (!hasBlockChildren) {
      convertNodeTo$$1($div, $, 'p');
    }
  });
  return $;
}
// Convert <span /> tags into <p /> tags, but only when they are not already
// inside a paragraph-like container (p, div, li, figcaption).
function convertSpans($) {
  $('span').each(function (index, span) {
    var $span = $(span);
    var insideParagraphLike = $span.parents('p, div, li, figcaption').length > 0;
    if (!insideParagraphLike) {
      convertNodeTo$$1($span, $, 'p');
    }
  });
  return $;
} // Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags who are not children of <p /> or <div /> tags.
//
// :param $: A cheerio object to search
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
// NOTE(review): a git-blame commit annotation ("feat: generator for custom
// parsers...", squashed commits by Adam Pash, Sep 2016) was scraped into the
// source here; converted to a comment so the file stays valid JavaScript.
// Run all three p-conversion passes (brs, divs, spans) over the document.
function convertToParagraphs$$1($) {
  return convertSpans(convertDivs(brsToPs$$1($)));
}
// NOTE(review): another scraped git-blame commit annotation was removed here
// and converted to this comment so the file stays valid JavaScript.
// Convert the given cheerio node into a different tag (default <p />),
// preserving its attributes and inner content.
//
// :param $node: cheerio selection wrapping the node to convert
// :param $: the cheerio instance used for DOM manipulation
// :param tag: optional target tag name; defaults to 'p'
//
// Fixed: git-blame commit annotations had been scraped into the middle of
// this function body, making it invalid JavaScript; they are removed here.
function convertNodeTo$$1($node, $) {
  var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
  var node = $node.get(0);
  if (!node) {
    return $;
  }
  // Rebuild an attribute string (e.g. `src=x class=y`) from the raw node.
  var attrs = getAttrs(node) || {};
  var attribString = _Reflect$ownKeys(attrs).map(function (key) {
    return "".concat(key, "=").concat(attrs[key]);
  }).join(' ');
  var html;
  if ($.browser) {
    // In the browser, the contents of noscript tags aren't rendered, therefore
    // transforms on the noscript tag (commonly used for lazy-loading) don't work
    // as expected. This test case handles that
    html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
  } else {
    html = $node.contents();
  }
  $node.replaceWith("<".concat(tag, " ").concat(attribString, ">").concat(html, "</").concat(tag, ">"));
  return $;
}
// NOTE(review): another scraped git-blame commit annotation was removed here
// and converted to this comment so the file stays valid JavaScript.
// Remove an <img /> whose declared height/width mark it as a shim or icon;
// otherwise strip its height attribute so the image scales by width without
// distorting the aspect ratio.
//
// Fixed: git-blame commit annotations had been scraped into this function
// body, making it invalid JavaScript; they are removed here.
function cleanForHeight($img, $) {
  var height = _parseInt($img.attr('height'), 10);
  var width = _parseInt($img.attr('width'), 10) || 20; // missing width defaults to 20
  // Remove images that explicitly have very small heights or
  // widths, because they are most likely shims or icons,
  // which aren't very useful for reading.
  if ((height || 20) < 10 || width < 10) {
    $img.remove();
  } else if (height) {
    // Don't ever specify a height on images, so that we can
    // scale with respect to width without screwing up the
    // aspect ratio.
    $img.removeAttr('height');
  }
  return $;
} // Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
// Remove an image entirely when its src looks like a spacer/transparent
// tracking asset (matched against SPACER_RE).
function removeSpacers($img, $) {
  var src = $img.attr('src');

  if (SPACER_RE.test(src)) {
    $img.remove();
  }

  return $;
}
// Run the per-image cleaners over every <img> inside the article:
// dimension-based pruning first, then spacer removal.
function cleanImages($article, $) {
  $article.find('img').each(function (index, img) {
    var $image = $(img);

    cleanForHeight($image, $);
    removeSpacers($image, $);
  });

  return $;
}
// Tag elements that must survive later cleaning passes by adding KEEP_CLASS.
// With no explicit `tags` list, KEEP_SELECTORS is used. When a page URL is
// supplied, iframes hosted on the same protocol+host are also preserved.
function markToKeep(article, $, url) {
  var tags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : [];

  if (tags.length === 0) {
    tags = KEEP_SELECTORS;
  }

  if (url) {
    var parsed = URL.parse(url);
    var sameOriginIframe = "iframe[src^=\"" + parsed.protocol + "//" + parsed.hostname + "\"]";
    // Build a fresh list rather than mutating the caller's array.
    tags = tags.concat([sameOriginIframe]);
  }

  $(tags.join(','), article).addClass(KEEP_CLASS);
  return $;
}
// Remove junk elements (scripts, styles, forms, etc.) from the article.
// Defaults to STRIP_OUTPUT_TAGS when no tag list is given. Elements carrying
// the mercury-parser keep class are always spared.
function stripJunkTags(article, $) {
  var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
  var selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS : tags;

  // Remove matching elements, but ignore any element marked to keep.
  $(selectors.join(','), article).not('.' + KEEP_CLASS).remove();

  return $;
}
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
function cleanHOnes$$1(article, $) {
  var $hOnes = $('h1', article);

  if ($hOnes.length < 3) {
    // Few h1s: they duplicate the title/byline, so drop them.
    $hOnes.each(function (_, node) {
      $(node).remove();
    });
  } else {
    // Many h1s: they are section headings, so demote them to h2.
    $hOnes.each(function (_, node) {
      convertNodeTo$$1($(node), $, 'h2');
    });
  }

  return $;
}
// Reduce every element's attributes down to the whitelisted set
// (WHITELIST_ATTRS_RE), then strip the internal keep-marker class.
function removeAllButWhitelist($article, $) {
  $article.find('*').each(function (_, node) {
    var attrs = getAttrs(node);
    var kept = {};

    _Reflect$ownKeys(attrs).forEach(function (attr) {
      if (WHITELIST_ATTRS_RE.test(attr)) {
        kept[attr] = attrs[attr];
      }
    });

    setAttrs(node, kept);
  });

  // Remove the mercury-parser-keep class from result
  $('.' + KEEP_CLASS, $article).removeClass(KEEP_CLASS);

  return $article;
} // Remove attributes like style or align
// Strip non-whitelisted attributes (style, align, …) from the article.
// Operates on the parent when one exists, because at this point $article is
// usually wrapped in a scored <div>.
function cleanAttributes$$1($article, $) {
  var $target = $article.parent().length ? $article.parent() : $article;
  return removeAllButWhitelist($target, $);
}
// Drop paragraphs that contain no text and no embedded media (iframe/img).
function removeEmpty($article, $) {
  $article.find('p').each(function (_, p) {
    var $p = $(p);
    var hasMedia = $p.find('iframe, img').length > 0;

    if (!hasMedia && $p.text().trim() === '') {
      $p.remove();
    }
  });

  return $;
}
// // CONTENT FETCHING CONSTANTS ////
// for a document.
// Tags that may never become the top content candidate: inline, void, or
// metadata elements that cannot themselves hold an article body.
var NON_TOP_CANDIDATE_TAGS$1 = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
var NON_TOP_CANDIDATE_TAGS_RE$1 = new RegExp("^(".concat(NON_TOP_CANDIDATE_TAGS$1.join('|'), ")$"), 'i'); // A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
// NOTE(review): the second pair uses 'entry' with no leading dot (an element
// selector rather than a class selector) — confirm intent before "fixing".
var HNEWS_CONTENT_SELECTORS$1 = [['.hentry', '.entry-content'], ['entry', '.entry-content'], ['.entry', '.entry_content'], ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body']];
// Class/id substrings that hint a node is figure/photo content; such nodes
// get a small scoring bonus (see getWeight).
var PHOTO_HINTS$1 = ['figure', 'photo', 'image', 'caption'];
var PHOTO_HINTS_RE$1 = new RegExp(PHOTO_HINTS$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_HINTS$1 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
'\\Bcopy']; // The above list, joined into a matching regular expression
var POSITIVE_SCORE_RE$1 = new RegExp(POSITIVE_SCORE_HINTS$1.join('|'), 'i'); // Readability publisher-specific guidelines
var READABILITY_ASSET$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_HINTS$1 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
'promo', 'pr_', // autoblog - press release
'related', 'respond', 'roundcontent', // lifehacker restricted content warning
'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget']; // The above list, joined into a matching regular expression
var NEGATIVE_SCORE_RE$1 = new RegExp(NEGATIVE_SCORE_HINTS$1.join('|'), 'i'); // Match a digit. Pretty clear.
// NOTE(review): the "Match a digit" comment above appears stale — no digit
// regex follows; the next constants classify tags for scoring (see scoreNode).
var PARAGRAPH_SCORE_TAGS$1 = new RegExp('^(p|li|span|pre)$', 'i');
var CHILD_CONTENT_TAGS$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
var BAD_TAGS$1 = new RegExp('^(address|form)$', 'i');
// Score a node from its class/id attributes alone.
// - id contributes +/-25 against the positive/negative hint lists;
// - classes contribute +/-25 only when the id produced a net zero;
// - photo-ish classes always add +10 ("try to keep photos if we can");
// - the Readability 'entry-content-asset' class always adds +25
//   (per https://www.readability.com/developers/guidelines).
function getWeight(node) {
  var id = node.attr('id');
  var classes = node.attr('class');
  var score = 0;

  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE$1.test(id)) score += 25;
    if (NEGATIVE_SCORE_RE$1.test(id)) score -= 25;
  }

  if (classes) {
    if (score === 0) {
      // classes only weigh in when the id did not move the score
      if (POSITIVE_SCORE_RE$1.test(classes)) score += 25;
      if (NEGATIVE_SCORE_RE$1.test(classes)) score -= 25;
    }

    // photo bonus applies regardless of the id-based score
    if (PHOTO_HINTS_RE$1.test(classes)) score += 10;

    // publisher-guidelines asset class bonus
    if (READABILITY_ASSET$1.test(classes)) score += 25;
  }

  return score;
}
// returns the score of a node based on
// the node's score attribute
// returns null if no score set
function getScore($node) {
  var parsed = _parseFloat($node.attr('score'));
  // NaN (missing/unparseable attribute) and 0 both fall back to null.
  return parsed || null;
}
// return 1 for every comma in text
function scoreCommas(text) {
  var commas = text.match(/,/g);
  return commas ? commas.length : 0;
}
var idkRe = new RegExp('^(p|pre)$', 'i');

// Award up to 3 bonus points for text length: one point per 50 characters,
// with p/pre tags tamped down harder (-2 vs -1.25 baseline offset).
// No idea why p or pre are being tamped down here, but just following the
// source. tagName is only relevant when called from scoreParagraph context.
function scoreLength(textLength) {
  var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
  var chunks = textLength / 50;

  if (chunks > 0) {
    var bonus = idkRe.test(tagName) ? chunks - 2 : chunks - 1.25;
    // Clamp to the [0, 3] range.
    return Math.min(Math.max(bonus, 0), 3);
  }

  return 0;
}
// commas, etc. Higher is better.
function scoreParagraph$$1(node) {
  var text = node.text().trim();
  var textLength = text.length;

  // Paragraphs shorter than 25 characters don't count at all.
  if (textLength < 25) {
    return 0;
  }

  // Base score of 1, plus a point per comma, plus up to 3 points for length.
  var score = 1 + scoreCommas(text) + scoreLength(textLength);

  // A trailing colon often sets up a junk list we strip later; nudge such
  // paragraphs just below the cutoff threshold.
  if (text.slice(-1) === ':') {
    score -= 1;
  }

  return score;
}
// Persist `score` on the node as its `score` attribute and return the node
// for chaining. `$` is accepted for signature parity with the other scoring
// helpers but is not used here.
function setScore($node, $, score) {
  $node.attr('score', score);
  return $node;
}
// Add `amount` to the node's current (or freshly initialized) score.
// Errors are deliberately swallowed: scoreNode can throw on detached nodes,
// and a failed score bump should never abort extraction.
function addScore$$1($node, $, amount) {
  try {
    var updated = getOrInitScore$$1($node, $) + amount;
    setScore($node, $, updated);
  } catch (e) {
    // Ignoring; error occurs in scoreNode
  }

  return $node;
}
// Propagate a quarter of a node's score up to its parent.
function addToParent$$1(node, $, score) {
  var $parent = node.parent();

  if ($parent) {
    addScore$$1($parent, $, score * 0.25);
  }

  return node;
}
// if not, initializes a score based on
// the node's tag type
// When weightNodes is true (the default) the class/id weight is folded in.
// A newly computed score also bubbles a share up to the parent.
function getOrInitScore$$1($node, $) {
  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
  var existing = getScore($node);

  if (existing) {
    return existing;
  }

  var score = scoreNode$$1($node);

  if (weightNodes) {
    score += getWeight($node);
  }

  addToParent$$1($node, $, score);
  return score;
}
// just scores based on tag.
// p/li/span/pre → content-based paragraph score; div → 5; td/blockquote/
// ol/ul/dl → 3; address/form → -3; th → -5; anything else → 0.
// TODO: Consider ordering by most likely. E.g., if divs are a more common
// tag on a page, could save doing that regex test on every node - AP
function scoreNode$$1($node) {
  var tagName = $node.get(0).tagName;

  if (PARAGRAPH_SCORE_TAGS$1.test(tagName)) {
    return scoreParagraph$$1($node);
  }

  var lowered = tagName.toLowerCase();

  if (lowered === 'div') {
    return 5;
  }

  if (CHILD_CONTENT_TAGS$1.test(tagName)) {
    return 3;
  }

  if (BAD_TAGS$1.test(tagName)) {
    return -3;
  }

  if (lowered === 'th') {
    return -5;
  }

  return 0;
}
// Convert a <span> node to a <div> in place; anything else is left alone.
function convertSpans$1($node, $) {
  var rawNode = $node.get(0);

  if (rawNode && rawNode.tagName === 'span') {
    convertNodeTo$$1($node, $, 'div');
  }
}
// Convert span→div if needed, then add `score` to the node (no-op on falsy).
function addScoreTo($node, $, score) {
  if (!$node) return;

  convertSpans$1($node, $);
  addScore$$1($node, $, score);
}
// Score every not-yet-scored <p>/<pre>, then push the raw paragraph score up
// to the parent (full quarter via addScoreTo→addScore) and half of it to the
// grandparent.
function scorePs($, weightNodes) {
  $('p, pre').not('[score]').each(function (_, node) {
    // The raw score for this paragraph, before any parent/child scores.
    var $node = $(node);
    $node = setScore($node, $, getOrInitScore$$1($node, $, weightNodes));

    var $parent = $node.parent();
    var rawScore = scoreNode$$1($node);

    // NOTE: the trailing weightNodes argument is ignored by addScoreTo's
    // 3-parameter signature; kept for parity with the original call.
    addScoreTo($parent, $, rawScore, weightNodes);

    if ($parent) {
      // Grandparents get half of the individual content score.
      addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
    }
  });

  return $;
} // score content. Parents get the full value of their children's
// content score, grandparents half
// Score the whole document: hNews-style containers get a large +80 boost,
// then paragraphs are scored (twice — see note below).
function scoreContent$$1($) {
  var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;

  // First, look for special hNews based selectors and give them a big
  // boost, if they exist.
  HNEWS_CONTENT_SELECTORS$1.forEach(function (pair) {
    var parentSelector = pair[0];
    var childSelector = pair[1];

    $(parentSelector + " " + childSelector).each(function (_, node) {
      addScore$$1($(node).parent(parentSelector), $, 80);
    });
  });

  // Doubling this again: the previous solution caused a bug in which
  // parents weren't retaining scores. Not ideal; should be fixed.
  scorePs($, weightNodes);
  scorePs($, weightNodes);

  return $;
}
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//
// Walks the candidate's siblings and gathers qualifying ones (plus the
// candidate itself) into a fresh wrapping <div>. Returns the original
// candidate unchanged when it has no parent or when nothing but the
// candidate itself qualified; otherwise returns the wrapping div.
function mergeSiblings($candidate, topScore, $) {
  // Detached candidate: nothing to merge against.
  if (!$candidate.parent().length) {
    return $candidate;
  }

  // Siblings must score at least 25% of the top score (floor of 10) to join.
  var siblingScoreThreshold = Math.max(10, topScore * 0.25);
  var wrappingDiv = $('<div></div>');

  $candidate.parent().children().each(function (index, sibling) {
    var $sibling = $(sibling); // Ignore tags like BR, HR, etc

    if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) {
      return null;
    }

    var siblingScore = getScore($sibling);

    if (siblingScore) {
      if ($sibling.get(0) === $candidate.get(0)) {
        // The candidate itself always joins the wrapper.
        wrappingDiv.append($sibling);
      } else {
        var contentBonus = 0;
        var density = linkDensity($sibling); // If sibling has a very low link density,
        // give it a small bonus

        if (density < 0.05) {
          contentBonus += 20;
        } // If sibling has a high link density,
        // give it a penalty

        if (density >= 0.5) {
          contentBonus -= 20;
        } // If sibling node has the same class as
        // candidate, give it a bonus

        if ($sibling.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }

        var newScore = siblingScore + contentBonus;

        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($sibling);
        }

        // Below threshold, but short linky-free paragraphs that look like
        // real sentences are still worth keeping.
        if (sibling.tagName === 'p') {
          var siblingContent = $sibling.text();
          var siblingContentLength = textLength(siblingContent);

          if (siblingContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($sibling);
          }

          if (siblingContentLength <= 80 && density === 0 && hasSentenceEnd(siblingContent)) {
            return wrappingDiv.append($sibling);
          }
        }
      }
    }

    return null;
  });

  // If only the candidate itself made it into the wrapper, skip the wrapper
  // and hand the candidate back untouched.
  if (wrappingDiv.children().length === 1 && wrappingDiv.children().first().get(0) === $candidate.get(0)) {
    return $candidate;
  }

  return wrappingDiv;
}
// candidate nodes we found and find the one with the highest score.
function findTopCandidate$$1($) {
  var topScore = 0;
  var $winner;

  $('[score]').each(function (_, node) {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1.test(node.tagName)) {
      return;
    }

    var $node = $(node);
    var score = getScore($node);

    if (score > topScore) {
      topScore = score;
      $winner = $node;
    }
  });

  // If we don't have a candidate, return the body
  // or whatever the first element is
  if (!$winner) {
    return $('body') || $('*').first();
  }

  return mergeSiblings($winner, topScore, $);
}
// Scoring
// Conditionally removes a low-value node based on a series of heuristics:
// input/paragraph ratio (forms), text length vs. image count, link density
// relative to weight, and script density. Nodes with 10+ commas, or carrying
// the Readability 'entry-content-asset' class, are always kept.
function removeUnlessContent($node, $, weight) {
  // Explicitly save entry-content-asset tags, which are
  // noted as valuable in the Publisher guidelines. For now
  // this works everywhere. We may want to consider making
  // this less of a sure-thing later.
  if ($node.hasClass('entry-content-asset')) {
    return;
  }

  var content = normalizeSpaces($node.text());

  if (scoreCommas(content) < 10) {
    var pCount = $('p', $node).length;
    var inputCount = $('input', $node).length; // Looks like a form, too many inputs.

    if (inputCount > pCount / 3) {
      $node.remove();
      return;
    }

    var contentLength = content.length;
    var imgCount = $('img', $node).length; // Content is too short, and there are no images, so
    // this is probably junk content.

    if (contentLength < 25 && imgCount === 0) {
      $node.remove();
      return;
    }

    var density = linkDensity($node); // Too high of link density, is probably a menu or
    // something similar.
    // console.log(weight, density, contentLength)

    if (weight < 25 && density > 0.2 && contentLength > 75) {
      $node.remove();
      return;
    } // Too high of a link density, despite the score being
    // high.

    if (weight >= 25 && density > 0.5) {
      // Don't remove the node if it's a list and the
      // previous sibling starts with a colon though. That
      // means it's probably content.
      var tagName = $node.get(0).tagName.toLowerCase();
      var nodeIsList = tagName === 'ol' || tagName === 'ul';

      if (nodeIsList) {
        var previousNode = $node.prev();

        if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') {
          return;
        }
      }

      $node.remove();
      return;
    }

    var scriptCount = $('script', $node).length; // Too many script tags, not enough content.

    if (scriptCount > 0 && contentLength < 150) {
      $node.remove();
    }
  }
} // Given an article, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag name's to search through. (like div, form,
// etc)
//
// Return this same doc.
// Conditionally clean CLEAN_CONDITIONALLY_TAGS nodes: negative-weight nodes
// are removed outright; the rest are passed to removeUnlessContent for the
// finer-grained heuristics. Nodes marked to keep are skipped entirely.
function cleanTags$$1($article, $) {
  $(CLEAN_CONDITIONALLY_TAGS, $article).each(function (_, node) {
    var $node = $(node);

    // If marked to keep (or containing a kept descendant), skip it.
    if ($node.hasClass(KEEP_CLASS) || $node.find('.' + KEEP_CLASS).length > 0) {
      return;
    }

    var weight = getScore($node);

    if (!weight) {
      weight = getOrInitScore$$1($node, $);
      setScore($node, $, weight);
    }

    if (weight < 0) {
      // Negative weight: definitely junk.
      $node.remove();
    } else {
      // Determine if node seems like content.
      removeUnlessContent($node, $, weight);
    }
  });

  return $;
}
// Remove headers that duplicate the title, precede all paragraph content
// (probable bylines/datestamps handled by metadata extraction), or carry a
// negative class/id weight.
function cleanHeaders($article, $) {
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';

  $(HEADER_TAG_LIST, $article).each(function (_, header) {
    var $header = $(header);

    // Headers appearing before all <p> tags were probably part of the
    // title, a subtitle, a datestamp, or a byline — all handled elsewhere.
    if ($($header, $article).prevAll('p').length === 0) {
      return $header.remove();
    }

    // Headers matching the title exactly are redundant.
    if (normalizeSpaces($(header).text()) === title) {
      return $header.remove();
    }

    // Negatively weighted headers are probably junk.
    if (getWeight($(header)) < 0) {
      return $header.remove();
    }

    return $header;
  });

  return $;
}
// html to avoid later complications with multiple body tags.
function rewriteTopLevel$$1(article, $) {
  // No context argument here because converting the top-level/root node
  // with a context is problematic. - AP
  $ = convertNodeTo$$1($('html'), $, 'div');
  return convertNodeTo$$1($('body'), $, 'div');
}
// Rewrite every `attr` (href/src) in the document to an absolute URL,
// resolved against the page's <base href> when present, else rootUrl.
function absolutize($, rootUrl, attr) {
  var baseUrl = $('base').attr('href');

  $('[' + attr + ']').each(function (_, node) {
    var value = getAttrs(node)[attr];
    if (!value) return;

    setAttr(node, attr, URL.resolve(baseUrl || rootUrl, value));
  });
}
// Resolve every candidate URL inside srcset attributes in $content
// against rootUrl, de-duplicating the resulting candidate list.
function absolutizeSet($, rootUrl, $content) {
$('[srcset]', $content).each(function (_, node) {
var attrs = getAttrs(node);
var urlSet = attrs.srcset;
if (urlSet) {
// a comma should be considered part of the candidate URL unless preceded by a descriptor
// descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
// space characters inside the URL should be encoded (%20 or +)
var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g);
if (!candidates) return;
var absoluteCandidates = candidates.map(function (candidate) {
// a candidate URL cannot start or end with a comma
// descriptors are separated from the URLs by unescaped whitespace
var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
// parts[0] is the URL itself; any remaining parts are descriptors.
parts[0] = URL.resolve(rootUrl, parts[0]);
return parts.join(' ');
});
// De-dupe identical candidates and reassemble the srcset string.
var absoluteUrlSet = _toConsumableArray(new _Set(absoluteCandidates)).join(', ');
setAttr(node, 'srcset', absoluteUrlSet);
}
});
}
function makeLinksAbsolute$$1($content, $, url) {
  // Resolve plain href/src attributes first, then the structured
  // comma-separated srcset candidates.
  absolutize($, url, 'href');
  absolutize($, url, 'src');
  absolutizeSet($, url, $content);
  return $content;
}
function textLength(text) {
  // Length of the text once trimmed and with internal runs of
  // whitespace collapsed to single spaces.
  var collapsed = text.trim().replace(/\s+/g, ' ');
  return collapsed.length;
}
// Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
function linkDensity($node) {
  // Ratio of anchor-text length to total text length for the node.
  var totalTextLength = textLength($node.text());
  var linkLength = textLength($node.find('a').text());
  if (totalTextLength > 0) {
    return linkLength / totalTextLength;
  }
  // No text at all: treat as a pure link if any anchor text exists.
  return linkLength > 0 ? 1 : 0;
}
// Given $, a list of candidate meta tag names, and the set of meta
// names actually present in the document, search for, find a meta
// tag associated. Returns the first unambiguous value, or null.
function extractFromMeta$$1($, metaNames, cachedNames) {
// Optional fourth argument: strip HTML from the value (default true).
var cleanTags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
// Only query names the document is known to contain.
var foundNames = metaNames.filter(function (name) {
return cachedNames.indexOf(name) !== -1;
}); // eslint-disable-next-line no-restricted-syntax
// Babel-generated for...of scaffolding over foundNames; _loop
// returns { v: value } to signal an early return from the loop.
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
var _loop = function _loop() {
var name = _step.value;
// Meta tags are matched on their name attribute; the extracted
// text comes from their value attribute.
var type = 'name';
var value = 'value';
var nodes = $("meta[".concat(type, "=\"").concat(name, "\"]")); // Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
var values = nodes.map(function (index, node) {
return $(node).attr(value);
}).toArray().filter(function (text) {
return text !== '';
}); // If we have more than one value for the same name, we have a
// conflict and can't trust any of them. Skip this name. If we have
// zero, that means our meta tags had no values. Skip this name
// also.
if (values.length === 1) {
var metaValue; // Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if (cleanTags) {
metaValue = stripTags(values[0], $);
} else {
var _values = _slicedToArray(values, 1);
metaValue = _values[0];
}
return {
v: metaValue
};
}
};
for (var _iterator = _getIterator(foundNames), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
var _ret = _loop();
if (_typeof(_ret) === "object") return _ret.v;
} // If nothing is found, return null
} catch (err) {
_didIteratorError = true;
_iteratorError = err;
} finally {
// Standard Babel cleanup: close the iterator if the loop exited
// early, then re-throw any error raised during iteration.
try {
if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
if (_didIteratorError) {
throw _iteratorError;
}
}
}
return null;
}
function isGoodNode($node, maxChildren) {
  // A node with many children is more likely a container element,
  // and anything inside a comment thread is not article metadata —
  // reject both.
  if ($node.children().length > maxChildren) {
    return false;
  }
  return !withinComment$$1($node);
}
// Given a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
// Try each selector in order; return the first non-empty content
// found on a node that matches exactly once, or null.
function extractFromSelectors$$1($, selectors) {
// Reject nodes with more than maxChildren children (default 1).
var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
// When textOnly (default), return .text(); otherwise return .html().
var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
// eslint-disable-next-line no-restricted-syntax
// Babel-generated for...of scaffolding over the selector list.
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
for (var _iterator = _getIterator(selectors), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
var selector = _step.value;
var nodes = $(selector); // If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
if (nodes.length === 1) {
var $node = $(nodes[0]);
if (isGoodNode($node, maxChildren)) {
var content = void 0;
if (textOnly) {
content = $node.text();
} else {
content = $node.html();
}
if (content) {
return content;
}
}
}
}
} catch (err) {
_didIteratorError = true;
_iteratorError = err;
} finally {
// Close the iterator on early exit and re-throw iteration errors.
try {
if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
if (_didIteratorError) {
throw _iteratorError;
}
}
}
// No selector produced usable content.
return null;
}
// strips all tags from a string of text
function stripTags(text, $) {
  // Wrap the string in a <span> so parsing works even when the text
  // contains no HTML, then read the text back. Fall back to the raw
  // input when stripping produces an empty string.
  var stripped = $("<span>".concat(text, "</span>")).text();
  if (stripped === '') {
    return text;
  }
  return stripped;
}
function withinComment$$1($node) {
  // True when any ancestor's class or id contains the string
  // "comment" — a strong signal the node sits in a comment thread.
  return $node.parents().toArray().some(function (parent) {
    var attrs = getAttrs(parent);
    var classAndId = "".concat(attrs.class, " ").concat(attrs.id);
    return classAndId.includes('comment');
  });
}
// Given a node, determine if it's article-like enough to return
// param: node (a cheerio node)
// return: boolean
function nodeIsSufficient($node) {
  // Require at least 100 characters of trimmed text.
  var textSize = $node.text().trim().length;
  return textSize >= 100;
}
function isWordpress($) {
  // The page is considered WordPress-generated when the WP selector
  // matches at least one element.
  var matches = $(IS_WP_SELECTOR);
  return matches.length > 0;
}
// Returns a plain { name: value } map of a node's attributes.
// Supports both cheerio nodes (which expose `attribs`) and browser
// DOM nodes (which expose an `attributes` NamedNodeMap).
// NOTE: lines of pasted commit-log text that had been injected into
// this function body (breaking the syntax) have been removed.
function getAttrs(node) {
  var attribs = node.attribs,
      attributes = node.attributes;
  if (!attribs && attributes) {
    // Browser path: fold the NamedNodeMap into a plain object,
    // skipping entries that lack either a name or a value.
    var attrs = _Reflect$ownKeys(attributes).reduce(function (acc, index) {
      var attr = attributes[index];
      if (!attr.name || !attr.value) return acc;
      acc[attr.name] = attr.value;
      return acc;
    }, {});
    return attrs;
  }
  return attribs;
}
function setAttr(node, attr, val) {
  // cheerio nodes store attributes on `attribs`; browser DOM nodes
  // use the setAttribute API. Support both; return the node.
  if (node.attribs) {
    node.attribs[attr] = val;
    return node;
  }
  if (node.attributes) {
    node.setAttribute(attr, val);
  }
  return node;
}
function setAttrs(node, attrs) {
  // Replace the node's entire attribute set with `attrs`.
  if (node.attribs) {
    // cheerio path: swap the attribute map wholesale.
    node.attribs = attrs;
    return node;
  }
  if (node.attributes) {
    // DOM path: clear the existing attributes one by one, then set
    // each new attribute via the DOM API.
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }
    _Reflect$ownKeys(attrs).forEach(function (key) {
      node.setAttribute(key, attrs[key]);
    });
  }
  return node;
}
// DOM manipulation
// Matches http:// or https:// anywhere in a string.
var IS_LINK = new RegExp('https?://', 'i');
// NOTE(review): the leading '.' here is an unescaped regex dot, so
// it matches any single character before the extension rather than a
// literal dot — behavior kept as-is; confirm before changing.
var IMAGE_RE = '.(png|gif|jpe?g)';
var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i');
// Image URL followed by an optional query string and a srcset-style
// descriptor such as "2x" or "640w".
var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\?\\S+)?(\\s*[\\d.]+[wx])"), 'i');
// Tags stripped outright from every fetched document.
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// Convert lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with
// a src attribute that is a placeholder. Find the real URL in the
// other attributes and fill in src/srcset so the images render.
function convertLazyLoadedImages($) {
  // Some sites stash image data in a JSON blob; pull out its src.
  var extractSrcFromJSON = function extractSrcFromJSON(str) {
    try {
      var parsed = JSON.parse(str);
      if (typeof parsed.src === 'string') return parsed.src;
    } catch (e) {
      return false;
    }
    return false;
  };
  $('img').each(function (_, img) {
    var attrs = getAttrs(img);
    _Reflect$ownKeys(attrs).forEach(function (attr) {
      var value = attrs[attr];
      // Only attribute values that look like URLs are candidates.
      if (!IS_LINK.test(value)) return;
      if (attr !== 'srcset' && IS_SRCSET.test(value)) {
        // Value looks like a srcset candidate list.
        $(img).attr('srcset', value);
      } else if (attr !== 'src' && attr !== 'srcset' && IS_IMAGE.test(value)) {
        // Value looks like a plain image URL, possibly wrapped in a
        // JSON object; prefer the extracted src when present.
        var existingSrc = extractSrcFromJSON(value);
        $(img).attr('src', existingSrc || value);
      }
    });
  });
  return $;
}
function isComment(index, node) {
  // cheerio filter callback: true only for HTML comment nodes.
  var nodeType = node.type;
  return nodeType === 'comment';
}
function cleanComments($) {
  // Strip every HTML comment node from the whole document.
  var $allNodes = $.root().find('*');
  $allNodes.contents().filter(isComment).remove();
  return $;
}
function clean($) {
  // Drop scripts, styles and forms, then scrub comment nodes.
  $(TAGS_TO_REMOVE).remove();
  return cleanComments($);
}
var Resource = {
// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
// :param headers: Custom headers to be included in the request
// Returns a cheerio document, or { error, failed } on fetch failure.
// (Babel regenerator-compiled async function.)
create: function () {
var _create = _asyncToGenerator(
/*#__PURE__*/
_regeneratorRuntime.mark(function _callee(url, preparedResponse, parsedUrl) {
var headers,
result,
validResponse,
_args = arguments;
return _regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
// Optional fourth argument: custom request headers.
headers = _args.length > 3 && _args[3] !== undefined ? _args[3] : {};
if (!preparedResponse) {
_context.next = 6;
break;
}
// A body was supplied by the caller: synthesize a successful
// HTTP response around it and skip the network fetch.
validResponse = {
statusMessage: 'OK',
statusCode: 200,
headers: {
'content-type': 'text/html',
'content-length': 500
}
};
result = {
body: preparedResponse,
response: validResponse,
alreadyDecoded: true
};
_context.next = 9;
break;
case 6:
// No prepared response: fetch the document ourselves.
_context.next = 8;
return fetchResource(url, parsedUrl, headers);
case 8:
result = _context.sent;
case 9:
if (!result.error) {
_context.next = 12;
break;
}
// Surface fetch errors to the caller instead of throwing.
result.failed = true;
return _context.abrupt("return", result);
case 12:
return _context.abrupt("return", this.generateDoc(result));
case 13:
case "end":
return _context.stop();
}
}
}, _callee, this);
}));
function create(_x, _x2, _x3) {
return _create.apply(this, arguments);
}
return create;
}(),
// Decode the fetched body, load it into cheerio, and normalize the
// resulting document. Throws when the response is not text/html or
// produces an empty parse.
generateDoc: function generateDoc(_ref) {
var content = _ref.body,
response = _ref.response,
_ref$alreadyDecoded = _ref.alreadyDecoded,
alreadyDecoded = _ref$alreadyDecoded === void 0 ? false : _ref$alreadyDecoded;
var _response$headers$con = response.headers['content-type'],
contentType = _response$headers$con === void 0 ? '' : _response$headers$con; // TODO: Implement is_text function from
// https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
if (!contentType.includes('html') && !contentType.includes('text')) {
throw new Error('Content does not appear to be text.');
}
var $ = this.encodeDoc({
content: content,
contentType: contentType,
alreadyDecoded: alreadyDecoded
});
if ($.root().children().length === 0) {
throw new Error('No children, likely a bad parse.');
}
// Normalize meta tags, materialize lazy-loaded images, and strip
// scripts/styles/forms/comments before returning.
$ = normalizeMetaTags($);
$ = convertLazyLoadedImages($);
$ = clean($);
return $;
},
// Decode a raw body using the charset from the HTTP content-type
// header; if the document's own meta tags declare a different
// charset, re-decode with that one instead.
encodeDoc: function encodeDoc(_ref2) {
var content = _ref2.content,
contentType = _ref2.contentType,
_ref2$alreadyDecoded = _ref2.alreadyDecoded,
alreadyDecoded = _ref2$alreadyDecoded === void 0 ? false : _ref2$alreadyDecoded;
if (alreadyDecoded) {
return cheerio.load(content);
}
var encoding = getEncoding(contentType);
var decodedContent = iconv.decode(content, encoding);
var $ = cheerio.load(decodedContent); // after first cheerio.load, check to see if encoding matches
// Case-insensitive attribute selectors aren't supported in the
// browser build of cheerio, hence the branch.
var contentTypeSelector = cheerio.browser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]';
var metaContentType = $(contentTypeSelector).attr('content') || $('meta[charset]').attr('charset');
var properEncoding = getEncoding(metaContentType); // if encodings in the header/body dont match, use the one in the body
if (metaContentType && properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
return $;
}
};
var _marked =
/*#__PURE__*/
_regeneratorRuntime.mark(range);
// Generator over integers while start <= end (both default to 1).
// Babel regenerator-compiled; note the compiled body yields the
// result of `start += 1` — i.e. the value after the increment —
// presumably matching the original generator's semantics; verify
// against the untranspiled source before changing.
function range() {
var start,
end,
_args = arguments;
return _regeneratorRuntime.wrap(function range$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1;
end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1;
case 2:
// Loop head: stop once start exceeds end.
if (!(start <= end)) {
_context.next = 7;
break;
}
_context.next = 5;
return start += 1;
case 5:
_context.next = 2;
break;
case 7:
case "end":
return _context.stop();
}
}
}, _marked, this);
}
// extremely simple url validation as a first step
function validateUrl(_ref) {
  // A parsed URL is considered valid as long as it has a hostname.
  var hostname = _ref.hostname;
  return Boolean(hostname);
}
var merge = function merge(extractor, domains) {
  // Build a lookup mapping every listed domain to the same extractor.
  var lookup = {};
  domains.forEach(function (domain) {
    lookup[domain] = extractor;
  });
  return lookup;
};
function mergeSupportedDomains(extractor) {
  // Register the extractor under its primary domain plus any extra
  // supportedDomains it declares.
  var domains = [extractor.domain];
  if (extractor.supportedDomains) {
    domains = domains.concat(_toConsumableArray(extractor.supportedDomains));
  }
  return merge(extractor, domains);
}
// Registry of custom extractors added at runtime, keyed by domain.
var apiExtractors = {};
function addExtractor(extractor) {
  // Extractors are keyed by domain, so one is required.
  var invalid = !extractor || !extractor.domain;
  if (invalid) {
    return {
      error: true,
      message: 'Unable to add custom extractor. Invalid parameters.'
    };
  }
  _Object$assign(apiExtractors, mergeSupportedDomains(extractor));
  return apiExtractors;
}
// Custom extractor for Blogger/Blogspot-hosted blogs.
var BloggerExtractor = {
domain: 'blogspot.com',
content: {
// Blogger is insane and does not load its content
// initially in the page, but it's all there
// in noscript
selectors: ['.post-content noscript'],
// Selectors to remove from the extracted content
clean: [],
// Convert the noscript tag to a div
transforms: {
noscript: 'div'
}
},
author: {
selectors: ['.post-author-name']
},
title: {
selectors: ['.post h2.title']
},
date_published: {
selectors: ['span.publishdate']
}
};
// Custom extractor for nymag.com.
var NYMagExtractor = {
domain: 'nymag.com',
content: {
// Order by most likely. Extractor will stop on first occurrence
selectors: ['div.article-content', 'section.body', 'article.article'],
// Selectors to remove from the extracted content
clean: ['.ad', '.single-related-story'],
// Object of tranformations to make on matched elements
// Each key is the selector, each value is the tag to
// transform to.
// If a function is given, it should return a string
// to convert to or nothing (in which case it will not perform
// the transformation.
transforms: {
// Convert h1s to h2s
h1: 'h2',
// Convert lazy-loaded noscript images to figures
noscript: function noscript($node, $) {
var $children = $.browser ? $($node.text()) : $node.children();
if ($children.length === 1 && $children.get(0) !== undefined && $children.get(0).tagName.toLowerCase() === 'img') {
return 'figure';
}
return null;
}
}
},
title: {
selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1']
},
author: {
selectors: ['.by-authors', '.lede-feature-author']
},
dek: {
selectors: ['.lede-feature-teaser']
},
date_published: {
selectors: [['time.article-timestamp[datetime]', 'datetime'], 'time.article-timestamp']
}
};
// Custom extractor for wikipedia.org articles.
var WikipediaExtractor = {
domain: 'wikipedia.org',
content: {
selectors: ['#mw-content-text'],
defaultCleaner: false,
// transform top infobox to an image with caption
transforms: {
'.infobox img': function infoboxImg($node) {
var $parent = $node.parents('.infobox'); // Only prepend the first image in .infobox
if ($parent.children('img').length === 0) {
$parent.prepend($node);
}
},
'.infobox caption': 'figcaption',
'.infobox': 'figure'
},
// Selectors to remove from the extracted content
clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox']
},
author: 'Wikipedia Contributors',
title: {
selectors: ['h2.title']
},
date_published: {
selectors: ['#footer-info-lastmod']
}
};
// Custom extractor for twitter.com permalink pages.
var TwitterExtractor = {
domain: 'twitter.com',
content: {
transforms: {
// We're transforming essentially the whole page here.
// Twitter doesn't have nice selectors, so our initial
// selector grabs the whole page, then we're re-writing
// it to fit our needs before we clean it up.
'.permalink[role=main]': function permalinkRoleMain($node, $) {
var tweets = $node.find('.tweet');
var $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
$tweetContainer.append(tweets);
$node.replaceWith($tweetContainer);
},
// Twitter wraps @ with s, which
// renders as a strikethrough
s: 'span'
},
selectors: ['.permalink[role=main]'],
defaultCleaner: false,
clean: ['.stream-item-footer', 'button', '.tweet-details-fixer']
},
author: {
selectors: ['.tweet.permalink-tweet .username']
},
date_published: {
selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']]
}
};
// Custom extractor for www.nytimes.com.
var NYTimesExtractor = {
domain: 'www.nytimes.com',
title: {
selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline', 'h1 .balancedHeadline']
},
author: {
selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline', ['meta[name="byl"]', 'value']]
},
content: {
selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],
transforms: {
// Fill in the lazy-image size placeholder with a fixed width.
'img.g-lazy': function imgGLazy($node) {
var src = $node.attr('src');
var width = 640;
src = src.replace('{{size}}', width);
$node.attr('src', src);
}
},
clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden', '.comments', '.supplemental', '.nocontent', '.story-footer-links']
},
date_published: {
selectors: [['meta[name="article:published"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: null,
next_page_url: null,
excerpt: null
};
// Custom extractor for www.theatlantic.com.
var TheAtlanticExtractor = {
domain: 'www.theatlantic.com',
title: {
selectors: ['h1', '.c-article-header__hed']
},
author: {
selectors: [['meta[name="author"]', 'value'], '.c-byline__author']
},
content: {
selectors: ['article', '.article-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet']
},
dek: {
selectors: [['meta[name="description"]', 'value']]
},
date_published: {
selectors: [['time[itemprop="datePublished"]', 'datetime']]
},
lead_image_url: {
selectors: [['img[itemprop="url"]', 'src']]
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.newyorker.com.
var NewYorkerExtractor = {
domain: 'www.newyorker.com',
title: {
selectors: ['h1[class^="content-header"]', 'h1[class^="ArticleHeader__hed"]', ['meta[name="og:title"]', 'value']]
},
author: {
selectors: [['meta[name="author"]', 'value'], 'div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"]']
},
content: {
selectors: ['article.article.main-content', 'main[class^="Layout__content"]'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['footer[class^="ArticleFooter__footer"]']
},
date_published: {
selectors: ['time.content-header__publish-date', ['meta[name="pubdate"]', 'value']],
timezone: 'America/New_York'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: ['div.content-header__dek', 'h2[class^="ArticleHeader__dek"]']
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.wired.com.
var WiredExtractor = {
domain: 'www.wired.com',
title: {
selectors: ['h1.content-header__hed', 'h1.post-title']
},
author: {
selectors: [['meta[name="author"]', 'value'], 'a[rel="author"]']
},
content: {
selectors: ['article.article.main-content', 'article.content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.visually-hidden', 'figcaption img.photo']
},
date_published: {
selectors: ['time.content-header__publish-date', ['meta[itemprop="datePublished"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.msn.com.
var MSNExtractor = {
domain: 'www.msn.com',
title: {
selectors: ['h1']
},
author: {
selectors: ['span.authorname-txt']
},
content: {
selectors: ['div.richtext'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span.caption']
},
date_published: {
selectors: ['span.time']
},
lead_image_url: {
selectors: []
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.yahoo.com.
var YahooExtractor = {
domain: 'www.yahoo.com',
title: {
selectors: ['header.canvas-header']
},
author: {
selectors: ['span.provider-name']
},
content: {
selectors: [// enter content selectors
'.content-canvas'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.figure-caption']
},
date_published: {
selectors: [['time.date[datetime]', 'datetime']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: [// enter dek selectors
]
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.buzzfeed.com.
var BuzzfeedExtractor = {
domain: 'www.buzzfeed.com',
title: {
selectors: ['h1[id="post-title"]']
},
author: {
selectors: ['a[data-action="user/username"]', 'byline__author']
},
content: {
selectors: [['.longform_custom_header_media', '#buzz_sub_buzz'], '#buzz_sub_buzz'],
defaultCleaner: false,
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
h2: 'b',
// Promote header media with a source line to a figure.
'div.longform_custom_header_media': function divLongform_custom_header_media($node) {
if ($node.has('img') && $node.has('.longform_header_image_source')) {
return 'figure';
}
return null;
},
'figure.longform_custom_header_media .longform_header_image_source': 'figcaption'
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.instapaper_ignore', '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline', '.share-box', '.print']
},
date_published: {
selectors: ['.buzz-datetime']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
};
// Custom extractor for fandom.wikia.com.
var WikiaExtractor = {
domain: 'fandom.wikia.com',
title: {
selectors: ['h1.entry-title']
},
author: {
selectors: ['.author vcard', '.fn']
},
content: {
selectors: ['.grid-content', '.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.littlethings.com.
var LittleThingsExtractor = {
domain: 'www.littlethings.com',
title: {
selectors: ['h1.post-title']
},
author: {
selectors: [['meta[name="author"]', 'value']]
},
content: {
selectors: [// enter content selectors
'.mainContentIntro', '.content-wrapper'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
next_page_url: null,
excerpt: null
};
// Custom extractor for www.politico.com.
var PoliticoExtractor = {
domain: 'www.politico.com',
title: {
selectors: [// enter title selectors
['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['.story-main-content .byline .vcard']
},
content: {
selectors: [// enter content selectors
'.story-main-content', '.content-group', '.story-core', '.story-text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['figcaption']
},
date_published: {
selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
},
lead_image_url: {
selectors: [// enter lead_image_url selectors
['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
};
// Custom extractor for deadspin.com and the other Kinja-platform sites
// listed in supportedDomains.
var DeadspinExtractor = {
domain: 'deadspin.com',
supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', 'avclub.com', 'clickhole.com', 'splinternews.com', 'theonion.com', 'theroot.com', 'thetakeout.com', 'theinventory.com'],
title: {
selectors: ['h1.headline']
},
author: {
selectors: ['.author']
},
content: {
selectors: ['.post-content', '.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Rewrite lazy-loaded YouTube embeds into direct embed iframes.
'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube($node) {
var youtubeId = $node.attr('id').split('youtube-')[1];
$node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
}
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.magnifier', '.lightbox']
},
date_published: {
selectors: [['time.updated[datetime]', 'datetime']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: [// enter selectors
]
},
next_page_url: {
selectors: [// enter selectors
]
},
excerpt: {
selectors: [// enter selectors
]
}
};
// Custom extractor for www.broadwayworld.com.
var BroadwayWorldExtractor = {
domain: 'www.broadwayworld.com',
title: {
selectors: ['h1.article-title']
},
author: {
selectors: ['span[itemprop=author]']
},
content: {
selectors: ['div[itemprop=articlebody]'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
},
date_published: {
selectors: [['meta[itemprop=datePublished]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: {
selectors: [// enter selectors
]
},
excerpt: {
selectors: [// enter selectors
]
}
};
// Custom extractor for www.apartmenttherapy.com.
var ApartmentTherapyExtractor = {
domain: 'www.apartmenttherapy.com',
title: {
selectors: ['h1.headline']
},
author: {
selectors: ['.PostByline__name']
},
content: {
selectors: ['div.post__content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Materialize React lazy pictures: the real src lives in the
// JSON-encoded data-props attribute.
'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture($node, $) {
var data = JSON.parse($node.attr('data-props'));
var src = data.sources[0].src;
var $img = $('<img />').attr('src', src);
$node.replaceWith($img);
}
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
},
date_published: {
selectors: [['.PostByline__timestamp[datetime]', 'datetime']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: {
selectors: [// enter selectors
]
},
excerpt: {
selectors: [// enter selectors
]
}
};
// Custom extractor for medium.com.
var MediumExtractor = {
domain: 'medium.com',
title: {
selectors: ['h1', ['meta[name="og:title"]', 'value']]
},
author: {
selectors: [['meta[name="author"]', 'value']]
},
content: {
selectors: ['article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Allow drop cap character.
'section span:first-of-type': function sectionSpanFirstOfType($node) {
var $text = $node.html();
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: function iframe($node) {
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
var thumb = decodeURIComponent($node.attr('data-thumbnail'));
var $parent = $node.parents('figure');
if (ytRe.test(thumb)) {
var _thumb$match = thumb.match(ytRe),
_thumb$match2 = _slicedToArray(_thumb$match, 2),
_ = _thumb$match2[0],
youtubeId = _thumb$match2[1]; // eslint-disable-line
$node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
var $caption = $parent.find('figcaption');
$parent.empty().append([$node, $caption]);
return;
} // If we can't draw the YouTube preview, remove the figure.
$parent.remove();
},
// rewrite figures to pull out image and caption, remove rest
figure: function figure($node) {
// ignore if figure has an iframe
if ($node.find('iframe').length > 0) return;
var $img = $node.find('img').slice(-1)[0];
var $caption = $node.find('figcaption');
$node.empty().append([$img, $caption]);
},
// Remove any smaller images that did not get caught by the generic image
// cleaner (author photo 48px, leading sentence images 79px, etc.).
img: function img($node) {
var width = _parseInt($node.attr('width'), 10);
if (width < 100) $node.remove();
}
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span a', 'svg']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
dek: null,
next_page_url: {
selectors: [// enter selectors
]
},
excerpt: {
selectors: [// enter selectors
]
}
};
// Custom extractor for www.tmz.com.
var WwwTmzComExtractor = {
  domain: 'www.tmz.com',
  title: {
    selectors: ['.post-title-breadcrumb', 'h1', '.headline']
  },
  // TMZ does not credit individual writers; byline is a fixed string.
  author: 'TMZ STAFF',
  date_published: {
    selectors: ['.article-posted-date'],
    timezone: 'America/Los_Angeles'
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article-content', '.all-post-body'],
    transforms: {},
    // Drop gallery lightbox links from the extracted body.
    clean: ['.lightbox-link']
  }
};
// Custom extractor for www.washingtonpost.com.
var WwwWashingtonpostComExtractor = {
  domain: 'www.washingtonpost.com',
  title: { selectors: ['h1', '#topper-headline-wrapper'] },
  author: { selectors: ['.pb-author-name'] },
  date_published: {
    selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']]
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article-body'],
    transforms: {
      // Keep inline units that actually carry media (as <figure>);
      // drop the rest (promos, interactives without media).
      'div.inline-content': function divInlineContent($node) {
        var hasMedia = $node.has('img,iframe,video').length > 0;
        if (!hasMedia) {
          $node.remove();
          return null;
        }
        return 'figure';
      },
      '.pb-caption': 'figcaption'
    },
    // Strip cross-promo links and inline newsletter signups.
    clean: ['.interstitial-link', '.newsletter-inline-unit']
  }
};
// Custom extractor for www.huffingtonpost.com.
var WwwHuffingtonpostComExtractor = {
  domain: 'www.huffingtonpost.com',
  title: { selectors: ['h1.headline__title'] },
  author: { selectors: ['span.author-card__details__name'] },
  date_published: {
    selectors: [
      ['meta[name="article:modified_time"]', 'value'],
      ['meta[name="article:published_time"]', 'value']
    ]
  },
  dek: { selectors: ['h2.headline__subtitle'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.entry__body'],
    // Skip the generic cleaner; the clean list below does the work.
    defaultCleaner: false,
    transforms: {},
    clean: ['.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story']
  }
};
// Custom extractor for newrepublic.com (regular and "minutes" posts).
var NewrepublicComExtractor = {
  domain: 'newrepublic.com',
  title: { selectors: ['h1.article-headline', '.minutes-primary h1.minute-title'] },
  author: { selectors: ['div.author-list', '.minutes-primary h3.minute-byline'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: { selectors: ['h2.article-subhead'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Multi-match: cover image plus body, for each template variant.
    selectors: [
      ['.article-cover', 'div.content-body'],
      ['.minute-image', '.minutes-primary div.content-body']
    ],
    transforms: {},
    clean: ['aside']
  }
};
// Custom extractor for money.cnn.com.
var MoneyCnnComExtractor = {
  domain: 'money.cnn.com',
  title: { selectors: ['.article-title'] },
  author: { selectors: ['.byline a'] },
  date_published: {
    selectors: [['meta[name="date"]', 'value']],
    timezone: 'GMT'
  },
  dek: { selectors: ['#storytext h2'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['#storytext'],
    transforms: {},
    // In-story section headings are navigation chrome, not content.
    clean: ['.inStoryHeading']
  }
};
// Custom extractor for www.theverge.com (also used for www.polygon.com).
var WwwThevergeComExtractor = {
  domain: 'www.theverge.com',
  supportedDomains: ['www.polygon.com'],
  title: { selectors: ['h1'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['h2.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [
      // feature template multi-match
      ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],
      // regular post multi-match
      ['.e-image--hero', '.c-entry-content'],
      // feature template fallback
      '.l-wrapper .l-feature',
      // regular post fallback
      'div.c-entry-content'
    ],
    transforms: {
      // A <noscript> wrapping a single <img> is the lazy-load fallback;
      // turn it into a <span> so the image survives sanitization.
      noscript: function noscript($node) {
        var kids = $node.children();
        var isLoneImg = kids.length === 1 && kids.get(0).tagName === 'img';
        return isLoneImg ? 'span' : null;
      }
    },
    // Drop sidebars and the lazy-load placeholder images.
    clean: ['.aside', 'img.c-dynamic-image']
  }
};
// Custom extractor for www.cnn.com.
var WwwCnnComExtractor = {
  domain: 'www.cnn.com',
  title: {
    selectors: ['h1.pg-headline', 'h1']
  },
  author: {
    selectors: ['.metadata__byline__author']
  },
  date_published: {
    selectors: [['meta[name="pubdate"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [// a more specific selector to grab the lead image and the body
    ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above
    '.zn-body-text', 'div[itemprop="articleBody"]'],
    transforms: {
      // Promote non-empty CNN paragraph wrappers to real <p> elements.
      '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph($node) {
        var $text = $node.html();
        if ($text) {
          return 'p';
        }
        return null;
      },
      // this transform cleans the short, all-link sections linking
      // to related content but not marked as such in any way.
      '.zn-body__paragraph': function znBody__paragraph($node) {
        // BUG FIX: `$node.has('a')` returns a cheerio selection, which is
        // always truthy, so the guard never filtered anything (and empty
        // link-free paragraphs were removed by the '' === '' comparison).
        // Check `.length` so only paragraphs containing links are examined.
        if ($node.has('a').length > 0) {
          if ($node.text().trim() === $node.find('a').text().trim()) {
            $node.remove();
          }
        }
      },
      '.media__video--thumbnail': 'figure'
    },
    clean: []
  }
};
// Custom extractor for www.aol.com.
var WwwAolComExtractor = {
  domain: 'www.aol.com',
  title: { selectors: ['h1.p-article__title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  // Byline dates carry no zone info; interpret them as US Eastern.
  date_published: {
    selectors: ['.p-article__byline__date'],
    timezone: 'America/New_York'
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article-content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.youtube.com watch pages.
var WwwYoutubeComExtractor = {
  domain: 'www.youtube.com',
  title: { selectors: ['.watch-title', 'h1.watch-title-container'] },
  author: { selectors: ['.yt-user-info'] },
  date_published: {
    selectors: [['meta[itemProp="datePublished"]', 'value']],
    timezone: 'GMT'
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Keep the injected iframe intact; the generic cleaner would strip it.
    defaultCleaner: false,
    selectors: [['#player-api', '#eow-description']],
    transforms: {
      // Replace the empty player shell with an embeddable iframe built
      // from the page's videoId meta tag.
      '#player-api': function playerApi($node, $) {
        var videoId = $('meta[itemProp="videoId"]').attr('value');
        $node.html("\n <iframe src=\"https://www.youtube.com/embed/".concat(videoId, "\" frameborder=\"0\" allowfullscreen></iframe>"));
      }
    },
    clean: []
  }
};
// Custom extractor for www.theguardian.com.
var WwwTheguardianComExtractor = {
  domain: 'www.theguardian.com',
  title: { selectors: ['.content__headline'] },
  author: { selectors: ['p.byline'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['.content__standfirst'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.content__article-body'],
    transforms: {},
    // Strip mobile-hidden blocks and decorative inline icons.
    clean: ['.hide-on-mobile', '.inline-icon']
  }
};
// Custom extractor for www.sbnation.com.
var WwwSbnationComExtractor = {
  domain: 'www.sbnation.com',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['h2.c-entry-summary.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.c-entry-content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.bloomberg.com, covering the normal,
// /graphics/ and /news/ page templates.
var WwwBloombergComExtractor = {
  domain: 'www.bloomberg.com',
  title: {
    selectors: [
      '.lede-headline',          // normal articles
      'h1.article-title',        // /graphics/ template
      'h1.lede-text-only__hed'   // /news/ template
    ]
  },
  author: {
    selectors: [
      ['meta[name="parsely-author"]', 'value'],
      '.byline-details__link',
      '.bydek',    // /graphics/ template
      '.author'    // /news/ template
    ]
  },
  date_published: {
    selectors: [
      ['time.published-at', 'datetime'],
      ['time[datetime]', 'datetime'],
      ['meta[name="date"]', 'value'],
      ['meta[name="parsely-pub-date"]', 'value']
    ]
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [
      '.article-body__content',
      ['section.copy-block'],   // /graphics/ template
      '.body-copy'              // /news/ template
    ],
    transforms: {},
    clean: ['.inline-newsletter', '.page-ad']
  }
};
// Custom extractor for www.bustle.com.
var WwwBustleComExtractor = {
  domain: 'www.bustle.com',
  title: { selectors: ['h1.post-page__title'] },
  author: { selectors: ['div.content-meta__author'] },
  date_published: {
    selectors: [['time.content-meta__published-date[datetime]', 'datetime']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.post-page__body'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.npr.org.
var WwwNprOrgExtractor = {
  domain: 'www.npr.org',
  title: { selectors: ['h1', '.storytitle'] },
  author: { selectors: ['p.byline__name.byline__name--block'] },
  date_published: {
    selectors: [
      ['.dateblock time[datetime]', 'datetime'],
      ['meta[name="date"]', 'value']
    ]
  },
  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[name="twitter:image:src"]', 'value']
    ]
  },
  content: {
    selectors: ['.storytext'],
    transforms: {
      // Normalize NPR image buckets to standard figure/figcaption markup.
      '.bucketwrap.image': 'figure',
      '.bucketwrap.image .credit-caption': 'figcaption'
    },
    clean: ['div.enlarge_measure']
  }
};
// Custom extractor for www.recode.net.
var WwwRecodeNetExtractor = {
  domain: 'www.recode.net',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['h2.c-entry-summary.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Hero image plus body when both present; body alone as fallback.
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for qz.com.
var QzComExtractor = {
  domain: 'qz.com',
  title: { selectors: ['article header h1'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: { selectors: [['time[datetime]', 'datetime']] },
  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[property="og:image"]', 'content'],
      ['meta[name="twitter:image"]', 'content']
    ]
  },
  content: {
    selectors: ['#article-content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.dmagazine.com.
var WwwDmagazineComExtractor = {
  domain: 'www.dmagazine.com',
  title: { selectors: ['h1.story__title'] },
  author: { selectors: ['.story__info .story__info__item:first-child'] },
  date_published: {
    selectors: ['.story__info'],
    // Dates are printed as text (e.g. "April 3, 2017 2:00 pm") in Central time.
    timezone: 'America/Chicago',
    format: 'MMMM D, YYYY h:mm a'
  },
  dek: { selectors: ['.story__subhead'] },
  lead_image_url: { selectors: [['article figure a:first-child', 'href']] },
  content: {
    selectors: ['.story__content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.reuters.com.
var WwwReutersComExtractor = {
  domain: 'www.reuters.com',
  title: { selectors: ['h1.article-headline'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['#article-text'],
    transforms: {
      // Subtitles become plain h4 headings.
      '.article-subtitle': 'h4'
    },
    // Drop the duplicated author credit inside the byline block.
    clean: ['#article-byline .author']
  }
};
// Custom extractor for mashable.com.
var MashableComExtractor = {
  domain: 'mashable.com',
  title: { selectors: ['h1.title'] },
  author: { selectors: ['span.author_name a'] },
  date_published: {
    selectors: [['meta[name="og:article:published_time"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['section.article-content.blueprint'],
    transforms: {
      // Image credits become caption elements.
      '.image-credit': 'figcaption'
    },
    clean: []
  }
};
// Custom extractor for www.chicagotribune.com.
var WwwChicagotribuneComExtractor = {
  domain: 'www.chicagotribune.com',
  title: { selectors: ['h1.trb_ar_hl_t'] },
  author: { selectors: ['span.trb_ar_by_nm_au'] },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.trb_ar_page'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.vox.com.
var WwwVoxComExtractor = {
  domain: 'www.vox.com',
  title: { selectors: ['h1.c-page-title'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['.p-dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
    transforms: {
      // The <noscript> fallback holds the real image markup; swap it in
      // for the lazy-load placeholder inside the same image wrapper.
      'figure .e-image__image noscript': function figureEImage__imageNoscript($node) {
        var markup = $node.html();
        var $wrapper = $node.parents('.e-image__image');
        $wrapper.find('.c-dynamic-image').replaceWith(markup);
      },
      'figure .e-image__meta': 'figcaption'
    },
    clean: []
  }
};
// Custom extractor for news.nationalgeographic.com.
var NewsNationalgeographicComExtractor = {
  domain: 'news.nationalgeographic.com',
  title: { selectors: ['h1', 'h1.main-title'] },
  author: { selectors: ['.byline-component__contributors b span'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    format: 'ddd MMM DD HH:mm:ss zz YYYY',
    timezone: 'EST'
  },
  dek: { selectors: ['.article__deck'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Lift the lazy-loaded lead image URL out of its data attribute
      // and prepend a real <img> so the multi-match selector can see it.
      '.parsys.content': function parsysContent($node, $) {
        var $imgSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
        if ($imgSrc) {
          $node.prepend($("<img class=\"__image-lead__\" src=\"".concat($imgSrc, "\"/>")));
        }
      }
    },
    clean: ['.pull-quote.pull-quote--large']
  }
};
// Custom extractor for www.nationalgeographic.com.
var WwwNationalgeographicComExtractor = {
  domain: 'www.nationalgeographic.com',
  title: { selectors: ['h1', 'h1.main-title'] },
  author: { selectors: ['.byline-component__contributors b span'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['.article__deck'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: [['.parsys.content', '.__image-lead__'], '.content'],
    transforms: {
      // Reconstruct the lead image(s) from lazy-load data attributes and
      // prepend them so the multi-match selector can pick them up.
      '.parsys.content': function parsysContent($node, $) {
        var $lead = $node.children().first();
        if ($lead.hasClass('imageGroup')) {
          // Two-image group: both paths live on the media container.
          var $dataContainer = $lead.find('.media--medium__container').children().first();
          var imgPath1 = $dataContainer.data('platform-image1-path');
          var imgPath2 = $dataContainer.data('platform-image2-path');
          if (imgPath2 && imgPath1) {
            $node.prepend($("<div class=\"__image-lead__\">\n <img src=\"".concat(imgPath1, "\"/>\n <img src=\"").concat(imgPath2, "\"/>\n </div>")));
          }
          return;
        }
        // Single lazy-loaded lead image.
        var $imgSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
        if ($imgSrc) {
          $node.prepend($("<img class=\"__image-lead__\" src=\"".concat($imgSrc, "\"/>")));
        }
      }
    },
    clean: ['.pull-quote.pull-quote--small']
  }
};
// Custom extractor for www.latimes.com.
var WwwLatimesComExtractor = {
  domain: 'www.latimes.com',
  title: { selectors: ['.trb_ar_hl'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.trb_ar_main'],
    transforms: {
      // Collapse the lead-art wrapper down to its inner <figure>.
      '.trb_ar_la': function trb_ar_la($node) {
        $node.replaceWith($node.find('figure'));
      }
    },
    // Remove byline and credit chrome from the extracted body.
    clean: ['.trb_ar_by', '.trb_ar_cr']
  }
};
// Custom extractor for pagesix.com (also used for nypost.com).
var PagesixComExtractor = {
  domain: 'pagesix.com',
  supportedDomains: ['nypost.com'],
  title: { selectors: ['h1 a'] },
  author: { selectors: ['.byline'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: [['meta[name="description"]', 'value']] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Featured image plus body when both present; body alone otherwise.
    selectors: [['#featured-image-wrapper', '.entry-content'], '.entry-content'],
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption'
    },
    clean: ['.modal-trigger']
  }
};
// Custom extractor for thefederalistpapers.org.
var ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',
  title: {
    selectors: ['h1.entry-title']
  },
  author: {
    selectors: ['main span.entry-author-name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    transforms: {},
    // FIX: was `[['p[style]']]` — a needlessly nested array, unlike every
    // other extractor's flat clean list. It only worked because joining
    // the selectors flattens the inner array; use the flat form directly.
    clean: ['p[style]']
  }
};
// Custom extractor for www.cbssports.com.
var WwwCbssportsComExtractor = {
  domain: 'www.cbssports.com',
  title: { selectors: ['.article-headline'] },
  author: { selectors: ['.author-name'] },
  date_published: {
    selectors: [['.date-original-reading-time time', 'datetime']],
    timezone: 'UTC'
  },
  dek: { selectors: ['.article-subline'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.msnbc.com.
var WwwMsnbcComExtractor = {
  domain: 'www.msnbc.com',
  title: { selectors: ['h1', 'h1.is-title-pane'] },
  author: { selectors: ['.author'] },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']]
  },
  dek: { selectors: [['meta[name="description"]', 'value']] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.pane-node-body'],
    transforms: {
      // Re-use the lead image selector/attribute pair to prepend the
      // article's lead image into the body.
      '.pane-node-body': function paneNodeBody($node, $) {
        var pair = WwwMsnbcComExtractor.lead_image_url.selectors[0];
        var selector = pair[0];
        var attr = pair[1];
        var src = $(selector).attr(attr);
        if (src) {
          $node.prepend("<img src=\"".concat(src, "\" />"));
        }
      }
    },
    clean: []
  }
};
// Custom extractor for www.thepoliticalinsider.com.
var WwwThepoliticalinsiderComExtractor = {
  domain: 'www.thepoliticalinsider.com',
  title: { selectors: [['meta[name="sailthru.title"]', 'value']] },
  author: { selectors: [['meta[name="sailthru.author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  dek: { selectors: [] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div#article-body'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.mentalfloss.com.
var WwwMentalflossComExtractor = {
  domain: 'www.mentalfloss.com',
  title: { selectors: ['h1.title', '.title-group', '.inner'] },
  author: { selectors: ['.field-name-field-enhanced-authors'] },
  date_published: {
    selectors: ['.date-display-single'],
    timezone: 'America/New_York'
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.field.field-name-body'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for abcnews.go.com.
var AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',
  title: { selectors: ['.article-header h1'] },
  author: {
    selectors: ['.authors'],
    // Strip overlay / "By" chrome from the byline text.
    clean: ['.author-overlay', '.by-text']
  },
  date_published: {
    selectors: ['.timestamp'],
    timezone: 'America/New_York'
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['.article-copy'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.nydailynews.com.
var WwwNydailynewsComExtractor = {
  domain: 'www.nydailynews.com',
  title: { selectors: ['h1#ra-headline'] },
  author: { selectors: [['meta[name="parsely-author"]', 'value']] },
  date_published: { selectors: [['meta[name="sailthru.date"]', 'value']] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['article#ra-body'],
    transforms: {},
    // Strip tags, related links, editor credit, and share widgets.
    clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom']
  }
};
// Custom extractor for www.cnbc.com.
var WwwCnbcComExtractor = {
  domain: 'www.cnbc.com',
  title: { selectors: ['h1.title', 'h1.ArticleHeader-headline'] },
  author: { selectors: [['meta[name="author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    // Covers legacy and current article templates.
    selectors: ['div#article_body.content', 'div.story', 'div.ArticleBody-articleBody'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.popsugar.com.
var WwwPopsugarComExtractor = {
  domain: 'www.popsugar.com',
  // NOTE(review): 'title-text' looks like it may be missing a leading
  // '.' (class selector) — kept as-is to preserve behavior; verify
  // against live markup before changing.
  title: { selectors: ['h2.post-title', 'title-text'] },
  author: { selectors: [['meta[name="article:author"]', 'value']] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['#content'],
    transforms: {},
    clean: ['.share-copy-title', '.post-tags', '.reactions']
  }
};
// Custom extractor for observer.com.
var ObserverComExtractor = {
  domain: 'observer.com',
  title: { selectors: ['h1.entry-title'] },
  author: { selectors: ['.author', '.vcard'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: { selectors: ['h2.dek'] },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.entry-content'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for people.com.
var PeopleComExtractor = {
  domain: 'people.com',
  title: { selectors: [['meta[name="og:title"]', 'value']] },
  author: { selectors: ['a.author.url.fn'] },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.article-body__inner'],
    transforms: {},
    clean: []
  }
};
// Custom extractor for www.usmagazine.com.
var WwwUsmagazineComExtractor = {
  domain: 'www.usmagazine.com',
  title: { selectors: ['header h1'] },
  author: { selectors: ['a.article-byline.tracked-offpage'] },
  date_published: {
    timezone: 'America/New_York',
    selectors: ['time.article-published-date']
  },
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },
  content: {
    selectors: ['div.article-body-inner'],
    transforms: {},
    clean: ['.module-related']
  }
};
// Mercury extractor config for www.rollingstone.com. A nested array content
// selector (e.g. ['.lead-container', '.article-content']) requires all listed
// selectors to match and merges their matches into one content block.
var WwwRollingstoneComExtractor = {
  domain: 'www.rollingstone.com',
  title: {
    selectors: ['h1.content-title']
  },
  author: {
    selectors: ['a.content-author.tracked-offpage']
  },
  date_published: {
    selectors: ['time.content-published-date'],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.content-description']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.lead-container', '.article-content'], '.article-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.module-related']
  }
};
// Mercury extractor config for 247sports.com (identifier is spelled out
// because a JS identifier cannot start with a digit).
var twofortysevensportsComExtractor = {
  domain: '247sports.com',
  title: {
    selectors: ['title', 'article header h1']
  },
  author: {
    selectors: ['.author']
  },
  date_published: {
    selectors: [['time[data-published]', 'data-published']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.body.article'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for uproxx.com. String-valued transforms rename the
// matched element to the given tag (div.image -> figure, credit -> figcaption).
var UproxxComExtractor = {
  domain: 'uproxx.com',
  title: {
    selectors: ['div.post-top h1']
  },
  author: {
    selectors: ['.post-top .authorname']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.post-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.eonline.com. The content selector combines
// article sections and inline image blocks into a single result.
var WwwEonlineComExtractor = {
  domain: 'www.eonline.com',
  title: {
    selectors: ['h1.article__title']
  },
  author: {
    selectors: ['.entry-meta__author a']
  },
  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.post-content section, .post-content div.post-content__image']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      'div.post-content__image': 'figure',
      'div.post-content__image .image__credits': 'figcaption'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.miamiherald.com (no author selector — the
// site's byline markup was presumably not extractable when this was written).
var WwwMiamiheraldComExtractor = {
  domain: 'www.miamiherald.com',
  title: {
    selectors: ['h1.title']
  },
  date_published: {
    selectors: ['p.published-date'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.dateline-storybody'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.refinery29.com.
var WwwRefinery29ComExtractor = {
  domain: 'www.refinery29.com',
  title: {
    selectors: ['h1.title']
  },
  author: {
    selectors: ['.contributor']
  },
  date_published: {
    selectors: [['meta[name="sailthru.date"]', 'value']],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.full-width-opener', '.article-content'], '.article-content', '.body'],
    // Transforms applied to the matched content before cleanup. String values
    // rename the element to that tag.
    transforms: {
      // Lazy-loaded images keep their real markup inside a <noscript> tag;
      // swap the whole '.loading' placeholder for that markup.
      'div.loading noscript': function divLoadingNoscript($node) {
        $node.parents('.loading').replaceWith($node.html());
      },
      '.section-image': 'figure',
      '.section-image .content-caption': 'figcaption',
      '.section-text': 'p'
    },
    // Anything matching these selectors is stripped from the result.
    clean: ['.story-share']
  }
};
// Mercury extractor config for www.macrumors.com. The byline is a free-form
// date string, so an explicit moment-style format and timezone are supplied.
var WwwMacrumorsComExtractor = {
  domain: 'www.macrumors.com',
  title: {
    selectors: ['h1', 'h1.title']
  },
  author: {
    selectors: ['.author-url']
  },
  date_published: {
    selectors: ['.article .byline'],
    // Wednesday January 18, 2017 11:44 am PST
    format: 'dddd MMMM D, YYYY h:mm A zz',
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.androidcentral.com. The lead image comes
// from the page's own large image element rather than og:image.
var WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',
  title: {
    selectors: ['h1', 'h1.main-title']
  },
  author: {
    selectors: ['.meta-by']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['.image-large', 'src']]
  },
  content: {
    selectors: ['.article-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.intro', 'blockquote']
  }
};
// Mercury extractor config for www.si.com.
var WwwSiComExtractor = {
  domain: 'www.si.com',
  title: {
    selectors: ['h1', 'h1.headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: ['.timestamp'],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.quick-hit ul']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['p', '.marquee_large_2x', '.component.image']],
    // Transforms applied to the matched content before cleanup.
    transforms: {
      // A <noscript> wrapping exactly one <img> is a lazy-load fallback:
      // rename it to <figure> so the image is kept; otherwise leave it alone.
      noscript: function noscript($node) {
        var kids = $node.children();
        var wrapsOneImage = kids.length === 1 && kids.get(0).tagName === 'img';
        return wrapsOneImage ? 'figure' : null;
      }
    },
    // Anything matching these selectors is stripped from the result.
    clean: [['.inline-thumb', '.primary-message', '.description', '.instructions']]
  }
};
// Mercury extractor config for www.rawstory.com. Author and date share the
// same byline block, distinguished by link position.
var WwwRawstoryComExtractor = {
  domain: 'www.rawstory.com',
  title: {
    selectors: ['.blog-title']
  },
  author: {
    selectors: ['.blog-author a:first-of-type']
  },
  date_published: {
    selectors: ['.blog-author a:last-of-type'],
    timezone: 'EST'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.blog-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.cnet.com.
var WwwCnetComExtractor = {
  domain: 'www.cnet.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['a.author']
  },
  date_published: {
    selectors: ['time'],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: ['.article-dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['img.__image-lead__', '.article-main-body'], '.article-main-body'],
    // Transforms applied to the matched content before cleanup.
    transforms: {
      // Pull the image out of its container, size it to full width, and tag
      // it with the marker class the content selector above looks for.
      'figure.image': function figureImage($node) {
        var $img = $node.find('img');
        $img.attr('width', '100%').attr('height', '100%').addClass('__image-lead__');
        $node.remove('.imgContainer').prepend($img);
      }
    },
    // Anything matching these selectors is stripped from the result.
    clean: []
  }
};
// Mercury extractor config for www.cinemablend.com.
var WwwCinemablendComExtractor = {
  domain: 'www.cinemablend.com',
  title: {
    selectors: ['.story_title']
  },
  author: {
    selectors: ['.author']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
    timezone: 'EST'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div#wrap_left_content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.today.com.
var WwwTodayComExtractor = {
  domain: 'www.today.com',
  title: {
    selectors: ['h1.entry-headline']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-container'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.label-comment']
  }
};
// Mercury extractor config for www.howtogeek.com. The publish date lives in a
// list next to the author info box, hence the sibling selector.
var WwwHowtogeekComExtractor = {
  domain: 'www.howtogeek.com',
  title: {
    selectors: ['title']
  },
  author: {
    selectors: ['#authorinfobox a']
  },
  date_published: {
    selectors: ['#authorinfobox + div li'],
    timezone: 'GMT'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.thecontent'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.al.com. All metadata comes from the site's
// custom <meta> tags rather than page markup.
var WwwAlComExtractor = {
  domain: 'www.al.com',
  title: {
    selectors: [['meta[name="title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.thepennyhoarder.com. The author comes from
// the rel="author" <link> tag's title attribute.
var WwwThepennyhoarderComExtractor = {
  domain: 'www.thepennyhoarder.com',
  title: {
    selectors: [['meta[name="dcterms.title"]', 'value']]
  },
  author: {
    selectors: [['link[rel="author"]', 'title']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.post-img', '.post-text'], '.post-text'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.westernjournalism.com. The content div has
// no stable class of its own, so it is located relative to the share bar.
var WwwWesternjournalismComExtractor = {
  domain: 'www.westernjournalism.com',
  title: {
    selectors: ['title', 'h1.entry-title']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="DC.date.issued"]', 'value']]
  },
  dek: {
    selectors: ['.subtitle']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article-sharing.top + div'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.ad-notice-small']
  }
};
// Mercury extractor config for fusion.net. The dek selector list is an empty
// placeholder, so the generic extractor's dek handling applies.
var FusionNetExtractor = {
  domain: 'fusion.net',
  title: {
    selectors: ['.post-title', '.single-title', '.headline']
  },
  author: {
    selectors: ['.show-for-medium .byline']
  },
  date_published: {
    selectors: [['time.local-time', 'datetime']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.post-featured-media', '.article-content'], '.article-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      '.fusion-youtube-oembed': 'figure'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.americanow.com (empty dek selector list is
// a placeholder; generic dek extraction applies).
var WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',
  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']]
  },
  author: {
    selectors: ['.byline']
  },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.article-video-wrapper', '.show-for-small-only']
  }
};
// Mercury extractor config for sciencefly.com. Lead image and content come
// from the site's slider widget.
var ScienceflyComExtractor = {
  domain: 'sciencefly.com',
  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title']
  },
  author: {
    selectors: ['div.cb-author', 'div.cb-author-title']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']]
  },
  content: {
    selectors: ['div.theiaPostSlider_slides'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for hellogiggles.com.
var HellogigglesComExtractor = {
  domain: 'hellogiggles.com',
  title: {
    selectors: ['.title']
  },
  author: {
    selectors: ['.author-link']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for thoughtcatalog.com. The long author selector
// pins down the inline byline variant of the site's header markup.
var ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',
  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name', 'h1.writer-name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry.post'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.tc_mark']
  }
};
// Mercury extractor config for www.nj.com (same Advance Local meta-tag scheme
// as www.al.com, but with an explicit IANA timezone).
var WwwNjComExtractor = {
  domain: 'www.nj.com',
  title: {
    selectors: [['meta[name="title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="article_author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.inquisitr.com.
var WwwInquisitrComExtractor = {
  domain: 'www.inquisitr.com',
  title: {
    selectors: ['h1.entry-title.story--header--title']
  },
  author: {
    selectors: ['div.story--header--author']
  },
  date_published: {
    selectors: [['meta[name="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // The fallback selector was '.entry-content.' — the trailing dot is not
    // valid CSS and would make css-select throw whenever 'article.story'
    // failed to match; dropped the stray dot.
    selectors: ['article.story', '.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.post-category', '.story--header--socials', '.story--header--content']
  }
};
// Mercury extractor config for www.nbcnews.com. The date tries the timestamp's
// datetime attribute first, then falls back to the <time> element's text.
var WwwNbcnewsComExtractor = {
  domain: 'www.nbcnews.com',
  title: {
    selectors: ['div.article-hed h1']
  },
  author: {
    selectors: ['span.byline_author']
  },
  date_published: {
    selectors: [['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], '.flag_article-wrapper time'],
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.article-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for fortune.com.
// NOTE(review): '.MblGHNMJ' looks like a generated (CSS-modules style) class
// name and is likely to break when the site redeploys — verify periodically.
var FortuneComExtractor = {
  domain: 'fortune.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: ['.MblGHNMJ'],
    timezone: 'UTC'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['picture', 'article.row'], 'article.row'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.linkedin.com articles.
var WwwLinkedinComExtractor = {
  domain: 'www.linkedin.com',
  title: {
    selectors: ['.article-title', 'h1']
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value'], '.entity-name a[rel=author]']
  },
  date_published: {
    selectors: [['time[itemprop="datePublished"]', 'datetime']],
    timezone: 'America/Los_Angeles'
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['header figure', '.prose'], '.prose'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.entity-image']
  }
};
// Mercury extractor config for obamawhitehouse.archives.gov; also handles
// whitehouse.gov via supportedDomains. defaultCleaner: false skips the
// generic content cleaner, keeping the page markup as selected.
var ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',
  supportedDomains: ['whitehouse.gov'],
  title: {
    selectors: ['h1', '.pane-node-title']
  },
  author: {
    selectors: ['.blog-author-link', '.node-person-name-link']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.field-name-field-forall-summary']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    defaultCleaner: false,
    selectors: ['div#content-start', '.pane-node-field-forall-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.pane-node-title', '.pane-custom.pane-1']
  }
};
// Mercury extractor config for www.opposingviews.com.
var WwwOpposingviewsComExtractor = {
  domain: 'www.opposingviews.com',
  title: {
    selectors: ['h1.title']
  },
  author: {
    selectors: ['div.date span span a']
  },
  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.show-for-small-only']
  }
};
// Mercury extractor config for www.prospectmagazine.co.uk.
var WwwProspectmagazineCoUkExtractor = {
  domain: 'www.prospectmagazine.co.uk',
  title: {
    selectors: ['.page-title']
  },
  author: {
    selectors: ['.aside_author .title']
  },
  date_published: {
    selectors: ['.post-info'],
    timezone: 'Europe/London'
  },
  dek: {
    selectors: ['.page-subtitle']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article .post_content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for forward.com. Content combines the post's media
// wrapper with its paragraphs into one block.
var ForwardComExtractor = {
  domain: 'forward.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.post-item-media-wrap', '.post-item p']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.donate-box', '.message', '.subtitle']
  }
};
// Mercury extractor config for www.qdaily.com.
var WwwQdailyComExtractor = {
  domain: 'www.qdaily.com',
  title: {
    selectors: ['h2', 'h2.title']
  },
  author: {
    selectors: ['.name']
  },
  date_published: {
    selectors: [['.date.smart-date', 'data-origindate']]
  },
  dek: {
    selectors: ['.excerpt']
  },
  lead_image_url: {
    selectors: [['.article-detail-hd img', 'src']]
  },
  content: {
    selectors: ['.detail'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    // NOTE(review): '.lazylad' and '.lazylood' read like typos of '.lazyload',
    // but they may match real class names on qdaily — confirm before changing.
    clean: ['.lazyload', '.lazylad', '.lazylood']
  }
};
// Mercury extractor config for gothamist.com and its sibling "-ist" city
// sites, which share the same templates (see supportedDomains).
var GothamistComExtractor = {
  domain: 'gothamist.com',
  supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
  title: {
    selectors: ['h1', '.entry-header h1']
  },
  author: {
    selectors: ['.author']
  },
  date_published: {
    selectors: ['abbr', 'abbr.published'],
    timezone: 'America/New_York'
  },
  dek: {
    // NOTE(review): a [null] selector list is unusual — presumably a
    // placeholder meaning "no dek on these sites"; confirm against the
    // extractor runner's handling of null entries.
    selectors: [null]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.entry-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      'div.image-none': 'figure',
      '.image-none i': 'figcaption',
      'div.image-left': 'figure',
      '.image-left i': 'figcaption',
      'div.image-right': 'figure',
      '.image-right i': 'figcaption'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.image-none br', '.image-left br', '.image-right br', '.galleryEase']
  }
};
// Mercury extractor config for www.fool.com.
var WwwFoolComExtractor = {
  domain: 'www.fool.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.author-inline .author-name']
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']]
  },
  dek: {
    selectors: ['header h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.article-content'],
    // Transforms applied to the matched content before cleanup.
    transforms: {
      // Rebuild each captioned image as <figure><img/></figure>, replacing
      // the original caption wrapper; the caption itself becomes figcaption.
      '.caption img': function captionImg($node) {
        var src = $node.attr('src');
        var figureHtml = "<figure><img src=\"".concat(src, "\"/></figure>");
        $node.parent().replaceWith(figureHtml);
      },
      '.caption': 'figcaption'
    },
    // Anything matching these selectors is stripped from the result.
    clean: ['#pitch']
  }
};
// Mercury extractor config for www.slate.com.
var WwwSlateComExtractor = {
  domain: 'www.slate.com',
  title: {
    selectors: ['.hed', 'h1']
  },
  author: {
    selectors: ['a[rel=author]']
  },
  date_published: {
    selectors: ['.pub-date'],
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.dek']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment']
  }
};
// Mercury extractor config for ici.radio-canada.ca. The site's meta date uses
// a nonstandard "YYYY-MM-DD|HHhMM" form, hence the explicit parse format.
var IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: [['meta[name="dc.creator"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    format: 'YYYY-MM-DD|HH[h]mm',
    timezone: 'America/New_York'
  },
  dek: {
    selectors: ['.bunker-component.lead']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['.main-multimedia-item', '.news-story-content']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Mercury extractor config for www.fortinet.com (AEM-based blog layout).
var WwwFortinetComExtractor = {
  domain: 'www.fortinet.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.b15-blog-meta__author']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12'],
    transforms: {
      // A <noscript> holding exactly one <img> is a lazy-load fallback;
      // rename it to <figure> so the image survives. Otherwise no change.
      noscript: function noscript($node) {
        var kids = $node.children();
        if (kids.length !== 1) {
          return null;
        }
        return kids.get(0).tagName === 'img' ? 'figure' : null;
      }
    }
  }
};
// Mercury extractor config for www.fastcompany.com. No transforms or clean
// list — the selected content is used as-is after generic cleaning.
var WwwFastcompanyComExtractor = {
  domain: 'www.fastcompany.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['.post__by']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: ['.post__deck']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.post__article']
  }
};
// Mercury extractor config for blisterreview.com (Elementor-built WordPress
// site; the content selector gathers text, list items, images and captions).
var BlisterreviewComExtractor = {
  domain: 'blisterreview.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title']
  },
  author: {
    selectors: ['span.author-name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value'], ['time.entry-date', 'datetime'], ['meta[itemprop="datePublished"]', 'content']]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value'], ['meta[property="og:image"]', 'content'], ['meta[itemprop="image"]', 'content'], ['meta[name="twitter:image"]', 'content'], ['img.attachment-large', 'src']]
  },
  content: {
    selectors: [['.elementor-section-wrap', '.elementor-text-editor > p, .elementor-text-editor > ul > li, .attachment-large, .wp-caption-text']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      figcaption: 'p'
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.comments-area']
  }
};
// Mercury extractor config for news.mynavi.jp.
var NewsMynaviJpExtractor = {
  domain: 'news.mynavi.jp',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['main div.article-author a.article-author__name']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['main article div'],
    // Transforms applied to the matched content before cleanup.
    transforms: {
      // Lazy-loaded images carry their real URL in data-original; copy it
      // into src so images render without the site's loader script.
      img: function img($node) {
        var lazySrc = $node.attr('data-original');
        if (lazySrc === '') {
          return;
        }
        $node.attr('src', lazySrc);
      }
    },
    // Anything matching these selectors is stripped from the result.
    clean: []
  }
};
// Mercury extractor config for clinicaltrials.gov. Registry entries have no
// byline, so the sponsor org is used as the author and the "Last Update
// Posted" field as the publication date.
var ClinicaltrialsGovExtractor = {
  domain: 'clinicaltrials.gov',
  title: {
    selectors: ['h1.tr-solo_record']
  },
  author: {
    selectors: ['div#sponsor.tr-info-text']
  },
  date_published: {
    // selectors: ['span.term[data-term="Last Update Posted"]'],
    selectors: ['div:has(> span.term[data-term="Last Update Posted"])']
  },
  content: {
    selectors: ['div#tab-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.usa-alert> img']
  }
};
// Mercury extractor config for github.com repository pages: the README is the
// content, the repo description is the dek, and the last-modified time stands
// in for a publication date. No author selector — repos have no single author.
var GithubComExtractor = {
  domain: 'github.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [// enter author selectors
    ]
  },
  date_published: {
    selectors: [['span[itemprop="dateModified"] relative-time', 'datetime']]
  },
  dek: {
    selectors: ['span[itemprop="about"]']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: [['#readme article']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Extractor configuration for www.reddit.com posts.
var WwwRedditComExtractor = {
  domain: 'www.reddit.com',
  title: {
    selectors: ['div[data-test-id="post-content"] h2']
  },
  author: {
    selectors: ['div[data-test-id="post-content"] a[href*="user/"]']
  },
  date_published: {
    selectors: ['div[data-test-id="post-content"] a[data-click-id="timestamp"]']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // Ordered fallbacks: text post; external link with media preview;
    // embedded media; bare external link; finally the whole post container.
    selectors: [['div[data-test-id="post-content"] p'], // text post
    ['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
    'div[data-test-id="post-content"] div[data-click-id="media"]'], // external link with media preview (YouTube, imgur album, etc...)
    ['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
    ['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])'], // external link
    'div[data-test-id="post-content"]'],
    transforms: {
      // External link image preview: the preview image lives in the CSS
      // background-image of the container; copy its URL onto the inner <img>
      // so the image survives extraction.
      'div[role="img"]': function divRoleImg($node) {
        var $img = $node.find('img');
        var bgImg = $node.css('background-image');
        // FIX: match() returns null when the value has no "(...)" part
        // (e.g. "none"); the previous code dereferenced the match result
        // unconditionally and threw a TypeError in that case.
        var bgMatch = bgImg ? bgImg.match(/\((.*?)\)/) : null;
        if ($img.length === 1 && bgMatch) {
          $img.attr('src', bgMatch[1].replace(/('|")/g, ''));
          return $img;
        }
        return $node;
      }
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.icon']
  }
};
// Extractor configuration for otrs.com articles.
var OtrsComExtractor = {
domain: 'otrs.com',
title: {
selectors: ['#main article h1']
},
author: {
selectors: ['div.dateplusauthor a']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#main article'],
// defaultCleaner: false — skip the generic content cleaner for this site.
defaultCleaner: false,
transforms: {},
clean: ['div.dateplusauthor', 'div.gr-12.push-6.footershare', '#atftbx', 'div.category-modul']
}
};
// Extractor configuration for www.ossnews.jp (Japanese; dates parsed with
// an explicit format string and JST timezone).
var WwwOssnewsJpExtractor = {
domain: 'www.ossnews.jp',
title: {
selectors: ['#alpha-block h1.hxnewstitle']
},
author: null,
date_published: {
selectors: ['p.fs12'],
format: 'YYYY年MM月DD日 HH:mm',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#alpha-block .section:has(h1.hxnewstitle)'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for buzzap.jp.
var BuzzapJpExtractor = {
domain: 'buzzap.jp',
title: {
selectors: ['h1.entry-title']
},
author: null,
date_published: {
selectors: [['time.entry-date', 'datetime']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.ctiframe'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for www.asahi.com news articles.
var WwwAsahiComExtractor = {
domain: 'www.asahi.com',
title: {
selectors: ['.ArticleTitle h1']
},
author: {
selectors: [['meta[name="article:author"]', 'value']]
},
date_published: {
selectors: [['meta[name="pubdate"]', 'value']]
},
dek: null,
excerpt: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#MainInner div.ArticleBody'],
defaultCleaner: false,
transforms: {},
clean: ['div.AdMod', 'div.LoginSelectArea']
}
};
// Extractor configuration for www.sanwa.co.jp news pages.
var WwwSanwaCoJpExtractor = {
domain: 'www.sanwa.co.jp',
title: {
selectors: ['#newsContent h1']
},
author: null,
date_published: {
selectors: ['p.date'],
format: 'YYYY.MM.DD',
timezone: 'Asia/Tokyo'
},
dek: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#newsContent'],
defaultCleaner: false,
transforms: {},
clean: ['#smartphone', 'div.sns_box', 'div.contentFoot']
}
};
// Extractor configuration for www.elecom.co.jp press-release pages.
var WwwElecomCoJpExtractor = {
  domain: 'www.elecom.co.jp',
  title: {
    selectors: ['title']
  },
  author: null,
  date_published: {
    selectors: ['p.section-last'],
    format: 'YYYY.MM.DD',
    timezone: 'Asia/Tokyo'
  },
  dek: null,
  lead_image_url: null,
  content: {
    selectors: ['td.TableMain2'],
    defaultCleaner: false,
    transforms: {
      // Let layout tables reflow in the reader view instead of keeping a
      // fixed pixel width.
      table: function normalizeTableWidth($table) {
        $table.attr('width', 'auto');
      }
    },
    clean: []
  }
};
// Extractor configuration for scan.netsecurity.ne.jp.
var ScanNetsecurityNeJpExtractor = {
domain: 'scan.netsecurity.ne.jp',
title: {
selectors: ['header.arti-header h1.head']
},
author: null,
date_published: {
selectors: [['meta[name="article:modified_time"]', 'value']]
},
dek: {
selectors: ['header.arti-header p.arti-summary']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.arti-content.arti-content--thumbnail'],
defaultCleaner: false,
transforms: {},
clean: ['aside.arti-giga']
}
};
// Extractor configuration for jvndb.jvn.jp vulnerability-list pages.
var JvndbJvnJpExtractor = {
domain: 'jvndb.jvn.jp',
title: {
selectors: ['title']
},
author: null,
date_published: {
selectors: ['div.modifytxt:nth-child(2)'],
format: 'YYYY/MM/DD',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: null,
content: {
selectors: ['#news-list'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for genius.com lyrics pages. The date and lead
// image come from a JSON blob embedded in a meta tag; the third element of
// each selector entry is a post-processing callback applied to the raw value.
var GeniusComExtractor = {
  domain: 'genius.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['h2 a']
  },
  date_published: {
    selectors: [['meta[itemprop=page_data]', 'value', function (res) {
      // Pull the release date out of the embedded page_data JSON.
      return JSON.parse(res).song.release_date;
    }]]
  },
  dek: {
    selectors: [// enter selectors
    ]
  },
  lead_image_url: {
    selectors: [['meta[itemprop=page_data]', 'value', function (res) {
      // Album art URL lives in the same embedded JSON blob.
      return JSON.parse(res).song.album.cover_art_url;
    }]]
  },
  content: {
    selectors: ['.lyrics'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Extractor configuration for www.jnsa.org.
var WwwJnsaOrgExtractor = {
domain: 'www.jnsa.org',
title: {
selectors: ['#wgtitle h2']
},
author: null,
date_published: null,
dek: null,
excerpt: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#main_area'],
transforms: {},
clean: ['#pankuzu', '#side']
}
};
// Extractor configuration for phpspot.org blog entries.
var PhpspotOrgExtractor = {
domain: 'phpspot.org',
title: {
selectors: ['h3.hl']
},
author: null,
date_published: {
selectors: ['h4.hl'],
format: 'YYYY年MM月DD日',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: null,
content: {
selectors: ['div.entrybody'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for www.infoq.com (Japanese edition date format).
var WwwInfoqComExtractor = {
domain: 'www.infoq.com',
title: {
selectors: ['h1.heading']
},
author: {
selectors: ['div.widget.article__authors']
},
date_published: {
selectors: ['.article__readTime.date'],
format: 'YYYY年MM月DD日',
timezone: 'Asia/Tokyo'
},
dek: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article__data'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for www.moongift.jp.
var WwwMoongiftJpExtractor = {
domain: 'www.moongift.jp',
title: {
selectors: ['h1.title a']
},
author: null,
date_published: {
selectors: ['ul.meta li:not(.social):first-of-type'],
timezone: 'Asia/Tokyo'
},
dek: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#main'],
transforms: {},
clean: ['ul.mg_service.cf']
}
};
// Extractor configuration for www.itmedia.co.jp; also covers the listed
// sibling ITmedia domains via supportedDomains.
var WwwItmediaCoJpExtractor = {
domain: 'www.itmedia.co.jp',
supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'],
title: {
selectors: ['#cmsTitle h1']
},
author: {
selectors: ['#byline']
},
date_published: {
selectors: [['meta[name="article:modified_time"]', 'value']]
},
dek: {
selectors: ['#cmsAbstract h2']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#cmsBody'],
defaultCleaner: false,
transforms: {},
clean: ['#snsSharebox']
}
};
// Extractor configuration for www.publickey1.jp.
var WwwPublickey1JpExtractor = {
domain: 'www.publickey1.jp',
title: {
selectors: ['h1']
},
author: {
selectors: ['#subcol p:has(img)']
},
date_published: {
selectors: ['div.pubdate'],
format: 'YYYY年MM月DD日',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#maincol'],
defaultCleaner: false,
transforms: {},
clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer']
}
};
// Extractor configuration for takagi-hiromitsu.jp.
var TakagihiromitsuJpExtractor = {
domain: 'takagi-hiromitsu.jp',
title: {
selectors: ['h3']
},
author: {
selectors: [['meta[name="author"]', 'value']]
},
date_published: {
selectors: [['meta[http-equiv="Last-Modified"]', 'value']]
},
dek: null,
lead_image_url: null,
content: {
selectors: ['div.body'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for bookwalker.jp book pages.
var BookwalkerJpExtractor = {
domain: 'bookwalker.jp',
title: {
selectors: ['h1.main-heading']
},
author: {
selectors: ['div.authors']
},
date_published: {
selectors: ['.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'],
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: [['div.main-info', 'div.main-cover-inner']],
defaultCleaner: false,
transforms: {},
clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass']
}
};
// Extractor configuration for www.yomiuri.co.jp.
var WwwYomiuriCoJpExtractor = {
domain: 'www.yomiuri.co.jp',
title: {
selectors: ['h1.title-article.c-article-title']
},
author: null,
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.p-main-contents'],
transforms: {},
clean: []
}
};
// Extractor configuration for japan.cnet.com.
var JapanCnetComExtractor = {
domain: 'japan.cnet.com',
title: {
selectors: ['.leaf-headline-ttl']
},
author: {
selectors: ['.writer']
},
date_published: {
selectors: ['.date'],
format: 'YYYY年MM月DD日 HH時mm分',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article_body'],
transforms: {},
clean: []
}
};
// Extractor configuration for deadline.com articles.
var DeadlineComExtractor = {
  domain: 'deadline.com',
  title: {
    selectors: ['h1']
  },
  author: {
    selectors: ['section.author h3']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'],
    transforms: {
      // Unwrap embedded tweets: swap the wrapper element for its inner markup.
      '.embed-twitter': function embedTwitter($wrapper) {
        $wrapper.replaceWith($wrapper.html());
      }
    },
    clean: []
  }
};
// Extractor configuration for www.gizmodo.jp.
var WwwGizmodoJpExtractor = {
  domain: 'www.gizmodo.jp',
  title: {
    selectors: ['h1.p-post-title']
  },
  author: {
    selectors: ['li.p-post-AssistAuthor']
  },
  date_published: {
    selectors: [['li.p-post-AssistTime time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.p-post'],
    transforms: {
      // Thumbnail images embed the real URL between "=%27" and "%27;";
      // strip that wrapper so src points at the image itself.
      'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) {
        var src = $node.attr('src');
        // FIX: attr() returns undefined when the attribute is absent, and
        // the previous code called String#replace on it unconditionally,
        // throwing a TypeError. Skip images without a src.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['h1.p-post-title', 'ul.p-post-Assist']
  }
};
// Extractor configuration for getnews.jp.
var GetnewsJpExtractor = {
domain: 'getnews.jp',
title: {
selectors: ['article h1']
},
author: {
selectors: ['span.prof']
},
date_published: {
selectors: [['ul.cattag-top time', 'datetime']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.post-bodycopy'],
transforms: {},
clean: []
}
};
// Extractor configuration for www.lifehacker.jp.
var WwwLifehackerJpExtractor = {
  domain: 'www.lifehacker.jp',
  title: {
    selectors: ['h1.lh-summary-title']
  },
  author: {
    selectors: ['p.lh-entryDetailInner--credit']
  },
  date_published: {
    selectors: [['div.lh-entryDetail-header time', 'datetime']]
  },
  dek: null,
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div.lh-entryDetail-body'],
    transforms: {
      // Lazy-loaded images embed the real URL between "=%27" and "%27;";
      // strip that wrapper so src points at the image itself.
      'img.lazyload': function imgLazyload($node) {
        var src = $node.attr('src');
        // FIX: attr() returns undefined when the attribute is absent, and
        // the previous code called String#replace on it unconditionally,
        // throwing a TypeError. Skip images without a src.
        if (src) {
          $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, ''));
        }
      }
    },
    clean: ['p.lh-entryDetailInner--credit']
  }
};
// Extractor configuration for sect.iij.ad.jp.
var SectIijAdJpExtractor = {
domain: 'sect.iij.ad.jp',
title: {
selectors: ['h3']
},
author: {
selectors: ['dl.entrydate dd']
},
date_published: {
selectors: ['dl.entrydate dd'],
format: 'YYYY年MM月DD日',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#article'],
transforms: {},
clean: ['dl.entrydate']
}
};
// Extractor configuration for www.oreilly.co.jp book pages.
var WwwOreillyCoJpExtractor = {
domain: 'www.oreilly.co.jp',
title: {
selectors: ['h3']
},
author: {
selectors: ['li[itemprop="author"]']
},
date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']],
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#content'],
defaultCleaner: false,
transforms: {},
clean: ['.social-tools']
}
};
// Extractor configuration for www.ipa.go.jp.
var WwwIpaGoJpExtractor = {
domain: 'www.ipa.go.jp',
title: {
selectors: ['h1']
},
author: null,
date_published: {
selectors: ['p.ipar_text_right'],
format: 'YYYY年M月D日',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: null,
content: {
selectors: ['#ipar_main'],
defaultCleaner: false,
transforms: {},
clean: ['p.ipar_text_right']
}
};
// Extractor configuration for weekly.ascii.jp.
var WeeklyAsciiJpExtractor = {
domain: 'weekly.ascii.jp',
title: {
selectors: ['h1[itemprop="headline"]']
},
author: {
selectors: ['p.author']
},
date_published: {
selectors: [['meta[name="odate"]', 'value']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article'],
transforms: {},
clean: []
}
};
// Extractor configuration for techlog.iij.ad.jp.
var TechlogIijAdJpExtractor = {
domain: 'techlog.iij.ad.jp',
title: {
selectors: ['h1.entry-title']
},
author: {
selectors: ['a[rel="author"]']
},
date_published: {
selectors: [['time.entry-date', 'datetime']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.entry-content'],
defaultCleaner: false,
transforms: {},
clean: []
}
};
// Extractor configuration for wired.jp.
var WiredJpExtractor = {
  domain: 'wired.jp',
  title: {
    selectors: ['h1.post-title']
  },
  author: {
    selectors: ['p[itemprop="author"]']
  },
  date_published: {
    selectors: [['time', 'datetime']]
  },
  dek: {
    selectors: ['.post-intro']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['article.article-detail'],
    transforms: {
      // Lazy-loaded images keep the real URL in data-original; resolve it
      // against the placeholder src and promote it.
      'img[data-original]': function imgDataOriginal($node) {
        var dataOriginal = $node.attr('data-original');
        var src = $node.attr('src');
        // FIX: url.resolve throws a TypeError on non-string input; the
        // previous code crashed when the img had no src attribute. Only
        // rewrite when both values are present.
        if (src && dataOriginal) {
          $node.attr('src', URL.resolve(src, dataOriginal));
        }
      }
    },
    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer']
  }
};
// Extractor configuration for japan.zdnet.com.
var JapanZdnetComExtractor = {
domain: 'japan.zdnet.com',
title: {
selectors: ['h1']
},
author: {
selectors: [['meta[name="cXenseParse:author"]', 'value']]
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article_body'],
transforms: {},
clean: []
}
};
// Extractor configuration for www.rbbtoday.com.
var WwwRbbtodayComExtractor = {
domain: 'www.rbbtoday.com',
title: {
selectors: ['h1']
},
author: {
selectors: ['.writer.writer-name']
},
date_published: {
selectors: [['header time', 'datetime']]
},
dek: {
selectors: ['.arti-summary']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.arti-content'],
transforms: {},
clean: ['.arti-giga']
}
};
// Extractor configuration for www.lemonde.fr.
var WwwLemondeFrExtractor = {
domain: 'www.lemonde.fr',
title: {
selectors: ['h1.article__title']
},
author: {
selectors: ['.author__name']
},
date_published: {
selectors: [['meta[name="og:article:published_time"]', 'value']]
},
dek: {
selectors: ['.article__desc']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.article__content'],
transforms: {},
clean: []
}
};
// Extractor configuration for www.phoronix.com.
var WwwPhoronixComExtractor = {
domain: 'www.phoronix.com',
title: {
selectors: ['article header']
},
author: {
selectors: ['.author a:first-child']
},
date_published: {
selectors: ['.author'],
// 1 June 2019 at 08:34 PM EDT
format: 'D MMMM YYYY at hh:mm',
timezone: 'America/New_York'
},
dek: null,
lead_image_url: null,
content: {
selectors: ['.content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
// Extractor configuration for pitchfork.com reviews; `extend` adds a
// custom "score" field to the standard result set.
var PitchforkComExtractor = {
domain: 'pitchfork.com',
title: {
selectors: ['title']
},
author: {
selectors: ['.authors-detail__display-name']
},
date_published: {
selectors: [['.pub-date', 'datetime']]
},
dek: {
selectors: ['.review-detail__abstract']
},
lead_image_url: {
selectors: [['.single-album-tombstone__art img', 'src']]
},
content: {
selectors: ['.review-detail__text']
},
extend: {
score: {
selectors: ['.score']
}
}
};
// Extractor configuration for biorxiv.org preprints (abstract only).
var BiorxivOrgExtractor = {
domain: 'biorxiv.org',
title: {
selectors: ['h1#page-title']
},
author: {
selectors: ['div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors']
},
content: {
selectors: ['div#abstract-1'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
// Extractor configuration for epaper.zeit.de. The string-valued transforms
// rename elements (e.g. p.title becomes an h1) rather than run a callback.
var EpaperZeitDeExtractor = {
domain: 'epaper.zeit.de',
title: {
selectors: ['p.title']
},
author: {
selectors: ['.article__author']
},
date_published: null,
excerpt: {
selectors: ['subtitle']
},
lead_image_url: null,
content: {
selectors: ['.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'p.title': 'h1',
'.article__author': 'p',
byline: 'p',
linkbox: 'p'
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['image-credits', 'box[type=citation]']
}
};
// Extractor configuration for www.ladbible.com.
var WwwLadbibleComExtractor = {
domain: 'www.ladbible.com',
title: {
selectors: ['h1']
},
author: {
selectors: ['[class*=Byline]']
},
date_published: {
selectors: ['time'],
timezone: 'Europe/London'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['[class*=ArticleContainer]'],
clean: ['time', 'source', 'a[href^="https://www.ladbible.com/"]', 'picture', '[class*=StyledCardBlock]']
}
};
// Extractor configuration for timesofindia.indiatimes.com; `extend` adds a
// custom "reporter" field taken from the byline.
var TimesofindiaIndiatimesComExtractor = {
domain: 'timesofindia.indiatimes.com',
title: {
selectors: ['h1']
},
extend: {
reporter: {
selectors: ['div.byline'],
transforms: {}
}
},
date_published: {
selectors: ['.byline'],
format: 'MMM D, YYYY, HH:mm z',
timezone: 'Asia/Kolkata'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.contentwrapper:has(section)'],
defaultCleaner: false,
clean: ['section', 'h1', '.byline', '.img_cptn']
}
};
// Extractor configuration for ma.ttias.be.
var MaTtiasBeExtractor = {
  domain: 'ma.ttias.be',
  title: {
    selectors: [['meta[name="twitter:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  content: {
    selectors: [['.content']],
    transforms: {
      h2: function h2($heading) {
        // The "id" attribute values would result in low scores and the
        // element being removed, so strip them.
        $heading.attr('id', null);
        // h1 elements get demoted to h2 downstream, so demote h2 to h3
        // by returning the replacement tag name.
        return 'h3';
      },
      h1: function h1($heading) {
        // Strip the id here too (same scoring problem as h2 above).
        $heading.attr('id', null);
        // A following h2 is dropped unless a paragraph precedes it; insert
        // an empty one (it is itself removed later because it is empty).
        $heading.after('<p></p>');
      },
      ul: function ul($list) {
        // Articles contain link lists that look like (but are not)
        // navigation; this class keeps them from being removed.
        $list.attr('class', 'entry-content-asset');
      }
    }
  }
};
// Extractor configuration for pastebin.com pastes. The string-valued
// transforms rename elements: the ordered-list paste body becomes a div of
// paragraphs.
var PastebinComExtractor = {
domain: 'pastebin.com',
title: {
selectors: ['h1']
},
author: {
selectors: ['.paste_box_line2 .t_us + a']
},
date_published: {
selectors: ['.paste_box_line2 .t_da + span'],
timezone: 'America/New_York'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#selectable .text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
ol: 'div',
li: 'p'
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
/* eslint-disable no-nested-ternary */
/* eslint-disable no-unused-expressions */
// Extractor configuration for www.abendblatt.de. Paywalled paragraphs are
// served character-obfuscated; the shared transform below decodes them.
var WwwAbendblattDeExtractor = {
  domain: 'www.abendblatt.de',
  title: {
    selectors: ['h2.article__header__headline']
  },
  author: {
    selectors: ['span.author-info__name-text']
  },
  date_published: {
    selectors: [['time.article__header__date', 'datetime']]
  },
  dek: {
    selectors: ["span[itemprop='description']"]
  },
  lead_image_url: {
    selectors: [["meta[name='og:image']", 'value']]
  },
  content: {
    selectors: ['div.article__body'],
    // The p and div transforms were previously two byte-identical copies of
    // the same deobfuscation routine; they now share one helper.
    transforms: (function () {
      // Decode an element marked .obfuscated: a few code points map to
      // punctuation (177 -> '%', 178 -> '!', 180 -> ';', 181 -> '='),
      // space/newline pass through, and every other code point above 33 is
      // shifted down by one. Returns null so the element name is unchanged.
      var deobfuscate = function deobfuscate($node) {
        if (!$node.hasClass('obfuscated')) return null;
        var decoded = '';
        var text = $node.text();
        for (var n = 0; n < text.length; n += 1) {
          var r = text.charCodeAt(n);
          if (r === 177) decoded += '%';
          else if (r === 178) decoded += '!';
          else if (r === 180) decoded += ';';
          else if (r === 181) decoded += '=';
          else if (r === 32) decoded += ' ';
          else if (r === 10) decoded += '\n';
          else if (r > 33) decoded += String.fromCharCode(r - 1);
        }
        $node.html(decoded);
        $node.removeClass('obfuscated');
        $node.addClass('deobfuscated');
        return null;
      };
      return {
        p: deobfuscate,
        div: deobfuscate
      };
    })(),
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
// Extractor configuration for www.gruene.de.
var WwwGrueneDeExtractor = {
domain: 'www.gruene.de',
title: {
selectors: ['header h1']
},
author: null,
date_published: null,
dek: null,
lead_image_url: {
selectors: [['meta[property="og:image"]', 'content']]
},
content: {
// selectors: ['section'],
selectors: [['section header', 'section h2', 'section p', 'section ol']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['figcaption', 'p[class]']
}
};
// Extractor configuration for www.engadget.com.
var WwwEngadgetComExtractor = {
domain: 'www.engadget.com',
title: {
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['a.th-meta[data-ylk*="subsec:author"]']
},
// Engadget stories have publish dates, but the only representation of them on the page
// is in a format like "2h ago". There are also these tags with blank values:
// <meta class="swiftype" name="published_at" data-type="date" value="">
date_published: {
selectors: [// enter selectors
]
},
dek: {
selectors: ['div[class*="o-title_mark"] div']
},
// Engadget stories do have lead images specified by an og:image meta tag, but selecting
// the value attribute of that tag fails. I believe the "&#x2111;" sequence of characters
// is triggering this inability to select the attribute value.
lead_image_url: {
selectors: [// enter selectors
]
},
content: {
selectors: [[// Some figures will be inside div.article-text, but some header figures/images
// will not.
'#page_body figure:not(div.article-text figure)', 'div.article-text']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
// Extractor configuration for arstechnica.com.
var ArstechnicaComExtractor = {
  domain: 'arstechnica.com',
  // Articles from this site are often paginated, but I was unable to write a CSS
  // selector to find the next page. On the last page, there will be a link with a CSS
  // selector indicating that the previous page is next. But the parser appears to find
  // the next page without this extractor finding it, as long as the fallback option is
  // left at its default value of true.
  title: {
    selectors: ['title']
  },
  author: {
    selectors: ['*[rel="author"] *[itemprop="name"]']
  },
  date_published: {
    selectors: [['.byline time', 'datetime']]
  },
  dek: {
    selectors: ['h2[itemprop="description"]']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      h2: function h2($heading) {
        // A significant h2 gets dropped by the parser unless a paragraph
        // precedes it. Inserting an empty paragraph protects the heading,
        // and the empty paragraph is itself removed later.
        $heading.before('<p></p>');
      }
    },
    clean: [// Remove enlarge links and separators inside image captions.
    'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
    // removed them.
    'figure.video', // Image galleries that do not work.
    '.gallery', 'aside', '.sidebar']
  }
};
// Extractor configuration for www.ndtv.com.
var WwwNdtvComExtractor = {
  domain: 'www.ndtv.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title']
  },
  author: {
    selectors: ['span[itemprop="author"] span[itemprop="name"]']
  },
  date_published: {
    selectors: [['span[itemprop="dateModified"]', 'content']]
  },
  dek: {
    selectors: ['h2']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['div[itemprop="articleBody"]'],
    transforms: {
      // This site puts a dateline in a 'b' above the first paragraph, and then somehow
      // blends it into the first paragraph with CSS. This transform moves the dateline
      // to the first paragraph.
      '.place_cont': function place_cont($node) {
        if (!$node.parents('p').length) {
          var $nextP = $node.next('p');
          // FIX: a cheerio selection is always a truthy object, so the
          // previous `if (nextSibling)` check never failed; test .length so
          // the dateline only moves when a following paragraph exists.
          if ($nextP.length) {
            $node.remove();
            $nextP.prepend($node);
          }
        }
      }
    },
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['.highlghts_Wdgt', '.ins_instory_dv_caption', 'input', '._world-wrapper .mt20']
  }
};
// Extractor configuration for www.spektrum.de.
var SpektrumExtractor = {
domain: 'www.spektrum.de',
title: {
selectors: ['.content__title']
},
author: {
selectors: ['.content__author__info__name']
},
date_published: {
selectors: ['.content__meta__date'],
timezone: 'Europe/Berlin'
},
dek: {
selectors: ['.content__intro']
},
lead_image_url: {
selectors: [// This is how the meta tag appears in the original source code.
['meta[name="og:image"]', 'value'], // This is how the meta tag appears in the DOM in Chrome.
// The selector is included here to make the code work within the browser as well.
['meta[property="og:image"]', 'content'], // This is the image that is shown on the page.
// It can be slightly cropped compared to the original in the meta tag.
'.image__article__top img']
},
content: {
selectors: ['article.content'],
clean: ['.breadcrumbs', '.hide-for-print', 'aside', 'header h2', '.image__article__top', '.content__author', '.copyright', '.callout-box']
}
};
// Extractor configuration for postlight.com (the project's own site).
var PostlightComExtractor = {
domain: 'postlight.com',
title: {
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: [['meta[name="parsely-author"]', 'value']]
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: {
selectors: ['h2.single-hero__abstract']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['article.body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['section.pl-post-link']
}
};
// Registry of every site-specific extractor bundled with the parser,
// keyed by extractor name. Frozen so the mapping cannot be mutated at
// runtime; the /*#__PURE__*/ annotation lets bundlers tree-shake it
// away when unused. Flattened into a domain lookup table just below.
var CustomExtractors = /*#__PURE__*/Object.freeze({
  BloggerExtractor: BloggerExtractor,
  NYMagExtractor: NYMagExtractor,
  WikipediaExtractor: WikipediaExtractor,
  TwitterExtractor: TwitterExtractor,
  NYTimesExtractor: NYTimesExtractor,
  TheAtlanticExtractor: TheAtlanticExtractor,
  NewYorkerExtractor: NewYorkerExtractor,
  WiredExtractor: WiredExtractor,
  MSNExtractor: MSNExtractor,
  YahooExtractor: YahooExtractor,
  BuzzfeedExtractor: BuzzfeedExtractor,
  WikiaExtractor: WikiaExtractor,
  LittleThingsExtractor: LittleThingsExtractor,
  PoliticoExtractor: PoliticoExtractor,
  DeadspinExtractor: DeadspinExtractor,
  BroadwayWorldExtractor: BroadwayWorldExtractor,
  ApartmentTherapyExtractor: ApartmentTherapyExtractor,
  MediumExtractor: MediumExtractor,
  WwwTmzComExtractor: WwwTmzComExtractor,
  WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
  WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
  NewrepublicComExtractor: NewrepublicComExtractor,
  MoneyCnnComExtractor: MoneyCnnComExtractor,
  WwwThevergeComExtractor: WwwThevergeComExtractor,
  WwwCnnComExtractor: WwwCnnComExtractor,
  WwwAolComExtractor: WwwAolComExtractor,
  WwwYoutubeComExtractor: WwwYoutubeComExtractor,
  WwwTheguardianComExtractor: WwwTheguardianComExtractor,
  WwwSbnationComExtractor: WwwSbnationComExtractor,
  WwwBloombergComExtractor: WwwBloombergComExtractor,
  WwwBustleComExtractor: WwwBustleComExtractor,
  WwwNprOrgExtractor: WwwNprOrgExtractor,
  WwwRecodeNetExtractor: WwwRecodeNetExtractor,
  QzComExtractor: QzComExtractor,
  WwwDmagazineComExtractor: WwwDmagazineComExtractor,
  WwwReutersComExtractor: WwwReutersComExtractor,
  MashableComExtractor: MashableComExtractor,
  WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
  WwwVoxComExtractor: WwwVoxComExtractor,
  NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
  WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
  WwwLatimesComExtractor: WwwLatimesComExtractor,
  PagesixComExtractor: PagesixComExtractor,
  ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
  WwwCbssportsComExtractor: WwwCbssportsComExtractor,
  WwwMsnbcComExtractor: WwwMsnbcComExtractor,
  WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
  WwwMentalflossComExtractor: WwwMentalflossComExtractor,
  AbcnewsGoComExtractor: AbcnewsGoComExtractor,
  WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
  WwwCnbcComExtractor: WwwCnbcComExtractor,
  WwwPopsugarComExtractor: WwwPopsugarComExtractor,
  ObserverComExtractor: ObserverComExtractor,
  PeopleComExtractor: PeopleComExtractor,
  WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
  WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
  twofortysevensportsComExtractor: twofortysevensportsComExtractor,
  UproxxComExtractor: UproxxComExtractor,
  WwwEonlineComExtractor: WwwEonlineComExtractor,
  WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
  WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
  WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
  WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
  WwwSiComExtractor: WwwSiComExtractor,
  WwwRawstoryComExtractor: WwwRawstoryComExtractor,
  WwwCnetComExtractor: WwwCnetComExtractor,
  WwwCinemablendComExtractor: WwwCinemablendComExtractor,
  WwwTodayComExtractor: WwwTodayComExtractor,
  WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
  WwwAlComExtractor: WwwAlComExtractor,
  WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
  WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
  FusionNetExtractor: FusionNetExtractor,
  WwwAmericanowComExtractor: WwwAmericanowComExtractor,
  ScienceflyComExtractor: ScienceflyComExtractor,
  HellogigglesComExtractor: HellogigglesComExtractor,
  ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
  WwwNjComExtractor: WwwNjComExtractor,
  WwwInquisitrComExtractor: WwwInquisitrComExtractor,
  WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
  FortuneComExtractor: FortuneComExtractor,
  WwwLinkedinComExtractor: WwwLinkedinComExtractor,
  ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
  WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
  WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
  ForwardComExtractor: ForwardComExtractor,
  WwwQdailyComExtractor: WwwQdailyComExtractor,
  GothamistComExtractor: GothamistComExtractor,
  WwwFoolComExtractor: WwwFoolComExtractor,
  WwwSlateComExtractor: WwwSlateComExtractor,
  IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
  WwwFortinetComExtractor: WwwFortinetComExtractor,
  WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
  BlisterreviewComExtractor: BlisterreviewComExtractor,
  NewsMynaviJpExtractor: NewsMynaviJpExtractor,
  ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor,
  GithubComExtractor: GithubComExtractor,
  WwwRedditComExtractor: WwwRedditComExtractor,
  OtrsComExtractor: OtrsComExtractor,
  WwwOssnewsJpExtractor: WwwOssnewsJpExtractor,
  BuzzapJpExtractor: BuzzapJpExtractor,
  WwwAsahiComExtractor: WwwAsahiComExtractor,
  WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor,
  WwwElecomCoJpExtractor: WwwElecomCoJpExtractor,
  ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor,
  JvndbJvnJpExtractor: JvndbJvnJpExtractor,
  GeniusComExtractor: GeniusComExtractor,
  WwwJnsaOrgExtractor: WwwJnsaOrgExtractor,
  PhpspotOrgExtractor: PhpspotOrgExtractor,
  WwwInfoqComExtractor: WwwInfoqComExtractor,
  WwwMoongiftJpExtractor: WwwMoongiftJpExtractor,
  WwwItmediaCoJpExtractor: WwwItmediaCoJpExtractor,
  WwwPublickey1JpExtractor: WwwPublickey1JpExtractor,
  TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor,
  BookwalkerJpExtractor: BookwalkerJpExtractor,
  WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor,
  JapanCnetComExtractor: JapanCnetComExtractor,
  DeadlineComExtractor: DeadlineComExtractor,
  WwwGizmodoJpExtractor: WwwGizmodoJpExtractor,
  GetnewsJpExtractor: GetnewsJpExtractor,
  WwwLifehackerJpExtractor: WwwLifehackerJpExtractor,
  SectIijAdJpExtractor: SectIijAdJpExtractor,
  WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor,
  WwwIpaGoJpExtractor: WwwIpaGoJpExtractor,
  WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor,
  TechlogIijAdJpExtractor: TechlogIijAdJpExtractor,
  WiredJpExtractor: WiredJpExtractor,
  JapanZdnetComExtractor: JapanZdnetComExtractor,
  WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
  WwwLemondeFrExtractor: WwwLemondeFrExtractor,
  WwwPhoronixComExtractor: WwwPhoronixComExtractor,
  PitchforkComExtractor: PitchforkComExtractor,
  BiorxivOrgExtractor: BiorxivOrgExtractor,
  EpaperZeitDeExtractor: EpaperZeitDeExtractor,
  WwwLadbibleComExtractor: WwwLadbibleComExtractor,
  TimesofindiaIndiatimesComExtractor: TimesofindiaIndiatimesComExtractor,
  MaTtiasBeExtractor: MaTtiasBeExtractor,
  PastebinComExtractor: PastebinComExtractor,
  WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
  WwwGrueneDeExtractor: WwwGrueneDeExtractor,
  WwwEngadgetComExtractor: WwwEngadgetComExtractor,
  ArstechnicaComExtractor: ArstechnicaComExtractor,
  WwwNdtvComExtractor: WwwNdtvComExtractor,
  SpektrumExtractor: SpektrumExtractor,
  PostlightComExtractor: PostlightComExtractor
});
// Flatten CustomExtractors into a single lookup table used by the
// extractor-selection code. Each custom extractor is expanded via
// mergeSupportedDomains (defined elsewhere in this file; presumably it
// returns a { [domain]: extractor, ... } map covering the extractor's
// primary domain and any supportedDomains — confirm against its
// definition), and all of those maps are merged into one object.
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
  var extractor = CustomExtractors[key];
  return _objectSpread({}, acc, mergeSupportedDomains(extractor));
}, {});
// CLEAN AUTHOR CONSTANTS
// Strips a leading "by" / "posted by" / "written by" prefix; capture
// group 2 is the remaining author name.
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // CLEAN DEK CONSTANTS
// A plain-text http(s) link inside a dek disqualifies it.
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
// Numeric epoch timestamps: 13 digits = milliseconds, 10 = seconds.
var MS_DATE_STRING = /^\d{13}$/i;
var SEC_DATE_STRING = /^\d{10}$/i;
// Strips a leading "published:" label from a date string.
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Normalize meridian punctuation/spacing, e.g. "5pm" -> "5 pm".
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
// Relative time phrases: "now", "just now", "right now".
var TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;
var timeUnits = ['seconds?', 'minutes?', 'hours?', 'days?', 'weeks?', 'months?', 'years?'];
var allTimeUnits = timeUnits.join('|');
// Relative time phrases like "5 days ago"; group 1 = count, group 2 = unit.
var TIME_AGO_STRING = new RegExp("(\\d+)\\s+(".concat(allTimeUnits, ")\\s+ago"), 'i');
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
var timestamp3 = '-[0-9]{3,4}$';
// Extracts date-like fragments (times, dates, bare numbers, month names)
// from a noisy string; used by cleanDateString.
var SPLIT_DATE_STRING = new RegExp("(".concat(timestamp1, ")|(").concat(timestamp2, ")|(").concat(timestamp3, ")|([0-9]{1,4})|(").concat(allMonths, ")"), 'ig'); // 2016-11-22T08:57-500
// Check if datetime string has an offset at the end
var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
// NOTE: declared with /g, so RegExp.prototype.test() is stateful via
// lastIndex — callers must reset lastIndex before calling .test().
var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// Fix: the dots must be escaped. With the previous unescaped pattern
// ('.com$' etc.), '.' matched ANY character, so e.g. 'intercom' was
// mangled to 'inte'. Only literal '.com', '.net', '.org', '.co.uk'
// suffixes should be stripped.
var DOMAIN_ENDINGS_RE = new RegExp('\\.com$|\\.net$|\\.org$|\\.co\\.uk$', 'g');
// just the name(s): 'David Smith'.
// Strip a leading "By"/"Posted by"/"Written by" prefix from an author
// string, then trim and collapse whitespace.
function cleanAuthor(author) {
  var withoutPrefix = author.replace(CLEAN_AUTHOR_RE, '$2');
  return normalizeSpaces(withoutPrefix.trim());
}
// Validate a candidate lead-image URL: returns the trimmed URL when it
// is a well-formed web URI, otherwise null.
function clean$1(leadImageUrl) {
  var candidate = leadImageUrl.trim();
  return validUrl.isWebUri(candidate) ? candidate : null;
}
// Validate and normalize a candidate dek (article subtitle).
// Returns null when the dek isn't good enough to keep.
function cleanDek(dek, _ref) {
  var $ = _ref.$,
      excerpt = _ref.excerpt;
  // Reject deks that are implausibly short or long.
  if (dek.length > 1000 || dek.length < 5) return null;
  // Reject a dek that merely duplicates the excerpt.
  if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
  var plainDek = stripTags(dek, $);
  // A plain-text link in the dek means it's probably not a real dek.
  if (TEXT_LINK_RE.test(plainDek)) return null;
  return normalizeSpaces(plainDek.trim());
}
// Reduce a noisy date string to just its date-like fragments, normalize
// meridian punctuation ("p.m." -> "pm", "5pm" -> "5 pm"), and strip a
// leading "published:" label.
function cleanDateString(dateString) {
  var fragments = dateString.match(SPLIT_DATE_STRING) || [];
  var joined = fragments.join(' ');
  var normalized = joined
    .replace(TIME_MERIDIAN_DOTS_RE, 'm')
    .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
    .replace(CLEAN_DATE_STRING_RE, '$1');
  return normalized.trim();
}
// Build a moment instance from a date string, optionally in a given
// timezone and/or explicit format.
function createDate(dateString, timezone, format) {
  // Strings ending in a bare numeric offset (e.g. 2016-11-22T08:57-500)
  // parse more reliably through the native Date constructor.
  if (TIME_WITH_OFFSET_RE.test(dateString)) {
    return moment(new Date(dateString));
  }
  // Relative phrases like "5 days ago".
  var agoFragments = TIME_AGO_STRING.exec(dateString);
  if (agoFragments) {
    return moment().subtract(agoFragments[1], agoFragments[2]);
  }
  // "now" / "just now" / "right now".
  if (TIME_NOW_STRING.test(dateString)) {
    return moment();
  }
  // Otherwise parse with the supplied format, falling back to a
  // detected one; honor the timezone when provided.
  var fmt = format || parseFormat(dateString);
  if (timezone) {
    return moment.tz(dateString, fmt, timezone);
  }
  return moment(dateString, fmt);
} // Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
// Normalize a raw date-published string to an ISO-8601 string, or null
// when no valid date can be recovered.
function cleanDatePublished(dateString) {
  var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
      timezone = _ref.timezone,
      format = _ref.format;
  // Numeric epoch timestamps: 13 digits = milliseconds, 10 = seconds.
  if (MS_DATE_STRING.test(dateString)) {
    return new Date(_parseInt(dateString, 10)).toISOString();
  }
  if (SEC_DATE_STRING.test(dateString)) {
    return new Date(_parseInt(dateString, 10) * 1000).toISOString();
  }
  // First try the string as-is; if that fails, scrub it and retry.
  var date = createDate(dateString, timezone, format);
  if (!date.isValid()) {
    var scrubbed = cleanDateString(dateString);
    date = createDate(scrubbed, timezone, format);
  }
  return date.isValid() ? date.toISOString() : null;
}
// Clean an extracted article node in place and return it. The passes
// below run in a deliberate order (e.g. markToKeep must precede
// stripJunkTags). cleanConditionally/defaultCleaner gate the more
// aggressive passes; title feeds the header cleaner, url the link
// absolutizer.
function extractCleanNode(article, _ref) {
  var $ = _ref.$,
      _ref$cleanConditional = _ref.cleanConditionally,
      cleanConditionally = _ref$cleanConditional === void 0 ? true : _ref$cleanConditional,
      _ref$title = _ref.title,
      title = _ref$title === void 0 ? '' : _ref$title,
      _ref$url = _ref.url,
      url = _ref$url === void 0 ? '' : _ref$url,
      _ref$defaultCleaner = _ref.defaultCleaner,
      defaultCleaner = _ref$defaultCleaner === void 0 ? true : _ref$defaultCleaner;
  // Rewrite the tag name to div if it's a top level node like body or
  // html to avoid later complications with multiple body tags.
  rewriteTopLevel$$1(article, $); // Drop small images and spacer images
  // Only do this is defaultCleaner is set to true;
  // this can sometimes be too aggressive.
  if (defaultCleaner) cleanImages(article, $); // Make links absolute
  makeLinksAbsolute$$1(article, $, url); // Mark elements to keep that would normally be removed.
  // E.g., stripJunkTags will remove iframes, so we're going to mark
  // YouTube/Vimeo videos as elements we want to keep.
  markToKeep(article, $, url); // Drop certain tags like <title>, etc
  // This is -mostly- for cleanliness, not security.
  stripJunkTags(article, $); // H1 tags are typically the article title, which should be extracted
  // by the title extractor instead. If there's less than 3 of them (<3),
  // strip them. Otherwise, turn 'em into H2s.
  cleanHOnes$$1(article, $); // Clean headers
  cleanHeaders(article, $, title); // We used to clean UL's and OL's here, but it was leading to
  // too many in-article lists being removed. Consider a better
  // way to detect menus particularly and remove them.
  // Also optionally running, since it can be overly aggressive.
  if (defaultCleaner) cleanTags$$1(article, $, cleanConditionally); // Remove empty paragraph nodes
  removeEmpty(article, $); // Remove unnecessary attributes
  cleanAttributes$$1(article, $);
  return article;
}
// Clean an extracted title: resolve breadcrumb/site-name segments, fall
// back to the page's lone <h1> for absurdly long titles, and strip any
// remaining HTML tags.
function cleanTitle$$1(title, _ref) {
  var url = _ref.url,
      $ = _ref.$;
  // Fix: TITLE_SPLITTERS_RE is declared with the /g flag, and
  // RegExp.prototype.test() on a global regex is stateful — it advances
  // lastIndex, so every other call could falsely report "no splitter"
  // and skip resolveSplitTitle. Reset the cursor before testing.
  TITLE_SPLITTERS_RE.lastIndex = 0;
  // If title has |, :, or - in it, see if
  // we can clean it up.
  if (TITLE_SPLITTERS_RE.test(title)) {
    title = resolveSplitTitle(title, url);
  } // Final sanity check that we didn't get a crazy title.
  // if (title.length > 150 || title.length < 15) {
  if (title.length > 150) {
    // If we did, return h1 from the document if it exists
    var h1 = $('h1');
    if (h1.length === 1) {
      title = h1.text();
    }
  } // strip any html tags in the title text
  return normalizeSpaces(stripTags(title, $).trim());
}
// Handle heavily breadcrumbed titles, like:
//   The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
//   NYTimes - Blogs - Bits - The Best Gadgets on Earth
// Returns the probable headline, the original text, or null when the
// title isn't breadcrumbed enough for this heuristic to apply.
function extractBreadcrumbTitle(splitTitle, text) {
  // Only kicks in when the split produced many segments.
  if (splitTitle.length < 6) {
    return null;
  }
  // Count how often each segment (separators included) occurs.
  var termCounts = splitTitle.reduce(function (acc, titleText) {
    acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
    return acc;
  }, {});
  // Find the most frequent segment; ties keep the first one seen.
  var maxTerm = 0;
  var termCount = 0;
  Object.keys(termCounts).forEach(function (term) {
    if (termCount < termCounts[term]) {
      maxTerm = term;
      termCount = termCounts[term];
    }
  });
  // A short segment repeated 2+ times is almost certainly the breadcrumb
  // separator (" : ", " >> ", ...); re-split the raw text on it.
  // maxTerm should be <= 4 characters so " >> " matches but nothing longer.
  if (termCount >= 2 && maxTerm.length <= 4) {
    splitTitle = text.split(maxTerm);
  }
  // Prefer whichever end of the title is longer, if it's long enough to
  // look like a real headline. (The tail end is intentionally kept as a
  // one-element array, matching the original comparison semantics.)
  var splitEnds = [splitTitle[0], splitTitle.slice(-1)];
  var longestEnd = splitEnds.reduce(function (acc, end) {
    return acc.length > end.length ? acc : end;
  }, '');
  if (longestEnd.length > 10) {
    return longestEnd;
  }
  return text;
}
function cleanDomainFromTitle(splitTitle, url) {
  // Search the ends of the title, looking for bits that fuzzy match
  // the URL too closely. If one is found, discard it and return the
  // rest.
  //
  // Strip out the big TLDs - it just makes the matching a bit more
  // accurate. Not the end of the world if it doesn't strip right.
  var _URL$parse = URL.parse(url),
      host = _URL$parse.host;
  var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
  // NOTE(review): replace(' ', '') with a string pattern removes only
  // the FIRST space, so multi-word segments keep their later spaces —
  // confirm whether that is intended before changing it.
  var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
  var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
  if (startSlugRatio > 0.4 && startSlug.length > 5) {
    // slice(2) drops both the matched slug and the separator token that
    // follows it, since splitTitle preserves the splitters as entries.
    return splitTitle.slice(2).join('');
  }
  var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
  var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
  if (endSlugRatio > 0.4 && endSlug.length >= 5) {
    // slice(0, -2) likewise drops the trailing separator + slug pair.
    return splitTitle.slice(0, -2).join('');
  }
  return null;
} // Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
// Given a title containing separators (colons, dashes, pipes), decide
// whether any segment (breadcrumbs, the site's domain) should be
// dropped. Falls back to the full title when no heuristic applies.
function resolveSplitTitle(title) {
  var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
  // Split while preserving the splitters themselves, like:
  // ['The New New York', ' - ', 'The Washington Post']
  var segments = title.split(TITLE_SPLITTERS_RE);
  if (segments.length === 1) {
    return title;
  }
  var breadcrumbed = extractBreadcrumbTitle(segments, title);
  if (breadcrumbed) return breadcrumbed;
  var withoutDomain = cleanDomainFromTitle(segments, url);
  if (withoutDomain) return withoutDomain;
  // The fuzzy heuristics found nothing; the title is probably legit.
  return title;
}
// Field-name → cleaner function map used by the extraction pipeline to
// post-process each extracted field before returning it.
var Cleaners = {
  author: cleanAuthor,
  lead_image_url: clean$1,
  dek: cleanDek,
  date_published: cleanDatePublished,
  content: extractCleanNode,
  title: cleanTitle$$1
};
// likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns a cheerio object $
// Find the node most likely to contain the article text: optionally
// strip unlikely candidates first, convert inline content to
// paragraphs, score everything, and return the top-scoring candidate
// as a cheerio object.
function extractBestNode($, opts) {
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates($);
  }
  $ = convertToParagraphs$$1($);
  $ = scoreContent$$1($, opts.weightNodes);
  return findTopCandidate$$1($);
}
// Generic, heuristic-driven content extractor. Starts with the most
// restrictive options and, on failure, relaxes them one at a time until
// the extracted node looks sufficient.
var GenericContentExtractor = {
  defaultOpts: {
    stripUnlikelyCandidates: true,
    weightNodes: true,
    cleanConditionally: true
  },
  // Extract the content for this resource - initially, pass in our
  // most restrictive opts which will return the highest quality
  // content. On each failure, retry with slightly more lax opts.
  //
  // :param return_type: string. If "node", should return the content
  // as a cheerio node rather than as an HTML string.
  //
  // Opts:
  // stripUnlikelyCandidates: Remove any elements that match
  // non-article-like criteria first.(Like, does this element
  // have a classname of "comment")
  //
  // weightNodes: Modify an elements score based on whether it has
  // certain classNames or IDs. Examples: Subtract if a node has
  // a className of 'comment', Add if a node has an ID of
  // 'entry-content'.
  //
  // cleanConditionally: Clean the node to return of some
  // superfluous content. Things like forms, ads, etc.
  extract: function extract(_ref, opts) {
    var $ = _ref.$,
        html = _ref.html,
        title = _ref.title,
        url = _ref.url;
    opts = _objectSpread({}, this.defaultOpts, opts);
    $ = $ || cheerio.load(html); // Cascade through our extraction-specific opts in an ordered fashion,
    // turning them off as we try to extract content.
    var node = this.getContentNode($, title, url, opts);
    if (nodeIsSufficient(node)) {
      return this.cleanAndReturnNode(node, $);
    } // We didn't succeed on first pass, one by one disable our
    // extraction opts and try again.
    // What follows is Babel-generated for...of boilerplate iterating the
    // option keys that are still enabled (=== true).
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator(_Reflect$ownKeys(opts).filter(function (k) {
        return opts[k] === true;
      })), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var key = _step.value;
        // Disable this option and re-extract from a freshly loaded DOM
        // (the previous pass mutated $).
        opts[key] = false;
        $ = cheerio.load(html);
        node = this.getContentNode($, title, url, opts);
        if (nodeIsSufficient(node)) {
          break;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    return this.cleanAndReturnNode(node, $);
  },
  // Get node given current options
  getContentNode: function getContentNode($, title, url, opts) {
    return extractCleanNode(extractBestNode($, opts), {
      $: $,
      cleanConditionally: opts.cleanConditionally,
      title: title,
      url: url
    });
  },
  // Once we got here, either we're at our last-resort node, or
  // we broke early. Make sure we at least have -something- before we
  // move forward. Serializes the node to an HTML string with
  // normalized whitespace.
  cleanAndReturnNode: function cleanAndReturnNode(node, $) {
    if (!node) {
      return null;
    }
    return normalizeSpaces($.html(node));
  }
};
// TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than <meta title>.
// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
// Meta tags strongly associated with the article title, most distinct
// first. All lowercase for cheap case-insensitive matching.
var STRONG_TITLE_META_TAGS = ['tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title'];
// og:title is weak because it typically bakes the source site's name
// into the value (gotta get that brand into facebook!).
var WEAK_TITLE_META_TAGS = ['og:title'];
// Likely title selectors, most explicit first. These are substring
// matches on className, not true CSS class matching — slightly less
// accurate, but faster, and close enough for our purposes.
var STRONG_TITLE_SELECTORS = [
  '.hentry .entry-title',
  'h1#articleHeader',
  'h1.articleHeader',
  'h1.article',
  '.instapaper_title',
  '#meebo-title'
];
var WEAK_TITLE_SELECTORS = [
  'article h1',
  '#entry-title',
  '.entry-title',
  '#entryTitle',
  '#entrytitle',
  '.entryTitle',
  '.entrytitle',
  '#articleTitle',
  '.articleTitle',
  'post post-title',
  'h1.title',
  'h2.article',
  'h1',
  'html head title',
  'title'
];
// Generic title extractor: tries title sources from most to least
// reliable — strong meta tags, strong selectors, weak meta tags, weak
// selectors — cleaning the first hit; empty string when nothing matches.
var GenericTitleExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // Run the shared title cleaner with this page's context.
    var finish = function finish(rawTitle) {
      return cleanTitle$$1(rawTitle, {
        url: url,
        $: $
      });
    };
    // First: meta tags strongly associated with the headline.
    var title = extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache);
    if (title) return finish(title);
    // Second: content selectors strongly associated with the headline.
    title = extractFromSelectors$$1($, STRONG_TITLE_SELECTORS);
    if (title) return finish(title);
    // Third: weaker meta tags.
    title = extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache);
    if (title) return finish(title);
    // Last: weaker selectors.
    title = extractFromSelectors$$1($, WEAK_TITLE_SELECTORS);
    if (title) return finish(title);
    // No matches at all.
    return '';
  }
};
// An ordered list of meta tag names that denote likely article authors. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
// Meta tags that likely carry the article author, most distinct first.
// All lowercase for cheap case-insensitive matching. "author" itself is
// excluded: it is too often the page's -developer-.
var AUTHOR_META_TAGS = ['byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors'];
// Reject absurdly long author strings.
var AUTHOR_MAX_LENGTH = 300;
// Likely author selectors, most explicit first. Substring className
// matching rather than true CSS class semantics — close enough, faster.
var AUTHOR_SELECTORS = [
  '.entry .entry-author',
  '.author.vcard .fn',
  '.author .vcard .fn',
  '.byline.vcard .fn',
  '.byline .vcard .fn',
  '.byline .by .author',
  '.byline .by',
  '.byline .author',
  '.post-author.vcard',
  '.post-author .vcard',
  'a[rel=author]',
  '#by_author',
  '.by_author',
  '#entryAuthor',
  '.entryAuthor',
  '.byline a[href*=author]',
  '#author .authorname',
  '.author .authorname',
  '#author',
  '.author',
  '.articleauthor',
  '.ArticleAuthor',
  '.byline'
];
// Looser selectors paired with a regex the matched text must satisfy.
var bylineRe = /^[\n\s]*By/i;
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
// Generic author extractor: meta tags first, then likely selectors,
// then loose byline selectors whose text must look like "By ...".
// Returns a cleaned author string or null.
var GenericAuthorExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        metaCache = _ref.metaCache;
    var author; // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    } // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    } // Last, use our looser regular-expression based selectors for
    // potential authors. (Babel-generated for...of boilerplate over
    // the [selector, regex] pairs in BYLINE_SELECTORS_RE.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator(BYLINE_SELECTORS_RE), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var _step$value = _slicedToArray(_step.value, 2),
            selector = _step$value[0],
            regex = _step$value[1];
        var node = $(selector);
        // Only trust the selector when it matches exactly one node and
        // its text satisfies the byline regex.
        if (node.length === 1) {
          var text = node.text();
          if (regex.test(text)) {
            return cleanAuthor(text);
          }
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    // No credible author found.
    return null;
  }
};
// An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
// Meta tags that likely carry the publish date, most distinct first.
// All lowercase for cheap case-insensitive matching.
var DATE_PUBLISHED_META_TAGS = [
  'article:published_time',
  'displaydate',
  'dc.date',
  'dc.date.issued',
  'rbpubdate',
  'publish_date',
  'pub_date',
  'pagedate',
  'pubdate',
  'revision_date',
  'doc_date',
  'date_created',
  'content_create_date',
  'lastmodified',
  'created',
  'date'
];
// Likely publish-date selectors, most explicit first.
var DATE_PUBLISHED_SELECTORS = [
  '.hentry .dtstamp.published',
  '.hentry .published',
  '.hentry .dtstamp.updated',
  '.hentry .updated',
  '.single .published',
  '.meta .published',
  '.meta .postDate',
  '.entry-date',
  '.byline .date',
  '.postmetadata .date',
  '.article_datetime',
  '.date-header',
  '.story-date',
  '.dateStamp',
  '#story .datetime',
  '.dateline',
  '.pubdate'
];
// Regexes for finding a parseable date inside the URL itself; the first
// capture group is the date string.
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
var DATE_PUBLISHED_URL_RES = [
  new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
  new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
  new RegExp("/(20\\d{2}/".concat(abbrevMonthsStr, "/[0-3]\\d)/"), 'i')
];
// Generic publish-date extractor: meta tags, then likely selectors,
// then date-like fragments in the URL. Each hit is normalized via
// cleanDatePublished; null when nothing parses.
var GenericDatePublishedExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    // First: meta tags (raw value — no tag stripping).
    var datePublished = extractFromMeta$$1($, DATE_PUBLISHED_META_TAGS, metaCache, false);
    if (datePublished) return cleanDatePublished(datePublished);
    // Second: likely date selectors in the document.
    datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);
    // Last: a dately string embedded in the URL.
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);
    return null;
  }
};
// Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
// No generic dek heuristic is reliable enough yet (there was only one
// selector), so this extractor always reports "no dek found".
var GenericDekExtractor = {
  extract: function extract() {
    return null;
  }
};
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
// Meta tags that likely point at the lead image, most distinct first.
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
// Last-resort DOM nodes that explicitly declare a lead image.
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];
// URL substrings that make an image more likely to be the lead image…
var POSITIVE_LEAD_IMAGE_URL_HINTS = ['upload', 'wp-content', 'large', 'photo', 'wp-image'];
var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// …and substrings that make it less likely (chrome, ads, spacers).
var NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg',
  'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner',
  'loader', 'loading', 'default', 'rating', 'share', 'facebook',
  'twitter', 'theme', 'promo', 'ads', 'wp-includes'
];
var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
// File-extension heuristics; trailing query strings are tolerated.
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;
// Build a "signature" string from a node's class and id attributes; the
// hint regexes are matched against this.
function getSig($node) {
  var classAttr = $node.attr('class') || '';
  var idAttr = $node.attr('id') || '';
  return classAttr + ' ' + idAttr;
} // Scores image urls based on a variety of heuristics.
// Score an image URL on keyword and file-extension heuristics.
function scoreImageUrl(url) {
  var trimmed = url.trim();
  var score = 0;
  // Keywords like "upload"/"photo" suggest real content imagery.
  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score += 20;
  }
  // Keywords like "spacer"/"icon" suggest page chrome.
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(trimmed)) {
    score -= 20;
  }
  // TODO: We might want to consider removing this as
  // gifs are much more common/popular than they once were.
  if (GIF_RE.test(trimmed)) {
    score -= 10;
  }
  // JPEGs are usually photos; PNGs stay neutral.
  if (JPG_RE.test(trimmed)) {
    score += 10;
  }
  return score;
} // Alt attribute usually means non-presentational image.
// Small bonus when the image has a (non-empty) alt attribute — alt text
// usually means the image is content, not decoration.
function scoreAttr($img) {
  return $img.attr('alt') ? 5 : 0;
} // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
// Score an image by its ancestors: bonus for a <figure> ancestor and for
// photo-ish class/id hints on the parent or grandparent.
function scoreByParents($img) {
  var score = 0;
  var $figParent = $img.parents('figure').first();
  if ($figParent.length === 1) {
    score += 25;
  }
  var $parent = $img.parent();
  var $gParent;
  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }
  // Fix: $gParent stays undefined when the image has no single parent,
  // and getSig(undefined) would throw ("cannot read .attr of
  // undefined"). Guard each entry before inspecting it.
  [$parent, $gParent].forEach(function ($node) {
    if ($node && PHOTO_HINTS_RE$1.test(getSig($node))) {
      score += 15;
    }
  });
  return score;
} // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
// Score an image by its immediate next sibling: a <figcaption> is a
// strong caption hint; photo-ish class/id text is a weaker one.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var siblingEl = $sibling.get(0);
  if (siblingEl && siblingEl.tagName.toLowerCase() === 'figcaption') {
    score += 25;
  }
  if (PHOTO_HINTS_RE$1.test(getSig($sibling))) {
    score += 15;
  }
  return score;
}
// Score an image by its declared width/height attributes: penalize
// skinny/short images, penalize tiny areas, reward large ones.
function scoreByDimensions($img) {
  var score = 0;
  var width = _parseFloat($img.attr('width'));
  var height = _parseFloat($img.attr('height'));
  var src = $img.attr('src');
  // Penalty for skinny images.
  if (width && width <= 50) {
    score -= 50;
  }
  // Penalty for short images.
  if (height && height <= 50) {
    score -= 50;
  }
  // NOTE(review): assumes src is a string whenever both dimensions are
  // present — callers pre-filter imgs without src, but src.includes
  // would throw here otherwise. Confirm before relying on this alone.
  if (width && height && !src.includes('sprite')) {
    var area = width * height;
    if (area < 5000) {
      // Smaller than 50 x 100.
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }
  return score;
}
// Earlier images score higher: positive before the midpoint of the
// image list, negative after it.
function scoreByPosition($imgs, index) {
  var midpoint = $imgs.length / 2;
  return midpoint - index;
}
// it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
// Generic lead-image extractor: meta tags first (og:image et al.), then
// a scored analysis of the images inside the extracted content, then
// explicit <link rel="image_src">-style nodes. Every candidate is
// validated through clean$1 before being returned; null when nothing
// survives.
var GenericLeadImageUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache,
        html = _ref.html;
    var cleanUrl;
    // In non-browser cheerio docs with no <head>, re-seed the DOM from
    // the raw html so meta-tag lookups below can work.
    if (!$.browser && $('head').length === 0) {
      $('*').first().prepend(html);
    } // Check to see if we have a matching meta tag that we can make use of.
    // Moving this higher because common practice is now to use large
    // images on things like Open Graph or Twitter cards.
    // images usually have for things like Open Graph.
    var imageUrl = extractFromMeta$$1($, LEAD_IMAGE_URL_META_TAGS, metaCache, false);
    if (imageUrl) {
      cleanUrl = clean$1(imageUrl);
      if (cleanUrl) return cleanUrl;
    } // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    // Each <img> with a src is scored by URL keywords, alt text,
    // figure-ish ancestors/siblings, declared dimensions, and position.
    var $content = $(content);
    var imgs = $('img', $content).toArray();
    var imgScores = {};
    imgs.forEach(function (img, index) {
      var $img = $(img);
      var src = $img.attr('src');
      if (!src) return;
      var score = scoreImageUrl(src);
      score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);
      imgScores[src] = score;
    });
    // Pick the highest-scoring src; only accept strictly positive scores.
    var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(function (acc, key) {
      return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
    }, [null, 0]),
        _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
        topUrl = _Reflect$ownKeys$redu2[0],
        topScore = _Reflect$ownKeys$redu2[1];
    if (topScore > 0) {
      cleanUrl = clean$1(topUrl);
      if (cleanUrl) return cleanUrl;
    } // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />.
    // (Babel-generated for...of boilerplate over the selector list;
    // each match is probed for src, href, then value attributes.)
    // eslint-disable-next-line no-restricted-syntax
    var _iteratorNormalCompletion = true;
    var _didIteratorError = false;
    var _iteratorError = undefined;
    try {
      for (var _iterator = _getIterator(LEAD_IMAGE_URL_SELECTORS), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
        var selector = _step.value;
        var $node = $(selector).first();
        var src = $node.attr('src');
        if (src) {
          cleanUrl = clean$1(src);
          if (cleanUrl) return cleanUrl;
        }
        var href = $node.attr('href');
        if (href) {
          cleanUrl = clean$1(href);
          if (cleanUrl) return cleanUrl;
        }
        var value = $node.attr('value');
        if (value) {
          cleanUrl = clean$1(value);
          if (cleanUrl) return cleanUrl;
        }
      }
    } catch (err) {
      _didIteratorError = true;
      _iteratorError = err;
    } finally {
      try {
        if (!_iteratorNormalCompletion && _iterator.return != null) {
          _iterator.return();
        }
      } finally {
        if (_didIteratorError) {
          throw _iteratorError;
        }
      }
    }
    return null;
  }
};
// Adjust a positive link score by how similar its href is to the article
// URL. Runs last and only for positive scores because the difflib
// comparison is computationally expensive.
function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }
  var similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
  // Modifier is -(250 * (diffPercent - 0.2)): URLs less than 20% different
  // earn a bonus, more different ones a penalty. Ex:
  //   10% different = +25 points
  //   20% different =   0 points
  //   30% different = -25 points
  var diffPercent = 1.0 - similarity;
  var diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}
// Score purely numeric link text. Gives a minor bonus biased toward lower
// page numbers so numbered pagination links (without the word "next") can
// still be ranked properly.
function scoreLinkText(linkText, pageNum) {
  var score = 0;
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return score;
  }
  var linkTextAsNum = _parseInt(linkText, 10);
  // Page 1 was already fetched on the first call — penalize it. Pages 2
  // through 10 earn a small, decreasing bonus.
  score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);
  // A link whose number is at or behind the current page is a very bad
  // sign; apply a large penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }
  return score;
}
// A detectable page number in the link URL is a bonus — except on
// WordPress, whose ?p=123 style denotes entirely separate posts rather
// than pages of one article.
function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0;
}
// Matches any digit; candidate next-page URLs must contain one.
var DIGIT_RE$2 = /\d/;
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
var EXTRANEOUS_LINK_HINTS$1 = ['print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated'];
var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(EXTRANEOUS_LINK_HINTS$1.join('|'), 'i');
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
var NEXT_LINK_TEXT_RE$1 = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i;
// Match any link text/classname/id that looks like it is an end link:
// things like "first", "last", "end", etc.
var CAP_LINK_TEXT_RE$1 = /(first|last|end)/i;
// Match any link text/classname/id that looks like it means the previous
// page.
var PREV_LINK_TEXT_RE$1 = /(prev|earl|old|new|<|«)/i;
// Match any phrase that looks like it could be page, or paging, or pagination
// Penalize URLs that contain extraneous hints like "print" or "comment".
function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE$1.test(href) ? -25 : 0;
}
// Build a "signature" string from a node's class and id attributes.
function makeSig($link) {
  var klass = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return "".concat(klass, " ").concat(id);
}
// Walk up the link's ancestor chain: a paging-like class/id on a parent
// earns a one-time bonus; a negative/extraneous signature that is not also
// positive earns a one-time penalty.
function scoreByParents$1($link) {
  var $parent = $link.parent();
  var positiveMatch = false;
  var negativeMatch = false;
  var score = 0;
  _Array$from(range(0, 4)).forEach(function () {
    if ($parent.length === 0) {
      return;
    }
    var parentData = makeSig($parent, ' ');
    // 'page' or 'paging' in the signature is a good sign — bonus, once.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }
    // 'comment'-like hints without 'content'-like hints — penalty, once.
    var looksBad = !negativeMatch && NEGATIVE_SCORE_RE.test(parentData) && EXTRANEOUS_LINK_HINTS_RE$1.test(parentData);
    if (looksBad && !POSITIVE_SCORE_RE.test(parentData)) {
      negativeMatch = true;
      score -= 25;
    }
    $parent = $parent.parent();
  });
  return score;
}
// Text like "previous" strongly indicates an older page; disqualify it
// with a large penalty.
function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE$1.test(linkData) ? -200 : 0;
}
// First-pass filters that disqualify a link before any scoring happens.
// Returns false for links that cannot possibly be the next page.
function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls) {
  // Skip if we've already fetched this url.
  if (previousUrls.indexOf(href) !== -1) {
    return false;
  }
  // Skip empty hrefs and links back to the article or base URL.
  if (!href || href === articleUrl || href === baseUrl) {
    return false;
  }
  // Next-page links must stay on the article's host.
  var hostname = parsedUrl.hostname;
  var _URL$parse = URL.parse(href),
      linkHost = _URL$parse.hostname;
  if (linkHost !== hostname) {
    return false;
  }
  // After removing the base URL, the remainder must contain a digit or it
  // is certainly not a next-page URL.
  var fragment = href.replace(baseUrl, '');
  if (!DIGIT_RE$2.test(fragment)) {
    return false;
  }
  // Link text containing extraneous hints ("comment", etc.) disqualifies.
  if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) {
    return false;
  }
  // Next-page link text is never long.
  if (linkText.length > 25) {
    return false;
  }
  return true;
}
// Penalize links whose URL does not start with the article's base URL.
// They could still be the next page, but the odds are lower. Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
// Big bonus for link data that reads like "next", ">>", "»", etc.
function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE$1.test(linkData) ? 50 : 0;
}
// "Cap" links are links like "first", "last", "end". A link that reads as
// both a cap link and a "next" link is ambiguous and probably bad — it was
// likely marked "next" earlier by mistake — so penalize it. A cap link
// without a "next" hint is left untouched.
function scoreCapLinks(linkData) {
  var isCap = CAP_LINK_TEXT_RE$1.test(linkData);
  if (isCap && NEXT_LINK_TEXT_RE$1.test(linkData)) {
    return -65;
  }
  return 0;
}
// Build a case-insensitive regex matching URLs that start with baseUrl.
function makeBaseRegex(baseUrl) {
  return new RegExp('^' + baseUrl, 'i');
}
// Signature used for next-page scoring: the supplied link text (or the
// node's own text) followed by its class and id.
function makeSig$1($link, linkText) {
  var text = linkText || $link.text();
  var klass = $link.attr('class') || '';
  var id = $link.attr('id') || '';
  return "".concat(text, " ").concat(klass, " ").concat(id);
}
// Score every candidate <a> on the page as a potential next-page link.
// Returns a map of href -> { score, linkText, href }, or null when no
// link passes the first-pass filters.
function scoreLinks(_ref) {
  var links = _ref.links,
      articleUrl = _ref.articleUrl,
      baseUrl = _ref.baseUrl,
      parsedUrl = _ref.parsedUrl,
      $ = _ref.$,
      _ref$previousUrls = _ref.previousUrls,
      previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
  parsedUrl = parsedUrl || URL.parse(articleUrl);
  var baseRegex = makeBaseRegex(baseUrl);
  var isWp = isWordpress($); // Loop through all links, looking for hints that they may be next-page
  // links. Things like having "page" in their textContent, className or
  // id, or being a child of a node with a page-y className or id.
  //
  // After we do that, assign each page a score, and pick the one that
  // looks most like the next page link, as long as its score is strong
  // enough to have decent confidence.

  var scoredPages = links.reduce(function (possiblePages, link) {
    // Remove any anchor data since we don't do a good job
    // standardizing URLs (it's hard), we're going to do
    // some checking with and without a trailing slash
    var attrs = getAttrs(link); // if href is undefined, return

    if (!attrs.href) return possiblePages;
    var href = removeAnchor(attrs.href);
    var $link = $(link);
    var linkText = $link.text();
    if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
      return possiblePages;
    } // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##

    // First sighting of this href creates its record; duplicates append
    // their link text to the existing record, '|'-separated.
    if (!possiblePages[href]) {
      possiblePages[href] = {
        score: 0,
        linkText: linkText,
        href: href
      };
    } else {
      possiblePages[href].linkText = "".concat(possiblePages[href].linkText, "|").concat(linkText);
    }
    var possiblePage = possiblePages[href];
    var linkData = makeSig$1($link, linkText);
    var pageNum = pageNumFromUrl(href);
    // Accumulate all scoring signals. scoreSimilarity runs last because it
    // reads and incorporates the score accumulated so far.
    var score = scoreBaseUrl(href, baseRegex);
    score += scoreNextLinkText(linkData);
    score += scoreCapLinks(linkData);
    score += scorePrevLink(linkData);
    score += scoreByParents$1($link);
    score += scoreExtraneousLinks(href);
    score += scorePageInLink(pageNum, isWp);
    score += scoreLinkText(linkText, pageNum);
    score += scoreSimilarity(score, articleUrl, href);
    possiblePage.score = score;
    return possiblePages;
  }, {});
  return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages;
}
// Extracts the next-page URL for multi-page articles.
var GenericNextPageUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        parsedUrl = _ref.parsedUrl,
        _ref$previousUrls = _ref.previousUrls,
        previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
    parsedUrl = parsedUrl || URL.parse(url);
    var articleUrl = removeAnchor(url);
    var baseUrl = articleBaseUrl(url, parsedUrl);
    var candidates = $('a[href]').toArray();
    var scoredLinks = scoreLinks({
      links: candidates,
      articleUrl: articleUrl,
      baseUrl: baseUrl,
      parsedUrl: parsedUrl,
      $: $,
      previousUrls: previousUrls
    });
    // No links survived the scoring pass.
    if (!scoredLinks) return null;
    // Pick the highest-scoring candidate from the scored map.
    var topPage = _Reflect$ownKeys(scoredLinks).reduce(function (best, key) {
      var candidate = scoredLinks[key];
      return candidate.score > best.score ? candidate : best;
    }, {
      score: -100
    });
    // Require a minimum score of 50 for confidence; otherwise fail.
    return topPage.score >= 50 ? topPage.href : null;
  }
};
// Meta tags that may carry the canonical URL.
var CANONICAL_META_SELECTORS = ['og:url'];

// Return the hostname portion of a URL string.
function parseDomain(url) {
  return URL.parse(url).hostname;
}

// Wrap a URL in the { url, domain } shape returned by the url extractor.
function result(url) {
  return {
    url: url,
    domain: parseDomain(url)
  };
}
// Resolves the article's canonical URL and domain: a rel=canonical link
// first, then an og:url meta tag, falling back to the fetched URL.
var GenericUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        url = _ref.url,
        metaCache = _ref.metaCache;
    var canonicalHref = $('link[rel=canonical]').attr('href');
    if (canonicalHref) {
      return result(canonicalHref);
    }
    var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache);
    if (metaUrl) {
      return result(metaUrl);
    }
    return result(url);
  }
};
// Meta tags that may carry the article description/excerpt.
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];

// Collapse whitespace in `content` and ellipsize it to `maxLength`
// characters (default 200), ending with an &hellip; entity.
function clean$2(content, $) {
  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
  var collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize(collapsed, maxLength, {
    ellipse: '&hellip;'
  });
}
// Builds a short excerpt: prefer og:/twitter: description meta tags, then
// fall back to the leading portion of the extracted content.
var GenericExcerptExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$,
        content = _ref.content,
        metaCache = _ref.metaCache;
    var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean$2(stripTags(excerpt, $));
    }
    // Fall back to excerpting from the extracted content; only the first
    // chunk is needed since the excerpt is truncated anyway.
    var maxLength = 200;
    var leading = content.slice(0, maxLength * 5);
    return clean$2($(leading).text(), $, maxLength);
  }
};
// Primary word counter: load the html and count whitespace-separated
// tokens in the first <div>.
var getWordCount = function getWordCount(content) {
  var $ = cheerio.load(content);
  var text = normalizeSpaces($('div').first().text());
  return text.split(/\s/).length;
};
// Fallback word counter: strip tags with a regex, collapse whitespace,
// and count space-separated tokens.
var getWordCountAlt = function getWordCountAlt(content) {
  var stripped = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
  return stripped.split(' ').length;
};
// Word-count extractor. A count of 1 means the primary strategy found no
// usable text, so the regex-based fallback is tried.
var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;
    var count = getWordCount(content);
    return count === 1 ? getWordCountAlt(content) : count;
  }
};
// The default extractor, applied when no domain-specific extractor
// matches. Each field delegates to its generic sub-extractor.
var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection.getDirection(title);
  },
  // Run every field extractor and assemble the full result. Ordering
  // matters: title feeds content; content feeds lead_image_url, dek,
  // excerpt and word_count; title feeds direction.
  extract: function extract(options) {
    var html = options.html,
        $ = options.$;
    // Lazily parse the html into cheerio when the caller didn't supply a
    // parsed document.
    if (html && !$) {
      var loaded = cheerio.load(html);
      options.$ = loaded;
    }
    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    var content = this.content(_objectSpread({}, options, {
      title: title
    }));
    var lead_image_url = this.lead_image_url(_objectSpread({}, options, {
      content: content
    }));
    var dek = this.dek(_objectSpread({}, options, {
      content: content
    }));
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(_objectSpread({}, options, {
      content: content
    }));
    var word_count = this.word_count(_objectSpread({}, options, {
      content: content
    }));
    var direction = this.direction({
      title: title
    });
    var _this$url_and_domain = this.url_and_domain(options),
        url = _this$url_and_domain.url,
        domain = _this$url_and_domain.domain;
    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    };
  }
};
// Selectors that identify a publishing platform, mapped to the extractor
// to use when the selector matches the document.
var Detectors = {
  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
  'meta[name="generator"][value="blogger"]': BloggerExtractor
};

// Return the platform extractor for the first matching detector selector,
// or undefined when none match.
function detectByHtml($) {
  var matched = _Reflect$ownKeys(Detectors).find(function (selector) {
    return $(selector).length > 0;
  });
  return Detectors[matched];
}
// Resolve the extractor for a URL: exact hostname first, then base domain,
// across API-registered and bundled extractors, then HTML-based platform
// detection, and finally the generic extractor.
function getExtractor(url, parsedUrl, $) {
  parsedUrl = parsedUrl || URL.parse(url);
  var hostname = parsedUrl.hostname;
  var baseDomain = hostname.split('.').slice(-2).join('.');
  return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
}
// Remove any elements matching the extractor's `clean` selectors from the
// extracted content. No-op when the extractor defines no `clean` list.
// (Fix: stray commit-log text embedded in this function body in the dist
// file has been removed.)
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;
  if (!clean) return $content;
  $(clean.join(','), $content).remove();
  return $content;
} // Transform matching elements
// Apply the extractor's `transforms` map to the extracted content. A
// string value converts matching nodes to that tag; a function value is
// called per node and, when it returns a string, the node is converted to
// that tag. No-op when the extractor defines no transforms.
// (Fix: stray commit-log text embedded in this function body in the dist
// file has been removed.)
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;
  _Reflect$ownKeys(transforms).forEach(function (key) {
    var $matches = $(key, $content);
    var value = transforms[key];
    // If value is a string, convert directly
    if (typeof value === 'string') {
      $matches.each(function (index, node) {
        convertNodeTo$$1($(node), $, transforms[key]);
      });
    } else if (typeof value === 'function') {
      // If value is function, apply function to node
      $matches.each(function (index, node) {
        var result = value($(node), $);
        // If function returns a string, convert node to that value
        if (typeof result === 'string') {
          convertNodeTo$$1($(node), $, result);
        }
      });
    }
  });
  return $content;
}
// Find the first selector in `selectors` that matches the document. A
// selector may be a plain string, or an array: when extractHtml is set an
// array means every selector in it must match (multi-match selection);
// otherwise it is a [selector, attr] pair whose attribute must be present
// and non-empty. Unless allowMultiple is set, string and attribute
// selectors must match exactly one node.
// (Fix: stray commit-log text embedded in this function body in the dist
// file has been removed.)
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
  return selectors.find(function (selector) {
    if (_Array$isArray(selector)) {
      if (extractHtml) {
        // Every selector in the array must match for this candidate.
        return selector.reduce(function (acc, s) {
          return acc && $(s).length > 0;
        }, true);
      }
      var _selector = _slicedToArray(selector, 2),
          s = _selector[0],
          attr = _selector[1];
      return (allowMultiple || !allowMultiple && $(s).length === 1) && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
    }
    return (allowMultiple || !allowMultiple && $(selector).length === 1) && $(selector).text().trim() !== '';
  });
}
// Select a value for `type` (title, content, author, ...) from the
// document using a custom extractor's extraction options. Depending on the
// options this returns an html string (or array of html strings),
// attribute value(s), trimmed text, a hardcoded string, or null when
// nothing matches.
// (Fix: stray commit-log text embedded in this function body in the dist
// file has been removed.)
function select(opts) {
  var $ = opts.$,
      type = opts.type,
      extractionOpts = opts.extractionOpts,
      _opts$extractHtml = opts.extractHtml,
      extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml;
  // Skip if there's no extraction for this type
  if (!extractionOpts) return null;
  // If a string is hardcoded for a type (e.g., Wikipedia contributors),
  // return the string
  if (typeof extractionOpts === 'string') return extractionOpts;
  var selectors = extractionOpts.selectors,
      _extractionOpts$defau = extractionOpts.defaultCleaner,
      defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
      allowMultiple = extractionOpts.allowMultiple;
  // lead_image_url selectors are always treated as multi-capable.
  var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
  var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
  if (!matchingSelector) return null;

  // Absolutize links, then run the extractor's clean/transform options.
  function transformAndClean($node) {
    makeLinksAbsolute$$1($node, $, opts.url || '');
    cleanBySelectors($node, $, extractionOpts);
    transformElements($node, $, extractionOpts);
    return $node;
  }

  // Build the html result when the selector type requests html output.
  function selectHtml() {
    var $content;
    // If matching selector is an array, we're considering this a
    // multi-match selection, which allows the parser to choose several
    // selectors to include in the result. Note that all selectors in the
    // array must match in order for this selector to trigger
    if (_Array$isArray(matchingSelector)) {
      $content = $(matchingSelector.join(','));
      var $wrapper = $('<div></div>');
      $content.each(function (_, element) {
        $wrapper.append(element);
      });
      $content = $wrapper;
    } else {
      $content = $(matchingSelector);
    }
    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
    $content = $content.parent();
    $content = transformAndClean($content);
    if (Cleaners[type]) {
      Cleaners[type]($content, _objectSpread({}, opts, {
        defaultCleaner: defaultCleaner
      }));
    }
    if (allowMultiple) {
      return $content.children().toArray().map(function (el) {
        return $.html($(el));
      });
    }
    return $.html($content);
  }

  if (extractHtml) {
    return selectHtml(matchingSelector);
  }
  var $match;
  var result;
  // if selector is an array (e.g., ['img', 'src']), extract the attr
  if (_Array$isArray(matchingSelector)) {
    var _matchingSelector = _slicedToArray(matchingSelector, 3),
        selector = _matchingSelector[0],
        attr = _matchingSelector[1],
        transform = _matchingSelector[2];
    $match = $(selector);
    $match = transformAndClean($match);
    result = $match.map(function (_, el) {
      var item = $(el).attr(attr).trim();
      return transform ? transform(item) : item;
    });
  } else {
    $match = $(matchingSelector);
    $match = transformAndClean($match);
    result = $match.map(function (_, el) {
      return $(el).text().trim();
    });
  }
  result = _Array$isArray(result.toArray()) && allowMultiple ? result.toArray() : result[0];
  // Allow custom extractor to skip default cleaner
  // for this type; defaults to true
  if (defaultCleaner && Cleaners[type]) {
    return Cleaners[type](result, _objectSpread({}, opts, extractionOpts));
  }
  return result;
}
// Runs `select` once for each custom (extended) field type declared on an
// extractor's `extend` map, returning { typeName: extractedValue, ... }.
function selectExtendedTypes(extend, opts) {
  var results = {};

  _Reflect$ownKeys(extend).forEach(function (type) {
    // Never overwrite a type that has already been extracted.
    if (results[type]) return;
    results[type] = select(_objectSpread({}, opts, {
      type: type,
      extractionOpts: extend[type]
    }));
  });

  return results;
}
// Extracts one field (`opts.type`) using the custom extractor's selector
// config. If the custom selection yields nothing and `fallback` is enabled
// (the default), falls back to the generic extractor for that field;
// otherwise returns null.
function extractResult(opts) {
  var type = opts.type,
      extractor = opts.extractor,
      _opts$fallback = opts.fallback,
      fallback = _opts$fallback === void 0 ? true : _opts$fallback;
  var result = select(_objectSpread({}, opts, {
    extractionOpts: extractor[type]
  })); // If custom parser succeeds, return the result

  if (result) {
    return result;
  } // If nothing matches the selector, and fallback is enabled,
  // run the Generic extraction

  if (fallback) return GenericExtractor[type](opts);
  return null;
}
// Runs a (custom) extractor over a fetched document. The wildcard generic
// extractor handles everything itself; otherwise each field is extracted in
// turn, threading earlier results (title, content, excerpt) into the later
// extractions that depend on them.
var RootExtractor = {
  extract: function extract() {
    var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var contentOnly = opts.contentOnly;
    var extractedTitle = opts.extractedTitle; // This is the generic extractor. Run its extract method

    if (extractor.domain === '*') return extractor.extract(opts);
    opts = _objectSpread({}, opts, {
      extractor: extractor
    });

    // Extract a single field, merging any already-extracted values the
    // field's extraction depends on into the shared options.
    function field(type, extras) {
      return extractResult(_objectSpread({}, opts, {
        type: type
      }, extras || {}));
    }

    if (contentOnly) {
      return {
        content: field('content', {
          extractHtml: true,
          title: extractedTitle
        })
      };
    }

    var title = field('title');
    var date_published = field('date_published');
    var author = field('author');
    var next_page_url = field('next_page_url');
    var content = field('content', {
      extractHtml: true,
      title: title
    });
    var lead_image_url = field('lead_image_url', {
      content: content
    });
    var excerpt = field('excerpt', {
      content: content
    });
    var dek = field('dek', {
      content: content,
      excerpt: excerpt
    });
    var word_count = field('word_count', {
      content: content
    });
    var direction = field('direction', {
      title: title
    });

    var urlAndDomain = field('url_and_domain') || {
      url: null,
      domain: null
    };
    var url = urlAndDomain.url;
    var domain = urlAndDomain.domain;

    var extendedResults = extractor.extend ? selectExtendedTypes(extractor.extend, opts) : {};

    return _objectSpread({
      title: title,
      content: content,
      author: author,
      date_published: date_published,
      lead_image_url: lead_image_url,
      dek: dek,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    }, extendedResults);
  }
};
// Public entry point for the multi-page collector. A hoisted function
// declaration (so it is callable before `_collectAllPages` is assigned
// below); it simply forwards all arguments to the async implementation.
function collectAllPages(_x) {
  return _collectAllPages.apply(this, arguments);
}
// Transpiled (babel/regenerator) async implementation behind
// `collectAllPages`. Starting from the already-fetched first page in
// `result`, it repeatedly fetches `next_page_url`, re-runs the extractor on
// each new page, concatenates content with an <hr>/"Page N" separator, and
// finally recomputes the word count over the combined content.
function _collectAllPages() {
  _collectAllPages = _asyncToGenerator(
  /*#__PURE__*/
  _regeneratorRuntime.mark(function _callee(_ref) {
    var next_page_url, html, $, metaCache, result, Extractor, title, url, pages, previousUrls, extractorOpts, nextPageResult, word_count;
    return _regeneratorRuntime.wrap(function _callee$(_context) {
      while (1) {
        switch (_context.prev = _context.next) {
          case 0:
            next_page_url = _ref.next_page_url, html = _ref.html, $ = _ref.$, metaCache = _ref.metaCache, result = _ref.result, Extractor = _ref.Extractor, title = _ref.title, url = _ref.url;
            // At this point, we've fetched just the first page
            pages = 1;
            previousUrls = [removeAnchor(url)]; // If we've gone over 26 pages, something has
            // likely gone wrong.

          // case 3 is the loop head of the regenerator-compiled while loop.
          case 3:
            if (!(next_page_url && pages < 26)) {
              _context.next = 16;
              break;
            }

            pages += 1; // eslint-disable-next-line no-await-in-loop

            _context.next = 7;
            return Resource.create(next_page_url);

          case 7:
            $ = _context.sent;
            html = $.html();
            extractorOpts = {
              url: next_page_url,
              html: html,
              $: $,
              metaCache: metaCache,
              extractedTitle: title,
              previousUrls: previousUrls
            };
            nextPageResult = RootExtractor.extract(Extractor, extractorOpts);
            previousUrls.push(next_page_url);
            // Append the newly extracted page's content with a page marker.
            result = _objectSpread({}, result, {
              content: "".concat(result.content, "<hr><h4>Page ").concat(pages, "</h4>").concat(nextPageResult.content)
            }); // eslint-disable-next-line prefer-destructuring

            next_page_url = nextPageResult.next_page_url;
            _context.next = 3;
            break;

          // Loop exit: recompute word count over the full combined content.
          case 16:
            word_count = GenericExtractor.word_count({
              content: "<div>".concat(result.content, "</div>")
            });
            return _context.abrupt("return", _objectSpread({}, result, {
              total_pages: pages,
              rendered_pages: pages,
              word_count: word_count
            }));

          case 18:
          case "end":
            return _context.stop();
        }
      }
    }, _callee, this);
  }));
  return _collectAllPages.apply(this, arguments);
}
// Public Mercury parser API. `parse(url, { html, ...opts })` resolves the
// resource (fetching it unless `html` is supplied), chooses a site-specific
// extractor when one matches, runs extraction, optionally follows
// pagination, and converts content to markdown/text when requested.
// `fetchResource` and `addExtractor` are convenience wrappers used by the
// custom-extractor generator and CLI.
var Parser = {
  parse: function () {
    var _parse = _asyncToGenerator(
    /*#__PURE__*/
    _regeneratorRuntime.mark(function _callee(url) {
      var _ref,
          html,
          opts,
          _opts$fetchAllPages,
          fetchAllPages,
          _opts$fallback,
          fallback,
          _opts$contentType,
          contentType,
          _opts$headers,
          headers,
          extend,
          customExtractor,
          parsedUrl,
          $,
          Extractor,
          metaCache,
          extendedTypes,
          result,
          _result,
          title,
          next_page_url,
          turndownService,
          _args = arguments;

      return _regeneratorRuntime.wrap(function _callee$(_context) {
        while (1) {
          switch (_context.prev = _context.next) {
            case 0:
              _ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
              _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
              // set url to window.location.href and load the html
              // from the current page

              if (!url && cheerio.browser) {
                url = window.location.href; // eslint-disable-line no-undef

                html = html || cheerio.html();
              }

              parsedUrl = URL.parse(url);

              if (validateUrl(parsedUrl)) {
                _context.next = 6;
                break;
              }

              return _context.abrupt("return", {
                error: true,
                message: 'The url parameter passed does not look like a valid URL. Please check your URL and try again.'
              });

            case 6:
              _context.next = 8;
              return Resource.create(url, html, parsedUrl, headers);

            case 8:
              $ = _context.sent;

              // Resource.create returns an error object with `failed: true`
              // when the resource could not be fetched/parsed; pass it through.
              if (!$.failed) {
                _context.next = 11;
                break;
              }

              return _context.abrupt("return", $);

            case 11:
              // Add custom extractor via cli.
              if (customExtractor) {
                addExtractor(customExtractor);
              }

              Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
              // if html still has not been set (i.e., url passed to Parser.parse),
              // set html from the response of Resource.create

              if (!html) {
                html = $.html();
              } // Cached value of every meta name in our document.
              // Used when extracting title/author/date_published/dek

              metaCache = $('meta').map(function (_, node) {
                return $(node).attr('name');
              }).toArray();
              extendedTypes = {};

              if (extend) {
                extendedTypes = selectExtendedTypes(extend, {
                  $: $,
                  url: url,
                  html: html
                });
              }

              result = RootExtractor.extract(Extractor, {
                url: url,
                html: html,
                $: $,
                metaCache: metaCache,
                parsedUrl: parsedUrl,
                fallback: fallback,
                contentType: contentType
              });
              _result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found

              if (!(fetchAllPages && next_page_url)) {
                _context.next = 25;
                break;
              }

              _context.next = 22;
              return collectAllPages({
                Extractor: Extractor,
                next_page_url: next_page_url,
                html: html,
                $: $,
                metaCache: metaCache,
                result: result,
                title: title,
                url: url
              });

            case 22:
              result = _context.sent;
              _context.next = 26;
              break;

            case 25:
              result = _objectSpread({}, result, {
                total_pages: 1,
                rendered_pages: 1
              });

            case 26:
              // Convert extracted HTML when a non-HTML contentType was asked for.
              if (contentType === 'markdown') {
                turndownService = new TurndownService();
                result.content = turndownService.turndown(result.content);
              } else if (contentType === 'text') {
                result.content = $.text($(result.content));
              }

              return _context.abrupt("return", _objectSpread({}, result, extendedTypes));

            case 28:
            case "end":
              return _context.stop();
          }
        }
      }, _callee, this);
    }));

    function parse(_x) {
      return _parse.apply(this, arguments);
    }

    return parse;
  }(),
  browser: !!cheerio.browser,
  // A convenience method for getting a resource
  // to work with, e.g., for custom extractor generator
  fetchResource: function fetchResource(url) {
    return Resource.create(url);
  },
  addExtractor: function addExtractor$$1(extractor) {
    return addExtractor(extractor);
  }
};
module.exports = Parser;