diff --git a/.babelrc b/.babelrc
index 1341d502..fe77f11e 100644
--- a/.babelrc
+++ b/.babelrc
@@ -1,29 +1,33 @@
{
- "presets": ["es2015"],
+ "presets": ["@babel/preset-env"],
"plugins": [
- "transform-es2015-destructuring",
- "transform-object-rest-spread",
- "transform-async-to-generator",
- ["module-resolver", {
- "root": ["./src"],
- "alias": {
- "./utils": "utils",
- "./cleaners": "cleaners",
- "./resource": "resource",
- "./extractors": "extractors",
- "./test-helpers.js": "test-helpers",
- "./mercury.js": "mercury"
+ "@babel/plugin-transform-destructuring",
+ "@babel/plugin-proposal-object-rest-spread",
+ "@babel/plugin-transform-async-to-generator",
+ [
+ "module-resolver",
+ {
+ "root": ["./src"],
+ "alias": {
+ "./utils": "utils",
+ "./cleaners": "cleaners",
+ "./resource": "resource",
+ "./extractors": "extractors",
+ "./test-helpers.js": "test-helpers",
+ "./mercury.js": "mercury"
+ }
}
- }]
+ ]
],
"env": {
"development": {
"plugins": [
- "transform-export-extensions",
+ ["@babel/plugin-proposal-export-default-from"],
+ ["@babel/plugin-proposal-export-namespace-from"],
[
- "transform-runtime",
+ "@babel/plugin-transform-runtime",
{
- "polyfill": true,
+ "corejs": 2,
"regenerator": true
}
]
diff --git a/.circleci/config.yml b/.circleci/config.yml
index e2f86e51..bc131076 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -21,7 +21,6 @@ jobs:
- checkout
- run: "yarn install"
# For some reason phantomjs-prebuild is failing w/yarn, but npm installing works
- - run: "npm install phantomjs-prebuilt"
- run: "yarn add karma-cli --dev"
- run: "yarn test:web --maxWorkers=4"
- run: "yarn build:web --maxWorkers=4"
diff --git a/.eslintrc b/.eslintrc
index 4d8cf104..d39f2f02 100644
--- a/.eslintrc
+++ b/.eslintrc
@@ -1,17 +1,15 @@
{
"parser": "babel-eslint",
"extends": ["airbnb", "prettier"],
- "plugins": [
- "babel"
- ],
+ "plugins": ["babel"],
"globals": {
- "describe",
- "it",
- "fit",
- "jasmine",
- "beforeEach",
- "beforeAll",
- "afterAll",
+ "describe": true,
+ "it": true,
+ "fit": true,
+ "jasmine": true,
+ "beforeEach": true,
+ "beforeAll": true,
+ "afterAll": true
},
"rules": {
"no-param-reassign": 0,
@@ -23,18 +21,14 @@
"no-confusing-arrow": 0,
"camelcase": 0,
"no-multiple-empty-lines": [
- "error", {"max": 1, "maxEOF": 0, "maxBOF": 0}
+ "error",
+ { "max": 1, "maxEOF": 0, "maxBOF": 0 }
],
- },
- "options": {
- "no-multiple-empty-lines": {
- "max": 1,
- "maxEOF": 0,
- },
+    "import/no-unresolved": 0
},
"settings": {
"import/resolver": {
"babel-module": {}
}
- },
+ }
}
diff --git a/dist/mercury.js b/dist/mercury.js
index 58aeedaa..4554b056 100644
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -1,33 +1,65 @@
'use strict';
-function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }
+function _interopDefault(ex) {
+ return ex && typeof ex === 'object' && 'default' in ex ? ex['default'] : ex;
+}
-var _regeneratorRuntime = _interopDefault(require('babel-runtime/regenerator'));
-var _extends = _interopDefault(require('babel-runtime/helpers/extends'));
-var _asyncToGenerator = _interopDefault(require('babel-runtime/helpers/asyncToGenerator'));
+var _regeneratorRuntime = _interopDefault(
+ require('@babel/runtime-corejs2/regenerator')
+);
+var _objectSpread = _interopDefault(
+ require('@babel/runtime-corejs2/helpers/objectSpread')
+);
+var _asyncToGenerator = _interopDefault(
+ require('@babel/runtime-corejs2/helpers/asyncToGenerator')
+);
var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
var iconv = _interopDefault(require('iconv-lite'));
-var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
-var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
+var _parseInt = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/parse-int')
+);
+var _slicedToArray = _interopDefault(
+ require('@babel/runtime-corejs2/helpers/slicedToArray')
+);
+var _Promise = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/promise')
+);
var request = _interopDefault(require('request'));
-var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
-var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
-var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
-var _typeof = _interopDefault(require('babel-runtime/helpers/typeof'));
-var _getIterator = _interopDefault(require('babel-runtime/core-js/get-iterator'));
-var _Object$keys = _interopDefault(require('babel-runtime/core-js/object/keys'));
+var _Reflect$ownKeys = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/reflect/own-keys')
+);
+var _toConsumableArray = _interopDefault(
+ require('@babel/runtime-corejs2/helpers/toConsumableArray')
+);
+var _defineProperty = _interopDefault(
+ require('@babel/runtime-corejs2/helpers/defineProperty')
+);
+var _parseFloat = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/parse-float')
+);
+var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
+var _getIterator = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/get-iterator')
+);
+var _Object$keys = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/object/keys')
+);
var stringDirection = _interopDefault(require('string-direction'));
var validUrl = _interopDefault(require('valid-url'));
var moment = _interopDefault(require('moment-timezone'));
var parseFormat = _interopDefault(require('moment-parseformat'));
var wuzzy = _interopDefault(require('wuzzy'));
var difflib = _interopDefault(require('difflib'));
-var _Array$from = _interopDefault(require('babel-runtime/core-js/array/from'));
+var _Array$from = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/array/from')
+);
var ellipsize = _interopDefault(require('ellipsize'));
+var _Array$isArray = _interopDefault(
+ require('@babel/runtime-corejs2/core-js/array/is-array')
+);
var NORMALIZE_RE = /\s{2,}/g;
-
function normalizeSpaces(text) {
return text.replace(NORMALIZE_RE, ' ').trim();
}
@@ -38,9 +70,10 @@ function normalizeSpaces(text) {
// string to be cleaned.
// Only used for date_published currently.
function extractFromUrl(url, regexList) {
- var matchRe = regexList.find(function (re) {
+ var matchRe = regexList.find(function(re) {
return re.test(url);
});
+
if (matchRe) {
return matchRe.exec(url)[1];
}
@@ -64,13 +97,13 @@ function extractFromUrl(url, regexList) {
// Does not match:
// pg=102
// page:2
-var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
-
+var PAGE_IN_HREF_RE = new RegExp(
+ '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})',
+ 'i'
+);
var HAS_ALPHA_RE = /[a-z]/i;
-
var IS_ALPHA_RE = /^[a-z]+$/i;
var IS_DIGIT_RE = /^[0-9]+$/i;
-
var ENCODING_RE = /charset=([\w-]+)\b/;
var DEFAULT_ENCODING = 'utf-8';
@@ -78,10 +111,9 @@ function pageNumFromUrl(url) {
var matches = url.match(PAGE_IN_HREF_RE);
if (!matches) return null;
- var pageNum = parseInt(matches[6], 10);
-
- // Return pageNum < 100, otherwise
+ var pageNum = _parseInt(matches[6], 10); // Return pageNum < 100, otherwise
// return null
+
return pageNum < 100 ? pageNum : null;
}
@@ -90,78 +122,73 @@ function removeAnchor(url) {
}
function isGoodSegment(segment, index, firstSegmentHasLetters) {
- var goodSegment = true;
-
- // If this is purely a number, and it's the first or second
+ var goodSegment = true; // If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
+
if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
goodSegment = true;
- }
-
- // If this is the first url_segment and it's just "index",
+ } // If this is the first url_segment and it's just "index",
// remove it
+
if (index === 0 && segment.toLowerCase() === 'index') {
goodSegment = false;
- }
-
- // If our first or second url_segment is smaller than 3 characters,
+ } // If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
+
if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
goodSegment = false;
}
return goodSegment;
-}
-
-// Take a URL, and return the article base of said URL. That is, no
+} // Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
+
function articleBaseUrl(url, parsed) {
var parsedUrl = parsed || URL.parse(url);
var protocol = parsedUrl.protocol,
- host = parsedUrl.host,
- path = parsedUrl.path;
-
-
+ host = parsedUrl.host,
+ path = parsedUrl.path;
var firstSegmentHasLetters = false;
- var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
- var segment = rawSegment;
-
- // Split off and save anything that looks like a file type.
- if (segment.includes('.')) {
- var _segment$split = segment.split('.'),
+ var cleanedSegments = path
+ .split('/')
+ .reverse()
+ .reduce(function(acc, rawSegment, index) {
+ var segment = rawSegment; // Split off and save anything that looks like a file type.
+
+ if (segment.includes('.')) {
+ var _segment$split = segment.split('.'),
_segment$split2 = _slicedToArray(_segment$split, 2),
possibleSegment = _segment$split2[0],
fileExt = _segment$split2[1];
- if (IS_ALPHA_RE.test(fileExt)) {
- segment = possibleSegment;
+ if (IS_ALPHA_RE.test(fileExt)) {
+ segment = possibleSegment;
+ }
+ } // If our first or second segment has anything looking like a page
+ // number, remove it.
+
+ if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
+ segment = segment.replace(PAGE_IN_HREF_RE, '');
+ } // If we're on the first segment, check to see if we have any
+ // characters in it. The first segment is actually the last bit of
+ // the URL, and this will be helpful to determine if we're on a URL
+ // segment that looks like "/2/" for example.
+
+ if (index === 0) {
+ firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
+ } // If it's not marked for deletion, push it to cleaned_segments.
+
+ if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
+ acc.push(segment);
}
- }
-
- // If our first or second segment has anything looking like a page
- // number, remove it.
- if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
- segment = segment.replace(PAGE_IN_HREF_RE, '');
- }
-
- // If we're on the first segment, check to see if we have any
- // characters in it. The first segment is actually the last bit of
- // the URL, and this will be helpful to determine if we're on a URL
- // segment that looks like "/2/" for example.
- if (index === 0) {
- firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
- }
-
- // If it's not marked for deletion, push it to cleaned_segments.
- if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
- acc.push(segment);
- }
-
- return acc;
- }, []);
- return protocol + '//' + host + cleanedSegments.reverse().join('/');
+ return acc;
+ }, []);
+ return ''
+ .concat(protocol, '//')
+ .concat(host)
+ .concat(cleanedSegments.reverse().join('/'));
}
// Given a string, return True if it appears to have an ending sentence
@@ -172,58 +199,75 @@ function hasSentenceEnd(text) {
}
function excerptContent(content) {
- var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
-
- return content.trim().split(/\s+/).slice(0, words).join(' ');
+ var words =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
+ return content
+ .trim()
+ .split(/\s+/)
+ .slice(0, words)
+ .join(' ');
}
-// check a string for encoding; this is
// used in our fetchResource function to
// ensure correctly encoded responses
+
function getEncoding(str) {
var encoding = DEFAULT_ENCODING;
+
if (ENCODING_RE.test(str)) {
var testEncode = ENCODING_RE.exec(str)[1];
+
if (iconv.encodingExists(testEncode)) {
encoding = testEncode;
}
}
+
return encoding;
}
-var _marked = [range].map(_regeneratorRuntime.mark);
+var _marked =
+ /*#__PURE__*/
+ _regeneratorRuntime.mark(range);
function range() {
- var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
- var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
- return _regeneratorRuntime.wrap(function range$(_context) {
- while (1) {
- switch (_context.prev = _context.next) {
- case 0:
- if (!(start <= end)) {
- _context.next = 5;
- break;
- }
+ var start,
+ end,
+ _args = arguments;
+ return _regeneratorRuntime.wrap(
+ function range$(_context) {
+ while (1) {
+ switch ((_context.prev = _context.next)) {
+ case 0:
+ start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1;
+ end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1;
- _context.next = 3;
- return start += 1;
+ case 2:
+ if (!(start <= end)) {
+ _context.next = 7;
+ break;
+ }
- case 3:
- _context.next = 0;
- break;
+ _context.next = 5;
+ return (start += 1);
+
+ case 5:
+ _context.next = 2;
+ break;
- case 5:
- case "end":
- return _context.stop();
+ case 7:
+ case 'end':
+ return _context.stop();
+ }
}
- }
- }, _marked[0], this);
+ },
+ _marked,
+ this
+ );
}
// extremely simple url validation as a first step
function validateUrl(_ref) {
var hostname = _ref.hostname;
-
// If this isn't a valid url, return an error message
return !!hostname;
}
@@ -231,50 +275,50 @@ function validateUrl(_ref) {
var Errors = {
badUrl: {
error: true,
- messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
- }
+ messages:
+ 'The url parameter passed does not look like a valid URL. Please check your data and try again.',
+ },
};
-// Browser does not like us setting user agent
-var REQUEST_HEADERS = cheerio.browser ? {} : {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
-};
+var REQUEST_HEADERS = cheerio.browser
+ ? {}
+ : {
+ 'User-Agent':
+ 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
+ }; // The number of milliseconds to attempt to fetch a resource before timing out.
-// The number of milliseconds to attempt to fetch a resource before timing out.
-var FETCH_TIMEOUT = 10000;
+var FETCH_TIMEOUT = 10000; // Content types that we do not extract content from
-// Content types that we do not extract content from
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];
-
-var BAD_CONTENT_TYPES_RE = new RegExp('^(' + BAD_CONTENT_TYPES.join('|') + ')$', 'i');
-
-// Use this setting as the maximum size an article can be
+var BAD_CONTENT_TYPES_RE = new RegExp(
+ '^('.concat(BAD_CONTENT_TYPES.join('|'), ')$'),
+ 'i'
+); // Use this setting as the maximum size an article can be
// for us to attempt parsing. Defaults to 5 MB.
-var MAX_CONTENT_LENGTH = 5242880;
-// Turn the global proxy on or off
-// Proxying is not currently enabled in Python source
-// so not implementing logic in port.
+var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off
function get(options) {
- return new _Promise(function (resolve, reject) {
- request(options, function (err, response, body) {
+ return new _Promise(function(resolve, reject) {
+ request(options, function(err, response, body) {
if (err) {
reject(err);
} else {
- resolve({ body: body, response: response });
+ resolve({
+ body: body,
+ response: response,
+ });
}
});
});
-}
-
-// Evaluate a response to ensure it's something we should be keeping.
+} // Evaluate a response to ensure it's something we should be keeping.
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
function validateResponse(response) {
- var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
+ var parseNon2xx =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
// Check if we got a valid status code
// This isn't great, but I'm requiring a statusMessage to be set
@@ -282,114 +326,128 @@ function validateResponse(response) {
// statusMessage only not set in nock response, in which case
// I check statusCode, which is currently only 200 for OK responses
// in tests
- if (response.statusMessage && response.statusMessage !== 'OK' || response.statusCode !== 200) {
+ if (
+ (response.statusMessage && response.statusMessage !== 'OK') ||
+ response.statusCode !== 200
+ ) {
if (!response.statusCode) {
- throw new Error('Unable to fetch content. Original exception was ' + response.error);
+ throw new Error(
+ 'Unable to fetch content. Original exception was '.concat(
+ response.error
+ )
+ );
} else if (!parseNon2xx) {
- throw new Error('Resource returned a response status code of ' + response.statusCode + ' and resource was instructed to reject non-2xx level status codes.');
+ throw new Error(
+ 'Resource returned a response status code of '.concat(
+ response.statusCode,
+ ' and resource was instructed to reject non-2xx level status codes.'
+ )
+ );
}
}
var _response$headers = response.headers,
- contentType = _response$headers['content-type'],
- contentLength = _response$headers['content-length'];
-
- // Check that the content is not in BAD_CONTENT_TYPES
+ contentType = _response$headers['content-type'],
+ contentLength = _response$headers['content-length']; // Check that the content is not in BAD_CONTENT_TYPES
if (BAD_CONTENT_TYPES_RE.test(contentType)) {
- throw new Error('Content-type for this resource was ' + contentType + ' and is not allowed.');
- }
+ throw new Error(
+ 'Content-type for this resource was '.concat(
+ contentType,
+ ' and is not allowed.'
+ )
+ );
+ } // Check that the content length is below maximum
- // Check that the content length is below maximum
if (contentLength > MAX_CONTENT_LENGTH) {
- throw new Error('Content for this resource was too large. Maximum content length is ' + MAX_CONTENT_LENGTH + '.');
+ throw new Error(
+ 'Content for this resource was too large. Maximum content length is '.concat(
+ MAX_CONTENT_LENGTH,
+ '.'
+ )
+ );
}
return true;
-}
-
-// Grabs the last two pieces of the URL and joins them back together
-// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'
-
-
-// Set our response attribute to the result of fetching our URL.
+} // Grabs the last two pieces of the URL and joins them back together
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
-var fetchResource$1 = (function () {
- var _ref2 = _asyncToGenerator(_regeneratorRuntime.mark(function _callee(url, parsedUrl) {
- var options, _ref3, response, body;
-
- return _regeneratorRuntime.wrap(function _callee$(_context) {
- while (1) {
- switch (_context.prev = _context.next) {
- case 0:
- parsedUrl = parsedUrl || URL.parse(encodeURI(url));
-
- options = {
- url: parsedUrl.href,
- headers: _extends({}, REQUEST_HEADERS),
- timeout: FETCH_TIMEOUT,
- // Accept cookies
- jar: true,
- // Set to null so the response returns as binary and body as buffer
- // https://github.com/request/request#requestoptions-callback
- encoding: null,
- // Accept and decode gzip
- gzip: true,
- // Follow any redirect
- followAllRedirects: true
- };
- _context.next = 4;
- return get(options);
-
- case 4:
- _ref3 = _context.sent;
- response = _ref3.response;
- body = _ref3.body;
- _context.prev = 7;
-
- validateResponse(response);
- return _context.abrupt('return', {
- body: body,
- response: response
- });
-
- case 12:
- _context.prev = 12;
- _context.t0 = _context['catch'](7);
- return _context.abrupt('return', Errors.badUrl);
-
- case 15:
- case 'end':
- return _context.stop();
- }
- }
- }, _callee, this, [[7, 12]]);
- }));
-
- function fetchResource(_x2, _x3) {
- return _ref2.apply(this, arguments);
- }
+function fetchResource(_x, _x2) {
+ return _fetchResource.apply(this, arguments);
+}
- return fetchResource;
-})();
+function _fetchResource() {
+ _fetchResource = _asyncToGenerator(
+ /*#__PURE__*/
+ _regeneratorRuntime.mark(function _callee(url, parsedUrl) {
+ var options, _ref2, response, body;
+
+ return _regeneratorRuntime.wrap(
+ function _callee$(_context) {
+ while (1) {
+ switch ((_context.prev = _context.next)) {
+ case 0:
+ parsedUrl = parsedUrl || URL.parse(encodeURI(url));
+ options = {
+ url: parsedUrl.href,
+ headers: _objectSpread({}, REQUEST_HEADERS),
+ timeout: FETCH_TIMEOUT,
+ // Accept cookies
+ jar: true,
+ // Set to null so the response returns as binary and body as buffer
+ // https://github.com/request/request#requestoptions-callback
+ encoding: null,
+ // Accept and decode gzip
+ gzip: true,
+ // Follow any redirect
+ followAllRedirects: true,
+ };
+ _context.next = 4;
+ return get(options);
+
+ case 4:
+ _ref2 = _context.sent;
+ response = _ref2.response;
+ body = _ref2.body;
+ _context.prev = 7;
+ validateResponse(response);
+ return _context.abrupt('return', {
+ body: body,
+ response: response,
+ });
+
+ case 12:
+ _context.prev = 12;
+ _context.t0 = _context['catch'](7);
+ return _context.abrupt('return', Errors.badUrl);
+
+ case 15:
+ case 'end':
+ return _context.stop();
+ }
+ }
+ },
+ _callee,
+ this,
+ [[7, 12]]
+ );
+ })
+ );
+ return _fetchResource.apply(this, arguments);
+}
function convertMetaProp($, from, to) {
- $('meta[' + from + ']').each(function (_, node) {
+ $('meta['.concat(from, ']')).each(function(_, node) {
var $node = $(node);
-
var value = $node.attr(from);
$node.attr(to, value);
$node.removeAttr(from);
});
-
return $;
-}
-
-// For ease of use in extracting from meta tags,
+} // For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
@@ -403,54 +461,102 @@ function normalizeMetaTags($) {
}
// Spacer images to be removed
-var SPACER_RE = new RegExp('transparent|spacer|blank', 'i');
-
-// The class we will use to mark elements we want to keep
+var SPACER_RE = new RegExp('transparent|spacer|blank', 'i'); // The class we will use to mark elements we want to keep
// but would normally remove
-var KEEP_CLASS = 'mercury-parser-keep';
-
-var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]'];
-
-// A list of tags to strip from the output if we encounter them.
-var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object'];
-
-// cleanAttributes
-var REMOVE_ATTRS = ['style', 'align'];
-var REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(function (selector) {
- return '[' + selector + ']';
-});
-var REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');
-var WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
-
-var WHITELIST_ATTRS_RE = new RegExp('^(' + WHITELIST_ATTRS.join('|') + ')$', 'i');
-// removeEmpty
-var REMOVE_EMPTY_TAGS = ['p'];
-var REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(function (tag) {
- return tag + ':empty';
-}).join(',');
-
-// cleanTags
-var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');
+var KEEP_CLASS = 'mercury-parser-keep';
+var KEEP_SELECTORS = [
+ 'iframe[src^="https://www.youtube.com"]',
+ 'iframe[src^="https://www.youtube-nocookie.com"]',
+ 'iframe[src^="http://www.youtube.com"]',
+ 'iframe[src^="https://player.vimeo"]',
+ 'iframe[src^="http://player.vimeo"]',
+]; // A list of tags to strip from the output if we encounter them.
+
+var STRIP_OUTPUT_TAGS = [
+ 'title',
+ 'script',
+ 'noscript',
+ 'link',
+ 'style',
+ 'hr',
+ 'embed',
+ 'iframe',
+ 'object',
+]; // cleanAttributes
+var WHITELIST_ATTRS = [
+ 'src',
+ 'srcset',
+ 'href',
+ 'class',
+ 'id',
+ 'alt',
+ 'xlink:href',
+ 'width',
+ 'height',
+];
+var WHITELIST_ATTRS_RE = new RegExp(
+ '^('.concat(WHITELIST_ATTRS.join('|'), ')$'),
+ 'i'
+); // removeEmpty
+
+var CLEAN_CONDITIONALLY_TAGS = [
+ 'ul',
+ 'ol',
+ 'table',
+ 'div',
+ 'button',
+ 'form',
+].join(','); // cleanHeaders
-// cleanHeaders
var HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];
-var HEADER_TAG_LIST = HEADER_TAGS.join(',');
-
-// // CONTENT FETCHING CONSTANTS ////
-
+var HEADER_TAG_LIST = HEADER_TAGS.join(','); // // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
-var UNLIKELY_CANDIDATES_BLACKLIST = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot',
-// 'form', // This is too generic, has too many false positives
-'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
-'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
-'presence_control_external', // lifehacker.com container full of false positives
-'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools'];
-
-// A list of strings that can be considered LIKELY candidates when
+
+var UNLIKELY_CANDIDATES_BLACKLIST = [
+ 'ad-break',
+ 'adbox',
+ 'advert',
+ 'addthis',
+ 'agegate',
+ 'aux',
+ 'blogger-labels',
+ 'combx',
+ 'comment',
+ 'conversation',
+ 'disqus',
+ 'entry-unrelated',
+ 'extra',
+ 'foot', // 'form', // This is too generic, has too many false positives
+ 'header',
+ 'hidden',
+ 'loader',
+ 'login', // Note: This can hit 'blogindex'.
+ 'menu',
+ 'meta',
+ 'nav',
+ 'outbrain',
+ 'pager',
+ 'pagination',
+ 'predicta', // readwriteweb inline ad box
+ 'presence_control_external', // lifehacker.com container full of false positives
+ 'popup',
+ 'printfriendly',
+ 'related',
+ 'remove',
+ 'remark',
+ 'rss',
+ 'share',
+ 'shoutbox',
+ 'sidebar',
+ 'sociable',
+ 'sponsor',
+ 'taboola',
+ 'tools',
+]; // A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
@@ -461,100 +567,192 @@ var UNLIKELY_CANDIDATES_BLACKLIST = ['ad-break', 'adbox', 'advert', 'addthis', '
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
-var UNLIKELY_CANDIDATES_WHITELIST = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
-'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];
-// A list of tags which, if found inside, should cause a <div /> to NOT
+var UNLIKELY_CANDIDATES_WHITELIST = [
+ 'and',
+ 'article',
+ 'body',
+ 'blogindex',
+ 'column',
+ 'content',
+ 'entry-content-asset',
+ 'format', // misuse of form
+ 'hfeed',
+ 'hentry',
+ 'hatom',
+ 'main',
+ 'page',
+ 'posts',
+ 'shadow',
+]; // A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
-var DIV_TO_P_BLOCK_TAGS = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');
-
-// A list of tags that should be ignored when trying to find the top candidate
-// for a document.
-
-
-
-
-// A list of selectors that specify, very clearly, either hNews or other
-// very content-specific style content, like Blogger templates.
-// More examples here: http://microformats.org/wiki/blog-post-formats
-
-
-
-
-// A list of strings that denote a positive scoring for this content as being
+var DIV_TO_P_BLOCK_TAGS = [
+ 'a',
+ 'blockquote',
+ 'dl',
+ 'div',
+ 'img',
+ 'p',
+ 'pre',
+ 'table',
+].join(','); // A list of tags that should be ignored when trying to find the top candidate
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
-var POSITIVE_SCORE_HINTS = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
-'\\Bcopy'];
-
-// The above list, joined into a matching regular expression
-var POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');
-
-// Readability publisher-specific guidelines
-
-// A list of strings that denote a negative scoring for this content as being
+var POSITIVE_SCORE_HINTS = [
+ 'article',
+ 'articlecontent',
+ 'instapaper_body',
+ 'blog',
+ 'body',
+ 'content',
+ 'entry-content-asset',
+ 'entry',
+ 'hentry',
+ 'main',
+ 'Normal',
+ 'page',
+ 'pagination',
+ 'permalink',
+ 'post',
+ 'story',
+ 'text',
+ '[-_]copy', // usatoday
+ '\\Bcopy',
+]; // The above list, joined into a matching regular expression
+
+var POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // Readability publisher-specific guidelines
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
-var NEGATIVE_SCORE_HINTS = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
-'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
-'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
-'promo', 'pr_', // autoblog - press release
-'related', 'respond', 'roundcontent', // lifehacker restricted content warning
-'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget'];
-// The above list, joined into a matching regular expression
-var NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');
-
-// XPath to try to determine if a page is wordpress. Not always successful.
-var IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';
-
-// Match a digit. Pretty clear.
-
-
-// A list of words that, if found in link text or URLs, likely mean that
-// this link is not a next page link.
-
-
-
-// Match any phrase that looks like it could be page, or paging, or pagination
-var PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');
-
-// Match any link text/classname/id that looks like it could mean the next
-// page. Things like: next, continue, >, >>, » but not >|, »| as those can
-// mean last page.
-// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i');
-
-// Match any link text/classname/id that looks like it is an end link: things
-// like "first", "last", "end", etc.
-
-
-// Match any link text/classname/id that looks like it means the previous
-// page.
-
-
-// Match 2 or more consecutive tags
-
-
-// Match 1 BR tag.
-
-
-// A list of all of the block level tags known in HTML5 and below. Taken from
+var NEGATIVE_SCORE_HINTS = [
+ 'adbox',
+ 'advert',
+ 'author',
+ 'bio',
+ 'bookmark',
+ 'bottom',
+ 'byline',
+ 'clear',
+ 'com-',
+ 'combx',
+ 'comment',
+ 'comment\\B',
+ 'contact',
+ 'copy',
+ 'credit',
+ 'crumb',
+ 'date',
+ 'deck',
+ 'excerpt',
+ 'featured', // tnr.com has a featured_content which throws us off
+ 'foot',
+ 'footer',
+ 'footnote',
+ 'graf',
+ 'head',
+ 'info',
+ 'infotext', // newscientist.com copyright
+ 'instapaper_ignore',
+ 'jump',
+ 'linebreak',
+ 'link',
+ 'masthead',
+ 'media',
+ 'meta',
+ 'modal',
+ 'outbrain', // slate.com junk
+ 'promo',
+ 'pr_', // autoblog - press release
+ 'related',
+ 'respond',
+ 'roundcontent', // lifehacker restricted content warning
+ 'scroll',
+ 'secondary',
+ 'share',
+ 'shopping',
+ 'shoutbox',
+ 'side',
+ 'sidebar',
+ 'sponsor',
+ 'stamp',
+ 'sub',
+ 'summary',
+ 'tags',
+ 'tools',
+ 'widget',
+]; // The above list, joined into a matching regular expression
+
+var NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // XPath to try to determine if a page is wordpress. Not always successful.
+
+var IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'; // Match a digit. Pretty clear.
+
+var PAGE_RE = new RegExp('pag(e|ing|inat)', 'i'); // Match any link text/classname/id that looks like it could mean the next
// http://bit.ly/qneNIT
-var BLOCK_LEVEL_TAGS = ['article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video'];
-var BLOCK_LEVEL_TAGS_RE = new RegExp('^(' + BLOCK_LEVEL_TAGS.join('|') + ')$', 'i');
-// The removal is implemented as a blacklist and whitelist, this test finds
+var BLOCK_LEVEL_TAGS = [
+ 'article',
+ 'aside',
+ 'blockquote',
+ 'body',
+ 'br',
+ 'button',
+ 'canvas',
+ 'caption',
+ 'col',
+ 'colgroup',
+ 'dd',
+ 'div',
+ 'dl',
+ 'dt',
+ 'embed',
+ 'fieldset',
+ 'figcaption',
+ 'figure',
+ 'footer',
+ 'form',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'header',
+ 'hgroup',
+ 'hr',
+ 'li',
+ 'map',
+ 'object',
+ 'ol',
+ 'output',
+ 'p',
+ 'pre',
+ 'progress',
+ 'section',
+ 'table',
+ 'tbody',
+ 'textarea',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ 'ul',
+ 'video',
+];
+var BLOCK_LEVEL_TAGS_RE = new RegExp(
+ '^('.concat(BLOCK_LEVEL_TAGS.join('|'), ')$'),
+ 'i'
+); // The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
+
var candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
var CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
-
var candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
var CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
@@ -568,27 +766,28 @@ function stripUnlikelyCandidates($) {
//
// :param $: a cheerio object to strip nodes from
// :return $: the cleaned cheerio object
- $('*').not('a').each(function (index, node) {
- var $node = $(node);
- var classes = $node.attr('class');
- var id = $node.attr('id');
- if (!id && !classes) return;
-
- var classAndId = (classes || '') + ' ' + (id || '');
- if (CANDIDATES_WHITELIST.test(classAndId)) {
- return;
- } else if (CANDIDATES_BLACKLIST.test(classAndId)) {
- $node.remove();
- }
- });
+ $('*')
+ .not('a')
+ .each(function(index, node) {
+ var $node = $(node);
+ var classes = $node.attr('class');
+ var id = $node.attr('id');
+ if (!id && !classes) return;
+ var classAndId = ''.concat(classes || '', ' ').concat(id || '');
+
+ if (CANDIDATES_WHITELIST.test(classAndId)) {
+ return;
+ }
+ if (CANDIDATES_BLACKLIST.test(classAndId)) {
+ $node.remove();
+ }
+ });
return $;
}
-// ## NOTES:
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
-
// Given cheerio object, convert consecutive tags into
// tags instead.
//
@@ -596,7 +795,7 @@ function stripUnlikelyCandidates($) {
function brsToPs$$1($) {
var collapsing = false;
- $('br').each(function (index, element) {
+ $('br').each(function(index, element) {
var $element = $(element);
var nextElement = $element.next().get(0);
@@ -604,16 +803,14 @@ function brsToPs$$1($) {
collapsing = true;
$element.remove();
} else if (collapsing) {
- collapsing = false;
- // $(element).replaceWith('')
+ collapsing = false; // $(element).replaceWith('')
+
paragraphize(element, $, true);
}
});
-
return $;
}
-// Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
@@ -625,18 +822,21 @@ function brsToPs$$1($) {
// :param br: Whether or not the passed node is a br
function paragraphize(node, $) {
- var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
-
+ var br =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
var $node = $(node);
if (br) {
var sibling = node.nextSibling;
- var p = $('');
-
- // while the next node is text or not a block level element
+ var p = $(''); // while the next node is text or not a block level element
// append it to a new p node
- while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
- var nextSibling = sibling.nextSibling;
+
+ while (
+ sibling &&
+ !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))
+ ) {
+ var _sibling = sibling,
+ nextSibling = _sibling.nextSibling;
$(sibling).appendTo(p);
sibling = nextSibling;
}
@@ -650,7 +850,7 @@ function paragraphize(node, $) {
}
function convertDivs($) {
- $('div').each(function (index, div) {
+ $('div').each(function(index, div) {
var $div = $(div);
var convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;
@@ -658,23 +858,20 @@ function convertDivs($) {
convertNodeTo$$1($div, $, 'p');
}
});
-
return $;
}
function convertSpans($) {
- $('span').each(function (index, span) {
+ $('span').each(function(index, span) {
var $span = $(span);
var convertable = $span.parents('p, div').length === 0;
+
if (convertable) {
convertNodeTo$$1($span, $, 'p');
}
});
-
return $;
-}
-
-// Loop through the provided doc, and convert any p-like elements to
+} // Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
@@ -690,44 +887,55 @@ function convertToParagraphs$$1($) {
$ = brsToPs$$1($);
$ = convertDivs($);
$ = convertSpans($);
-
return $;
}
function convertNodeTo$$1($node, $) {
- var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
-
+ var tag =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
var node = $node.get(0);
+
if (!node) {
return $;
}
- var attrs = getAttrs(node) || {};
- // console.log(attrs)
- var attribString = _Reflect$ownKeys(attrs).map(function (key) {
- return key + '=' + attrs[key];
- }).join(' ');
- var html = void 0;
+ var attrs = getAttrs(node) || {}; // console.log(attrs)
+
+ var attribString = _Reflect$ownKeys(attrs)
+ .map(function(key) {
+ return ''.concat(key, '=').concat(attrs[key]);
+ })
+ .join(' ');
+
+ var html;
if ($.browser) {
// In the browser, the contents of noscript tags aren't rendered, therefore
// transforms on the noscript tag (commonly used for lazy-loading) don't work
// as expected. This test case handles that
- html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
+ html =
+ node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html();
} else {
html = $node.contents();
}
- $node.replaceWith('<' + tag + ' ' + attribString + '>' + html + '' + tag + '>');
+
+ $node.replaceWith(
+ '<'
+ .concat(tag, ' ')
+ .concat(attribString, '>')
+ .concat(html, '')
+ .concat(tag, '>')
+ );
return $;
}
function cleanForHeight($img, $) {
- var height = parseInt($img.attr('height'), 10);
- var width = parseInt($img.attr('width'), 10) || 20;
+ var height = _parseInt($img.attr('height'), 10);
- // Remove images that explicitly have very small heights or
+ var width = _parseInt($img.attr('width'), 10) || 20; // Remove images that explicitly have very small heights or
// widths, because they are most likely shims or icons,
// which aren't very useful for reading.
+
if ((height || 20) < 10 || width < 10) {
$img.remove();
} else if (height) {
@@ -738,10 +946,9 @@ function cleanForHeight($img, $) {
}
return $;
-}
-
-// Cleans out images where the source string matches transparent/spacer/etc
+} // Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
+
function removeSpacers($img, $) {
if (SPACER_RE.test($img.attr('src'))) {
$img.remove();
@@ -751,18 +958,17 @@ function removeSpacers($img, $) {
}
function cleanImages($article, $) {
- $article.find('img').each(function (index, img) {
+ $article.find('img').each(function(index, img) {
var $img = $(img);
-
cleanForHeight($img, $);
removeSpacers($img, $);
});
-
return $;
}
function markToKeep(article, $, url) {
- var tags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : [];
+ var tags =
+ arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : [];
if (tags.length === 0) {
tags = KEEP_SELECTORS;
@@ -770,43 +976,45 @@ function markToKeep(article, $, url) {
if (url) {
var _URL$parse = URL.parse(url),
- protocol = _URL$parse.protocol,
- hostname = _URL$parse.hostname;
+ protocol = _URL$parse.protocol,
+ hostname = _URL$parse.hostname;
- tags = [].concat(_toConsumableArray(tags), ['iframe[src^="' + protocol + '//' + hostname + '"]']);
+ tags = [].concat(_toConsumableArray(tags), [
+ 'iframe[src^="'.concat(protocol, '//').concat(hostname, '"]'),
+ ]);
}
$(tags.join(','), article).addClass(KEEP_CLASS);
-
return $;
}
function stripJunkTags(article, $) {
- var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
+ var tags =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
if (tags.length === 0) {
tags = STRIP_OUTPUT_TAGS;
- }
-
- // Remove matching elements, but ignore
+ } // Remove matching elements, but ignore
// any element with a class of mercury-parser-keep
- $(tags.join(','), article).not('.' + KEEP_CLASS).remove();
+ $(tags.join(','), article)
+ .not('.'.concat(KEEP_CLASS))
+ .remove();
return $;
}
-// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
+
function cleanHOnes$$1(article, $) {
var $hOnes = $('h1', article);
if ($hOnes.length < 3) {
- $hOnes.each(function (index, node) {
+ $hOnes.each(function(index, node) {
return $(node).remove();
});
} else {
- $hOnes.each(function (index, node) {
+ $hOnes.each(function(index, node) {
convertNodeTo$$1($(node), $, 'h2');
});
}
@@ -815,148 +1023,178 @@ function cleanHOnes$$1(article, $) {
}
function removeAllButWhitelist($article, $) {
- $article.find('*').each(function (index, node) {
+ $article.find('*').each(function(index, node) {
var attrs = getAttrs(node);
+ setAttrs(
+ node,
+ _Reflect$ownKeys(attrs).reduce(function(acc, attr) {
+ if (WHITELIST_ATTRS_RE.test(attr)) {
+ return _objectSpread({}, acc, _defineProperty({}, attr, attrs[attr]));
+ }
- setAttrs(node, _Reflect$ownKeys(attrs).reduce(function (acc, attr) {
- if (WHITELIST_ATTRS_RE.test(attr)) {
- return _extends({}, acc, _defineProperty({}, attr, attrs[attr]));
- }
-
- return acc;
- }, {}));
- });
-
- // Remove the mercury-parser-keep class from result
- $('.' + KEEP_CLASS, $article).removeClass(KEEP_CLASS);
+ return acc;
+ }, {})
+ );
+ }); // Remove the mercury-parser-keep class from result
+ $('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS);
return $article;
-}
-
-// function removeAttrs(article, $) {
+} // function removeAttrs(article, $) {
// REMOVE_ATTRS.forEach((attr) => {
// $(`[${attr}]`, article).removeAttr(attr);
// });
// }
-
// Remove attributes like style or align
+
function cleanAttributes$$1($article, $) {
// Grabbing the parent because at this point
// $article will be wrapped in a div which will
// have a score set on it.
- return removeAllButWhitelist($article.parent().length ? $article.parent() : $article, $);
+ return removeAllButWhitelist(
+ $article.parent().length ? $article.parent() : $article,
+ $
+ );
}
function removeEmpty($article, $) {
- $article.find('p').each(function (index, p) {
+ $article.find('p').each(function(index, p) {
var $p = $(p);
- if ($p.find('iframe, img').length === 0 && $p.text().trim() === '') $p.remove();
+ if ($p.find('iframe, img').length === 0 && $p.text().trim() === '')
+ $p.remove();
});
-
return $;
}
// // CONTENT FETCHING CONSTANTS ////
-
-// A list of strings that can be considered unlikely candidates when
-// extracting content from a resource. These strings are joined together
-// and then tested for existence using re:test, so may contain simple,
-// non-pipe style regular expression queries if necessary.
-var UNLIKELY_CANDIDATES_BLACKLIST$1 = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', 'form', 'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
-'menu', 'meta', 'nav', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
-'presence_control_external', // lifehacker.com container full of false positives
-'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'tools'];
-
-// A list of strings that can be considered LIKELY candidates when
-// extracting content from a resource. Essentially, the inverse of the
-// blacklist above - if something matches both blacklist and whitelist,
-// it is kept. This is useful, for example, if something has a className
-// of "rss-content entry-content". It matched 'rss', so it would normally
-// be removed, however, it's also the entry content, so it should be left
-// alone.
-//
-// These strings are joined together and then tested for existence using
-// re:test, so may contain simple, non-pipe style regular expression queries
-// if necessary.
-var UNLIKELY_CANDIDATES_WHITELIST$1 = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
-'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];
-
-// A list of tags which, if found inside, should cause a to NOT
-// be turned into a paragraph tag. Shallow div tags without these elements
-// should be turned into tags.
-var DIV_TO_P_BLOCK_TAGS$1 = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');
-
-// A list of tags that should be ignored when trying to find the top candidate
// for a document.
-var NON_TOP_CANDIDATE_TAGS$1 = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
-var NON_TOP_CANDIDATE_TAGS_RE$1 = new RegExp('^(' + NON_TOP_CANDIDATE_TAGS$1.join('|') + ')$', 'i');
-
-// A list of selectors that specify, very clearly, either hNews or other
+var NON_TOP_CANDIDATE_TAGS$1 = [
+ 'br',
+ 'b',
+ 'i',
+ 'label',
+ 'hr',
+ 'area',
+ 'base',
+ 'basefont',
+ 'input',
+ 'img',
+ 'link',
+ 'meta',
+];
+var NON_TOP_CANDIDATE_TAGS_RE$1 = new RegExp(
+ '^('.concat(NON_TOP_CANDIDATE_TAGS$1.join('|'), ')$'),
+ 'i'
+); // A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
-var HNEWS_CONTENT_SELECTORS$1 = [['.hentry', '.entry-content'], ['entry', '.entry-content'], ['.entry', '.entry_content'], ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body']];
+var HNEWS_CONTENT_SELECTORS$1 = [
+ ['.hentry', '.entry-content'],
+ ['entry', '.entry-content'],
+ ['.entry', '.entry_content'],
+ ['.post', '.postbody'],
+ ['.post', '.post_body'],
+ ['.post', '.post-body'],
+];
var PHOTO_HINTS$1 = ['figure', 'photo', 'image', 'caption'];
-var PHOTO_HINTS_RE$1 = new RegExp(PHOTO_HINTS$1.join('|'), 'i');
-
-// A list of strings that denote a positive scoring for this content as being
+var PHOTO_HINTS_RE$1 = new RegExp(PHOTO_HINTS$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
-var POSITIVE_SCORE_HINTS$1 = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday
-'\\Bcopy'];
-// The above list, joined into a matching regular expression
-var POSITIVE_SCORE_RE$1 = new RegExp(POSITIVE_SCORE_HINTS$1.join('|'), 'i');
-
-// Readability publisher-specific guidelines
-var READABILITY_ASSET$1 = new RegExp('entry-content-asset', 'i');
-
-// A list of strings that denote a negative scoring for this content as being
+var POSITIVE_SCORE_HINTS$1 = [
+ 'article',
+ 'articlecontent',
+ 'instapaper_body',
+ 'blog',
+ 'body',
+ 'content',
+ 'entry-content-asset',
+ 'entry',
+ 'hentry',
+ 'main',
+ 'Normal',
+ 'page',
+ 'pagination',
+ 'permalink',
+ 'post',
+ 'story',
+ 'text',
+ '[-_]copy', // usatoday
+ '\\Bcopy',
+]; // The above list, joined into a matching regular expression
+
+var POSITIVE_SCORE_RE$1 = new RegExp(POSITIVE_SCORE_HINTS$1.join('|'), 'i'); // Readability publisher-specific guidelines
+
+var READABILITY_ASSET$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
-var NEGATIVE_SCORE_HINTS$1 = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off
-'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright
-'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk
-'promo', 'pr_', // autoblog - press release
-'related', 'respond', 'roundcontent', // lifehacker restricted content warning
-'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget'];
-// The above list, joined into a matching regular expression
-var NEGATIVE_SCORE_RE$1 = new RegExp(NEGATIVE_SCORE_HINTS$1.join('|'), 'i');
-
-// Match a digit. Pretty clear.
-
-
-// Match 2 or more consecutive tags
-
-
-// Match 1 BR tag.
-
-
-// A list of all of the block level tags known in HTML5 and below. Taken from
-// http://bit.ly/qneNIT
-
-
-
-// The removal is implemented as a blacklist and whitelist, this test finds
-// blacklisted elements that aren't whitelisted. We do this all in one
-// expression-both because it's only one pass, and because this skips the
-// serialization for whitelisted nodes.
-var candidatesBlacklist$1 = UNLIKELY_CANDIDATES_BLACKLIST$1.join('|');
-
-
-var candidatesWhitelist$1 = UNLIKELY_CANDIDATES_WHITELIST$1.join('|');
-
-
-
+var NEGATIVE_SCORE_HINTS$1 = [
+ 'adbox',
+ 'advert',
+ 'author',
+ 'bio',
+ 'bookmark',
+ 'bottom',
+ 'byline',
+ 'clear',
+ 'com-',
+ 'combx',
+ 'comment',
+ 'comment\\B',
+ 'contact',
+ 'copy',
+ 'credit',
+ 'crumb',
+ 'date',
+ 'deck',
+ 'excerpt',
+ 'featured', // tnr.com has a featured_content which throws us off
+ 'foot',
+ 'footer',
+ 'footnote',
+ 'graf',
+ 'head',
+ 'info',
+ 'infotext', // newscientist.com copyright
+ 'instapaper_ignore',
+ 'jump',
+ 'linebreak',
+ 'link',
+ 'masthead',
+ 'media',
+ 'meta',
+ 'modal',
+ 'outbrain', // slate.com junk
+ 'promo',
+ 'pr_', // autoblog - press release
+ 'related',
+ 'respond',
+ 'roundcontent', // lifehacker restricted content warning
+ 'scroll',
+ 'secondary',
+ 'share',
+ 'shopping',
+ 'shoutbox',
+ 'side',
+ 'sidebar',
+ 'sponsor',
+ 'stamp',
+ 'sub',
+ 'summary',
+ 'tags',
+ 'tools',
+ 'widget',
+]; // The above list, joined into a matching regular expression
+
+var NEGATIVE_SCORE_RE$1 = new RegExp(NEGATIVE_SCORE_HINTS$1.join('|'), 'i');
var PARAGRAPH_SCORE_TAGS$1 = new RegExp('^(p|li|span|pre)$', 'i');
var CHILD_CONTENT_TAGS$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
var BAD_TAGS$1 = new RegExp('^(address|form)$', 'i');
-// Get the score of a node based on its className and id.
function getWeight(node) {
var classes = node.attr('class');
var id = node.attr('id');
@@ -967,6 +1205,7 @@ function getWeight(node) {
if (POSITIVE_SCORE_RE$1.test(id)) {
score += 25;
}
+
if (NEGATIVE_SCORE_RE$1.test(id)) {
score -= 25;
}
@@ -979,22 +1218,21 @@ function getWeight(node) {
if (POSITIVE_SCORE_RE$1.test(classes)) {
score += 25;
}
+
if (NEGATIVE_SCORE_RE$1.test(classes)) {
score -= 25;
}
- }
-
- // even if score has been set by id, add score for
+ } // even if score has been set by id, add score for
// possible photo matches
// "try to keep photos if we can"
+
if (PHOTO_HINTS_RE$1.test(classes)) {
score += 10;
- }
-
- // add 25 if class matches entry-content-asset,
+ } // add 25 if class matches entry-content-asset,
// a class apparently instructed for use in the
// Readability publisher guidelines
// https://www.readability.com/developers/guidelines
+
if (READABILITY_ASSET$1.test(classes)) {
score += 25;
}
@@ -1007,7 +1245,7 @@ function getWeight(node) {
// the node's score attribute
// returns null if no score set
function getScore($node) {
- return parseFloat($node.attr('score')) || null;
+ return _parseFloat($node.attr('score')) || null;
}
// return 1 for every comma in text
@@ -1016,20 +1254,18 @@ function scoreCommas(text) {
}
var idkRe = new RegExp('^(p|pre)$', 'i');
-
function scoreLength(textLength) {
- var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
-
+ var tagName =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
var chunks = textLength / 50;
if (chunks > 0) {
- var lengthBonus = void 0;
-
- // No idea why p or pre are being tamped down here
+ var lengthBonus; // No idea why p or pre are being tamped down here
// but just following the source for now
// Not even sure why tagName is included here,
// since this is only being called from the context
// of scoreParagraph
+
if (idkRe.test(tagName)) {
lengthBonus = chunks - 2;
} else {
@@ -1042,29 +1278,25 @@ function scoreLength(textLength) {
return 0;
}
-// Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better.
+
function scoreParagraph$$1(node) {
var score = 1;
var text = node.text().trim();
- var textLength = text.length;
+ var textLength = text.length; // If this paragraph is less than 25 characters, don't count it.
- // If this paragraph is less than 25 characters, don't count it.
if (textLength < 25) {
return 0;
- }
-
- // Add points for any commas within this paragraph
- score += scoreCommas(text);
+ } // Add points for any commas within this paragraph
- // For every 50 characters in this paragraph, add another point. Up
+ score += scoreCommas(text); // For every 50 characters in this paragraph, add another point. Up
// to 3 points.
- score += scoreLength(textLength);
- // Articles can end with short paragraphs when people are being clever
+ score += scoreLength(textLength); // Articles can end with short paragraphs when people are being clever
// but they can also end with short paragraphs setting up lists of junk
// that we strip. This negative tweaks junk setup paragraphs just below
// the cutoff threshold.
+
if (text.slice(-1) === ':') {
score -= 1;
}
@@ -1088,9 +1320,9 @@ function addScore$$1($node, $, amount) {
return $node;
}
-// Adds 1/4 of a child's score to its parent
function addToParent$$1(node, $, score) {
var parent = node.parent();
+
if (parent) {
addScore$$1(parent, $, score * 0.25);
}
@@ -1098,12 +1330,12 @@ function addToParent$$1(node, $, score) {
return node;
}
-// gets and returns the score if it exists
// if not, initializes a score based on
// the node's tag type
-function getOrInitScore$$1($node, $) {
- var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
+function getOrInitScore$$1($node, $) {
+ var weightNodes =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
var score = getScore($node);
if (score) {
@@ -1117,30 +1349,34 @@ function getOrInitScore$$1($node, $) {
}
addToParent$$1($node, $, score);
-
return score;
}
-// Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag.
+
function scoreNode$$1($node) {
var _$node$get = $node.get(0),
- tagName = _$node$get.tagName;
-
- // TODO: Consider ordering by most likely.
+ tagName = _$node$get.tagName; // TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page,
// Could save doing that regex test on every node – AP
-
if (PARAGRAPH_SCORE_TAGS$1.test(tagName)) {
return scoreParagraph$$1($node);
- } else if (tagName.toLowerCase() === 'div') {
+ }
+
+ if (tagName.toLowerCase() === 'div') {
return 5;
- } else if (CHILD_CONTENT_TAGS$1.test(tagName)) {
+ }
+
+ if (CHILD_CONTENT_TAGS$1.test(tagName)) {
return 3;
- } else if (BAD_TAGS$1.test(tagName)) {
+ }
+
+ if (BAD_TAGS$1.test(tagName)) {
return -3;
- } else if (tagName.toLowerCase() === 'th') {
+ }
+
+ if (tagName.toLowerCase() === 'th') {
return -5;
}
@@ -1150,7 +1386,7 @@ function scoreNode$$1($node) {
function convertSpans$1($node, $) {
if ($node.get(0)) {
var _$node$get = $node.get(0),
- tagName = _$node$get.tagName;
+ tagName = _$node$get.tagName;
if (tagName === 'span') {
// convert spans to divs
@@ -1167,59 +1403,59 @@ function addScoreTo($node, $, score) {
}
function scorePs($, weightNodes) {
- $('p, pre').not('[score]').each(function (index, node) {
- // The raw score for this paragraph, before we add any parent/child
- // scores.
- var $node = $(node);
- $node = setScore($node, $, getOrInitScore$$1($node, $, weightNodes));
-
- var $parent = $node.parent();
- var rawScore = scoreNode$$1($node);
-
- addScoreTo($parent, $, rawScore, weightNodes);
- if ($parent) {
- // Add half of the individual content score to the
- // grandparent
- addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
- }
- });
-
+ $('p, pre')
+ .not('[score]')
+ .each(function(index, node) {
+ // The raw score for this paragraph, before we add any parent/child
+ // scores.
+ var $node = $(node);
+ $node = setScore($node, $, getOrInitScore$$1($node, $, weightNodes));
+ var $parent = $node.parent();
+ var rawScore = scoreNode$$1($node);
+ addScoreTo($parent, $, rawScore, weightNodes);
+
+ if ($parent) {
+ // Add half of the individual content score to the
+ // grandparent
+ addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
+ }
+ });
return $;
-}
-
-// score content. Parents get the full value of their children's
+} // score content. Parents get the full value of their children's
// content score, grandparents half
-function scoreContent$$1($) {
- var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
+function scoreContent$$1($) {
+ var weightNodes =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
// First, look for special hNews based selectors and give them a big
// boost, if they exist
- HNEWS_CONTENT_SELECTORS$1.forEach(function (_ref) {
+ HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref) {
var _ref2 = _slicedToArray(_ref, 2),
- parentSelector = _ref2[0],
- childSelector = _ref2[1];
+ parentSelector = _ref2[0],
+ childSelector = _ref2[1];
- $(parentSelector + ' ' + childSelector).each(function (index, node) {
+ $(''.concat(parentSelector, ' ').concat(childSelector)).each(function(
+ index,
+ node
+ ) {
addScore$$1($(node).parent(parentSelector), $, 80);
});
- });
-
- // Doubling this again
+ }); // Doubling this again
// Previous solution caused a bug
// in which parents weren't retaining
// scores. This is not ideal, and
// should be fixed.
+
scorePs($, weightNodes);
scorePs($, weightNodes);
-
return $;
}
-// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
+
function mergeSiblings($candidate, topScore, $) {
if (!$candidate.parent().length) {
return $candidate;
@@ -1227,74 +1463,87 @@ function mergeSiblings($candidate, topScore, $) {
var siblingScoreThreshold = Math.max(10, topScore * 0.25);
var wrappingDiv = $('');
+ $candidate
+ .parent()
+ .children()
+ .each(function(index, sibling) {
+ var $sibling = $(sibling); // Ignore tags like BR, HR, etc
- $candidate.parent().children().each(function (index, sibling) {
- var $sibling = $(sibling);
- // Ignore tags like BR, HR, etc
- if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) {
- return null;
- }
+ if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) {
+ return null;
+ }
- var siblingScore = getScore($sibling);
- if (siblingScore) {
- if ($sibling.get(0) === $candidate.get(0)) {
- wrappingDiv.append($sibling);
- } else {
- var contentBonus = 0;
- var density = linkDensity($sibling);
+ var siblingScore = getScore($sibling);
- // If sibling has a very low link density,
- // give it a small bonus
- if (density < 0.05) {
- contentBonus += 20;
- }
+ if (siblingScore) {
+ if ($sibling.get(0) === $candidate.get(0)) {
+ wrappingDiv.append($sibling);
+ } else {
+ var contentBonus = 0;
+ var density = linkDensity($sibling); // If sibling has a very low link density,
+ // give it a small bonus
+
+ if (density < 0.05) {
+ contentBonus += 20;
+ } // If sibling has a high link density,
+ // give it a penalty
+
+ if (density >= 0.5) {
+ contentBonus -= 20;
+ } // If sibling node has the same class as
+ // candidate, give it a bonus
+
+ if ($sibling.attr('class') === $candidate.attr('class')) {
+ contentBonus += topScore * 0.2;
+ }
- // If sibling has a high link density,
- // give it a penalty
- if (density >= 0.5) {
- contentBonus -= 20;
- }
+ var newScore = siblingScore + contentBonus;
- // If sibling node has the same class as
- // candidate, give it a bonus
- if ($sibling.attr('class') === $candidate.attr('class')) {
- contentBonus += topScore * 0.2;
- }
+ if (newScore >= siblingScoreThreshold) {
+ return wrappingDiv.append($sibling);
+ }
- var newScore = siblingScore + contentBonus;
+ if (sibling.tagName === 'p') {
+ var siblingContent = $sibling.text();
+ var siblingContentLength = textLength(siblingContent);
- if (newScore >= siblingScoreThreshold) {
- return wrappingDiv.append($sibling);
- } else if (sibling.tagName === 'p') {
- var siblingContent = $sibling.text();
- var siblingContentLength = textLength(siblingContent);
+ if (siblingContentLength > 80 && density < 0.25) {
+ return wrappingDiv.append($sibling);
+ }
- if (siblingContentLength > 80 && density < 0.25) {
- return wrappingDiv.append($sibling);
- } else if (siblingContentLength <= 80 && density === 0 && hasSentenceEnd(siblingContent)) {
- return wrappingDiv.append($sibling);
+ if (
+ siblingContentLength <= 80 &&
+ density === 0 &&
+ hasSentenceEnd(siblingContent)
+ ) {
+ return wrappingDiv.append($sibling);
+ }
}
}
}
- }
- return null;
- });
+ return null;
+ });
- if (wrappingDiv.children().length === 1 && wrappingDiv.children().first().get(0) === $candidate.get(0)) {
+ if (
+ wrappingDiv.children().length === 1 &&
+ wrappingDiv
+ .children()
+ .first()
+ .get(0) === $candidate.get(0)
+ ) {
return $candidate;
}
return wrappingDiv;
}
-// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
+
function findTopCandidate$$1($) {
- var $candidate = void 0;
+ var $candidate;
var topScore = 0;
-
- $('[score]').each(function (index, node) {
+ $('[score]').each(function(index, node) {
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE$1.test(node.tagName)) {
return;
@@ -1307,16 +1556,14 @@ function findTopCandidate$$1($) {
topScore = score;
$candidate = $node;
}
- });
-
- // If we don't have a candidate, return the body
+ }); // If we don't have a candidate, return the body
// or whatever the first element is
+
if (!$candidate) {
return $('body') || $('*').first();
}
$candidate = mergeSiblings($candidate, topScore, $);
-
return $candidate;
}
@@ -1335,45 +1582,46 @@ function removeUnlessContent($node, $, weight) {
if (scoreCommas(content) < 10) {
var pCount = $('p', $node).length;
- var inputCount = $('input', $node).length;
+ var inputCount = $('input', $node).length; // Looks like a form, too many inputs.
- // Looks like a form, too many inputs.
if (inputCount > pCount / 3) {
$node.remove();
return;
}
var contentLength = content.length;
- var imgCount = $('img', $node).length;
-
- // Content is too short, and there are no images, so
+ var imgCount = $('img', $node).length; // Content is too short, and there are no images, so
// this is probably junk content.
+
if (contentLength < 25 && imgCount === 0) {
$node.remove();
return;
}
- var density = linkDensity($node);
-
- // Too high of link density, is probably a menu or
+ var density = linkDensity($node); // Too high of link density, is probably a menu or
// something similar.
// console.log(weight, density, contentLength)
+
if (weight < 25 && density > 0.2 && contentLength > 75) {
$node.remove();
return;
- }
-
- // Too high of a link density, despite the score being
+ } // Too high of a link density, despite the score being
// high.
+
if (weight >= 25 && density > 0.5) {
// Don't remove the node if it's a list and the
// previous sibling starts with a colon though. That
// means it's probably content.
var tagName = $node.get(0).tagName.toLowerCase();
var nodeIsList = tagName === 'ol' || tagName === 'ul';
+
if (nodeIsList) {
var previousNode = $node.prev();
- if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') {
+
+ if (
+ previousNode &&
+ normalizeSpaces(previousNode.text()).slice(-1) === ':'
+ ) {
return;
}
}
@@ -1382,36 +1630,36 @@ function removeUnlessContent($node, $, weight) {
return;
}
- var scriptCount = $('script', $node).length;
+ var scriptCount = $('script', $node).length; // Too many script tags, not enough content.
- // Too many script tags, not enough content.
if (scriptCount > 0 && contentLength < 150) {
$node.remove();
- return;
}
}
-}
-
-// Given an article, clean it of some superfluous content specified by
+} // Given an article, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag name's to search through. (like div, form,
// etc)
//
// Return this same doc.
+
function cleanTags$$1($article, $) {
- $(CLEAN_CONDITIONALLY_TAGS, $article).each(function (index, node) {
- var $node = $(node);
- // If marked to keep, skip it
- if ($node.hasClass(KEEP_CLASS) || $node.find('.' + KEEP_CLASS).length > 0) return;
+ $(CLEAN_CONDITIONALLY_TAGS, $article).each(function(index, node) {
+ var $node = $(node); // If marked to keep, skip it
+ if (
+ $node.hasClass(KEEP_CLASS) ||
+ $node.find('.'.concat(KEEP_CLASS)).length > 0
+ )
+ return;
var weight = getScore($node);
+
if (!weight) {
weight = getOrInitScore$$1($node, $);
setScore($node, $, weight);
- }
+ } // drop node if its weight is < 0
- // drop node if its weight is < 0
if (weight < 0) {
$node.remove();
} else {
@@ -1419,54 +1667,49 @@ function cleanTags$$1($article, $) {
removeUnlessContent($node, $, weight);
}
});
-
return $;
}
function cleanHeaders($article, $) {
- var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
-
- $(HEADER_TAG_LIST, $article).each(function (index, header) {
- var $header = $(header);
- // Remove any headers that appear before all other p tags in the
+ var title =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
+ $(HEADER_TAG_LIST, $article).each(function(index, header) {
+ var $header = $(header); // Remove any headers that appear before all other p tags in the
// document. This probably means that it was part of the title, a
// subtitle or something else extraneous like a datestamp or byline,
// all of which should be handled by other metadata handling.
+
if ($($header, $article).prevAll('p').length === 0) {
return $header.remove();
- }
+ } // Remove any headers that match the title exactly.
- // Remove any headers that match the title exactly.
if (normalizeSpaces($(header).text()) === title) {
return $header.remove();
- }
-
- // If this header has a negative weight, it's probably junk.
+ } // If this header has a negative weight, it's probably junk.
// Get rid of it.
+
if (getWeight($(header)) < 0) {
return $header.remove();
}
return $header;
});
-
return $;
}
-// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
+
function rewriteTopLevel$$1(article, $) {
// I'm not using context here because
// it's problematic when converting the
// top-level/root node - AP
$ = convertNodeTo$$1($('html'), $, 'div');
$ = convertNodeTo$$1($('body'), $, 'div');
-
return $;
}
function absolutize($, rootUrl, attr, $content) {
- $('[' + attr + ']', $content).each(function (_, node) {
+ $('['.concat(attr, ']'), $content).each(function(_, node) {
var attrs = getAttrs(node);
var url = attrs[attr];
@@ -1478,43 +1721,42 @@ function absolutize($, rootUrl, attr, $content) {
}
function makeLinksAbsolute$$1($content, $, url) {
- ['href', 'src'].forEach(function (attr) {
+ ['href', 'src'].forEach(function(attr) {
return absolutize($, url, attr, $content);
});
-
return $content;
}
function textLength(text) {
return text.trim().replace(/\s+/g, ' ').length;
-}
-
-// Determines what percentage of the text
+} // Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
+
function linkDensity($node) {
var totalTextLength = textLength($node.text());
-
var linkText = $node.find('a').text();
var linkLength = textLength(linkText);
if (totalTextLength > 0) {
return linkLength / totalTextLength;
- } else if (totalTextLength === 0 && linkLength > 0) {
+ }
+
+ if (totalTextLength === 0 && linkLength > 0) {
return 1;
}
return 0;
}
-// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.
-function extractFromMeta$$1($, metaNames, cachedNames) {
- var cleanTags$$1 = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
- var foundNames = metaNames.filter(function (name) {
+function extractFromMeta$$1($, metaNames, cachedNames) {
+ var cleanTags =
+ arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
+ var foundNames = metaNames.filter(function(name) {
return cachedNames.indexOf(name) !== -1;
- });
+ }); // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
@@ -1523,54 +1765,57 @@ function extractFromMeta$$1($, metaNames, cachedNames) {
try {
var _loop = function _loop() {
var name = _step.value;
-
var type = 'name';
var value = 'value';
-
- var nodes = $('meta[' + type + '="' + name + '"]');
-
- // Get the unique value of every matching node, in case there
+ var nodes = $('meta['.concat(type, '="').concat(name, '"]')); // Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
- var values = nodes.map(function (index, node) {
- return $(node).attr(value);
- }).toArray().filter(function (text) {
- return text !== '';
- });
- // If we have more than one value for the same name, we have a
+ var values = nodes
+ .map(function(index, node) {
+ return $(node).attr(value);
+ })
+ .toArray()
+ .filter(function(text) {
+ return text !== '';
+ }); // If we have more than one value for the same name, we have a
// conflict and can't trust any of them. Skip this name. If we have
// zero, that means our meta tags had no values. Skip this name
// also.
+
if (values.length === 1) {
- var metaValue = void 0;
- // Meta values that contain HTML should be stripped, as they
+ var metaValue; // Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
- if (cleanTags$$1) {
+
+ if (cleanTags) {
metaValue = stripTags(values[0], $);
} else {
- metaValue = values[0];
+ var _values = _slicedToArray(values, 1);
+
+ metaValue = _values[0];
}
return {
- v: metaValue
+ v: metaValue,
};
}
};
- for (var _iterator = _getIterator(foundNames), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
+ for (
+ var _iterator = _getIterator(foundNames), _step;
+ !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
+ _iteratorNormalCompletion = true
+ ) {
var _ret = _loop();
- if ((typeof _ret === 'undefined' ? 'undefined' : _typeof(_ret)) === "object") return _ret.v;
- }
-
- // If nothing is found, return null
+ if (_typeof(_ret) === 'object') return _ret.v;
+ } // If nothing is found, return null
} catch (err) {
_didIteratorError = true;
_iteratorError = err;
} finally {
try {
- if (!_iteratorNormalCompletion && _iterator.return) {
+ if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
@@ -1588,38 +1833,43 @@ function isGoodNode($node, maxChildren) {
// element. Skip it.
if ($node.children().length > maxChildren) {
return false;
- }
- // If it looks to be within a comment, skip it.
+ } // If it looks to be within a comment, skip it.
+
if (withinComment$$1($node)) {
return false;
}
return true;
-}
-
-// Given a a list of selectors find content that may
+} // Given a a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
+
function extractFromSelectors$$1($, selectors) {
- var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
- var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
+ var maxChildren =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
+ var textOnly =
+ arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
+ // eslint-disable-next-line no-restricted-syntax
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
- for (var _iterator = _getIterator(selectors), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
+ for (
+ var _iterator = _getIterator(selectors), _step;
+ !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
+ _iteratorNormalCompletion = true
+ ) {
var selector = _step.value;
-
- var nodes = $(selector);
-
- // If we didn't get exactly one of this selector, this may be
+ var nodes = $(selector); // If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
+
if (nodes.length === 1) {
var $node = $(nodes[0]);
if (isGoodNode($node, maxChildren)) {
var content = void 0;
+
if (textOnly) {
content = $node.text();
} else {
@@ -1637,7 +1887,7 @@ function extractFromSelectors$$1($, selectors) {
_iteratorError = err;
} finally {
try {
- if (!_iteratorNormalCompletion && _iterator.return) {
+ if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
@@ -1654,28 +1904,25 @@ function extractFromSelectors$$1($, selectors) {
function stripTags(text, $) {
// Wrapping text in html element prevents errors when text
// has no html
- var cleanText = $('' + text + '').text();
+ var cleanText = $(''.concat(text, '')).text();
return cleanText === '' ? text : cleanText;
}
function withinComment$$1($node) {
var parents = $node.parents().toArray();
- var commentParent = parents.find(function (parent) {
+ var commentParent = parents.find(function(parent) {
var attrs = getAttrs(parent);
var nodeClass = attrs.class,
- id = attrs.id;
-
- var classAndId = nodeClass + ' ' + id;
+ id = attrs.id;
+ var classAndId = ''.concat(nodeClass, ' ').concat(id);
return classAndId.includes('comment');
});
-
return commentParent !== undefined;
}
// Given a node, determine if it's article-like enough to return
// param: node (a cheerio node)
// return: boolean
-
function nodeIsSufficient($node) {
return $node.text().trim().length >= 100;
}
@@ -1686,18 +1933,16 @@ function isWordpress($) {
function getAttrs(node) {
var attribs = node.attribs,
- attributes = node.attributes;
-
+ attributes = node.attributes;
if (!attribs && attributes) {
- var attrs = _Reflect$ownKeys(attributes).reduce(function (acc, index) {
+ var attrs = _Reflect$ownKeys(attributes).reduce(function(acc, index) {
var attr = attributes[index];
-
if (!attr.name || !attr.value) return acc;
-
acc[attr.name] = attr.value;
return acc;
}, {});
+
return attrs;
}
@@ -1722,7 +1967,7 @@ function setAttrs(node, attrs) {
node.removeAttribute(node.attributes[0].name);
}
- _Reflect$ownKeys(attrs).forEach(function (key) {
+ _Reflect$ownKeys(attrs).forEach(function(key) {
node.setAttribute(key, attrs[key]);
});
}
@@ -1734,19 +1979,18 @@ function setAttrs(node, attrs) {
var IS_LINK = new RegExp('https?://', 'i');
var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');
-
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
-// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
+
function convertLazyLoadedImages($) {
- $('img').each(function (_, img) {
+ $('img').each(function(_, img) {
var attrs = getAttrs(img);
- _Reflect$ownKeys(attrs).forEach(function (attr) {
+ _Reflect$ownKeys(attrs).forEach(function(attr) {
var value = attrs[attr];
if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
@@ -1754,7 +1998,6 @@ function convertLazyLoadedImages($) {
}
});
});
-
return $;
}
@@ -1763,20 +2006,21 @@ function isComment(index, node) {
}
function cleanComments($) {
- $.root().find('*').contents().filter(isComment).remove();
-
+ $.root()
+ .find('*')
+ .contents()
+ .filter(isComment)
+ .remove();
return $;
}
function clean($) {
$(TAGS_TO_REMOVE).remove();
-
$ = cleanComments($);
return $;
}
var Resource = {
-
// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
@@ -1786,73 +2030,80 @@ var Resource = {
create: function create(url, preparedResponse, parsedUrl) {
var _this = this;
- return _asyncToGenerator(_regeneratorRuntime.mark(function _callee() {
- var result, validResponse;
- return _regeneratorRuntime.wrap(function _callee$(_context) {
- while (1) {
- switch (_context.prev = _context.next) {
- case 0:
- result = void 0;
-
- if (!preparedResponse) {
- _context.next = 6;
- break;
- }
-
- validResponse = {
- statusMessage: 'OK',
- statusCode: 200,
- headers: {
- 'content-type': 'text/html',
- 'content-length': 500
- }
- };
-
-
- result = { body: preparedResponse, response: validResponse };
- _context.next = 9;
- break;
-
- case 6:
- _context.next = 8;
- return fetchResource$1(url, parsedUrl);
-
- case 8:
- result = _context.sent;
-
- case 9:
- if (!result.error) {
- _context.next = 12;
- break;
+ return _asyncToGenerator(
+ /*#__PURE__*/
+ _regeneratorRuntime.mark(function _callee() {
+ var result, validResponse;
+ return _regeneratorRuntime.wrap(
+ function _callee$(_context) {
+ while (1) {
+ switch ((_context.prev = _context.next)) {
+ case 0:
+ if (!preparedResponse) {
+ _context.next = 5;
+ break;
+ }
+
+ validResponse = {
+ statusMessage: 'OK',
+ statusCode: 200,
+ headers: {
+ 'content-type': 'text/html',
+ 'content-length': 500,
+ },
+ };
+ result = {
+ body: preparedResponse,
+ response: validResponse,
+ };
+ _context.next = 8;
+ break;
+
+ case 5:
+ _context.next = 7;
+ return fetchResource(url, parsedUrl);
+
+ case 7:
+ result = _context.sent;
+
+ case 8:
+ if (!result.error) {
+ _context.next = 11;
+ break;
+ }
+
+ result.failed = true;
+ return _context.abrupt('return', result);
+
+ case 11:
+ return _context.abrupt('return', _this.generateDoc(result));
+
+ case 12:
+ case 'end':
+ return _context.stop();
}
-
- result.failed = true;
- return _context.abrupt('return', result);
-
- case 12:
- return _context.abrupt('return', _this.generateDoc(result));
-
- case 13:
- case 'end':
- return _context.stop();
- }
- }
- }, _callee, _this);
- }))();
+ }
+ },
+ _callee,
+ this
+ );
+ })
+ )();
},
generateDoc: function generateDoc(_ref) {
var content = _ref.body,
- response = _ref.response;
- var contentType = response.headers['content-type'];
-
- // TODO: Implement is_text function from
+ response = _ref.response;
+ var contentType = response.headers['content-type']; // TODO: Implement is_text function from
// https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
if (!contentType.includes('html') && !contentType.includes('text')) {
throw new Error('Content does not appear to be text.');
}
- var $ = this.encodeDoc({ content: content, contentType: contentType });
+ var $ = this.encodeDoc({
+ content: content,
+ contentType: contentType,
+ });
if ($.root().children().length === 0) {
throw new Error('No children, likely a bad parse.');
@@ -1861,40 +2112,43 @@ var Resource = {
$ = normalizeMetaTags($);
$ = convertLazyLoadedImages($);
$ = clean($);
-
return $;
},
encodeDoc: function encodeDoc(_ref2) {
var content = _ref2.content,
- contentType = _ref2.contentType;
-
+ contentType = _ref2.contentType;
var encoding = getEncoding(contentType);
var decodedContent = iconv.decode(content, encoding);
- var $ = cheerio.load(decodedContent);
+ var $ = cheerio.load(decodedContent); // after first cheerio.load, check to see if encoding matches
- // after first cheerio.load, check to see if encoding matches
var metaContentType = $('meta[http-equiv=content-type]').attr('content');
- var properEncoding = getEncoding(metaContentType);
+ var properEncoding = getEncoding(metaContentType); // if encodings in the header/body dont match, use the one in the body
- // if encodings in the header/body dont match, use the one in the body
if (properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
return $;
- }
+ },
};
var merge = function merge(extractor, domains) {
- return domains.reduce(function (acc, domain) {
+ return domains.reduce(function(acc, domain) {
acc[domain] = extractor;
return acc;
}, {});
};
function mergeSupportedDomains(extractor) {
- return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))) : merge(extractor, [extractor.domain]);
+ return extractor.supportedDomains
+ ? merge(
+ extractor,
+ [extractor.domain].concat(
+ _toConsumableArray(extractor.supportedDomains)
+ )
+ )
+ : merge(extractor, [extractor.domain]);
}
var BloggerExtractor = {
@@ -1904,27 +2158,22 @@ var BloggerExtractor = {
// initially in the page, but it's all there
// in noscript
selectors: ['.post-content noscript'],
-
// Selectors to remove from the extracted content
clean: [],
-
// Convert the noscript tag to a div
transforms: {
- noscript: 'div'
- }
+ noscript: 'div',
+ },
},
-
author: {
- selectors: ['.post-author-name']
+ selectors: ['.post-author-name'],
},
-
title: {
- selectors: ['.post h2.title']
+ selectors: ['.post h2.title'],
},
-
date_published: {
- selectors: ['span.publishdate']
- }
+ selectors: ['span.publishdate'],
+ },
};
var NYMagExtractor = {
@@ -1932,10 +2181,8 @@ var NYMagExtractor = {
content: {
// Order by most likely. Extractor will stop on first occurrence
selectors: ['div.article-content', 'section.body', 'article.article'],
-
// Selectors to remove from the extracted content
clean: ['.ad', '.single-related-story'],
-
// Object of tranformations to make on matched elements
// Each key is the selector, each value is the tag to
// transform to.
@@ -1945,76 +2192,75 @@ var NYMagExtractor = {
transforms: {
// Convert h1s to h2s
h1: 'h2',
-
// Convert lazy-loaded noscript images to figures
noscript: function noscript($node, $) {
var $children = $.browser ? $($node.text()) : $node.children();
- if ($children.length === 1 && $children.get(0) !== undefined && $children.get(0).tagName.toLowerCase() === 'img') {
+
+ if (
+ $children.length === 1 &&
+ $children.get(0) !== undefined &&
+ $children.get(0).tagName.toLowerCase() === 'img'
+ ) {
return 'figure';
}
return null;
- }
- }
+ },
+ },
},
-
title: {
- selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1']
+ selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1'],
},
-
author: {
- selectors: ['.by-authors', '.lede-feature-author']
+ selectors: ['.by-authors', '.lede-feature-author'],
},
-
dek: {
- selectors: ['.lede-feature-teaser']
+ selectors: ['.lede-feature-teaser'],
},
-
date_published: {
- selectors: [['time.article-timestamp[datetime]', 'datetime'], 'time.article-timestamp']
- }
+ selectors: [
+ ['time.article-timestamp[datetime]', 'datetime'],
+ 'time.article-timestamp',
+ ],
+ },
};
var WikipediaExtractor = {
domain: 'wikipedia.org',
content: {
selectors: ['#mw-content-text'],
-
defaultCleaner: false,
-
// transform top infobox to an image with caption
transforms: {
'.infobox img': function infoboxImg($node) {
- var $parent = $node.parents('.infobox');
- // Only prepend the first image in .infobox
+ var $parent = $node.parents('.infobox'); // Only prepend the first image in .infobox
+
if ($parent.children('img').length === 0) {
$parent.prepend($node);
}
},
'.infobox caption': 'figcaption',
- '.infobox': 'figure'
+ '.infobox': 'figure',
},
-
// Selectors to remove from the extracted content
- clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox']
-
+ clean: [
+ '.mw-editsection',
+ 'figure tr, figure td, figure tbody',
+ '#toc',
+ '.navbox',
+ ],
},
-
author: 'Wikipedia Contributors',
-
title: {
- selectors: ['h2.title']
+ selectors: ['h2.title'],
},
-
date_published: {
- selectors: ['#footer-info-lastmod']
- }
-
+ selectors: ['#footer-info-lastmod'],
+ },
};
var TwitterExtractor = {
domain: 'twitter.com',
-
content: {
transforms: {
// We're transforming essentially the whole page here.
@@ -2027,47 +2273,35 @@ var TwitterExtractor = {
$tweetContainer.append(tweets);
$node.replaceWith($tweetContainer);
},
-
// Twitter wraps @ with s, which
// renders as a strikethrough
- s: 'span'
+ s: 'span',
},
-
selectors: ['.permalink[role=main]'],
-
defaultCleaner: false,
-
- clean: ['.stream-item-footer', 'button', '.tweet-details-fixer']
+ clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
},
-
author: {
- selectors: ['.tweet.permalink-tweet .username']
+ selectors: ['.tweet.permalink-tweet .username'],
},
-
date_published: {
- selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']]
- }
-
+ selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
+ },
};
var NYTimesExtractor = {
domain: 'www.nytimes.com',
-
title: {
- selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline']
+ selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline']
+ selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'],
},
-
content: {
selectors: ['div.g-blocks', 'article#story'],
-
transforms: {
'img.g-lazy': function imgGLazy($node) {
- var src = $node.attr('src');
- // const widths = $node.attr('data-widths')
+ var src = $node.attr('src'); // const widths = $node.attr('data-widths')
// .slice(1)
// .slice(0, -1)
// .split(',');
@@ -2076,29 +2310,36 @@ var NYTimesExtractor = {
// } else {
// width = '900';
// }
- var width = 640;
+ var width = 640;
src = src.replace('{{size}}', width);
$node.attr('src', src);
- }
+ },
},
-
- clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden', '.comments', '.supplemental', '.nocontent', '.story-footer-links']
+ clean: [
+ '.ad',
+ 'header#story-header',
+ '.story-body-1 .lede.video',
+ '.visually-hidden',
+ '#newsletter-promo',
+ '.promo',
+ '.comments-button',
+ '.hidden',
+ '.comments',
+ '.supplemental',
+ '.nocontent',
+ '.story-footer-links',
+ ],
},
-
date_published: {
- selectors: [['meta[name="article:published"]', 'value']]
+ selectors: [['meta[name="article:published"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: null,
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2106,35 +2347,30 @@ var NYTimesExtractor = {
var TheAtlanticExtractor = {
domain: 'www.theatlantic.com',
title: {
- selectors: ['h1.hed']
+ selectors: ['h1.hed'],
},
-
author: {
- selectors: ['article#article .article-cover-extra .metadata .byline a']
+ selectors: ['article#article .article-cover-extra .metadata .byline a'],
},
-
content: {
- selectors: [['.article-cover figure.lead-img', '.article-body'], '.article-body'],
-
+ selectors: [
+ ['.article-cover figure.lead-img', '.article-body'],
+ '.article-body',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.partner-box', '.callout']
+ clean: ['.partner-box', '.callout'],
},
-
date_published: {
- selectors: [['time[itemProp="datePublished"]', 'datetime']]
+ selectors: [['time[itemProp="datePublished"]', 'datetime']],
},
-
lead_image_url: null,
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2143,43 +2379,36 @@ var TheAtlanticExtractor = {
var NewYorkerExtractor = {
domain: 'www.newyorker.com',
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
author: {
- selectors: ['.contributors']
+ selectors: ['.contributors'],
},
-
content: {
selectors: ['div#articleBody', 'div.articleBody'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value'], ['time[itemProp="datePublished"]', 'content']],
-
- timezone: 'America/New_York'
+ selectors: [
+ ['meta[name="article:published_time"]', 'value'],
+ ['time[itemProp="datePublished"]', 'content'],
+ ],
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: ['.dek', 'h2.dek']
+ selectors: ['.dek', 'h2.dek'],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2188,41 +2417,32 @@ var NewYorkerExtractor = {
var WiredExtractor = {
domain: 'www.wired.com',
title: {
- selectors: ['h1.post-title']
+ selectors: ['h1.post-title'],
},
-
author: {
- selectors: ['a[rel="author"]']
+ selectors: ['a[rel="author"]'],
},
-
content: {
selectors: ['article.content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.visually-hidden', 'figcaption img.photo']
+ clean: ['.visually-hidden', 'figcaption img.photo'],
},
-
date_published: {
- selectors: [['meta[itemprop="datePublished"]', 'value']]
+ selectors: [['meta[itemprop="datePublished"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2231,41 +2451,32 @@ var WiredExtractor = {
var MSNExtractor = {
domain: 'www.msn.com',
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: ['span.authorname-txt']
+ selectors: ['span.authorname-txt'],
},
-
content: {
selectors: ['div.richtext'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['span.caption']
+ clean: ['span.caption'],
},
-
date_published: {
- selectors: ['span.time']
+ selectors: ['span.time'],
},
-
lead_image_url: {
- selectors: []
+ selectors: [],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2274,45 +2485,37 @@ var MSNExtractor = {
var YahooExtractor = {
domain: 'www.yahoo.com',
title: {
- selectors: ['header.canvas-header']
+ selectors: ['header.canvas-header'],
},
-
author: {
- selectors: ['span.provider-name']
+ selectors: ['span.provider-name'],
},
-
content: {
selectors: [
- // enter content selectors
- '.content-canvas'],
-
+ // enter content selectors
+ '.content-canvas',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.figure-caption']
+ clean: ['.figure-caption'],
},
-
date_published: {
- selectors: [['time.date[datetime]', 'datetime']]
+ selectors: [['time.date[datetime]', 'datetime']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
selectors: [
// enter dek selectors
- ]
+ ],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2321,55 +2524,54 @@ var YahooExtractor = {
var BuzzfeedExtractor = {
domain: 'www.buzzfeed.com',
title: {
- selectors: ['h1[id="post-title"]']
+ selectors: ['h1[id="post-title"]'],
},
-
author: {
- selectors: ['a[data-action="user/username"]', 'byline__author']
+ selectors: ['a[data-action="user/username"]', 'byline__author'],
},
-
content: {
- selectors: [['.longform_custom_header_media', '#buzz_sub_buzz'], '#buzz_sub_buzz'],
-
+ selectors: [
+ ['.longform_custom_header_media', '#buzz_sub_buzz'],
+ '#buzz_sub_buzz',
+ ],
defaultCleaner: false,
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
h2: 'b',
-
- 'div.longform_custom_header_media': function divLongform_custom_header_media($node) {
+ 'div.longform_custom_header_media': function divLongform_custom_header_media(
+ $node
+ ) {
if ($node.has('img') && $node.has('.longform_header_image_source')) {
return 'figure';
}
return null;
},
-
- 'figure.longform_custom_header_media .longform_header_image_source': 'figcaption'
+ 'figure.longform_custom_header_media .longform_header_image_source':
+ 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.instapaper_ignore', '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline', '.share-box', '.print']
+ clean: [
+ '.instapaper_ignore',
+ '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline',
+ '.share-box',
+ '.print',
+ ],
},
-
date_published: {
- selectors: ['.buzz-datetime']
+ selectors: ['.buzz-datetime'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2378,41 +2580,32 @@ var BuzzfeedExtractor = {
var WikiaExtractor = {
domain: 'fandom.wikia.com',
title: {
- selectors: ['h1.entry-title']
+ selectors: ['h1.entry-title'],
},
-
author: {
- selectors: ['.author vcard', '.fn']
+ selectors: ['.author vcard', '.fn'],
},
-
content: {
selectors: ['.grid-content', '.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2421,35 +2614,30 @@ var WikiaExtractor = {
var LittleThingsExtractor = {
domain: 'www.littlethings.com',
title: {
- selectors: ['h1.post-title']
+ selectors: ['h1.post-title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
content: {
selectors: [
- // enter content selectors
- '.mainContentIntro', '.content-wrapper'],
-
+ // enter content selectors
+ '.mainContentIntro',
+ '.content-wrapper',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
// Rename CustomExtractor
@@ -2459,104 +2647,99 @@ var PoliticoExtractor = {
domain: 'www.politico.com',
title: {
selectors: [
- // enter title selectors
- ['meta[name="og:title"]', 'value']]
+ // enter title selectors
+ ['meta[name="og:title"]', 'value'],
+ ],
},
-
author: {
- selectors: ['.story-main-content .byline .vcard']
+ selectors: ['.story-main-content .byline .vcard'],
},
-
content: {
selectors: [
- // enter content selectors
- '.story-main-content', '.content-group', '.story-core', '.story-text'],
-
+ // enter content selectors
+ '.story-main-content',
+ '.content-group',
+ '.story-core',
+ '.story-text',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['figcaption']
+ clean: ['figcaption'],
},
-
date_published: {
- selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
+ selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']],
},
-
lead_image_url: {
selectors: [
- // enter lead_image_url selectors
- ['meta[name="og:image"]', 'value']]
+ // enter lead_image_url selectors
+ ['meta[name="og:image"]', 'value'],
+ ],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: null,
-
- excerpt: null
+ excerpt: null,
};
var DeadspinExtractor = {
domain: 'deadspin.com',
-
- supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com'],
-
+ supportedDomains: [
+ 'jezebel.com',
+ 'lifehacker.com',
+ 'kotaku.com',
+ 'gizmodo.com',
+ 'jalopnik.com',
+ 'kinja.com',
+ ],
title: {
- selectors: ['h1.headline']
+ selectors: ['h1.headline'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
content: {
selectors: ['.post-content', '.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- 'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube($node) {
+ 'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube(
+ $node
+ ) {
var youtubeId = $node.attr('id').split('youtube-')[1];
- $node.attr('src', 'https://www.youtube.com/embed/' + youtubeId);
- }
+ $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId));
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.magnifier', '.lightbox']
+ clean: ['.magnifier', '.lightbox'],
},
-
date_published: {
- selectors: [['time.updated[datetime]', 'datetime']]
+ selectors: [['time.updated[datetime]', 'datetime']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
next_page_url: {
selectors: [
// enter selectors
- ]
+ ],
},
-
excerpt: {
selectors: [
// enter selectors
- ]
- }
+ ],
+ },
};
// Rename CustomExtractor
@@ -2565,49 +2748,40 @@ var DeadspinExtractor = {
var BroadwayWorldExtractor = {
domain: 'www.broadwayworld.com',
title: {
- selectors: ['h1.article-title']
+ selectors: ['h1.article-title'],
},
-
author: {
- selectors: ['span[itemprop=author]']
+ selectors: ['span[itemprop=author]'],
},
-
content: {
selectors: ['div[itemprop=articlebody]'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
date_published: {
- selectors: [['meta[itemprop=datePublished]', 'value']]
+ selectors: [['meta[itemprop=datePublished]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: {
selectors: [
// enter selectors
- ]
+ ],
},
-
excerpt: {
selectors: [
// enter selectors
- ]
- }
+ ],
+ },
};
// Rename CustomExtractor
@@ -2616,75 +2790,67 @@ var BroadwayWorldExtractor = {
var ApartmentTherapyExtractor = {
domain: 'www.apartmenttherapy.com',
title: {
- selectors: ['h1.headline']
+ selectors: ['h1.headline'],
},
-
author: {
- selectors: ['.PostByline__name']
+ selectors: ['.PostByline__name'],
},
-
content: {
selectors: ['div.post__content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- 'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture($node, $) {
+ 'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture(
+ $node,
+ $
+ ) {
var data = JSON.parse($node.attr('data-props'));
var src = data.sources[0].src;
-
var $img = $('').attr('src', src);
$node.replaceWith($img);
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
date_published: {
- selectors: [['.PostByline__timestamp[datetime]', 'datetime']]
+ selectors: [['.PostByline__timestamp[datetime]', 'datetime']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
next_page_url: {
selectors: [
// enter selectors
- ]
+ ],
},
-
excerpt: {
selectors: [
// enter selectors
- ]
- }
+ ],
+ },
};
var MediumExtractor = {
domain: 'medium.com',
-
supportedDomains: ['trackchanges.postlight.com'],
-
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
content: {
- selectors: [['.section-content'], '.section-content', 'article > div > section'],
-
+ selectors: [
+ ['.section-content'],
+ '.section-content',
+ 'article > div > section',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -2695,127 +2861,102 @@ var MediumExtractor = {
if (ytRe.test(thumb)) {
var _thumb$match = thumb.match(ytRe),
- _thumb$match2 = _slicedToArray(_thumb$match, 2),
- _ = _thumb$match2[0],
- youtubeId = _thumb$match2[1]; // eslint-disable-line
+ _thumb$match2 = _slicedToArray(_thumb$match, 2),
+ _ = _thumb$match2[0],
+ youtubeId = _thumb$match2[1]; // eslint-disable-line
-
- $node.attr('src', 'https://www.youtube.com/embed/' + youtubeId);
+ $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId));
var $parent = $node.parents('figure');
var $caption = $parent.find('figcaption');
$parent.empty().append([$node, $caption]);
}
},
-
// rewrite figures to pull out image and caption, remove rest
figure: function figure($node) {
// ignore if figure has an iframe
if ($node.find('iframe').length > 0) return;
-
var $img = $node.find('img').slice(-1)[0];
var $caption = $node.find('figcaption');
$node.empty().append([$img, $caption]);
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
+ clean: [],
},
-
date_published: {
- selectors: [['time[datetime]', 'datetime']]
+ selectors: [['time[datetime]', 'datetime']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
next_page_url: {
selectors: [
// enter selectors
- ]
+ ],
},
-
excerpt: {
selectors: [
// enter selectors
- ]
- }
+ ],
+ },
};
var WwwTmzComExtractor = {
domain: 'www.tmz.com',
-
title: {
- selectors: ['.post-title-breadcrumb', 'h1', '.headline']
+ selectors: ['.post-title-breadcrumb', 'h1', '.headline'],
},
-
author: 'TMZ STAFF',
-
date_published: {
selectors: ['.article-posted-date'],
-
- timezone: 'America/Los_Angeles'
+ timezone: 'America/Los_Angeles',
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-content', '.all-post-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.lightbox-link']
- }
+ clean: ['.lightbox-link'],
+ },
};
var WwwWashingtonpostComExtractor = {
domain: 'www.washingtonpost.com',
-
title: {
- selectors: ['h1', '#topper-headline-wrapper']
+ selectors: ['h1', '#topper-headline-wrapper'],
},
-
author: {
- selectors: ['.pb-byline']
+ selectors: ['.pb-byline'],
},
-
date_published: {
- selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']]
+ selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -2827,44 +2968,38 @@ var WwwWashingtonpostComExtractor = {
$node.remove();
return null;
},
- '.pb-caption': 'figcaption'
+ '.pb-caption': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.interstitial-link', '.newsletter-inline-unit']
- }
+ clean: ['.interstitial-link', '.newsletter-inline-unit'],
+ },
};
var WwwHuffingtonpostComExtractor = {
domain: 'www.huffingtonpost.com',
-
title: {
- selectors: ['h1.headline__title']
+ selectors: ['h1.headline__title'],
},
-
author: {
- selectors: ['span.author-card__details__name']
+ selectors: ['span.author-card__details__name'],
},
-
date_published: {
- selectors: [['meta[name="article:modified_time"]', 'value'], ['meta[name="article:published_time"]', 'value']]
+ selectors: [
+ ['meta[name="article:modified_time"]', 'value'],
+ ['meta[name="article:published_time"]', 'value'],
+ ],
},
-
dek: {
- selectors: ['h2.headline__subtitle']
+ selectors: ['h2.headline__subtitle'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.entry__body'],
-
defaultCleaner: false,
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -2873,1565 +3008,1329 @@ var WwwHuffingtonpostComExtractor = {
// $node.replaceWith($figure);
// },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story']
- }
+ clean: [
+ '.pull-quote',
+ '.tag-cloud',
+ '.embed-asset',
+ '.below-entry',
+ '.entry-corrections',
+ '#suggested-story',
+ ],
+ },
};
var NewrepublicComExtractor = {
domain: 'newrepublic.com',
-
title: {
- selectors: ['h1.article-headline', '.minutes-primary h1.minute-title']
+ selectors: ['h1.article-headline', '.minutes-primary h1.minute-title'],
},
-
author: {
- selectors: ['div.author-list', '.minutes-primary h3.minute-byline']
+ selectors: ['div.author-list', '.minutes-primary h3.minute-byline'],
},
-
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: ['h2.article-subhead']
+ selectors: ['h2.article-subhead'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['.article-cover', 'div.content-body'], ['.minute-image', '.minutes-primary div.content-body']],
-
+ selectors: [
+ ['.article-cover', 'div.content-body'],
+ ['.minute-image', '.minutes-primary div.content-body'],
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['aside']
- }
+ clean: ['aside'],
+ },
};
var MoneyCnnComExtractor = {
domain: 'money.cnn.com',
-
title: {
- selectors: ['.article-title']
+ selectors: ['.article-title'],
},
-
author: {
- selectors: ['.byline a']
+ selectors: ['.byline a'],
},
-
date_published: {
selectors: [['meta[name="date"]', 'value']],
-
- timezone: 'GMT'
+ timezone: 'GMT',
},
-
dek: {
- selectors: ['#storytext h2']
+ selectors: ['#storytext h2'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['#storytext'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.inStoryHeading']
- }
+ clean: ['.inStoryHeading'],
+ },
};
var WwwThevergeComExtractor = {
domain: 'www.theverge.com',
-
supportedDomains: ['www.polygon.com'],
-
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['h2.p-dek']
+ selectors: ['h2.p-dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [
- // feature template multi-match
- ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'],
- // regular post multi-match
- ['.e-image--hero', '.c-entry-content'],
- // feature template fallback
- '.l-wrapper .l-feature',
- // regular post fallback
- 'div.c-entry-content'],
-
+ // feature template multi-match
+ ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'], // regular post multi-match
+ ['.e-image--hero', '.c-entry-content'], // feature template fallback
+ '.l-wrapper .l-feature', // regular post fallback
+ 'div.c-entry-content',
+ ],
// Transform lazy-loaded images
transforms: {
noscript: function noscript($node) {
var $children = $node.children();
+
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'span';
}
return null;
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.aside', 'img.c-dynamic-image']
- }
+ clean: ['.aside', 'img.c-dynamic-image'],
+ },
};
var WwwCnnComExtractor = {
domain: 'www.cnn.com',
-
title: {
- selectors: ['h1.pg-headline', 'h1']
+ selectors: ['h1.pg-headline', 'h1'],
},
-
author: {
- selectors: ['.metadata__byline__author']
+ selectors: ['.metadata__byline__author'],
},
-
date_published: {
- selectors: [['meta[name="pubdate"]', 'value']]
+ selectors: [['meta[name="pubdate"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [
- // a more specific selector to grab the lead image and the body
- ['.media__video--thumbnail', '.zn-body-text'],
- // a fallback for the above
- '.zn-body-text', 'div[itemprop="articleBody"]'],
-
+ // a more specific selector to grab the lead image and the body
+ ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above
+ '.zn-body-text',
+ 'div[itemprop="articleBody"]',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph($node) {
+ '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph(
+ $node
+ ) {
var $text = $node.html();
+
if ($text) {
return 'p';
}
return null;
},
-
// this transform cleans the short, all-link sections linking
// to related content but not marked as such in any way.
'.zn-body__paragraph': function znBody__paragraph($node) {
if ($node.has('a')) {
- if ($node.text().trim() === $node.find('a').text().trim()) {
+ if (
+ $node.text().trim() ===
+ $node
+ .find('a')
+ .text()
+ .trim()
+ ) {
$node.remove();
}
}
},
-
- '.media__video--thumbnail': 'figure'
-
+ '.media__video--thumbnail': 'figure',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwAolComExtractor = {
domain: 'www.aol.com',
-
title: {
- selectors: ['h1.p-article__title']
+ selectors: ['h1.p-article__title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
selectors: ['.p-article__byline__date'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwYoutubeComExtractor = {
domain: 'www.youtube.com',
-
title: {
- selectors: ['.watch-title', 'h1.watch-title-container']
+ selectors: ['.watch-title', 'h1.watch-title-container'],
},
-
author: {
- selectors: ['.yt-user-info']
+ selectors: ['.yt-user-info'],
},
-
date_published: {
selectors: [['meta[itemProp="datePublished"]', 'value']],
-
- timezone: 'GMT'
+ timezone: 'GMT',
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
defaultCleaner: false,
-
selectors: [['#player-api', '#eow-description']],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'#player-api': function playerApi($node, $) {
var videoId = $('meta[itemProp="videoId"]').attr('value');
- $node.html('\n ');
- }
+ $node.html(
+ '\n '
+ )
+ );
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwTheguardianComExtractor = {
domain: 'www.theguardian.com',
-
title: {
- selectors: ['.content__headline']
+ selectors: ['.content__headline'],
},
-
author: {
- selectors: ['p.byline']
+ selectors: ['p.byline'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['.content__standfirst']
+ selectors: ['.content__standfirst'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.content__article-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.hide-on-mobile', '.inline-icon']
- }
+ clean: ['.hide-on-mobile', '.inline-icon'],
+ },
};
var WwwSbnationComExtractor = {
domain: 'www.sbnation.com',
-
title: {
- selectors: ['h1.c-page-title']
+ selectors: ['h1.c-page-title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['h2.c-entry-summary.p-dek']
+ selectors: ['h2.c-entry-summary.p-dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.c-entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwBloombergComExtractor = {
domain: 'www.bloomberg.com',
-
title: {
selectors: [
- // normal articles
- '.lede-headline',
-
- // /graphics/ template
- 'h1.article-title',
-
- // /news/ template
- 'h1.lede-text-only__hed']
+ // normal articles
+ '.lede-headline', // /graphics/ template
+ 'h1.article-title', // /news/ template
+ 'h1.lede-text-only__hed',
+ ],
},
-
author: {
- selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link',
-
- // /graphics/ template
- '.bydek',
-
- // /news/ template
- '.author']
+ selectors: [
+ ['meta[name="parsely-author"]', 'value'],
+ '.byline-details__link', // /graphics/ template
+ '.bydek', // /news/ template
+ '.author',
+ ],
},
-
date_published: {
- selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value']]
+ selectors: [
+ ['time.published-at', 'datetime'],
+ ['time[datetime]', 'datetime'],
+ ['meta[name="date"]', 'value'],
+ ['meta[name="parsely-pub-date"]', 'value'],
+ ],
},
-
dek: {
- selectors: []
+ selectors: [],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: ['.article-body__content',
-
- // /graphics/ template
- ['section.copy-block'],
-
- // /news/ template
- '.body-copy'],
-
+ selectors: [
+ '.article-body__content', // /graphics/ template
+ ['section.copy-block'], // /news/ template
+ '.body-copy',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.inline-newsletter', '.page-ad']
- }
+ clean: ['.inline-newsletter', '.page-ad'],
+ },
};
var WwwBustleComExtractor = {
domain: 'www.bustle.com',
-
title: {
- selectors: ['h1.post-page__title']
+ selectors: ['h1.post-page__title'],
},
-
author: {
- selectors: ['div.content-meta__author']
+ selectors: ['div.content-meta__author'],
},
-
date_published: {
- selectors: [['time.content-meta__published-date[datetime]', 'datetime']]
+ selectors: [['time.content-meta__published-date[datetime]', 'datetime']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.post-page__body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwNprOrgExtractor = {
domain: 'www.npr.org',
-
title: {
- selectors: ['h1', '.storytitle']
+ selectors: ['h1', '.storytitle'],
},
-
author: {
- selectors: ['p.byline__name.byline__name--block']
+ selectors: ['p.byline__name.byline__name--block'],
},
-
date_published: {
- selectors: [['.dateblock time[datetime]', 'datetime'], ['meta[name="date"]', 'value']]
+ selectors: [
+ ['.dateblock time[datetime]', 'datetime'],
+ ['meta[name="date"]', 'value'],
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value'], ['meta[name="twitter:image:src"]', 'value']]
+ selectors: [
+ ['meta[name="og:image"]', 'value'],
+ ['meta[name="twitter:image:src"]', 'value'],
+ ],
},
-
content: {
selectors: ['.storytext'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.bucketwrap.image': 'figure',
- '.bucketwrap.image .credit-caption': 'figcaption'
+ '.bucketwrap.image .credit-caption': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['div.enlarge_measure']
- }
+ clean: ['div.enlarge_measure'],
+ },
};
var WwwRecodeNetExtractor = {
domain: 'www.recode.net',
-
title: {
- selectors: ['h1.c-page-title']
+ selectors: ['h1.c-page-title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['h2.c-entry-summary.p-dek']
+ selectors: ['h2.c-entry-summary.p-dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
-
+ selectors: [
+ ['figure.e-image--hero', '.c-entry-content'],
+ '.c-entry-content',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var QzComExtractor = {
domain: 'qz.com',
-
title: {
- selectors: ['header.item-header.content-width-responsive']
+ selectors: ['header.item-header.content-width-responsive'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: ['.timestamp']
+ selectors: ['.timestamp'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['figure.featured-image', '.item-body'], '.item-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.article-aside', '.progressive-image-thumbnail']
- }
+ clean: ['.article-aside', '.progressive-image-thumbnail'],
+ },
};
var WwwDmagazineComExtractor = {
domain: 'www.dmagazine.com',
-
title: {
- selectors: ['h1.story__title']
+ selectors: ['h1.story__title'],
},
-
author: {
- selectors: ['.story__info .story__info__item:first-child']
+ selectors: ['.story__info .story__info__item:first-child'],
},
-
date_published: {
selectors: [
- // enter selectors
- '.story__info'],
-
- timezone: 'America/Chicago'
+ // enter selectors
+ '.story__info',
+ ],
+ timezone: 'America/Chicago',
},
-
dek: {
- selectors: ['.story__subhead']
+ selectors: ['.story__subhead'],
},
-
lead_image_url: {
- selectors: [['article figure a:first-child', 'href']]
+ selectors: [['article figure a:first-child', 'href']],
},
-
content: {
selectors: ['.story__content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwReutersComExtractor = {
domain: 'www.reuters.com',
-
title: {
- selectors: ['h1.article-headline']
+ selectors: ['h1.article-headline'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
date_published: {
- selectors: [['meta[name="og:article:published_time"]', 'value']]
+ selectors: [['meta[name="og:article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['#article-text'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- '.article-subtitle': 'h4'
+ '.article-subtitle': 'h4',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['#article-byline .author']
- }
+ clean: ['#article-byline .author'],
+ },
};
var MashableComExtractor = {
domain: 'mashable.com',
-
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
author: {
- selectors: ['span.author_name a']
+ selectors: ['span.author_name a'],
},
-
date_published: {
- selectors: [['meta[name="og:article:published_time"]', 'value']]
+ selectors: [['meta[name="og:article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['section.article-content.blueprint'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- '.image-credit': 'figcaption'
+ '.image-credit': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwChicagotribuneComExtractor = {
domain: 'www.chicagotribune.com',
-
title: {
- selectors: ['h1.trb_ar_hl_t']
+ selectors: ['h1.trb_ar_hl_t'],
},
-
author: {
- selectors: ['span.trb_ar_by_nm_au']
+ selectors: ['span.trb_ar_by_nm_au'],
},
-
date_published: {
- selectors: [['meta[itemprop="datePublished"]', 'value']]
+ selectors: [['meta[itemprop="datePublished"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.trb_ar_page'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwVoxComExtractor = {
domain: 'www.vox.com',
-
title: {
- selectors: ['h1.c-page-title']
+ selectors: ['h1.c-page-title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['.p-dek']
+ selectors: ['.p-dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['figure.e-image--hero', '.c-entry-content'], '.c-entry-content'],
-
+ selectors: [
+ ['figure.e-image--hero', '.c-entry-content'],
+ '.c-entry-content',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- 'figure .e-image__image noscript': function figureEImage__imageNoscript($node) {
+ 'figure .e-image__image noscript': function figureEImage__imageNoscript(
+ $node
+ ) {
var imgHtml = $node.html();
- $node.parents('.e-image__image').find('.c-dynamic-image').replaceWith(imgHtml);
+ $node
+ .parents('.e-image__image')
+ .find('.c-dynamic-image')
+ .replaceWith(imgHtml);
},
-
- 'figure .e-image__meta': 'figcaption'
+ 'figure .e-image__meta': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var NewsNationalgeographicComExtractor = {
domain: 'news.nationalgeographic.com',
-
title: {
- selectors: ['h1', 'h1.main-title']
+ selectors: ['h1', 'h1.main-title'],
},
-
author: {
- selectors: ['.byline-component__contributors b span']
+ selectors: ['.byline-component__contributors b span'],
},
-
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']],
format: 'ddd MMM DD HH:mm:ss zz YYYY',
- timezone: 'EST'
+ timezone: 'EST',
},
-
dek: {
- selectors: ['.article__deck']
+ selectors: ['.article__deck'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.parsys.content', '.__image-lead__'], '.content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.parsys.content': function parsysContent($node, $) {
- var $imgSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
+ var $imgSrc = $node
+ .find('.image.parbase.section')
+ .find('.picturefill')
+ .first()
+ .data('platform-src');
+
if ($imgSrc) {
- $node.prepend($(''));
+ $node.prepend(
+ $(''))
+ );
}
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.pull-quote.pull-quote--large']
- }
+ clean: ['.pull-quote.pull-quote--large'],
+ },
};
var WwwNationalgeographicComExtractor = {
domain: 'www.nationalgeographic.com',
-
title: {
- selectors: ['h1', 'h1.main-title']
+ selectors: ['h1', 'h1.main-title'],
},
-
author: {
- selectors: ['.byline-component__contributors b span']
+ selectors: ['.byline-component__contributors b span'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['.article__deck']
+ selectors: ['.article__deck'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.parsys.content', '.__image-lead__'], '.content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.parsys.content': function parsysContent($node, $) {
var $imageParent = $node.children().first();
+
if ($imageParent.hasClass('imageGroup')) {
- var $dataAttrContainer = $imageParent.find('.media--medium__container').children().first();
+ var $dataAttrContainer = $imageParent
+ .find('.media--medium__container')
+ .children()
+ .first();
var imgPath1 = $dataAttrContainer.data('platform-image1-path');
var imgPath2 = $dataAttrContainer.data('platform-image2-path');
+
if (imgPath2 && imgPath1) {
- $node.prepend($('
\n \n \n
'));
+ $node.prepend(
+ $(
+ '
\n \n \n
')
+ )
+ );
}
} else {
- var $imgSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
+ var $imgSrc = $node
+ .find('.image.parbase.section')
+ .find('.picturefill')
+ .first()
+ .data('platform-src');
+
if ($imgSrc) {
- $node.prepend($(''));
+ $node.prepend(
+ $(''))
+ );
}
}
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.pull-quote.pull-quote--small']
- }
+ clean: ['.pull-quote.pull-quote--small'],
+ },
};
var WwwLatimesComExtractor = {
domain: 'www.latimes.com',
-
title: {
- selectors: ['.trb_ar_hl']
+ selectors: ['.trb_ar_hl'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[itemprop="datePublished"]', 'value']]
+ selectors: [['meta[itemprop="datePublished"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.trb_ar_main'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.trb_ar_la': function trb_ar_la($node) {
var $figure = $node.find('figure');
$node.replaceWith($figure);
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.trb_ar_by', '.trb_ar_cr']
- }
+ clean: ['.trb_ar_by', '.trb_ar_cr'],
+ },
};
var PagesixComExtractor = {
domain: 'pagesix.com',
-
supportedDomains: ['nypost.com'],
-
title: {
- selectors: ['h1 a']
+ selectors: ['h1 a'],
},
-
author: {
- selectors: ['.byline']
+ selectors: ['.byline'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: [['meta[name="description"]', 'value']]
+ selectors: [['meta[name="description"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['#featured-image-wrapper', '.entry-content'], '.entry-content'],
-
+ selectors: [
+ ['#featured-image-wrapper', '.entry-content'],
+ '.entry-content',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'#featured-image-wrapper': 'figure',
- '.wp-caption-text': 'figcaption'
+ '.wp-caption-text': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.modal-trigger']
- }
+ clean: ['.modal-trigger'],
+ },
};
var ThefederalistpapersOrgExtractor = {
domain: 'thefederalistpapers.org',
-
title: {
- selectors: ['h1.entry-title']
+ selectors: ['h1.entry-title'],
},
-
author: {
- selectors: ['main span.entry-author-name']
+ selectors: ['main span.entry-author-name'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: [['p[style]']]
- }
+ clean: [['p[style]']],
+ },
};
var WwwCbssportsComExtractor = {
domain: 'www.cbssports.com',
-
title: {
- selectors: ['.article-headline']
+ selectors: ['.article-headline'],
},
-
author: {
- selectors: ['.author-name']
+ selectors: ['.author-name'],
},
-
date_published: {
selectors: [['.date-original-reading-time time', 'datetime']],
- timezone: 'UTC'
+ timezone: 'UTC',
},
-
dek: {
- selectors: ['.article-subline']
+ selectors: ['.article-subline'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwMsnbcComExtractor = {
domain: 'www.msnbc.com',
-
title: {
- selectors: ['h1', 'h1.is-title-pane']
+ selectors: ['h1', 'h1.is-title-pane'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
date_published: {
- selectors: [['meta[name="DC.date.issued"]', 'value']]
+ selectors: [['meta[name="DC.date.issued"]', 'value']],
},
-
dek: {
- selectors: [['meta[name="description"]', 'value']]
+ selectors: [['meta[name="description"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.pane-node-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.pane-node-body': function paneNodeBody($node, $) {
- var _WwwMsnbcComExtractor = _slicedToArray(WwwMsnbcComExtractor.lead_image_url.selectors[0], 2),
- selector = _WwwMsnbcComExtractor[0],
- attr = _WwwMsnbcComExtractor[1];
+ var _WwwMsnbcComExtractor = _slicedToArray(
+ WwwMsnbcComExtractor.lead_image_url.selectors[0],
+ 2
+ ),
+ selector = _WwwMsnbcComExtractor[0],
+ attr = _WwwMsnbcComExtractor[1];
var src = $(selector).attr(attr);
+
if (src) {
- $node.prepend('');
+ $node.prepend(''));
}
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwThepoliticalinsiderComExtractor = {
domain: 'www.thepoliticalinsider.com',
-
title: {
- selectors: [['meta[name="sailthru.title"]', 'value']]
+ selectors: [['meta[name="sailthru.title"]', 'value']],
},
-
author: {
- selectors: [['meta[name="sailthru.author"]', 'value']]
+ selectors: [['meta[name="sailthru.author"]', 'value']],
},
-
date_published: {
selectors: [['meta[name="sailthru.date"]', 'value']],
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div#article-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwMentalflossComExtractor = {
domain: 'www.mentalfloss.com',
-
title: {
- selectors: ['h1.title', '.title-group', '.inner']
+ selectors: ['h1.title', '.title-group', '.inner'],
},
-
author: {
- selectors: ['.field-name-field-enhanced-authors']
+ selectors: ['.field-name-field-enhanced-authors'],
},
-
date_published: {
selectors: ['.date-display-single'],
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.field.field-name-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var AbcnewsGoComExtractor = {
domain: 'abcnews.go.com',
-
title: {
- selectors: ['.article-header h1']
+ selectors: ['.article-header h1'],
},
-
author: {
selectors: ['.authors'],
- clean: ['.author-overlay', '.by-text']
+ clean: ['.author-overlay', '.by-text'],
},
-
date_published: {
selectors: ['.timestamp'],
- timezone: 'America/New_York'
-
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-copy'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwNydailynewsComExtractor = {
domain: 'www.nydailynews.com',
-
title: {
- selectors: ['h1#ra-headline']
+ selectors: ['h1#ra-headline'],
},
-
author: {
- selectors: [['meta[name="parsely-author"]', 'value']]
+ selectors: [['meta[name="parsely-author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="sailthru.date"]', 'value']]
+ selectors: [['meta[name="sailthru.date"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['article#ra-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom']
- }
+ clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom'],
+ },
};
var WwwCnbcComExtractor = {
domain: 'www.cnbc.com',
-
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div#article_body.content', 'div.story'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwPopsugarComExtractor = {
domain: 'www.popsugar.com',
-
title: {
- selectors: ['h2.post-title', 'title-text']
+ selectors: ['h2.post-title', 'title-text'],
},
-
author: {
- selectors: [['meta[name="article:author"]', 'value']]
+ selectors: [['meta[name="article:author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['#content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.share-copy-title', '.post-tags', '.reactions']
- }
+ clean: ['.share-copy-title', '.post-tags', '.reactions'],
+ },
};
var ObserverComExtractor = {
domain: 'observer.com',
-
title: {
- selectors: ['h1.entry-title']
+ selectors: ['h1.entry-title'],
},
-
author: {
- selectors: ['.author', '.vcard']
+ selectors: ['.author', '.vcard'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['h2.dek']
+ selectors: ['h2.dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var PeopleComExtractor = {
domain: 'people.com',
-
title: {
- selectors: [['meta[name="og:title"]', 'value']]
+ selectors: [['meta[name="og:title"]', 'value']],
},
-
author: {
- selectors: ['a.author.url.fn']
+ selectors: ['a.author.url.fn'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.article-body__inner'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwUsmagazineComExtractor = {
domain: 'www.usmagazine.com',
-
title: {
- selectors: ['header h1']
+ selectors: ['header h1'],
},
-
author: {
- selectors: ['a.article-byline.tracked-offpage']
+ selectors: ['a.article-byline.tracked-offpage'],
},
-
date_published: {
timezone: 'America/New_York',
-
- selectors: ['time.article-published-date']
+ selectors: ['time.article-published-date'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.article-body-inner'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.module-related']
- }
+ clean: ['.module-related'],
+ },
};
var WwwRollingstoneComExtractor = {
domain: 'www.rollingstone.com',
-
title: {
- selectors: ['h1.content-title']
+ selectors: ['h1.content-title'],
},
-
author: {
- selectors: ['a.content-author.tracked-offpage']
+ selectors: ['a.content-author.tracked-offpage'],
},
-
date_published: {
selectors: ['time.content-published-date'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: ['.content-description']
+ selectors: ['.content-description'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.lead-container', '.article-content'], '.article-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.module-related']
- }
+ clean: ['.module-related'],
+ },
};
var twofortysevensportsComExtractor = {
domain: '247sports.com',
-
title: {
- selectors: ['title', 'article header h1']
+ selectors: ['title', 'article header h1'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
date_published: {
- selectors: [['time[data-published]', 'data-published']]
+ selectors: [['time[data-published]', 'data-published']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['section.body.article'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var UproxxComExtractor = {
domain: 'uproxx.com',
-
title: {
- selectors: ['div.post-top h1']
+ selectors: ['div.post-top h1'],
},
-
author: {
- selectors: ['.post-top .authorname']
+ selectors: ['.post-top .authorname'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.post-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'div.image': 'figure',
- 'div.image .wp-media-credit': 'figcaption'
+ 'div.image .wp-media-credit': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwEonlineComExtractor = {
domain: 'www.eonline.com',
-
title: {
- selectors: ['h1.article__title']
+ selectors: ['h1.article__title'],
},
-
author: {
- selectors: ['.entry-meta__author a']
+ selectors: ['.entry-meta__author a'],
},
-
date_published: {
- selectors: [['meta[itemprop="datePublished"]', 'value']]
+ selectors: [['meta[itemprop="datePublished"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['.post-content section, .post-content div.post-content__image']],
-
+ selectors: [
+ ['.post-content section, .post-content div.post-content__image'],
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'div.post-content__image': 'figure',
- 'div.post-content__image .image__credits': 'figcaption'
+ 'div.post-content__image .image__credits': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwMiamiheraldComExtractor = {
domain: 'www.miamiherald.com',
-
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
date_published: {
selectors: ['p.published-date'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.dateline-storybody'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwRefinery29ComExtractor = {
domain: 'www.refinery29.com',
-
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
author: {
- selectors: ['.contributor']
+ selectors: ['.contributor'],
},
-
date_published: {
selectors: [['meta[name="sailthru.date"]', 'value']],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['.full-width-opener', '.article-content'], '.article-content', '.body'],
-
+ selectors: [
+ ['.full-width-opener', '.article-content'],
+ '.article-content',
+ '.body',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -4439,212 +4338,170 @@ var WwwRefinery29ComExtractor = {
var imgHtml = $node.html();
$node.parents('.loading').replaceWith(imgHtml);
},
-
'.section-image': 'figure',
-
'.section-image .content-caption': 'figcaption',
-
- '.section-text': 'p'
+ '.section-text': 'p',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.story-share']
- }
+ clean: ['.story-share'],
+ },
};
var WwwMacrumorsComExtractor = {
domain: 'www.macrumors.com',
-
title: {
- selectors: ['h1', 'h1.title']
+ selectors: ['h1', 'h1.title'],
},
-
author: {
- selectors: ['.author-url']
+ selectors: ['.author-url'],
},
-
date_published: {
selectors: ['.article .byline'],
-
// Wednesday January 18, 2017 11:44 am PST
format: 'dddd MMMM D, YYYY h:mm A zz',
-
- timezone: 'America/Los_Angeles'
+ timezone: 'America/Los_Angeles',
},
-
dek: {
- selectors: [['meta[name="description"]', 'value']]
+ selectors: [['meta[name="description"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwAndroidcentralComExtractor = {
domain: 'www.androidcentral.com',
-
title: {
- selectors: ['h1', 'h1.main-title']
+ selectors: ['h1', 'h1.main-title'],
},
-
author: {
- selectors: ['.meta-by']
+ selectors: ['.meta-by'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: [['meta[name="og:description"]', 'value']]
+ selectors: [['meta[name="og:description"]', 'value']],
},
-
lead_image_url: {
- selectors: [['.image-large', 'src']]
+ selectors: [['.image-large', 'src']],
},
-
content: {
selectors: ['.article-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.intro', 'blockquote']
- }
+ clean: ['.intro', 'blockquote'],
+ },
};
var WwwSiComExtractor = {
domain: 'www.si.com',
-
title: {
- selectors: ['h1', 'h1.headline']
+ selectors: ['h1', 'h1.headline'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
selectors: ['.timestamp'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: ['.quick-hit ul']
+ selectors: ['.quick-hit ul'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['p', '.marquee_large_2x', '.component.image']],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
-
noscript: function noscript($node) {
var $children = $node.children();
+
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure';
}
return null;
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: [['.inline-thumb', '.primary-message', '.description', '.instructions']]
- }
+ clean: [
+ ['.inline-thumb', '.primary-message', '.description', '.instructions'],
+ ],
+ },
};
var WwwRawstoryComExtractor = {
domain: 'www.rawstory.com',
-
title: {
- selectors: ['.blog-title']
+ selectors: ['.blog-title'],
},
-
author: {
- selectors: ['.blog-author a:first-of-type']
+ selectors: ['.blog-author a:first-of-type'],
},
-
date_published: {
selectors: ['.blog-author a:last-of-type'],
-
- timezone: 'EST'
+ timezone: 'EST',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.blog-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwCnetComExtractor = {
domain: 'www.cnet.com',
-
title: {
- selectors: [['meta[name="og:title"]', 'value']]
+ selectors: [['meta[name="og:title"]', 'value']],
},
-
author: {
- selectors: ['a.author']
+ selectors: ['a.author'],
},
-
date_published: {
selectors: ['time'],
-
- timezone: 'America/Los_Angeles'
+ timezone: 'America/Los_Angeles',
},
-
dek: {
- selectors: ['.article-dek']
+ selectors: ['.article-dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['img.__image-lead__', '.article-main-body'], '.article-main-body'],
-
+ selectors: [
+ ['img.__image-lead__', '.article-main-body'],
+ '.article-main-body',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -4654,813 +4511,661 @@ var WwwCnetComExtractor = {
$img.attr('height', '100%');
$img.addClass('__image-lead__');
$node.remove('.imgContainer').prepend($img);
- }
+ },
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwCinemablendComExtractor = {
domain: 'www.cinemablend.com',
-
title: {
- selectors: ['.story_title']
+ selectors: ['.story_title'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']],
-
- timezone: 'EST'
+ timezone: 'EST',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div#wrap_left_content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwTodayComExtractor = {
domain: 'www.today.com',
-
title: {
- selectors: ['h1.entry-headline']
+ selectors: ['h1.entry-headline'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="DC.date.issued"]', 'value']]
+ selectors: [['meta[name="DC.date.issued"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-container'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.label-comment']
- }
+ clean: ['.label-comment'],
+ },
};
var WwwHowtogeekComExtractor = {
domain: 'www.howtogeek.com',
-
title: {
- selectors: ['title']
+ selectors: ['title'],
},
-
author: {
- selectors: ['#authorinfobox a']
+ selectors: ['#authorinfobox a'],
},
-
date_published: {
selectors: ['#authorinfobox + div li'],
- timezone: 'GMT'
+ timezone: 'GMT',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.thecontent'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwAlComExtractor = {
domain: 'www.al.com',
-
title: {
- selectors: [['meta[name="title"]', 'value']]
+ selectors: [['meta[name="title"]', 'value']],
},
-
author: {
- selectors: [['meta[name="article_author"]', 'value']]
+ selectors: [['meta[name="article_author"]', 'value']],
},
-
date_published: {
selectors: [['meta[name="article_date_original"]', 'value']],
- timezone: 'EST'
+ timezone: 'EST',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwThepennyhoarderComExtractor = {
domain: 'www.thepennyhoarder.com',
-
title: {
- selectors: [['meta[name="dcterms.title"]', 'value']]
+ selectors: [['meta[name="dcterms.title"]', 'value']],
},
-
author: {
- selectors: [['link[rel="author"]', 'title']]
+ selectors: [['link[rel="author"]', 'title']],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.post-img', '.post-text'], '.post-text'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwWesternjournalismComExtractor = {
domain: 'www.westernjournalism.com',
-
title: {
- selectors: ['title', 'h1.entry-title']
+ selectors: ['title', 'h1.entry-title'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="DC.date.issued"]', 'value']]
+ selectors: [['meta[name="DC.date.issued"]', 'value']],
},
-
dek: {
- selectors: ['.subtitle']
+ selectors: ['.subtitle'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.article-sharing.top + div'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.ad-notice-small']
- }
+ clean: ['.ad-notice-small'],
+ },
};
var FusionNetExtractor = {
domain: 'fusion.net',
-
title: {
- selectors: ['.post-title', '.single-title', '.headline']
+ selectors: ['.post-title', '.single-title', '.headline'],
},
-
author: {
- selectors: ['.show-for-medium .byline']
+ selectors: ['.show-for-medium .byline'],
},
-
date_published: {
- selectors: [['time.local-time', 'datetime']]
+ selectors: [['time.local-time', 'datetime']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
- selectors: [['.post-featured-media', '.article-content'], '.article-content'],
-
+ selectors: [
+ ['.post-featured-media', '.article-content'],
+ '.article-content',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
- '.fusion-youtube-oembed': 'figure'
+ '.fusion-youtube-oembed': 'figure',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwAmericanowComExtractor = {
domain: 'www.americanow.com',
-
title: {
- selectors: ['.title', ['meta[name="title"]', 'value']]
+ selectors: ['.title', ['meta[name="title"]', 'value']],
},
-
author: {
- selectors: ['.byline']
+ selectors: ['.byline'],
},
-
date_published: {
- selectors: [['meta[name="publish_date"]', 'value']]
+ selectors: [['meta[name="publish_date"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.article-content', '.image', '.body'], '.body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.article-video-wrapper', '.show-for-small-only']
- }
+ clean: ['.article-video-wrapper', '.show-for-small-only'],
+ },
};
var ScienceflyComExtractor = {
domain: 'sciencefly.com',
-
title: {
- selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title']
+ selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
},
-
author: {
- selectors: ['div.cb-author', 'div.cb-author-title']
+ selectors: ['div.cb-author', 'div.cb-author-title'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['div.theiaPostSlider_slides img', 'src']]
+ selectors: [['div.theiaPostSlider_slides img', 'src']],
},
-
content: {
selectors: ['div.theiaPostSlider_slides'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var HellogigglesComExtractor = {
domain: 'hellogiggles.com',
-
title: {
- selectors: ['.title']
+ selectors: ['.title'],
},
-
author: {
- selectors: ['.author-link']
+ selectors: ['.author-link'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var ThoughtcatalogComExtractor = {
domain: 'thoughtcatalog.com',
-
title: {
- selectors: ['h1.title', ['meta[name="og:title"]', 'value']]
+ selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
},
-
author: {
- selectors: ['div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name', 'h1.writer-name']
+ selectors: [
+ 'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
+ 'h1.writer-name',
+ ],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry.post'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.tc_mark']
- }
+ clean: ['.tc_mark'],
+ },
};
var WwwNjComExtractor = {
domain: 'www.nj.com',
-
title: {
- selectors: [['meta[name="title"]', 'value']]
+ selectors: [['meta[name="title"]', 'value']],
},
-
author: {
- selectors: [['meta[name="article_author"]', 'value']]
+ selectors: [['meta[name="article_author"]', 'value']],
},
-
date_published: {
selectors: [['meta[name="article_date_original"]', 'value']],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwInquisitrComExtractor = {
domain: 'www.inquisitr.com',
-
title: {
- selectors: ['h1.entry-title.story--header--title']
+ selectors: ['h1.entry-title.story--header--title'],
},
-
author: {
- selectors: ['div.story--header--author']
+ selectors: ['div.story--header--author'],
},
-
date_published: {
- selectors: [['meta[name="datePublished"]', 'value']]
+ selectors: [['meta[name="datePublished"]', 'value']],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['article.story', '.entry-content.'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.post-category', '.story--header--socials', '.story--header--content']
- }
+ clean: [
+ '.post-category',
+ '.story--header--socials',
+ '.story--header--content',
+ ],
+ },
};
var WwwNbcnewsComExtractor = {
domain: 'www.nbcnews.com',
-
title: {
- selectors: ['div.article-hed h1']
+ selectors: ['div.article-hed h1'],
},
-
author: {
- selectors: ['span.byline_author']
+ selectors: ['span.byline_author'],
},
-
date_published: {
- selectors: [['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], '.flag_article-wrapper time'],
-
- timezone: 'America/New_York'
+ selectors: [
+ ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],
+ '.flag_article-wrapper time',
+ ],
+ timezone: 'America/New_York',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['div.article-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var FortuneComExtractor = {
domain: 'fortune.com',
-
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: [['meta[name="author"]', 'value']]
+ selectors: [['meta[name="author"]', 'value']],
},
-
date_published: {
selectors: ['.MblGHNMJ'],
-
- timezone: 'UTC'
+ timezone: 'UTC',
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['picture', 'article.row'], 'article.row'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var WwwLinkedinComExtractor = {
domain: 'www.linkedin.com',
-
title: {
- selectors: ['.article-title', 'h1']
+ selectors: ['.article-title', 'h1'],
},
-
author: {
- selectors: [['meta[name="article:author"]', 'value'], '.entity-name a[rel=author]']
+ selectors: [
+ ['meta[name="article:author"]', 'value'],
+ '.entity-name a[rel=author]',
+ ],
},
-
date_published: {
selectors: [['time[itemprop="datePublished"]', 'datetime']],
-
- timezone: 'America/Los_Angeles'
+ timezone: 'America/Los_Angeles',
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['header figure', '.prose'], '.prose'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.entity-image']
- }
+ clean: ['.entity-image'],
+ },
};
var ObamawhitehouseArchivesGovExtractor = {
domain: 'obamawhitehouse.archives.gov',
-
supportedDomains: ['whitehouse.gov'],
-
title: {
- selectors: ['h1', '.pane-node-title']
+ selectors: ['h1', '.pane-node-title'],
},
-
author: {
- selectors: ['.blog-author-link', '.node-person-name-link']
+ selectors: ['.blog-author-link', '.node-person-name-link'],
},
-
date_published: {
- selectors: [['meta[name="article:published_time"]', 'value']]
+ selectors: [['meta[name="article:published_time"]', 'value']],
},
-
dek: {
- selectors: ['.field-name-field-forall-summary']
+ selectors: ['.field-name-field-forall-summary'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
defaultCleaner: false,
-
selectors: ['div#content-start', '.pane-node-field-forall-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.pane-node-title', '.pane-custom.pane-1']
- }
+ clean: ['.pane-node-title', '.pane-custom.pane-1'],
+ },
};
var WwwOpposingviewsComExtractor = {
domain: 'www.opposingviews.com',
-
title: {
- selectors: ['h1.title']
+ selectors: ['h1.title'],
},
-
author: {
- selectors: ['div.date span span a']
+ selectors: ['div.date span span a'],
},
-
date_published: {
- selectors: [['meta[name="publish_date"]', 'value']]
+ selectors: [['meta[name="publish_date"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.show-for-small-only']
- }
+ clean: ['.show-for-small-only'],
+ },
};
var WwwProspectmagazineCoUkExtractor = {
domain: 'www.prospectmagazine.co.uk',
-
title: {
- selectors: ['.page-title']
+ selectors: ['.page-title'],
},
-
author: {
- selectors: ['.aside_author .title']
+ selectors: ['.aside_author .title'],
},
-
date_published: {
selectors: ['.post-info'],
-
- timezone: 'Europe/London'
+ timezone: 'Europe/London',
},
-
dek: {
- selectors: ['.page-subtitle']
+ selectors: ['.page-subtitle'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [
- // ['article.type-post div.post_content p'],
- 'article .post_content'],
-
+ // ['article.type-post div.post_content p'],
+ 'article .post_content',
+ ],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
var ForwardComExtractor = {
domain: 'forward.com',
-
title: {
- selectors: [['meta[name="og:title"]', 'value']]
+ selectors: [['meta[name="og:title"]', 'value']],
},
-
author: {
- selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']]
+ selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']],
},
-
date_published: {
- selectors: [['meta[name="date"]', 'value']]
+ selectors: [['meta[name="date"]', 'value']],
},
-
dek: {
selectors: [
// enter selectors
- ]
+ ],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.post-item-media-wrap', '.post-item p']],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.donate-box', '.message', '.subtitle']
- }
+ clean: ['.donate-box', '.message', '.subtitle'],
+ },
};
var WwwQdailyComExtractor = {
domain: 'www.qdaily.com',
-
title: {
- selectors: ['h2', 'h2.title']
+ selectors: ['h2', 'h2.title'],
},
-
author: {
- selectors: ['.name']
+ selectors: ['.name'],
},
-
date_published: {
- selectors: [['.date.smart-date', 'data-origindate']]
+ selectors: [['.date.smart-date', 'data-origindate']],
},
-
dek: {
- selectors: ['.excerpt']
+ selectors: ['.excerpt'],
},
-
lead_image_url: {
- selectors: [['.article-detail-hd img', 'src']]
+ selectors: [['.article-detail-hd img', 'src']],
},
-
content: {
selectors: ['.detail'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.lazyload', '.lazylad', '.lazylood']
- }
+ clean: ['.lazyload', '.lazylad', '.lazylood'],
+ },
};
var GothamistComExtractor = {
domain: 'gothamist.com',
-
- supportedDomains: ['chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com'],
-
+ supportedDomains: [
+ 'chicagoist.com',
+ 'laist.com',
+ 'sfist.com',
+ 'shanghaiist.com',
+ 'dcist.com',
+ ],
title: {
- selectors: ['h1', '.entry-header h1']
+ selectors: ['h1', '.entry-header h1'],
},
-
author: {
- selectors: ['.author']
+ selectors: ['.author'],
},
-
date_published: {
selectors: ['abbr', 'abbr.published'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: [null]
+ selectors: [null],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.entry-body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@@ -5469,294 +5174,276 @@ var GothamistComExtractor = {
'div.image-left': 'figure',
'.image-left i': 'figcaption',
'div.image-right': 'figure',
- '.image-right i': 'figcaption'
+ '.image-right i': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.image-none br', '.image-left br', '.image-right br', '.galleryEase']
- }
+ clean: [
+ '.image-none br',
+ '.image-left br',
+ '.image-right br',
+ '.galleryEase',
+ ],
+ },
};
var WwwFoolComExtractor = {
domain: 'www.fool.com',
-
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: ['.author-inline .author-name']
+ selectors: ['.author-inline .author-name'],
},
-
date_published: {
- selectors: [['meta[name="date"]', 'value']]
+ selectors: [['meta[name="date"]', 'value']],
},
-
dek: {
- selectors: ['header h2']
+ selectors: ['header h2'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.article-content'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'.caption img': function captionImg($node) {
var src = $node.attr('src');
- $node.parent().replaceWith('');
+ $node
+ .parent()
+ .replaceWith('');
},
- '.caption': 'figcaption'
+ '.caption': 'figcaption',
},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['#pitch']
- }
+ clean: ['#pitch'],
+ },
};
var WwwSlateComExtractor = {
domain: 'www.slate.com',
-
title: {
- selectors: ['.hed', 'h1']
+ selectors: ['.hed', 'h1'],
},
-
author: {
- selectors: ['a[rel=author]']
+ selectors: ['a[rel=author]'],
},
-
date_published: {
selectors: ['.pub-date'],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: ['.dek']
+ selectors: ['.dek'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: ['.body'],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: ['.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment']
- }
+ clean: [
+ '.about-the-author',
+ '.pullquote',
+ '.newsletter-signup-component',
+ '.top-comment',
+ ],
+ },
};
var IciRadioCanadaCaExtractor = {
domain: 'ici.radio-canada.ca',
-
title: {
- selectors: ['h1']
+ selectors: ['h1'],
},
-
author: {
- selectors: [['meta[name="dc.creator"]', 'value']]
+ selectors: [['meta[name="dc.creator"]', 'value']],
},
-
date_published: {
selectors: [['meta[name="dc.date.created"]', 'value']],
-
- timezone: 'America/New_York'
+ timezone: 'America/New_York',
},
-
dek: {
- selectors: ['.bunker-component.lead']
+ selectors: ['.bunker-component.lead'],
},
-
lead_image_url: {
- selectors: [['meta[name="og:image"]', 'value']]
+ selectors: [['meta[name="og:image"]', 'value']],
},
-
content: {
selectors: [['.main-multimedia-item', '.news-story-content']],
-
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
-
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
- clean: []
- }
+ clean: [],
+ },
};
-
-
-var CustomExtractors = Object.freeze({
- BloggerExtractor: BloggerExtractor,
- NYMagExtractor: NYMagExtractor,
- WikipediaExtractor: WikipediaExtractor,
- TwitterExtractor: TwitterExtractor,
- NYTimesExtractor: NYTimesExtractor,
- TheAtlanticExtractor: TheAtlanticExtractor,
- NewYorkerExtractor: NewYorkerExtractor,
- WiredExtractor: WiredExtractor,
- MSNExtractor: MSNExtractor,
- YahooExtractor: YahooExtractor,
- BuzzfeedExtractor: BuzzfeedExtractor,
- WikiaExtractor: WikiaExtractor,
- LittleThingsExtractor: LittleThingsExtractor,
- PoliticoExtractor: PoliticoExtractor,
- DeadspinExtractor: DeadspinExtractor,
- BroadwayWorldExtractor: BroadwayWorldExtractor,
- ApartmentTherapyExtractor: ApartmentTherapyExtractor,
- MediumExtractor: MediumExtractor,
- WwwTmzComExtractor: WwwTmzComExtractor,
- WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
- WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
- NewrepublicComExtractor: NewrepublicComExtractor,
- MoneyCnnComExtractor: MoneyCnnComExtractor,
- WwwThevergeComExtractor: WwwThevergeComExtractor,
- WwwCnnComExtractor: WwwCnnComExtractor,
- WwwAolComExtractor: WwwAolComExtractor,
- WwwYoutubeComExtractor: WwwYoutubeComExtractor,
- WwwTheguardianComExtractor: WwwTheguardianComExtractor,
- WwwSbnationComExtractor: WwwSbnationComExtractor,
- WwwBloombergComExtractor: WwwBloombergComExtractor,
- WwwBustleComExtractor: WwwBustleComExtractor,
- WwwNprOrgExtractor: WwwNprOrgExtractor,
- WwwRecodeNetExtractor: WwwRecodeNetExtractor,
- QzComExtractor: QzComExtractor,
- WwwDmagazineComExtractor: WwwDmagazineComExtractor,
- WwwReutersComExtractor: WwwReutersComExtractor,
- MashableComExtractor: MashableComExtractor,
- WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
- WwwVoxComExtractor: WwwVoxComExtractor,
- NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
- WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
- WwwLatimesComExtractor: WwwLatimesComExtractor,
- PagesixComExtractor: PagesixComExtractor,
- ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
- WwwCbssportsComExtractor: WwwCbssportsComExtractor,
- WwwMsnbcComExtractor: WwwMsnbcComExtractor,
- WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
- WwwMentalflossComExtractor: WwwMentalflossComExtractor,
- AbcnewsGoComExtractor: AbcnewsGoComExtractor,
- WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
- WwwCnbcComExtractor: WwwCnbcComExtractor,
- WwwPopsugarComExtractor: WwwPopsugarComExtractor,
- ObserverComExtractor: ObserverComExtractor,
- PeopleComExtractor: PeopleComExtractor,
- WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
- WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
- twofortysevensportsComExtractor: twofortysevensportsComExtractor,
- UproxxComExtractor: UproxxComExtractor,
- WwwEonlineComExtractor: WwwEonlineComExtractor,
- WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
- WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
- WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
- WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
- WwwSiComExtractor: WwwSiComExtractor,
- WwwRawstoryComExtractor: WwwRawstoryComExtractor,
- WwwCnetComExtractor: WwwCnetComExtractor,
- WwwCinemablendComExtractor: WwwCinemablendComExtractor,
- WwwTodayComExtractor: WwwTodayComExtractor,
- WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
- WwwAlComExtractor: WwwAlComExtractor,
- WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
- WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
- FusionNetExtractor: FusionNetExtractor,
- WwwAmericanowComExtractor: WwwAmericanowComExtractor,
- ScienceflyComExtractor: ScienceflyComExtractor,
- HellogigglesComExtractor: HellogigglesComExtractor,
- ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
- WwwNjComExtractor: WwwNjComExtractor,
- WwwInquisitrComExtractor: WwwInquisitrComExtractor,
- WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
- FortuneComExtractor: FortuneComExtractor,
- WwwLinkedinComExtractor: WwwLinkedinComExtractor,
- ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
- WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
- WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
- ForwardComExtractor: ForwardComExtractor,
- WwwQdailyComExtractor: WwwQdailyComExtractor,
- GothamistComExtractor: GothamistComExtractor,
- WwwFoolComExtractor: WwwFoolComExtractor,
- WwwSlateComExtractor: WwwSlateComExtractor,
- IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor
+var CustomExtractors = /*#__PURE__*/ Object.freeze({
+ BloggerExtractor: BloggerExtractor,
+ NYMagExtractor: NYMagExtractor,
+ WikipediaExtractor: WikipediaExtractor,
+ TwitterExtractor: TwitterExtractor,
+ NYTimesExtractor: NYTimesExtractor,
+ TheAtlanticExtractor: TheAtlanticExtractor,
+ NewYorkerExtractor: NewYorkerExtractor,
+ WiredExtractor: WiredExtractor,
+ MSNExtractor: MSNExtractor,
+ YahooExtractor: YahooExtractor,
+ BuzzfeedExtractor: BuzzfeedExtractor,
+ WikiaExtractor: WikiaExtractor,
+ LittleThingsExtractor: LittleThingsExtractor,
+ PoliticoExtractor: PoliticoExtractor,
+ DeadspinExtractor: DeadspinExtractor,
+ BroadwayWorldExtractor: BroadwayWorldExtractor,
+ ApartmentTherapyExtractor: ApartmentTherapyExtractor,
+ MediumExtractor: MediumExtractor,
+ WwwTmzComExtractor: WwwTmzComExtractor,
+ WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
+ WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
+ NewrepublicComExtractor: NewrepublicComExtractor,
+ MoneyCnnComExtractor: MoneyCnnComExtractor,
+ WwwThevergeComExtractor: WwwThevergeComExtractor,
+ WwwCnnComExtractor: WwwCnnComExtractor,
+ WwwAolComExtractor: WwwAolComExtractor,
+ WwwYoutubeComExtractor: WwwYoutubeComExtractor,
+ WwwTheguardianComExtractor: WwwTheguardianComExtractor,
+ WwwSbnationComExtractor: WwwSbnationComExtractor,
+ WwwBloombergComExtractor: WwwBloombergComExtractor,
+ WwwBustleComExtractor: WwwBustleComExtractor,
+ WwwNprOrgExtractor: WwwNprOrgExtractor,
+ WwwRecodeNetExtractor: WwwRecodeNetExtractor,
+ QzComExtractor: QzComExtractor,
+ WwwDmagazineComExtractor: WwwDmagazineComExtractor,
+ WwwReutersComExtractor: WwwReutersComExtractor,
+ MashableComExtractor: MashableComExtractor,
+ WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
+ WwwVoxComExtractor: WwwVoxComExtractor,
+ NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
+ WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
+ WwwLatimesComExtractor: WwwLatimesComExtractor,
+ PagesixComExtractor: PagesixComExtractor,
+ ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
+ WwwCbssportsComExtractor: WwwCbssportsComExtractor,
+ WwwMsnbcComExtractor: WwwMsnbcComExtractor,
+ WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
+ WwwMentalflossComExtractor: WwwMentalflossComExtractor,
+ AbcnewsGoComExtractor: AbcnewsGoComExtractor,
+ WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
+ WwwCnbcComExtractor: WwwCnbcComExtractor,
+ WwwPopsugarComExtractor: WwwPopsugarComExtractor,
+ ObserverComExtractor: ObserverComExtractor,
+ PeopleComExtractor: PeopleComExtractor,
+ WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
+ WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
+ twofortysevensportsComExtractor: twofortysevensportsComExtractor,
+ UproxxComExtractor: UproxxComExtractor,
+ WwwEonlineComExtractor: WwwEonlineComExtractor,
+ WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
+ WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
+ WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
+ WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
+ WwwSiComExtractor: WwwSiComExtractor,
+ WwwRawstoryComExtractor: WwwRawstoryComExtractor,
+ WwwCnetComExtractor: WwwCnetComExtractor,
+ WwwCinemablendComExtractor: WwwCinemablendComExtractor,
+ WwwTodayComExtractor: WwwTodayComExtractor,
+ WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
+ WwwAlComExtractor: WwwAlComExtractor,
+ WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
+ WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
+ FusionNetExtractor: FusionNetExtractor,
+ WwwAmericanowComExtractor: WwwAmericanowComExtractor,
+ ScienceflyComExtractor: ScienceflyComExtractor,
+ HellogigglesComExtractor: HellogigglesComExtractor,
+ ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
+ WwwNjComExtractor: WwwNjComExtractor,
+ WwwInquisitrComExtractor: WwwInquisitrComExtractor,
+ WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
+ FortuneComExtractor: FortuneComExtractor,
+ WwwLinkedinComExtractor: WwwLinkedinComExtractor,
+ ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
+ WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
+ WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
+ ForwardComExtractor: ForwardComExtractor,
+ WwwQdailyComExtractor: WwwQdailyComExtractor,
+ GothamistComExtractor: GothamistComExtractor,
+ WwwFoolComExtractor: WwwFoolComExtractor,
+ WwwSlateComExtractor: WwwSlateComExtractor,
+ IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
});
-var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
+var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
var extractor = CustomExtractors[key];
- return _extends({}, acc, mergeSupportedDomains(extractor));
+ return _objectSpread({}, acc, mergeSupportedDomains(extractor));
}, {});
// CLEAN AUTHOR CONSTANTS
-var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
-// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
-
+var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
-var TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
-// An ordered list of meta tag names that denote likely article deks.
-// From most distinct to least distinct.
-//
-// NOTE: There are currently no meta tags that seem to provide the right
-// content consistenty enough. Two options were:
-// - og:description
-// - dc.description
-// However, these tags often have SEO-specific junk in them that's not
-// header-worthy like a dek is. Excerpt material at best.
-
-
-// An ordered list of Selectors to find likely article deks. From
-// most explicit to least explicit.
-//
-// Should be more restrictive than not, as a failed dek can be pretty
-// detrimental to the aesthetics of an article.
+var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
-// CLEAN DATE PUBLISHED CONSTANTS
var MS_DATE_STRING = /^\d{13}$/i;
var SEC_DATE_STRING = /^\d{10}$/i;
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
-var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
+var months = [
+ 'jan',
+ 'feb',
+ 'mar',
+ 'apr',
+ 'may',
+ 'jun',
+ 'jul',
+ 'aug',
+ 'sep',
+ 'oct',
+ 'nov',
+ 'dec',
+];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
var timestamp3 = '-[0-9]{3,4}$';
-var SPLIT_DATE_STRING = new RegExp('(' + timestamp1 + ')|(' + timestamp2 + ')|(' + timestamp3 + ')|([0-9]{1,4})|(' + allMonths + ')', 'ig');
-
-// 2016-11-22T08:57-500
+var SPLIT_DATE_STRING = new RegExp(
+ '('
+ .concat(timestamp1, ')|(')
+ .concat(timestamp2, ')|(')
+ .concat(timestamp3, ')|([0-9]{1,4})|(')
+ .concat(allMonths, ')'),
+ 'ig'
+); // 2016-11-22T08:57-500
// Check if datetime string has an offset at the end
-var TIME_WITH_OFFSET_RE = /-\d{3,4}$/;
-// CLEAN TITLE CONSTANTS
+var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
-var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
+var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
var DOMAIN_ENDINGS_RE = new RegExp('.com$|.net$|.org$|.co.uk$', 'g');
-// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
+
function cleanAuthor(author) {
return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
}
function clean$1(leadImageUrl) {
leadImageUrl = leadImageUrl.trim();
+
if (validUrl.isWebUri(leadImageUrl)) {
return leadImageUrl;
}
@@ -5764,53 +5451,51 @@ function clean$1(leadImageUrl) {
return null;
}
-// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.
+
function cleanDek(dek, _ref) {
var $ = _ref.$,
- excerpt = _ref.excerpt;
-
+ excerpt = _ref.excerpt;
// Sanity check that we didn't get too short or long of a dek.
- if (dek.length > 1000 || dek.length < 5) return null;
+ if (dek.length > 1000 || dek.length < 5) return null; // Check that dek isn't the same as excerpt
- // Check that dek isn't the same as excerpt
- if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
-
- var dekText = stripTags(dek, $);
-
- // Plain text links shouldn't exist in the dek. If we have some, it's
+ if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10))
+ return null;
+ var dekText = stripTags(dek, $); // Plain text links shouldn't exist in the dek. If we have some, it's
// not a good dek - bail.
- if (TEXT_LINK_RE.test(dekText)) return null;
+ if (TEXT_LINK_RE.test(dekText)) return null;
return normalizeSpaces(dekText.trim());
}
-// Is there a compelling reason to use moment here?
-// Mostly only being used for the isValid() method,
-// but could just check for 'Invalid Date' string.
-
function cleanDateString(dateString) {
- return (dateString.match(SPLIT_DATE_STRING) || []).join(' ').replace(TIME_MERIDIAN_DOTS_RE, 'm').replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3').replace(CLEAN_DATE_STRING_RE, '$1').trim();
+ return (dateString.match(SPLIT_DATE_STRING) || [])
+ .join(' ')
+ .replace(TIME_MERIDIAN_DOTS_RE, 'm')
+ .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
+ .replace(CLEAN_DATE_STRING_RE, '$1')
+ .trim();
}
-
function createDate(dateString, timezone, format) {
if (TIME_WITH_OFFSET_RE.test(dateString)) {
return moment(new Date(dateString));
}
- return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString));
-}
-
-// Take a date published string, and hopefully return a date out of
+ return timezone
+ ? moment.tz(dateString, format || parseFormat(dateString), timezone)
+ : moment(dateString, format || parseFormat(dateString));
+} // Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
+
function cleanDatePublished(dateString) {
- var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
- timezone = _ref.timezone,
- format = _ref.format;
+ var _ref =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
+ timezone = _ref.timezone,
+ format = _ref.format;
// If string is in milliseconds or seconds, convert to int and return
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
- return new Date(parseInt(dateString, 10)).toISOString();
+ return new Date(_parseInt(dateString, 10)).toISOString();
}
var date = createDate(dateString, timezone, format);
@@ -5823,83 +5508,72 @@ function cleanDatePublished(dateString) {
return date.isValid() ? date.toISOString() : null;
}
-// Clean our article content, returning a new, cleaned node.
function extractCleanNode(article, _ref) {
var $ = _ref.$,
- _ref$cleanConditional = _ref.cleanConditionally,
- cleanConditionally = _ref$cleanConditional === undefined ? true : _ref$cleanConditional,
- _ref$title = _ref.title,
- title = _ref$title === undefined ? '' : _ref$title,
- _ref$url = _ref.url,
- url = _ref$url === undefined ? '' : _ref$url,
- _ref$defaultCleaner = _ref.defaultCleaner,
- defaultCleaner = _ref$defaultCleaner === undefined ? true : _ref$defaultCleaner;
-
+ _ref$cleanConditional = _ref.cleanConditionally,
+ cleanConditionally =
+ _ref$cleanConditional === void 0 ? true : _ref$cleanConditional,
+ _ref$title = _ref.title,
+ title = _ref$title === void 0 ? '' : _ref$title,
+ _ref$url = _ref.url,
+ url = _ref$url === void 0 ? '' : _ref$url,
+ _ref$defaultCleaner = _ref.defaultCleaner,
+ defaultCleaner =
+ _ref$defaultCleaner === void 0 ? true : _ref$defaultCleaner;
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
- rewriteTopLevel$$1(article, $);
-
- // Drop small images and spacer images
+ rewriteTopLevel$$1(article, $); // Drop small images and spacer images
// Only do this is defaultCleaner is set to true;
// this can sometimes be too aggressive.
- if (defaultCleaner) cleanImages(article, $);
- // Make links absolute
- makeLinksAbsolute$$1(article, $, url);
+ if (defaultCleaner) cleanImages(article, $); // Make links absolute
- // Mark elements to keep that would normally be removed.
+ makeLinksAbsolute$$1(article, $, url); // Mark elements to keep that would normally be removed.
// E.g., stripJunkTags will remove iframes, so we're going to mark
// YouTube/Vimeo videos as elements we want to keep.
- markToKeep(article, $, url);
- // Drop certain tags like , etc
+ markToKeep(article, $, url); // Drop certain tags like , etc
// This is -mostly- for cleanliness, not security.
- stripJunkTags(article, $);
- // H1 tags are typically the article title, which should be extracted
+ stripJunkTags(article, $); // H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
- cleanHOnes$$1(article, $);
- // Clean headers
- cleanHeaders(article, $, title);
+ cleanHOnes$$1(article, $); // Clean headers
- // We used to clean UL's and OL's here, but it was leading to
+ cleanHeaders(article, $, title); // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
// Also optionally running, since it can be overly aggressive.
- if (defaultCleaner) cleanTags$$1(article, $, cleanConditionally);
- // Remove empty paragraph nodes
- removeEmpty(article, $);
+ if (defaultCleaner) cleanTags$$1(article, $, cleanConditionally); // Remove empty paragraph nodes
- // Remove unnecessary attributes
- cleanAttributes$$1(article, $);
+ removeEmpty(article, $); // Remove unnecessary attributes
+ cleanAttributes$$1(article, $);
return article;
}
function cleanTitle$$1(title, _ref) {
var url = _ref.url,
- $ = _ref.$;
+ $ = _ref.$;
// If title has |, :, or - in it, see if
// we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) {
title = resolveSplitTitle(title, url);
- }
-
- // Final sanity check that we didn't get a crazy title.
+ } // Final sanity check that we didn't get a crazy title.
// if (title.length > 150 || title.length < 15) {
+
if (title.length > 150) {
// If we did, return h1 from the document if it exists
var h1 = $('h1');
+
if (h1.length === 1) {
title = h1.text();
}
- }
+ } // strip any html tags in the title text
- // strip any html tags in the title text
return normalizeSpaces(stripTags(title, $).trim());
}
@@ -5908,53 +5582,45 @@ function extractBreadcrumbTitle(splitTitle, text) {
// The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
// NYTimes - Blogs - Bits - The Best Gadgets on Earth
if (splitTitle.length >= 6) {
- var _ret = function () {
- // Look to see if we can find a breadcrumb splitter that happens
- // more than once. If we can, we'll be able to better pull out
- // the title.
- var termCounts = splitTitle.reduce(function (acc, titleText) {
- acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
- return acc;
- }, {});
-
- var _Reflect$ownKeys$redu = _Reflect$ownKeys(termCounts).reduce(function (acc, key) {
- if (acc[1] < termCounts[key]) {
- return [key, termCounts[key]];
- }
-
- return acc;
- }, [0, 0]),
- _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
- maxTerm = _Reflect$ownKeys$redu2[0],
- termCount = _Reflect$ownKeys$redu2[1];
-
- // We found a splitter that was used more than once, so it
- // is probably the breadcrumber. Split our title on that instead.
- // Note: max_term should be <= 4 characters, so that " >> "
- // will match, but nothing longer than that.
-
+ // Look to see if we can find a breadcrumb splitter that happens
+ // more than once. If we can, we'll be able to better pull out
+ // the title.
+ var termCounts = splitTitle.reduce(function(acc, titleText) {
+ acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
+ return acc;
+ }, {});
- if (termCount >= 2 && maxTerm.length <= 4) {
- splitTitle = text.split(maxTerm);
- }
+ var _Reflect$ownKeys$redu = _Reflect$ownKeys(termCounts).reduce(
+ function(acc, key) {
+ if (acc[1] < termCounts[key]) {
+ return [key, termCounts[key]];
+ }
- var splitEnds = [splitTitle[0], splitTitle.slice(-1)];
- var longestEnd = splitEnds.reduce(function (acc, end) {
- return acc.length > end.length ? acc : end;
- }, '');
+ return acc;
+ },
+ [0, 0]
+ ),
+ _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
+ maxTerm = _Reflect$ownKeys$redu2[0],
+ termCount = _Reflect$ownKeys$redu2[1]; // We found a splitter that was used more than once, so it
+ // is probably the breadcrumber. Split our title on that instead.
+ // Note: max_term should be <= 4 characters, so that " >> "
+ // will match, but nothing longer than that.
+
+ if (termCount >= 2 && maxTerm.length <= 4) {
+ splitTitle = text.split(maxTerm);
+ }
- if (longestEnd.length > 10) {
- return {
- v: longestEnd
- };
- }
+ var splitEnds = [splitTitle[0], splitTitle.slice(-1)];
+ var longestEnd = splitEnds.reduce(function(acc, end) {
+ return acc.length > end.length ? acc : end;
+ }, '');
- return {
- v: text
- };
- }();
+ if (longestEnd.length > 10) {
+ return longestEnd;
+ }
- if ((typeof _ret === 'undefined' ? 'undefined' : _typeof(_ret)) === "object") return _ret.v;
+ return text;
}
return null;
@@ -5968,10 +5634,9 @@ function cleanDomainFromTitle(splitTitle, url) {
// Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right.
var _URL$parse = URL.parse(url),
- host = _URL$parse.host;
+ host = _URL$parse.host;
var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
-
var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
@@ -5979,7 +5644,10 @@ function cleanDomainFromTitle(splitTitle, url) {
return splitTitle.slice(2).join('');
}
- var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
+ var endSlug = splitTitle
+ .slice(-1)[0]
+ .toLowerCase()
+ .replace(' ', '');
var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
if (endSlugRatio > 0.4 && endSlug.length >= 5) {
@@ -5987,28 +5655,26 @@ function cleanDomainFromTitle(splitTitle, url) {
}
return null;
-}
-
-// Given a title with separators in it (colons, dashes, etc),
+} // Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
-function resolveSplitTitle(title) {
- var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
+function resolveSplitTitle(title) {
+ var url =
+ arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
var splitTitle = title.split(TITLE_SPLITTERS_RE);
+
if (splitTitle.length === 1) {
return title;
}
var newTitle = extractBreadcrumbTitle(splitTitle, title);
if (newTitle) return newTitle;
-
newTitle = cleanDomainFromTitle(splitTitle, url);
- if (newTitle) return newTitle;
-
- // Fuzzy ratio didn't find anything, so this title is probably legit.
+ if (newTitle) return newTitle; // Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
+
return title;
}
@@ -6018,10 +5684,9 @@ var Cleaners = {
dek: cleanDek,
date_published: cleanDatePublished,
content: extractCleanNode,
- title: cleanTitle$$1
+ title: cleanTitle$$1,
};
-// Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
@@ -6032,12 +5697,12 @@ var Cleaners = {
// worthiness of nodes.
//
// Returns a cheerio object $
+
function extractBestNode($, opts) {
// clone the node so we can get back to our
// initial parsed state if needed
// TODO Do I need this? – AP
// let $root = $.root().clone()
-
if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($);
}
@@ -6045,7 +5710,6 @@ function extractBestNode($, opts) {
$ = convertToParagraphs$$1($);
$ = scoreContent$$1($, opts.weightNodes);
var $topCandidate = findTopCandidate$$1($);
-
return $topCandidate;
}
@@ -6053,9 +5717,8 @@ var GenericContentExtractor = {
defaultOpts: {
stripUnlikelyCandidates: true,
weightNodes: true,
- cleanConditionally: true
+ cleanConditionally: true,
},
-
// Extract the content for this resource - initially, pass in our
// most restrictive opts which will return the highest quality
// content. On each failure, retry with slightly more lax opts.
@@ -6077,37 +5740,39 @@ var GenericContentExtractor = {
// superfluous content. Things like forms, ads, etc.
extract: function extract(_ref, opts) {
var $ = _ref.$,
- html = _ref.html,
- title = _ref.title,
- url = _ref.url;
-
- opts = _extends({}, this.defaultOpts, opts);
-
- $ = $ || cheerio.load(html);
-
- // Cascade through our extraction-specific opts in an ordered fashion,
+ html = _ref.html,
+ title = _ref.title,
+ url = _ref.url;
+ opts = _objectSpread({}, this.defaultOpts, opts);
+ $ = $ || cheerio.load(html); // Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
+
var node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) {
return this.cleanAndReturnNode(node, $);
- }
-
- // We didn't succeed on first pass, one by one disable our
+ } // We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
+ // eslint-disable-next-line no-restricted-syntax
+
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
- for (var _iterator = _getIterator(_Reflect$ownKeys(opts).filter(function (k) {
- return opts[k] === true;
- })), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
+ for (
+ var _iterator = _getIterator(
+ _Reflect$ownKeys(opts).filter(function(k) {
+ return opts[k] === true;
+ })
+ ),
+ _step;
+ !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
+ _iteratorNormalCompletion = true
+ ) {
var key = _step.value;
-
opts[key] = false;
$ = cheerio.load(html);
-
node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) {
@@ -6119,7 +5784,7 @@ var GenericContentExtractor = {
_iteratorError = err;
} finally {
try {
- if (!_iteratorNormalCompletion && _iterator.return) {
+ if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
@@ -6131,19 +5796,15 @@ var GenericContentExtractor = {
return this.cleanAndReturnNode(node, $);
},
-
-
// Get node given current options
getContentNode: function getContentNode($, title, url, opts) {
return extractCleanNode(extractBestNode($, opts), {
$: $,
cleanConditionally: opts.cleanConditionally,
title: title,
- url: url
+ url: url,
});
},
-
-
// Once we got here, either we're at our last-resort node, or
// we broke early. Make sure we at least have -something- before we
// move forward.
@@ -6152,68 +5813,101 @@ var GenericContentExtractor = {
return null;
}
- return normalizeSpaces($.html(node));
-
- // if return_type == "html":
+ return normalizeSpaces($.html(node)); // if return_type == "html":
// return normalize_spaces(node_to_html(node))
// else:
// return node
- }
+ },
};
// TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than .
-
// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
-var STRONG_TITLE_META_TAGS = ['tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title'];
-
-// og:title is weak because it typically contains context that we don't like,
+var STRONG_TITLE_META_TAGS = [
+ 'tweetmeme-title',
+ 'dc.title',
+ 'rbtitle',
+ 'headline',
+ 'title',
+]; // og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into facebook!
-var WEAK_TITLE_META_TAGS = ['og:title'];
-// An ordered list of XPath Selectors to find likely article titles. From
+var WEAK_TITLE_META_TAGS = ['og:title']; // An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
-var STRONG_TITLE_SELECTORS = ['.hentry .entry-title', 'h1#articleHeader', 'h1.articleHeader', 'h1.article', '.instapaper_title', '#meebo-title'];
-var WEAK_TITLE_SELECTORS = ['article h1', '#entry-title', '.entry-title', '#entryTitle', '#entrytitle', '.entryTitle', '.entrytitle', '#articleTitle', '.articleTitle', 'post post-title', 'h1.title', 'h2.article', 'h1', 'html head title', 'title'];
+var STRONG_TITLE_SELECTORS = [
+ '.hentry .entry-title',
+ 'h1#articleHeader',
+ 'h1.articleHeader',
+ 'h1.article',
+ '.instapaper_title',
+ '#meebo-title',
+];
+var WEAK_TITLE_SELECTORS = [
+ 'article h1',
+ '#entry-title',
+ '.entry-title',
+ '#entryTitle',
+ '#entrytitle',
+ '.entryTitle',
+ '.entrytitle',
+ '#articleTitle',
+ '.articleTitle',
+ 'post post-title',
+ 'h1.title',
+ 'h2.article',
+ 'h1',
+ 'html head title',
+ 'title',
+];
var GenericTitleExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- url = _ref.url,
- metaCache = _ref.metaCache;
-
+ url = _ref.url,
+ metaCache = _ref.metaCache;
// First, check to see if we have a matching meta tag that we can make
// use of that is strongly associated with the headline.
- var title = void 0;
-
+ var title;
title = extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache);
- if (title) return cleanTitle$$1(title, { url: url, $: $ });
-
- // Second, look through our content selectors for the most likely
+ if (title)
+ return cleanTitle$$1(title, {
+ url: url,
+ $: $,
+ }); // Second, look through our content selectors for the most likely
// article title that is strongly associated with the headline.
+
title = extractFromSelectors$$1($, STRONG_TITLE_SELECTORS);
- if (title) return cleanTitle$$1(title, { url: url, $: $ });
+ if (title)
+ return cleanTitle$$1(title, {
+ url: url,
+ $: $,
+ }); // Third, check for weaker meta tags that may match.
- // Third, check for weaker meta tags that may match.
title = extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache);
- if (title) return cleanTitle$$1(title, { url: url, $: $ });
+ if (title)
+ return cleanTitle$$1(title, {
+ url: url,
+ $: $,
+ }); // Last, look for weaker selector tags that may match.
- // Last, look for weaker selector tags that may match.
title = extractFromSelectors$$1($, WEAK_TITLE_SELECTORS);
- if (title) return cleanTitle$$1(title, { url: url, $: $ });
+ if (title)
+ return cleanTitle$$1(title, {
+ url: url,
+ $: $,
+ }); // If no matches, return an empty string
- // If no matches, return an empty string
return '';
- }
+ },
};
// An ordered list of meta tag names that denote likely article authors. All
@@ -6222,62 +5916,95 @@ var GenericTitleExtractor = {
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
-var AUTHOR_META_TAGS = ['byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors'];
-
-var AUTHOR_MAX_LENGTH = 300;
-
-// An ordered list of XPath Selectors to find likely article authors. From
+var AUTHOR_META_TAGS = [
+ 'byl',
+ 'clmst',
+ 'dc.author',
+ 'dcsext.author',
+ 'dc.creator',
+ 'rbauthors',
+ 'authors',
+];
+var AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
-var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .vcard .fn', '.byline.vcard .fn', '.byline .vcard .fn', '.byline .by .author', '.byline .by', '.byline .author', '.post-author.vcard', '.post-author .vcard', 'a[rel=author]', '#by_author', '.by_author', '#entryAuthor', '.entryAuthor', '.byline a[href*=author]', '#author .authorname', '.author .authorname', '#author', '.author', '.articleauthor', '.ArticleAuthor', '.byline'];
-// An ordered list of Selectors to find likely article authors, with
+var AUTHOR_SELECTORS = [
+ '.entry .entry-author',
+ '.author.vcard .fn',
+ '.author .vcard .fn',
+ '.byline.vcard .fn',
+ '.byline .vcard .fn',
+ '.byline .by .author',
+ '.byline .by',
+ '.byline .author',
+ '.post-author.vcard',
+ '.post-author .vcard',
+ 'a[rel=author]',
+ '#by_author',
+ '.by_author',
+ '#entryAuthor',
+ '.entryAuthor',
+ '.byline a[href*=author]',
+ '#author .authorname',
+ '.author .authorname',
+ '#author',
+ '.author',
+ '.articleauthor',
+ '.ArticleAuthor',
+ '.byline',
+]; // An ordered list of Selectors to find likely article authors, with
// regular expression for content.
+
var bylineRe = /^[\n\s]*By/i;
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
var GenericAuthorExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- metaCache = _ref.metaCache;
-
- var author = void 0;
-
- // First, check to see if we have a matching
+ metaCache = _ref.metaCache;
+ var author; // First, check to see if we have a matching
// meta tag that we can make use of.
+
author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache);
+
if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author);
- }
+ } // Second, look through our selectors looking for potential authors.
- // Second, look through our selectors looking for potential authors.
author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2);
+
if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author);
- }
-
- // Last, use our looser regular-expression based selectors for
+ } // Last, use our looser regular-expression based selectors for
// potential authors.
+ // eslint-disable-next-line no-restricted-syntax
+
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
- for (var _iterator = _getIterator(BYLINE_SELECTORS_RE), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
+ for (
+ var _iterator = _getIterator(BYLINE_SELECTORS_RE), _step;
+ !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
+ _iteratorNormalCompletion = true
+ ) {
var _ref4 = _step.value;
var _ref3 = _slicedToArray(_ref4, 2);
var selector = _ref3[0];
var regex = _ref3[1];
-
var node = $(selector);
+
if (node.length === 1) {
var text = node.text();
+
if (regex.test(text)) {
return cleanAuthor(text);
}
@@ -6288,7 +6015,7 @@ var GenericAuthorExtractor = {
_iteratorError = err;
} finally {
try {
- if (!_iteratorNormalCompletion && _iterator.return) {
+ if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
@@ -6299,58 +6026,91 @@ var GenericAuthorExtractor = {
}
return null;
- }
+ },
};
// An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
-var DATE_PUBLISHED_META_TAGS = ['article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date'];
-
-// An ordered list of XPath Selectors to find
+var DATE_PUBLISHED_META_TAGS = [
+ 'article:published_time',
+ 'displaydate',
+ 'dc.date',
+ 'dc.date.issued',
+ 'rbpubdate',
+ 'publish_date',
+ 'pub_date',
+ 'pagedate',
+ 'pubdate',
+ 'revision_date',
+ 'doc_date',
+ 'date_created',
+ 'content_create_date',
+ 'lastmodified',
+ 'created',
+ 'date',
+]; // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
// to least explicit.
-var DATE_PUBLISHED_SELECTORS = ['.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate'];
-// An ordered list of compiled regular expressions to find likely date
+var DATE_PUBLISHED_SELECTORS = [
+ '.hentry .dtstamp.published',
+ '.hentry .published',
+ '.hentry .dtstamp.updated',
+ '.hentry .updated',
+ '.single .published',
+ '.meta .published',
+ '.meta .postDate',
+ '.entry-date',
+ '.byline .date',
+ '.postmetadata .date',
+ '.article_datetime',
+ '.date-header',
+ '.story-date',
+ '.dateStamp',
+ '#story .datetime',
+ '.dateline',
+ '.pubdate',
+]; // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse
+
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
var DATE_PUBLISHED_URL_RES = [
-// /2012/01/27/ but not /2012/01/293
-new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
-// 20120127 or 20120127T but not 2012012733 or 8201201733
-// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
-// 2012-01-27
-new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
-// /2012/jan/27/
-new RegExp('/(20\\d{2}/' + abbrevMonthsStr + '/[0-3]\\d)/', 'i')];
+ // /2012/01/27/ but not /2012/01/293
+ new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733
+ // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
+ // 2012-01-27
+ new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/
+ new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'),
+];
var GenericDatePublishedExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- url = _ref.url,
- metaCache = _ref.metaCache;
-
- var datePublished = void 0;
- // First, check to see if we have a matching meta tag
+ url = _ref.url,
+ metaCache = _ref.metaCache;
+ var datePublished; // First, check to see if we have a matching meta tag
// that we can make use of.
// Don't try cleaning tags from this string
- datePublished = extractFromMeta$$1($, DATE_PUBLISHED_META_TAGS, metaCache, false);
- if (datePublished) return cleanDatePublished(datePublished);
- // Second, look through our selectors looking for potential
+ datePublished = extractFromMeta$$1(
+ $,
+ DATE_PUBLISHED_META_TAGS,
+ metaCache,
+ false
+ );
+ if (datePublished) return cleanDatePublished(datePublished); // Second, look through our selectors looking for potential
// date_published's.
+
datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS);
- if (datePublished) return cleanDatePublished(datePublished);
+ if (datePublished) return cleanDatePublished(datePublished); // Lastly, look to see if a dately string exists in the URL
- // Lastly, look to see if a dately string exists in the URL
datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
if (datePublished) return cleanDatePublished(datePublished);
-
return null;
- }
+ },
};
// import {
@@ -6358,14 +6118,11 @@ var GenericDatePublishedExtractor = {
// DEK_SELECTORS,
// DEK_URL_RES,
// } from './constants';
-
// import { cleanDek } from 'cleaners';
-
// import {
// extractFromMeta,
// extractFromSelectors,
// } from 'utils/dom';
-
// Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
@@ -6374,12 +6131,8 @@ var GenericDekExtractor = {
// extract({ $, content, metaCache }) {
extract: function extract() {
return null;
- }
+ },
};
-
-
-
-// def extract_dek(self):
// # First, check to see if we have a matching meta tag that we can make
// # use of.
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
@@ -6409,23 +6162,58 @@ var GenericDekExtractor = {
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
-
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];
-
-var POSITIVE_LEAD_IMAGE_URL_HINTS = ['upload', 'wp-content', 'large', 'photo', 'wp-image'];
-var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
-
-var NEGATIVE_LEAD_IMAGE_URL_HINTS = ['spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg', 'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner', 'loader', 'loading', 'default', 'rating', 'share', 'facebook', 'twitter', 'theme', 'promo', 'ads', 'wp-includes'];
-var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
-
+var POSITIVE_LEAD_IMAGE_URL_HINTS = [
+ 'upload',
+ 'wp-content',
+ 'large',
+ 'photo',
+ 'wp-image',
+];
+var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
+ POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'),
+ 'i'
+);
+var NEGATIVE_LEAD_IMAGE_URL_HINTS = [
+ 'spacer',
+ 'sprite',
+ 'blank',
+ 'throbber',
+ 'gradient',
+ 'tile',
+ 'bg',
+ 'background',
+ 'icon',
+ 'social',
+ 'header',
+ 'hdr',
+ 'advert',
+ 'spinner',
+ 'loader',
+ 'loading',
+ 'default',
+ 'rating',
+ 'share',
+ 'facebook',
+ 'twitter',
+ 'theme',
+ 'promo',
+ 'ads',
+ 'wp-includes',
+];
+var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
+ NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'),
+ 'i'
+);
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;
function getSig($node) {
- return ($node.attr('class') || '') + ' ' + ($node.attr('id') || '');
-}
+ return ''
+ .concat($node.attr('class') || '', ' ')
+ .concat($node.attr('id') || '');
+} // Scores image urls based on a variety of heuristics.
-// Scores image urls based on a variety of heuristics.
function scoreImageUrl(url) {
url = url.trim();
var score = 0;
@@ -6436,34 +6224,29 @@ function scoreImageUrl(url) {
if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score -= 20;
- }
-
- // TODO: We might want to consider removing this as
+ } // TODO: We might want to consider removing this as
// gifs are much more common/popular than they once were
+
if (GIF_RE.test(url)) {
score -= 10;
}
if (JPG_RE.test(url)) {
score += 10;
- }
-
- // PNGs are neutral.
+ } // PNGs are neutral.
return score;
-}
+} // Alt attribute usually means non-presentational image.
-// Alt attribute usually means non-presentational image.
function scoreAttr($img) {
if ($img.attr('alt')) {
return 5;
}
return 0;
-}
-
-// Look through our parent and grandparent for figure-like
+} // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
+
function scoreByParents($img) {
var score = 0;
var $figParent = $img.parents('figure').first();
@@ -6473,22 +6256,21 @@ function scoreByParents($img) {
}
var $parent = $img.parent();
- var $gParent = void 0;
+ var $gParent;
+
if ($parent.length === 1) {
$gParent = $parent.parent();
}
- [$parent, $gParent].forEach(function ($node) {
+ [$parent, $gParent].forEach(function($node) {
if (PHOTO_HINTS_RE$1.test(getSig($node))) {
score += 15;
}
});
-
return score;
-}
-
-// Look at our immediate sibling and see if it looks like it's a
+} // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
+
function scoreBySibling($img) {
var score = 0;
var $sibling = $img.next();
@@ -6504,26 +6286,26 @@ function scoreBySibling($img) {
return score;
}
-
function scoreByDimensions($img) {
var score = 0;
- var width = parseFloat($img.attr('width'));
- var height = parseFloat($img.attr('height'));
- var src = $img.attr('src');
+ var width = _parseFloat($img.attr('width'));
+
+ var height = _parseFloat($img.attr('height'));
+
+ var src = $img.attr('src'); // Penalty for skinny images
- // Penalty for skinny images
if (width && width <= 50) {
score -= 50;
- }
+ } // Penalty for short images
- // Penalty for short images
if (height && height <= 50) {
score -= 50;
}
if (width && height && !src.includes('sprite')) {
var area = width * height;
+
if (area < 5000) {
// Smaller than 50 x 100
score -= 100;
@@ -6534,12 +6316,10 @@ function scoreByDimensions($img) {
return score;
}
-
function scoreByPosition($imgs, index) {
return $imgs.length / 2 - index;
}
-// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
@@ -6547,90 +6327,99 @@ function scoreByPosition($imgs, index) {
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
+
var GenericLeadImageUrlExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- content = _ref.content,
- metaCache = _ref.metaCache,
- html = _ref.html;
+ content = _ref.content,
+ metaCache = _ref.metaCache,
+ html = _ref.html;
+ var cleanUrl;
- var cleanUrl = void 0;
if (!$.browser && $('head').length === 0) {
- $('*').first().prepend(html);
- }
-
- // Check to see if we have a matching meta tag that we can make use of.
+ $('*')
+ .first()
+ .prepend(html);
+ } // Check to see if we have a matching meta tag that we can make use of.
// Moving this higher because common practice is now to use large
// images on things like Open Graph or Twitter cards.
// images usually have for things like Open Graph.
- var imageUrl = extractFromMeta$$1($, LEAD_IMAGE_URL_META_TAGS, metaCache, false);
+
+ var imageUrl = extractFromMeta$$1(
+ $,
+ LEAD_IMAGE_URL_META_TAGS,
+ metaCache,
+ false
+ );
if (imageUrl) {
cleanUrl = clean$1(imageUrl);
-
if (cleanUrl) return cleanUrl;
- }
-
- // Next, try to find the "best" image via the content.
+ } // Next, try to find the "best" image via the content.
// We'd rather not have to fetch each image and check dimensions,
// so try to do some analysis and determine them instead.
+
var $content = $(content);
var imgs = $('img', $content).toArray();
var imgScores = {};
-
- imgs.forEach(function (img, index) {
+ imgs.forEach(function(img, index) {
var $img = $(img);
var src = $img.attr('src');
-
if (!src) return;
-
var score = scoreImageUrl(src);
score += scoreAttr($img);
score += scoreByParents($img);
score += scoreBySibling($img);
score += scoreByDimensions($img);
score += scoreByPosition(imgs, index);
-
imgScores[src] = score;
});
- var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(function (acc, key) {
- return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
- }, [null, 0]),
- _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
- topUrl = _Reflect$ownKeys$redu2[0],
- topScore = _Reflect$ownKeys$redu2[1];
+ var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(
+ function(acc, key) {
+ return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
+ },
+ [null, 0]
+ ),
+ _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
+ topUrl = _Reflect$ownKeys$redu2[0],
+ topScore = _Reflect$ownKeys$redu2[1];
if (topScore > 0) {
cleanUrl = clean$1(topUrl);
-
if (cleanUrl) return cleanUrl;
- }
-
- // If nothing else worked, check to see if there are any really
+ } // If nothing else worked, check to see if there are any really
// probable nodes in the doc, like .
+ // eslint-disable-next-line no-restricted-syntax
+
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
- for (var _iterator = _getIterator(LEAD_IMAGE_URL_SELECTORS), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
+ for (
+ var _iterator = _getIterator(LEAD_IMAGE_URL_SELECTORS), _step;
+ !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
+ _iteratorNormalCompletion = true
+ ) {
var selector = _step.value;
-
var $node = $(selector).first();
var src = $node.attr('src');
+
if (src) {
cleanUrl = clean$1(src);
if (cleanUrl) return cleanUrl;
}
var href = $node.attr('href');
+
if (href) {
cleanUrl = clean$1(href);
if (cleanUrl) return cleanUrl;
}
var value = $node.attr('value');
+
if (value) {
cleanUrl = clean$1(value);
if (cleanUrl) return cleanUrl;
@@ -6641,7 +6430,7 @@ var GenericLeadImageUrlExtractor = {
_iteratorError = err;
} finally {
try {
- if (!_iteratorNormalCompletion && _iterator.return) {
+ if (!_iteratorNormalCompletion && _iterator.return != null) {
_iterator.return();
}
} finally {
@@ -6652,12 +6441,8 @@ var GenericLeadImageUrlExtractor = {
}
return null;
- }
+ },
};
-
-
-
-// def extract(self):
// """
// # First, try to find the "best" image via the content.
// # We'd rather not have to fetch each image and check dimensions,
@@ -6819,13 +6604,17 @@ function scoreSimilarity(score, articleUrl, href) {
// sliding scale, subtract points from this link based on
// similarity.
if (score > 0) {
- var similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
- // Subtract .1 from diff_percent when calculating modifier,
+ var similarity = new difflib.SequenceMatcher(
+ null,
+ articleUrl,
+ href
+ ).ratio(); // Subtract .1 from diff_percent when calculating modifier,
// which means that if it's less than 10% different, we give a
// bonus instead. Ex:
// 3% different = +17.5 points
// 10% different = 0 points
// 20% different = -25 points
+
var diffPercent = 1.0 - similarity;
var diffModifier = -(250 * (diffPercent - 0.2));
return score + diffModifier;
@@ -6842,19 +6631,18 @@ function scoreLinkText(linkText, pageNum) {
var score = 0;
if (IS_DIGIT_RE.test(linkText.trim())) {
- var linkTextAsNum = parseInt(linkText, 10);
- // If it's the first page, we already got it on the first call.
+ var linkTextAsNum = _parseInt(linkText, 10); // If it's the first page, we already got it on the first call.
// Give it a negative score. Otherwise, up to page 10, give a
// small bonus.
+
if (linkTextAsNum < 2) {
score = -30;
} else {
score = Math.max(0, 10 - linkTextAsNum);
- }
-
- // If it appears that the current page number is greater than
+ } // If it appears that the current page number is greater than
// this links page number, it's a very bad sign. Give it a big
// penalty.
+
if (pageNum && pageNum >= linkTextAsNum) {
score -= 50;
}
@@ -6874,27 +6662,42 @@ function scorePageInLink(pageNum, isWp) {
return 0;
}
-var DIGIT_RE$2 = /\d/;
-
-// A list of words that, if found in link text or URLs, likely mean that
+var DIGIT_RE$2 = /\d/; // A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
-var EXTRANEOUS_LINK_HINTS$1 = ['print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated'];
-var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(EXTRANEOUS_LINK_HINTS$1.join('|'), 'i');
-// Match any link text/classname/id that looks like it could mean the next
+var EXTRANEOUS_LINK_HINTS$1 = [
+ 'print',
+ 'archive',
+ 'comment',
+ 'discuss',
+ 'e-mail',
+ 'email',
+ 'share',
+ 'reply',
+ 'all',
+ 'login',
+ 'sign',
+ 'single',
+ 'adx',
+ 'entry-unrelated',
+];
+var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp(
+ EXTRANEOUS_LINK_HINTS$1.join('|'),
+ 'i'
+); // Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
-var NEXT_LINK_TEXT_RE$1 = new RegExp('(next|weiter|continue|>([^|]|$)|»([^|]|$))', 'i');
-// Match any link text/classname/id that looks like it is an end link: things
+var NEXT_LINK_TEXT_RE$1 = new RegExp(
+ '(next|weiter|continue|>([^|]|$)|»([^|]|$))',
+ 'i'
+); // Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
-var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i');
-// Match any link text/classname/id that looks like it means the previous
+var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous
// page.
-var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i');
-// Match any phrase that looks like it could be page, or paging, or pagination
+var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match any phrase that looks like it could be page, or paging, or pagination
function scoreExtraneousLinks(href) {
// If the URL itself contains extraneous values, give a penalty.
@@ -6905,8 +6708,10 @@ function scoreExtraneousLinks(href) {
return 0;
}
-function makeSig$1($link) {
- return ($link.attr('class') || '') + ' ' + ($link.attr('id') || '');
+function makeSig($link) {
+ return ''
+ .concat($link.attr('class') || '', ' ')
+ .concat($link.attr('id') || '');
}
function scoreByParents$1($link) {
@@ -6918,24 +6723,26 @@ function scoreByParents$1($link) {
var negativeMatch = false;
var score = 0;
- _Array$from(range(0, 4)).forEach(function () {
+ _Array$from(range(0, 4)).forEach(function() {
if ($parent.length === 0) {
return;
}
- var parentData = makeSig$1($parent, ' ');
-
- // If we have 'page' or 'paging' in our data, that's a good
+ var parentData = makeSig($parent, ' '); // If we have 'page' or 'paging' in our data, that's a good
// sign. Add a bonus.
+
if (!positiveMatch && PAGE_RE.test(parentData)) {
positiveMatch = true;
score += 25;
- }
-
- // If we have 'comment' or something in our data, and
+ } // If we have 'comment' or something in our data, and
// we don't have something like 'content' as well, that's
// a bad sign. Give a penalty.
- if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData) && EXTRANEOUS_LINK_HINTS_RE$1.test(parentData)) {
+
+ if (
+ !negativeMatch &&
+ NEGATIVE_SCORE_RE.test(parentData) &&
+ EXTRANEOUS_LINK_HINTS_RE$1.test(parentData)
+ ) {
if (!POSITIVE_SCORE_RE.test(parentData)) {
negativeMatch = true;
score -= 25;
@@ -6958,16 +6765,24 @@ function scorePrevLink(linkData) {
return 0;
}
-function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls) {
+function shouldScore(
+ href,
+ articleUrl,
+ baseUrl,
+ parsedUrl,
+ linkText,
+ previousUrls
+) {
// skip if we've already fetched this url
- if (previousUrls.find(function (url) {
- return href === url;
- }) !== undefined) {
+ if (
+ previousUrls.find(function(url) {
+ return href === url;
+ }) !== undefined
+ ) {
return false;
- }
-
- // If we've already parsed this URL, or the URL matches the base
+ } // If we've already parsed this URL, or the URL matches the base
// URL, or is empty, skip it.
+
if (!href || href === articleUrl || href === baseUrl) {
return false;
}
@@ -6975,29 +6790,24 @@ function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrl
var hostname = parsedUrl.hostname;
var _URL$parse = URL.parse(href),
- linkHost = _URL$parse.hostname;
-
- // Domain mismatch.
-
+ linkHost = _URL$parse.hostname; // Domain mismatch.
if (linkHost !== hostname) {
return false;
- }
-
- // If href doesn't contain a digit after removing the base URL,
+ } // If href doesn't contain a digit after removing the base URL,
// it's certainly not the next page.
+
var fragment = href.replace(baseUrl, '');
+
if (!DIGIT_RE$2.test(fragment)) {
return false;
- }
-
- // This link has extraneous content (like "comment") in its link
+ } // This link has extraneous content (like "comment") in its link
// text, so we skip it.
+
if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) {
return false;
- }
+ } // Next page link text is never long, skip if it is too long.
- // Next page link text is never long, skip if it is too long.
if (linkText.length > 25) {
return false;
}
@@ -7042,65 +6852,66 @@ function scoreCapLinks(linkData) {
}
function makeBaseRegex(baseUrl) {
- return new RegExp('^' + baseUrl, 'i');
+ return new RegExp('^'.concat(baseUrl), 'i');
}
-function makeSig($link, linkText) {
- return (linkText || $link.text()) + ' ' + ($link.attr('class') || '') + ' ' + ($link.attr('id') || '');
+function makeSig$1($link, linkText) {
+ return ''
+ .concat(linkText || $link.text(), ' ')
+ .concat($link.attr('class') || '', ' ')
+ .concat($link.attr('id') || '');
}
function scoreLinks(_ref) {
var links = _ref.links,
- articleUrl = _ref.articleUrl,
- baseUrl = _ref.baseUrl,
- parsedUrl = _ref.parsedUrl,
- $ = _ref.$,
- _ref$previousUrls = _ref.previousUrls,
- previousUrls = _ref$previousUrls === undefined ? [] : _ref$previousUrls;
-
+ articleUrl = _ref.articleUrl,
+ baseUrl = _ref.baseUrl,
+ parsedUrl = _ref.parsedUrl,
+ $ = _ref.$,
+ _ref$previousUrls = _ref.previousUrls,
+ previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
parsedUrl = parsedUrl || URL.parse(articleUrl);
var baseRegex = makeBaseRegex(baseUrl);
- var isWp = isWordpress($);
-
- // Loop through all links, looking for hints that they may be next-page
+ var isWp = isWordpress($); // Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or
// id, or being a child of a node with a page-y className or id.
//
// After we do that, assign each page a score, and pick the one that
// looks most like the next page link, as long as its score is strong
// enough to have decent confidence.
- var scoredPages = links.reduce(function (possiblePages, link) {
+
+ var scoredPages = links.reduce(function(possiblePages, link) {
// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
- var attrs = getAttrs(link);
+ var attrs = getAttrs(link); // if href is undefined, return
- // if href is undefined, return
if (!attrs.href) return possiblePages;
-
var href = removeAnchor(attrs.href);
var $link = $(link);
var linkText = $link.text();
- if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
+ if (
+ !shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)
+ ) {
return possiblePages;
- }
+ } // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
- // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if (!possiblePages[href]) {
possiblePages[href] = {
score: 0,
linkText: linkText,
- href: href
+ href: href,
};
} else {
- possiblePages[href].linkText = possiblePages[href].linkText + '|' + linkText;
+ possiblePages[href].linkText = ''
+ .concat(possiblePages[href].linkText, '|')
+ .concat(linkText);
}
var possiblePage = possiblePages[href];
- var linkData = makeSig($link, linkText);
+ var linkData = makeSig$1($link, linkText);
var pageNum = pageNumFromUrl(href);
-
var score = scoreBaseUrl(href, baseRegex);
score += scoreNextLinkText(linkData);
score += scoreCapLinks(linkData);
@@ -7110,59 +6921,54 @@ function scoreLinks(_ref) {
score += scorePageInLink(pageNum, isWp);
score += scoreLinkText(linkText, pageNum);
score += scoreSimilarity(score, articleUrl, href);
-
possiblePage.score = score;
-
return possiblePages;
}, {});
-
return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages;
}
-// Looks for and returns next page url
// for multi-page articles
+
var GenericNextPageUrlExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- url = _ref.url,
- parsedUrl = _ref.parsedUrl,
- _ref$previousUrls = _ref.previousUrls,
- previousUrls = _ref$previousUrls === undefined ? [] : _ref$previousUrls;
-
+ url = _ref.url,
+ parsedUrl = _ref.parsedUrl,
+ _ref$previousUrls = _ref.previousUrls,
+ previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
parsedUrl = parsedUrl || URL.parse(url);
-
var articleUrl = removeAnchor(url);
var baseUrl = articleBaseUrl(url, parsedUrl);
-
var links = $('a[href]').toArray();
-
var scoredLinks = scoreLinks({
links: links,
articleUrl: articleUrl,
baseUrl: baseUrl,
parsedUrl: parsedUrl,
$: $,
- previousUrls: previousUrls
- });
-
- // If no links were scored, return null
- if (!scoredLinks) return null;
+ previousUrls: previousUrls,
+ }); // If no links were scored, return null
- // now that we've scored all possible pages,
+ if (!scoredLinks) return null; // now that we've scored all possible pages,
// find the biggest one.
- var topPage = _Reflect$ownKeys(scoredLinks).reduce(function (acc, link) {
- var scoredLink = scoredLinks[link];
- return scoredLink.score > acc.score ? scoredLink : acc;
- }, { score: -100 });
- // If the score is less than 50, we're not confident enough to use it,
+ var topPage = _Reflect$ownKeys(scoredLinks).reduce(
+ function(acc, link) {
+ var scoredLink = scoredLinks[link];
+ return scoredLink.score > acc.score ? scoredLink : acc;
+ },
+ {
+ score: -100,
+ }
+ ); // If the score is less than 50, we're not confident enough to use it,
// so we fail.
+
if (topPage.score >= 50) {
return topPage.href;
}
return null;
- }
+ },
};
var CANONICAL_META_SELECTORS = ['og:url'];
@@ -7170,76 +6976,76 @@ var CANONICAL_META_SELECTORS = ['og:url'];
function parseDomain(url) {
var parsedUrl = URL.parse(url);
var hostname = parsedUrl.hostname;
-
return hostname;
}
function result(url) {
return {
url: url,
- domain: parseDomain(url)
+ domain: parseDomain(url),
};
}
var GenericUrlExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- url = _ref.url,
- metaCache = _ref.metaCache;
-
+ url = _ref.url,
+ metaCache = _ref.metaCache;
var $canonical = $('link[rel=canonical]');
+
if ($canonical.length !== 0) {
var href = $canonical.attr('href');
+
if (href) {
return result(href);
}
}
var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache);
+
if (metaUrl) {
return result(metaUrl);
}
return result(url);
- }
+ },
};
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
function clean$2(content, $) {
- var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
-
+ var maxLength =
+ arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
content = content.replace(/[\s\n]+/g, ' ').trim();
- return ellipsize(content, maxLength, { ellipse: '…' });
+ return ellipsize(content, maxLength, {
+ ellipse: '…',
+ });
}
-
var GenericExcerptExtractor = {
extract: function extract(_ref) {
var $ = _ref.$,
- content = _ref.content,
- metaCache = _ref.metaCache;
-
+ content = _ref.content,
+ metaCache = _ref.metaCache;
var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache);
+
if (excerpt) {
return clean$2(stripTags(excerpt, $));
- }
- // Fall back to excerpting from the extracted content
+ } // Fall back to excerpting from the extracted content
+
var maxLength = 200;
var shortContent = content.slice(0, maxLength * 5);
return clean$2($(shortContent).text(), $, maxLength);
- }
+ },
};
var GenericWordCountExtractor = {
extract: function extract(_ref) {
var content = _ref.content;
-
var $ = cheerio.load(content);
var $content = $('div').first();
-
var text = normalizeSpaces($content.text());
return text.split(/\s/).length;
- }
+ },
};
var GenericExtractor = {
@@ -7259,11 +7065,9 @@ var GenericExtractor = {
var title = _ref.title;
return stringDirection.getDirection(title);
},
-
extract: function extract(options) {
var html = options.html,
- $ = options.$;
-
+ $ = options.$;
if (html && !$) {
var loaded = cheerio.load(html);
@@ -7273,17 +7077,39 @@ var GenericExtractor = {
var title = this.title(options);
var date_published = this.date_published(options);
var author = this.author(options);
- var content = this.content(_extends({}, options, { title: title }));
- var lead_image_url = this.lead_image_url(_extends({}, options, { content: content }));
- var dek = this.dek(_extends({}, options, { content: content }));
+ var content = this.content(
+ _objectSpread({}, options, {
+ title: title,
+ })
+ );
+ var lead_image_url = this.lead_image_url(
+ _objectSpread({}, options, {
+ content: content,
+ })
+ );
+ var dek = this.dek(
+ _objectSpread({}, options, {
+ content: content,
+ })
+ );
var next_page_url = this.next_page_url(options);
- var excerpt = this.excerpt(_extends({}, options, { content: content }));
- var word_count = this.word_count(_extends({}, options, { content: content }));
- var direction = this.direction({ title: title });
+ var excerpt = this.excerpt(
+ _objectSpread({}, options, {
+ content: content,
+ })
+ );
+ var word_count = this.word_count(
+ _objectSpread({}, options, {
+ content: content,
+ })
+ );
+ var direction = this.direction({
+ title: title,
+ });
- var _url_and_domain = this.url_and_domain(options),
- url = _url_and_domain.url,
- domain = _url_and_domain.domain;
+ var _this$url_and_domain = this.url_and_domain(options),
+ url = _this$url_and_domain.url,
+ domain = _this$url_and_domain.domain;
return {
title: title,
@@ -7297,18 +7123,17 @@ var GenericExtractor = {
domain: domain,
excerpt: excerpt,
word_count: word_count,
- direction: direction
+ direction: direction,
};
- }
+ },
};
var Detectors = {
'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
- 'meta[name="generator"][value="blogger"]': BloggerExtractor
+ 'meta[name="generator"][value="blogger"]': BloggerExtractor,
};
-
function detectByHtml($) {
- var selector = _Reflect$ownKeys(Detectors).find(function (s) {
+ var selector = _Reflect$ownKeys(Detectors).find(function(s) {
return $(s).length > 0;
});
@@ -7318,44 +7143,43 @@ function detectByHtml($) {
function getExtractor(url, parsedUrl, $) {
parsedUrl = parsedUrl || URL.parse(url);
var _parsedUrl = parsedUrl,
- hostname = _parsedUrl.hostname;
-
- var baseDomain = hostname.split('.').slice(-2).join('.');
-
- return Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
+ hostname = _parsedUrl.hostname;
+ var baseDomain = hostname
+ .split('.')
+ .slice(-2)
+ .join('.');
+ return (
+ Extractors[hostname] ||
+ Extractors[baseDomain] ||
+ detectByHtml($) ||
+ GenericExtractor
+ );
}
-// Remove elements by an array of selectors
function cleanBySelectors($content, $, _ref) {
var clean = _ref.clean;
-
if (!clean) return $content;
-
$(clean.join(','), $content).remove();
-
return $content;
-}
+} // Transform matching elements
-// Transform matching elements
function transformElements($content, $, _ref2) {
var transforms = _ref2.transforms;
-
if (!transforms) return $content;
- _Reflect$ownKeys(transforms).forEach(function (key) {
+ _Reflect$ownKeys(transforms).forEach(function(key) {
var $matches = $(key, $content);
- var value = transforms[key];
+ var value = transforms[key]; // If value is a string, convert directly
- // If value is a string, convert directly
if (typeof value === 'string') {
- $matches.each(function (index, node) {
+ $matches.each(function(index, node) {
convertNodeTo$$1($(node), $, transforms[key]);
});
} else if (typeof value === 'function') {
// If value is function, apply function to node
- $matches.each(function (index, node) {
- var result = value($(node), $);
- // If function returns a string, convert node to that value
+ $matches.each(function(index, node) {
+ var result = value($(node), $); // If function returns a string, convert node to that value
+
if (typeof result === 'string') {
convertNodeTo$$1($(node), $, result);
}
@@ -7367,109 +7191,110 @@ function transformElements($content, $, _ref2) {
}
function findMatchingSelector($, selectors, extractHtml) {
- return selectors.find(function (selector) {
- if (Array.isArray(selector)) {
+ return selectors.find(function(selector) {
+ if (_Array$isArray(selector)) {
if (extractHtml) {
- return selector.reduce(function (acc, s) {
+ return selector.reduce(function(acc, s) {
return acc && $(s).length > 0;
}, true);
}
var _selector = _slicedToArray(selector, 2),
- s = _selector[0],
- attr = _selector[1];
-
- return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
+ s = _selector[0],
+ attr = _selector[1];
+
+ return (
+ $(s).length === 1 &&
+ $(s).attr(attr) &&
+ $(s)
+ .attr(attr)
+ .trim() !== ''
+ );
}
- return $(selector).length === 1 && $(selector).text().trim() !== '';
+ return (
+ $(selector).length === 1 &&
+ $(selector)
+ .text()
+ .trim() !== ''
+ );
});
}
function select(opts) {
var $ = opts.$,
- type = opts.type,
- extractionOpts = opts.extractionOpts,
- _opts$extractHtml = opts.extractHtml,
- extractHtml = _opts$extractHtml === undefined ? false : _opts$extractHtml;
- // Skip if there's not extraction for this type
-
- if (!extractionOpts) return null;
+ type = opts.type,
+ extractionOpts = opts.extractionOpts,
+ _opts$extractHtml = opts.extractHtml,
+ extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type
- // If a string is hardcoded for a type (e.g., Wikipedia
+ if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
// contributors), return the string
- if (typeof extractionOpts === 'string') return extractionOpts;
+ if (typeof extractionOpts === 'string') return extractionOpts;
var selectors = extractionOpts.selectors,
- _extractionOpts$defau = extractionOpts.defaultCleaner,
- defaultCleaner = _extractionOpts$defau === undefined ? true : _extractionOpts$defau;
-
-
+ _extractionOpts$defau = extractionOpts.defaultCleaner,
+ defaultCleaner =
+ _extractionOpts$defau === void 0 ? true : _extractionOpts$defau;
var matchingSelector = findMatchingSelector($, selectors, extractHtml);
-
- if (!matchingSelector) return null;
-
- // Declaring result; will contain either
+ if (!matchingSelector) return null; // Declaring result; will contain either
// text or html, which will be cleaned
// by the appropriate cleaner type
-
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
- var $content = void 0;
+
+ var $content;
+
if (extractHtml) {
// If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
- if (Array.isArray(matchingSelector)) {
- (function () {
- $content = $(matchingSelector.join(','));
- var $wrapper = $('');
- $content.each(function (index, element) {
- $wrapper.append(element);
- });
-
- $content = $wrapper;
- })();
+ if (_Array$isArray(matchingSelector)) {
+ $content = $(matchingSelector.join(','));
+ var $wrapper = $('');
+ $content.each(function(index, element) {
+ $wrapper.append(element);
+ });
+ $content = $wrapper;
} else {
$content = $(matchingSelector);
- }
+ } // Wrap in div so transformation can take place on root element
- // Wrap in div so transformation can take place on root element
$content.wrap($(''));
$content = $content.parent();
-
$content = transformElements($content, $, extractionOpts);
$content = cleanBySelectors($content, $, extractionOpts);
-
- $content = Cleaners[type]($content, _extends({}, opts, { defaultCleaner: defaultCleaner }));
-
+ $content = Cleaners[type](
+ $content,
+ _objectSpread({}, opts, {
+ defaultCleaner: defaultCleaner,
+ })
+ );
return $.html($content);
}
- var result = void 0;
-
- // if selector is an array (e.g., ['img', 'src']),
+ var result; // if selector is an array (e.g., ['img', 'src']),
// extract the attr
- if (Array.isArray(matchingSelector)) {
+
+ if (_Array$isArray(matchingSelector)) {
var _matchingSelector = _slicedToArray(matchingSelector, 2),
- selector = _matchingSelector[0],
- attr = _matchingSelector[1];
+ selector = _matchingSelector[0],
+ attr = _matchingSelector[1];
- result = $(selector).attr(attr).trim();
+ result = $(selector)
+ .attr(attr)
+ .trim();
} else {
var $node = $(matchingSelector);
-
$node = cleanBySelectors($node, $, extractionOpts);
$node = transformElements($node, $, extractionOpts);
-
result = $node.text().trim();
- }
-
- // Allow custom extractor to skip default cleaner
+ } // Allow custom extractor to skip default cleaner
// for this type; defaults to true
+
if (defaultCleaner) {
- return Cleaners[type](result, _extends({}, opts, extractionOpts));
+ return Cleaners[type](result, _objectSpread({}, opts, extractionOpts));
}
return result;
@@ -7477,62 +7302,123 @@ function select(opts) {
function extractResult(opts) {
var type = opts.type,
- extractor = opts.extractor,
- _opts$fallback = opts.fallback,
- fallback = _opts$fallback === undefined ? true : _opts$fallback;
-
+ extractor = opts.extractor,
+ _opts$fallback = opts.fallback,
+ fallback = _opts$fallback === void 0 ? true : _opts$fallback;
+ var result = select(
+ _objectSpread({}, opts, {
+ extractionOpts: extractor[type],
+ })
+ ); // If custom parser succeeds, return the result
- var result = select(_extends({}, opts, { extractionOpts: extractor[type] }));
-
- // If custom parser succeeds, return the result
if (result) {
return result;
- }
-
- // If nothing matches the selector, and fallback is enabled,
+ } // If nothing matches the selector, and fallback is enabled,
// run the Generic extraction
- if (fallback) return GenericExtractor[type](opts);
+ if (fallback) return GenericExtractor[type](opts);
return null;
}
var RootExtractor = {
extract: function extract() {
- var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
- var opts = arguments[1];
+ var extractor =
+ arguments.length > 0 && arguments[0] !== undefined
+ ? arguments[0]
+ : GenericExtractor;
+ var opts = arguments.length > 1 ? arguments[1] : undefined;
var _opts = opts,
- contentOnly = _opts.contentOnly,
- extractedTitle = _opts.extractedTitle;
- // This is the generic extractor. Run its extract method
+ contentOnly = _opts.contentOnly,
+ extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
if (extractor.domain === '*') return extractor.extract(opts);
-
- opts = _extends({}, opts, {
- extractor: extractor
+ opts = _objectSpread({}, opts, {
+ extractor: extractor,
});
if (contentOnly) {
- var _content = extractResult(_extends({}, opts, { type: 'content', extractHtml: true, title: extractedTitle
- }));
+ var _content = extractResult(
+ _objectSpread({}, opts, {
+ type: 'content',
+ extractHtml: true,
+ title: extractedTitle,
+ })
+ );
+
return {
- content: _content
+ content: _content,
};
}
- var title = extractResult(_extends({}, opts, { type: 'title' }));
- var date_published = extractResult(_extends({}, opts, { type: 'date_published' }));
- var author = extractResult(_extends({}, opts, { type: 'author' }));
- var next_page_url = extractResult(_extends({}, opts, { type: 'next_page_url' }));
- var content = extractResult(_extends({}, opts, { type: 'content', extractHtml: true, title: title
- }));
- var lead_image_url = extractResult(_extends({}, opts, { type: 'lead_image_url', content: content }));
- var excerpt = extractResult(_extends({}, opts, { type: 'excerpt', content: content }));
- var dek = extractResult(_extends({}, opts, { type: 'dek', content: content, excerpt: excerpt }));
- var word_count = extractResult(_extends({}, opts, { type: 'word_count', content: content }));
- var direction = extractResult(_extends({}, opts, { type: 'direction', title: title }));
-
- var _ref3 = extractResult(_extends({}, opts, { type: 'url_and_domain' })) || { url: null, domain: null },
- url = _ref3.url,
- domain = _ref3.domain;
+
+ var title = extractResult(
+ _objectSpread({}, opts, {
+ type: 'title',
+ })
+ );
+ var date_published = extractResult(
+ _objectSpread({}, opts, {
+ type: 'date_published',
+ })
+ );
+ var author = extractResult(
+ _objectSpread({}, opts, {
+ type: 'author',
+ })
+ );
+ var next_page_url = extractResult(
+ _objectSpread({}, opts, {
+ type: 'next_page_url',
+ })
+ );
+ var content = extractResult(
+ _objectSpread({}, opts, {
+ type: 'content',
+ extractHtml: true,
+ title: title,
+ })
+ );
+ var lead_image_url = extractResult(
+ _objectSpread({}, opts, {
+ type: 'lead_image_url',
+ content: content,
+ })
+ );
+ var excerpt = extractResult(
+ _objectSpread({}, opts, {
+ type: 'excerpt',
+ content: content,
+ })
+ );
+ var dek = extractResult(
+ _objectSpread({}, opts, {
+ type: 'dek',
+ content: content,
+ excerpt: excerpt,
+ })
+ );
+ var word_count = extractResult(
+ _objectSpread({}, opts, {
+ type: 'word_count',
+ content: content,
+ })
+ );
+ var direction = extractResult(
+ _objectSpread({}, opts, {
+ type: 'direction',
+ title: title,
+ })
+ );
+
+ var _ref3 = extractResult(
+ _objectSpread({}, opts, {
+ type: 'url_and_domain',
+ })
+ ) || {
+ url: null,
+ domain: null,
+ },
+ url = _ref3.url,
+ domain = _ref3.domain;
return {
title: title,
@@ -7546,235 +7432,257 @@ var RootExtractor = {
domain: domain,
excerpt: excerpt,
word_count: word_count,
- direction: direction
+ direction: direction,
};
- }
+ },
};
-var collectAllPages = (function () {
- var _ref = _asyncToGenerator(_regeneratorRuntime.mark(function _callee(_ref2) {
- var next_page_url = _ref2.next_page_url,
- html = _ref2.html,
- $ = _ref2.$,
- metaCache = _ref2.metaCache,
- result = _ref2.result,
- Extractor = _ref2.Extractor,
- title = _ref2.title,
- url = _ref2.url;
- var pages, previousUrls, extractorOpts, nextPageResult, word_count;
- return _regeneratorRuntime.wrap(function _callee$(_context) {
- while (1) {
- switch (_context.prev = _context.next) {
- case 0:
- // At this point, we've fetched just the first page
- pages = 1;
- previousUrls = [removeAnchor(url)];
-
- // If we've gone over 26 pages, something has
- // likely gone wrong.
-
- case 2:
- if (!(next_page_url && pages < 26)) {
- _context.next = 15;
- break;
- }
-
- pages += 1;
- _context.next = 6;
- return Resource.create(next_page_url);
-
- case 6:
- $ = _context.sent;
-
- html = $.html();
-
- extractorOpts = {
- url: next_page_url,
- html: html,
- $: $,
- metaCache: metaCache,
- contentOnly: true,
- extractedTitle: title,
- previousUrls: previousUrls
- };
- nextPageResult = RootExtractor.extract(Extractor, extractorOpts);
-
-
- previousUrls.push(next_page_url);
- result = _extends({}, result, {
- content: result.content + '