release: 1.1.0 (#245)

This commit is contained in:
Adam Pash 2019-02-05 14:53:22 -08:00 committed by GitHub
parent 6844975c94
commit d884c3470c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 20695 additions and 35661 deletions

View File

@ -1,5 +1,30 @@
# Mercury Parser Changelog # Mercury Parser Changelog
### 1.1.0 (Feb 5, 2019)
##### Commits
- [[`6844975c94`](https://github.com/postlight/mercury-parser/commit/6844975c94)] - **feat**: add mercury-parser cli (#244) (Adam Pash)
- [[`7bdbbc8ed8`](https://github.com/postlight/mercury-parser/commit/7bdbbc8ed8)] - **deps**: update dependencies to enable Greenkeeper 🌴 (#243) (greenkeeper[bot])
- [[`e38aff9c17`](https://github.com/postlight/mercury-parser/commit/e38aff9c17)] - **docs**: add npm install instructions (#240) (Adam Pash)
- [[`dc3dff6584`](https://github.com/postlight/mercury-parser/commit/dc3dff6584)] - **docs**: add hero to README (#239) (Gina Trapani)
- [[`15f7fa1e27`](https://github.com/postlight/mercury-parser/commit/15f7fa1e27)] - a more explicit .prettierrc (Adam Pash)
- [[`c6f42c1278`](https://github.com/postlight/mercury-parser/commit/c6f42c1278)] - **docs**: cleanup and update docs (#238) (Adam Pash)
- [[`92de5ce4ed`](https://github.com/postlight/mercury-parser/commit/92de5ce4ed)] - **docs**: remove contributors (github already has this covered) (#237) (Adam Pash)
- [[`2845a1bb7e`](https://github.com/postlight/mercury-parser/commit/2845a1bb7e)] - **docs**: add gitter room text and link (#235) (George Haddad)
- [[`380196b709`](https://github.com/postlight/mercury-parser/commit/380196b709)] - **docs**: change text to include AMP and Reader (#236) (George Haddad)
- [[`33bf5882b9`](https://github.com/postlight/mercury-parser/commit/33bf5882b9)] - **docs**: add mit license badge (#234) (George Haddad)
- [[`5c0325f5a7`](https://github.com/postlight/mercury-parser/commit/5c0325f5a7)] - **feat**: hook up ci to publish to npm (#226) (George Haddad)
- [[`663cc45bf4`](https://github.com/postlight/mercury-parser/commit/663cc45bf4)] - fresh run of prettier; remove NOTES.md (#233) (Adam Pash)
- [[`244d17ddd3`](https://github.com/postlight/mercury-parser/commit/244d17ddd3)] - **fix**: proxy browser in build tests (#232) (Adam Pash)
- [[`0668f5d75b`](https://github.com/postlight/mercury-parser/commit/0668f5d75b)] - **docs**: add instructions for browser usage to parse current page (#231) (Toufic Mouallem)
- [[`4ab50133f4`](https://github.com/postlight/mercury-parser/commit/4ab50133f4)] - **chore**: update node rollup config (#229) (Jad Termsani)
- [[`1ccd14e1e9`](https://github.com/postlight/mercury-parser/commit/1ccd14e1e9)] - **feat**: add fortinet custom parser (#188) (Wajeeh Zantout)
- [[`9b36003b62`](https://github.com/postlight/mercury-parser/commit/9b36003b62)] - **feat**: add fastcompany custom parser (#191) (Wajeeh Zantout)
- [[`199fe70b03`](https://github.com/postlight/mercury-parser/commit/199fe70b03)] - Docs contributors (#227) (Ralph Jbeily)
- [[`9756e6ee67`](https://github.com/postlight/mercury-parser/commit/9756e6ee67)] - **docs**: update mercury parser installation (#228) (Ralph Jbeily)
- [[`1c7ae48de0`](https://github.com/postlight/mercury-parser/commit/1c7ae48de0)] - **dx**: include test results in comment (#230) (Adam Pash)
### 1.0.13 (Oct 11, 2018) ### 1.0.13 (Oct 11, 2018)
##### Commits ##### Commits

2
cli.js
View File

@ -1,5 +1,5 @@
#!/usr/bin/env node #!/usr/bin/env node
/* eslint-disable no-multi-str */ /* eslint-disable */
const Mercury = require('./dist/mercury'); const Mercury = require('./dist/mercury');

460
dist/mercury.js vendored
View File

@ -38,6 +38,7 @@ var _defineProperty = _interopDefault(
var _parseFloat = _interopDefault( var _parseFloat = _interopDefault(
require('@babel/runtime-corejs2/core-js/parse-float') require('@babel/runtime-corejs2/core-js/parse-float')
); );
var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof')); var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
var _getIterator = _interopDefault( var _getIterator = _interopDefault(
require('@babel/runtime-corejs2/core-js/get-iterator') require('@babel/runtime-corejs2/core-js/get-iterator')
@ -391,20 +392,28 @@ function _fetchResource() {
switch ((_context.prev = _context.next)) { switch ((_context.prev = _context.next)) {
case 0: case 0:
parsedUrl = parsedUrl || URL.parse(encodeURI(url)); parsedUrl = parsedUrl || URL.parse(encodeURI(url));
options = { options = _objectSpread(
url: parsedUrl.href, {
headers: _objectSpread({}, REQUEST_HEADERS), url: parsedUrl.href,
timeout: FETCH_TIMEOUT, headers: _objectSpread({}, REQUEST_HEADERS),
// Accept cookies timeout: FETCH_TIMEOUT,
jar: true, // Accept cookies
// Set to null so the response returns as binary and body as buffer jar: true,
// https://github.com/request/request#requestoptions-callback // Set to null so the response returns as binary and body as buffer
encoding: null, // https://github.com/request/request#requestoptions-callback
// Accept and decode gzip encoding: null,
gzip: true, // Accept and decode gzip
// Follow any redirect gzip: true,
followAllRedirects: true, // Follow any non-GET redirects
}; followAllRedirects: true,
},
typeof window !== 'undefined'
? {}
: {
// Follow GET redirects; this option is for Node only
followRedirect: true,
}
);
_context.next = 4; _context.next = 4;
return get(options); return get(options);
@ -803,8 +812,7 @@ function brsToPs$$1($) {
collapsing = true; collapsing = true;
$element.remove(); $element.remove();
} else if (collapsing) { } else if (collapsing) {
collapsing = false; // $(element).replaceWith('<p />') collapsing = false;
paragraphize(element, $, true); paragraphize(element, $, true);
} }
}); });
@ -899,7 +907,7 @@ function convertNodeTo$$1($node, $) {
return $; return $;
} }
var attrs = getAttrs(node) || {}; // console.log(attrs) var attrs = getAttrs(node) || {};
var attribString = _Reflect$ownKeys(attrs) var attribString = _Reflect$ownKeys(attrs)
.map(function(key) { .map(function(key) {
@ -1039,12 +1047,7 @@ function removeAllButWhitelist($article, $) {
$('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS); $('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS);
return $article; return $article;
} // function removeAttrs(article, $) { } // Remove attributes like style or align
// REMOVE_ATTRS.forEach((attr) => {
// $(`[${attr}]`, article).removeAttr(attr);
// });
// }
// Remove attributes like style or align
function cleanAttributes$$1($article, $) { function cleanAttributes$$1($article, $) {
// Grabbing the parent because at this point // Grabbing the parent because at this point
@ -1709,13 +1712,43 @@ function rewriteTopLevel$$1(article, $) {
} }
function absolutize($, rootUrl, attr, $content) { function absolutize($, rootUrl, attr, $content) {
var baseUrl = $('base').attr('href');
$('['.concat(attr, ']'), $content).each(function(_, node) { $('['.concat(attr, ']'), $content).each(function(_, node) {
var attrs = getAttrs(node); var attrs = getAttrs(node);
var url = attrs[attr]; var url = attrs[attr];
var absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
setAttr(node, attr, absoluteUrl);
});
}
if (url) { function absolutizeSet($, rootUrl, $content) {
var absoluteUrl = URL.resolve(rootUrl, url); $('[srcset]', $content).each(function(_, node) {
setAttr(node, attr, absoluteUrl); var attrs = getAttrs(node);
var urlSet = attrs.srcset;
if (urlSet) {
// a comma should be considered part of the candidate URL unless preceded by a descriptor
// descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
// space characters inside the URL should be encoded (%20 or +)
var candidates = urlSet.match(
/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g
);
var absoluteCandidates = candidates.map(function(candidate) {
// a candidate URL cannot start or end with a comma
// descriptors are separated from the URLs by unescaped whitespace
var parts = candidate
.trim()
.replace(/,$/, '')
.split(/\s+/);
parts[0] = URL.resolve(rootUrl, parts[0]);
return parts.join(' ');
});
var absoluteUrlSet = _toConsumableArray(
new _Set(absoluteCandidates)
).join(', ');
setAttr(node, 'srcset', absoluteUrlSet);
} }
}); });
} }
@ -1724,6 +1757,7 @@ function makeLinksAbsolute$$1($content, $, url) {
['href', 'src'].forEach(function(attr) { ['href', 'src'].forEach(function(attr) {
return absolutize($, url, attr, $content); return absolutize($, url, attr, $content);
}); });
absolutizeSet($, url, $content);
return $content; return $content;
} }
@ -2027,12 +2061,14 @@ var Resource = {
// :param response: If set, use as the response rather than // :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a // attempting to fetch it ourselves. Expects a
// string. // string.
create: function create(url, preparedResponse, parsedUrl) { create: (function() {
var _this = this; var _create = _asyncToGenerator(
return _asyncToGenerator(
/*#__PURE__*/ /*#__PURE__*/
_regeneratorRuntime.mark(function _callee() { _regeneratorRuntime.mark(function _callee(
url,
preparedResponse,
parsedUrl
) {
var result, validResponse; var result, validResponse;
return _regeneratorRuntime.wrap( return _regeneratorRuntime.wrap(
function _callee$(_context) { function _callee$(_context) {
@ -2076,7 +2112,7 @@ var Resource = {
return _context.abrupt('return', result); return _context.abrupt('return', result);
case 11: case 11:
return _context.abrupt('return', _this.generateDoc(result)); return _context.abrupt('return', this.generateDoc(result));
case 12: case 12:
case 'end': case 'end':
@ -2088,8 +2124,14 @@ var Resource = {
this this
); );
}) })
)(); );
},
function create(_x, _x2, _x3) {
return _create.apply(this, arguments);
}
return create;
})(),
generateDoc: function generateDoc(_ref) { generateDoc: function generateDoc(_ref) {
var content = _ref.body, var content = _ref.body,
response = _ref.response; response = _ref.response;
@ -2301,16 +2343,7 @@ var NYTimesExtractor = {
selectors: ['div.g-blocks', 'article#story'], selectors: ['div.g-blocks', 'article#story'],
transforms: { transforms: {
'img.g-lazy': function imgGLazy($node) { 'img.g-lazy': function imgGLazy($node) {
var src = $node.attr('src'); // const widths = $node.attr('data-widths') var src = $node.attr('src');
// .slice(1)
// .slice(0, -1)
// .split(',');
// if (widths.length) {
// width = widths.slice(-1);
// } else {
// width = '900';
// }
var width = 640; var width = 640;
src = src.replace('{{size}}', width); src = src.replace('{{size}}', width);
$node.attr('src', src); $node.attr('src', src);
@ -2944,10 +2977,10 @@ var WwwWashingtonpostComExtractor = {
selectors: ['h1', '#topper-headline-wrapper'], selectors: ['h1', '#topper-headline-wrapper'],
}, },
author: { author: {
selectors: ['.pb-byline'], selectors: ['.pb-author-name'],
}, },
date_published: { date_published: {
selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']], selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']],
}, },
dek: { dek: {
selectors: [], selectors: [],
@ -3002,12 +3035,7 @@ var WwwHuffingtonpostComExtractor = {
defaultCleaner: false, defaultCleaner: false,
// Is there anything in the content you selected that needs transformed // Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images // before it's consumable content? E.g., unusual lazy loaded images
transforms: { transforms: {},
// 'div.top-media': ($node) => {
// const $figure = $node.children('figure');
// $node.replaceWith($figure);
// },
},
// Is there anything that is in the result that shouldn't be? // Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from // The clean selectors will remove anything that matches from
// the result // the result
@ -5065,10 +5093,7 @@ var WwwProspectmagazineCoUkExtractor = {
selectors: [['meta[name="og:image"]', 'value']], selectors: [['meta[name="og:image"]', 'value']],
}, },
content: { content: {
selectors: [ selectors: ['article .post_content'],
// ['article.type-post div.post_content p'],
'article .post_content',
],
// Is there anything in the content you selected that needs transformed // Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images // before it's consumable content? E.g., unusual lazy loaded images
transforms: {}, transforms: {},
@ -5290,6 +5315,60 @@ var IciRadioCanadaCaExtractor = {
}, },
}; };
// Custom extractor configuration for pages served from www.fortinet.com.
// Each field lists the selectors tried for that piece of article metadata.
// NOTE(review): two-element entries such as ['meta[...]', 'value'] look like
// [selector, attribute-to-read] pairs — confirm against the selector runner.
var WwwFortinetComExtractor = {
  domain: 'www.fortinet.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: ['.b15-blog-meta__author'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12',
    ],

    transforms: {
      // Decides what to do with a <noscript> node found inside the content.
      //
      // :param $node: wrapped <noscript> element; must expose children().
      // :returns: 'figure' when the node has exactly one child and that child
      //           is an <img>, otherwise null.
      // NOTE(review): presumably a returned tag name tells the caller to
      // convert the node to that tag — confirm against the transform runner.
      noscript: function noscript($node) {
        var $children = $node.children();

        if ($children.length !== 1) {
          return null;
        }

        // tagName is lowercase when parsed by cheerio but uppercase on real
        // DOM elements in a browser, so compare case-insensitively. Also
        // tolerate a child that carries no tagName at all.
        var tagName = $children.get(0).tagName;
        var isImage =
          typeof tagName === 'string' && tagName.toLowerCase() === 'img';

        return isImage ? 'figure' : null;
      },
    },
  },
};
// Custom extractor configuration for pages served from www.fastcompany.com.
// Purely declarative: every field only lists the selectors to try.
// NOTE(review): two-element entries such as ['meta[...]', 'value'] look like
// [selector, attribute-to-read] pairs — confirm against the selector runner.
var WwwFastcompanyComExtractor = {
  domain: 'www.fastcompany.com',

  // Headline and byline.
  title: { selectors: ['h1'] },
  author: { selectors: ['.post__by'] },

  // Publication date and lead image are both read from <meta> tags.
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  // Short summary shown under the headline.
  dek: { selectors: ['.post__deck'] },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  // Article body container.
  content: { selectors: ['.post__article'] },
};
var CustomExtractors = /*#__PURE__*/ Object.freeze({ var CustomExtractors = /*#__PURE__*/ Object.freeze({
BloggerExtractor: BloggerExtractor, BloggerExtractor: BloggerExtractor,
NYMagExtractor: NYMagExtractor, NYMagExtractor: NYMagExtractor,
@ -5382,6 +5461,8 @@ var CustomExtractors = /*#__PURE__*/ Object.freeze({
WwwFoolComExtractor: WwwFoolComExtractor, WwwFoolComExtractor: WwwFoolComExtractor,
WwwSlateComExtractor: WwwSlateComExtractor, WwwSlateComExtractor: WwwSlateComExtractor,
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor, IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
WwwFortinetComExtractor: WwwFortinetComExtractor,
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
}); });
var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) { var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
@ -5390,8 +5471,7 @@ var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
}, {}); }, {});
// CLEAN AUTHOR CONSTANTS // CLEAN AUTHOR CONSTANTS
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)', var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // CLEAN DEK CONSTANTS
// CLEAN DEK CONSTANTS
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks. var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
@ -5699,10 +5779,6 @@ var Cleaners = {
// Returns a cheerio object $ // Returns a cheerio object $
function extractBestNode($, opts) { function extractBestNode($, opts) {
// clone the node so we can get back to our
// initial parsed state if needed
// TODO Do I need this? AP
// let $root = $.root().clone()
if (opts.stripUnlikelyCandidates) { if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($); $ = stripUnlikelyCandidates($);
} }
@ -5813,10 +5889,7 @@ var GenericContentExtractor = {
return null; return null;
} }
return normalizeSpaces($.html(node)); // if return_type == "html": return normalizeSpaces($.html(node));
// return normalize_spaces(node_to_html(node))
// else:
// return node
}, },
}; };
@ -5994,12 +6067,10 @@ var GenericAuthorExtractor = {
!(_iteratorNormalCompletion = (_step = _iterator.next()).done); !(_iteratorNormalCompletion = (_step = _iterator.next()).done);
_iteratorNormalCompletion = true _iteratorNormalCompletion = true
) { ) {
var _ref4 = _step.value; var _step$value = _slicedToArray(_step.value, 2),
selector = _step$value[0],
regex = _step$value[1];
var _ref3 = _slicedToArray(_ref4, 2);
var selector = _ref3[0];
var regex = _ref3[1];
var node = $(selector); var node = $(selector);
if (node.length === 1) { if (node.length === 1) {
@ -6078,11 +6149,8 @@ var DATE_PUBLISHED_SELECTORS = [
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'; var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
var DATE_PUBLISHED_URL_RES = [ var DATE_PUBLISHED_URL_RES = [
// /2012/01/27/ but not /2012/01/293 new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733 new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
// 2012-01-27
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/
new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'), new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'),
]; ];
@ -6113,50 +6181,15 @@ var GenericDatePublishedExtractor = {
}, },
}; };
// import {
// DEK_META_TAGS,
// DEK_SELECTORS,
// DEK_URL_RES,
// } from './constants';
// import { cleanDek } from 'cleaners';
// import {
// extractFromMeta,
// extractFromSelectors,
// } from 'utils/dom';
// Currently there is only one selector for // Currently there is only one selector for
// deks. We should simply return null here // deks. We should simply return null here
// until we have a more robust generic option. // until we have a more robust generic option.
// Below is the original source for this, for reference. // Below is the original source for this, for reference.
var GenericDekExtractor = { var GenericDekExtractor = {
// extract({ $, content, metaCache }) {
extract: function extract() { extract: function extract() {
return null; return null;
}, },
}; };
// # First, check to see if we have a matching meta tag that we can make
// # use of.
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
// if not dek:
// # Second, look through our CSS/XPath selectors. This may return
// # an HTML fragment.
// dek = self.extract_from_selectors('dek',
// constants.DEK_SELECTORS,
// text_only=False)
//
// if dek:
// # Make sure our dek isn't in the first few thousand characters
// # of the content, otherwise it's just the start of the article
// # and not a true dek.
// content = self.extract_content()
// content_chunk = normalize_spaces(strip_tags(content[:2000]))
// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
//
// # 80% or greater similarity means the dek was very similar to some
// # of the starting content, so we skip it.
// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
// return dek
//
// return None
// An ordered list of meta tag names that denote likely article leading images. // An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching. // All attributes should be lowercase for faster case-insensitive matching.
@ -6443,159 +6476,6 @@ var GenericLeadImageUrlExtractor = {
return null; return null;
}, },
}; };
// """
// # First, try to find the "best" image via the content.
// # We'd rather not have to fetch each image and check dimensions,
// # so try to do some analysis and determine them instead.
// content = self.extractor.extract_content(return_type="node")
// imgs = content.xpath('.//img')
// img_scores = defaultdict(int)
// logger.debug('Scoring %d images from content', len(imgs))
// for (i, img) in enumerate(imgs):
// img_score = 0
//
// if not 'src' in img.attrib:
// logger.debug('No src attribute found')
// continue
//
// try:
// parsed_img = urlparse(img.attrib['src'])
// img_path = parsed_img.path.lower()
// except ValueError:
// logger.debug('ValueError getting img path.')
// continue
// logger.debug('Image path is %s', img_path)
//
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Positive URL hints match. Adding 20.')
// img_score += 20
//
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Negative URL hints match. Subtracting 20.')
// img_score -= 20
//
// # Gifs are more often structure than photos
// if img_path.endswith('gif'):
// logger.debug('gif found. Subtracting 10.')
// img_score -= 10
//
// # JPGs are more often photographs
// if img_path.endswith('jpg'):
// logger.debug('jpg found. Adding 10.')
// img_score += 10
//
// # PNGs are neutral.
//
// # Alt attribute usually means non-presentational image.
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
// logger.debug('alt attribute found. Adding 5.')
// img_score += 5
//
// # Look through our parent and grandparent for figure-like
// # container elements, give a bonus if we find them
// parents = [img.getparent()]
// if parents[0] is not None and parents[0].getparent() is not None:
// parents.append(parents[0].getparent())
// for p in parents:
// if p.tag == 'figure':
// logger.debug('Parent with <figure> tag found. Adding 25.')
// img_score += 25
//
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
// if constants.PHOTO_HINTS_RE.search(p_sig):
// logger.debug('Photo hints regex match. Adding 15.')
// img_score += 15
//
// # Look at our immediate sibling and see if it looks like it's a
// # caption. Bonus if so.
// sibling = img.getnext()
// if sibling is not None:
// if sibling.tag == 'figcaption':
// img_score += 25
//
// sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower()
// if 'caption' in sib_sig:
// img_score += 15
//
// # Pull out width/height if they were set.
// img_width = None
// img_height = None
// if 'width' in img.attrib:
// try:
// img_width = float(img.get('width'))
// except ValueError:
// pass
// if 'height' in img.attrib:
// try:
// img_height = float(img.get('height'))
// except ValueError:
// pass
//
// # Penalty for skinny images
// if img_width and img_width <= 50:
// logger.debug('Skinny image found. Subtracting 50.')
// img_score -= 50
//
// # Penalty for short images
// if img_height and img_height <= 50:
// # Wide, short images are more common than narrow, tall ones
// logger.debug('Short image found. Subtracting 25.')
// img_score -= 25
//
// if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height
//
// if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100
// else:
// img_score += round(area/1000.0)
//
// # If the image is higher on the page than other images,
// # it gets a bonus. Penalty if lower.
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
// img_score += len(imgs)/2 - i
//
// # Use the raw src here because we munged img_path for case
// # insensitivity
// logger.debug('Final score is %d.', img_score)
// img_scores[img.attrib['src']] += img_score
//
// top_score = 0
// top_url = None
// for (url, score) in img_scores.items():
// if score > top_score:
// top_url = url
// top_score = score
//
// if top_score > 0:
// logger.debug('Using top score image from content. Score was %d', top_score)
// return top_url
//
//
// # If nothing else worked, check to see if there are any really
// # probable nodes in the doc, like <link rel="image_src" />.
// logger.debug('Trying to find lead image in probable nodes')
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
// nodes = self.resource.extract_by_selector(selector)
// for node in nodes:
// clean_value = None
// if node.attrib.get('src'):
// clean_value = self.clean(node.attrib['src'])
//
// if not clean_value and node.attrib.get('href'):
// clean_value = self.clean(node.attrib['href'])
//
// if not clean_value and node.attrib.get('value'):
// clean_value = self.clean(node.attrib['value'])
//
// if clean_value:
// logger.debug('Found lead image in probable nodes.')
// logger.debug('Node was: %s', node)
// return clean_value
//
// return None
function scoreSimilarity(score, articleUrl, href) { function scoreSimilarity(score, articleUrl, href) {
// Do this last and only if we have a real candidate, because it's // Do this last and only if we have a real candidate, because it's
@ -7543,13 +7423,12 @@ function _collectAllPages() {
} }
var Mercury = { var Mercury = {
parse: function parse(url, html) { parse: (function() {
var opts = var _parse = _asyncToGenerator(
arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
return _asyncToGenerator(
/*#__PURE__*/ /*#__PURE__*/
_regeneratorRuntime.mark(function _callee() { _regeneratorRuntime.mark(function _callee(url, html) {
var _opts$fetchAllPages, var opts,
_opts$fetchAllPages,
fetchAllPages, fetchAllPages,
_opts$fallback, _opts$fallback,
fallback, fallback,
@ -7560,13 +7439,16 @@ var Mercury = {
result, result,
_result, _result,
title, title,
next_page_url; next_page_url,
_args = arguments;
return _regeneratorRuntime.wrap( return _regeneratorRuntime.wrap(
function _callee$(_context) { function _callee$(_context) {
while (1) { while (1) {
switch ((_context.prev = _context.next)) { switch ((_context.prev = _context.next)) {
case 0: case 0:
opts =
_args.length > 2 && _args[2] !== undefined ? _args[2] : {};
(_opts$fetchAllPages = opts.fetchAllPages), (_opts$fetchAllPages = opts.fetchAllPages),
(fetchAllPages = (fetchAllPages =
_opts$fetchAllPages === void 0 _opts$fetchAllPages === void 0
@ -7587,29 +7469,29 @@ var Mercury = {
parsedUrl = URL.parse(url); parsedUrl = URL.parse(url);
if (validateUrl(parsedUrl)) { if (validateUrl(parsedUrl)) {
_context.next = 5; _context.next = 6;
break; break;
} }
return _context.abrupt('return', Errors.badUrl); return _context.abrupt('return', Errors.badUrl);
case 5: case 6:
_context.next = 7; _context.next = 8;
return Resource.create(url, html, parsedUrl); return Resource.create(url, html, parsedUrl);
case 7: case 8:
$ = _context.sent; $ = _context.sent;
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`); Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
// If we found an error creating the resource, return that error // If we found an error creating the resource, return that error
if (!$.failed) { if (!$.failed) {
_context.next = 11; _context.next = 12;
break; break;
} }
return _context.abrupt('return', $); return _context.abrupt('return', $);
case 11: case 12:
// if html still has not been set (i.e., url passed to Mercury.parse), // if html still has not been set (i.e., url passed to Mercury.parse),
// set html from the response of Resource.create // set html from the response of Resource.create
if (!html) { if (!html) {
@ -7635,11 +7517,11 @@ var Mercury = {
(next_page_url = _result.next_page_url); // Fetch more pages if next_page_url found (next_page_url = _result.next_page_url); // Fetch more pages if next_page_url found
if (!(fetchAllPages && next_page_url)) { if (!(fetchAllPages && next_page_url)) {
_context.next = 21; _context.next = 22;
break; break;
} }
_context.next = 18; _context.next = 19;
return collectAllPages({ return collectAllPages({
Extractor: Extractor, Extractor: Extractor,
next_page_url: next_page_url, next_page_url: next_page_url,
@ -7651,21 +7533,21 @@ var Mercury = {
url: url, url: url,
}); });
case 18: case 19:
result = _context.sent; result = _context.sent;
_context.next = 22; _context.next = 23;
break; break;
case 21: case 22:
result = _objectSpread({}, result, { result = _objectSpread({}, result, {
total_pages: 1, total_pages: 1,
rendered_pages: 1, rendered_pages: 1,
}); });
case 22: case 23:
return _context.abrupt('return', result); return _context.abrupt('return', result);
case 23: case 24:
case 'end': case 'end':
return _context.stop(); return _context.stop();
} }
@ -7675,8 +7557,14 @@ var Mercury = {
this this
); );
}) })
)(); );
},
function parse(_x, _x2) {
return _parse.apply(this, arguments);
}
return parse;
})(),
browser: !!cheerio.browser, browser: !!cheerio.browser,
// A convenience method for getting a resource // A convenience method for getting a resource
// to work with, e.g., for custom extractor generator // to work with, e.g., for custom extractor generator

55867
dist/mercury.web.js vendored

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
{ {
"name": "@postlight/mercury-parser", "name": "@postlight/mercury-parser",
"version": "1.0.13", "version": "1.1.0",
"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.", "description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
"author": "Postlight <mercury@postlight.com>", "author": "Postlight <mercury@postlight.com>",
"homepage": "https://mercury.postlight.com", "homepage": "https://mercury.postlight.com",