mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-15 06:12:48 +00:00
release: 1.1.0 (#245)
This commit is contained in:
parent
6844975c94
commit
d884c3470c
25
CHANGELOG.md
25
CHANGELOG.md
@ -1,5 +1,30 @@
|
|||||||
# Mercury Parser Changelog
|
# Mercury Parser Changelog
|
||||||
|
|
||||||
|
### 1.1.0 (Feb 5, 2019)
|
||||||
|
|
||||||
|
##### Commits
|
||||||
|
|
||||||
|
- [[`6844975c94`](https://github.com/postlight/mercury-parser/commit/6844975c94)] - **feat**: add mercury-parser cli (#244) (Adam Pash)
|
||||||
|
- [[`7bdbbc8ed8`](https://github.com/postlight/mercury-parser/commit/7bdbbc8ed8)] - **deps**: update dependencies to enable Greenkeeper 🌴 (#243) (greenkeeper[bot])
|
||||||
|
- [[`e38aff9c17`](https://github.com/postlight/mercury-parser/commit/e38aff9c17)] - **docs**: add npm install instructions (#240) (Adam Pash)
|
||||||
|
- [[`dc3dff6584`](https://github.com/postlight/mercury-parser/commit/dc3dff6584)] - **docs**: add hero to README (#239) (Gina Trapani)
|
||||||
|
- [[`15f7fa1e27`](https://github.com/postlight/mercury-parser/commit/15f7fa1e27)] - a more explicit .prettierrc (Adam Pash)
|
||||||
|
- [[`c6f42c1278`](https://github.com/postlight/mercury-parser/commit/c6f42c1278)] - **docs**: cleanup and update docs (#238) (Adam Pash)
|
||||||
|
- [[`92de5ce4ed`](https://github.com/postlight/mercury-parser/commit/92de5ce4ed)] - **docs**: remove contributors (github already has this covered) (#237) (Adam Pash)
|
||||||
|
- [[`2845a1bb7e`](https://github.com/postlight/mercury-parser/commit/2845a1bb7e)] - **docs**: add gitter room text and link (#235) (George Haddad)
|
||||||
|
- [[`380196b709`](https://github.com/postlight/mercury-parser/commit/380196b709)] - **docs**: change text to include AMP and Reader (#236) (George Haddad)
|
||||||
|
- [[`33bf5882b9`](https://github.com/postlight/mercury-parser/commit/33bf5882b9)] - **docs**: add mit license badge (#234) (George Haddad)
|
||||||
|
- [[`5c0325f5a7`](https://github.com/postlight/mercury-parser/commit/5c0325f5a7)] - **feat**: hook up ci to publish to npm (#226) (George Haddad)
|
||||||
|
- [[`663cc45bf4`](https://github.com/postlight/mercury-parser/commit/663cc45bf4)] - fresh run of prettier; remove NOTES.md (#233) (Adam Pash)
|
||||||
|
- [[`244d17ddd3`](https://github.com/postlight/mercury-parser/commit/244d17ddd3)] - **fix**: proxy browser in build tests (#232) (Adam Pash)
|
||||||
|
- [[`0668f5d75b`](https://github.com/postlight/mercury-parser/commit/0668f5d75b)] - **docs**: add instructions for browser usage to parse current page (#231) (Toufic Mouallem)
|
||||||
|
- [[`4ab50133f4`](https://github.com/postlight/mercury-parser/commit/4ab50133f4)] - **chore**: update node rollup config (#229) (Jad Termsani)
|
||||||
|
- [[`1ccd14e1e9`](https://github.com/postlight/mercury-parser/commit/1ccd14e1e9)] - **feat**: add fortinet custom parser (#188) (Wajeeh Zantout)
|
||||||
|
- [[`9b36003b62`](https://github.com/postlight/mercury-parser/commit/9b36003b62)] - **feat**: add fastcompany custom parser (#191) (Wajeeh Zantout)
|
||||||
|
- [[`199fe70b03`](https://github.com/postlight/mercury-parser/commit/199fe70b03)] - Docs contributors (#227) (Ralph Jbeily)
|
||||||
|
- [[`9756e6ee67`](https://github.com/postlight/mercury-parser/commit/9756e6ee67)] - **docs**: update mercury parser installation (#228) (Ralph Jbeily)
|
||||||
|
- [[`1c7ae48de0`](https://github.com/postlight/mercury-parser/commit/1c7ae48de0)] - **dx**: include test results in comment (#230) (Adam Pash)
|
||||||
|
|
||||||
### 1.0.13 (Oct 11, 2018)
|
### 1.0.13 (Oct 11, 2018)
|
||||||
|
|
||||||
##### Commits
|
##### Commits
|
||||||
|
2
cli.js
2
cli.js
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env node
|
#!/usr/bin/env node
|
||||||
/* eslint-disable no-multi-str */
|
/* eslint-disable */
|
||||||
|
|
||||||
const Mercury = require('./dist/mercury');
|
const Mercury = require('./dist/mercury');
|
||||||
|
|
||||||
|
460
dist/mercury.js
vendored
460
dist/mercury.js
vendored
@ -38,6 +38,7 @@ var _defineProperty = _interopDefault(
|
|||||||
var _parseFloat = _interopDefault(
|
var _parseFloat = _interopDefault(
|
||||||
require('@babel/runtime-corejs2/core-js/parse-float')
|
require('@babel/runtime-corejs2/core-js/parse-float')
|
||||||
);
|
);
|
||||||
|
var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
|
||||||
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
|
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
|
||||||
var _getIterator = _interopDefault(
|
var _getIterator = _interopDefault(
|
||||||
require('@babel/runtime-corejs2/core-js/get-iterator')
|
require('@babel/runtime-corejs2/core-js/get-iterator')
|
||||||
@ -391,20 +392,28 @@ function _fetchResource() {
|
|||||||
switch ((_context.prev = _context.next)) {
|
switch ((_context.prev = _context.next)) {
|
||||||
case 0:
|
case 0:
|
||||||
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
|
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
|
||||||
options = {
|
options = _objectSpread(
|
||||||
url: parsedUrl.href,
|
{
|
||||||
headers: _objectSpread({}, REQUEST_HEADERS),
|
url: parsedUrl.href,
|
||||||
timeout: FETCH_TIMEOUT,
|
headers: _objectSpread({}, REQUEST_HEADERS),
|
||||||
// Accept cookies
|
timeout: FETCH_TIMEOUT,
|
||||||
jar: true,
|
// Accept cookies
|
||||||
// Set to null so the response returns as binary and body as buffer
|
jar: true,
|
||||||
// https://github.com/request/request#requestoptions-callback
|
// Set to null so the response returns as binary and body as buffer
|
||||||
encoding: null,
|
// https://github.com/request/request#requestoptions-callback
|
||||||
// Accept and decode gzip
|
encoding: null,
|
||||||
gzip: true,
|
// Accept and decode gzip
|
||||||
// Follow any redirect
|
gzip: true,
|
||||||
followAllRedirects: true,
|
// Follow any non-GET redirects
|
||||||
};
|
followAllRedirects: true,
|
||||||
|
},
|
||||||
|
typeof window !== 'undefined'
|
||||||
|
? {}
|
||||||
|
: {
|
||||||
|
// Follow GET redirects; this option is for Node only
|
||||||
|
followRedirect: true,
|
||||||
|
}
|
||||||
|
);
|
||||||
_context.next = 4;
|
_context.next = 4;
|
||||||
return get(options);
|
return get(options);
|
||||||
|
|
||||||
@ -803,8 +812,7 @@ function brsToPs$$1($) {
|
|||||||
collapsing = true;
|
collapsing = true;
|
||||||
$element.remove();
|
$element.remove();
|
||||||
} else if (collapsing) {
|
} else if (collapsing) {
|
||||||
collapsing = false; // $(element).replaceWith('<p />')
|
collapsing = false;
|
||||||
|
|
||||||
paragraphize(element, $, true);
|
paragraphize(element, $, true);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -899,7 +907,7 @@ function convertNodeTo$$1($node, $) {
|
|||||||
return $;
|
return $;
|
||||||
}
|
}
|
||||||
|
|
||||||
var attrs = getAttrs(node) || {}; // console.log(attrs)
|
var attrs = getAttrs(node) || {};
|
||||||
|
|
||||||
var attribString = _Reflect$ownKeys(attrs)
|
var attribString = _Reflect$ownKeys(attrs)
|
||||||
.map(function(key) {
|
.map(function(key) {
|
||||||
@ -1039,12 +1047,7 @@ function removeAllButWhitelist($article, $) {
|
|||||||
|
|
||||||
$('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS);
|
$('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS);
|
||||||
return $article;
|
return $article;
|
||||||
} // function removeAttrs(article, $) {
|
} // Remove attributes like style or align
|
||||||
// REMOVE_ATTRS.forEach((attr) => {
|
|
||||||
// $(`[${attr}]`, article).removeAttr(attr);
|
|
||||||
// });
|
|
||||||
// }
|
|
||||||
// Remove attributes like style or align
|
|
||||||
|
|
||||||
function cleanAttributes$$1($article, $) {
|
function cleanAttributes$$1($article, $) {
|
||||||
// Grabbing the parent because at this point
|
// Grabbing the parent because at this point
|
||||||
@ -1709,13 +1712,43 @@ function rewriteTopLevel$$1(article, $) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function absolutize($, rootUrl, attr, $content) {
|
function absolutize($, rootUrl, attr, $content) {
|
||||||
|
var baseUrl = $('base').attr('href');
|
||||||
$('['.concat(attr, ']'), $content).each(function(_, node) {
|
$('['.concat(attr, ']'), $content).each(function(_, node) {
|
||||||
var attrs = getAttrs(node);
|
var attrs = getAttrs(node);
|
||||||
var url = attrs[attr];
|
var url = attrs[attr];
|
||||||
|
var absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
|
||||||
|
setAttr(node, attr, absoluteUrl);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (url) {
|
function absolutizeSet($, rootUrl, $content) {
|
||||||
var absoluteUrl = URL.resolve(rootUrl, url);
|
$('[srcset]', $content).each(function(_, node) {
|
||||||
setAttr(node, attr, absoluteUrl);
|
var attrs = getAttrs(node);
|
||||||
|
var urlSet = attrs.srcset;
|
||||||
|
|
||||||
|
if (urlSet) {
|
||||||
|
// a comma should be considered part of the candidate URL unless preceded by a descriptor
|
||||||
|
// descriptors can only contain positive numbers followed immediately by either 'w' or 'x'
|
||||||
|
// space characters inside the URL should be encoded (%20 or +)
|
||||||
|
var candidates = urlSet.match(
|
||||||
|
/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g
|
||||||
|
);
|
||||||
|
var absoluteCandidates = candidates.map(function(candidate) {
|
||||||
|
// a candidate URL cannot start or end with a comma
|
||||||
|
// descriptors are separated from the URLs by unescaped whitespace
|
||||||
|
var parts = candidate
|
||||||
|
.trim()
|
||||||
|
.replace(/,$/, '')
|
||||||
|
.split(/\s+/);
|
||||||
|
parts[0] = URL.resolve(rootUrl, parts[0]);
|
||||||
|
return parts.join(' ');
|
||||||
|
});
|
||||||
|
|
||||||
|
var absoluteUrlSet = _toConsumableArray(
|
||||||
|
new _Set(absoluteCandidates)
|
||||||
|
).join(', ');
|
||||||
|
|
||||||
|
setAttr(node, 'srcset', absoluteUrlSet);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -1724,6 +1757,7 @@ function makeLinksAbsolute$$1($content, $, url) {
|
|||||||
['href', 'src'].forEach(function(attr) {
|
['href', 'src'].forEach(function(attr) {
|
||||||
return absolutize($, url, attr, $content);
|
return absolutize($, url, attr, $content);
|
||||||
});
|
});
|
||||||
|
absolutizeSet($, url, $content);
|
||||||
return $content;
|
return $content;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2027,12 +2061,14 @@ var Resource = {
|
|||||||
// :param response: If set, use as the response rather than
|
// :param response: If set, use as the response rather than
|
||||||
// attempting to fetch it ourselves. Expects a
|
// attempting to fetch it ourselves. Expects a
|
||||||
// string.
|
// string.
|
||||||
create: function create(url, preparedResponse, parsedUrl) {
|
create: (function() {
|
||||||
var _this = this;
|
var _create = _asyncToGenerator(
|
||||||
|
|
||||||
return _asyncToGenerator(
|
|
||||||
/*#__PURE__*/
|
/*#__PURE__*/
|
||||||
_regeneratorRuntime.mark(function _callee() {
|
_regeneratorRuntime.mark(function _callee(
|
||||||
|
url,
|
||||||
|
preparedResponse,
|
||||||
|
parsedUrl
|
||||||
|
) {
|
||||||
var result, validResponse;
|
var result, validResponse;
|
||||||
return _regeneratorRuntime.wrap(
|
return _regeneratorRuntime.wrap(
|
||||||
function _callee$(_context) {
|
function _callee$(_context) {
|
||||||
@ -2076,7 +2112,7 @@ var Resource = {
|
|||||||
return _context.abrupt('return', result);
|
return _context.abrupt('return', result);
|
||||||
|
|
||||||
case 11:
|
case 11:
|
||||||
return _context.abrupt('return', _this.generateDoc(result));
|
return _context.abrupt('return', this.generateDoc(result));
|
||||||
|
|
||||||
case 12:
|
case 12:
|
||||||
case 'end':
|
case 'end':
|
||||||
@ -2088,8 +2124,14 @@ var Resource = {
|
|||||||
this
|
this
|
||||||
);
|
);
|
||||||
})
|
})
|
||||||
)();
|
);
|
||||||
},
|
|
||||||
|
function create(_x, _x2, _x3) {
|
||||||
|
return _create.apply(this, arguments);
|
||||||
|
}
|
||||||
|
|
||||||
|
return create;
|
||||||
|
})(),
|
||||||
generateDoc: function generateDoc(_ref) {
|
generateDoc: function generateDoc(_ref) {
|
||||||
var content = _ref.body,
|
var content = _ref.body,
|
||||||
response = _ref.response;
|
response = _ref.response;
|
||||||
@ -2301,16 +2343,7 @@ var NYTimesExtractor = {
|
|||||||
selectors: ['div.g-blocks', 'article#story'],
|
selectors: ['div.g-blocks', 'article#story'],
|
||||||
transforms: {
|
transforms: {
|
||||||
'img.g-lazy': function imgGLazy($node) {
|
'img.g-lazy': function imgGLazy($node) {
|
||||||
var src = $node.attr('src'); // const widths = $node.attr('data-widths')
|
var src = $node.attr('src');
|
||||||
// .slice(1)
|
|
||||||
// .slice(0, -1)
|
|
||||||
// .split(',');
|
|
||||||
// if (widths.length) {
|
|
||||||
// width = widths.slice(-1);
|
|
||||||
// } else {
|
|
||||||
// width = '900';
|
|
||||||
// }
|
|
||||||
|
|
||||||
var width = 640;
|
var width = 640;
|
||||||
src = src.replace('{{size}}', width);
|
src = src.replace('{{size}}', width);
|
||||||
$node.attr('src', src);
|
$node.attr('src', src);
|
||||||
@ -2944,10 +2977,10 @@ var WwwWashingtonpostComExtractor = {
|
|||||||
selectors: ['h1', '#topper-headline-wrapper'],
|
selectors: ['h1', '#topper-headline-wrapper'],
|
||||||
},
|
},
|
||||||
author: {
|
author: {
|
||||||
selectors: ['.pb-byline'],
|
selectors: ['.pb-author-name'],
|
||||||
},
|
},
|
||||||
date_published: {
|
date_published: {
|
||||||
selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']],
|
selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']],
|
||||||
},
|
},
|
||||||
dek: {
|
dek: {
|
||||||
selectors: [],
|
selectors: [],
|
||||||
@ -3002,12 +3035,7 @@ var WwwHuffingtonpostComExtractor = {
|
|||||||
defaultCleaner: false,
|
defaultCleaner: false,
|
||||||
// Is there anything in the content you selected that needs transformed
|
// Is there anything in the content you selected that needs transformed
|
||||||
// before it's consumable content? E.g., unusual lazy loaded images
|
// before it's consumable content? E.g., unusual lazy loaded images
|
||||||
transforms: {
|
transforms: {},
|
||||||
// 'div.top-media': ($node) => {
|
|
||||||
// const $figure = $node.children('figure');
|
|
||||||
// $node.replaceWith($figure);
|
|
||||||
// },
|
|
||||||
},
|
|
||||||
// Is there anything that is in the result that shouldn't be?
|
// Is there anything that is in the result that shouldn't be?
|
||||||
// The clean selectors will remove anything that matches from
|
// The clean selectors will remove anything that matches from
|
||||||
// the result
|
// the result
|
||||||
@ -5065,10 +5093,7 @@ var WwwProspectmagazineCoUkExtractor = {
|
|||||||
selectors: [['meta[name="og:image"]', 'value']],
|
selectors: [['meta[name="og:image"]', 'value']],
|
||||||
},
|
},
|
||||||
content: {
|
content: {
|
||||||
selectors: [
|
selectors: ['article .post_content'],
|
||||||
// ['article.type-post div.post_content p'],
|
|
||||||
'article .post_content',
|
|
||||||
],
|
|
||||||
// Is there anything in the content you selected that needs transformed
|
// Is there anything in the content you selected that needs transformed
|
||||||
// before it's consumable content? E.g., unusual lazy loaded images
|
// before it's consumable content? E.g., unusual lazy loaded images
|
||||||
transforms: {},
|
transforms: {},
|
||||||
@ -5290,6 +5315,60 @@ var IciRadioCanadaCaExtractor = {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
var WwwFortinetComExtractor = {
|
||||||
|
domain: 'www.fortinet.com',
|
||||||
|
title: {
|
||||||
|
selectors: ['h1'],
|
||||||
|
},
|
||||||
|
author: {
|
||||||
|
selectors: ['.b15-blog-meta__author'],
|
||||||
|
},
|
||||||
|
date_published: {
|
||||||
|
selectors: [['meta[name="article:published_time"]', 'value']],
|
||||||
|
},
|
||||||
|
lead_image_url: {
|
||||||
|
selectors: [['meta[name="og:image"]', 'value']],
|
||||||
|
},
|
||||||
|
content: {
|
||||||
|
selectors: [
|
||||||
|
'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12',
|
||||||
|
],
|
||||||
|
transforms: {
|
||||||
|
noscript: function noscript($node) {
|
||||||
|
var $children = $node.children();
|
||||||
|
|
||||||
|
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||||
|
return 'figure';
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
var WwwFastcompanyComExtractor = {
|
||||||
|
domain: 'www.fastcompany.com',
|
||||||
|
title: {
|
||||||
|
selectors: ['h1'],
|
||||||
|
},
|
||||||
|
author: {
|
||||||
|
selectors: ['.post__by'],
|
||||||
|
},
|
||||||
|
date_published: {
|
||||||
|
selectors: [['meta[name="article:published_time"]', 'value']],
|
||||||
|
},
|
||||||
|
dek: {
|
||||||
|
selectors: ['.post__deck'],
|
||||||
|
},
|
||||||
|
lead_image_url: {
|
||||||
|
selectors: [['meta[name="og:image"]', 'value']],
|
||||||
|
},
|
||||||
|
content: {
|
||||||
|
selectors: ['.post__article'],
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
var CustomExtractors = /*#__PURE__*/ Object.freeze({
|
var CustomExtractors = /*#__PURE__*/ Object.freeze({
|
||||||
BloggerExtractor: BloggerExtractor,
|
BloggerExtractor: BloggerExtractor,
|
||||||
NYMagExtractor: NYMagExtractor,
|
NYMagExtractor: NYMagExtractor,
|
||||||
@ -5382,6 +5461,8 @@ var CustomExtractors = /*#__PURE__*/ Object.freeze({
|
|||||||
WwwFoolComExtractor: WwwFoolComExtractor,
|
WwwFoolComExtractor: WwwFoolComExtractor,
|
||||||
WwwSlateComExtractor: WwwSlateComExtractor,
|
WwwSlateComExtractor: WwwSlateComExtractor,
|
||||||
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
|
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
|
||||||
|
WwwFortinetComExtractor: WwwFortinetComExtractor,
|
||||||
|
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
|
||||||
});
|
});
|
||||||
|
|
||||||
var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
|
var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
|
||||||
@ -5390,8 +5471,7 @@ var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) {
|
|||||||
}, {});
|
}, {});
|
||||||
|
|
||||||
// CLEAN AUTHOR CONSTANTS
|
// CLEAN AUTHOR CONSTANTS
|
||||||
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
|
var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // CLEAN DEK CONSTANTS
|
||||||
// CLEAN DEK CONSTANTS
|
|
||||||
|
|
||||||
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
|
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks.
|
||||||
|
|
||||||
@ -5699,10 +5779,6 @@ var Cleaners = {
|
|||||||
// Returns a cheerio object $
|
// Returns a cheerio object $
|
||||||
|
|
||||||
function extractBestNode($, opts) {
|
function extractBestNode($, opts) {
|
||||||
// clone the node so we can get back to our
|
|
||||||
// initial parsed state if needed
|
|
||||||
// TODO Do I need this? – AP
|
|
||||||
// let $root = $.root().clone()
|
|
||||||
if (opts.stripUnlikelyCandidates) {
|
if (opts.stripUnlikelyCandidates) {
|
||||||
$ = stripUnlikelyCandidates($);
|
$ = stripUnlikelyCandidates($);
|
||||||
}
|
}
|
||||||
@ -5813,10 +5889,7 @@ var GenericContentExtractor = {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return normalizeSpaces($.html(node)); // if return_type == "html":
|
return normalizeSpaces($.html(node));
|
||||||
// return normalize_spaces(node_to_html(node))
|
|
||||||
// else:
|
|
||||||
// return node
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -5994,12 +6067,10 @@ var GenericAuthorExtractor = {
|
|||||||
!(_iteratorNormalCompletion = (_step = _iterator.next()).done);
|
!(_iteratorNormalCompletion = (_step = _iterator.next()).done);
|
||||||
_iteratorNormalCompletion = true
|
_iteratorNormalCompletion = true
|
||||||
) {
|
) {
|
||||||
var _ref4 = _step.value;
|
var _step$value = _slicedToArray(_step.value, 2),
|
||||||
|
selector = _step$value[0],
|
||||||
|
regex = _step$value[1];
|
||||||
|
|
||||||
var _ref3 = _slicedToArray(_ref4, 2);
|
|
||||||
|
|
||||||
var selector = _ref3[0];
|
|
||||||
var regex = _ref3[1];
|
|
||||||
var node = $(selector);
|
var node = $(selector);
|
||||||
|
|
||||||
if (node.length === 1) {
|
if (node.length === 1) {
|
||||||
@ -6078,11 +6149,8 @@ var DATE_PUBLISHED_SELECTORS = [
|
|||||||
|
|
||||||
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
|
var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
|
||||||
var DATE_PUBLISHED_URL_RES = [
|
var DATE_PUBLISHED_URL_RES = [
|
||||||
// /2012/01/27/ but not /2012/01/293
|
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
|
||||||
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733
|
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
|
||||||
// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
|
|
||||||
// 2012-01-27
|
|
||||||
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/
|
|
||||||
new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'),
|
new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'),
|
||||||
];
|
];
|
||||||
|
|
||||||
@ -6113,50 +6181,15 @@ var GenericDatePublishedExtractor = {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
// import {
|
|
||||||
// DEK_META_TAGS,
|
|
||||||
// DEK_SELECTORS,
|
|
||||||
// DEK_URL_RES,
|
|
||||||
// } from './constants';
|
|
||||||
// import { cleanDek } from 'cleaners';
|
|
||||||
// import {
|
|
||||||
// extractFromMeta,
|
|
||||||
// extractFromSelectors,
|
|
||||||
// } from 'utils/dom';
|
|
||||||
// Currently there is only one selector for
|
// Currently there is only one selector for
|
||||||
// deks. We should simply return null here
|
// deks. We should simply return null here
|
||||||
// until we have a more robust generic option.
|
// until we have a more robust generic option.
|
||||||
// Below is the original source for this, for reference.
|
// Below is the original source for this, for reference.
|
||||||
var GenericDekExtractor = {
|
var GenericDekExtractor = {
|
||||||
// extract({ $, content, metaCache }) {
|
|
||||||
extract: function extract() {
|
extract: function extract() {
|
||||||
return null;
|
return null;
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
// # First, check to see if we have a matching meta tag that we can make
|
|
||||||
// # use of.
|
|
||||||
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
|
|
||||||
// if not dek:
|
|
||||||
// # Second, look through our CSS/XPath selectors. This may return
|
|
||||||
// # an HTML fragment.
|
|
||||||
// dek = self.extract_from_selectors('dek',
|
|
||||||
// constants.DEK_SELECTORS,
|
|
||||||
// text_only=False)
|
|
||||||
//
|
|
||||||
// if dek:
|
|
||||||
// # Make sure our dek isn't in the first few thousand characters
|
|
||||||
// # of the content, otherwise it's just the start of the article
|
|
||||||
// # and not a true dek.
|
|
||||||
// content = self.extract_content()
|
|
||||||
// content_chunk = normalize_spaces(strip_tags(content[:2000]))
|
|
||||||
// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
|
|
||||||
//
|
|
||||||
// # 80% or greater similarity means the dek was very similar to some
|
|
||||||
// # of the starting content, so we skip it.
|
|
||||||
// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
|
|
||||||
// return dek
|
|
||||||
//
|
|
||||||
// return None
|
|
||||||
|
|
||||||
// An ordered list of meta tag names that denote likely article leading images.
|
// An ordered list of meta tag names that denote likely article leading images.
|
||||||
// All attributes should be lowercase for faster case-insensitive matching.
|
// All attributes should be lowercase for faster case-insensitive matching.
|
||||||
@ -6443,159 +6476,6 @@ var GenericLeadImageUrlExtractor = {
|
|||||||
return null;
|
return null;
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
// """
|
|
||||||
// # First, try to find the "best" image via the content.
|
|
||||||
// # We'd rather not have to fetch each image and check dimensions,
|
|
||||||
// # so try to do some analysis and determine them instead.
|
|
||||||
// content = self.extractor.extract_content(return_type="node")
|
|
||||||
// imgs = content.xpath('.//img')
|
|
||||||
// img_scores = defaultdict(int)
|
|
||||||
// logger.debug('Scoring %d images from content', len(imgs))
|
|
||||||
// for (i, img) in enumerate(imgs):
|
|
||||||
// img_score = 0
|
|
||||||
//
|
|
||||||
// if not 'src' in img.attrib:
|
|
||||||
// logger.debug('No src attribute found')
|
|
||||||
// continue
|
|
||||||
//
|
|
||||||
// try:
|
|
||||||
// parsed_img = urlparse(img.attrib['src'])
|
|
||||||
// img_path = parsed_img.path.lower()
|
|
||||||
// except ValueError:
|
|
||||||
// logger.debug('ValueError getting img path.')
|
|
||||||
// continue
|
|
||||||
// logger.debug('Image path is %s', img_path)
|
|
||||||
//
|
|
||||||
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
|
||||||
// logger.debug('Positive URL hints match. Adding 20.')
|
|
||||||
// img_score += 20
|
|
||||||
//
|
|
||||||
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
|
||||||
// logger.debug('Negative URL hints match. Subtracting 20.')
|
|
||||||
// img_score -= 20
|
|
||||||
//
|
|
||||||
// # Gifs are more often structure than photos
|
|
||||||
// if img_path.endswith('gif'):
|
|
||||||
// logger.debug('gif found. Subtracting 10.')
|
|
||||||
// img_score -= 10
|
|
||||||
//
|
|
||||||
// # JPGs are more often photographs
|
|
||||||
// if img_path.endswith('jpg'):
|
|
||||||
// logger.debug('jpg found. Adding 10.')
|
|
||||||
// img_score += 10
|
|
||||||
//
|
|
||||||
// # PNGs are neutral.
|
|
||||||
//
|
|
||||||
// # Alt attribute usually means non-presentational image.
|
|
||||||
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
|
|
||||||
// logger.debug('alt attribute found. Adding 5.')
|
|
||||||
// img_score += 5
|
|
||||||
//
|
|
||||||
// # Look through our parent and grandparent for figure-like
|
|
||||||
// # container elements, give a bonus if we find them
|
|
||||||
// parents = [img.getparent()]
|
|
||||||
// if parents[0] is not None and parents[0].getparent() is not None:
|
|
||||||
// parents.append(parents[0].getparent())
|
|
||||||
// for p in parents:
|
|
||||||
// if p.tag == 'figure':
|
|
||||||
// logger.debug('Parent with <figure> tag found. Adding 25.')
|
|
||||||
// img_score += 25
|
|
||||||
//
|
|
||||||
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
|
|
||||||
// if constants.PHOTO_HINTS_RE.search(p_sig):
|
|
||||||
// logger.debug('Photo hints regex match. Adding 15.')
|
|
||||||
// img_score += 15
|
|
||||||
//
|
|
||||||
// # Look at our immediate sibling and see if it looks like it's a
|
|
||||||
// # caption. Bonus if so.
|
|
||||||
// sibling = img.getnext()
|
|
||||||
// if sibling is not None:
|
|
||||||
// if sibling.tag == 'figcaption':
|
|
||||||
// img_score += 25
|
|
||||||
//
|
|
||||||
// sib_sig = ' '.join([sibling.get('id', ''),
|
|
||||||
// sibling.get('class', '')]).lower()
|
|
||||||
// if 'caption' in sib_sig:
|
|
||||||
// img_score += 15
|
|
||||||
//
|
|
||||||
// # Pull out width/height if they were set.
|
|
||||||
// img_width = None
|
|
||||||
// img_height = None
|
|
||||||
// if 'width' in img.attrib:
|
|
||||||
// try:
|
|
||||||
// img_width = float(img.get('width'))
|
|
||||||
// except ValueError:
|
|
||||||
// pass
|
|
||||||
// if 'height' in img.attrib:
|
|
||||||
// try:
|
|
||||||
// img_height = float(img.get('height'))
|
|
||||||
// except ValueError:
|
|
||||||
// pass
|
|
||||||
//
|
|
||||||
// # Penalty for skinny images
|
|
||||||
// if img_width and img_width <= 50:
|
|
||||||
// logger.debug('Skinny image found. Subtracting 50.')
|
|
||||||
// img_score -= 50
|
|
||||||
//
|
|
||||||
// # Penalty for short images
|
|
||||||
// if img_height and img_height <= 50:
|
|
||||||
// # Wide, short images are more common than narrow, tall ones
|
|
||||||
// logger.debug('Short image found. Subtracting 25.')
|
|
||||||
// img_score -= 25
|
|
||||||
//
|
|
||||||
// if img_width and img_height and not 'sprite' in img_path:
|
|
||||||
// area = img_width * img_height
|
|
||||||
//
|
|
||||||
// if area < 5000: # Smaller than 50x100
|
|
||||||
// logger.debug('Image with small area found. Subtracting 100.')
|
|
||||||
// img_score -= 100
|
|
||||||
// else:
|
|
||||||
// img_score += round(area/1000.0)
|
|
||||||
//
|
|
||||||
// # If the image is higher on the page than other images,
|
|
||||||
// # it gets a bonus. Penalty if lower.
|
|
||||||
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
|
|
||||||
// img_score += len(imgs)/2 - i
|
|
||||||
//
|
|
||||||
// # Use the raw src here because we munged img_path for case
|
|
||||||
// # insensitivity
|
|
||||||
// logger.debug('Final score is %d.', img_score)
|
|
||||||
// img_scores[img.attrib['src']] += img_score
|
|
||||||
//
|
|
||||||
// top_score = 0
|
|
||||||
// top_url = None
|
|
||||||
// for (url, score) in img_scores.items():
|
|
||||||
// if score > top_score:
|
|
||||||
// top_url = url
|
|
||||||
// top_score = score
|
|
||||||
//
|
|
||||||
// if top_score > 0:
|
|
||||||
// logger.debug('Using top score image from content. Score was %d', top_score)
|
|
||||||
// return top_url
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// # If nothing else worked, check to see if there are any really
|
|
||||||
// # probable nodes in the doc, like <link rel="image_src" />.
|
|
||||||
// logger.debug('Trying to find lead image in probable nodes')
|
|
||||||
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
|
|
||||||
// nodes = self.resource.extract_by_selector(selector)
|
|
||||||
// for node in nodes:
|
|
||||||
// clean_value = None
|
|
||||||
// if node.attrib.get('src'):
|
|
||||||
// clean_value = self.clean(node.attrib['src'])
|
|
||||||
//
|
|
||||||
// if not clean_value and node.attrib.get('href'):
|
|
||||||
// clean_value = self.clean(node.attrib['href'])
|
|
||||||
//
|
|
||||||
// if not clean_value and node.attrib.get('value'):
|
|
||||||
// clean_value = self.clean(node.attrib['value'])
|
|
||||||
//
|
|
||||||
// if clean_value:
|
|
||||||
// logger.debug('Found lead image in probable nodes.')
|
|
||||||
// logger.debug('Node was: %s', node)
|
|
||||||
// return clean_value
|
|
||||||
//
|
|
||||||
// return None
|
|
||||||
|
|
||||||
function scoreSimilarity(score, articleUrl, href) {
|
function scoreSimilarity(score, articleUrl, href) {
|
||||||
// Do this last and only if we have a real candidate, because it's
|
// Do this last and only if we have a real candidate, because it's
|
||||||
@ -7543,13 +7423,12 @@ function _collectAllPages() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var Mercury = {
|
var Mercury = {
|
||||||
parse: function parse(url, html) {
|
parse: (function() {
|
||||||
var opts =
|
var _parse = _asyncToGenerator(
|
||||||
arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
|
|
||||||
return _asyncToGenerator(
|
|
||||||
/*#__PURE__*/
|
/*#__PURE__*/
|
||||||
_regeneratorRuntime.mark(function _callee() {
|
_regeneratorRuntime.mark(function _callee(url, html) {
|
||||||
var _opts$fetchAllPages,
|
var opts,
|
||||||
|
_opts$fetchAllPages,
|
||||||
fetchAllPages,
|
fetchAllPages,
|
||||||
_opts$fallback,
|
_opts$fallback,
|
||||||
fallback,
|
fallback,
|
||||||
@ -7560,13 +7439,16 @@ var Mercury = {
|
|||||||
result,
|
result,
|
||||||
_result,
|
_result,
|
||||||
title,
|
title,
|
||||||
next_page_url;
|
next_page_url,
|
||||||
|
_args = arguments;
|
||||||
|
|
||||||
return _regeneratorRuntime.wrap(
|
return _regeneratorRuntime.wrap(
|
||||||
function _callee$(_context) {
|
function _callee$(_context) {
|
||||||
while (1) {
|
while (1) {
|
||||||
switch ((_context.prev = _context.next)) {
|
switch ((_context.prev = _context.next)) {
|
||||||
case 0:
|
case 0:
|
||||||
|
opts =
|
||||||
|
_args.length > 2 && _args[2] !== undefined ? _args[2] : {};
|
||||||
(_opts$fetchAllPages = opts.fetchAllPages),
|
(_opts$fetchAllPages = opts.fetchAllPages),
|
||||||
(fetchAllPages =
|
(fetchAllPages =
|
||||||
_opts$fetchAllPages === void 0
|
_opts$fetchAllPages === void 0
|
||||||
@ -7587,29 +7469,29 @@ var Mercury = {
|
|||||||
parsedUrl = URL.parse(url);
|
parsedUrl = URL.parse(url);
|
||||||
|
|
||||||
if (validateUrl(parsedUrl)) {
|
if (validateUrl(parsedUrl)) {
|
||||||
_context.next = 5;
|
_context.next = 6;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return _context.abrupt('return', Errors.badUrl);
|
return _context.abrupt('return', Errors.badUrl);
|
||||||
|
|
||||||
case 5:
|
case 6:
|
||||||
_context.next = 7;
|
_context.next = 8;
|
||||||
return Resource.create(url, html, parsedUrl);
|
return Resource.create(url, html, parsedUrl);
|
||||||
|
|
||||||
case 7:
|
case 8:
|
||||||
$ = _context.sent;
|
$ = _context.sent;
|
||||||
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
|
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
|
||||||
// If we found an error creating the resource, return that error
|
// If we found an error creating the resource, return that error
|
||||||
|
|
||||||
if (!$.failed) {
|
if (!$.failed) {
|
||||||
_context.next = 11;
|
_context.next = 12;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return _context.abrupt('return', $);
|
return _context.abrupt('return', $);
|
||||||
|
|
||||||
case 11:
|
case 12:
|
||||||
// if html still has not been set (i.e., url passed to Mercury.parse),
|
// if html still has not been set (i.e., url passed to Mercury.parse),
|
||||||
// set html from the response of Resource.create
|
// set html from the response of Resource.create
|
||||||
if (!html) {
|
if (!html) {
|
||||||
@ -7635,11 +7517,11 @@ var Mercury = {
|
|||||||
(next_page_url = _result.next_page_url); // Fetch more pages if next_page_url found
|
(next_page_url = _result.next_page_url); // Fetch more pages if next_page_url found
|
||||||
|
|
||||||
if (!(fetchAllPages && next_page_url)) {
|
if (!(fetchAllPages && next_page_url)) {
|
||||||
_context.next = 21;
|
_context.next = 22;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
_context.next = 18;
|
_context.next = 19;
|
||||||
return collectAllPages({
|
return collectAllPages({
|
||||||
Extractor: Extractor,
|
Extractor: Extractor,
|
||||||
next_page_url: next_page_url,
|
next_page_url: next_page_url,
|
||||||
@ -7651,21 +7533,21 @@ var Mercury = {
|
|||||||
url: url,
|
url: url,
|
||||||
});
|
});
|
||||||
|
|
||||||
case 18:
|
case 19:
|
||||||
result = _context.sent;
|
result = _context.sent;
|
||||||
_context.next = 22;
|
_context.next = 23;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 21:
|
case 22:
|
||||||
result = _objectSpread({}, result, {
|
result = _objectSpread({}, result, {
|
||||||
total_pages: 1,
|
total_pages: 1,
|
||||||
rendered_pages: 1,
|
rendered_pages: 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
case 22:
|
case 23:
|
||||||
return _context.abrupt('return', result);
|
return _context.abrupt('return', result);
|
||||||
|
|
||||||
case 23:
|
case 24:
|
||||||
case 'end':
|
case 'end':
|
||||||
return _context.stop();
|
return _context.stop();
|
||||||
}
|
}
|
||||||
@ -7675,8 +7557,14 @@ var Mercury = {
|
|||||||
this
|
this
|
||||||
);
|
);
|
||||||
})
|
})
|
||||||
)();
|
);
|
||||||
},
|
|
||||||
|
function parse(_x, _x2) {
|
||||||
|
return _parse.apply(this, arguments);
|
||||||
|
}
|
||||||
|
|
||||||
|
return parse;
|
||||||
|
})(),
|
||||||
browser: !!cheerio.browser,
|
browser: !!cheerio.browser,
|
||||||
// A convenience method for getting a resource
|
// A convenience method for getting a resource
|
||||||
// to work with, e.g., for custom extractor generator
|
// to work with, e.g., for custom extractor generator
|
||||||
|
55867
dist/mercury.web.js
vendored
55867
dist/mercury.web.js
vendored
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@postlight/mercury-parser",
|
"name": "@postlight/mercury-parser",
|
||||||
"version": "1.0.13",
|
"version": "1.1.0",
|
||||||
"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
|
"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
|
||||||
"author": "Postlight <mercury@postlight.com>",
|
"author": "Postlight <mercury@postlight.com>",
|
||||||
"homepage": "https://mercury.postlight.com",
|
"homepage": "https://mercury.postlight.com",
|
||||||
|
Loading…
Reference in New Issue
Block a user