feat: extract custom types with extend option (#313)

* feat: extract custom types with extend option

Adds an `extend` option that lets you add custom types to be extracted
and returned alongside the defaults, either in a call to `parse()` or in
a custom extractor.

```javascript
Mercury.parse(url, {
  extend: {
    last_edited: { selectors: ['#last-edited'], defaultCleaner: false },
  },
});
```
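
The same option works inside a custom extractor; here is a minimal sketch (mirroring the custom-extractor README change further down in this diff, with a hypothetical domain):

```javascript
export const ExampleExtractor = {
  domain: 'example.com', // hypothetical domain for illustration
  // ...default fields (title, content, author, etc.)...
  extend: {
    last_edited: { selectors: ['#last-edited'], defaultCleaner: false },
  },
};
```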

* chore: use Reflect.ownKeys

* feat: add CLI options

* doc: add extend param to cli help

* refactor: extract selectExtendedTypes

* feat: only overwrite null extended results

* feat: add allowMultiple extraction option

* feat: accept extendList CLI args

* feat: allow attribute selectors in extends on CLI

* test: update extend tests

* fix: don't invoke cleaner for custom types

* feat: always return array if allowMultiple

* test: add test for array of single result

* refactor: extract extractHtml

* refactor: destructure allowMultiple

* fix: wrap multiple matches in $ for cheerio shim

* fix: find extended types before any other munging

* feat: absolutize all links

* fix: clean content more directly

* doc: Update CLI docs in README

* chore: update dist

* doc: Document extend in custom extractor README
Drew Bell, committed by Adam Pash · parent 136d6df798 · commit b3e2a0ffd1

@ -67,7 +67,9 @@ If Mercury is unable to find a field, that field will return `null`.
By default, Mercury Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example:
```javascript
Mercury.parse(url, { contentType: 'markdown' }).then(result => console.log(result));
Mercury.parse(url, { contentType: 'markdown' }).then(result =>
  console.log(result)
);
```
This returns the page's `content` as GitHub-flavored Markdown:
@ -94,6 +96,15 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source
# Pass optional --format argument to set content type (html|markdown|text)
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown
# Pass optional --extend argument to add a custom type to the response
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em"
# Pass optional --extend-list argument to add a custom type with multiple matches
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a"
# Get the value of attributes by adding a pipe to --extend or --extend-list
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
```
## License

@ -8,8 +8,12 @@ const {
_: [url],
format,
f,
extend,
e,
extendList,
l,
} = argv;
(async (urlToParse, contentType) => {
(async (urlToParse, contentType, extendedTypes, extendedListTypes) => {
if (!urlToParse) {
console.log(
'\n\
@ -17,7 +21,7 @@ mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown]\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--extend type=selector]... [--extend-list type=selector]... \n\
\n\
'
);
@ -31,8 +35,25 @@ Usage:\n\
text: 'text',
txt: 'text',
};
const extensions = {};
[].concat(extendedTypes || []).forEach(t => {
const [name, selector] = t.split('=');
const fullSelector =
selector.indexOf('|') > 0 ? selector.split('|') : selector;
extensions[name] = { selectors: [fullSelector] };
});
[].concat(extendedListTypes || []).forEach(t => {
const [name, selector] = t.split('=');
const fullSelector =
selector.indexOf('|') > 0 ? selector.split('|') : selector;
extensions[name] = {
selectors: [fullSelector],
allowMultiple: true,
};
});
const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
@ -51,4 +72,4 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f);
})(url, format || f, extend || e, extendList || l);
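
To illustrate the parsing above: a sketch of the `extensions` object the CLI builds for the README's `--extend-list links=".body a|href"` example (the pipe splits the value into a `[selector, attribute]` pair):

```javascript
// What `--extend-list links=".body a|href"` produces:
const extensions = {
  links: {
    selectors: [['.body a', 'href']], // '|' split into [selector, attribute]
    allowMultiple: true, // set for --extend-list, omitted for --extend
  },
};
```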

dist/mercury.js (vendored)

@ -13,7 +13,7 @@ var iconv = _interopDefault(require('iconv-lite'));
var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
var _Promise = _interopDefault(require('@babel/runtime-corejs2/core-js/promise'));
var request = _interopDefault(require('request'));
var request = _interopDefault(require('postman-request'));
var _Reflect$ownKeys = _interopDefault(require('@babel/runtime-corejs2/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('@babel/runtime-corejs2/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('@babel/runtime-corejs2/helpers/defineProperty'));
@ -32,7 +32,7 @@ var _Array$from = _interopDefault(require('@babel/runtime-corejs2/core-js/array/
var ellipsize = _interopDefault(require('ellipsize'));
var _Array$isArray = _interopDefault(require('@babel/runtime-corejs2/core-js/array/is-array'));
var NORMALIZE_RE = /\s{2,}/g;
var NORMALIZE_RE = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;
function normalizeSpaces(text) {
return text.replace(NORMALIZE_RE, ' ').trim();
}
@ -400,10 +400,10 @@ var SPACER_RE = new RegExp('transparent|spacer|blank', 'i'); // The class we wil
// but would normally remove
var KEEP_CLASS = 'mercury-parser-keep';
var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]']; // A list of tags to strip from the output if we encounter them.
var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
var WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
@ -1284,9 +1284,9 @@ function rewriteTopLevel$$1(article, $) {
return $;
}
function absolutize($, rootUrl, attr, $content) {
function absolutize($, rootUrl, attr) {
var baseUrl = $('base').attr('href');
$("[".concat(attr, "]"), $content).each(function (_, node) {
$("[".concat(attr, "]")).each(function (_, node) {
var attrs = getAttrs(node);
var url = attrs[attr];
var absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
@ -1321,7 +1321,7 @@ function absolutizeSet($, rootUrl, $content) {
function makeLinksAbsolute$$1($content, $, url) {
['href', 'src'].forEach(function (attr) {
return absolutize($, url, attr, $content);
return absolutize($, url, attr);
});
absolutizeSet($, url, $content);
return $content;
@ -1567,7 +1567,9 @@ function setAttrs(node, attrs) {
// DOM manipulation
var IS_LINK = new RegExp('https?://', 'i');
var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');
var IMAGE_RE = '.(png|gif|jpe?g)';
var IS_IMAGE = new RegExp("".concat(IMAGE_RE), 'i');
var IS_SRCSET = new RegExp("".concat(IMAGE_RE, "(\\s*[\\d.]+[wx])"), 'i');
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// lazy loaded images into normal images.
@ -1582,7 +1584,9 @@ function convertLazyLoadedImages($) {
_Reflect$ownKeys(attrs).forEach(function (attr) {
var value = attrs[attr];
if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
$(img).attr('srcset', value);
} else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
$(img).attr('src', value);
}
});
@ -2200,7 +2204,7 @@ var PoliticoExtractor = {
var DeadspinExtractor = {
domain: 'deadspin.com',
supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com'],
supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', 'avclub.com', 'clickhole.com', 'splinternews.com', 'theonion.com', 'theroot.com', 'thetakeout.com', 'theinventory.com'],
title: {
selectors: ['h1.headline']
},
@ -4677,6 +4681,149 @@ var WwwFastcompanyComExtractor = {
}
};
var BlisterreviewComExtractor = {
domain: 'blisterreview.com',
title: {
selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title']
},
author: {
selectors: ['span.author-name']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value'], ['time.entry-date', 'datetime'], ['meta[itemprop="datePublished"]', 'content']]
},
dek: {
selectors: [// enter selectors
]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value'], ['meta[property="og:image"]', 'content'], ['meta[itemprop="image"]', 'content'], ['meta[name="twitter:image"]', 'content'], ['img.attachment-large', 'src']]
},
content: {
selectors: [['.elementor-section-wrap', '.elementor-text-editor > p, .elementor-text-editor > ul > li, .attachment-large, .wp-caption-text']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
figcaption: 'p'
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.comments-area']
}
};
var NewsMynaviJpExtractor = {
domain: 'news.mynavi.jp',
title: {
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['main div.article-author a.article-author__name']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: {
selectors: [['meta[name="og:description"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['main article div'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
img: function img($node) {
var src = $node.attr('data-original');
if (src !== '') {
$node.attr('src', src);
}
}
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
var GithubComExtractor = {
domain: 'github.com',
title: {
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: [// enter author selectors
]
},
date_published: {
selectors: [['span[itemprop="dateModified"] relative-time', 'datetime']]
},
dek: {
selectors: ['span[itemprop="about"]']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: [['#readme article']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: []
}
};
var WwwRedditComExtractor = {
domain: 'www.reddit.com',
title: {
selectors: ['div[data-test-id="post-content"] h2']
},
author: {
selectors: ['div[data-test-id="post-content"] a[href*="user/"]']
},
date_published: {
selectors: ['div[data-test-id="post-content"] a[data-click-id="timestamp"]']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: [['div[data-test-id="post-content"] p'], // text post
['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
'div[data-test-id="post-content"] div[data-click-id="media"]'], // external link with media preview (YouTube, imgur album, etc...)
['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])'], // external link
'div[data-test-id="post-content"]'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'div[role="img"]': function divRoleImg($node) {
// External link image preview
var $img = $node.find('img');
var bgImg = $node.css('background-image');
if ($img.length === 1 && bgImg) {
$img.attr('src', bgImg.match(/\((.*?)\)/)[1].replace(/('|")/g, ''));
return $img;
}
return $node;
}
},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.icon']
}
};
var CustomExtractors = /*#__PURE__*/Object.freeze({
@ -4772,7 +4919,11 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
WwwSlateComExtractor: WwwSlateComExtractor,
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
WwwFortinetComExtractor: WwwFortinetComExtractor,
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
BlisterreviewComExtractor: BlisterreviewComExtractor,
NewsMynaviJpExtractor: NewsMynaviJpExtractor,
GithubComExtractor: GithubComExtractor,
WwwRedditComExtractor: WwwRedditComExtractor
});
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@ -4790,6 +4941,10 @@ var SEC_DATE_STRING = /^\d{10}$/i;
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
var TIME_NOW_STRING = /^\s*(just|right)?\s*now\s*/i;
var timeUnits = ['seconds?', 'minutes?', 'hours?', 'days?', 'weeks?', 'months?', 'years?'];
var allTimeUnits = timeUnits.join('|');
var TIME_AGO_STRING = new RegExp("(\\d+)\\s+(".concat(allTimeUnits, ")\\s+ago"), 'i');
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
@ -4845,6 +5000,15 @@ function createDate(dateString, timezone, format) {
return moment(new Date(dateString));
}
if (TIME_AGO_STRING.test(dateString)) {
var fragments = TIME_AGO_STRING.exec(dateString);
return moment().subtract(fragments[1], fragments[2]);
}
if (TIME_NOW_STRING.test(dateString)) {
return moment();
}
return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString));
} // Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
@ -6124,7 +6288,7 @@ function transformElements($content, $, _ref2) {
return $content;
}
function findMatchingSelector($, selectors, extractHtml) {
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
return selectors.find(function (selector) {
if (_Array$isArray(selector)) {
if (extractHtml) {
@ -6137,10 +6301,10 @@ function findMatchingSelector($, selectors, extractHtml) {
s = _selector[0],
attr = _selector[1];
return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
return (allowMultiple || !allowMultiple && $(s).length === 1) && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
}
return $(selector).length === 1 && $(selector).text().trim() !== '';
return (allowMultiple || !allowMultiple && $(selector).length === 1) && $(selector).text().trim() !== '';
});
}
@ -6157,25 +6321,30 @@ function select(opts) {
if (typeof extractionOpts === 'string') return extractionOpts;
var selectors = extractionOpts.selectors,
_extractionOpts$defau = extractionOpts.defaultCleaner,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau;
var matchingSelector = findMatchingSelector($, selectors, extractHtml);
if (!matchingSelector) return null; // Declaring result; will contain either
// text or html, which will be cleaned
// by the appropriate cleaner type
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
var $content;
if (extractHtml) {
// If matching selector is an array, we're considering this a
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
allowMultiple = extractionOpts.allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
if (!matchingSelector) return null;
function transformAndClean($node) {
makeLinksAbsolute$$1($node, $, opts.url || '');
cleanBySelectors($node, $, extractionOpts);
transformElements($node, $, extractionOpts);
return $node;
}
function selectHtml() {
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
var $content; // If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
// array must match in order for this selector to trigger
if (_Array$isArray(matchingSelector)) {
$content = $(matchingSelector.join(','));
var $wrapper = $('<div></div>');
$content.each(function (index, element) {
$content.each(function (_, element) {
$wrapper.append(element);
});
$content = $wrapper;
@ -6186,14 +6355,28 @@ function select(opts) {
$content.wrap($('<div></div>'));
$content = $content.parent();
$content = transformElements($content, $, extractionOpts);
$content = cleanBySelectors($content, $, extractionOpts);
$content = Cleaners[type]($content, _objectSpread({}, opts, {
defaultCleaner: defaultCleaner
}));
$content = transformAndClean($content);
if (Cleaners[type]) {
Cleaners[type]($content, _objectSpread({}, opts, {
defaultCleaner: defaultCleaner
}));
}
if (allowMultiple) {
return $content.children().toArray().map(function (el) {
return $.html($(el));
});
}
return $.html($content);
}
if (extractHtml) {
return selectHtml(matchingSelector);
}
var $match;
var result; // if selector is an array (e.g., ['img', 'src']),
// extract the attr
@ -6202,22 +6385,42 @@ function select(opts) {
selector = _matchingSelector[0],
attr = _matchingSelector[1];
result = $(selector).attr(attr).trim();
$match = $(selector);
$match = transformAndClean($match);
result = $match.map(function (_, el) {
return $(el).attr(attr).trim();
});
} else {
var $node = $(matchingSelector);
$node = cleanBySelectors($node, $, extractionOpts);
$node = transformElements($node, $, extractionOpts);
result = $node.text().trim();
} // Allow custom extractor to skip default cleaner
// for this type; defaults to true
$match = $(matchingSelector);
$match = transformAndClean($match);
result = $match.map(function (_, el) {
return $(el).text().trim();
});
}
result = _Array$isArray(result.toArray()) && allowMultiple ? result.toArray() : result[0]; // Allow custom extractor to skip default cleaner
// for this type; defaults to true
if (defaultCleaner) {
if (defaultCleaner && Cleaners[type]) {
return Cleaners[type](result, _objectSpread({}, opts, extractionOpts));
}
return result;
}
function selectExtendedTypes(extend, opts) {
var results = {};
_Reflect$ownKeys(extend).forEach(function (t) {
if (!results[t]) {
results[t] = select(_objectSpread({}, opts, {
type: t,
extractionOpts: extend[t]
}));
}
});
return results;
}
function extractResult(opts) {
var type = opts.type,
@ -6311,7 +6514,13 @@ var RootExtractor = {
url = _ref3.url,
domain = _ref3.domain;
return {
var extendedResults = {};
if (extractor.extend) {
extendedResults = selectExtendedTypes(extractor.extend, opts);
}
return _objectSpread({
title: title,
content: content,
author: author,
@ -6324,7 +6533,7 @@ var RootExtractor = {
excerpt: excerpt,
word_count: word_count,
direction: direction
};
}, extendedResults);
}
};
@ -6414,10 +6623,12 @@ var Mercury = {
fallback,
_opts$contentType,
contentType,
extend,
parsedUrl,
$,
Extractor,
metaCache,
extendedTypes,
result,
_result,
title,
@ -6430,7 +6641,7 @@ var Mercury = {
switch (_context.prev = _context.next) {
case 0:
_ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // if no url was passed and this is the browser version,
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, extend = opts.extend; // if no url was passed and this is the browser version,
// set url to window.location.href and load the html
// from the current page
@ -6455,19 +6666,19 @@ var Mercury = {
case 8:
$ = _context.sent;
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
// If we found an error creating the resource, return that error
if (!$.failed) {
_context.next = 12;
_context.next = 11;
break;
}
return _context.abrupt("return", $);
case 12:
case 11:
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
// if html still has not been set (i.e., url passed to Mercury.parse),
// set html from the response of Resource.create
if (!html) {
html = $.html();
} // Cached value of every meta name in our document.
@ -6477,6 +6688,16 @@ var Mercury = {
metaCache = $('meta').map(function (_, node) {
return $(node).attr('name');
}).toArray();
extendedTypes = {};
if (extend) {
extendedTypes = selectExtendedTypes(extend, {
$: $,
url: url,
html: html
});
}
result = RootExtractor.extract(Extractor, {
url: url,
html: html,
@ -6489,11 +6710,11 @@ var Mercury = {
_result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found
if (!(fetchAllPages && next_page_url)) {
_context.next = 22;
_context.next = 24;
break;
}
_context.next = 19;
_context.next = 21;
return collectAllPages({
Extractor: Extractor,
next_page_url: next_page_url,
@ -6505,18 +6726,18 @@ var Mercury = {
url: url
});
case 19:
case 21:
result = _context.sent;
_context.next = 23;
_context.next = 25;
break;
case 22:
case 24:
result = _objectSpread({}, result, {
total_pages: 1,
rendered_pages: 1
});
case 23:
case 25:
if (contentType === 'markdown') {
turndownService = new TurndownService();
result.content = turndownService.turndown(result.content);
@ -6524,9 +6745,9 @@ var Mercury = {
result.content = $.text($(result.content));
}
return _context.abrupt("return", result);
return _context.abrupt("return", _objectSpread({}, result, extendedTypes));
case 25:
case 27:
case "end":
return _context.stop();
}

@ -95,6 +95,27 @@ export const ExampleExtractor = {
...
```
### Custom types
To add a custom key to the response, add an `extend` object. The response will include
results for each key of this object (`categories` in the example below). Setting
`allowMultiple` to `true` means Mercury will find all the content that matches the
selectors, and will always return an array of results for that key.
```javascript
export const ExampleExtractorWithExtend = {
  ...
  extend: {
    categories: {
      selectors: ['.post-taglist a'],
      allowMultiple: true,
    }
  },
  ...
```
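
With `allowMultiple: true`, the parse result always carries an array for that key, even when only one element matches. A sketch of the extra field (the values here are hypothetical):

```javascript
Mercury.parse(url).then(result => {
  // result includes the default fields plus the extended key, e.g.:
  // result.categories === ['parsing', 'javascript'] // hypothetical values
});
```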
### Cleaning content from an article
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.

@ -1,5 +1,5 @@
import Cleaners from 'cleaners';
import { convertNodeTo } from 'utils/dom';
import { convertNodeTo, makeLinksAbsolute } from 'utils/dom';
import GenericExtractor from './generic';
// Remove elements by an array of selectors
@ -39,7 +39,7 @@ export function transformElements($content, $, { transforms }) {
return $content;
}
function findMatchingSelector($, selectors, extractHtml) {
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
return selectors.find(selector => {
if (Array.isArray(selector)) {
if (extractHtml) {
@ -48,7 +48,7 @@ function findMatchingSelector($, selectors, extractHtml) {
const [s, attr] = selector;
return (
$(s).length === 1 &&
(allowMultiple || (!allowMultiple && $(s).length === 1)) &&
$(s).attr(attr) &&
$(s)
.attr(attr)
@ -57,7 +57,7 @@ function findMatchingSelector($, selectors, extractHtml) {
}
return (
$(selector).length === 1 &&
(allowMultiple || (!allowMultiple && $(selector).length === 1)) &&
$(selector)
.text()
.trim() !== ''
@ -74,20 +74,29 @@ export function select(opts) {
// contributors), return the string
if (typeof extractionOpts === 'string') return extractionOpts;
const { selectors, defaultCleaner = true } = extractionOpts;
const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
const matchingSelector = findMatchingSelector($, selectors, extractHtml);
const matchingSelector = findMatchingSelector(
$,
selectors,
extractHtml,
allowMultiple
);
if (!matchingSelector) return null;
// Declaring result; will contain either
// text or html, which will be cleaned
// by the appropriate cleaner type
function transformAndClean($node) {
makeLinksAbsolute($node, $, opts.url || '');
cleanBySelectors($node, $, extractionOpts);
transformElements($node, $, extractionOpts);
return $node;
}
function selectHtml() {
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
let $content;
// If the selector type requests html as its return type
// transform and clean the element with provided selectors
let $content;
if (extractHtml) {
// If matching selector is an array, we're considering this a
// multi-match selection, which allows the parser to choose several
// selectors to include in the result. Note that all selectors in the
@ -95,7 +104,7 @@ export function select(opts) {
if (Array.isArray(matchingSelector)) {
$content = $(matchingSelector.join(','));
const $wrapper = $('<div></div>');
$content.each((index, element) => {
$content.each((_, element) => {
$wrapper.append(element);
});
@ -107,42 +116,71 @@ export function select(opts) {
// Wrap in div so transformation can take place on root element
$content.wrap($('<div></div>'));
$content = $content.parent();
$content = transformAndClean($content);
if (Cleaners[type]) {
Cleaners[type]($content, { ...opts, defaultCleaner });
}
$content = transformElements($content, $, extractionOpts);
$content = cleanBySelectors($content, $, extractionOpts);
$content = Cleaners[type]($content, { ...opts, defaultCleaner });
if (allowMultiple) {
return $content
.children()
.toArray()
.map(el => $.html($(el)));
}
return $.html($content);
}
let result;
if (extractHtml) {
return selectHtml(matchingSelector);
}
let $match;
let result;
// if selector is an array (e.g., ['img', 'src']),
// extract the attr
if (Array.isArray(matchingSelector)) {
const [selector, attr] = matchingSelector;
result = $(selector)
.attr(attr)
.trim();
$match = $(selector);
$match = transformAndClean($match);
result = $match.map((_, el) =>
$(el)
.attr(attr)
.trim()
);
} else {
let $node = $(matchingSelector);
$node = cleanBySelectors($node, $, extractionOpts);
$node = transformElements($node, $, extractionOpts);
result = $node.text().trim();
$match = $(matchingSelector);
$match = transformAndClean($match);
result = $match.map((_, el) =>
$(el)
.text()
.trim()
);
}
result =
Array.isArray(result.toArray()) && allowMultiple
? result.toArray()
: result[0];
// Allow custom extractor to skip default cleaner
// for this type; defaults to true
if (defaultCleaner) {
if (defaultCleaner && Cleaners[type]) {
return Cleaners[type](result, { ...opts, ...extractionOpts });
}
return result;
}
export function selectExtendedTypes(extend, opts) {
const results = {};
Reflect.ownKeys(extend).forEach(t => {
if (!results[t]) {
results[t] = select({ ...opts, type: t, extractionOpts: extend[t] });
}
});
return results;
}
function extractResult(opts) {
const { type, extractor, fallback = true } = opts;
@ -206,6 +244,11 @@ const RootExtractor = {
type: 'url_and_domain',
}) || { url: null, domain: null };
let extendedResults = {};
if (extractor.extend) {
extendedResults = selectExtendedTypes(extractor.extend, opts);
}
return {
title,
content,
@ -219,6 +262,7 @@ const RootExtractor = {
excerpt,
word_count,
direction,
...extendedResults,
};
},
};

@ -228,4 +228,74 @@ describe('select(opts)', () => {
assert.equal(result, null);
});
it('returns an array of results if allowMultiple is true', () => {
const html = `
<div><div><ul><li class="item">One</li><li class="item">Two</li></ul></div></div>
`;
const $ = cheerio.load(html);
const opts = {
type: 'items',
$,
extractionOpts: {
selectors: ['.item'],
allowMultiple: true,
},
extractHtml: true,
};
const result = select(opts);
assert.equal(result.length, 2);
assert.deepEqual(result, [
'<li class="item">One</li>',
'<li class="item">Two</li>',
]);
});
it('makes links absolute in extended types when extracting HTML', () => {
const html = `
<div><p><a class="linky" href="/foo">Bar</a></p></div>
`;
const $ = cheerio.load(html);
const opts = {
type: 'links',
$,
url: 'http://example.com',
extractionOpts: {
selectors: ['.linky'],
},
extractHtml: true,
};
const result = select(opts);
assert.equal(
result,
'<div><a class="linky" href="http://example.com/foo">Bar</a></div>'
);
});
it('makes links absolute in extended types when extracting attrs', () => {
const html = `
<div><p><a class="linky" href="/foo">Bar</a><a class="linky" href="/bar">Baz</a></p></div>
`;
const $ = cheerio.load(html);
const opts = {
type: 'links',
$,
url: 'http://example.com',
extractionOpts: {
selectors: [['.linky', 'href']],
allowMultiple: true,
},
};
const result = select(opts);
assert.deepEqual(result, [
'http://example.com/foo',
'http://example.com/bar',
]);
});
});

@ -5,7 +5,7 @@ import TurndownService from 'turndown';
import Resource from 'resource';
import { validateUrl } from 'utils';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
const Mercury = {
@ -14,6 +14,7 @@ const Mercury = {
fetchAllPages = true,
fallback = true,
contentType = 'html',
extend,
} = opts;
// if no url was passed and this is the browser version,
@ -56,6 +57,11 @@ const Mercury = {
.map((_, node) => $(node).attr('name'))
.toArray();
let extendedTypes = {};
if (extend) {
extendedTypes = selectExtendedTypes(extend, { $, url, html });
}
let result = RootExtractor.extract(Extractor, {
url,
html,
@ -95,7 +101,7 @@ const Mercury = {
result.content = $.text($(result.content));
}
return result;
return { ...result, ...extendedTypes };
},
browser: !!cheerio.browser,

@ -48,14 +48,6 @@ describe('Mercury', () => {
assert(/content-type for this resource/i.test(error.message));
});
it('does blogger', async () => {
const result = await Mercury.parse(
'https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html'
);
assert.equal(typeof result, 'object');
});
it('does wikipedia', async () => {
const result = await Mercury.parse(
'https://en.wikipedia.org/wiki/Brihadeeswarar_Temple_fire'
@ -134,4 +126,66 @@ describe('Mercury', () => {
assert.equal(htmlRe.test(content), false);
assert.equal(markdownRe.test(content), true);
});
it('returns custom elements if an extend object is passed', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Mercury.parse(url, {
html,
extend: {
sites: {
selectors: ['a.site-name'],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 8);
assert.equal(sites[0], 'NYMag.com');
});
it('returns an array if a single element matches a custom extend', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Mercury.parse(url, {
html,
extend: {
sites: {
selectors: [['li:first-child a.site-name', 'href']],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 1);
});
it('returns custom attributes if an extend object is passed', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Mercury.parse(url, {
html,
extend: {
sites: {
selectors: [['a.site-name', 'href']],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 8);
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
});
});

@ -2,10 +2,10 @@ import URL from 'url';
import { getAttrs, setAttr } from 'utils/dom';
function absolutize($, rootUrl, attr, $content) {
function absolutize($, rootUrl, attr) {
const baseUrl = $('base').attr('href');
$(`[${attr}]`, $content).each((_, node) => {
$(`[${attr}]`).each((_, node) => {
const attrs = getAttrs(node);
const url = attrs[attr];
const absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
@ -43,7 +43,7 @@ function absolutizeSet($, rootUrl, $content) {
}
export default function makeLinksAbsolute($content, $, url) {
['href', 'src'].forEach(attr => absolutize($, url, attr, $content));
['href', 'src'].forEach(attr => absolutize($, url, attr));
absolutizeSet($, url, $content);
return $content;
