|
|
|
@ -8,6 +8,7 @@ var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/h
|
|
|
|
|
var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
|
|
|
|
|
var URL = _interopDefault(require('url'));
|
|
|
|
|
var cheerio = _interopDefault(require('cheerio'));
|
|
|
|
|
var TurndownService = _interopDefault(require('turndown'));
|
|
|
|
|
var iconv = _interopDefault(require('iconv-lite'));
|
|
|
|
|
var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
|
|
|
|
|
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
|
|
|
|
@ -21,7 +22,6 @@ var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
|
|
|
|
|
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
|
|
|
|
|
var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
|
|
|
|
|
var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
|
|
|
|
|
var TurndownService = _interopDefault(require('turndown'));
|
|
|
|
|
var stringDirection = _interopDefault(require('string-direction'));
|
|
|
|
|
var validUrl = _interopDefault(require('valid-url'));
|
|
|
|
|
var moment = _interopDefault(require('moment-timezone'));
|
|
|
|
@ -6018,9 +6018,7 @@ var GenericExtractor = {
|
|
|
|
|
},
|
|
|
|
|
extract: function extract(options) {
|
|
|
|
|
var html = options.html,
|
|
|
|
|
$ = options.$,
|
|
|
|
|
_options$contentType = options.contentType,
|
|
|
|
|
contentType = _options$contentType === void 0 ? 'html' : _options$contentType;
|
|
|
|
|
$ = options.$;
|
|
|
|
|
|
|
|
|
|
if (html && !$) {
|
|
|
|
|
var loaded = cheerio.load(html);
|
|
|
|
@ -6054,24 +6052,13 @@ var GenericExtractor = {
|
|
|
|
|
url = _this$url_and_domain.url,
|
|
|
|
|
domain = _this$url_and_domain.domain;
|
|
|
|
|
|
|
|
|
|
var convertedContent;
|
|
|
|
|
|
|
|
|
|
if (contentType === 'html') {
|
|
|
|
|
convertedContent = content;
|
|
|
|
|
} else if (contentType === 'text') {
|
|
|
|
|
convertedContent = $.text(cheerio.load(content));
|
|
|
|
|
} else if (contentType === 'markdown') {
|
|
|
|
|
var turndownService = new TurndownService();
|
|
|
|
|
convertedContent = turndownService.turndown(content);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
title: title,
|
|
|
|
|
author: author,
|
|
|
|
|
date_published: date_published || null,
|
|
|
|
|
dek: dek,
|
|
|
|
|
lead_image_url: lead_image_url,
|
|
|
|
|
content: convertedContent,
|
|
|
|
|
content: content,
|
|
|
|
|
next_page_url: next_page_url,
|
|
|
|
|
url: url,
|
|
|
|
|
domain: domain,
|
|
|
|
@ -6161,9 +6148,7 @@ function select(opts) {
|
|
|
|
|
type = opts.type,
|
|
|
|
|
extractionOpts = opts.extractionOpts,
|
|
|
|
|
_opts$extractHtml = opts.extractHtml,
|
|
|
|
|
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml,
|
|
|
|
|
_opts$contentType = opts.contentType,
|
|
|
|
|
contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type
|
|
|
|
|
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type
|
|
|
|
|
|
|
|
|
|
if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
|
|
|
|
|
// contributors), return the string
|
|
|
|
@ -6205,19 +6190,7 @@ function select(opts) {
|
|
|
|
|
$content = Cleaners[type]($content, _objectSpread({}, opts, {
|
|
|
|
|
defaultCleaner: defaultCleaner
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
if (contentType === 'html') {
|
|
|
|
|
return $.html($content);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (contentType === 'text') {
|
|
|
|
|
return $.text($content);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (contentType === 'markdown') {
|
|
|
|
|
var turndownService = new TurndownService();
|
|
|
|
|
return turndownService.turndown($.html($content));
|
|
|
|
|
}
|
|
|
|
|
return $.html($content);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var result; // if selector is an array (e.g., ['img', 'src']),
|
|
|
|
@ -6270,9 +6243,7 @@ var RootExtractor = {
|
|
|
|
|
var opts = arguments.length > 1 ? arguments[1] : undefined;
|
|
|
|
|
var _opts = opts,
|
|
|
|
|
contentOnly = _opts.contentOnly,
|
|
|
|
|
extractedTitle = _opts.extractedTitle,
|
|
|
|
|
_opts$contentType2 = _opts.contentType,
|
|
|
|
|
contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method
|
|
|
|
|
extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
|
|
|
|
|
|
|
|
|
|
if (extractor.domain === '*') return extractor.extract(opts);
|
|
|
|
|
opts = _objectSpread({}, opts, {
|
|
|
|
@ -6283,8 +6254,7 @@ var RootExtractor = {
|
|
|
|
|
var _content = extractResult(_objectSpread({}, opts, {
|
|
|
|
|
type: 'content',
|
|
|
|
|
extractHtml: true,
|
|
|
|
|
title: extractedTitle,
|
|
|
|
|
contentType: contentType
|
|
|
|
|
title: extractedTitle
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
@ -6451,6 +6421,7 @@ var Mercury = {
|
|
|
|
|
_result,
|
|
|
|
|
title,
|
|
|
|
|
next_page_url,
|
|
|
|
|
turndownService,
|
|
|
|
|
_args = arguments;
|
|
|
|
|
|
|
|
|
|
return _regeneratorRuntime.wrap(function _callee$(_context) {
|
|
|
|
@ -6545,9 +6516,16 @@ var Mercury = {
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
case 23:
|
|
|
|
|
if (contentType === 'markdown') {
|
|
|
|
|
turndownService = new TurndownService();
|
|
|
|
|
result.content = turndownService.turndown(result.content);
|
|
|
|
|
} else if (contentType === 'text') {
|
|
|
|
|
result.content = $.text($(result.content));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return _context.abrupt("return", result);
|
|
|
|
|
|
|
|
|
|
case 24:
|
|
|
|
|
case 25:
|
|
|
|
|
case "end":
|
|
|
|
|
return _context.stop();
|
|
|
|
|
}
|
|
|
|
|