chore: refactor format output adjustments (#272)

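I had previously implemented the `contentType` output conversion (text / markdown) in an
overly complicated manner, threading it through the individual extractors. This PR cleans
it up a bit by doing the conversion once, in `Mercury.parse`.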
pull/274/head
Adam Pash 5 years ago committed by GitHub
parent 867623ab33
commit 9bf88b0ba3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

54
dist/mercury.js vendored

@ -8,6 +8,7 @@ var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/h
var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
var TurndownService = _interopDefault(require('turndown'));
var iconv = _interopDefault(require('iconv-lite'));
var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
@ -21,7 +22,6 @@ var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
var TurndownService = _interopDefault(require('turndown'));
var stringDirection = _interopDefault(require('string-direction'));
var validUrl = _interopDefault(require('valid-url'));
var moment = _interopDefault(require('moment-timezone'));
@ -6018,9 +6018,7 @@ var GenericExtractor = {
},
extract: function extract(options) {
var html = options.html,
$ = options.$,
_options$contentType = options.contentType,
contentType = _options$contentType === void 0 ? 'html' : _options$contentType;
$ = options.$;
if (html && !$) {
var loaded = cheerio.load(html);
@ -6054,24 +6052,13 @@ var GenericExtractor = {
url = _this$url_and_domain.url,
domain = _this$url_and_domain.domain;
var convertedContent;
if (contentType === 'html') {
convertedContent = content;
} else if (contentType === 'text') {
convertedContent = $.text(cheerio.load(content));
} else if (contentType === 'markdown') {
var turndownService = new TurndownService();
convertedContent = turndownService.turndown(content);
}
return {
title: title,
author: author,
date_published: date_published || null,
dek: dek,
lead_image_url: lead_image_url,
content: convertedContent,
content: content,
next_page_url: next_page_url,
url: url,
domain: domain,
@ -6161,9 +6148,7 @@ function select(opts) {
type = opts.type,
extractionOpts = opts.extractionOpts,
_opts$extractHtml = opts.extractHtml,
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml,
_opts$contentType = opts.contentType,
contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type
extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type
if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
// contributors), return the string
@ -6205,19 +6190,7 @@ function select(opts) {
$content = Cleaners[type]($content, _objectSpread({}, opts, {
defaultCleaner: defaultCleaner
}));
if (contentType === 'html') {
return $.html($content);
}
if (contentType === 'text') {
return $.text($content);
}
if (contentType === 'markdown') {
var turndownService = new TurndownService();
return turndownService.turndown($.html($content));
}
return $.html($content);
}
var result; // if selector is an array (e.g., ['img', 'src']),
@ -6270,9 +6243,7 @@ var RootExtractor = {
var opts = arguments.length > 1 ? arguments[1] : undefined;
var _opts = opts,
contentOnly = _opts.contentOnly,
extractedTitle = _opts.extractedTitle,
_opts$contentType2 = _opts.contentType,
contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method
extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method
if (extractor.domain === '*') return extractor.extract(opts);
opts = _objectSpread({}, opts, {
@ -6283,8 +6254,7 @@ var RootExtractor = {
var _content = extractResult(_objectSpread({}, opts, {
type: 'content',
extractHtml: true,
title: extractedTitle,
contentType: contentType
title: extractedTitle
}));
return {
@ -6451,6 +6421,7 @@ var Mercury = {
_result,
title,
next_page_url,
turndownService,
_args = arguments;
return _regeneratorRuntime.wrap(function _callee$(_context) {
@ -6545,9 +6516,16 @@ var Mercury = {
});
case 23:
if (contentType === 'markdown') {
turndownService = new TurndownService();
result.content = turndownService.turndown(result.content);
} else if (contentType === 'text') {
result.content = $.text($(result.content));
}
return _context.abrupt("return", result);
case 24:
case 25:
case "end":
return _context.stop();
}

@ -1,5 +1,4 @@
import cheerio from 'cheerio';
import TurndownService from 'turndown';
import stringDirection from 'string-direction';
import GenericContentExtractor from './content/extractor';
@ -29,7 +28,7 @@ const GenericExtractor = {
direction: ({ title }) => stringDirection.getDirection(title),
extract(options) {
const { html, $, contentType = 'html' } = options;
const { html, $ } = options;
if (html && !$) {
const loaded = cheerio.load(html);
@ -48,24 +47,13 @@ const GenericExtractor = {
const direction = this.direction({ title });
const { url, domain } = this.url_and_domain(options);
let convertedContent;
if (contentType === 'html') {
convertedContent = content;
} else if (contentType === 'text') {
convertedContent = $.text(cheerio.load(content));
} else if (contentType === 'markdown') {
const turndownService = new TurndownService();
convertedContent = turndownService.turndown(content);
}
return {
title,
author,
date_published: date_published || null,
dek,
lead_image_url,
content: convertedContent,
content,
next_page_url,
url,
domain,

@ -1,4 +1,3 @@
import TurndownService from 'turndown';
import Cleaners from 'cleaners';
import { convertNodeTo } from 'utils/dom';
import GenericExtractor from './generic';
@ -67,13 +66,7 @@ function findMatchingSelector($, selectors, extractHtml) {
}
export function select(opts) {
const {
$,
type,
extractionOpts,
extractHtml = false,
contentType = 'html',
} = opts;
const { $, type, extractionOpts, extractHtml = false } = opts;
// Skip if there's not extraction for this type
if (!extractionOpts) return null;
@ -120,16 +113,7 @@ export function select(opts) {
$content = Cleaners[type]($content, { ...opts, defaultCleaner });
if (contentType === 'html') {
return $.html($content);
}
if (contentType === 'text') {
return $.text($content);
}
if (contentType === 'markdown') {
const turndownService = new TurndownService();
return turndownService.turndown($.html($content));
}
return $.html($content);
}
let result;
@ -178,7 +162,7 @@ function extractResult(opts) {
const RootExtractor = {
extract(extractor = GenericExtractor, opts) {
const { contentOnly, extractedTitle, contentType = 'html' } = opts;
const { contentOnly, extractedTitle } = opts;
// This is the generic extractor. Run its extract method
if (extractor.domain === '*') return extractor.extract(opts);
@ -193,7 +177,6 @@ const RootExtractor = {
type: 'content',
extractHtml: true,
title: extractedTitle,
contentType,
});
return {
content,

@ -32,73 +32,6 @@ describe('RootExtractor', () => {
assert.equal(url, null);
});
// With contentType 'text', the extracted content must not contain anything
// that looks like an HTML tag (`<` + lowercase letter ... `>`).
it('returns text content if text is passed as contentType', () => {
  const fullUrl =
    'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
  const html = fs.readFileSync(
    './src/extractors/custom/nymag.com/fixtures/test.html',
    'utf8'
  );
  const $ = cheerio.load(html);

  const { content } = RootExtractor.extract(NYMagExtractor, {
    url: fullUrl,
    html,
    $,
    metaCache: [],
    fallback: false,
    contentType: 'text',
  });

  // No `g` flag: a global regex carries `lastIndex` across `.test()` calls,
  // and a single yes/no match check does not need it. This also matches the
  // flag-less pattern used by the sibling markdown tests.
  const htmlRe = /<[a-z][\s\S]*>/;
  assert.equal(htmlRe.test(content), false);
});
// With contentType 'markdown', the extracted content should be free of
// HTML-looking tags and contain at least one `[label](target)` link.
it('returns markdown if markdown is passed as contentType', () => {
  const fixturePath = './src/extractors/custom/nymag.com/fixtures/test.html';
  const articleUrl =
    'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
  const markup = fs.readFileSync(fixturePath, 'utf8');

  const extracted = RootExtractor.extract(NYMagExtractor, {
    url: articleUrl,
    html: markup,
    $: cheerio.load(markup),
    metaCache: [],
    fallback: false,
    contentType: 'markdown',
  });

  const hasHtmlTag = /<[a-z][\s\S]*>/.test(extracted.content);
  const hasMarkdownLink = /\[[\w\s]+\]\(.*\)/.test(extracted.content);
  assert.equal(hasHtmlTag, false);
  assert.equal(hasMarkdownLink, true);
});
// Passing `undefined` as the extractor leaves the choice to
// RootExtractor.extract's default parameter; the markdown output should be
// free of HTML-looking tags and contain at least one `[label](target)` link.
it('also can select type on Generic Extractor', () => {
  const pageUrl =
    'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html';
  const markup = fs.readFileSync('./fixtures/vulture.html', 'utf-8');

  const extracted = RootExtractor.extract(undefined, {
    url: pageUrl,
    html: markup,
    $: cheerio.load(markup),
    metaCache: [],
    fallback: false,
    contentType: 'markdown',
  });

  const hasHtmlTag = /<[a-z][\s\S]*>/.test(extracted.content);
  const hasMarkdownLink = /\[[\w\s]+\]\(.*\)/.test(extracted.content);
  assert.equal(hasHtmlTag, false);
  assert.equal(hasMarkdownLink, true);
});
});
describe('cleanBySelectors($content, $, { clean })', () => {

@ -1,5 +1,6 @@
import URL from 'url';
import cheerio from 'cheerio';
import TurndownService from 'turndown';
import Resource from 'resource';
import { validateUrl, Errors } from 'utils';
@ -83,6 +84,13 @@ const Mercury = {
};
}
if (contentType === 'markdown') {
const turndownService = new TurndownService();
result.content = turndownService.turndown(result.content);
} else if (contentType === 'text') {
result.content = $.text($(result.content));
}
return result;
},

@ -4,6 +4,8 @@ import { Errors } from 'utils';
import { record } from 'test-helpers';
import Mercury from './mercury';
const fs = require('fs');
describe('Mercury', () => {
const recorder = record('mercury-test');
beforeAll(recorder.before);
@ -92,4 +94,37 @@ describe('Mercury', () => {
assert.equal(result.next_page_url, `${url}2`);
});
});
// With contentType 'text', Mercury.parse should return content that does not
// contain anything that looks like an HTML tag (`<` + lowercase letter ... `>`).
it('returns text content if text is passed as contentType', async () => {
  const url =
    'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
  // The fixture HTML is handed to the parser through the `html` option.
  const html = fs.readFileSync(
    './src/extractors/custom/nymag.com/fixtures/test.html',
    'utf8'
  );

  const { content } = await Mercury.parse(url, { html, contentType: 'text' });

  // No `g` flag: a global regex carries `lastIndex` across `.test()` calls,
  // and a single yes/no match check does not need it. This also matches the
  // flag-less pattern used by the sibling markdown test.
  const htmlRe = /<[a-z][\s\S]*>/;
  assert.equal(htmlRe.test(content), false);
});
// With contentType 'markdown', Mercury.parse should return content that is
// free of HTML-looking tags and contains at least one `[label](target)` link.
it('returns markdown if markdown is passed as contentType', async () => {
  const fixturePath = './src/extractors/custom/nymag.com/fixtures/test.html';
  const articleUrl =
    'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';

  const parsed = await Mercury.parse(articleUrl, {
    html: fs.readFileSync(fixturePath, 'utf8'),
    contentType: 'markdown',
  });

  const hasHtmlTag = /<[a-z][\s\S]*>/.test(parsed.content);
  const hasMarkdownLink = /\[[\w\s]+\]\(.*\)/.test(parsed.content);
  assert.equal(hasHtmlTag, false);
  assert.equal(hasMarkdownLink, true);
});
});

Loading…
Cancel
Save