mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
Merge pull request #5 from postlight/feat-wired-extractor
feat: added wired custom extractor AMAAAAAAZING!!!!
This commit is contained in:
commit
d786a7ae0c
81
dist/mercury.js
vendored
81
dist/mercury.js
vendored
@ -26,8 +26,8 @@ var ellipsize = _interopDefault(require('ellipsize'));
|
|||||||
var _marked = [range].map(_regeneratorRuntime.mark);
|
var _marked = [range].map(_regeneratorRuntime.mark);
|
||||||
|
|
||||||
function range() {
|
function range() {
|
||||||
var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
|
var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
|
||||||
var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
|
var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
|
||||||
return _regeneratorRuntime.wrap(function range$(_context) {
|
return _regeneratorRuntime.wrap(function range$(_context) {
|
||||||
while (1) {
|
while (1) {
|
||||||
switch (_context.prev = _context.next) {
|
switch (_context.prev = _context.next) {
|
||||||
@ -101,7 +101,7 @@ function get(options) {
|
|||||||
// further processing of this url.
|
// further processing of this url.
|
||||||
|
|
||||||
function validateResponse(response) {
|
function validateResponse(response) {
|
||||||
var parseNon2xx = arguments.length <= 1 || arguments[1] === undefined ? false : arguments[1];
|
var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
|
||||||
|
|
||||||
// Check if we got a valid status code
|
// Check if we got a valid status code
|
||||||
if (response.statusMessage !== 'OK') {
|
if (response.statusMessage !== 'OK') {
|
||||||
@ -627,6 +627,49 @@ var NewYorkerExtractor = {
|
|||||||
excerpt: null
|
excerpt: null
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Custom extractor configuration for articles served from www.wired.com.
// Each field lists the CSS selectors tried for that piece of article data.
// NOTE(review): two-element entries look like [selector, attribute] pairs, and
// the meta tags are read through a `value` attribute rather than `content` --
// presumably the parser normalizes meta tags before extraction; confirm
// against the selector-resolution code.
var WiredExtractor = {
  domain: 'www.wired.com',

  title: {
    selectors: ['h1.post-title']
  },

  author: {
    selectors: ['a[rel="author"]']
  },

  content: {
    selectors: ['article.content'],

    // No transforms are applied to the selected content.
    transforms: [],

    // Anything matching these selectors is removed from the result.
    clean: ['.visually-hidden']
  },

  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },

  // No custom selectors are configured for these fields.
  next_page_url: null,

  excerpt: null
};
|
||||||
|
|
||||||
var Extractors = {
|
var Extractors = {
|
||||||
'nymag.com': NYMagExtractor,
|
'nymag.com': NYMagExtractor,
|
||||||
'blogspot.com': BloggerExtractor,
|
'blogspot.com': BloggerExtractor,
|
||||||
@ -634,7 +677,9 @@ var Extractors = {
|
|||||||
'twitter.com': TwitterExtractor,
|
'twitter.com': TwitterExtractor,
|
||||||
'www.nytimes.com': NYTimesExtractor,
|
'www.nytimes.com': NYTimesExtractor,
|
||||||
'www.theatlantic.com': TheAtlanticExtractor,
|
'www.theatlantic.com': TheAtlanticExtractor,
|
||||||
'www.newyorker.com': NewYorkerExtractor
|
'www.newyorker.com': NewYorkerExtractor,
|
||||||
|
'www.wired.com': WiredExtractor
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Spacer images to be removed
|
// Spacer images to be removed
|
||||||
@ -822,7 +867,7 @@ function brsToPs($) {
|
|||||||
// :param br: Whether or not the passed node is a br
|
// :param br: Whether or not the passed node is a br
|
||||||
|
|
||||||
function paragraphize(node, $) {
|
function paragraphize(node, $) {
|
||||||
var br = arguments.length <= 2 || arguments[2] === undefined ? false : arguments[2];
|
var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
|
||||||
|
|
||||||
var $node = $(node);
|
var $node = $(node);
|
||||||
|
|
||||||
@ -892,7 +937,7 @@ function convertToParagraphs($) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function convertNodeTo($node, $) {
|
function convertNodeTo($node, $) {
|
||||||
var tag = arguments.length <= 2 || arguments[2] === undefined ? 'p' : arguments[2];
|
var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
|
||||||
|
|
||||||
var node = $node.get(0);
|
var node = $node.get(0);
|
||||||
if (!node) {
|
if (!node) {
|
||||||
@ -952,7 +997,7 @@ function cleanImages($article, $) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function stripJunkTags(article, $) {
|
function stripJunkTags(article, $) {
|
||||||
var tags = arguments.length <= 2 || arguments[2] === undefined ? [] : arguments[2];
|
var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
|
||||||
|
|
||||||
if (tags.length === 0) {
|
if (tags.length === 0) {
|
||||||
tags = STRIP_OUTPUT_TAGS;
|
tags = STRIP_OUTPUT_TAGS;
|
||||||
@ -1165,7 +1210,7 @@ function scoreCommas(text) {
|
|||||||
var idkRe = new RegExp('^(p|pre)$', 'i');
|
var idkRe = new RegExp('^(p|pre)$', 'i');
|
||||||
|
|
||||||
function scoreLength(textLength) {
|
function scoreLength(textLength) {
|
||||||
var tagName = arguments.length <= 1 || arguments[1] === undefined ? 'p' : arguments[1];
|
var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
|
||||||
|
|
||||||
var chunks = textLength / 50;
|
var chunks = textLength / 50;
|
||||||
|
|
||||||
@ -1249,7 +1294,7 @@ function addToParent(node, $, score) {
|
|||||||
// if not, initializes a score based on
|
// if not, initializes a score based on
|
||||||
// the node's tag type
|
// the node's tag type
|
||||||
function getOrInitScore($node, $) {
|
function getOrInitScore($node, $) {
|
||||||
var weightNodes = arguments.length <= 2 || arguments[2] === undefined ? true : arguments[2];
|
var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
|
||||||
|
|
||||||
var score = getScore($node);
|
var score = getScore($node);
|
||||||
|
|
||||||
@ -1339,7 +1384,7 @@ function scorePs($, weightNodes) {
|
|||||||
// score content. Parents get the full value of their children's
|
// score content. Parents get the full value of their children's
|
||||||
// content score, grandparents half
|
// content score, grandparents half
|
||||||
function scoreContent($) {
|
function scoreContent($) {
|
||||||
var weightNodes = arguments.length <= 1 || arguments[1] === undefined ? true : arguments[1];
|
var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
|
||||||
|
|
||||||
// First, look for special hNews based selectors and give them a big
|
// First, look for special hNews based selectors and give them a big
|
||||||
// boost, if they exist
|
// boost, if they exist
|
||||||
@ -1709,7 +1754,7 @@ function cleanTags($article, $) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function cleanHeaders($article, $) {
|
function cleanHeaders($article, $) {
|
||||||
var title = arguments.length <= 2 || arguments[2] === undefined ? '' : arguments[2];
|
var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
|
||||||
|
|
||||||
$(HEADER_TAG_LIST, $article).each(function (index, header) {
|
$(HEADER_TAG_LIST, $article).each(function (index, header) {
|
||||||
var $header = $(header);
|
var $header = $(header);
|
||||||
@ -1794,7 +1839,7 @@ function linkDensity($node) {
|
|||||||
// search for, find a meta tag associated.
|
// search for, find a meta tag associated.
|
||||||
|
|
||||||
function extractFromMeta($, metaNames, cachedNames) {
|
function extractFromMeta($, metaNames, cachedNames) {
|
||||||
var cleanTags = arguments.length <= 3 || arguments[3] === undefined ? true : arguments[3];
|
var cleanTags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
|
||||||
|
|
||||||
var foundNames = metaNames.filter(function (name) {
|
var foundNames = metaNames.filter(function (name) {
|
||||||
return cachedNames.indexOf(name) !== -1;
|
return cachedNames.indexOf(name) !== -1;
|
||||||
@ -1885,8 +1930,8 @@ function isGoodNode($node, maxChildren) {
|
|||||||
// be extractable from the document. This is for flat
|
// be extractable from the document. This is for flat
|
||||||
// meta-information, like author, title, date published, etc.
|
// meta-information, like author, title, date published, etc.
|
||||||
function extractFromSelectors($, selectors) {
|
function extractFromSelectors($, selectors) {
|
||||||
var maxChildren = arguments.length <= 2 || arguments[2] === undefined ? 1 : arguments[2];
|
var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
|
||||||
var textOnly = arguments.length <= 3 || arguments[3] === undefined ? true : arguments[3];
|
var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
|
||||||
var _iteratorNormalCompletion = true;
|
var _iteratorNormalCompletion = true;
|
||||||
var _didIteratorError = false;
|
var _didIteratorError = false;
|
||||||
var _iteratorError = undefined;
|
var _iteratorError = undefined;
|
||||||
@ -2214,7 +2259,7 @@ function cleanDomainFromTitle(splitTitle, url) {
|
|||||||
// Given a title with separators in it (colons, dashes, etc),
|
// Given a title with separators in it (colons, dashes, etc),
|
||||||
// resolve whether any of the segments should be removed.
|
// resolve whether any of the segments should be removed.
|
||||||
function resolveSplitTitle(title) {
|
function resolveSplitTitle(title) {
|
||||||
var url = arguments.length <= 1 || arguments[1] === undefined ? '' : arguments[1];
|
var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';
|
||||||
|
|
||||||
// Splits while preserving splitters, like:
|
// Splits while preserving splitters, like:
|
||||||
// ['The New New York', ' - ', 'The Washington Post']
|
// ['The New New York', ' - ', 'The Washington Post']
|
||||||
@ -3236,7 +3281,7 @@ var GenericUrlExtractor = {
|
|||||||
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
|
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
|
||||||
|
|
||||||
function clean$2(content, $) {
|
function clean$2(content, $) {
|
||||||
var maxLength = arguments.length <= 2 || arguments[2] === undefined ? 200 : arguments[2];
|
var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
|
||||||
|
|
||||||
content = content.replace(/[\s\n]+/g, ' ').trim();
|
content = content.replace(/[\s\n]+/g, ' ').trim();
|
||||||
return ellipsize(content, maxLength, { ellipse: '…' });
|
return ellipsize(content, maxLength, { ellipse: '…' });
|
||||||
@ -3488,7 +3533,7 @@ function extractResult(opts) {
|
|||||||
|
|
||||||
var RootExtractor = {
|
var RootExtractor = {
|
||||||
extract: function extract() {
|
extract: function extract() {
|
||||||
var extractor = arguments.length <= 0 || arguments[0] === undefined ? GenericExtractor : arguments[0];
|
var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
|
||||||
var opts = arguments[1];
|
var opts = arguments[1];
|
||||||
var _opts = opts;
|
var _opts = opts;
|
||||||
var contentOnly = _opts.contentOnly;
|
var contentOnly = _opts.contentOnly;
|
||||||
@ -3628,7 +3673,7 @@ var Mercury = {
|
|||||||
parse: function parse(url, html) {
|
parse: function parse(url, html) {
|
||||||
var _this = this;
|
var _this = this;
|
||||||
|
|
||||||
var opts = arguments.length <= 2 || arguments[2] === undefined ? {} : arguments[2];
|
var opts = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
|
||||||
return _asyncToGenerator(_regeneratorRuntime.mark(function _callee() {
|
return _asyncToGenerator(_regeneratorRuntime.mark(function _callee() {
|
||||||
var _opts$fetchAllPages, fetchAllPages, _opts$fallback, fallback, parsedUrl, Extractor, $, metaCache, result, _result, title, next_page_url;
|
var _opts$fetchAllPages, fetchAllPages, _opts$fallback, fallback, parsedUrl, Extractor, $, metaCache, result, _result, title, next_page_url;
|
||||||
|
|
||||||
|
2
dist/mercury.js.map
vendored
2
dist/mercury.js.map
vendored
File diff suppressed because one or more lines are too long
1
fixtures/www.wired.com/1475256747028.html
Normal file
1
fixtures/www.wired.com/1475256747028.html
Normal file
File diff suppressed because one or more lines are too long
@ -5,6 +5,7 @@ import { TwitterExtractor } from './custom/twitter.com';
|
|||||||
import { NYTimesExtractor } from './custom/www.nytimes.com';
|
import { NYTimesExtractor } from './custom/www.nytimes.com';
|
||||||
import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
|
import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
|
||||||
import { NewYorkerExtractor } from './custom/www.newyorker.com';
|
import { NewYorkerExtractor } from './custom/www.newyorker.com';
|
||||||
|
import { WiredExtractor } from './custom/www.wired.com';
|
||||||
|
|
||||||
const Extractors = {
|
const Extractors = {
|
||||||
'nymag.com': NYMagExtractor,
|
'nymag.com': NYMagExtractor,
|
||||||
@ -14,6 +15,8 @@ const Extractors = {
|
|||||||
'www.nytimes.com': NYTimesExtractor,
|
'www.nytimes.com': NYTimesExtractor,
|
||||||
'www.theatlantic.com': TheAtlanticExtractor,
|
'www.theatlantic.com': TheAtlanticExtractor,
|
||||||
'www.newyorker.com': NewYorkerExtractor,
|
'www.newyorker.com': NewYorkerExtractor,
|
||||||
|
'www.wired.com': WiredExtractor,
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export default Extractors;
|
export default Extractors;
|
||||||
|
61
src/extractors/custom/www.wired.com/index.js
Normal file
61
src/extractors/custom/www.wired.com/index.js
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/**
 * Custom extractor configuration for articles served from www.wired.com.
 *
 * Each field lists the CSS selectors tried for that piece of article data.
 * NOTE(review): two-element entries look like [selector, attribute] pairs, and
 * the meta tags are read through a `value` attribute rather than `content` --
 * presumably the parser normalizes meta tags before extraction; confirm
 * against the selector-resolution code.
 */
export const WiredExtractor = {
  domain: 'www.wired.com',

  title: {
    selectors: [
      'h1.post-title',
    ],
  },

  author: {
    selectors: [
      'a[rel="author"]',
    ],
  },

  content: {
    selectors: [
      'article.content',
    ],

    // No transforms are applied to the selected content.
    transforms: [],

    // Anything matching these selectors is removed from the result.
    clean: [
      '.visually-hidden',
    ],
  },

  date_published: {
    selectors: [
      ['meta[itemprop="datePublished"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
    ],
  },

  // No custom selectors are configured for these fields.
  next_page_url: null,

  excerpt: null,
};
|
134
src/extractors/custom/www.wired.com/index.test.js
Normal file
134
src/extractors/custom/www.wired.com/index.test.js
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
import assert from 'assert';
|
||||||
|
import fs from 'fs';
|
||||||
|
import URL from 'url';
|
||||||
|
import cheerio from 'cheerio';
|
||||||
|
|
||||||
|
import Mercury from 'mercury';
|
||||||
|
import getExtractor from 'extractors/get-extractor';
|
||||||
|
|
||||||
|
// Tests for the www.wired.com custom extractor, run against a saved copy of
// one article page.
describe('WiredExtractor', () => {
  const fixturePath = './fixtures/www.wired.com/1475256747028.html';
  const articleUrl =
    'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';

  // Reads the saved fixture and parses it as if it had been fetched from
  // articleUrl. The fixture is re-read on every call so each test stays
  // independent of the others.
  // NOTE(review): `fallback: false` presumably keeps the generic extractor
  // from filling in fields the custom extractor misses -- confirm.
  const parseFixture = () =>
    Mercury.parse(articleUrl, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    // The extractor chosen for the article URL must be the one registered
    // for that URL's hostname.
    const extractor = getExtractor(articleUrl);
    assert.equal(extractor.domain, URL.parse(articleUrl).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'An Ode to the Rosetta Spacecraft as It Flings Itself Into a Comet');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Emma Grey Ellis');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    assert.equal(date_published, '2016-09-30T07:00:12.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'Time to break out the tissues, space fans.');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://www.wired.com/wp-content/uploads/2016/09/Rosetta_impact-1-1200x630.jpg');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Load the extracted markup (or an empty document when nothing was
    // extracted) and compare only its first 13 words of text.
    const $ = cheerio.load(content || '');

    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'Today, the European Space Agency’s Rosetta spacecraft will engage its thrusters for one');
  });
});
|
Loading…
Reference in New Issue
Block a user