chore: Inline test fixtures (#683)

Not to be confused with extractor fixtures, which are snapshots of a webpage.

This change removes the pattern of separate JS files that provide "fixtures" for tests, i.e. the input strings and expected-output strings that the tests use. They were inconsistent and disorganized, and generally just served to add indirection to test files. So now each of those strings is defined inline, in the test that uses it.
feat-netease-extractor
John Holdun 2 years ago committed by GitHub
parent 0d2bad544c
commit 112846f74f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,15 +0,0 @@
// Markup snippets built around a single headline text. Every <h1> in these
// fixtures carries the same title, so the pieces are composed from shared
// constants instead of being spelled out each time.
const REAL_TITLE = 'This Is the Real Title';
const HEADLINE = `<h1>${REAL_TITLE}</h1>`;

const HTML = {
  // One <h1> wrapped in a <div>, on a single line.
  docWithH1: `<div>${HEADLINE}</div>`,

  // Two identical <h1> elements, one tag per line, with a leading and a
  // trailing newline around the whole snippet.
  docWith2H1s: ['', '<div>', HEADLINE, HEADLINE, '</div>', ''].join('\n'),

  // `before` has inline markup inside the <h1>; `after` is the same headline
  // as plain text.
  docWithTagsInH1: {
    before: '<div><h1>This Is the <em>Real</em> Title</h1></div>',
    after: REAL_TITLE,
  },
};
export default HTML;

@ -9,8 +9,7 @@ describe('clean(leadImageUrl)', () => {
}); });
it('returns null if the url is not valid', () => { it('returns null if the url is not valid', () => {
const url = 'this is not a valid url'; assert.equal(clean('this is not a valid url'), null);
assert.equal(clean(url), null);
}); });
it('trims whitespace', () => { it('trims whitespace', () => {

@ -1,27 +1,35 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { cleanTitle } from './index'; import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => { describe('cleanTitle(title, { url, $ })', () => {
it('only uses h1 if there is only one on the page', () => { it('only uses h1 if there is only one on the page', () => {
const title = 'Too Short'; const title = 'Too Short';
const $ = cheerio.load(HTML.docWith2H1s); const $ = cheerio.load(`
<div>
<h1>This Is the Real Title</h1>
<h1>This Is the Real Title</h1>
</div>
`);
assert.equal(cleanTitle(title, { url: '', $ }), title); assert.equal(cleanTitle(title, { url: '', $ }), title);
}); });
it('removes HTML tags from titles', () => { it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before); const $ = cheerio.load(
'<div><h1>This Is the <em>Real</em> Title</h1></div>'
);
const title = $('h1').html(); const title = $('h1').html();
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after); assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title');
}); });
it('trims extraneous spaces', () => { it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love "; const title = " This Is a Great Title That You'll Love ";
const $ = cheerio.load(HTML.docWithTagsInH1.before); const $ = cheerio.load(
'<div><h1>This Is the <em>Real</em> Title</h1></div>'
);
assert.equal(cleanTitle(title, { url: '', $ }), title.trim()); assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
}); });

@ -5,17 +5,15 @@ import detectByHtml from './detect-by-html';
describe('detectByHtml', () => { describe('detectByHtml', () => {
it('detects a medium post from the html', () => { it('detects a medium post from the html', () => {
const html = '<head><meta name="al:ios:app_name" value="Medium" /></head>'; const $ = cheerio.load(
'<head><meta name="al:ios:app_name" value="Medium" /></head>'
const $ = cheerio.load(html); );
assert.equal(detectByHtml($).domain, 'medium.com'); assert.equal(detectByHtml($).domain, 'medium.com');
}); });
it('returns nothing if no match is found', () => { it('returns nothing if no match is found', () => {
const html = '<div></div>'; const $ = cheerio.load('<div></div>');
const $ = cheerio.load(html);
assert.equal(detectByHtml($), null); assert.equal(detectByHtml($), null);
}); });

@ -1,39 +1,54 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor'; import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => { describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => { describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => { it('extracts author from meta tags', () => {
const $ = cheerio.load(HTML.authorMeta.test); const $ = cheerio.load(`
<html>
<meta name="dc.author" value="Adam" />
</html>
`);
const result = GenericAuthorExtractor.extract({ const result = GenericAuthorExtractor.extract({
$, $,
metaCache: ['dc.author', 'something-else'], metaCache: ['dc.author', 'something-else'],
}); });
assert.equal(result, HTML.authorMeta.result); assert.equal(result, 'Adam');
}); });
it('extracts author from author selectors', () => { it('extracts author from author selectors', () => {
const $ = cheerio.load(HTML.authorSelectors.test); const $ = cheerio.load(`
<div>
<div class="byline">
<a href="/author/adam">Adam</a>
</div>
</div>
`);
const result = GenericAuthorExtractor.extract({ const result = GenericAuthorExtractor.extract({
$, $,
metaCache: ['dc.author', 'something-else'], metaCache: ['dc.author', 'something-else'],
}); });
assert.equal(result, HTML.authorSelectors.result); assert.equal(result, 'Adam');
}); });
it('extracts author with regex selectors', () => { it('extracts author with regex selectors', () => {
const $ = cheerio.load(HTML.authorRegSelectors.test); const $ = cheerio.load(`
<div>
<div class="byline">
<span>By Adam</span>
</div>
</div>
`);
const result = GenericAuthorExtractor.extract({ const result = GenericAuthorExtractor.extract({
$, $,
metaCache: ['dc.author', 'something-else'], metaCache: ['dc.author', 'something-else'],
}); });
assert.equal(result, HTML.authorRegSelectors.result); assert.equal(result, 'Adam');
}); });
it('returns null if no author found', () => { it('returns null if no author found', () => {

@ -1,32 +0,0 @@
// Author-extraction fixtures. Each entry pairs a markup snippet (`test`) with
// the author string (`result`) associated with it; all three share one author.
const AUTHOR = 'Adam';

// Joins rows with newlines and surrounds the snippet with a leading and a
// trailing newline, matching a multi-line template literal.
const snippet = (...rows) => `\n${rows.join('\n')}\n`;

// Wraps a single row in the <div><div class="byline">…</div></div> shell
// shared by the two selector-based fixtures.
const bylineSnippet = row =>
  snippet('<div>', '<div class="byline">', row, '</div>', '</div>');

const HTML = {
  // Author carried by a `dc.author` meta tag.
  authorMeta: {
    test: snippet('<html>', '<meta name="dc.author" value="Adam" />', '</html>'),
    result: AUTHOR,
  },
  // Author is the text of a link inside the byline container.
  authorSelectors: {
    test: bylineSnippet('<a href="/author/adam">Adam</a>'),
    result: AUTHOR,
  },
  // Byline text carries a "By " prefix in front of the author name.
  authorRegSelectors: {
    test: bylineSnippet('<span>By Adam</span>'),
    result: AUTHOR,
  },
};
export default HTML;

@ -7,14 +7,12 @@ const fs = require('fs');
describe('extractBestNode($, flags)', () => { describe('extractBestNode($, flags)', () => {
it('scores the dom nodes and returns the best option', () => { it('scores the dom nodes and returns the best option', () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8'); const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
};
const $ = cheerio.load(html); const $ = cheerio.load(html);
const bestNode = extractBestNode($, opts); const bestNode = extractBestNode($, {
stripUnlikelyCandidates: true,
weightNodes: true,
});
assert(typeof bestNode, 'object'); assert(typeof bestNode, 'object');
}); });

@ -7,17 +7,15 @@ describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => { describe('addScore(node, $, amount)', () => {
it("adds the specified amount to a node's score", () => { it("adds the specified amount to a node's score", () => {
const $ = cheerio.load('<p score="25">Foo</p>'); const $ = cheerio.load('<p score="25">Foo</p>');
let $node = $('p').first(); const $node = $('p').first();
addScore($node, $, 25);
$node = addScore($node, $, 25);
assert.equal(getScore($node), 50); assert.equal(getScore($node), 50);
}); });
it('adds score if score not yet set (assumes score is 0)', () => { it('adds score if score not yet set (assumes score is 0)', () => {
const $ = cheerio.load('<p>Foo</p>'); const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first(); const $node = $('p').first();
addScore($node, $, 25);
$node = addScore($node, $, 25);
assert.equal(getScore($node), 25); assert.equal(getScore($node), 25);
}); });
}); });

@ -6,11 +6,8 @@ import { addToParent, getScore } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('addToParent(node, $, amount)', () => { describe('addToParent(node, $, amount)', () => {
it("adds 1/4 of a node's score it its parent", () => { it("adds 1/4 of a node's score it its parent", () => {
const html = '<div score="25"><p score="40">Foo</p></div>'; const $ = cheerio.load('<div score="25"><p score="40">Foo</p></div>');
const $ = cheerio.load(html); const $node = addToParent($('p').first(), $, 40);
let $node = $('p').first();
$node = addToParent($node, $, 40);
assert.equal(getScore($node.parent()), 35); assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40); assert.equal(getScore($node), 40);

@ -1,15 +1,17 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { getScore, findTopCandidate, scoreContent } from './index'; import { getScore, findTopCandidate, scoreContent } from './index';
const fs = require('fs'); const fs = require('fs');
describe('findTopCandidate($)', () => { describe('findTopCandidate($)', () => {
it('finds the top candidate from simple case', () => { it('finds the top candidate from simple case', () => {
const $ = cheerio.load(HTML.findDom1); const $ = cheerio.load(`
<div score="100">
<p score="1">Lorem ipsum etc</p>
</div>
`);
const $$topCandidate = findTopCandidate($); const $$topCandidate = findTopCandidate($);
@ -17,17 +19,27 @@ describe('findTopCandidate($)', () => {
}); });
it('finds the top candidate from a nested case', () => { it('finds the top candidate from a nested case', () => {
const $ = cheerio.load(HTML.findDom2); const $ = cheerio.load(`
<div score="10">
<article score="50">
<p score="1">Lorem ipsum etc</p>
</article>
</div>
`);
const $$topCandidate = findTopCandidate($); const $$topCandidate = findTopCandidate($);
// this is wrapped in a div so checking // this is wrapped in a div so checking the score of the first child
// the score of the first child
assert.equal(getScore($$topCandidate.first()), 50); assert.equal(getScore($$topCandidate.first()), 50);
}); });
it('ignores tags like BR', () => { it('ignores tags like BR', () => {
const $ = cheerio.load(HTML.findDom3); const $ = cheerio.load(`
<article score="50">
<p score="1">Lorem ipsum br</p>
<br score="1000" />
</article>
`);
const $topCandidate = findTopCandidate($); const $topCandidate = findTopCandidate($);
@ -35,13 +47,19 @@ describe('findTopCandidate($)', () => {
}); });
it('returns BODY if no candidates found', () => { it('returns BODY if no candidates found', () => {
const $ = cheerio.load(HTML.topBody); const $ = cheerio.load(`
<body>
<article>
<p>Lorem ipsum etc</p>
<br />
</article>
<body>
`);
const $topCandidate = findTopCandidate($); const $topCandidate = findTopCandidate($);
// browser won't allow body tag to be placed // browser won't allow body tag to be placed arbitrarily/loaded on the page,
// arbitrarily/loaded on the page, so we tranform // so we tranform it in cheerio-query, so this test would fail.
// it in cheerio-query, so this test would fail.
if (!$.browser) { if (!$.browser) {
assert.equal($topCandidate.get(0).tagName, 'body'); assert.equal($topCandidate.get(0).tagName, 'body');
} }

@ -1,664 +0,0 @@
// Shared markup fixtures, grouped by the helper named in each section comment.
// Plain string entries are standalone snippets. Entries shaped as
// `{ before, after }` pair an input snippet with a second snippet; presumably
// `after` is the markup expected once the helper has run (the consuming tests
// are not visible here, so confirm against them).
// The strings are whitespace-sensitive: every template literal starts and ends
// with a newline.
const HTML = {
// getWeight fixtures
// Entry names describe which id/class hints the wrapping <div> carries.
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
// NOTE(review): despite the name, this snippet sets `id`, not `class`.
// Confirm whether that is intentional.
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
// The "header" div is absent from `after`; the "article" div remains.
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
// `before` and `after` are identical: the element carries both an "article"
// and an "adbox" class.
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
// The "article" div is nested inside an "adbox" div; the whole "adbox"
// subtree is absent from `after`.
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
// A lone <br> is unchanged between `before` and `after`.
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
// Two consecutive <br />s in `before`; `after` has a <p> containing a single
// space in their place.
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
// Five consecutive <br />s; `after` is the same as for doubleBrs.
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
// <br />s inside a <p>; `after` contains a <p> nested in the outer <p>.
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
// Single <br /> followed by text and an inline <span>; in `after` the span
// sits inside the new <p>.
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
// Same as paragraphize but with a block-level <div>; in `after` the new <p>
// closes before the <div>.
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
// The inline text in each element states what `after` shows for it.
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
// Half of the text is inside a link, all of it is, or the link is empty.
// The numeric suffixes presumably encode the expected density
// (0.5 / 1 / 0); verify against the tests.
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
// <html> and <body> in `before` appear as <div>s in `after`, inside one
// further wrapping <div>.
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
// The 5x5 image is absent from `after`; the width-only image remains.
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
// The `height` attribute is absent from `after`; `width` remains.
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
// The image whose src ends in spacer.png is absent from `after`.
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
// Only the <p> remains in `after`; style, title, link, script, noscript and
// hr are all absent.
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
// stripHOnes
// Two <h1>s in `before`; both are absent from `after`.
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// Three <h1>s in `before`; they appear as <h2>s in `after`.
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
// The `style` attribute is absent from `after`.
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// Both `style` and `align` are absent from `after`.
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
// The empty <p> is absent from `after`.
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// NOTE(review): the name mentions "Br" but neither snippet contains a <br>;
// what the pair shows is that the empty <div> stays while the empty <p> is
// absent from `after`.
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
// The <p> wrapping an <img> is present in `after`; the <p> wrapping an
// <iframe> is not.
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
// The <ul> with score="-10" is absent from `after`; its parent <p> remains.
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
},
// The inner <div> holding three text inputs is absent from `after`.
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
// Two short inner <div>s; only the one containing an <img> is in `after`.
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
// The link-only <ul score="20"> is absent from `after`, and the outer <div>
// in `after` has no `score` attribute.
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
// As linkDensityHigh, but the link-only list has score="30" and eight items;
// it is likewise absent from `after`.
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
// Link-only list preceded by a paragraph ending in a colon. Only `before` is
// defined for this entry.
// NOTE(review): the outer <div> uses `weight="40"` where the neighbouring
// fixtures use `score`; confirm whether that is intentional.
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// Link-only list carrying class "entry-content-asset". Only `before` is
// defined for this entry.
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
// `after` is plain text here, not markup.
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
// cleanHeaders
// The <h2> that comes before any <p> is absent from `after`; the later <h2>
// remains.
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
// The <h2> reading "Title Match" is absent from `after`; presumably the test
// passes that same text as the article title. Verify against the test.
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
// The <h2 class="advert"> is absent from `after`.
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,87 +0,0 @@
// Scoring fixtures. The same long paragraph recurs in most entries, so it is
// defined once and interpolated; the exported strings are unchanged.
const LOREM =
  'Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.';

// Short text with two commas, shared by the tag-specific fixtures.
const COMMA_TEXT = 'Lorem ipsum, dolor sit, amet';

// Long paragraph variants used by the class-weighted fixtures.
const ENTRY_P = `<p class="entry">${LOREM}</p>`;
const ENTRY_CONTENT_P = `<p class="entry-content">${LOREM}</p>`;

// Joins rows with newlines and adds the leading/trailing newline that a
// multi-line template literal would have.
const rows = (...parts) => `\n${parts.join('\n')}\n`;

// Key names appear to encode the score the tests expect for the snippet
// (e.g. score19, thScoreNeg5); verify against the consuming tests.
const HTML = {
  score1: rows('<p>Lorem ipsum dolor sit amet</p>'),
  score3: rows(`<p>${COMMA_TEXT}</p>`),
  score19: rows(`<p>${LOREM}</p>`),
  divScore5: rows(`<div>${COMMA_TEXT}</div>`),
  blockquoteScore3: rows(`<blockquote>${COMMA_TEXT}</blockquote>`),
  formScoreNeg3: rows(`<form><label>${COMMA_TEXT}</label></form>`),
  thScoreNeg5: rows(`<th>${COMMA_TEXT}</th>`),
  score44: rows(ENTRY_P),
  score44Parent: rows('<div>', ENTRY_P, '</div>'),

  // `after` differs from `before` only by the score attribute on the wrapper.
  hNews: {
    before: rows('<div class="hentry">', ENTRY_CONTENT_P, '</div>'),
    after: rows('<div class="hentry" score="99">', ENTRY_CONTENT_P, '</div>'),
  },
  nonHNews: {
    before: rows('<div class="">', ENTRY_CONTENT_P, ENTRY_CONTENT_P, '</div>'),
    after: rows(
      '<div class="" score="38">',
      ENTRY_CONTENT_P,
      ENTRY_CONTENT_P,
      '</div>'
    ),
  },

  // findTopCandidate
  findDom1: rows('<div score="100">', '<p score="1">Lorem ipsum etc</p>', '</div>'),
  findDom2: rows(
    '<div score="10">',
    '<article score="50">',
    '<p score="1">Lorem ipsum etc</p>',
    '</article>',
    '</div>'
  ),
  findDom3: rows(
    '<article score="50">',
    '<p score="1">Lorem ipsum br</p>',
    '<br score="1000" />',
    '</article>'
  ),
  // NOTE(review): the last row is an opening <body>, not </body>. It looks
  // like a typo but is reproduced as-is so the fixture stays byte-identical.
  topBody: rows(
    '<body>',
    '<article>',
    '<p>Lorem ipsum etc</p>',
    '<br />',
    '</article>',
    '<body>'
  ),
};
export default HTML;

@ -1,17 +1,13 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { getOrInitScore, getScore } from './index'; import { getOrInitScore, getScore } from './index';
describe('getOrInitScore(node, $)', () => { describe('getOrInitScore(node, $)', () => {
describe('when score set', () => { describe('when score set', () => {
it("returns score if node's score already set", () => { it("returns score if node's score already set", () => {
const html = '<p score="40">Foo</p>'; const $ = cheerio.load('<p score="40">Foo</p>');
const $ = cheerio.load(html); const score = getOrInitScore($('p').first(), $);
const node = $('p').first();
const score = getOrInitScore(node, $);
assert.equal(score, 40); assert.equal(score, 40);
}); });
@ -19,40 +15,40 @@ describe('getOrInitScore(node, $)', () => {
describe('when no score set', () => { describe('when no score set', () => {
it('returns 0 if no class/id and text < 25 chars', () => { it('returns 0 if no class/id and text < 25 chars', () => {
const html = '<p>Foo</p>'; const $ = cheerio.load('<p>Foo</p>');
const $ = cheerio.load(html); const score = getOrInitScore($('p').first(), $);
const node = $('p').first();
const score = getOrInitScore(node, $);
assert.equal(score, 0); assert.equal(score, 0);
}); });
it('returns score if no class/id and has commas/length', () => { it('returns score if no class/id and has commas/length', () => {
const $ = cheerio.load(HTML.score19); const $ = cheerio.load(
const node = $('p').first(); `<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = getOrInitScore(node, $); const score = getOrInitScore($('p').first(), $);
assert.equal(score, 19); assert.equal(score, 19);
}); });
it('returns greater score if weighted class/id is set', () => { it('returns greater score if weighted class/id is set', () => {
const $ = cheerio.load(HTML.score44); const $ = cheerio.load(
const node = $('p').first(); `<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = getOrInitScore(node, $); const score = getOrInitScore($('p').first(), $);
assert.equal(score, 44); assert.equal(score, 44);
}); });
it('gives 1/4 of its score to its parent', () => { it('gives 1/4 of its score to its parent', () => {
const $ = cheerio.load(HTML.score44Parent); const $ = cheerio.load(`
const node = $('p').first(); <div>
<p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
getOrInitScore(node, $); </div>
`);
assert.equal(getScore(node.parent()), 16);
const $node = $('p').first();
getOrInitScore($node, $);
assert.equal(getScore($node.parent()), 16);
}); });
}); });
}); });

@ -7,15 +7,14 @@ describe('Scoring utils', () => {
describe('getScore($node)', () => { describe('getScore($node)', () => {
it('returns null if the node has no score set', () => { it('returns null if the node has no score set', () => {
const $ = cheerio.load('<p>Foo</p>'); const $ = cheerio.load('<p>Foo</p>');
const $node = $('p').first(); assert.equal(getScore($('p').first()), null);
assert.equal(getScore($node), null);
}); });
it('returns 25 if the node has a score attr of 25', () => { it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('<p score="25">Foo</p>'); const $ = cheerio.load('<p score="25">Foo</p>');
const $node = $('p').first(); const score = getScore($('p').first());
assert.equal(typeof getScore($node), 'number'); assert.equal(typeof score, 'number');
assert.equal(getScore($node), 25); assert.equal(score, 25);
}); });
}); });
}); });

@ -1,55 +1,90 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/get-weight';
import { getWeight } from './index'; import { getWeight } from './index';
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => { describe('getWeight(node)', () => {
it('returns a score of 25 if node has positive id', () => { it('returns a score of 25 if node has positive id', () => {
const $ = cheerio.load(HTML.positiveId); const $ = cheerio.load(`
<div id="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25); assert.equal(getWeight($('div')), 25);
}); });
it('returns a score of -25 if node has negative id', () => { it('returns a score of -25 if node has negative id', () => {
const $ = cheerio.load(HTML.negativeId); const $ = cheerio.load(`
<div id="adbox">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), -25); assert.equal(getWeight($('div')), -25);
}); });
it('returns a score of 25 if node has positive class', () => { it('returns a score of 25 if node has positive class', () => {
const $ = cheerio.load(HTML.positiveClass); const $ = cheerio.load(`
<div class="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25); assert.equal(getWeight($('div')), 25);
}); });
it('returns a score of -25 if node has negative class', () => { it('returns a score of -25 if node has negative class', () => {
const $ = cheerio.load(HTML.negativeClass); const $ = cheerio.load(`
<div id="comment ad">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), -25); assert.equal(getWeight($('div')), -25);
}); });
it('returns a score of 25 if node has both positive id and class', () => { it('returns a score of 25 if node has both positive id and class', () => {
const $ = cheerio.load(HTML.positiveIdAndClass); const $ = cheerio.load(`
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25); assert.equal(getWeight($('div')), 25);
}); });
it('returns a score of 25 if node has pos id and neg class', () => { it('returns a score of 25 if node has pos id and neg class', () => {
// is this really wanted? id="entry" class="adbox" // is this really wanted? id="entry" class="adbox"
// should get positive score? // should get positive score?
const $ = cheerio.load(HTML.positiveIdNegClass); const $ = cheerio.load(`
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 25); assert.equal(getWeight($('div')), 25);
}); });
it('returns a score of 10 if node has pos img class', () => { it('returns a score of 10 if node has pos img class', () => {
const $ = cheerio.load(HTML.positivePhotoClass); const $ = cheerio.load(`
<div class="figure">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 10); assert.equal(getWeight($('div')), 10);
}); });
it('returns a score of 35 if node has pos id pos img class', () => { it('returns a score of 35 if node has pos id pos img class', () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto); const $ = cheerio.load(`
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 35); assert.equal(getWeight($('div')), 35);
}); });
it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => { it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
const $ = cheerio.load(HTML.entryContentAsset); const $ = cheerio.load(`
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`);
assert.equal(getWeight($('div')), 50); assert.equal(getWeight($('div')), 50);
}); });
}); });

@ -1,8 +1,6 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreContent, getScore } from './index'; import { scoreContent, getScore } from './index';
const fs = require('fs'); const fs = require('fs');
@ -12,15 +10,24 @@ const fs = require('fs');
// probably missing something when calculating // probably missing something when calculating
describe('scoreContent($, weightNodes)', () => { describe('scoreContent($, weightNodes)', () => {
it('loves hNews content', () => { it('loves hNews content', () => {
const $ = cheerio.load(HTML.hNews.before); const $ = cheerio.load(`
<div class="hentry">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`);
scoreContent($); scoreContent($);
assert.equal(getScore($('div').first()), 140); assert.equal(getScore($('div').first()), 140);
}); });
it('is so-so about non-hNews content', () => { it('is so-so about non-hNews content', () => {
const $ = cheerio.load(HTML.nonHNews.before); const $ = cheerio.load(`
scoreContent($).html(); <div class="">
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
</div>
`);
scoreContent($);
assert.equal(getScore($('div').first()), 65); assert.equal(getScore($('div').first()), 65);
}); });
@ -28,15 +35,14 @@ describe('scoreContent($, weightNodes)', () => {
it('scores this Wired article the same', () => { it('scores this Wired article the same', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8'); const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html); const $ = cheerio.load(html);
scoreContent($).html(); scoreContent($);
assert.equal(getScore($('article').first()), 65.5); assert.equal(getScore($('article').first()), 65.5);
}); });
it('scores this Vulture article', () => { it('scores this Vulture article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8'); const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
let $ = cheerio.load(html); const $ = scoreContent(cheerio.load(html));
$ = scoreContent($);
assert.equal($('p[score]').length, 62); assert.equal($('p[score]').length, 62);
const itemprop = $('[itemprop=articleBody]').first(); const itemprop = $('[itemprop=articleBody]').first();
@ -73,8 +79,9 @@ describe('scoreContent($, weightNodes)', () => {
</div> </div>
</div> </div>
`; `;
let $ = cheerio.load(html);
$ = scoreContent($); const $ = cheerio.load(html);
scoreContent($);
assert.equal( assert.equal(
$('p') $('p')

@ -1,14 +1,11 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreNode, scoreParagraph } from './index'; import { scoreNode, scoreParagraph } from './index';
describe('scoreNode(node)', () => { describe('scoreNode(node)', () => {
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const html = '<p><em>Foo</em> bar</p>'; const $ = cheerio.load('<p><em>Foo</em> bar</p>');
const $ = cheerio.load(html);
const node = $('p').first(); const node = $('p').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -19,7 +16,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score1); const $ = cheerio.load(`
<p>Lorem ipsum dolor sit amet</p>
`);
const node = $('p').first(); const node = $('p').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -30,7 +29,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score3); const $ = cheerio.load(`
<p>Lorem ipsum, dolor sit, amet</p>
`);
const node = $('p').first(); const node = $('p').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -41,7 +42,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score19); const $ = cheerio.load(`
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
`);
const node = $('p').first(); const node = $('p').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -52,7 +55,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores divs with 5', () => { it('scores divs with 5', () => {
const $ = cheerio.load(HTML.divScore5); const $ = cheerio.load(`
<div>Lorem ipsum, dolor sit, amet</div>
`);
const node = $('div').first(); const node = $('div').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -61,7 +66,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores the blockquote family with 3', () => { it('scores the blockquote family with 3', () => {
const $ = cheerio.load(HTML.blockquoteScore3); const $ = cheerio.load(`
<blockquote>Lorem ipsum, dolor sit, amet</blockquote>
`);
const node = $('blockquote').first(); const node = $('blockquote').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -70,7 +77,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores a form with negative 3', () => { it('scores a form with negative 3', () => {
const $ = cheerio.load(HTML.formScoreNeg3); const $ = cheerio.load(`
<form><label>Lorem ipsum, dolor sit, amet</label></form>
`);
const node = $('form').first(); const node = $('form').first();
const score = scoreNode(node); const score = scoreNode(node);
@ -79,7 +88,9 @@ describe('scoreNode(node)', () => {
}); });
it('scores a TH element with negative 5', () => { it('scores a TH element with negative 5', () => {
const $ = cheerio.load(HTML.thScoreNeg5); const $ = cheerio.load(`
<th>Lorem ipsum, dolor sit, amet</th>
`);
const node = $('th').first(); const node = $('th').first();
const score = scoreNode(node); const score = scoreNode(node);

@ -1,44 +1,36 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { scoreParagraph } from './index'; import { scoreParagraph } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreParagraph(node)', () => { describe('scoreParagraph(node)', () => {
it('returns 0 if text is less than 25 chars', () => { it('returns 0 if text is less than 25 chars', () => {
const html = '<p><em>Foo</em> bar</p>'; const $ = cheerio.load('<p><em>Foo</em> bar</p>');
const $ = cheerio.load(html); const score = scoreParagraph($('p').first());
const node = $('p').first();
const score = scoreParagraph(node);
assert.equal(score, 0); assert.equal(score, 0);
}); });
it('returns 1 if text is > 25 chars and has 0 commas', () => { it('returns 1 if text is > 25 chars and has 0 commas', () => {
const $ = cheerio.load(HTML.score1); const $ = cheerio.load('<p>Lorem ipsum dolor sit amet</p>');
const node = $('p').first(); const score = scoreParagraph($('p').first());
const score = scoreParagraph(node);
assert.equal(score, 1); assert.equal(score, 1);
}); });
it('returns 3 if text is > 25 chars and has 2 commas', () => { it('returns 3 if text is > 25 chars and has 2 commas', () => {
const $ = cheerio.load(HTML.score3); const $ = cheerio.load('<p>Lorem ipsum, dolor sit, amet</p>');
const node = $('p').first(); const score = scoreParagraph($('p').first());
const score = scoreParagraph(node);
assert.equal(score, 3); assert.equal(score, 3);
}); });
it('returns 19 if text has 15 commas, ~600 chars', () => { it('returns 19 if text has 15 commas, ~600 chars', () => {
const $ = cheerio.load(HTML.score19); const $ = cheerio.load(
const node = $('p').first(); `<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>`
);
const score = scoreParagraph(node); const score = scoreParagraph($('p').first());
assert.equal(score, 19); assert.equal(score, 19);
}); });

@ -7,10 +7,9 @@ describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => { describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => { it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>'); const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first(); const $node = $('p').first();
const newScore = 25; const newScore = 25;
$node = setScore($node, $, newScore); setScore($node, $, newScore);
const score = getScore($node); const score = getScore($node);
assert(score, newScore); assert(score, newScore);

@ -2,13 +2,18 @@ import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import moment from 'moment-timezone'; import moment from 'moment-timezone';
import HTML from './fixtures/html';
import GenericDatePublishedExtractor from './extractor'; import GenericDatePublishedExtractor from './extractor';
describe('GenericDatePublishedExtractor', () => { describe('GenericDatePublishedExtractor', () => {
describe('extract($, metaCache)', () => { describe('extract($, metaCache)', () => {
it('extracts datePublished from meta tags', () => { it('extracts datePublished from meta tags', () => {
const $ = cheerio.load(HTML.datePublishedMeta.test); const $ = cheerio.load(`
<html>
<head>
<meta name="displaydate" value="1/1/2020 8:30 (EST)" />
</head>
</html>
`);
const metaCache = ['displaydate', 'something-else']; const metaCache = ['displaydate', 'something-else'];
const result = GenericDatePublishedExtractor.extract({ const result = GenericDatePublishedExtractor.extract({
$, $,
@ -16,11 +21,19 @@ describe('GenericDatePublishedExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.datePublishedMeta.result.toISOString()); assert.equal(result, new Date('1/1/2020 8:30 (EST)').toISOString());
}); });
it('extracts datePublished from selectors', () => { it('extracts datePublished from selectors', () => {
const $ = cheerio.load(HTML.datePublishedSelectors.test); const $ = cheerio.load(`
<div>
<div class="hentry">
<div class="updated">
1/1/2020 <span class="time">8:30am</span>
</div>
</head>
</div>
`);
const metaCache = []; const metaCache = [];
const result = GenericDatePublishedExtractor.extract({ const result = GenericDatePublishedExtractor.extract({
$, $,
@ -28,7 +41,7 @@ describe('GenericDatePublishedExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.datePublishedMeta.result.toISOString()); assert.equal(result, new Date('1/1/2020 8:30 (EST)').toISOString());
}); });
it('extracts from url formatted /2012/08/01/etc', () => { it('extracts from url formatted /2012/08/01/etc', () => {

@ -1,26 +0,0 @@
const HTML = {
datePublishedMeta: {
test: `
<html>
<head>
<meta name="displaydate" value="1/1/2020 8:30 (EST)" />
</head>
</html>
`,
result: new Date('1/1/2020 8:30 (EST)'),
},
datePublishedSelectors: {
test: `
<div>
<div class="hentry">
<div class="updated">
1/1/2020 <span class="time">8:30am</span>
</div>
</head>
</div>
`,
result: new Date('1/1/2020 8:30 am (EST)'),
},
};
export default HTML;

@ -1,14 +1,18 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericLeadImageUrlExtractor from './extractor'; import GenericLeadImageUrlExtractor from './extractor';
describe('GenericLeadImageUrlExtractor', () => { describe('GenericLeadImageUrlExtractor', () => {
describe('extract({ $, content, metaCache })', () => { describe('extract({ $, content, metaCache })', () => {
it('returns og:image first', () => { it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test); const $ = cheerio.load(`
<html>
<head>
<meta name="og:image" value="http://example.com/lead.jpg">
</head>
</html>
`);
const content = $('*').first(); const content = $('*').first();
const metaCache = ['og:image']; const metaCache = ['og:image'];
@ -18,11 +22,17 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.og.result); assert.equal(result, 'http://example.com/lead.jpg');
}); });
it('returns twitter:image', () => { it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test); const $ = cheerio.load(`
<html>
<head>
<meta name="twitter:image" value="http://example.com/lead.jpg">
</head>
</html>
`);
const content = $('*').first(); const content = $('*').first();
const metaCache = ['twitter:image']; const metaCache = ['twitter:image'];
@ -32,11 +42,17 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.twitter.result); assert.equal(result, 'http://example.com/lead.jpg');
}); });
it('finds images based on scoring', () => { it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test); const $ = cheerio.load(`
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="http://example.com/upload/goodpic.jpg" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`);
const content = $('*').first(); const content = $('*').first();
const metaCache = []; const metaCache = [];
@ -46,11 +62,15 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.scoring.result); assert.equal(result, 'http://example.com/upload/goodpic.jpg');
}); });
it('returns image based on selectors', () => { it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test); const $ = cheerio.load(`
<div>
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`);
const content = $('*').first(); const content = $('*').first();
const metaCache = []; const metaCache = [];
@ -60,7 +80,7 @@ describe('GenericLeadImageUrlExtractor', () => {
metaCache, metaCache,
}); });
assert.equal(result, HTML.selectors.result); assert.equal(result, 'http://example.com/upload/goodpic.jpg');
}); });
}); });
}); });

@ -1,42 +0,0 @@
const HTML = {
og: {
test: `
<html>
<head>
<meta name="og:image" value="http://example.com/lead.jpg">
</head>
</html>
`,
result: 'http://example.com/lead.jpg',
},
twitter: {
test: `
<html>
<head>
<meta name="twitter:image" value="http://example.com/lead.jpg">
</head>
</html>
`,
result: 'http://example.com/lead.jpg',
},
scoring: {
test: `
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="http://example.com/upload/goodpic.jpg" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`,
result: 'http://example.com/upload/goodpic.jpg',
},
selectors: {
test: `
<div>
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`,
result: 'http://example.com/upload/goodpic.jpg',
},
};
export default HTML;

@ -66,15 +66,15 @@ describe('scoreAttr($img)', () => {
describe('scoreByParents($img)', () => { describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => { it('gets 25 points if it has a figure parent', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`<div> <div>
<figure> <figure>
<div> <div>
<img alt="Wow" /> <img alt="Wow" />
</div> </div>
</figure> </figure>
</div>` </div>
); `);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByParents($img), 25); assert.equal(scoreByParents($img), 25);
@ -88,15 +88,15 @@ describe('scoreByParents($img)', () => {
}); });
it('gets 15 points if parent or gparent has photo hints', () => { it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`<div> <div>
<div class="figure"> <div class="figure">
<div> <div>
<img alt="Wow" /> <img alt="Wow" />
</div>
</div> </div>
</div>` </div>
); </div>
`);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByParents($img), 15); assert.equal(scoreByParents($img), 15);
@ -105,30 +105,28 @@ describe('scoreByParents($img)', () => {
describe('scoreBySibling($img)', () => { describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => { it('gets 25 points if its sibling is figcaption', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img /> <img />
<figcaption>Wow</figcaption> <figcaption>Wow</figcaption>
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreBySibling($img), 25); assert.equal(scoreBySibling($img), 25);
}); });
it('gets 15 points if its sibling has photo hints', () => { it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`<div> <div>
<div> <div>
<img alt="Wow" /> <img alt="Wow" />
<div class="caption"> <div class="caption">
Wow Wow
</div> </div>
</div> </div>
</div>` </div>
); `);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreBySibling($img), 15); assert.equal(scoreBySibling($img), 15);
@ -137,65 +135,55 @@ describe('scoreBySibling($img)', () => {
describe('scoreByDimensions($img)', () => { describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => { it('penalizes skinny images', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img width="10" /> <img width="10" />
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50); assert.equal(scoreByDimensions($img), -50);
}); });
it('penalizes short images', () => { it('penalizes short images', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img height="10" /> <img height="10" />
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50); assert.equal(scoreByDimensions($img), -50);
}); });
it('ignores sprites', () => { it('ignores sprites', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img src="/sprite/etc/foo.png" width="1000" height="1000" /> <img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByDimensions($img), 0); assert.equal(scoreByDimensions($img), 0);
}); });
it('penalizes images with small areas', () => { it('penalizes images with small areas', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img src="/etc/foo.png" width="60" height="60" /> <img src="/etc/foo.png" width="60" height="60" />
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByDimensions($img), -100); assert.equal(scoreByDimensions($img), -100);
}); });
it('prefers the largest images', () => { it('prefers the largest images', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img src="/etc/foo.png" width="1000" height="1000" /> <img src="/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `);
);
const $img = $('img').first(); const $img = $('img').first();
assert.equal(scoreByDimensions($img), 1000); assert.equal(scoreByDimensions($img), 1000);
@ -204,8 +192,7 @@ describe('scoreByDimensions($img)', () => {
describe('scoreByPosition($imgs, index)', () => { describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => { it('gives higher scores to images that come first', () => {
const $ = cheerio.load( const $ = cheerio.load(`
`
<div> <div>
<img width="10" /> <img width="10" />
<img width="10" /> <img width="10" />
@ -214,8 +201,7 @@ describe('scoreByPosition($imgs, index)', () => {
<img width="10" /> <img width="10" />
<img width="10" /> <img width="10" />
</div> </div>
` `);
);
const $imgs = $('img'); const $imgs = $('img');
assert.equal(scoreByPosition($imgs, 0), 3); assert.equal(scoreByPosition($imgs, 0), 3);

@ -8,7 +8,6 @@ const fs = require('fs');
describe('scoreLinks(links)', () => { describe('scoreLinks(links)', () => {
it('returns an object of scored links', () => { it('returns an object of scored links', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8'); const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html); const $ = cheerio.load(html);
const links = $('a[href]').toArray(); const links = $('a[href]').toArray();
const url = const url =
@ -25,9 +24,7 @@ describe('scoreLinks(links)', () => {
}); });
it('returns null if no possible pages', () => { it('returns null if no possible pages', () => {
const html = '<div><p>Hello wow</p></div>'; const $ = cheerio.load('<div><p>Hello wow</p></div>');
const $ = cheerio.load(html);
const links = $('a[href]').toArray(); const links = $('a[href]').toArray();
const url = const url =
'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';

@ -5,30 +5,26 @@ import scoreByParents from './score-by-parents';
describe('scoreByParents($link)', () => { describe('scoreByParents($link)', () => {
it('returns 25 if parent sig looks like a page', () => { it('returns 25 if parent sig looks like a page', () => {
const html = ` const $ = cheerio.load(`
<div> <div>
<div class="next-page"> <div class="next-page">
<a href="blah">Next page</a> <a href="blah">Next page</a>
</div> </div>
</div> </div>
`; `);
const $ = cheerio.load(html);
const $link = $('a').first();
assert.equal(scoreByParents($link), 25); assert.equal(scoreByParents($('a').first()), 25);
}); });
it('returns -25 if parent sig looks like a comment', () => { it('returns -25 if parent sig looks like a comment', () => {
const html = ` const $ = cheerio.load(`
<div> <div>
<div class="comment"> <div class="comment">
<a href="blah">Next page</a> <a href="blah">Next page</a>
</div> </div>
</div> </div>
`; `);
const $ = cheerio.load(html);
const $link = $('a').first();
assert.equal(scoreByParents($link), -25); assert.equal(scoreByParents($('a').first()), -25);
}); });
}); });

@ -4,14 +4,10 @@ import scoreCapLinks from './score-cap-links';
describe('scoreCapLinks(linkData)', () => { describe('scoreCapLinks(linkData)', () => {
it('returns -65 if cap link with next link text', () => { it('returns -65 if cap link with next link text', () => {
const linkData = 'foo next Last page'; assert.equal(scoreCapLinks('foo next Last page'), -65);
assert.equal(scoreCapLinks(linkData), -65);
}); });
it('returns 0 if does not match a cap link', () => { it('returns 0 if does not match a cap link', () => {
const linkData = 'foo bar WOW GREAT'; assert.equal(scoreCapLinks('foo bar WOW GREAT'), 0);
assert.equal(scoreCapLinks(linkData), 0);
}); });
}); });

@ -4,14 +4,10 @@ import scoreExtraneousLinks from './score-extraneous-links';
describe('scoreExtraneousLinks(href)', () => { describe('scoreExtraneousLinks(href)', () => {
it('returns -25 if link matches extraneous text', () => { it('returns -25 if link matches extraneous text', () => {
const url = 'http://example.com/email-link'; assert.equal(scoreExtraneousLinks('http://example.com/email-link'), -25);
assert.equal(scoreExtraneousLinks(url), -25);
}); });
it('returns 0 if does not match extraneous text', () => { it('returns 0 if does not match extraneous text', () => {
const url = 'http://example.com/asdf'; assert.equal(scoreExtraneousLinks('http://example.com/asdf'), 0);
assert.equal(scoreExtraneousLinks(url), 0);
}); });
}); });

@ -4,14 +4,10 @@ import scoreNextLinkText from './score-next-link-text';
describe('scoreNextLinkText(linkData)', () => { describe('scoreNextLinkText(linkData)', () => {
it('returns 50 if contains common next link text', () => { it('returns 50 if contains common next link text', () => {
const linkData = 'foo bar Next page'; assert.equal(scoreNextLinkText('foo bar Next page'), 50);
assert.equal(scoreNextLinkText(linkData), 50);
}); });
it('returns 0 if does not contain common next link text', () => { it('returns 0 if does not contain common next link text', () => {
const linkData = 'foo bar WOW GREAT'; assert.equal(scoreNextLinkText('foo bar WOW GREAT'), 0);
assert.equal(scoreNextLinkText(linkData), 0);
}); });
}); });

@ -1,5 +1,4 @@
import assert from 'assert'; import assert from 'assert';
import scorePageInLink from './score-page-in-link'; import scorePageInLink from './score-page-in-link';
describe('scorePageInLink(pageNum, isWp)', () => { describe('scorePageInLink(pageNum, isWp)', () => {

@ -4,14 +4,10 @@ import scorePrevLink from './score-prev-link';
describe('scorePrevLink(linkData)', () => { describe('scorePrevLink(linkData)', () => {
it('returns -200 if link matches previous text', () => { it('returns -200 if link matches previous text', () => {
const linkData = 'foo next previous page'; assert.equal(scorePrevLink('foo next previous page'), -200);
assert.equal(scorePrevLink(linkData), -200);
}); });
it('returns 0 if does not match a prev link', () => { it('returns 0 if does not match a prev link', () => {
const linkData = 'foo bar WOW GREAT'; assert.equal(scorePrevLink('foo bar WOW GREAT'), 0);
assert.equal(scorePrevLink(linkData), 0);
}); });
}); });

@ -1,49 +1,68 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import GenericTitleExtractor from './extractor'; import GenericTitleExtractor from './extractor';
describe('GenericTitleExtractor', () => { describe('GenericTitleExtractor', () => {
describe('extract({ $, url, cachedMeta })', () => { describe('extract({ $, url, cachedMeta })', () => {
it('extracts strong meta title tags', () => { it('extracts strong meta title tags', () => {
const $ = cheerio.load(HTML.dcTitle.test); const $ = cheerio.load(`
<html>
<meta name="dc.title" value="This Is the Title Okay" />
<html>
`);
const result = GenericTitleExtractor.extract({ const result = GenericTitleExtractor.extract({
$, $,
url: '', url: '',
metaCache: ['dc.title', 'something-else'], metaCache: ['dc.title', 'something-else'],
}); });
assert.equal(result, HTML.dcTitle.result); assert.equal(result, 'This Is the Title Okay');
}); });
it('pulls title from selectors lacking string meta', () => { it('pulls title from selectors lacking string meta', () => {
const $ = cheerio.load(HTML.strongTitleSelector.test); const $ = cheerio.load(`
<html>
<article class="hentry">
<h1 class="entry-title">This Is the Title Okay</h1>
</article>
<html>
`);
const result = GenericTitleExtractor.extract({ const result = GenericTitleExtractor.extract({
$, $,
url: '', url: '',
metaCache: ['og:title', 'something-else'], metaCache: ['og:title', 'something-else'],
}); });
assert.equal(result, HTML.ogTitle.result); assert.equal(result, 'This Is the Title Okay');
}); });
it('then falls back to weak meta title tags', () => { it('then falls back to weak meta title tags', () => {
const $ = cheerio.load(HTML.ogTitle.test); const $ = cheerio.load(`
<html>
<meta name="og:title" value="This Is the Title Okay" />
<html>
`);
const result = GenericTitleExtractor.extract({ const result = GenericTitleExtractor.extract({
$, $,
url: '', url: '',
metaCache: ['og:title', 'something-else'], metaCache: ['og:title', 'something-else'],
}); });
assert.equal(result, HTML.ogTitle.result); assert.equal(result, 'This Is the Title Okay');
}); });
}); });
it('then falls back to weak selectors', () => { it('then falls back to weak selectors', () => {
const $ = cheerio.load(HTML.weakTitleSelector.test); const $ = cheerio.load(`
<html>
<head>
<title>This Is the Weak Title Okay</title>
</head>
<html>
`);
const result = GenericTitleExtractor.extract({ $, url: '', metaCache: [] }); const result = GenericTitleExtractor.extract({ $, url: '', metaCache: [] });
assert.equal(result, HTML.weakTitleSelector.result); assert.equal(result, 'This Is the Weak Title Okay');
}); });
}); });

@ -1,40 +0,0 @@
const HTML = {
dcTitle: {
test: `
<html>
<meta name="dc.title" value="This Is the Title Okay" />
<html>
`,
result: 'This Is the Title Okay',
},
ogTitle: {
test: `
<html>
<meta name="og:title" value="This Is the Title Okay" />
<html>
`,
result: 'This Is the Title Okay',
},
strongTitleSelector: {
test: `
<html>
<article class="hentry">
<h1 class="entry-title">This Is the Title Okay</h1>
</article>
<html>
`,
result: 'This Is the Title Okay',
},
weakTitleSelector: {
test: `
<html>
<head>
<title>This Is the Weak Title Okay</title>
</head>
<html>
`,
result: 'This Is the Weak Title Okay',
},
};
export default HTML;

@ -44,7 +44,8 @@ describe('cleanBySelectors($content, $, { clean })', () => {
<p>This is some good content</p> <p>This is some good content</p>
<div class="ad">Advertisement!</div> <div class="ad">Advertisement!</div>
</div> </div>
</div>`; </div>
`;
const $ = cheerio.load(html); const $ = cheerio.load(html);
let $content = $('.body'); let $content = $('.body');
@ -58,13 +59,13 @@ describe('cleanBySelectors($content, $, { clean })', () => {
describe('transformElements($content, $, { transforms })', () => { describe('transformElements($content, $, { transforms })', () => {
it('performs a simple transformation on matched elements', () => { it('performs a simple transformation on matched elements', () => {
const html = ` const html = `
<div> <div>
<div class="body"> <div class="body">
<h1>WOW BIG TITLE</h1> <h1>WOW BIG TITLE</h1>
<p>Here are some words</p> <p>Here are some words</p>
<h1>WOW BIG TITLE</h1> <h1>WOW BIG TITLE</h1>
</div>
</div> </div>
</div>
`; `;
const opts = { const opts = {
transforms: { h1: 'h2' }, transforms: { h1: 'h2' },
@ -86,17 +87,17 @@ describe('transformElements($content, $, { transforms })', () => {
it('performs a complex transformation on matched elements', () => { it('performs a complex transformation on matched elements', () => {
const html = ` const html = `
<div> <div>
<div class="body"> <div class="body">
<noscript> <noscript>
<img src="/img.jpg" /> <img src="/img.jpg" />
</noscript> </noscript>
<noscript> <noscript>
Something else Something else
</noscript> </noscript>
<p>Here are some words</p> <p>Here are some words</p>
</div>
</div> </div>
</div>
`; `;
const opts = { const opts = {
transforms: { transforms: {

@ -1,5 +1,4 @@
import assert from 'assert'; import assert from 'assert';
import { record } from 'test-helpers'; import { record } from 'test-helpers';
import Mercury from './mercury'; import Mercury from './mercury';

@ -5,9 +5,7 @@ import convertLazyLoadedImages from './convert-lazy-loaded-images';
describe('convertLazyLoadedImages($)', () => { describe('convertLazyLoadedImages($)', () => {
it('moves image links to src if placed in another attribute', () => { it('moves image links to src if placed in another attribute', () => {
const html = '<img data-src="http://example.com/foo.jpg">'; const $ = cheerio.load('<img data-src="http://example.com/foo.jpg">');
const $ = cheerio.load(html);
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal( assert.equal(
@ -17,9 +15,7 @@ describe('convertLazyLoadedImages($)', () => {
}); });
it('moves image source candidates to srcset if placed in another attribute', () => { it('moves image source candidates to srcset if placed in another attribute', () => {
const html = '<img data-srcset="http://example.com/foo.jpg 2x">'; const $ = cheerio.load('<img data-srcset="http://example.com/foo.jpg 2x">');
const $ = cheerio.load(html);
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal( assert.equal(
@ -29,10 +25,9 @@ describe('convertLazyLoadedImages($)', () => {
}); });
it('moves image source candidates containing query strings to srcset if placed in another attribute', () => { it('moves image source candidates containing query strings to srcset if placed in another attribute', () => {
const html = const $ = cheerio.load(
'<img data-srcset="http://example.com/foo.jpg?w=400 2x, http://example.com/foo.jpg?w=600 3x">'; '<img data-srcset="http://example.com/foo.jpg?w=400 2x, http://example.com/foo.jpg?w=600 3x">'
const $ = cheerio.load(html); );
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal( assert.equal(
@ -42,10 +37,9 @@ describe('convertLazyLoadedImages($)', () => {
}); });
it('properly handles src and srcset attributes', () => { it('properly handles src and srcset attributes', () => {
const html = const $ = cheerio.load(
'<img data-src="http://example.com/foo.jpg" data-srcset="http://example.com/foo.jpg 2x">'; '<img data-src="http://example.com/foo.jpg" data-srcset="http://example.com/foo.jpg 2x">'
const $ = cheerio.load(html); );
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal( assert.equal(
@ -57,37 +51,30 @@ describe('convertLazyLoadedImages($)', () => {
it('does nothing when value is not a link', () => { it('does nothing when value is not a link', () => {
// This is far from perfect, since a relative url could // This is far from perfect, since a relative url could
// be perfectly correct. // be perfectly correct.
const html = '<img data-src="foo.jpg">'; const $ = cheerio.load('<img data-src="foo.jpg">');
const $ = cheerio.load(html);
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img data-src="foo.jpg">'); assert.equal(result, '<img data-src="foo.jpg">');
}); });
it('does nothing when value is not an image', () => { it('does nothing when value is not an image', () => {
const html = '<img data-src="http://example.com">'; const $ = cheerio.load('<img data-src="http://example.com">');
const $ = cheerio.load(html);
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img data-src="http://example.com">'); assert.equal(result, '<img data-src="http://example.com">');
}); });
it('does not change a correct img with src', () => { it('does not change a correct img with src', () => {
const html = '<img src="http://example.com/foo.jpg">'; const $ = cheerio.load('<img src="http://example.com/foo.jpg">');
const $ = cheerio.load(html);
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal(result, '<img src="http://example.com/foo.jpg">'); assert.equal(result, '<img src="http://example.com/foo.jpg">');
}); });
it('does not replace an img src with srcset value', () => { it('does not replace an img src with srcset value', () => {
const html = const $ = cheerio.load(
'<img src="http://example.com/foo.jpg" srcset="http://example.com/foo2x.jpg 2x, http://example.com/foo.jpg">'; '<img src="http://example.com/foo.jpg" srcset="http://example.com/foo2x.jpg 2x, http://example.com/foo.jpg">'
const $ = cheerio.load(html); );
const result = convertLazyLoadedImages($).html(); const result = convertLazyLoadedImages($).html();
assert.equal( assert.equal(

@ -5,29 +5,25 @@ import normalizeMetaTags from './normalize-meta-tags';
describe('normalizeMetaTags($)', () => { describe('normalizeMetaTags($)', () => {
it('replaces "content" attributes with "value"', () => { it('replaces "content" attributes with "value"', () => {
const html = '<html><meta name="foo" content="bar"></html>'; // browser cheerio/jquery will remove/replace html, so result is different
const test = '<html><meta name="foo" value="bar"></html>'; const test = cheerio.browser
? '<meta name="foo" value="bar">'
// browser cheerio/jquery will remove/replace html, so result : '<html><meta name="foo" value="bar"></html>';
// is different
const testBrowser = '<meta name="foo" value="bar">';
const $ = cheerio.load(html);
const $ = cheerio.load('<html><meta name="foo" content="bar"></html>');
const result = normalizeMetaTags($).html(); const result = normalizeMetaTags($).html();
assert.equal(result, cheerio.browser ? testBrowser : test); assert.equal(result, test);
}); });
it('replaces "property" attributes with "name"', () => { it('replaces "property" attributes with "name"', () => {
const html = '<html><meta property="foo" value="bar"></html>'; const test = cheerio.browser
const test = '<html><meta value="bar" name="foo"></html>'; ? '<meta value="bar" name="foo">'
const testBrowser = '<meta value="bar" name="foo">'; : '<html><meta value="bar" name="foo"></html>';
const $ = cheerio.load(html);
const $ = cheerio.load('<html><meta property="foo" value="bar"></html>');
const result = normalizeMetaTags($).html(); const result = normalizeMetaTags($).html();
assert.equal(result, cheerio.browser ? testBrowser : test); assert.equal(result, test);
}); });
}); });

@ -2,35 +2,95 @@ import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import brsToPs from './brs-to-ps'; import brsToPs from './brs-to-ps';
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before);
assertClean(fn($).html(), HTML[key].after);
}
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('brsToPs(node)', () => { describe('brsToPs(node)', () => {
it('does nothing when no BRs present', () => { it('does nothing when no BRs present', () => {
const $ = cheerio.load(HTML.positiveId); const html = `
assert.equal(brsToPs($).html(), HTML.positiveId); <div id="entry">
<p>Ooo good one</p>
</div>
`;
assert.equal(brsToPs(cheerio.load(html)).html(), html);
}); });
it('does nothing when a single BR is present', () => { it('does nothing when a single BR is present', () => {
assertBeforeAndAfter('singleBr', brsToPs); const before = `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
}); });
it('converts double BR tags to an empty P tag', () => { it('converts double BR tags to an empty P tag', () => {
assertBeforeAndAfter('doubleBrs', brsToPs); const before = `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
}); });
it('converts several BR tags to an empty P tag', () => { it('converts several BR tags to an empty P tag', () => {
assertBeforeAndAfter('severalBrs', brsToPs); const before = `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
}); });
it('converts BR tags in a P tag into a P containing inline children', () => { it('converts BR tags in a P tag into a P containing inline children', () => {
assertBeforeAndAfter('brsInP', brsToPs); const before = `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`;
const after = `
<p>
Here is some text
<p>
Here is more text
</p></p>
`;
assertClean(brsToPs(cheerio.load(before)).html(), after);
}); });
}); });
}); });

@ -2,21 +2,42 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanAttributes } from './index'; import { cleanAttributes } from './index';
describe('cleanAttributes($)', () => { describe('cleanAttributes($)', () => {
it('removes style attributes from nodes', () => { it('removes style attributes from nodes', () => {
const $ = cheerio.load(HTML.removeStyle.before); const $ = cheerio.load(`
<div>
<p style="color: red;">What do you think?</p>
</div>
`);
const result = cleanAttributes($('*').first(), $); const result = cleanAttributes($('*').first(), $);
assertClean($.html(result), HTML.removeStyle.after); assertClean(
$.html(result),
`
<div>
<p>What do you think?</p>
</div>
`
);
}); });
it('removes align attributes from nodes', () => { it('removes align attributes from nodes', () => {
const $ = cheerio.load(HTML.removeAlign.before); const $ = cheerio.load(`
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`);
const result = cleanAttributes($('*').first(), $); const result = cleanAttributes($('*').first(), $);
assertClean($.html(result), HTML.removeAlign.after); assertClean(
$.html(result),
`
<div>
<p>What do you think?</p>
</div>
`
);
}); });
}); });

@ -2,21 +2,52 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanHOnes } from './index'; import { cleanHOnes } from './index';
describe('cleanHOnes($)', () => { describe('cleanHOnes($)', () => {
it('removes H1s if there are less than 3 of them', () => { it('removes H1s if there are less than 3 of them', () => {
const $ = cheerio.load(HTML.removeTwoHOnes.before); const $ = cheerio.load(`
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`);
const result = cleanHOnes($('*').first(), $); const result = cleanHOnes($('*').first(), $);
assertClean(result.html(), HTML.removeTwoHOnes.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
}); });
it('converts H1s to H2s if there are 3 or more of them', () => { it('converts H1s to H2s if there are 3 or more of them', () => {
const $ = cheerio.load(HTML.convertThreeHOnes.before); const $ = cheerio.load(`
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`);
const result = cleanHOnes($('*').first(), $); const result = cleanHOnes($('*').first(), $);
assertClean(result.html(), HTML.convertThreeHOnes.after); assertClean(
result.html(),
`
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`
);
}); });
}); });

@ -2,28 +2,71 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanHeaders } from './index'; import { cleanHeaders } from './index';
describe('cleanHeaders(article, $)', () => { describe('cleanHeaders(article, $)', () => {
it('parses html and returns the article', () => { it('parses html and returns the article', () => {
const $ = cheerio.load(HTML.cleanFirstHeds.before); const $ = cheerio.load(`
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $); const result = cleanHeaders($('*').first(), $);
assertClean(result.html(), HTML.cleanFirstHeds.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`
);
}); });
it('removes headers when the header text matches the title', () => { it('removes headers when the header text matches the title', () => {
const $ = cheerio.load(HTML.cleanTitleMatch.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $, 'Title Match'); const result = cleanHeaders($('*').first(), $, 'Title Match');
assertClean(result.html(), HTML.cleanTitleMatch.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
}); });
it('removes headers with a negative weight', () => { it('removes headers with a negative weight', () => {
const $ = cheerio.load(HTML.dropWithNegativeWeight.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`);
const result = cleanHeaders($('*').first(), $); const result = cleanHeaders($('*').first(), $);
assertClean(result.html(), HTML.dropWithNegativeWeight.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
}); });
}); });

@ -2,28 +2,64 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanImages } from './index'; import { cleanImages } from './index';
describe('cleanImages($)', () => { describe('cleanImages($)', () => {
it('removes images with small heights/widths', () => { it('removes images with small heights/widths', () => {
const $ = cheerio.load(HTML.cleanSmallImages.before); const $ = cheerio.load(`
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`);
const result = cleanImages($('*').first(), $); const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanSmallImages.after); assertClean(
result.html(),
`
<div>
<img width="50">
</div>
`
);
}); });
it('removes height attribute from images that remain', () => { it('removes height attribute from images that remain', () => {
const $ = cheerio.load(HTML.cleanHeight.before); const $ = cheerio.load(`
<div>
<img width="50" height="50" />
</div>
`);
const result = cleanImages($('*').first(), $); const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanHeight.after); assertClean(
result.html(),
`
<div>
<img width="50">
</div>
`
);
}); });
it('removes spacer/transparent images', () => { it('removes spacer/transparent images', () => {
const $ = cheerio.load(HTML.cleanSpacer.before); const $ = cheerio.load(`
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`);
const result = cleanImages($('*').first(), $); const result = cleanImages($('*').first(), $);
assertClean(result.html(), HTML.cleanSpacer.after); assertClean(
result.html(),
`
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`
);
}); });
}); });

@ -2,12 +2,22 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { cleanTags } from './index'; import { cleanTags } from './index';
describe('cleanTags($)', () => { describe('cleanTags($)', () => {
it('drops a matching node with a negative score', () => { it('drops a matching node with a negative score', () => {
const $ = cheerio.load(HTML.dropNegativeScore.before); const $ = cheerio.load(`
<div score="5">
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
// again small adjustments for cheerio vs. jquery implementation quirks // again small adjustments for cheerio vs. jquery implementation quirks
@ -15,58 +25,231 @@ describe('cleanTags($)', () => {
assertClean( assertClean(
result.html(), result.html(),
cheerio.browser cheerio.browser
? HTML.dropNegativeScore.afterBrowser ? `
: HTML.dropNegativeScore.after <div score="5">
<p>What do you think?</p>
<p>
</p>
<p></p>
<p>What do you think?</p>
</div>
`
: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`
); );
}); });
it('removes a node with too many inputs', () => { it('removes a node with too many inputs', () => {
const $ = cheerio.load(HTML.removeTooManyInputs.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score')); $('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.removeTooManyInputs.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
);
}); });
it('removes a div with no images and very little text', () => { it('removes a div with no images and very little text', () => {
const $ = cheerio.load(HTML.removeShortNoImg.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score')); $('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.removeShortNoImg.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`
);
}); });
it('removes a node with a link density that is too high', () => { it('removes a node with a link density that is too high', () => {
const $ = cheerio.load(HTML.linkDensityHigh.before); const $ = cheerio.load(`
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score')); $('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.linkDensityHigh.after); assertClean(
result.html(),
`
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
);
}); });
it('removes a node with a good score but link density > 0.5', () => { it('removes a node with a good score but link density > 0.5', () => {
const $ = cheerio.load(HTML.linkDensityHigh.before); const $ = cheerio.load(`
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
$('[score]').each((i, e) => $(e).removeAttr('score')); $('[score]').each((i, e) => $(e).removeAttr('score'));
assertClean(result.html(), HTML.linkDensityHigh.after); assertClean(
result.html(),
`
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
);
}); });
it('keeps node with a good score but link density > 0.5 if preceding text ends in colon', () => { it('keeps node with a good score but link density > 0.5 if preceding text ends in colon', () => {
const $ = cheerio.load(HTML.previousEndsInColon.before); const html = `
<div score="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`;
const $ = cheerio.load(html);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
assertClean(result.html(), HTML.previousEndsInColon.before); assertClean(result.html(), html);
}); });
it('keeps anything with a class of entry-content-asset', () => { it('keeps anything with a class of entry-content-asset', () => {
const $ = cheerio.load(HTML.cleanEntryContentAsset.before); const html = `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`;
const $ = cheerio.load(html);
const result = cleanTags($('*').first(), $); const result = cleanTags($('*').first(), $);
assertClean(result.html(), HTML.cleanEntryContentAsset.before); assertClean(result.html(), html);
}); });
}); });

@ -5,8 +5,7 @@ import convertNodeTo from './convert-node-to';
describe('convertNodeTo(node, $)', () => { describe('convertNodeTo(node, $)', () => {
it('takes a node and converts it to a diff tag', () => { it('takes a node and converts it to a diff tag', () => {
const html = '<div>Should become a p</div>'; const $ = cheerio.load('<div>Should become a p</div>');
const $ = cheerio.load(html);
const node = $('div').first(); const node = $('div').first();
const result = convertNodeTo(node, $).html(); const result = convertNodeTo(node, $).html();
@ -16,8 +15,9 @@ describe('convertNodeTo(node, $)', () => {
}); });
it('retains attributes on conversion', () => { it('retains attributes on conversion', () => {
const html = '<span class="foo" score="100">Should keep its attrs</span>'; const $ = cheerio.load(
const $ = cheerio.load(html); '<span class="foo" score="100">Should keep its attrs</span>'
);
const node = $('span').first(); const node = $('span').first();
const result = convertNodeTo(node, $, 'div').html(); const result = convertNodeTo(node, $, 'div').html();
@ -42,13 +42,14 @@ describe('convertNodeTo(node, $)', () => {
// transforms on the noscript tag (commonly used for lazy-loading) don't work // transforms on the noscript tag (commonly used for lazy-loading) don't work
// as expected. This test case handles that // as expected. This test case handles that
it('handles noscript tags in the browser', () => { it('handles noscript tags in the browser', () => {
const html = '<noscript><img src="http://example.com" /></noscript>'; const $ = cheerio.load(
const resultHtml = '<figure><img src="http://example.com"></figure>'; '<noscript><img src="http://example.com" /></noscript>'
const $ = cheerio.load(html); );
const node = $('noscript'); const node = $('noscript');
const result = convertNodeTo(node, $, 'figure', 'noscript').html(); const result = convertNodeTo(node, $, 'figure', 'noscript').html();
const resultHtml = '<figure><img src="http://example.com"></figure>';
assert.equal(result, resultHtml); assert.equal(result, resultHtml);
}); });
}); });

@ -1,21 +1,37 @@
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import convertToParagraphs from './convert-to-paragraphs'; import convertToParagraphs from './convert-to-paragraphs';
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before);
assertClean(fn($).html(), HTML[key].after);
}
describe('convertToParagraphs($)', () => { describe('convertToParagraphs($)', () => {
it('performs simple conversions', () => { it('performs simple conversions', () => {
// Skipping this one in the browser. It works, but since the browser wraps // Skipping this one in the browser. It works, but since the browser wraps
// elements in a div, the last span conversion won't work as expected. // elements in a div, the last span conversion won't work as expected.
if (!cheerio.browser) { if (!cheerio.browser) {
assertBeforeAndAfter('convertToParagraphs', convertToParagraphs); const before = `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`;
const after = `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`;
assertClean(convertToParagraphs(cheerio.load(before)).html(), after);
} }
}); });
@ -29,7 +45,7 @@ describe('convertToParagraphs($)', () => {
</div> </div>
</div> </div>
`; `;
const $ = cheerio.load(html);
assertClean(convertToParagraphs($).html(), html); assertClean(convertToParagraphs(cheerio.load(html)).html(), html);
}); });
}); });

@ -1,28 +1,41 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/extract-from-selectors';
import { extractFromMeta } from './index'; import { extractFromMeta } from './index';
describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => { describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
it('extracts an arbitrary meta tag by name', () => { it('extracts an arbitrary meta tag by name', () => {
const $ = cheerio.load(HTML.metaFoo.test); const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']); const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaFoo.result); assert.equal(result, 'bar');
}); });
it('returns nothing if a meta name is duplicated', () => { it('returns nothing if a meta name is duplicated', () => {
const $ = cheerio.load(HTML.metaDupes.test); const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="baz" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']); const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaDupes.result); assert.equal(result, null);
}); });
it('ignores duplicate meta names with empty values', () => { it('ignores duplicate meta names with empty values', () => {
const $ = cheerio.load(HTML.metaEmptyDupes.test); const $ = cheerio.load(`
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="" />
</html>
`);
const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']); const result = extractFromMeta($, ['foo', 'baz'], ['foo', 'bat']);
assert.equal(result, HTML.metaEmptyDupes.result); assert.equal(result, 'bar');
}); });
}); });

@ -1,35 +1,55 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/extract-from-selectors';
import extractFromSelectors from './extract-from-selectors'; import extractFromSelectors from './extract-from-selectors';
describe('extractFromSelectors($, selectors, maxChildren, textOnly)', () => { describe('extractFromSelectors($, selectors, maxChildren, textOnly)', () => {
it('extracts an arbitrary node by selector', () => { it('extracts an arbitrary node by selector', () => {
const $ = cheerio.load(HTML.simpleSelector.test); const $ = cheerio.load(`
const result = extractFromSelectors($, ['.author']); <html>
<div class="author">Adam</div>
</html>
`);
assert.equal(result, HTML.simpleSelector.result); assert.equal(extractFromSelectors($, ['.author']), 'Adam');
}); });
it('ignores comments', () => { it('ignores comments', () => {
const $ = cheerio.load(HTML.insideComment.test); const $ = cheerio.load(`
const result = extractFromSelectors($, ['.author']); <html>
<div class="comments-section">
assert.equal(result, HTML.insideComment.result); <div class="author">Adam</div>
</div>
</html>`);
assert.equal(extractFromSelectors($, ['.author']), null);
}); });
it('skips a selector if it matches multiple nodes', () => { it('skips a selector if it matches multiple nodes', () => {
const $ = cheerio.load(HTML.multiMatch.test); const $ = cheerio.load(`
const result = extractFromSelectors($, ['.author']); <html>
<div>
assert.equal(result, HTML.multiMatch.result); <div class="author">Adam</div>
<div class="author">Adam</div>
</div>
</html>
`);
assert.equal(extractFromSelectors($, ['.author']), null);
}); });
it('skips a node with too many children', () => { it('skips a node with too many children', () => {
const $ = cheerio.load(HTML.manyChildren.test); const $ = cheerio.load(`
const result = extractFromSelectors($, ['.author']); <html>
<div>
assert.equal(result, HTML.manyChildren.result); <div class="author">
<span>Adam</span>
<span>Pash</span>
</div>
</div>
</html>
`);
assert.equal(extractFromSelectors($, ['.author']), null);
}); });
}); });

@ -1,75 +0,0 @@
const HTML = {
// extractFromMeta
metaFoo: {
test: `
<html>
<meta name="foo" value="bar" />
</html>`,
result: 'bar',
},
metaDupes: {
test: `
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="baz" />
</html>`,
result: null,
},
metaEmptyDupes: {
test: `
<html>
<meta name="foo" value="bar" />
<meta name="foo" value="" />
</html>`,
result: 'bar',
},
custom: {
test: `
<html>
<meta property="foo" content="bar" />
</html>`,
result: 'bar',
},
// extractFromSelectors
simpleSelector: {
test: `
<html>
<div class="author">Adam</div>
</html>`,
result: 'Adam',
},
insideComment: {
test: `
<html>
<div class="comments-section">
<div class="author">Adam</div>
</div>
</html>`,
result: null,
},
multiMatch: {
test: `
<html>
<div>
<div class="author">Adam</div>
<div class="author">Adam</div>
</div>
</html>`,
result: null,
},
manyChildren: {
test: `
<html>
<div>
<div class="author">
<span>Adam</span>
<span>Pash</span>
</div>
</div>
</html>`,
result: null,
},
};
export default HTML;

@ -1,714 +0,0 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
ignoresKeepable: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<iframe class="mercury-parser-keep" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
<iframe class="" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
</div>
`,
},
// markToKeep
marksYouTube: {
before: `
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615"></iframe>
</div>
`,
after: `
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen class="mercury-parser-keep"></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615" class="mercury-parser-keep"></iframe>
</div>
`,
},
// stripHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
dropNegativeScore: {
before: `
<div score="5">
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
afterBrowser: `
<div score="5">
<p>What do you think?</p>
<p>
</p>
<p></p>
<p>What do you think?</p>
</div>
`,
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
previousEndsInColon: {
before: `
<div score="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,16 +0,0 @@
const HTML = {
tooShort: `
<div class="foo bar">
<p>This is too short</p>
</div>
`,
longEnough: `
<div class="foo bar">
<p>
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
</p>
</div>
`,
};
export default HTML;

@ -1,13 +1,13 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { linkDensity } from './index'; import { linkDensity } from './index';
describe('linkDensity($)', () => { describe('linkDensity($)', () => {
it('returns 0.5 if half of the text is a link', () => { it('returns 0.5 if half of the text is a link', () => {
const $ = cheerio.load(HTML.linkDensity5); const $ = cheerio.load(`
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`);
const density = linkDensity($('div').first(), $); const density = linkDensity($('div').first(), $);
@ -15,7 +15,9 @@ describe('linkDensity($)', () => {
}); });
it('returns 1 if all of the text is a link', () => { it('returns 1 if all of the text is a link', () => {
const $ = cheerio.load(HTML.linkDensity1); const $ = cheerio.load(`
<div><p><a href="">Some text!</a></p></div>
`);
const density = linkDensity($('div').first(), $); const density = linkDensity($('div').first(), $);
@ -23,7 +25,9 @@ describe('linkDensity($)', () => {
}); });
it("returns 0 if there's no text", () => { it("returns 0 if there's no text", () => {
const $ = cheerio.load(HTML.linkDensity0); const $ = cheerio.load(`
<div><p><a href=""></a></p></div>
`);
const density = linkDensity($('div').first()); const density = linkDensity($('div').first());

@ -1,60 +1,53 @@
import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import makeLinksAbsolute from './make-links-absolute'; import makeLinksAbsolute from './make-links-absolute';
describe('makeLinksAbsolute($)', () => { describe('makeLinksAbsolute($)', () => {
it('makes relative #hrefs absolute', () => { it('makes relative #hrefs absolute', () => {
const html = '<div><a href="#foo">bar</a></div>'; const $ = cheerio.load('<div><a href="#foo">bar</a></div>');
const $ = cheerio.load(html);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')); const result = $.html(makeLinksAbsolute($content, $, 'http://example.com'));
assert.equal( assertClean(result, '<div><a href="http://example.com/#foo">bar</a></div>');
result,
'<div><a href="http://example.com/#foo">bar</a></div>'
);
}); });
it('makes relative ./relative paths absolute', () => { it('makes relative ./relative paths absolute', () => {
const html = '<div><a href="foo/bar">bar</a></div>'; const $ = cheerio.load('<div><a href="foo/bar">bar</a></div>');
const $ = cheerio.load(html);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com/baz/bat') makeLinksAbsolute($content, $, 'http://example.com/baz/bat')
); );
assert.equal( assertClean(
result, result,
'<div><a href="http://example.com/baz/foo/bar">bar</a></div>' '<div><a href="http://example.com/baz/foo/bar">bar</a></div>'
); );
}); });
it('makes relative /root/paths absolute', () => { it('makes relative /root/paths absolute', () => {
const html = '<div><a href="/foo/bar">bar</a></div>'; const $ = cheerio.load('<div><a href="/foo/bar">bar</a></div>');
const $ = cheerio.load(html);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com/baz/bat') makeLinksAbsolute($content, $, 'http://example.com/baz/bat')
); );
assert.equal( assertClean(
result, result,
'<div><a href="http://example.com/foo/bar">bar</a></div>' '<div><a href="http://example.com/foo/bar">bar</a></div>'
); );
}); });
it('makes relative srcs absolute', () => { it('makes relative srcs absolute', () => {
const html = '<div><img src="#foo"></div>'; const $ = cheerio.load('<div><img src="#foo"></div>');
const $ = cheerio.load(html);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')); const result = $.html(makeLinksAbsolute($content, $, 'http://example.com'));
assert.equal(result, '<div><img src="http://example.com/#foo"></div>'); assertClean(result, '<div><img src="http://example.com/#foo"></div>');
}); });
describe('makes relative srcsets absolute', () => { describe('makes relative srcsets absolute', () => {
@ -80,50 +73,54 @@ describe('makeLinksAbsolute($)', () => {
* assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x, * assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,
* assets/images/rhythm/240@3x.jpg 3x * assets/images/rhythm/240@3x.jpg 3x
*/ */
const html = `<div> const $ = cheerio.load(`
<picture> <div>
<source srcset="assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)"> <picture>
<source srcset="assets/images/rhythm/120@2x.jpg 2x, assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)"> <source srcset="assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)"> <source srcset="assets/images/rhythm/120@2x.jpg 2x, assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<img src="assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm"> <source srcset="assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x,assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
</picture> <img src="assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</div>`; </picture>
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `
<picture> <div>
<source srcset="http://example.com/assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)"> <picture>
<source srcset="http://example.com/assets/images/rhythm/120@2x.jpg 2x, http://example.com/assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)"> <source srcset="http://example.com/assets/images/rhythm/076.jpg,assets/images/rhythm/076@2x.jpg 2x" media="(max-width: 450px)">
<source srcset="http://example.com/assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x, http://example.com/assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)"> <source srcset="http://example.com/assets/images/rhythm/120@2x.jpg 2x, http://example.com/assets/images/rhythm/120.jpg,assets/images/rhythm/120@3x.jpg 3x" media="(max-width: 900px)">
<img src="http://example.com/assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm"> <source srcset="http://example.com/assets/images/rhythm/240.jpg,assets/images/rhythm/240@2x.jpg 2x, http://example.com/assets/images/rhythm/240@3x.jpg 3x" media="(min-width: 901px)">
</picture> <img src="http://example.com/assets/images/rhythm/120.jpg" alt="Vertical and horizontal rhythm">
</div>` </picture>
</div>
`
); );
}); });
it('does nothing when the srcset is empty or just whitespace', () => { it('does nothing when the srcset is empty or just whitespace', () => {
const html = `<div> const $ = cheerio.load(`
<picture> <div>
<source srcset="" media="(max-width: 450px)"> <picture>
<source srcset=" "> <source srcset="" media="(max-width: 450px)">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm"> <source srcset=" ">
</picture> <img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</div>`; </picture>
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `<div>
<picture> <picture>
@ -136,21 +133,22 @@ describe('makeLinksAbsolute($)', () => {
}); });
it('handles comma separated (with whitespace) srcset files with device-pixel-ratio descriptors', () => { it('handles comma separated (with whitespace) srcset files with device-pixel-ratio descriptors', () => {
const html = `<div> const $ = cheerio.load(`
<picture> <div>
<source srcset="assets/images/rhythm/076.jpg 2x, assets/images/rhythm/076.jpg" media="(max-width: 450px)"> <picture>
<source srcset="assets/images/rhythm/076@2x.jpg 2x, assets/images/rhythm/076.jpg"> <source srcset="assets/images/rhythm/076.jpg 2x, assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm"> <source srcset="assets/images/rhythm/076@2x.jpg 2x, assets/images/rhythm/076.jpg">
</picture> <img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</div>`; </picture>
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `<div>
<picture> <picture>
@ -163,88 +161,100 @@ describe('makeLinksAbsolute($)', () => {
}); });
it('handles comma separated (without whitespace) srcset files with device-pixel-ratio descriptors', () => { it('handles comma separated (without whitespace) srcset files with device-pixel-ratio descriptors', () => {
const html = `<div> const $ = cheerio.load(`
<picture> <div>
<source srcset="assets/images/rhythm/076.jpg 2x,assets/images/rhythm/076.jpg" media="(max-width: 450px)"> <picture>
<source srcset="assets/images/rhythm/076@2x.jpg 2x,assets/images/rhythm/076.jpg"> <source srcset="assets/images/rhythm/076.jpg 2x,assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm"> <source srcset="assets/images/rhythm/076@2x.jpg 2x,assets/images/rhythm/076.jpg">
</picture> <img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</div>`; </picture>
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `
<picture> <div>
<source srcset="http://example.com/assets/images/rhythm/076.jpg 2x, http://example.com/assets/images/rhythm/076.jpg" media="(max-width: 450px)"> <picture>
<source srcset="http://example.com/assets/images/rhythm/076@2x.jpg 2x, http://example.com/assets/images/rhythm/076.jpg"> <source srcset="http://example.com/assets/images/rhythm/076.jpg 2x, http://example.com/assets/images/rhythm/076.jpg" media="(max-width: 450px)">
<img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm"> <source srcset="http://example.com/assets/images/rhythm/076@2x.jpg 2x, http://example.com/assets/images/rhythm/076.jpg">
</picture> <img src="http://example.com/assets/images/rhythm/076.jpg" alt="Vertical and horizontal rhythm">
</div>` </picture>
</div>
`
); );
}); });
it('handles comma separated (with whitespace) srcset files with width descriptors', () => { it('handles comma separated (with whitespace) srcset files with width descriptors', () => {
const html = `<div> const $ = cheerio.load(`
<img srcset="elva-fairy-320w.jpg 320w, elva-fairy-480w.jpg 480w, elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy"> <div>
</div>`; <img srcset="elva-fairy-320w.jpg 320w, elva-fairy-480w.jpg 480w, elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy"> <div>
</div>` <img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`
); );
}); });
it('handles multiline comma separated srcset files with width descriptors', () => { it('handles multiline comma separated srcset files with width descriptors', () => {
const html = `<div> const $ = cheerio.load(`
<img srcset="elva-fairy-320w.jpg 320w, <div>
elva-fairy-480w.jpg 480w, <img srcset="elva-fairy-320w.jpg 320w,
elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy"> elva-fairy-480w.jpg 480w,
</div>`; elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'http://example.com') makeLinksAbsolute($content, $, 'http://example.com')
); );
assert.equal( assertClean(
result, result,
`<div> `
<img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy"> <div>
</div>` <img srcset="http://example.com/elva-fairy-320w.jpg 320w, http://example.com/elva-fairy-480w.jpg 480w, http://example.com/elva-fairy-800w.jpg 800w" sizes="(max-width: 320px) 280px, (max-width: 480px) 440px, 800px" src="http://example.com/elva-fairy-800w.jpg" alt="Elva dressed as a fairy">
</div>
`
); );
}); });
it('handles URLs that contain a comma', () => { it('handles URLs that contain a comma', () => {
const html = `<div> const $ = cheerio.load(`
<picture><source media="(min-width: 768px)" srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"/><source srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"/><img src="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg" /></picture> <div>
</div>`; <picture><source media="(min-width: 768px)" srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"/><source srcset="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"/><img src="cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg" /></picture>
const $ = cheerio.load(html); </div>
`);
const $content = $('*').first(); const $content = $('*').first();
const result = $.html( const result = $.html(
makeLinksAbsolute($content, $, 'https://media.newyorker.com/') makeLinksAbsolute($content, $, 'https://media.newyorker.com/')
); );
assert.equal( assertClean(
result, result,
`<div> `
<picture><source media="(min-width: 768px)" srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"><source srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"><img src="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg"></picture> <div>
</div>` <picture><source media="(min-width: 768px)" srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_280,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_560,c_limit/181022_a22232.jpg 2x"><source srcset="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg, https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_1454,c_limit/181022_a22232.jpg 2x"><img src="https://media.newyorker.com/cartoons/5bbfca021e40b62d6cc418ea/master/w_727,c_limit/181022_a22232.jpg"></picture>
</div>
`
); );
}); });
}); });

@ -3,28 +3,43 @@ import assert from 'assert';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { markToKeep } from './index'; import { markToKeep } from './index';
import { KEEP_CLASS } from './constants'; import { KEEP_CLASS } from './constants';
describe('markToKeep($)', () => { describe('markToKeep($)', () => {
it('marks elements that should be kept', () => { it('marks elements that should be kept', () => {
const $ = cheerio.load(HTML.marksYouTube.before); const $ = cheerio.load(`
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615"></iframe>
</div>
`);
const result = markToKeep($('*').first(), $); const result = markToKeep($('*').first(), $);
assert.equal(result('iframe.mercury-parser-keep').length, 2); assert.equal(result('iframe.mercury-parser-keep').length, 2);
if (!$.browser) { if (!$.browser) {
assertClean(result.html(), HTML.marksYouTube.after); assertClean(
result.html(),
`
<div>
<p>What an article</p>
<iframe src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen class="mercury-parser-keep"></iframe>
<iframe src="foo" frameborder="0" allowfullscreen></iframe>
<iframe src="https://player.vimeo.com/video/57712615" class="mercury-parser-keep"></iframe>
</div>
`
);
} }
}); });
it('marks same-domain elements to keep', () => { it('marks same-domain elements to keep', () => {
const html = const $ = cheerio.load(
'<div><iframe src="https://medium.com/foo/bar"></iframe></div>'; '<div><iframe src="https://medium.com/foo/bar"></iframe></div>'
const $ = cheerio.load(html); );
const result = markToKeep($('*').first(), $, 'https://medium.com/foo'); const result = markToKeep($('*').first(), $, 'https://medium.com/foo');
const keptHtml = `<div><iframe src="https://medium.com/foo/bar" class="${KEEP_CLASS}"></iframe></div>`; const keptHtml = `<div><iframe src="https://medium.com/foo/bar" class="${KEEP_CLASS}"></iframe></div>`;

@ -1,21 +1,30 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/node-is-sufficient';
import nodeIsSufficient from './node-is-sufficient'; import nodeIsSufficient from './node-is-sufficient';
describe('Utils', () => { describe('Utils', () => {
describe('nodeIsSufficient(node)', () => { describe('nodeIsSufficient(node)', () => {
it('returns false if node text length < 100 chars', () => { it('returns false if node text length < 100 chars', () => {
const $ = cheerio.load(HTML.tooShort); const $ = cheerio.load(`
const sufficient = nodeIsSufficient($.root()); <div class="foo bar">
assert.equal(sufficient, false); <p>This is too short</p>
</div>
`);
assert.equal(nodeIsSufficient($.root()), false);
}); });
it('returns true if node text length > 100 chars', () => { it('returns true if node text length > 100 chars', () => {
const $ = cheerio.load(HTML.longEnough); const $ = cheerio.load(`
const sufficient = nodeIsSufficient($.root()); <div class="foo bar">
assert.equal(sufficient, true); <p>
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
</p>
</div>
`);
assert.equal(nodeIsSufficient($.root()), true);
}); });
}); });
}); });

@ -2,23 +2,46 @@ import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import { clean } from 'test-helpers'; import { clean } from 'test-helpers';
import HTML from './fixtures/html';
import { paragraphize } from './index'; import { paragraphize } from './index';
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('paragraphize(node)', () => { describe('paragraphize(node)', () => {
it('conversts a BR into P and moves inline contents to P tag after current parent', () => { it('conversts a BR into P and moves inline contents to P tag after current parent', () => {
const $ = cheerio.load(HTML.paragraphize.before); const $ = cheerio.load(`
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`);
const node = $('br').get(0); const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere // note: result here is not valid html; will handle elsewhere
const result = paragraphize(node, $, true).html(); const result = paragraphize(node, $, true).html();
assert.equal(clean(result), clean(HTML.paragraphize.after)); assert.equal(
clean(result),
clean(`
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`)
);
}); });
it('conversts a BR into P and stops when block element hit', () => { it('converts a BR into P and stops when block element hit', () => {
const $ = cheerio.load(HTML.paragraphizeBlock.before); const $ = cheerio.load(`
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`);
const node = $('br').get(0); const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere // note: result here is not valid html; will handle elsewhere
@ -30,7 +53,17 @@ describe('Generic Extractor Utils', () => {
'<p> Here is some text <p> Here is more text </p></p><div>And also this</div> <p></p>'; '<p> Here is some text <p> Here is more text </p></p><div>And also this</div> <p></p>';
assert.equal(clean(result), html); assert.equal(clean(result), html);
} else { } else {
assert.equal(clean(result), clean(HTML.paragraphizeBlock.after)); assert.equal(
clean(result),
clean(`
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`)
);
} }
}); });
}); });

@ -2,15 +2,26 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { removeEmpty } from './index'; import { removeEmpty } from './index';
describe('removeEmpty($)', () => { describe('removeEmpty($)', () => {
it('removes empty P tags', () => { it('removes empty P tags', () => {
const $ = cheerio.load(HTML.removeEmptyP.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p></p>
</div>
`);
const result = removeEmpty($('*').first(), $); const result = removeEmpty($('*').first(), $);
assertClean(result.html(), HTML.removeEmptyP.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
}); });
it('removes P tags with only space', () => { it('removes P tags with only space', () => {
@ -22,10 +33,22 @@ describe('removeEmpty($)', () => {
}); });
it('does not remove empty DIV tags', () => { it('does not remove empty DIV tags', () => {
const $ = cheerio.load(HTML.removeEmptyP.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<p></p>
</div>
`);
const result = removeEmpty($('*').first(), $); const result = removeEmpty($('*').first(), $);
assertClean(result.html(), HTML.removeEmptyP.after); assertClean(
result.html(),
`
<div>
<p>What do you think?</p>
</div>
`
);
}); });
it('does not remove empty p tags containing an iframe', () => { it('does not remove empty p tags containing an iframe', () => {

@ -3,19 +3,25 @@ import assert from 'assert';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import rewriteTopLevel from './rewrite-top-level'; import rewriteTopLevel from './rewrite-top-level';
describe('rewriteTopLevel(node, $)', () => { describe('rewriteTopLevel(node, $)', () => {
it('turns html and body tags into divs', () => { it('turns html and body tags into divs', () => {
const $ = cheerio.load(HTML.rewriteHTMLBody.before); const $ = cheerio.load(`
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`);
const result = rewriteTopLevel($('html').first(), $); const result = rewriteTopLevel($('html').first(), $);
assert.equal(result('html').length, 0); assert.equal(result('html').length, 0);
assert.equal(result('body').length, 0); assert.equal(result('body').length, 0);
if (!cheerio.browser) { if (!cheerio.browser) {
assertClean(result.html(), HTML.rewriteHTMLBody.after); assertClean(
result.html(),
`
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`
);
} }
}); });
}); });

@ -3,19 +3,44 @@ import assert from 'assert';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import { stripJunkTags } from './index'; import { stripJunkTags } from './index';
describe('stripJunkTags($)', () => { describe('stripJunkTags($)', () => {
it('strips script and other junk tags', () => { it('strips script and other junk tags', () => {
const $ = cheerio.load(HTML.stripsJunk.before); const $ = cheerio.load(`
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`);
const result = stripJunkTags($('*').first(), $); const result = stripJunkTags($('*').first(), $);
assertClean(result.html(), HTML.stripsJunk.after); assertClean(
result.html(),
`
<div>
<p>What an article</p>
</div>
`
);
}); });
it('keeps youtube embeds', () => { it('keeps youtube embeds', () => {
let $ = cheerio.load(HTML.ignoresKeepable.before); let $ = cheerio.load(`
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<iframe class="mercury-parser-keep" src="https://www.youtube.com/embed/_2AqQV8wDvY" frameborder="0" allowfullscreen></iframe>
<hr />
</div>
`);
$ = stripJunkTags($('*').first(), $); $ = stripJunkTags($('*').first(), $);
assert.equal($('iframe[src^="https://www.youtube.com"]').length, 1); assert.equal($('iframe[src^="https://www.youtube.com"]').length, 1);

@ -2,32 +2,71 @@ import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import { assertClean } from 'test-helpers'; import { assertClean } from 'test-helpers';
import HTML from './fixtures/html';
import stripUnlikelyCandidates from './strip-unlikely-candidates'; import stripUnlikelyCandidates from './strip-unlikely-candidates';
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before);
assertClean(fn($).html(), HTML[key].after);
}
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('stripUnlikelyCandidates(node)', () => { describe('stripUnlikelyCandidates(node)', () => {
it('returns original doc if no matches found', () => { it('returns original doc if no matches found', () => {
const $ = cheerio.load(HTML.noMatches); const html = `
const stripped = stripUnlikelyCandidates($); <div id="foo">
assert.equal(stripped.html(), HTML.noMatches); <p>Ooo good one</p>
</div>
`;
const stripped = stripUnlikelyCandidates(cheerio.load(html));
assert.equal(stripped.html(), html);
}); });
it('strips unlikely matches from the doc', () => { it('strips unlikely matches from the doc', () => {
assertBeforeAndAfter('whitelistMatch', stripUnlikelyCandidates); const before = `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article">
<p>Ooo good one</p>
</div>
`;
assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
}); });
it('keeps likely matches even when they also match the blacklist', () => { it('keeps likely matches even when they also match the blacklist', () => {
assertBeforeAndAfter('whiteAndBlack', stripUnlikelyCandidates); const before = `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`;
const after = `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`;
assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
}); });
it('removed likely matches when inside blacklist node', () => { it('removed likely matches when inside blacklist node', () => {
assertBeforeAndAfter('whiteInsideBlack', stripUnlikelyCandidates); const before = `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`;
const after = `
<div>
<div>Something unrelated</div>
</div>
`;
assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
}); });
}); });
}); });

@ -5,29 +5,35 @@ import withinComment from './within-comment';
describe('withinComment(node)', () => { describe('withinComment(node)', () => {
it('returns false if its parent is not a comment', () => { it('returns false if its parent is not a comment', () => {
const $ = cheerio.load(`<div> const $ = cheerio.load(`
<div> <div>
<div class="author">Adam</div> <div>
</div> <div class="author">Adam</div>
</div>`); </div>
</div>
`);
assert.equal(withinComment($('.author').first()), false); assert.equal(withinComment($('.author').first()), false);
}); });
it('returns true if its parent has a class of comment', () => { it('returns true if its parent has a class of comment', () => {
const $ = cheerio.load(`<div class="comments"> const $ = cheerio.load(`
<div> <div class="comments">
<div class="author">Adam</div> <div>
</div> <div class="author">Adam</div>
</div>`); </div>
</div>
`);
assert.equal(withinComment($('.author').first()), true); assert.equal(withinComment($('.author').first()), true);
}); });
it('returns true if its parent has an id of comment', () => { it('returns true if its parent has an id of comment', () => {
const $ = cheerio.load(`<div id="comment"> const $ = cheerio.load(`
<div> <div id="comment">
<div class="author">Adam</div> <div>
</div> <div class="author">Adam</div>
</div>`); </div>
</div>
`);
assert.equal(withinComment($('.author').first()), true); assert.equal(withinComment($('.author').first()), true);
}); });
}); });

@ -1,5 +1,4 @@
import assert from 'assert'; import assert from 'assert';
import mergeSupportedDomains from './merge-supported-domains'; import mergeSupportedDomains from './merge-supported-domains';
describe('mergeSupportedDomains(extractor, domains)', () => { describe('mergeSupportedDomains(extractor, domains)', () => {
@ -8,6 +7,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => {
domain: 'foo.com', domain: 'foo.com',
supportedDomains: ['example.com'], supportedDomains: ['example.com'],
}; };
const expected = { const expected = {
'foo.com': extractor, 'foo.com': extractor,
'example.com': extractor, 'example.com': extractor,
@ -21,6 +21,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => {
const extractor = { const extractor = {
domain: 'foo.com', domain: 'foo.com',
}; };
const expected = { const expected = {
'foo.com': extractor, 'foo.com': extractor,
}; };

@ -1,674 +0,0 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`,
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`,
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`,
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`,
},
// stripHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`,
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`,
},
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`,
},
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
},
// cleanConditionally
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`,
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`,
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`,
},
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: 'What do you think?',
},
normalizeSpacesPreserve: {
before: `
<div>
<p>What do you think?</p>
<pre> What happens to spaces? </pre>
</div>
`,
after:
'<div> <p>What do you think?</p> <pre> What happens to spaces? </pre> </div>',
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`,
},
};
export default HTML;

@ -1,26 +1,36 @@
import assert from 'assert'; import assert from 'assert';
import cheerio from 'cheerio'; import cheerio from 'cheerio';
import HTML from './fixtures/html';
import { normalizeSpaces } from './index'; import { normalizeSpaces } from './index';
describe('normalizeSpaces(text)', () => { describe('normalizeSpaces(text)', () => {
it('normalizes spaces from text', () => { it('normalizes spaces from text', () => {
const $ = cheerio.load(HTML.normalizeSpaces.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
</div>
`);
const result = normalizeSpaces( const result = normalizeSpaces(
$('*') $('*')
.first() .first()
.text() .text()
); );
assert.equal(result, HTML.normalizeSpaces.after); assert.equal(result, 'What do you think?');
}); });
it('preserves spaces in preformatted text blocks', () => { it('preserves spaces in preformatted text blocks', () => {
const $ = cheerio.load(HTML.normalizeSpacesPreserve.before); const $ = cheerio.load(`
<div>
<p>What do you think?</p>
<pre> What happens to spaces? </pre>
</div>
`);
const result = normalizeSpaces($.html()); const result = normalizeSpaces($.html());
assert.equal(result, HTML.normalizeSpacesPreserve.after); assert.equal(
result,
'<div> <p>What do you think?</p> <pre> What happens to spaces? </pre> </div>'
);
}); });
}); });

@ -4,42 +4,20 @@ import pageNumFromUrl from './page-num-from-url';
describe('pageNumFromUrl(url)', () => { describe('pageNumFromUrl(url)', () => {
it('returns null if there is no page num in the url', () => { it('returns null if there is no page num in the url', () => {
const url1 = 'http://example.com'; assert.equal(pageNumFromUrl('http://example.com'), null);
assert.equal(pageNumFromUrl(url1), null); assert.equal(pageNumFromUrl('http://example.com/?pg=102'), null);
assert.equal(pageNumFromUrl('http://example.com/?page:102'), null);
const url2 = 'http://example.com/?pg=102';
assert.equal(pageNumFromUrl(url2), null);
const url3 = 'http://example.com/?page:102';
assert.equal(pageNumFromUrl(url3), null);
}); });
it('returns a page num if one matches the url', () => { it('returns a page num if one matches the url', () => {
const url1 = 'http://example.com/foo?page=1'; assert.equal(pageNumFromUrl('http://example.com/foo?page=1'), 1);
assert.equal(pageNumFromUrl(url1), 1); assert.equal(pageNumFromUrl('http://example.com/foo?pg=1'), 1);
assert.equal(pageNumFromUrl('http://example.com/foo?p=1'), 1);
const url2 = 'http://example.com/foo?pg=1'; assert.equal(pageNumFromUrl('http://example.com/foo?paging=1'), 1);
assert.equal(pageNumFromUrl(url2), 1); assert.equal(pageNumFromUrl('http://example.com/foo?pag=1'), 1);
assert.equal(pageNumFromUrl('http://example.com/foo?pagination/1'), 1);
const url3 = 'http://example.com/foo?p=1'; assert.equal(pageNumFromUrl('http://example.com/foo?paging/99'), 99);
assert.equal(pageNumFromUrl(url3), 1); assert.equal(pageNumFromUrl('http://example.com/foo?pa/99'), 99);
assert.equal(pageNumFromUrl('http://example.com/foo?p/99'), 99);
const url4 = 'http://example.com/foo?paging=1';
assert.equal(pageNumFromUrl(url4), 1);
const url5 = 'http://example.com/foo?pag=1';
assert.equal(pageNumFromUrl(url5), 1);
const url6 = 'http://example.com/foo?pagination/1';
assert.equal(pageNumFromUrl(url6), 1);
const url7 = 'http://example.com/foo?paging/88';
assert.equal(pageNumFromUrl(url7), 88);
const url8 = 'http://example.com/foo?pa/88';
assert.equal(pageNumFromUrl(url8), 88);
const url9 = 'http://example.com/foo?p/88';
assert.equal(pageNumFromUrl(url9), 88);
}); });
}); });

Loading…
Cancel
Save