From 112846f74f1736a9c4e8251c702498888ced28e9 Mon Sep 17 00:00:00 2001 From: John Holdun Date: Mon, 15 Aug 2022 17:00:04 -0700 Subject: [PATCH] chore: Inline test fixtures (#683) Not to be confused with extractor fixtures, which are snapshots of a webpage. This change removes the pattern of separate JS files that provide "fixtures" for tests, which are used as provided or expected strings in tests. They were inconsistent and disorganized, and generally just served to add indirection to test files. So now all those strings are defined where they are used in their respective tests. --- src/cleaners/fixtures/html.js | 15 - src/cleaners/lead-image-url.test.js | 3 +- src/cleaners/title.test.js | 18 +- src/extractors/detect-by-html.test.js | 10 +- .../generic/author/extractor.test.js | 29 +- .../generic/author/fixtures/html.js | 32 - .../generic/content/extract-best-node.test.js | 10 +- .../generic/content/scoring/add-score.test.js | 10 +- .../content/scoring/add-to-parent.test.js | 7 +- .../scoring/find-top-candidate.test.js | 40 +- .../content/scoring/fixtures/get-weight.js | 664 ---------------- .../generic/content/scoring/fixtures/html.js | 87 --- .../content/scoring/get-or-init-score.test.js | 46 +- .../generic/content/scoring/get-score.test.js | 9 +- .../content/scoring/get-weight.test.js | 55 +- .../content/scoring/score-content.test.js | 51 +- .../content/scoring/score-node.test.js | 33 +- .../content/scoring/score-paragraph.test.js | 28 +- .../generic/content/scoring/set-score.test.js | 5 +- .../generic/date-published/extractor.test.js | 23 +- .../generic/date-published/fixtures/html.js | 26 - .../generic/lead-image-url/extractor.test.js | 40 +- .../generic/lead-image-url/fixtures/html.js | 42 -- .../lead-image-url/score-image.test.js | 96 +-- .../next-page-url/scoring/score-links.test.js | 5 +- .../scoring/utils/score-by-parents.test.js | 16 +- .../scoring/utils/score-cap-links.test.js | 8 +- .../utils/score-extraneous-links.test.js | 8 +- 
.../utils/score-next-link-text.test.js | 8 +- .../scoring/utils/score-page-in-link.test.js | 1 - .../scoring/utils/score-prev-link.test.js | 8 +- .../generic/title/extractor.test.js | 37 +- src/extractors/generic/title/fixtures/html.js | 40 - src/extractors/root-extractor.test.js | 35 +- src/mercury.test.js | 1 - .../dom/convert-lazy-loaded-images.test.js | 41 +- .../utils/dom/normalize-meta-tags.test.js | 26 +- src/utils/dom/brs-to-ps.test.js | 84 ++- src/utils/dom/clean-attributes.test.js | 31 +- src/utils/dom/clean-h-ones.test.js | 41 +- src/utils/dom/clean-headers.test.js | 57 +- src/utils/dom/clean-images.test.js | 50 +- src/utils/dom/clean-tags.test.js | 215 +++++- src/utils/dom/convert-node-to.test.js | 15 +- src/utils/dom/convert-to-paragraphs.test.js | 34 +- src/utils/dom/extract-from-meta.test.js | 27 +- src/utils/dom/extract-from-selectors.test.js | 52 +- .../dom/fixtures/extract-from-selectors.js | 75 -- src/utils/dom/fixtures/html.js | 714 ------------------ src/utils/dom/fixtures/node-is-sufficient.js | 16 - src/utils/dom/link-density.test.js | 14 +- src/utils/dom/make-links-absolute.test.js | 198 ++--- src/utils/dom/mark-to-keep.test.js | 29 +- src/utils/dom/node-is-sufficient.test.js | 23 +- src/utils/dom/paragraphize.test.js | 45 +- src/utils/dom/remove-empty.test.js | 33 +- src/utils/dom/rewrite-top-level.test.js | 12 +- src/utils/dom/strip-junk-tags.test.js | 33 +- .../dom/strip-unlikely-candidates.test.js | 63 +- src/utils/dom/within-comment.test.js | 36 +- src/utils/merge-supported-domains.test.js | 3 +- src/utils/text/fixtures/html.js | 674 ----------------- src/utils/text/normalize-spaces.test.js | 22 +- src/utils/text/page-num-from-url.test.js | 46 +- 64 files changed, 1260 insertions(+), 2995 deletions(-) delete mode 100644 src/cleaners/fixtures/html.js delete mode 100644 src/extractors/generic/author/fixtures/html.js delete mode 100644 src/extractors/generic/content/scoring/fixtures/get-weight.js delete mode 100644 
src/extractors/generic/content/scoring/fixtures/html.js delete mode 100644 src/extractors/generic/date-published/fixtures/html.js delete mode 100644 src/extractors/generic/lead-image-url/fixtures/html.js delete mode 100644 src/extractors/generic/title/fixtures/html.js delete mode 100644 src/utils/dom/fixtures/extract-from-selectors.js delete mode 100644 src/utils/dom/fixtures/html.js delete mode 100644 src/utils/dom/fixtures/node-is-sufficient.js delete mode 100644 src/utils/text/fixtures/html.js diff --git a/src/cleaners/fixtures/html.js b/src/cleaners/fixtures/html.js deleted file mode 100644 index a75cd793..00000000 --- a/src/cleaners/fixtures/html.js +++ /dev/null @@ -1,15 +0,0 @@ -const HTML = { - docWithH1: '

This Is the Real Title

', - docWith2H1s: ` -
-

This Is the Real Title

-

This Is the Real Title

-
- `, - docWithTagsInH1: { - before: '

This Is the Real Title

', - after: 'This Is the Real Title', - }, -}; - -export default HTML; diff --git a/src/cleaners/lead-image-url.test.js b/src/cleaners/lead-image-url.test.js index 90632c58..47e2ebb3 100644 --- a/src/cleaners/lead-image-url.test.js +++ b/src/cleaners/lead-image-url.test.js @@ -9,8 +9,7 @@ describe('clean(leadImageUrl)', () => { }); it('returns null if the url is not valid', () => { - const url = 'this is not a valid url'; - assert.equal(clean(url), null); + assert.equal(clean('this is not a valid url'), null); }); it('trims whitespace', () => { diff --git a/src/cleaners/title.test.js b/src/cleaners/title.test.js index a25f35fa..0a94e893 100644 --- a/src/cleaners/title.test.js +++ b/src/cleaners/title.test.js @@ -1,27 +1,35 @@ import assert from 'assert'; import cheerio from 'cheerio'; -import HTML from './fixtures/html'; import { cleanTitle } from './index'; describe('cleanTitle(title, { url, $ })', () => { it('only uses h1 if there is only one on the page', () => { const title = 'Too Short'; - const $ = cheerio.load(HTML.docWith2H1s); + const $ = cheerio.load(` +
+

This Is the Real Title

+

This Is the Real Title

+
+ `); assert.equal(cleanTitle(title, { url: '', $ }), title); }); it('removes HTML tags from titles', () => { - const $ = cheerio.load(HTML.docWithTagsInH1.before); + const $ = cheerio.load( + '

This Is the Real Title

' + ); const title = $('h1').html(); - assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after); + assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title'); }); it('trims extraneous spaces', () => { const title = " This Is a Great Title That You'll Love "; - const $ = cheerio.load(HTML.docWithTagsInH1.before); + const $ = cheerio.load( + '

This Is the Real Title

' + ); assert.equal(cleanTitle(title, { url: '', $ }), title.trim()); }); diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js index 9ad5c862..af53fbfc 100644 --- a/src/extractors/detect-by-html.test.js +++ b/src/extractors/detect-by-html.test.js @@ -5,17 +5,15 @@ import detectByHtml from './detect-by-html'; describe('detectByHtml', () => { it('detects a medium post from the html', () => { - const html = ''; - - const $ = cheerio.load(html); + const $ = cheerio.load( + '' + ); assert.equal(detectByHtml($).domain, 'medium.com'); }); it('returns nothing if no match is found', () => { - const html = '
'; - - const $ = cheerio.load(html); + const $ = cheerio.load('
'); assert.equal(detectByHtml($), null); }); diff --git a/src/extractors/generic/author/extractor.test.js b/src/extractors/generic/author/extractor.test.js index 9f909429..8f316341 100644 --- a/src/extractors/generic/author/extractor.test.js +++ b/src/extractors/generic/author/extractor.test.js @@ -1,39 +1,54 @@ import assert from 'assert'; import cheerio from 'cheerio'; -import HTML from './fixtures/html'; import GenericAuthorExtractor from './extractor'; describe('GenericAuthorExtractor', () => { describe('extract($, cachedMeta)', () => { it('extracts author from meta tags', () => { - const $ = cheerio.load(HTML.authorMeta.test); + const $ = cheerio.load(` + + + + `); const result = GenericAuthorExtractor.extract({ $, metaCache: ['dc.author', 'something-else'], }); - assert.equal(result, HTML.authorMeta.result); + assert.equal(result, 'Adam'); }); it('extracts author from author selectors', () => { - const $ = cheerio.load(HTML.authorSelectors.test); + const $ = cheerio.load(` +
+ +
+ `); const result = GenericAuthorExtractor.extract({ $, metaCache: ['dc.author', 'something-else'], }); - assert.equal(result, HTML.authorSelectors.result); + assert.equal(result, 'Adam'); }); it('extracts author with regex selectors', () => { - const $ = cheerio.load(HTML.authorRegSelectors.test); + const $ = cheerio.load(` +
+ +
+ `); const result = GenericAuthorExtractor.extract({ $, metaCache: ['dc.author', 'something-else'], }); - assert.equal(result, HTML.authorRegSelectors.result); + assert.equal(result, 'Adam'); }); it('returns null if no author found', () => { diff --git a/src/extractors/generic/author/fixtures/html.js b/src/extractors/generic/author/fixtures/html.js deleted file mode 100644 index 84ed985d..00000000 --- a/src/extractors/generic/author/fixtures/html.js +++ /dev/null @@ -1,32 +0,0 @@ -const HTML = { - authorMeta: { - test: ` - - - - `, - result: 'Adam', - }, - authorSelectors: { - test: ` -
- -
- `, - result: 'Adam', - }, - authorRegSelectors: { - test: ` -
- -
- `, - result: 'Adam', - }, -}; - -export default HTML; diff --git a/src/extractors/generic/content/extract-best-node.test.js b/src/extractors/generic/content/extract-best-node.test.js index 9e960727..af44e653 100644 --- a/src/extractors/generic/content/extract-best-node.test.js +++ b/src/extractors/generic/content/extract-best-node.test.js @@ -7,14 +7,12 @@ const fs = require('fs'); describe('extractBestNode($, flags)', () => { it('scores the dom nodes and returns the best option', () => { const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8'); - const opts = { - stripUnlikelyCandidates: true, - weightNodes: true, - }; - const $ = cheerio.load(html); - const bestNode = extractBestNode($, opts); + const bestNode = extractBestNode($, { + stripUnlikelyCandidates: true, + weightNodes: true, + }); assert(typeof bestNode, 'object'); }); diff --git a/src/extractors/generic/content/scoring/add-score.test.js b/src/extractors/generic/content/scoring/add-score.test.js index f91203c9..2ba23be5 100644 --- a/src/extractors/generic/content/scoring/add-score.test.js +++ b/src/extractors/generic/content/scoring/add-score.test.js @@ -7,17 +7,15 @@ describe('Scoring utils', () => { describe('addScore(node, $, amount)', () => { it("adds the specified amount to a node's score", () => { const $ = cheerio.load('

Foo

'); - let $node = $('p').first(); - - $node = addScore($node, $, 25); + const $node = $('p').first(); + addScore($node, $, 25); assert.equal(getScore($node), 50); }); it('adds score if score not yet set (assumes score is 0)', () => { const $ = cheerio.load('

Foo

'); - let $node = $('p').first(); - - $node = addScore($node, $, 25); + const $node = $('p').first(); + addScore($node, $, 25); assert.equal(getScore($node), 25); }); }); diff --git a/src/extractors/generic/content/scoring/add-to-parent.test.js b/src/extractors/generic/content/scoring/add-to-parent.test.js index d0d39311..45c2b69f 100644 --- a/src/extractors/generic/content/scoring/add-to-parent.test.js +++ b/src/extractors/generic/content/scoring/add-to-parent.test.js @@ -6,11 +6,8 @@ import { addToParent, getScore } from './index'; describe('Scoring utils', () => { describe('addToParent(node, $, amount)', () => { it("adds 1/4 of a node's score it its parent", () => { - const html = '

Foo

'; - const $ = cheerio.load(html); - let $node = $('p').first(); - - $node = addToParent($node, $, 40); + const $ = cheerio.load('

Foo

'); + const $node = addToParent($('p').first(), $, 40); assert.equal(getScore($node.parent()), 35); assert.equal(getScore($node), 40); diff --git a/src/extractors/generic/content/scoring/find-top-candidate.test.js b/src/extractors/generic/content/scoring/find-top-candidate.test.js index 8a825203..59f1510c 100644 --- a/src/extractors/generic/content/scoring/find-top-candidate.test.js +++ b/src/extractors/generic/content/scoring/find-top-candidate.test.js @@ -1,15 +1,17 @@ import assert from 'assert'; import cheerio from 'cheerio'; -import HTML from './fixtures/html'; - import { getScore, findTopCandidate, scoreContent } from './index'; const fs = require('fs'); describe('findTopCandidate($)', () => { it('finds the top candidate from simple case', () => { - const $ = cheerio.load(HTML.findDom1); + const $ = cheerio.load(` +
+

Lorem ipsum etc

+
+ `); const $$topCandidate = findTopCandidate($); @@ -17,17 +19,27 @@ describe('findTopCandidate($)', () => { }); it('finds the top candidate from a nested case', () => { - const $ = cheerio.load(HTML.findDom2); + const $ = cheerio.load(` +
+
+

Lorem ipsum etc

+
+
+ `); const $$topCandidate = findTopCandidate($); - // this is wrapped in a div so checking - // the score of the first child + // this is wrapped in a div so checking the score of the first child assert.equal(getScore($$topCandidate.first()), 50); }); it('ignores tags like BR', () => { - const $ = cheerio.load(HTML.findDom3); + const $ = cheerio.load(` +
+

Lorem ipsum br

+
+
+ `); const $topCandidate = findTopCandidate($); @@ -35,13 +47,19 @@ describe('findTopCandidate($)', () => { }); it('returns BODY if no candidates found', () => { - const $ = cheerio.load(HTML.topBody); + const $ = cheerio.load(` + +
+

Lorem ipsum etc

+
+
+ + `); const $topCandidate = findTopCandidate($); - // browser won't allow body tag to be placed - // arbitrarily/loaded on the page, so we tranform - // it in cheerio-query, so this test would fail. + // browser won't allow body tag to be placed arbitrarily/loaded on the page, + // so we transform it in cheerio-query, so this test would fail. if (!$.browser) { assert.equal($topCandidate.get(0).tagName, 'body'); } diff --git a/src/extractors/generic/content/scoring/fixtures/get-weight.js b/src/extractors/generic/content/scoring/fixtures/get-weight.js deleted file mode 100644 index 97dc6148..00000000 --- a/src/extractors/generic/content/scoring/fixtures/get-weight.js +++ /dev/null @@ -1,664 +0,0 @@ -const HTML = { - // getWeight fixtures - positiveId: ` -
-

Ooo good one

-
- `, - negativeId: ` -
-

Ooo good one

-
- `, - positiveClass: ` -
-

Ooo good one

-
- `, - negativeClass: ` - - `, - positiveIdAndClass: ` -
-

Ooo good one

-
- `, - positiveIdNegClass: ` -
-

Ooo good one

-
- `, - positivePhotoClass: ` -
-

Ooo good one

-
- `, - positiveIdAndPhoto: ` -
-

Ooo good one

-
- `, - entryContentAsset: ` -
-

Ooo good one

-
- `, - - // stripUnlikelyCandidates - noMatches: ` -
-

Ooo good one

-
- `, - whitelistMatch: { - before: ` -
Stuff
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - whiteAndBlack: { - before: ` -
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - whiteInsideBlack: { - before: ` -
-
-
-

Ooo good one

-
-
-
Something unrelated
-
- `, - after: ` -
-
Something unrelated
-
- `, - }, - - // brsToPs - singleBr: { - before: ` -
-
-

Ooo good one

-
- `, - after: ` -
-
-

Ooo good one

-
- `, - }, - doubleBrs: { - before: ` -
-
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - severalBrs: { - before: ` -
-
-
-
-
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - brsInP: { - before: ` -

- Here is some text -
-
- Here is more text -

- `, - after: ` -

- Here is some text -

- Here is more text -

- `, - }, - paragraphize: { - before: ` -

- Here is some text -
- Here is more text - And also this -

- `, - after: ` -

- Here is some text -

- Here is more text - And also this -

- `, - }, - paragraphizeBlock: { - before: ` -

- Here is some text -
- Here is more text -

And also this
-

- `, - after: ` -

- Here is some text -

- Here is more text -

And also this
-

- `, - }, - - // convertToParagraphs - convertToParagraphs: { - before: ` -

- Here is some text - This should remain in a p -
-
- This should be wrapped in a p -

This should become a p
-

- This should become a p - `, - after: ` -

- Here is some text - This should remain in a p -

- This should be wrapped in a p -

This should become a p

-

This should become a p

- `, - }, - - // linkDensity - linkDensity5: ` -

Some text!

Some text!

- `, - linkDensity1: ` -

Some text!

- `, - linkDensity0: ` -

- `, - - // rewriteTopLevel - rewriteHTMLBody: { - before: ` -

Wow how about that

- `, - after: ` -

Wow how about that

- `, - }, - - // cleanImages - cleanSmallImages: { - before: ` -
- - -
- `, - after: ` -
- -
- `, - }, - cleanHeight: { - before: ` -
- -
- `, - after: ` -
- -
- `, - }, - cleanSpacer: { - before: ` -
- - -

Some text

-
- `, - after: ` -
- -

Some text

-
- `, - }, - // stripJunkTags - stripsJunk: { - before: ` -
- - WOW - -

What an article

- - -
-
- `, - after: ` -
-

What an article

-
- `, - }, - - // stripHOnes - removeTwoHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - convertThreeHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - }, - - // cleanAttributes - removeStyle: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - removeAlign: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - - // removeEmpty - removeEmptyP: { - before: ` -
-

What do you think?

-

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - doNotRemoveBr: { - before: ` -
-

What do you think?

-

-
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
-

What do you think?

-
- `, - }, - doNotNested: { - before: ` -
-

What do you think?

-

-

-


-
- `, - after: ` -
-

What an article

- -
- `, - }, - - // markToKeep - marksYouTube: { - before: ` -
-

What an article

- - - -
- `, - after: ` -
-

What an article

- - - -
- `, - }, - - // stripHOnes - removeTwoHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - convertThreeHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - }, - - // cleanAttributes - removeStyle: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - removeAlign: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - - // removeEmpty - removeEmptyP: { - before: ` -
-

What do you think?

-

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - doNotRemoveBr: { - before: ` -
-

What do you think?

-

-
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
-

What do you think?

-
- `, - }, - doNotNested: { - before: ` -
-

What do you think?

-

-

+ + +

+ `); const result = markToKeep($('*').first(), $); assert.equal(result('iframe.mercury-parser-keep').length, 2); if (!$.browser) { - assertClean(result.html(), HTML.marksYouTube.after); + assertClean( + result.html(), + ` +
+

What an article

+ + + +
+ ` + ); } }); it('marks same-domain elements to keep', () => { - const html = - '
'; - const $ = cheerio.load(html); - + const $ = cheerio.load( + '
' + ); const result = markToKeep($('*').first(), $, 'https://medium.com/foo'); const keptHtml = `
`; diff --git a/src/utils/dom/node-is-sufficient.test.js b/src/utils/dom/node-is-sufficient.test.js index e666f1f6..4ccfe586 100644 --- a/src/utils/dom/node-is-sufficient.test.js +++ b/src/utils/dom/node-is-sufficient.test.js @@ -1,21 +1,30 @@ import assert from 'assert'; import cheerio from 'cheerio'; -import HTML from './fixtures/node-is-sufficient'; import nodeIsSufficient from './node-is-sufficient'; describe('Utils', () => { describe('nodeIsSufficient(node)', () => { it('returns false if node text length < 100 chars', () => { - const $ = cheerio.load(HTML.tooShort); - const sufficient = nodeIsSufficient($.root()); - assert.equal(sufficient, false); + const $ = cheerio.load(` +
+

This is too short

+
+ `); + + assert.equal(nodeIsSufficient($.root()), false); }); it('returns true if node text length > 100 chars', () => { - const $ = cheerio.load(HTML.longEnough); - const sufficient = nodeIsSufficient($.root()); - assert.equal(sufficient, true); + const $ = cheerio.load(` +
+

+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m +

+
+ `); + + assert.equal(nodeIsSufficient($.root()), true); }); }); }); diff --git a/src/utils/dom/paragraphize.test.js b/src/utils/dom/paragraphize.test.js index 8c7c5479..aa9582c3 100644 --- a/src/utils/dom/paragraphize.test.js +++ b/src/utils/dom/paragraphize.test.js @@ -2,23 +2,46 @@ import assert from 'assert'; import cheerio from 'cheerio'; import { clean } from 'test-helpers'; -import HTML from './fixtures/html'; import { paragraphize } from './index'; describe('Generic Extractor Utils', () => { describe('paragraphize(node)', () => { it('conversts a BR into P and moves inline contents to P tag after current parent', () => { - const $ = cheerio.load(HTML.paragraphize.before); + const $ = cheerio.load(` +

+ Here is some text +
+ Here is more text + And also this +

+ `); const node = $('br').get(0); // note: result here is not valid html; will handle elsewhere const result = paragraphize(node, $, true).html(); - assert.equal(clean(result), clean(HTML.paragraphize.after)); + assert.equal( + clean(result), + clean(` +

+ Here is some text +

+ Here is more text + And also this +

+ `) + ); }); - it('conversts a BR into P and stops when block element hit', () => { - const $ = cheerio.load(HTML.paragraphizeBlock.before); + it('converts a BR into P and stops when block element hit', () => { + const $ = cheerio.load(` +

+ Here is some text +
+ Here is more text +

And also this
+

+ `); const node = $('br').get(0); // note: result here is not valid html; will handle elsewhere @@ -30,7 +53,17 @@ describe('Generic Extractor Utils', () => { '

Here is some text

Here is more text

And also this

'; assert.equal(clean(result), html); } else { - assert.equal(clean(result), clean(HTML.paragraphizeBlock.after)); + assert.equal( + clean(result), + clean(` +

+ Here is some text +

+ Here is more text +

And also this
+

+ `) + ); } }); }); diff --git a/src/utils/dom/remove-empty.test.js b/src/utils/dom/remove-empty.test.js index 8789aed2..a391ffb5 100644 --- a/src/utils/dom/remove-empty.test.js +++ b/src/utils/dom/remove-empty.test.js @@ -2,15 +2,26 @@ import cheerio from 'cheerio'; import { assertClean } from 'test-helpers'; -import HTML from './fixtures/html'; import { removeEmpty } from './index'; describe('removeEmpty($)', () => { it('removes empty P tags', () => { - const $ = cheerio.load(HTML.removeEmptyP.before); + const $ = cheerio.load(` +
+

What do you think?

+

+
+ `); const result = removeEmpty($('*').first(), $); - assertClean(result.html(), HTML.removeEmptyP.after); + assertClean( + result.html(), + ` +
+

What do you think?

+
+ ` + ); }); it('removes P tags with only space', () => { @@ -22,10 +33,22 @@ describe('removeEmpty($)', () => { }); it('does not remove empty DIV tags', () => { - const $ = cheerio.load(HTML.removeEmptyP.before); + const $ = cheerio.load(` +
+

What do you think?

+

+
+ `); const result = removeEmpty($('*').first(), $); - assertClean(result.html(), HTML.removeEmptyP.after); + assertClean( + result.html(), + ` +
+

What do you think?

+
+ ` + ); }); it('does not remove empty p tags containing an iframe', () => { diff --git a/src/utils/dom/rewrite-top-level.test.js b/src/utils/dom/rewrite-top-level.test.js index 79ab0d65..379388e7 100644 --- a/src/utils/dom/rewrite-top-level.test.js +++ b/src/utils/dom/rewrite-top-level.test.js @@ -3,19 +3,25 @@ import assert from 'assert'; import { assertClean } from 'test-helpers'; -import HTML from './fixtures/html'; import rewriteTopLevel from './rewrite-top-level'; describe('rewriteTopLevel(node, $)', () => { it('turns html and body tags into divs', () => { - const $ = cheerio.load(HTML.rewriteHTMLBody.before); + const $ = cheerio.load(` +

Wow how about that

+ `); const result = rewriteTopLevel($('html').first(), $); assert.equal(result('html').length, 0); assert.equal(result('body').length, 0); if (!cheerio.browser) { - assertClean(result.html(), HTML.rewriteHTMLBody.after); + assertClean( + result.html(), + ` +

Wow how about that

+ ` + ); } }); }); diff --git a/src/utils/dom/strip-junk-tags.test.js b/src/utils/dom/strip-junk-tags.test.js index 2817925b..6fe18762 100644 --- a/src/utils/dom/strip-junk-tags.test.js +++ b/src/utils/dom/strip-junk-tags.test.js @@ -3,19 +3,44 @@ import assert from 'assert'; import { assertClean } from 'test-helpers'; -import HTML from './fixtures/html'; import { stripJunkTags } from './index'; describe('stripJunkTags($)', () => { it('strips script and other junk tags', () => { - const $ = cheerio.load(HTML.stripsJunk.before); + const $ = cheerio.load(` +
+ + WOW + +

What an article

+ + +
+
+ `); const result = stripJunkTags($('*').first(), $); - assertClean(result.html(), HTML.stripsJunk.after); + assertClean( + result.html(), + ` +
+

What an article

+
+ ` + ); }); it('keeps youtube embeds', () => { - let $ = cheerio.load(HTML.ignoresKeepable.before); + let $ = cheerio.load(` +
+ + WOW + +

What an article

+ +
+
+ `); $ = stripJunkTags($('*').first(), $); assert.equal($('iframe[src^="https://www.youtube.com"]').length, 1); diff --git a/src/utils/dom/strip-unlikely-candidates.test.js b/src/utils/dom/strip-unlikely-candidates.test.js index 1b077ace..e3e6743e 100644 --- a/src/utils/dom/strip-unlikely-candidates.test.js +++ b/src/utils/dom/strip-unlikely-candidates.test.js @@ -2,32 +2,71 @@ import assert from 'assert'; import cheerio from 'cheerio'; import { assertClean } from 'test-helpers'; -import HTML from './fixtures/html'; import stripUnlikelyCandidates from './strip-unlikely-candidates'; -function assertBeforeAndAfter(key, fn) { - const $ = cheerio.load(HTML[key].before); - assertClean(fn($).html(), HTML[key].after); -} - describe('Generic Extractor Utils', () => { describe('stripUnlikelyCandidates(node)', () => { it('returns original doc if no matches found', () => { - const $ = cheerio.load(HTML.noMatches); - const stripped = stripUnlikelyCandidates($); - assert.equal(stripped.html(), HTML.noMatches); + const html = ` +
+

Ooo good one

+
+ `; + + const stripped = stripUnlikelyCandidates(cheerio.load(html)); + assert.equal(stripped.html(), html); }); it('strips unlikely matches from the doc', () => { - assertBeforeAndAfter('whitelistMatch', stripUnlikelyCandidates); + const before = ` +
Stuff
+
+

Ooo good one

+
+ `; + const after = ` +
+

Ooo good one

+
+ `; + + assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after); }); it('keeps likely matches even when they also match the blacklist', () => { - assertBeforeAndAfter('whiteAndBlack', stripUnlikelyCandidates); + const before = ` +
+

Ooo good one

+
+ `; + + const after = ` +
+

Ooo good one

+
+ `; + assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after); }); it('removed likely matches when inside blacklist node', () => { - assertBeforeAndAfter('whiteInsideBlack', stripUnlikelyCandidates); + const before = ` +
+
+
+

Ooo good one

+
+
+
Something unrelated
+
+ `; + + const after = ` +
+
Something unrelated
+
+ `; + + assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after); }); }); }); diff --git a/src/utils/dom/within-comment.test.js b/src/utils/dom/within-comment.test.js index cc2511ae..c3623edc 100644 --- a/src/utils/dom/within-comment.test.js +++ b/src/utils/dom/within-comment.test.js @@ -5,29 +5,35 @@ import withinComment from './within-comment'; describe('withinComment(node)', () => { it('returns false if its parent is not a comment', () => { - const $ = cheerio.load(`
-
-
Adam
-
-
`); + const $ = cheerio.load(` +
+
+
Adam
+
+
+ `); assert.equal(withinComment($('.author').first()), false); }); it('returns true if its parent has a class of comment', () => { - const $ = cheerio.load(`
-
-
Adam
-
-
`); + const $ = cheerio.load(` +
+
+
Adam
+
+
+ `); assert.equal(withinComment($('.author').first()), true); }); it('returns true if its parent has an id of comment', () => { - const $ = cheerio.load(`
-
-
Adam
-
-
`); + const $ = cheerio.load(` +
+
+
Adam
+
+
+ `); assert.equal(withinComment($('.author').first()), true); }); }); diff --git a/src/utils/merge-supported-domains.test.js b/src/utils/merge-supported-domains.test.js index 44e6ca2c..b3485c80 100644 --- a/src/utils/merge-supported-domains.test.js +++ b/src/utils/merge-supported-domains.test.js @@ -1,5 +1,4 @@ import assert from 'assert'; - import mergeSupportedDomains from './merge-supported-domains'; describe('mergeSupportedDomains(extractor, domains)', () => { @@ -8,6 +7,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => { domain: 'foo.com', supportedDomains: ['example.com'], }; + const expected = { 'foo.com': extractor, 'example.com': extractor, @@ -21,6 +21,7 @@ describe('mergeSupportedDomains(extractor, domains)', () => { const extractor = { domain: 'foo.com', }; + const expected = { 'foo.com': extractor, }; diff --git a/src/utils/text/fixtures/html.js b/src/utils/text/fixtures/html.js deleted file mode 100644 index 0045efa7..00000000 --- a/src/utils/text/fixtures/html.js +++ /dev/null @@ -1,674 +0,0 @@ -const HTML = { - // getWeight fixtures - positiveId: ` -
-

Ooo good one

-
- `, - negativeId: ` -
-

Ooo good one

-
- `, - positiveClass: ` -
-

Ooo good one

-
- `, - negativeClass: ` - - `, - positiveIdAndClass: ` -
-

Ooo good one

-
- `, - positiveIdNegClass: ` -
-

Ooo good one

-
- `, - positivePhotoClass: ` -
-

Ooo good one

-
- `, - positiveIdAndPhoto: ` -
-

Ooo good one

-
- `, - entryContentAsset: ` -
-

Ooo good one

-
- `, - - // stripUnlikelyCandidates - noMatches: ` -
-

Ooo good one

-
- `, - whitelistMatch: { - before: ` -
Stuff
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - whiteAndBlack: { - before: ` -
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - whiteInsideBlack: { - before: ` -
-
-
-

Ooo good one

-
-
-
Something unrelated
-
- `, - after: ` -
-
Something unrelated
-
- `, - }, - - // brsToPs - singleBr: { - before: ` -
-
-

Ooo good one

-
- `, - after: ` -
-
-

Ooo good one

-
- `, - }, - doubleBrs: { - before: ` -
-
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - severalBrs: { - before: ` -
-
-
-
-
-
-

Ooo good one

-
- `, - after: ` -
-

Ooo good one

-
- `, - }, - brsInP: { - before: ` -

- Here is some text -
-
- Here is more text -

- `, - after: ` -

- Here is some text -

- Here is more text -

- `, - }, - paragraphize: { - before: ` -

- Here is some text -
- Here is more text - And also this -

- `, - after: ` -

- Here is some text -

- Here is more text - And also this -

- `, - }, - paragraphizeBlock: { - before: ` -

- Here is some text -
- Here is more text -

And also this
-

- `, - after: ` -

- Here is some text -

- Here is more text -

And also this
-

- `, - }, - - // convertToParagraphs - convertToParagraphs: { - before: ` -

- Here is some text - This should remain in a p -
-
- This should be wrapped in a p -

This should become a p
-

- This should become a p - `, - after: ` -

- Here is some text - This should remain in a p -

- This should be wrapped in a p -

This should become a p

-

This should become a p

- `, - }, - - // linkDensity - linkDensity5: ` -

Some text!

Some text!

- `, - linkDensity1: ` -

Some text!

- `, - linkDensity0: ` -

- `, - - // rewriteTopLevel - rewriteHTMLBody: { - before: ` -

Wow how about that

- `, - after: ` -

Wow how about that

- `, - }, - - // cleanImages - cleanSmallImages: { - before: ` -
- - -
- `, - after: ` -
- -
- `, - }, - cleanHeight: { - before: ` -
- -
- `, - after: ` -
- -
- `, - }, - cleanSpacer: { - before: ` -
- - -

Some text

-
- `, - after: ` -
- -

Some text

-
- `, - }, - // stripJunkTags - stripsJunk: { - before: ` -
- - WOW - -

What an article

- - -
-
- `, - after: ` -
-

What an article

-
- `, - }, - - // stripHOnes - removeTwoHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - convertThreeHOnes: { - before: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - after: ` -
-

Look at this!

-

What do you think?

-

Can you believe it?!

-

What do you think?

-

Can you believe it?!

-
- `, - }, - - // cleanAttributes - removeStyle: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - removeAlign: { - before: ` -
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - - // removeEmpty - removeEmptyP: { - before: ` -
-

What do you think?

-

-
- `, - after: ` -
-

What do you think?

-
- `, - }, - doNotRemoveBr: { - before: ` -
-

What do you think?

-

-
-

What do you think?

-
- `, - after: ` -
-

What do you think?

-
-

What do you think?

-
- `, - }, - doNotNested: { - before: ` -
-

What do you think?

-

-