'
+ );
const title = $('h1').html();
- assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
+ assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title');
});
it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love ";
- const $ = cheerio.load(HTML.docWithTagsInH1.before);
+ const $ = cheerio.load(
+ '
This Is the Real Title
'
+ );
assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
});
diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js
index 9ad5c862..af53fbfc 100644
--- a/src/extractors/detect-by-html.test.js
+++ b/src/extractors/detect-by-html.test.js
@@ -5,17 +5,15 @@ import detectByHtml from './detect-by-html';
describe('detectByHtml', () => {
it('detects a medium post from the html', () => {
- const html = '';
-
- const $ = cheerio.load(html);
+ const $ = cheerio.load(
+ ''
+ );
assert.equal(detectByHtml($).domain, 'medium.com');
});
it('returns nothing if no match is found', () => {
- const html = '';
-
- const $ = cheerio.load(html);
+ const $ = cheerio.load('');
assert.equal(detectByHtml($), null);
});
diff --git a/src/extractors/generic/author/extractor.test.js b/src/extractors/generic/author/extractor.test.js
index 9f909429..8f316341 100644
--- a/src/extractors/generic/author/extractor.test.js
+++ b/src/extractors/generic/author/extractor.test.js
@@ -1,39 +1,54 @@
import assert from 'assert';
import cheerio from 'cheerio';
-import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => {
- const $ = cheerio.load(HTML.authorMeta.test);
+ const $ = cheerio.load(`
+
+
+
+ `);
const result = GenericAuthorExtractor.extract({
$,
metaCache: ['dc.author', 'something-else'],
});
- assert.equal(result, HTML.authorMeta.result);
+ assert.equal(result, 'Adam');
});
it('extracts author from author selectors', () => {
- const $ = cheerio.load(HTML.authorSelectors.test);
+ const $ = cheerio.load(`
+
');
+ const $node = addToParent($('p').first(), $, 40);
assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40);
diff --git a/src/extractors/generic/content/scoring/find-top-candidate.test.js b/src/extractors/generic/content/scoring/find-top-candidate.test.js
index 8a825203..59f1510c 100644
--- a/src/extractors/generic/content/scoring/find-top-candidate.test.js
+++ b/src/extractors/generic/content/scoring/find-top-candidate.test.js
@@ -1,15 +1,17 @@
import assert from 'assert';
import cheerio from 'cheerio';
-import HTML from './fixtures/html';
-
import { getScore, findTopCandidate, scoreContent } from './index';
const fs = require('fs');
describe('findTopCandidate($)', () => {
it('finds the top candidate from simple case', () => {
- const $ = cheerio.load(HTML.findDom1);
+ const $ = cheerio.load(`
+
+
Lorem ipsum etc
+
+ `);
const $$topCandidate = findTopCandidate($);
@@ -17,17 +19,27 @@ describe('findTopCandidate($)', () => {
});
it('finds the top candidate from a nested case', () => {
- const $ = cheerio.load(HTML.findDom2);
+ const $ = cheerio.load(`
+
+
+
Lorem ipsum etc
+
+
+ `);
const $$topCandidate = findTopCandidate($);
- // this is wrapped in a div so checking
- // the score of the first child
+ // this is wrapped in a div so checking the score of the first child
assert.equal(getScore($$topCandidate.first()), 50);
});
it('ignores tags like BR', () => {
- const $ = cheerio.load(HTML.findDom3);
+ const $ = cheerio.load(`
+
+
+
+
+
+ `);
const $topCandidate = findTopCandidate($);
- // browser won't allow body tag to be placed
- // arbitrarily/loaded on the page, so we tranform
- // it in cheerio-query, so this test would fail.
+ // browser won't allow body tag to be placed arbitrarily/loaded on the page,
+ // so we tranform it in cheerio-query, so this test would fail.
if (!$.browser) {
assert.equal($topCandidate.get(0).tagName, 'body');
}
diff --git a/src/extractors/generic/content/scoring/fixtures/get-weight.js b/src/extractors/generic/content/scoring/fixtures/get-weight.js
deleted file mode 100644
index 97dc6148..00000000
--- a/src/extractors/generic/content/scoring/fixtures/get-weight.js
+++ /dev/null
@@ -1,664 +0,0 @@
-const HTML = {
- // getWeight fixtures
- positiveId: `
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- goodScoreTooDense: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- previousEndsInColon: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
- `,
- divScore5: `
-
Lorem ipsum, dolor sit, amet
- `,
- blockquoteScore3: `
-
Lorem ipsum, dolor sit, amet
- `,
- formScoreNeg3: `
-
- `,
- thScoreNeg5: `
-
Lorem ipsum, dolor sit, amet
- `,
- score44: `
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
- `,
- score44Parent: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
- `,
- hNews: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
- `,
- after: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
- `,
- },
- nonHNews: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
- `,
- after: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
- `,
- },
-
- // findTopCandidate
- findDom1: `
-
-
Lorem ipsum etc
-
- `,
- findDom2: `
-
-
-
Lorem ipsum etc
-
-
- `,
- findDom3: `
-
-
Lorem ipsum br
-
-
- `,
- topBody: `
-
-
-
Lorem ipsum etc
-
-
-
- `,
-};
-
-export default HTML;
diff --git a/src/extractors/generic/content/scoring/get-or-init-score.test.js b/src/extractors/generic/content/scoring/get-or-init-score.test.js
index 77bdcfe3..12953576 100644
--- a/src/extractors/generic/content/scoring/get-or-init-score.test.js
+++ b/src/extractors/generic/content/scoring/get-or-init-score.test.js
@@ -1,17 +1,13 @@
import assert from 'assert';
import cheerio from 'cheerio';
-import HTML from './fixtures/html';
import { getOrInitScore, getScore } from './index';
describe('getOrInitScore(node, $)', () => {
describe('when score set', () => {
it("returns score if node's score already set", () => {
- const html = '
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
');
- const $node = $('p').first();
- assert.equal(getScore($node), null);
+ assert.equal(getScore($('p').first()), null);
});
it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('
Foo
');
- const $node = $('p').first();
- assert.equal(typeof getScore($node), 'number');
- assert.equal(getScore($node), 25);
+ const score = getScore($('p').first());
+ assert.equal(typeof score, 'number');
+ assert.equal(score, 25);
});
});
});
diff --git a/src/extractors/generic/content/scoring/get-weight.test.js b/src/extractors/generic/content/scoring/get-weight.test.js
index 4184f7d6..92664bdc 100644
--- a/src/extractors/generic/content/scoring/get-weight.test.js
+++ b/src/extractors/generic/content/scoring/get-weight.test.js
@@ -1,55 +1,90 @@
import assert from 'assert';
import cheerio from 'cheerio';
-import HTML from './fixtures/get-weight';
import { getWeight } from './index';
describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => {
it('returns a score of 25 if node has positive id', () => {
- const $ = cheerio.load(HTML.positiveId);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative id', () => {
- const $ = cheerio.load(HTML.negativeId);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has positive class', () => {
- const $ = cheerio.load(HTML.positiveClass);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative class', () => {
- const $ = cheerio.load(HTML.negativeClass);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has both positive id and class', () => {
- const $ = cheerio.load(HTML.positiveIdAndClass);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of 25 if node has pos id and neg class', () => {
// is this really wanted? id="entry" class="adbox"
// should get positive score?
- const $ = cheerio.load(HTML.positiveIdNegClass);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of 10 if node has pos img class', () => {
- const $ = cheerio.load(HTML.positivePhotoClass);
+ const $ = cheerio.load(`
+
+
Ooo good one
+
+ `);
assert.equal(getWeight($('div')), 10);
});
it('returns a score of 35 if node has pos id pos img class', () => {
- const $ = cheerio.load(HTML.positiveIdAndPhoto);
+ const $ = cheerio.load(`
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
- Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
- when an unknown printer took a galley of type and scrambled it to make a type
+
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
+ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
+ when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
-
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
- Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
- when an unknown printer took a galley of type and scrambled it to make a type
+
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
+ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
+ when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
-
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
- Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
- when an unknown printer took a galley of type and scrambled it to make a type
+
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
+ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
+ when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
-
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
- Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
- when an unknown printer took a galley of type and scrambled it to make a type
+
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
+ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
+ when an unknown printer took a galley of type and scrambled it to make a type
specimen book.
`;
- let $ = cheerio.load(html);
- $ = scoreContent($);
+
+ const $ = cheerio.load(html);
+ scoreContent($);
assert.equal(
$('p')
diff --git a/src/extractors/generic/content/scoring/score-node.test.js b/src/extractors/generic/content/scoring/score-node.test.js
index a0214885..5fea41c8 100644
--- a/src/extractors/generic/content/scoring/score-node.test.js
+++ b/src/extractors/generic/content/scoring/score-node.test.js
@@ -1,14 +1,11 @@
import assert from 'assert';
import cheerio from 'cheerio';
-import HTML from './fixtures/html';
-
import { scoreNode, scoreParagraph } from './index';
describe('scoreNode(node)', () => {
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
- const html = '
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
`;
const opts = {
transforms: {
diff --git a/src/mercury.test.js b/src/mercury.test.js
index 0a59e881..aac2b9fe 100644
--- a/src/mercury.test.js
+++ b/src/mercury.test.js
@@ -1,5 +1,4 @@
import assert from 'assert';
-
import { record } from 'test-helpers';
import Mercury from './mercury';
diff --git a/src/resource/utils/dom/convert-lazy-loaded-images.test.js b/src/resource/utils/dom/convert-lazy-loaded-images.test.js
index fe2c281c..175f0fb6 100644
--- a/src/resource/utils/dom/convert-lazy-loaded-images.test.js
+++ b/src/resource/utils/dom/convert-lazy-loaded-images.test.js
@@ -5,9 +5,7 @@ import convertLazyLoadedImages from './convert-lazy-loaded-images';
describe('convertLazyLoadedImages($)', () => {
it('moves image links to src if placed in another attribute', () => {
- const html = '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load('');
const result = convertLazyLoadedImages($).html();
assert.equal(
@@ -17,9 +15,7 @@ describe('convertLazyLoadedImages($)', () => {
});
it('moves image source candidates to srcset if placed in another attribute', () => {
- const html = '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load('');
const result = convertLazyLoadedImages($).html();
assert.equal(
@@ -29,10 +25,9 @@ describe('convertLazyLoadedImages($)', () => {
});
it('moves image source candidates containing query strings to srcset if placed in another attribute', () => {
- const html =
- '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load(
+ ''
+ );
const result = convertLazyLoadedImages($).html();
assert.equal(
@@ -42,10 +37,9 @@ describe('convertLazyLoadedImages($)', () => {
});
it('properly handles src and srcset attributes', () => {
- const html =
- '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load(
+ ''
+ );
const result = convertLazyLoadedImages($).html();
assert.equal(
@@ -57,37 +51,30 @@ describe('convertLazyLoadedImages($)', () => {
it('does nothing when value is not a link', () => {
// This is far from perfect, since a relative url could
// be perfectly correct.
- const html = '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load('');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '');
});
it('does nothing when value is not an image', () => {
- const html = '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load('');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '');
});
it('does not change a correct img with src', () => {
- const html = '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load('');
const result = convertLazyLoadedImages($).html();
assert.equal(result, '');
});
it('does not replace an img src with srcset value', () => {
- const html =
- '';
- const $ = cheerio.load(html);
-
+ const $ = cheerio.load(
+ ''
+ );
const result = convertLazyLoadedImages($).html();
assert.equal(
diff --git a/src/resource/utils/dom/normalize-meta-tags.test.js b/src/resource/utils/dom/normalize-meta-tags.test.js
index 66430901..e60d5a92 100644
--- a/src/resource/utils/dom/normalize-meta-tags.test.js
+++ b/src/resource/utils/dom/normalize-meta-tags.test.js
@@ -5,29 +5,25 @@ import normalizeMetaTags from './normalize-meta-tags';
describe('normalizeMetaTags($)', () => {
it('replaces "content" attributes with "value"', () => {
- const html = '';
- const test = '';
-
- // browser cheerio/jquery will remove/replace html, so result
- // is different
- const testBrowser = '';
-
- const $ = cheerio.load(html);
+ // browser cheerio/jquery will remove/replace html, so result is different
+ const test = cheerio.browser
+ ? ''
+ : '';
+ const $ = cheerio.load('');
const result = normalizeMetaTags($).html();
- assert.equal(result, cheerio.browser ? testBrowser : test);
+ assert.equal(result, test);
});
it('replaces "property" attributes with "name"', () => {
- const html = '';
- const test = '';
- const testBrowser = '';
-
- const $ = cheerio.load(html);
+ const test = cheerio.browser
+ ? ''
+ : '';
+ const $ = cheerio.load('');
const result = normalizeMetaTags($).html();
- assert.equal(result, cheerio.browser ? testBrowser : test);
+ assert.equal(result, test);
});
});
diff --git a/src/utils/dom/brs-to-ps.test.js b/src/utils/dom/brs-to-ps.test.js
index 20ea97a3..85fd0910 100644
--- a/src/utils/dom/brs-to-ps.test.js
+++ b/src/utils/dom/brs-to-ps.test.js
@@ -2,35 +2,95 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
-import HTML from './fixtures/html';
import brsToPs from './brs-to-ps';
-function assertBeforeAndAfter(key, fn) {
- const $ = cheerio.load(HTML[key].before);
- assertClean(fn($).html(), HTML[key].after);
-}
-
describe('Generic Extractor Utils', () => {
describe('brsToPs(node)', () => {
it('does nothing when no BRs present', () => {
- const $ = cheerio.load(HTML.positiveId);
- assert.equal(brsToPs($).html(), HTML.positiveId);
+ const html = `
+
+
Ooo good one
+
+ `;
+ assert.equal(brsToPs(cheerio.load(html)).html(), html);
});
it('does nothing when a single BR is present', () => {
- assertBeforeAndAfter('singleBr', brsToPs);
+ const before = `
+
+
+
Ooo good one
+
+ `;
+
+ const after = `
+
+
+
Ooo good one
+
+ `;
+
+ assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts double BR tags to an empty P tag', () => {
- assertBeforeAndAfter('doubleBrs', brsToPs);
+ const before = `
+
+
+
+
Ooo good one
+
+ `;
+
+ const after = `
+
+
Ooo good one
+
+ `;
+
+ assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts several BR tags to an empty P tag', () => {
- assertBeforeAndAfter('severalBrs', brsToPs);
+ const before = `
+
+
+
+
+
+
+
Ooo good one
+
+ `;
+
+ const after = `
+
+
Ooo good one
+
+ `;
+
+ assertClean(brsToPs(cheerio.load(before)).html(), after);
});
it('converts BR tags in a P tag into a P containing inline children', () => {
- assertBeforeAndAfter('brsInP', brsToPs);
+ const before = `
+
+ Here is some text
+
+
+ Here is more text
+
+ `;
+
+ const after = `
+
+ Here is some text
+
+ Here is more text
+
+ `;
+
+ assertClean(brsToPs(cheerio.load(before)).html(), after);
});
});
});
diff --git a/src/utils/dom/clean-attributes.test.js b/src/utils/dom/clean-attributes.test.js
index 990510ac..dcb3a785 100644
--- a/src/utils/dom/clean-attributes.test.js
+++ b/src/utils/dom/clean-attributes.test.js
@@ -2,21 +2,42 @@ import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
-import HTML from './fixtures/html';
import { cleanAttributes } from './index';
describe('cleanAttributes($)', () => {
it('removes style attributes from nodes', () => {
- const $ = cheerio.load(HTML.removeStyle.before);
+ const $ = cheerio.load(`
+
+ `
+ );
});
it('converts H1s to H2s if there are 3 or more of them', () => {
- const $ = cheerio.load(HTML.convertThreeHOnes.before);
+ const $ = cheerio.load(`
+
+ `
+ );
});
it('removes a div with no images and very little text', () => {
- const $ = cheerio.load(HTML.removeShortNoImg.before);
+ const $ = cheerio.load(`
+
+ `
+ );
});
it('removes a node with a link density that is too high', () => {
- const $ = cheerio.load(HTML.linkDensityHigh.before);
+ const $ = cheerio.load(`
+
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
+
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
+
+ `
+ );
});
it('removes a node with a good score but link density > 0.5', () => {
- const $ = cheerio.load(HTML.linkDensityHigh.before);
+ const $ = cheerio.load(`
+
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
+
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
Keep this one
+
+
+ `
+ );
});
it('keeps node with a good score but link density > 0.5 if preceding text ends in colon', () => {
- const $ = cheerio.load(HTML.previousEndsInColon.before);
+ const html = `
+
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
+ `;
+ const $ = cheerio.load(html);
const result = cleanTags($('*').first(), $);
- assertClean(result.html(), HTML.previousEndsInColon.before);
+ assertClean(result.html(), html);
});
it('keeps anything with a class of entry-content-asset', () => {
- const $ = cheerio.load(HTML.cleanEntryContentAsset.before);
+ const html = `
+
+
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- goodScoreTooDense: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- previousEndsInColon: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
+ `);
const density = linkDensity($('div').first(), $);
@@ -15,7 +15,9 @@ describe('linkDensity($)', () => {
});
it('returns 1 if all of the text is a link', () => {
- const $ = cheerio.load(HTML.linkDensity1);
+ const $ = cheerio.load(`
+
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
+
+
+ `);
+
+ assert.equal(nodeIsSufficient($.root()), true);
});
});
});
diff --git a/src/utils/dom/paragraphize.test.js b/src/utils/dom/paragraphize.test.js
index 8c7c5479..aa9582c3 100644
--- a/src/utils/dom/paragraphize.test.js
+++ b/src/utils/dom/paragraphize.test.js
@@ -2,23 +2,46 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { clean } from 'test-helpers';
-import HTML from './fixtures/html';
import { paragraphize } from './index';
describe('Generic Extractor Utils', () => {
describe('paragraphize(node)', () => {
it('conversts a BR into P and moves inline contents to P tag after current parent', () => {
- const $ = cheerio.load(HTML.paragraphize.before);
+ const $ = cheerio.load(`
+
+ Here is some text
+
+ Here is more text
+ And also this
+
+ `);
const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere
const result = paragraphize(node, $, true).html();
- assert.equal(clean(result), clean(HTML.paragraphize.after));
+ assert.equal(
+ clean(result),
+ clean(`
+
+ Here is some text
+
+ Here is more text
+ And also this
+
+ `)
+ );
});
- it('conversts a BR into P and stops when block element hit', () => {
- const $ = cheerio.load(HTML.paragraphizeBlock.before);
+ it('converts a BR into P and stops when block element hit', () => {
+ const $ = cheerio.load(`
+
+ Here is some text
+
+ Here is more text
+
And also this
+
+ `);
const node = $('br').get(0);
// note: result here is not valid html; will handle elsewhere
@@ -30,7 +53,17 @@ describe('Generic Extractor Utils', () => {
'
+ `);
$ = stripJunkTags($('*').first(), $);
assert.equal($('iframe[src^="https://www.youtube.com"]').length, 1);
diff --git a/src/utils/dom/strip-unlikely-candidates.test.js b/src/utils/dom/strip-unlikely-candidates.test.js
index 1b077ace..e3e6743e 100644
--- a/src/utils/dom/strip-unlikely-candidates.test.js
+++ b/src/utils/dom/strip-unlikely-candidates.test.js
@@ -2,32 +2,71 @@ import assert from 'assert';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
-import HTML from './fixtures/html';
import stripUnlikelyCandidates from './strip-unlikely-candidates';
-function assertBeforeAndAfter(key, fn) {
- const $ = cheerio.load(HTML[key].before);
- assertClean(fn($).html(), HTML[key].after);
-}
-
describe('Generic Extractor Utils', () => {
describe('stripUnlikelyCandidates(node)', () => {
it('returns original doc if no matches found', () => {
- const $ = cheerio.load(HTML.noMatches);
- const stripped = stripUnlikelyCandidates($);
- assert.equal(stripped.html(), HTML.noMatches);
+ const html = `
+
+
Ooo good one
+
+ `;
+
+ const stripped = stripUnlikelyCandidates(cheerio.load(html));
+ assert.equal(stripped.html(), html);
});
it('strips unlikely matches from the doc', () => {
- assertBeforeAndAfter('whitelistMatch', stripUnlikelyCandidates);
+ const before = `
+
Stuff
+
+
Ooo good one
+
+ `;
+ const after = `
+
+
Ooo good one
+
+ `;
+
+ assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
});
it('keeps likely matches even when they also match the blacklist', () => {
- assertBeforeAndAfter('whiteAndBlack', stripUnlikelyCandidates);
+ const before = `
+
+ `;
+
+ assertClean(stripUnlikelyCandidates(cheerio.load(before)).html(), after);
});
});
});
diff --git a/src/utils/dom/within-comment.test.js b/src/utils/dom/within-comment.test.js
index cc2511ae..c3623edc 100644
--- a/src/utils/dom/within-comment.test.js
+++ b/src/utils/dom/within-comment.test.js
@@ -5,29 +5,35 @@ import withinComment from './within-comment';
describe('withinComment(node)', () => {
it('returns false if its parent is not a comment', () => {
- const $ = cheerio.load(`
-
-
Adam
-
-
`);
+ const $ = cheerio.load(`
+
+
+
Adam
+
+
+ `);
assert.equal(withinComment($('.author').first()), false);
});
it('returns true if its parent has a class of comment', () => {
- const $ = cheerio.load(`
-
-
Adam
-
-
`);
+ const $ = cheerio.load(`
+
+
+
Adam
+
+
+ `);
assert.equal(withinComment($('.author').first()), true);
});
it('returns true if its parent has an id of comment', () => {
- const $ = cheerio.load(`
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- goodScoreTooDense: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
-
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
Keep this one
-
-
- `,
- },
- previousEndsInColon: {
- before: `
-
-
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.