You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/cleaners/content.test.js

30 lines
891 B
JavaScript

import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractCleanNode from './content';
/**
 * Smoke test for extractCleanNode: loads the Wired fixture, picks the best
 * content node with extractBestNode, cleans it, and checks the length of the
 * whitespace-normalised text that remains.
 */
describe('extractCleanNode(article, { $, cleanConditionally, title })', () => {
  it('cleans cruft out of a DOM node', () => {
    // Relative path: resolved against the process working directory, so the
    // test runner must be started from the directory that holds ./fixtures.
    const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
    const $ = cheerio.load(html);

    const opts = {
      stripUnlikelyCandidates: true,
      weightNodes: true,
      cleanConditionally: true,
    };

    const bestNode = extractBestNode($, opts);

    // NOTE(review): `opts` is passed nested under an `opts` key, whereas the
    // suite title suggests extractCleanNode reads `cleanConditionally` from the
    // top level. Left as-is because the implementation is not visible here —
    // confirm whether `{ $, cleanConditionally: opts.cleanConditionally }` was
    // intended.
    const cleanNode = extractCleanNode(bestNode, { $, opts });

    // Drop newlines first, then collapse remaining whitespace runs, so the
    // measured length does not depend on the fixture's source formatting.
    const text = $(cleanNode)
      .text()
      .replace(/\n/g, '')
      .replace(/\s+/g, ' ')
      .trim();

    // Two lengths are accepted (as in the original assertion). Report the
    // actual length on failure instead of an opaque `false == true`.
    assert.ok(
      text.length === 2656 || text.length === 2657,
      `expected cleaned text length to be 2656 or 2657, got ${text.length}`
    );
  });
});