You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/root-extractor.test.js

165 lines
4.1 KiB
JavaScript

import assert from 'assert';
import fs from 'fs';
import cheerio from 'cheerio';
import { assertClean } from 'test-helpers';
import {
default as RootExtractor,
select,
cleanBySelectors,
transformElements,
} from './root-extractor';
import { NYMagExtractor } from './custom/nymag.com';
// Exercises RootExtractor.extract with the NYMag custom extractor while the
// generic fallback is switched off.
describe('RootExtractor', () => {
  it('only returns what the custom parser gives it if fallback is disabled', () => {
    const articleUrl = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
    // Path is relative to the process working directory.
    const fixturePath = './src/extractors/custom/nymag.com/fixtures/test.html';

    const html = fs.readFileSync(fixturePath, 'utf8');
    const $ = cheerio.load(html);
    const extractOpts = {
      url: articleUrl,
      html,
      $,
      metaCache: [],
      fallback: false,
    };

    const extracted = RootExtractor.extract(NYMagExtractor, extractOpts);

    // Loose comparison kept deliberately: it accepts both null and undefined,
    // exactly as the original assertion did.
    assert.equal(extracted.url, null);
  });
});
// Verifies that cleanBySelectors strips every node matching the `clean`
// selectors from the content and leaves non-matching nodes in place.
describe('cleanBySelectors($content, $, { clean })', () => {
  it('removes provided selectors from the content', () => {
    const opts = { clean: ['.ad', '.share'] };
    // Fixture markup is kept flush-left so the parsed string stays unchanged.
    const html = `
<div>
<div class="body">
<div class="share">Share this on twitter plz</div>
<p>This is some good content</p>
<div class="ad">Advertisement!</div>
</div>
</div>`;
    const $ = cheerio.load(html);

    const $content = cleanBySelectors($('.body'), $, opts);

    // strictEqual avoids the coercing comparison of the legacy assert.equal.
    assert.strictEqual($content.find('.ad').length, 0);
    assert.strictEqual($content.find('.share').length, 0);
    // Guard against over-removal: the paragraph matches no `clean` selector,
    // so a test that only counts removed nodes would also pass if everything
    // had been deleted.
    assert.strictEqual($content.find('p').length, 1);
  });
});
// Covers transformElements with both kinds of transform value used here:
// a plain tag-name string and a function that decides per node.
// Fixture markup is kept flush-left so the compared strings stay unchanged.
describe('transformElements($content, $, { transforms })', () => {
  it('performs a simple transformation on matched elements', () => {
    const transformOpts = { transforms: { h1: 'h2' } };
    const source = `
<div>
<div class="body">
<h1>WOW BIG TITLE</h1>
<p>Here are some words</p>
<h1>WOW BIG TITLE</h1>
</div>
</div>
`;
    const expected = `
<div class="body">
<h2>WOW BIG TITLE</h2>
<p>Here are some words</p>
<h2>WOW BIG TITLE</h2>
</div>
`;
    const $ = cheerio.load(source);

    const $transformed = transformElements($('.body'), $, transformOpts);

    assertClean($.html($transformed), expected);
  });

  it('performs a complex transformation on matched elements', () => {
    // Returns 'figure' for a <noscript> whose only child is an <img>;
    // returns null in every other case.
    const noscriptToFigure = ($node, $) => {
      let $children;
      if ($.browser) {
        // When $.browser is set, the children are built from the node's text.
        $children = $($node.text());
      } else {
        $children = $node.children();
      }

      if ($children.length !== 1) {
        return null;
      }
      const onlyChild = $children.get(0);
      if (onlyChild === undefined) {
        return null;
      }
      return onlyChild.tagName.toLowerCase() === 'img' ? 'figure' : null;
    };

    const transformOpts = {
      transforms: {
        noscript: noscriptToFigure,
      },
    };
    const source = `
<div>
<div class="body">
<noscript>
<img src="/img.jpg" />
</noscript>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
</div>
</div>
`;
    const expected = `
<div class="body">
<figure>
<img src="/img.jpg">
</figure>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
</div>
`;
    const $ = cheerio.load(source);

    const $transformed = transformElements($('.body'), $, transformOpts);

    assertClean($.html($transformed), expected);
  });
});
// Checks select() for the two selector shapes used in this file: a plain CSS
// selector string and a [selector, attribute] pair.
// Fixture markup is kept flush-left so the parsed strings stay unchanged.
describe('select(opts)', () => {
  it('returns a node\'s text with a simple selector', () => {
    const html = `
<div><div class="author">Bob</div></div>
`;
    const $ = cheerio.load(html);
    const opts = {
      type: 'author',
      $,
      extractionOpts: {
        selectors: ['.author'],
      },
    };

    const result = select(opts);

    // strictEqual: the legacy assert.equal compares with ==, which would let
    // a non-string value that coerces to 'Bob' slip through.
    assert.strictEqual(result, 'Bob');
  });

  it('returns a node\'s attr with an attr selector', () => {
    const html = `
<div>
<time datetime="2016-09-07T05:07:59-04:00">
September 7, 2016
</time>
</div>
`;
    const $ = cheerio.load(html);
    const opts = {
      type: 'date_published',
      $,
      extractionOpts: {
        // A [selector, attribute] pair: read the attribute, not the text.
        selectors: [['time', 'datetime']],
      },
    };

    const result = select(opts);

    // The expected value is the fixture's -04:00 timestamp expressed in UTC.
    assert.strictEqual(result, '2016-09-07T09:07:59.000Z');
  });
});