You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
303 lines
7.4 KiB
JavaScript
303 lines
7.4 KiB
JavaScript
import assert from 'assert';
|
|
import cheerio from 'cheerio';
|
|
|
|
import { assertClean } from 'test-helpers';
|
|
import RootExtractor, {
|
|
select,
|
|
cleanBySelectors,
|
|
transformElements,
|
|
} from './root-extractor';
|
|
|
|
import { NYMagExtractor } from './custom/nymag.com';
|
|
|
|
const fs = require('fs');
|
|
|
|
describe('RootExtractor', () => {
|
|
it('only returns what the custom parser gives it if fallback is disabled', () => {
|
|
const fullUrl =
|
|
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
|
|
const html = fs.readFileSync(
|
|
'./src/extractors/custom/nymag.com/fixtures/test.html',
|
|
'utf8'
|
|
);
|
|
const $ = cheerio.load(html);
|
|
|
|
const { url } = RootExtractor.extract(NYMagExtractor, {
|
|
url: fullUrl,
|
|
html,
|
|
$,
|
|
metaCache: [],
|
|
fallback: false,
|
|
});
|
|
|
|
assert.equal(url, null);
|
|
});
|
|
});
|
|
|
|
describe('cleanBySelectors($content, $, { clean })', () => {
|
|
it('removes provided selectors from the content', () => {
|
|
const opts = { clean: ['.ad', '.share'] };
|
|
const html = `
|
|
<div>
|
|
<div class="body">
|
|
<div class="share">Share this on twitter plz</div>
|
|
<p>This is some good content</p>
|
|
<div class="ad">Advertisement!</div>
|
|
</div>
|
|
</div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
|
|
let $content = $('.body');
|
|
$content = cleanBySelectors($content, $, opts);
|
|
|
|
assert.equal($content.find('.ad').length, 0);
|
|
assert.equal($content.find('.share').length, 0);
|
|
});
|
|
});
|
|
|
|
describe('transformElements($content, $, { transforms })', () => {
|
|
it('performs a simple transformation on matched elements', () => {
|
|
const html = `
|
|
<div>
|
|
<div class="body">
|
|
<h1>WOW BIG TITLE</h1>
|
|
<p>Here are some words</p>
|
|
<h1>WOW BIG TITLE</h1>
|
|
</div>
|
|
</div>
|
|
`;
|
|
const opts = {
|
|
transforms: { h1: 'h2' },
|
|
};
|
|
const $ = cheerio.load(html);
|
|
let $content = $('.body');
|
|
|
|
const after = `
|
|
<div class="body">
|
|
<h2>WOW BIG TITLE</h2>
|
|
<p>Here are some words</p>
|
|
<h2>WOW BIG TITLE</h2>
|
|
</div>
|
|
`;
|
|
|
|
$content = transformElements($content, $, opts);
|
|
assertClean($.html($content), after);
|
|
});
|
|
|
|
it('performs a complex transformation on matched elements', () => {
|
|
const html = `
|
|
<div>
|
|
<div class="body">
|
|
<noscript>
|
|
<img src="/img.jpg" />
|
|
</noscript>
|
|
<noscript>
|
|
Something else
|
|
</noscript>
|
|
<p>Here are some words</p>
|
|
</div>
|
|
</div>
|
|
`;
|
|
const opts = {
|
|
transforms: {
|
|
noscript: ($node, $) => {
|
|
const $children = $.browser ? $($node.text()) : $node.children();
|
|
if (
|
|
$children.length === 1 &&
|
|
$children.get(0) !== undefined &&
|
|
$children.get(0).tagName.toLowerCase() === 'img'
|
|
) {
|
|
return 'figure';
|
|
}
|
|
|
|
return null;
|
|
},
|
|
},
|
|
};
|
|
const $ = cheerio.load(html);
|
|
let $content = $('.body');
|
|
|
|
const after = `
|
|
<div class="body">
|
|
<figure>
|
|
<img src="/img.jpg">
|
|
</figure>
|
|
<noscript>
|
|
Something else
|
|
</noscript>
|
|
<p>Here are some words</p>
|
|
</div>
|
|
`;
|
|
|
|
$content = transformElements($content, $, opts);
|
|
assertClean($.html($content), after);
|
|
});
|
|
});
|
|
|
|
describe('select(opts)', () => {
|
|
it("returns a node's text with a simple selector", () => {
|
|
const html = `
|
|
<div><div class="author">Bob</div></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'author',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: ['.author'],
|
|
},
|
|
};
|
|
|
|
const result = select(opts);
|
|
assert.equal(result, 'Bob');
|
|
});
|
|
|
|
it("returns a node's attr with an attr selector", () => {
|
|
const html = `
|
|
<div>
|
|
<time datetime="2016-09-07T05:07:59-04:00">
|
|
September 7, 2016
|
|
</time>
|
|
</div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'date_published',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: [['time', 'datetime']],
|
|
},
|
|
};
|
|
|
|
const result = select(opts);
|
|
assert.equal(result, '2016-09-07T09:07:59.000Z');
|
|
});
|
|
|
|
it("returns a node's html when it is a content selector", () => {
|
|
const html = `
|
|
<div><div class="content-is-here"><p>Wow what a piece of content</p></div></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'content',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: ['.content-is-here'],
|
|
},
|
|
extractHtml: true,
|
|
};
|
|
|
|
const result = select(opts);
|
|
assertClean(result, html);
|
|
});
|
|
|
|
it('handles multiple matches when the content selector is an array', () => {
|
|
const html = `
|
|
<div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'content',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: [['.lead-image', '.content-is-here']],
|
|
},
|
|
extractHtml: true,
|
|
};
|
|
|
|
const result = select(opts);
|
|
assert.equal($(result).find('img.lead-image').length, 1);
|
|
assert.equal($(result).find('.content-is-here').length, 1);
|
|
});
|
|
|
|
it('skips multi-match if not all selectors are present', () => {
|
|
const html = `
|
|
<div><div><img class="lead-image" src="#" /><div class="content-is-here"><p>Wow what a piece of content</p></div></div></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'content',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: [['.lead-image', '.content-is-here', '.foo']],
|
|
},
|
|
extractHtml: true,
|
|
};
|
|
|
|
const result = select(opts);
|
|
|
|
assert.equal(result, null);
|
|
});
|
|
|
|
it('returns an array of results if allowMultiple is true', () => {
|
|
const html = `
|
|
<div><div><ul><li class="item">One</li><li class="item">Two</li></ul></div></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'items',
|
|
$,
|
|
extractionOpts: {
|
|
selectors: ['.item'],
|
|
allowMultiple: true,
|
|
},
|
|
extractHtml: true,
|
|
};
|
|
|
|
const result = select(opts);
|
|
|
|
assert.equal(result.length, 2);
|
|
assert.deepEqual(result, [
|
|
'<li class="item">One</li>',
|
|
'<li class="item">Two</li>',
|
|
]);
|
|
});
|
|
|
|
it('makes links absolute in extended types when extracting HTML', () => {
|
|
const html = `
|
|
<div><p><a class="linky" href="/foo">Bar</a></p></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'links',
|
|
$,
|
|
url: 'http://example.com',
|
|
extractionOpts: {
|
|
selectors: ['.linky'],
|
|
},
|
|
extractHtml: true,
|
|
};
|
|
|
|
const result = select(opts);
|
|
|
|
assert.equal(
|
|
result,
|
|
'<div><a class="linky" href="http://example.com/foo">Bar</a></div>'
|
|
);
|
|
});
|
|
|
|
it('makes links absolute in extended types when extracting attrs', () => {
|
|
const html = `
|
|
<div><p><a class="linky" href="/foo">Bar</a><a class="linky" href="/bar">Baz</a></p></div>
|
|
`;
|
|
const $ = cheerio.load(html);
|
|
const opts = {
|
|
type: 'links',
|
|
$,
|
|
url: 'http://example.com',
|
|
extractionOpts: {
|
|
selectors: [['.linky', 'href']],
|
|
allowMultiple: true,
|
|
},
|
|
};
|
|
|
|
const result = select(opts);
|
|
|
|
assert.deepEqual(result, [
|
|
'http://example.com/foo',
|
|
'http://example.com/bar',
|
|
]);
|
|
});
|
|
});
|