import assert from 'assert';
import fs from 'fs';
import cheerio from 'cheerio';

import { assertClean } from 'test-helpers';

import RootExtractor, {
  select,
  cleanBySelectors,
  transformElements,
} from './root-extractor';
import { NYMagExtractor } from './custom/nymag.com';

/*
 * Tests for the root extractor and the helpers it exports
 * (`select`, `cleanBySelectors`, `transformElements`).
 *
 * Every inline HTML fixture below must contain the elements its test selects
 * (`.body`, `.ad`, `h1`, `noscript`, `.author`, `time[datetime]`). If the
 * markup is missing, the cheerio selections are empty and the assertions
 * either pass vacuously or can never match the expected value.
 */

describe('RootExtractor', () => {
  it('only returns what the custom parser gives it if fallback is disabled', () => {
    const fullUrl =
      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
    // Relative path: resolved against the process working directory
    // (presumably the project root when the suite is run).
    const html = fs.readFileSync(
      './src/extractors/custom/nymag.com/fixtures/test.html',
      'utf8'
    );
    const $ = cheerio.load(html);

    const { url } = RootExtractor.extract(NYMagExtractor, {
      url: fullUrl,
      html,
      $,
      metaCache: [],
      fallback: false,
    });

    // Legacy `assert.equal` compares with `==`, so both `null` and
    // `undefined` satisfy this check.
    assert.equal(url, null);
  });
});

describe('cleanBySelectors($content, $, { clean })', () => {
  it('removes provided selectors from the content', () => {
    const opts = { clean: ['.ad', '.share'] };
    // The fixture contains one element per selector in `clean`, so both
    // assertions below actually exercise the removal.
    const html = `
      <div class="body">
        <div class="share"></div>
        <p>This is some good content</p>
        <div class="ad">Advertisement!</div>
      </div>
    `;
    const $ = cheerio.load(html);

    const $content = cleanBySelectors($('.body'), $, opts);

    assert.equal($content.find('.ad').length, 0);
    assert.equal($content.find('.share').length, 0);
  });
});

describe('transformElements($content, $, { transforms })', () => {
  it('performs a simple transformation on matched elements', () => {
    // `html` and `after` use identical indentation so the serialized
    // `.body` element matches `after` whatever whitespace normalization
    // `assertClean` applies.
    const html = `
      <div class="body">
        <h1>WOW BIG TITLE</h1>
        <p>Here are some words</p>
        <h1>WOW BIG TITLE</h1>
      </div>
    `;
    // String transform: every `h1` is expected to become an `h2`.
    const opts = {
      transforms: { h1: 'h2' },
    };
    const $ = cheerio.load(html);
    const after = `
      <div class="body">
        <h2>WOW BIG TITLE</h2>
        <p>Here are some words</p>
        <h2>WOW BIG TITLE</h2>
      </div>
    `;

    const $content = transformElements($('.body'), $, opts);

    assertClean($.html($content), after);
  });

  it('performs a complex transformation on matched elements', () => {
    // The `noscript` wrapper is kept on one line so that no whitespace text
    // nodes end up inside the element that replaces it.
    const html = `
      <div class="body">
        <noscript><img src="/img.jpg"></noscript>
        <p>Here are some words</p>
      </div>
    `;
    const opts = {
      transforms: {
        // Function transform: returns the replacement tag name, or null to
        // leave the node untouched. A `noscript` whose only child is an
        // `img` is turned into a `figure`.
        noscript: ($node, $) => {
          // When `$.browser` is set, the children are obtained by parsing
          // the node's text; otherwise they are read from the parsed tree.
          const $children = $.browser ? $($node.text()) : $node.children();
          if (
            $children.length === 1 &&
            $children.get(0) !== undefined &&
            $children.get(0).tagName.toLowerCase() === 'img'
          ) {
            return 'figure';
          }

          return null;
        },
      },
    };
    const $ = cheerio.load(html);
    const after = `
      <div class="body">
        <figure><img src="/img.jpg"></figure>
        <p>Here are some words</p>
      </div>
    `;

    const $content = transformElements($('.body'), $, opts);

    assertClean($.html($content), after);
  });
});

describe('select(opts)', () => {
  it('returns a node\'s text with a simple selector', () => {
    const html = `
      <div class="author">Bob</div>
    `;
    const $ = cheerio.load(html);
    const opts = {
      type: 'author',
      $,
      extractionOpts: {
        selectors: ['.author'],
      },
    };

    const result = select(opts);

    assert.equal(result, 'Bob');
  });

  it('returns a node\'s attr with an attr selector', () => {
    const html = `
      <time datetime="2016-09-07T09:07:59.000Z"></time>
    `;
    const $ = cheerio.load(html);
    const opts = {
      type: 'date_published',
      $,
      extractionOpts: {
        // A [selector, attribute] pair: the value is read from the
        // attribute of the matched node instead of its text.
        selectors: [['time', 'datetime']],
      },
    };

    const result = select(opts);

    assert.equal(result, '2016-09-07T09:07:59.000Z');
  });
});