You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/mercury.test.js

221 lines
6.8 KiB
JavaScript

import assert from 'assert';
import { record } from 'test-helpers';
import Parser from './mercury';
const fs = require('fs');
describe('Parser', () => {
const recorder = record('mercury-test');
beforeAll(recorder.before);
afterAll(recorder.after);
describe('parse(url)', () => {
it('returns an error if a malformed url is passed', async () => {
const error = await Parser.parse('foo.com');
assert(/does not look like a valid URL/i.test(error.message));
});
it('does the whole thing', async () => {
const result = await Parser.parse(
'http://deadspin.com/remember-when-donald-trump-got-booed-for-butchering-ta-1788216229'
);
assert.equal(typeof result, 'object');
assert.equal(result.content.indexOf('score="') === -1, true);
});
it('returns an error on non-200 responses', async () => {
const error = await Parser.parse(
'https://www.thekitchn.com/instant-pot-chicken-pesto-pasta-eating-instantly-267141'
);
assert(/instructed to reject non-200/i.test(error.message));
});
it('returns an error on invalid content types', async () => {
const error = await Parser.parse(
'https://upload.wikimedia.org/wikipedia/commons/5/52/Spacer.gif'
);
assert(/content-type for this resource/i.test(error.message));
});
it('does wikipedia', async () => {
const result = await Parser.parse(
'https://en.wikipedia.org/wiki/Brihadeeswarar_Temple_fire'
);
assert.equal(typeof result, 'object');
});
it('does washingtonpost', async () => {
jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;
const result = await Parser.parse(
'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/'
);
assert.equal(typeof result, 'object');
assert.equal(result.total_pages, 1);
assert.equal(
result.url,
'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/'
);
});
it('does the nyt', async () => {
const result = await Parser.parse(
'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0'
);
assert.equal(typeof result, 'object');
assert.equal(result.total_pages, 1);
});
it('does ars pagination', async () => {
jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;
const url =
'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const result = await Parser.parse(url, { fetchAllPages: true });
const { total_pages, pages_rendered } = result;
assert.equal(total_pages, 3);
assert.equal(pages_rendered, 3);
assert.equal(result.next_page_url, `${url}2`);
});
});
it('returns text content if text is passed as contentType', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { content } = await Parser.parse(url, { html, contentType: 'text' });
const htmlRe = /<[a-z][\s\S]*>/g;
assert.equal(htmlRe.test(content), false);
});
it('returns markdown if markdown is passed as contentType', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { content } = await Parser.parse(url, {
html,
contentType: 'markdown',
});
const htmlRe = /<[a-z][\s\S]*>/;
const markdownRe = /\[[\w\s]+\]\(.*\)/;
assert.equal(htmlRe.test(content), false);
assert.equal(markdownRe.test(content), true);
});
it('returns custom elements if an extend object is passed', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Parser.parse(url, {
html,
extend: {
sites: {
selectors: ['a.site-name'],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 8);
assert.equal(sites[0], 'NYMag.com');
});
it('returns an array if a single element matches a custom extend', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Parser.parse(url, {
html,
extend: {
sites: {
selectors: [['li:first-child a.site-name', 'href']],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 1);
});
it('returns custom attributes if an extend object is passed', async () => {
const url =
'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync(
'./src/extractors/custom/nymag.com/fixtures/test.html',
'utf8'
);
const { sites } = await Parser.parse(url, {
html,
extend: {
sites: {
selectors: [['a.site-name', 'href']],
allowMultiple: true,
},
},
});
assert.ok(sites);
assert.equal(sites.length, 8);
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
});
it('is able to use custom extractors (with extension) added via api', async () => {
const url =
'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm';
const html = fs.readFileSync(
'./fixtures/sandiegouniontribune.com/test.html',
'utf8'
);
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
extend: {
testContent: {
selectors: ['.ArticlePage-breadcrumbs a'],
},
},
};
Parser.addExtractor(customExtractor);
const result = await Parser.parse(url, { html });
assert.equal(typeof result, 'object');
assert.equal(result.author, 'Jennifer Van Grove');
assert.equal(result.domain, 'www.sandiegouniontribune.com');
assert.equal(result.total_pages, 1);
assert.equal(result.testContent, 'Growth & Development');
});
});