You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/resource/index.test.js

119 lines
3.7 KiB
JavaScript

import assert from 'assert';
import cheerio from 'cheerio';
import { Errors } from 'utils';
import { getEncoding } from 'utils/text';
import { record } from 'test-helpers';
import Resource from './index';
/**
 * Test suite for the Resource module.
 *
 * Covers two entry points:
 *   - Resource.create(url): awaited with a URL; on success the result is
 *     callable like a cheerio root (`$`).
 *   - Resource.generateDoc({ body, response }): expected to throw for
 *     non-text content types and for documents with no children.
 *
 * All assertions use assert.strictEqual rather than the legacy, loosely
 * comparing assert.equal.
 */
describe('Resource', () => {
  // The recorder's before/after hooks wrap every test in this suite.
  // NOTE(review): presumably this records/replays HTTP fixtures under the
  // 'resource-test' name — confirm against test-helpers.
  const recorder = record('resource-test');
  beforeAll(recorder.before);
  afterAll(recorder.after);

  describe('create(url)', () => {
    it('fetches the page and returns a cheerio object', async () => {
      const url = 'http://theconcourse.deadspin.com/1786177057';
      const $ = await Resource.create(url);

      assert.strictEqual(typeof $, 'function');
    });

    // NOTE(review): the URL below is syntactically valid even though the
    // title says "malformed" — confirm which failure mode is intended.
    it('returns an error message if the url is malformed', async () => {
      const url = 'http://nytimes.com/500';
      const error = await Resource.create(url);

      // Objects are compared by identity, exactly as the loose check did.
      assert.strictEqual(error, Errors.badUrl);
    });

    it('fetches with different encoding on body', async () => {
      const url =
        'http://www.playnation.de/spiele-news/kojima-productions/hideo-kojima-reflektiert-ueber-seinen-werdegang-bei-konami-id68950.html';
      const $ = await Resource.create(url);

      // Check the type first so an unexpected return value fails as an
      // assertion instead of a TypeError when `$` is called below.
      assert.strictEqual(typeof $, 'function');

      // NOTE(review): reads `value`, not `content` — presumably Resource
      // renames meta attributes while building the doc; confirm.
      const metaContentType = $('meta[http-equiv=content-type]').attr('value');
      assert.strictEqual(getEncoding(metaContentType), 'iso-8859-1');

      // The umlaut must be present in the serialized document.
      const encodedU = /ü/;
      assert.strictEqual(encodedU.test($.html()), true);
    });

    it('fetches with different encoding and case insensitive regex', async () => {
      const url =
        'https://www.finam.ru/analysis/newsitem/putin-nagradil-grefa-ordenom-20190208-203615/';
      const $ = await Resource.create(url);

      assert.strictEqual(typeof $, 'function');

      // The trailing `i` makes the attribute-value match case-insensitive.
      const metaContentType = $('meta[http-equiv=content-type i]').attr(
        'value'
      );
      assert.strictEqual(getEncoding(metaContentType), 'windows-1251');

      // U+FFFD (REPLACEMENT CHARACTER) must not appear in the output. It is
      // written as an escape so this file's own encoding cannot corrupt it.
      const badEncodingRe = /\uFFFD/;
      assert.strictEqual(badEncodingRe.test($.html()), false);
    });

    it('fetches with different encoding and HTML5 charset tag', async () => {
      const url =
        'https://www.idnes.cz/fotbal/prvni-liga/fotbalova-liga-8-kolo-slovan-liberec-slovacko.A170925_173123_fotbal_min';
      const $ = await Resource.create(url);

      assert.strictEqual(typeof $, 'function');

      const metaContentType = $('meta[charset]').attr('charset');
      assert.strictEqual(getEncoding(metaContentType), 'windows-1250');

      // U+FFFD (REPLACEMENT CHARACTER) must not appear in the output.
      const badEncodingRe = /\uFFFD/;
      assert.strictEqual(badEncodingRe.test($.html()), false);
    });

    it('handles special encoding', async () => {
      const url =
        'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
      const $ = await Resource.create(url);

      assert.strictEqual(typeof $, 'function');

      // NOTE(review): this pattern matches the literal text "<2F>". It may be
      // a mangled rendering of an unprintable/undecodable byte — confirm the
      // intended character before relying on this check.
      const badEncodingRe = /<2F>/;
      assert.strictEqual(badEncodingRe.test($.html()), false);
    });
  });

  describe('generateDoc({ body, response })', () => {
    // Ideally `body` would be a Buffer, to avoid potential string
    // re-encoding issues. The bodies used here are empty strings, so it
    // should be fine, but this is why iconv is throwing warnings.
    it('throws an error if the content is not text', () => {
      const response = {
        headers: {
          'content-type': 'foo',
        },
      };
      const body = '';

      assert.throws(() => {
        Resource.generateDoc({ body, response });
      }, /content does not appear to be text/i);
    });

    it('throws an error if the content has no children', () => {
      // jQuery's parser won't work this way, and this is an outside case,
      // so the check is skipped when cheerio reports a browser build.
      if (cheerio.browser) {
        return;
      }

      const response = {
        headers: {
          'content-type': 'html',
        },
      };
      const body = '';

      assert.throws(() => {
        Resource.generateDoc({ body, response });
      }, /no children/i);
    });
  });
});