Fix Encoding on Body (#143)

* fix: check encoding on body
pull/149/head
Kevin Ngao 7 years ago committed by GitHub
parent 9d4c883d51
commit afbef9bc39

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

9
dist/mercury.js vendored

@ -264,6 +264,15 @@ function get(options) {
body = iconv.decode(body, encoding);
}
if (typeof body !== 'string') {
var $ = cheerio.load(iconv.decode(body, 'utf8'));
var contentType = $('meta[http-equiv=content-type]').attr('content');
var properEncoding = getEncoding(contentType);
if (iconv.encodingExists(properEncoding)) {
body = iconv.decode(body, properEncoding);
}
}
resolve({ body: body, response: response });
}
});

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -43,7 +43,9 @@ if (process.env.CI) {
assert.equal(article.title, result.title);
done();
}).catch((e) => {
console.log('THIS WENT WRONG', e); // eslint-disable-line no-console
console.log('There was an error', e.message); // eslint-disable-line no-console
console.log('e.fileName', e.fileName);
console.log('e.lineNumber', e.lineNumber);
assert.equal(true, false);
done();
});

@ -1,5 +1,7 @@
import cheerio from 'cheerio';
import iconv from 'iconv-lite';
import { getEncoding } from 'utils/text';
import { fetchResource } from './utils';
import {
normalizeMetaTags,
@ -51,7 +53,7 @@ const Resource = {
throw new Error('Content does not appear to be text.');
}
let $ = cheerio.load(content);
let $ = this.encodeDoc({ content, contentType });
if ($.root().children().length === 0) {
throw new Error('No children, likely a bad parse.');
@ -63,6 +65,24 @@ const Resource = {
return $;
},
encodeDoc({ content, contentType }) {
const encoding = getEncoding(contentType);
let decodedContent = iconv.decode(content, encoding);
let $ = cheerio.load(decodedContent);
// after first cheerio.load, check to see if encoding matches
const metaContentType = $('meta[http-equiv=content-type]').attr('content');
const properEncoding = getEncoding(metaContentType);
// if encodings in the header/body dont match, use the one in the body
if (properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
return $;
},
};
export default Resource;

@ -1,6 +1,7 @@
import assert from 'assert';
import cheerio from 'cheerio';
import { Errors } from 'utils';
import { getEncoding } from 'utils/text';
import { record } from 'test-helpers';
import Resource from './index';
@ -24,18 +25,31 @@ describe('Resource', () => {
assert.equal(error, Errors.badUrl);
});
});
describe('generateDoc({ body, response })', () => {
it('returns a cheerio object if valid', () => {
const response = { headers: { 'content-type': 'text/html' } };
it('fetches with different encoding on body', async () => {
const url = 'http://www.playnation.de/spiele-news/kojima-productions/hideo-kojima-reflektiert-ueber-seinen-werdegang-bei-konami-id68950.html';
const $ = await Resource.create(url);
const metaContentType = $('meta[http-equiv=content-type]').attr('value');
assert.equal(getEncoding(metaContentType), 'iso-8859-1');
const encodedU = /ü/g;
assert.equal(encodedU.test($.html()), true);
assert.equal(typeof $, 'function');
});
it('handles special encoding', async () => {
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
const $ = await Resource.create(url);
const body = '<div><p>Hi</p></div>';
const $ = Resource.generateDoc({ body, response });
const badEncodingRe = /<2F>/g;
assert.equal($.html(), body);
assert.equal(badEncodingRe.test($.html()), false);
assert.equal(typeof $, 'function');
});
});
describe('generateDoc({ body, response })', () => {
it('throws an error if the content is not text', () => {
const response = {
headers: {

@ -1,8 +1,6 @@
import URL from 'url';
import request from 'request';
import iconv from 'iconv-lite';
import { Errors } from 'utils';
import { getEncoding } from 'utils/text';
import {
REQUEST_HEADERS,
@ -17,12 +15,6 @@ function get(options) {
if (err) {
reject(err);
} else {
const encoding = getEncoding(response.headers['content-type']);
if (iconv.encodingExists(encoding)) {
body = iconv.decode(body, encoding);
}
resolve({ body, response });
}
});
@ -97,9 +89,6 @@ export default async function fetchResource(url, parsedUrl) {
url: parsedUrl.href,
headers: { ...REQUEST_HEADERS },
timeout: FETCH_TIMEOUT,
// Don't set encoding; fixes issues
// w/gzipped responses
encoding: null,
// Accept cookies
jar: true,
// Accept and decode gzip

@ -23,40 +23,30 @@ describe('fetchResource(url)', () => {
it('fetches nyt', async () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
const { response } = await fetchResource(url);
assert.equal(typeof body, 'string');
assert.equal(response.statusCode, 200);
});
it('fetches domains', async () => {
const url = 'http://theconcourse.deadspin.com/1786177057';
const { body } = await fetchResource(url);
const { response } = await fetchResource(url);
assert.equal(typeof body, 'string');
assert.equal(response.statusCode, 200);
});
it('fetches nyt', async () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
const { response } = await fetchResource(url);
assert.equal(typeof body, 'string');
assert.equal(response.statusCode, 200);
});
it('handles this gzip error', async () => {
const url = 'http://www.redcross.ca/blog/2016/11/photo-of-the-day--one-year-anniversary-of-the-end-of-ebola-in-sierra-leone';
const { body } = await fetchResource(url);
const { response } = await fetchResource(url);
assert.equal(typeof body, 'string');
});
// this test addresses https://twitter.com/flikxxi/status/800074680342351872
it('handles different encoding', async () => {
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
const { body } = await fetchResource(url);
const badEncodingRe = /<2F>/g;
assert.equal(badEncodingRe.test(body.toString()), false);
assert.equal(response.statusCode, 200);
});
});

@ -3,6 +3,7 @@
// don't need it for already rendered text
// Minimal stand-in exposing the two iconv members used by callers:
// it recognises no encodings and performs no conversion.
const iconv = {
  // Every encoding name is reported as unsupported.
  encodingExists() {
    return false;
  },
  // Hand the input back untouched; any further arguments are ignored.
  decode(s) {
    return s;
  },
};
export default iconv;

@ -22,3 +22,4 @@ export const IS_ALPHA_RE = /^[a-z]+$/i;
export const IS_DIGIT_RE = /^[0-9]+$/i;
export const ENCODING_RE = /charset=([\w-]+)\b/;
export const DEFAULT_ENCODING = 'utf-8';

@ -1,12 +1,16 @@
import { ENCODING_RE } from './constants';
import iconv from 'iconv-lite';
import { DEFAULT_ENCODING, ENCODING_RE } from './constants';
/**
 * Check a string (typically a content-type value) for a charset
 * declaration and return an encoding name that is safe to decode with.
 *
 * @param {string} str - text that may contain `charset=<name>`
 * @returns {string} the declared charset when iconv recognises it;
 *   otherwise DEFAULT_ENCODING (also used when no charset is present)
 */
export default function getEncoding(str) {
  // A single exec replaces the previous test-then-exec pair.
  const match = ENCODING_RE.exec(str);

  // Only trust a declared charset that iconv can actually decode.
  if (match !== null && iconv.encodingExists(match[1])) {
    return match[1];
  }

  return DEFAULT_ENCODING;
}

@ -1,15 +1,26 @@
import assert from 'assert';
import cheerio from 'cheerio';
import getEncoding from './get-encoding';
// Tests are bypassed in the browser, which handles encoding itself.
// There, a shim (/src/shims/iconv-lite.js) replaces iconv-lite to decrease load size.
describe('getEncoding(str)', () => {
if (cheerio.browser) return;
it('returns the encoding as a string', () => {
const contentType = 'text/html; charset=iso-8859-15';
assert.equal(getEncoding(contentType), 'iso-8859-15');
});
it('returns null if no encoding found', () => {
it('returns utf-8 as a default if no encoding found', () => {
const contentType = 'text/html';
assert.equal(getEncoding(contentType), null);
assert.equal(getEncoding(contentType), 'utf-8');
});
it('returns utf-8 if there is an invalid encoding', () => {
const contentType = 'text/html; charset=fake-charset';
assert.equal(getEncoding(contentType), 'utf-8');
});
});

Loading…
Cancel
Save