feat: Various Character Encoding Improvements (#270)

* Support HTML5 charset tag

In HTML5, `<meta charset="…">` is shorthand for `<meta http-equiv="content-type" content="text/html; charset=…">`:
https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta

* Handle more character encoding declaration methods.
fix-custom-generator
Ben Ubois 5 years ago committed by Adam Pash
parent b3fa18b6d9
commit 0e27448866

File diff suppressed because one or more lines are too long

@ -66,11 +66,13 @@ const Resource = {
let $ = cheerio.load(decodedContent);
// after first cheerio.load, check to see if encoding matches
const metaContentType = $('meta[http-equiv=content-type]').attr('content');
const metaContentType =
$('meta[http-equiv=content-type i]').attr('content') ||
$('meta[charset]').attr('charset');
const properEncoding = getEncoding(metaContentType);
// if encodings in the header/body don't match, use the one in the body
if (properEncoding !== encoding) {
if (metaContentType && properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}

@ -39,6 +39,36 @@ describe('Resource', () => {
assert.equal(typeof $, 'function');
});
it('fetches with different encoding and case insensitive regex', async () => {
  const $ = await Resource.create(
    'https://www.finam.ru/analysis/newsitem/putin-nagradil-grefa-ordenom-20190208-203615/'
  );

  // The trailing `i` in the selector makes the attribute-value comparison
  // case-insensitive, so differently-cased `Content-Type` values still match.
  // NOTE(review): this reads the `value` attribute rather than `content` --
  // confirm that matches the markup of the page under test.
  const declaredContentType = $('meta[http-equiv=content-type i]').attr('value');
  assert.equal(getEncoding(declaredContentType), 'windows-1251');

  // The serialized document must not contain the `&#xFFFD;` entity
  // (presumably a sign the body was decoded with the wrong charset).
  const hasBadEncoding = /&#xFFFD;/g.test($.html());
  assert.equal(hasBadEncoding, false);

  assert.equal(typeof $, 'function');
});
it('fetches with different encoding and HTML5 charset tag', async () => {
  const $ = await Resource.create(
    'https://www.idnes.cz/fotbal/prvni-liga/fotbalova-liga-8-kolo-slovan-liberec-slovacko.A170925_173123_fotbal_min'
  );

  // Here the encoding is read from the `charset` attribute of a
  // `<meta charset>` tag instead of an http-equiv content-type declaration.
  const declaredCharset = $('meta[charset]').attr('charset');
  assert.equal(getEncoding(declaredCharset), 'windows-1250');

  // The serialized document must not contain the `&#xFFFD;` entity
  // (presumably a sign the body was decoded with the wrong charset).
  const hasBadEncoding = /&#xFFFD;/g.test($.html());
  assert.equal(hasBadEncoding, false);

  assert.equal(typeof $, 'function');
});
it('handles special encoding', async () => {
const url =
'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';

@ -52,12 +52,11 @@ export function record(name, options = {}) {
// eslint-disable-next-line no-console
console.log(
`This is disabled for browser/node interop. To capture fixutres,
open ${'`src/test-helpers.js`'} and comment out lines 57 and 58 and
uncomment the fs import at top of file.`
open ${'`src/test-helpers.js`'} and uncomment lines 58 and 59 and
the fs import at top of file.`
);
// const text = `const nock = require('nock');\n${has_fixtures.join('\n')}`;
// fs.writeFile(fp, text, done);
} else {
done();
}

@ -6,11 +6,12 @@ import { DEFAULT_ENCODING, ENCODING_RE } from './constants';
// ensure correctly encoded responses
/**
 * Resolve the character encoding named by `str`.
 *
 * `str` is first run through `ENCODING_RE`; when it matches, the first capture
 * group is taken as the candidate encoding name. When it does not match, `str`
 * itself is used as the candidate, so a bare encoding name (for example the
 * value of a `<meta charset>` attribute) is accepted as well.
 *
 * @param {string} [str] - Text that may contain, or itself be, an encoding name.
 * @returns {string} The candidate when `iconv.encodingExists` accepts it,
 *   otherwise `DEFAULT_ENCODING`.
 */
export default function getEncoding(str) {
  // Run the regex exactly once and keep the result, instead of a separate
  // `test` followed by `exec` on the same input.
  const matches = ENCODING_RE.exec(str);

  // Use a local for the candidate rather than reassigning the parameter.
  const candidate = matches !== null ? matches[1] : str;

  return iconv.encodingExists(candidate) ? candidate : DEFAULT_ENCODING;
}

Loading…
Cancel
Save