feat: encoding response body based on content-type charset (#21)

Also some small code organization
pull/22/head
Adam Pash 8 years ago committed by GitHub
parent 88c125d022
commit 7411922c55

@ -24,10 +24,10 @@ dependencies:
test:
override:
# Using 4.3.2 by default
- yarn build && yarn test -- --maxWorkers=4:
- yarn build && yarn test:node -- --maxWorkers=4:
parallel: true
# Switch to 7 and lint
- nvm use 7.0 && yarn lint:ci && yarn build && yarn test -- --maxWorkers=4:
- nvm use 7.0 && yarn lint:ci && yarn build && yarn test:node -- --maxWorkers=4:
parallel: true
- nvm use 7.0 && yarn test:web -- --maxWorkers=4 && yarn build:web -- --maxWorkers=4:
parallel: true

318
dist/mercury.js vendored

@ -9,10 +9,11 @@ var URL = _interopDefault(require('url'));
var cheerio = _interopDefault(require('cheerio'));
var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
var request = _interopDefault(require('request'));
var iconv = _interopDefault(require('iconv-lite'));
var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
var _typeof = _interopDefault(require('babel-runtime/helpers/typeof'));
var _getIterator = _interopDefault(require('babel-runtime/core-js/get-iterator'));
var _Object$keys = _interopDefault(require('babel-runtime/core-js/object/keys'));
@ -68,6 +69,167 @@ var Errors = {
}
};
// Runs of two or more whitespace characters.
var NORMALIZE_RE = /\s{2,}/g;
// Collapse internal whitespace runs to single spaces and strip both ends.
function normalizeSpaces(text) {
  var collapsed = text.replace(NORMALIZE_RE, ' ');
  return collapsed.trim();
}
// Given a list of regular expressions, return group(1) of the first one
// whose test() matches the URL, or null when none match. Each expression
// is expected to capture the string of interest as group(1).
// Only used for date_published currently.
function extractFromUrl(url, regexList) {
  var matched = null;
  for (var i = 0; i < regexList.length && matched === null; i += 1) {
    if (regexList[i].test(url)) matched = regexList[i];
  }
  return matched === null ? null : matched.exec(url)[1];
}
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
var HAS_ALPHA_RE = /[a-z]/i;
var IS_ALPHA_RE = /^[a-z]+$/i;
var IS_DIGIT_RE = /^[0-9]+$/i;
// Capture the charset value from a Content-Type header. Tolerates optional
// single/double quotes around the value (e.g. charset="utf-8", which the
// previous /charset=([\w-]+)\b/ failed to match) and any casing of
// "charset". Unquoted values still match exactly as before.
var ENCODING_RE = /charset=["']?([\w-]+)["']?/i;
// Pull a page number out of a URL via PAGE_IN_HREF_RE (group 6 is the
// digits). Numbers of 100 or more are treated as false positives and
// yield null, as does any URL with no page-like component.
function pageNumFromUrl(url) {
  var found = url.match(PAGE_IN_HREF_RE);
  if (found === null) return null;
  var pageNum = parseInt(found[6], 10);
  return pageNum < 100 ? pageNum : null;
}
// Drop any #fragment from a URL, along with a single trailing slash.
function removeAnchor(url) {
  var withoutFragment = url.split('#')[0];
  return withoutFragment.replace(/\/$/, '');
}
// Decide whether a URL path segment should be kept when building the
// article base URL. `segment` is one path piece, `index` is its position
// counted from the END of the path (the caller reduces over a reversed
// split), and `firstSegmentHasLetters` reports whether the last path
// piece contained any alphabetic characters.
function isGoodSegment(segment, index, firstSegmentHasLetters) {
var goodSegment = true;
// If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
// NOTE(review): this branch assigns `true`, a no-op — the comment says
// "Remove it", suggesting `false` was intended. Preserved as-is because
// the overlapping length check below removes many of these segments
// anyway (when the first segment has no alphas); confirm before changing.
if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
goodSegment = true;
}
// If this is the first url_segment and it's just "index",
// remove it
if (index === 0 && segment.toLowerCase() === 'index') {
goodSegment = false;
}
// If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
goodSegment = false;
}
return goodSegment;
}
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// `parsed` may be a pre-parsed Node url object; when omitted, `url` is
// parsed here. Path segments are walked in REVERSE order, so index 0 is
// the final piece of the path.
function articleBaseUrl(url, parsed) {
var parsedUrl = parsed || URL.parse(url);
var protocol = parsedUrl.protocol,
host = parsedUrl.host,
path = parsedUrl.path;
var firstSegmentHasLetters = false;
var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
var segment = rawSegment;
// Split off and save anything that looks like a file type.
// (the name is kept only when the "extension" is purely alphabetic)
if (segment.includes('.')) {
var _segment$split = segment.split('.'),
_segment$split2 = _slicedToArray(_segment$split, 2),
possibleSegment = _segment$split2[0],
fileExt = _segment$split2[1];
if (IS_ALPHA_RE.test(fileExt)) {
segment = possibleSegment;
}
}
// If our first or second segment has anything looking like a page
// number, remove it.
if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
segment = segment.replace(PAGE_IN_HREF_RE, '');
}
// If we're on the first segment, check to see if we have any
// characters in it. The first segment is actually the last bit of
// the URL, and this will be helpful to determine if we're on a URL
// segment that looks like "/2/" for example.
if (index === 0) {
firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
}
// If it's not marked for deletion, push it to cleaned_segments.
if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
acc.push(segment);
}
return acc;
}, []);
// Segments were accumulated in reverse; restore original order.
return protocol + '//' + host + cleanedSegments.reverse().join('/');
}
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
// The period must be escaped: the previous pattern '.( |$)' used an
// unescaped dot, which matches ANY character followed by a space or
// end-of-string, so nearly every non-empty string passed.
var SENTENCE_END_RE = new RegExp('\\.( |$)');
function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}
// Return up to the first `words` (default 10) whitespace-delimited tokens
// of `content`, joined with single spaces. (The arguments dance is the
// transpiled form of a default parameter.)
function excerptContent(content) {
  var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  var tokens = content.trim().split(/\s+/);
  return tokens.slice(0, words).join(' ');
}
// check a string for encoding; this is
// used in our fetchResource function to
// ensure correctly encoded responses
function getEncoding(str) {
  var match = ENCODING_RE.exec(str);
  return match === null ? null : match[1];
}
// Browser does not like us setting user agent
var REQUEST_HEADERS = cheerio.browser ? {} : {
'User-Agent': 'Mercury - https://mercury.postlight.com/web-parser/'
@ -95,6 +257,12 @@ function get(options) {
if (err) {
reject(err);
} else {
var encoding = getEncoding(response.headers['content-type']);
if (iconv.encodingExists(encoding)) {
body = iconv.decode(body, encoding);
}
resolve({ body: body, response: response });
}
});
@ -1047,154 +1215,6 @@ function scoreContent$$1($) {
return $;
}
// NOTE(review): everything below duplicates the utils/text helpers that
// appear earlier in this bundle — this hunk is the diff's record of the
// definitions being removed from their old location by the code
// reorganization in this commit.
var NORMALIZE_RE = /\s{2,}/g;
function normalizeSpaces(text) {
return text.replace(NORMALIZE_RE, ' ').trim();
}
// Given a node type to search for, and a list of regular expressions,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
function extractFromUrl(url, regexList) {
var matchRe = regexList.find(function (re) {
return re.test(url);
});
if (matchRe) {
return matchRe.exec(url)[1];
}
return null;
}
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
var HAS_ALPHA_RE = /[a-z]/i;
var IS_ALPHA_RE = /^[a-z]+$/i;
var IS_DIGIT_RE = /^[0-9]+$/i;
function pageNumFromUrl(url) {
var matches = url.match(PAGE_IN_HREF_RE);
if (!matches) return null;
var pageNum = parseInt(matches[6], 10);
// Return pageNum < 100, otherwise
// return null
return pageNum < 100 ? pageNum : null;
}
// NOTE(review): continued duplicate of the utils/text helpers defined
// earlier in this bundle (removed from this location by this commit).
function removeAnchor(url) {
return url.split('#')[0].replace(/\/$/, '');
}
function isGoodSegment(segment, index, firstSegmentHasLetters) {
var goodSegment = true;
// If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
// NOTE(review): assigns `true`, a no-op; the comment suggests `false`
// was intended. Preserved as-is in this (removed) copy.
if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
goodSegment = true;
}
// If this is the first url_segment and it's just "index",
// remove it
if (index === 0 && segment.toLowerCase() === 'index') {
goodSegment = false;
}
// If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
goodSegment = false;
}
return goodSegment;
}
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
function articleBaseUrl(url, parsed) {
var parsedUrl = parsed || URL.parse(url);
var protocol = parsedUrl.protocol,
host = parsedUrl.host,
path = parsedUrl.path;
var firstSegmentHasLetters = false;
var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
var segment = rawSegment;
// Split off and save anything that looks like a file type.
if (segment.includes('.')) {
var _segment$split = segment.split('.'),
_segment$split2 = _slicedToArray(_segment$split, 2),
possibleSegment = _segment$split2[0],
fileExt = _segment$split2[1];
if (IS_ALPHA_RE.test(fileExt)) {
segment = possibleSegment;
}
}
// If our first or second segment has anything looking like a page
// number, remove it.
if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
segment = segment.replace(PAGE_IN_HREF_RE, '');
}
// If we're on the first segment, check to see if we have any
// characters in it. The first segment is actually the last bit of
// the URL, and this will be helpful to determine if we're on a URL
// segment that looks like "/2/" for example.
if (index === 0) {
firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
}
// If it's not marked for deletion, push it to cleaned_segments.
if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
acc.push(segment);
}
return acc;
}, []);
return protocol + '//' + host + cleanedSegments.reverse().join('/');
}
// NOTE(review): final piece of the duplicated utils/text helpers removed
// from this location by this commit.
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
// NOTE(review): the '.' below is unescaped, so it matches ANY character
// followed by a space or end-of-string — almost every non-empty string
// passes. Preserved as-is in this (removed) copy.
var SENTENCE_END_RE = new RegExp('.( |$)');
function hasSentenceEnd(text) {
return SENTENCE_END_RE.test(text);
}
function excerptContent(content) {
var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
return content.trim().split(/\s+/).slice(0, words).join(' ');
}
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -13,7 +13,7 @@
"build:generator": "rollup -c scripts/rollup.config.js",
"test_build": "rollup -c",
"test": "yarn test:node && yarn test:web",
"test:node": "jest",
"test:node": "jest ./src",
"test:web": "./node_modules/karma/bin/karma start karma.conf.js",
"test:build": "cd ./scripts && jest check-build.test.js",
"test:build:web": "node ./scripts/proxy-browser-test.js",
@ -82,6 +82,7 @@
"cheerio": "^0.22.0",
"difflib": "adampash/difflib.js",
"ellipsize": "0.0.2",
"iconv-lite": "^0.4.15",
"jquery": "^3.1.1",
"moment": "^2.14.1",
"request": "czardoz/request",
@ -93,7 +94,8 @@
},
"browser": {
"main": "./dist/mercury.web.js",
"cheerio": "./src/utils/cheerio-query",
"request": "browser-request"
"cheerio": "./src/shims/cheerio-query",
"request": "browser-request",
"iconv-lite": "./src/shims/iconv-lite"
}
}

@ -1,6 +1,8 @@
import URL from 'url';
import request from 'request';
import iconv from 'iconv-lite';
import { Errors } from 'utils';
import { getEncoding } from 'utils/text';
import {
REQUEST_HEADERS,
@ -15,6 +17,12 @@ function get(options) {
if (err) {
reject(err);
} else {
const encoding = getEncoding(response.headers['content-type']);
if (iconv.encodingExists(encoding)) {
body = iconv.decode(body, encoding);
}
resolve({ body, response });
}
});

@ -25,27 +25,38 @@ describe('fetchResource(url)', () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
assert.equal(typeof body, 'string');
});
it('fetches domains', async () => {
const url = 'http://theconcourse.deadspin.com/1786177057';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
assert.equal(typeof body, 'string');
});
it('fetches nyt', async () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
assert.equal(typeof body, 'string');
});
it('handles this gzip error', async () => {
const url = 'http://www.redcross.ca/blog/2016/11/photo-of-the-day--one-year-anniversary-of-the-end-of-ebola-in-sierra-leone';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
assert.equal(typeof body, 'string');
});
// this test addresses https://twitter.com/flikxxi/status/800074680342351872
it('handles different encoding', async () => {
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
const { body } = await fetchResource(url);
const badEncodingRe = /<2F>/g;
assert.equal(badEncodingRe.test(body.toString()), false);
});
});

@ -0,0 +1,8 @@
// this is a shim for the browser build;
// iconv-lite doubles build size, and we
// don't need it for already rendered text
const iconv = {
  // Always report "unknown encoding" so callers never reach iconv.decode.
  encodingExists() {
    return false;
  },
};
export default iconv;

@ -20,3 +20,5 @@ export const HAS_ALPHA_RE = /[a-z]/i;
export const IS_ALPHA_RE = /^[a-z]+$/i;
export const IS_DIGIT_RE = /^[0-9]+$/i;
export const ENCODING_RE = /charset=([\w-]+)\b/;

@ -0,0 +1,12 @@
import { ENCODING_RE } from './constants';
// check a string for encoding; this is
// used in our fetchResource function to
// ensure correctly encoded responses
export default function getEncoding(str) {
  const match = ENCODING_RE.exec(str);
  return match === null ? null : match[1];
}

@ -0,0 +1,15 @@
import assert from 'assert';
import getEncoding from './get-encoding';
// Unit tests (jest) for the getEncoding helper: happy path with an
// explicit charset parameter, and the null fallback when none is present.
describe('getEncoding(str)', () => {
it('returns the encoding as a string', () => {
const contentType = 'text/html; charset=iso-8859-15';
assert.equal(getEncoding(contentType), 'iso-8859-15');
});
it('returns null if no encoding found', () => {
const contentType = 'text/html';
assert.equal(getEncoding(contentType), null);
});
});

@ -5,3 +5,4 @@ export { default as removeAnchor } from './remove-anchor';
export { default as articleBaseUrl } from './article-base-url';
export { default as hasSentenceEnd } from './has-sentence-end';
export { default as excerptContent } from './excerpt-content';
export { default as getEncoding } from './get-encoding';

@ -2539,7 +2539,7 @@ https-browserify@~0.0.0:
version "0.0.1"
resolved "https://registry.yarnpkg.com/https-browserify/-/https-browserify-0.0.1.tgz#3f91365cabe60b77ed0ebba24b454e3e09d95a82"
iconv-lite@^0.4.13:
iconv-lite, iconv-lite@^0.4.13:
version "0.4.15"
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.15.tgz#fe265a218ac6a57cfe854927e9d04c19825eddeb"

Loading…
Cancel
Save