feat: encoding response body based on content-type charset (#21)

Also includes some small code reorganization.
8 years ago · 7411922c55
parent 88c125d022
commit 7411922c55
15 changed files with 390 additions and 243 deletions
--- a/circle.yml
+++ b/circle.yml
@@ -24,10 +24,10 @@ dependencies:
 test:
  override:
    # Using 4.3.2 by default
-    - yarn build && yarn test -- --maxWorkers=4:
+    - yarn build && yarn test:node -- --maxWorkers=4:
        parallel: true
    # Switch to 7 and lint
-    - nvm use 7.0 && yarn lint:ci && yarn build && yarn test -- --maxWorkers=4:
+    - nvm use 7.0 && yarn lint:ci && yarn build && yarn test:node -- --maxWorkers=4:
        parallel: true
    - nvm use 7.0 && yarn test:web -- --maxWorkers=4 && yarn build:web -- --maxWorkers=4:
        parallel: true
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -9,10 +9,11 @@ var URL = _interopDefault(require('url'));
 var cheerio = _interopDefault(require('cheerio'));
 var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
 var request = _interopDefault(require('request'));
+var iconv = _interopDefault(require('iconv-lite'));
+var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
 var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
 var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
 var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
-var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
 var _typeof = _interopDefault(require('babel-runtime/helpers/typeof'));
 var _getIterator = _interopDefault(require('babel-runtime/core-js/get-iterator'));
 var _Object$keys = _interopDefault(require('babel-runtime/core-js/object/keys'));
@@ -68,6 +69,167 @@ var Errors = {
  }
 };

+var NORMALIZE_RE = /\s{2,}/g;
+
+function normalizeSpaces(text) {
+  return text.replace(NORMALIZE_RE, ' ').trim();
+}
+
+// Given a node type to search for, and a list of regular expressions,
+// look to see if this extraction can be found in the URL. Expects
+// that each expression in r_list will return group(1) as the proper
+// string to be cleaned.
+// Only used for date_published currently.
+function extractFromUrl(url, regexList) {
+  var matchRe = regexList.find(function (re) {
+    return re.test(url);
+  });
+  if (matchRe) {
+    return matchRe.exec(url)[1];
+  }
+
+  return null;
+}
+
+// An expression that looks to try to find the page digit within a URL, if
+// it exists.
+// Matches:
+//  page=1
+//  pg=1
+//  p=1
+//  paging=12
+//  pag=7
+//  pagination/1
+//  paging/88
+//  pa/83
+//  p/11
+//
+// Does not match:
+//  pg=102
+//  page:2
+var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
+
+var HAS_ALPHA_RE = /[a-z]/i;
+
+var IS_ALPHA_RE = /^[a-z]+$/i;
+var IS_DIGIT_RE = /^[0-9]+$/i;
+
+var ENCODING_RE = /charset=([\w-]+)\b/;
+
+function pageNumFromUrl(url) {
+  var matches = url.match(PAGE_IN_HREF_RE);
+  if (!matches) return null;
+
+  var pageNum = parseInt(matches[6], 10);
+
+  // Return pageNum < 100, otherwise
+  // return null
+  return pageNum < 100 ? pageNum : null;
+}
+
+function removeAnchor(url) {
+  return url.split('#')[0].replace(/\/$/, '');
+}
+
+function isGoodSegment(segment, index, firstSegmentHasLetters) {
+  var goodSegment = true;
+
+  // If this is purely a number, and it's the first or second
+  // url_segment, it's probably a page number. Remove it.
+  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
+    goodSegment = true;
+  }
+
+  // If this is the first url_segment and it's just "index",
+  // remove it
+  if (index === 0 && segment.toLowerCase() === 'index') {
+    goodSegment = false;
+  }
+
+  // If our first or second url_segment is smaller than 3 characters,
+  // and the first url_segment had no alphas, remove it.
+  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
+    goodSegment = false;
+  }
+
+  return goodSegment;
+}
+
+// Take a URL, and return the article base of said URL. That is, no
+// pagination data exists in it. Useful for comparing to other links
+// that might have pagination data within them.
+function articleBaseUrl(url, parsed) {
+  var parsedUrl = parsed || URL.parse(url);
+  var protocol = parsedUrl.protocol,
+      host = parsedUrl.host,
+      path = parsedUrl.path;
+
+
+  var firstSegmentHasLetters = false;
+  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
+    var segment = rawSegment;
+
+    // Split off and save anything that looks like a file type.
+    if (segment.includes('.')) {
+      var _segment$split = segment.split('.'),
+          _segment$split2 = _slicedToArray(_segment$split, 2),
+          possibleSegment = _segment$split2[0],
+          fileExt = _segment$split2[1];
+
+      if (IS_ALPHA_RE.test(fileExt)) {
+        segment = possibleSegment;
+      }
+    }
+
+    // If our first or second segment has anything looking like a page
+    // number, remove it.
+    if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
+      segment = segment.replace(PAGE_IN_HREF_RE, '');
+    }
+
+    // If we're on the first segment, check to see if we have any
+    // characters in it. The first segment is actually the last bit of
+    // the URL, and this will be helpful to determine if we're on a URL
+    // segment that looks like "/2/" for example.
+    if (index === 0) {
+      firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
+    }
+
+    // If it's not marked for deletion, push it to cleaned_segments.
+    if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
+      acc.push(segment);
+    }
+
+    return acc;
+  }, []);
+
+  return protocol + '//' + host + cleanedSegments.reverse().join('/');
+}
+
+// Given a string, return True if it appears to have an ending sentence
+// within it, false otherwise.
+var SENTENCE_END_RE = new RegExp('.( |$)');
+function hasSentenceEnd(text) {
+  return SENTENCE_END_RE.test(text);
+}
+
+function excerptContent(content) {
+              var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
+
+              return content.trim().split(/\s+/).slice(0, words).join(' ');
+}
+
+// check a string for encoding; this is
+// used in our fetchResource function to
+// ensure correctly encoded responses
+function getEncoding(str) {
+  if (ENCODING_RE.test(str)) {
+    return ENCODING_RE.exec(str)[1];
+  }
+
+  return null;
+}
+
 // Browser does not like us setting user agent
 var REQUEST_HEADERS = cheerio.browser ? {} : {
  'User-Agent': 'Mercury - https://mercury.postlight.com/web-parser/'
@@ -95,6 +257,12 @@ function get(options) {
      if (err) {
        reject(err);
      } else {
+        var encoding = getEncoding(response.headers['content-type']);
+
+        if (iconv.encodingExists(encoding)) {
+          body = iconv.decode(body, encoding);
+        }
+
        resolve({ body: body, response: response });
      }
    });
@@ -1047,154 +1215,6 @@ function scoreContent$$1($) {
  return $;
 }

-var NORMALIZE_RE = /\s{2,}/g;
-
-function normalizeSpaces(text) {
-  return text.replace(NORMALIZE_RE, ' ').trim();
-}
-
-// Given a node type to search for, and a list of regular expressions,
-// look to see if this extraction can be found in the URL. Expects
-// that each expression in r_list will return group(1) as the proper
-// string to be cleaned.
-// Only used for date_published currently.
-function extractFromUrl(url, regexList) {
-  var matchRe = regexList.find(function (re) {
-    return re.test(url);
-  });
-  if (matchRe) {
-    return matchRe.exec(url)[1];
-  }
-
-  return null;
-}
-
-// An expression that looks to try to find the page digit within a URL, if
-// it exists.
-// Matches:
-//  page=1
-//  pg=1
-//  p=1
-//  paging=12
-//  pag=7
-//  pagination/1
-//  paging/88
-//  pa/83
-//  p/11
-//
-// Does not match:
-//  pg=102
-//  page:2
-var PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i');
-
-var HAS_ALPHA_RE = /[a-z]/i;
-
-var IS_ALPHA_RE = /^[a-z]+$/i;
-var IS_DIGIT_RE = /^[0-9]+$/i;
-
-function pageNumFromUrl(url) {
-  var matches = url.match(PAGE_IN_HREF_RE);
-  if (!matches) return null;
-
-  var pageNum = parseInt(matches[6], 10);
-
-  // Return pageNum < 100, otherwise
-  // return null
-  return pageNum < 100 ? pageNum : null;
-}
-
-function removeAnchor(url) {
-  return url.split('#')[0].replace(/\/$/, '');
-}
-
-function isGoodSegment(segment, index, firstSegmentHasLetters) {
-  var goodSegment = true;
-
-  // If this is purely a number, and it's the first or second
-  // url_segment, it's probably a page number. Remove it.
-  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
-    goodSegment = true;
-  }
-
-  // If this is the first url_segment and it's just "index",
-  // remove it
-  if (index === 0 && segment.toLowerCase() === 'index') {
-    goodSegment = false;
-  }
-
-  // If our first or second url_segment is smaller than 3 characters,
-  // and the first url_segment had no alphas, remove it.
-  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
-    goodSegment = false;
-  }
-
-  return goodSegment;
-}
-
-// Take a URL, and return the article base of said URL. That is, no
-// pagination data exists in it. Useful for comparing to other links
-// that might have pagination data within them.
-function articleBaseUrl(url, parsed) {
-  var parsedUrl = parsed || URL.parse(url);
-  var protocol = parsedUrl.protocol,
-      host = parsedUrl.host,
-      path = parsedUrl.path;
-
-
-  var firstSegmentHasLetters = false;
-  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
-    var segment = rawSegment;
-
-    // Split off and save anything that looks like a file type.
-    if (segment.includes('.')) {
-      var _segment$split = segment.split('.'),
-          _segment$split2 = _slicedToArray(_segment$split, 2),
-          possibleSegment = _segment$split2[0],
-          fileExt = _segment$split2[1];
-
-      if (IS_ALPHA_RE.test(fileExt)) {
-        segment = possibleSegment;
-      }
-    }
-
-    // If our first or second segment has anything looking like a page
-    // number, remove it.
-    if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
-      segment = segment.replace(PAGE_IN_HREF_RE, '');
-    }
-
-    // If we're on the first segment, check to see if we have any
-    // characters in it. The first segment is actually the last bit of
-    // the URL, and this will be helpful to determine if we're on a URL
-    // segment that looks like "/2/" for example.
-    if (index === 0) {
-      firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
-    }
-
-    // If it's not marked for deletion, push it to cleaned_segments.
-    if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
-      acc.push(segment);
-    }
-
-    return acc;
-  }, []);
-
-  return protocol + '//' + host + cleanedSegments.reverse().join('/');
-}
-
-// Given a string, return True if it appears to have an ending sentence
-// within it, false otherwise.
-var SENTENCE_END_RE = new RegExp('.( |$)');
-function hasSentenceEnd(text) {
-  return SENTENCE_END_RE.test(text);
-}
-
-function excerptContent(content) {
-              var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
-
-              return content.trim().split(/\s+/).slice(0, words).join(' ');
-}
-
 // Now that we have a top_candidate, look through the siblings of
 // it to see if any of them are decently scored. If they are, they
 // may be split parts of the content (Like two divs, a preamble and
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js
--- a/fixtures/nock/fetch-resource-test.js
+++ b/fixtures/nock/fetch-resource-test.js
--- a/package.json
+++ b/package.json
@@ -13,7 +13,7 @@
    "build:generator": "rollup -c scripts/rollup.config.js",
    "test_build": "rollup -c",
    "test": "yarn test:node && yarn test:web",
-    "test:node": "jest",
+    "test:node": "jest ./src",
    "test:web": "./node_modules/karma/bin/karma start karma.conf.js",
    "test:build": "cd ./scripts && jest check-build.test.js",
    "test:build:web": "node ./scripts/proxy-browser-test.js",
@@ -82,6 +82,7 @@
    "cheerio": "^0.22.0",
    "difflib": "adampash/difflib.js",
    "ellipsize": "0.0.2",
+    "iconv-lite": "^0.4.15",
    "jquery": "^3.1.1",
    "moment": "^2.14.1",
    "request": "czardoz/request",
@@ -93,7 +94,8 @@
  },
  "browser": {
    "main": "./dist/mercury.web.js",
-    "cheerio": "./src/utils/cheerio-query",
-    "request": "browser-request"
+    "cheerio": "./src/shims/cheerio-query",
+    "request": "browser-request",
+    "iconv-lite": "./src/shims/iconv-lite"
  }
 }
--- a/src/resource/utils/fetch-resource.js
+++ b/src/resource/utils/fetch-resource.js
@@ -1,6 +1,8 @@
 import URL from 'url';
 import request from 'request';
+import iconv from 'iconv-lite';
 import { Errors } from 'utils';
+import { getEncoding } from 'utils/text';

 import {
  REQUEST_HEADERS,
@@ -15,6 +17,12 @@ function get(options) {
      if (err) {
        reject(err);
      } else {
+        const encoding = getEncoding(response.headers['content-type']);
+
+        if (iconv.encodingExists(encoding)) {
+          body = iconv.decode(body, encoding);
+        }
+
        resolve({ body, response });
      }
    });
--- a/src/resource/utils/fetch-resource.test.js
+++ b/src/resource/utils/fetch-resource.test.js
@@ -25,27 +25,38 @@ describe('fetchResource(url)', () => {
    const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object');
+    assert.equal(typeof body, 'string');
  });
+
  it('fetches domains', async () => {
    const url = 'http://theconcourse.deadspin.com/1786177057';
    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object');
+    assert.equal(typeof body, 'string');
  });

  it('fetches nyt', async () => {
    const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object');
+    assert.equal(typeof body, 'string');
  });

  it('handles this gzip error', async () => {
    const url = 'http://www.redcross.ca/blog/2016/11/photo-of-the-day--one-year-anniversary-of-the-end-of-ebola-in-sierra-leone';
    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object');
+    assert.equal(typeof body, 'string');
+  });
+
+  // this test addresses https://twitter.com/flikxxi/status/800074680342351872
+  it('handles different encoding', async () => {
+    const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
+    const { body } = await fetchResource(url);
+
+    const badEncodingRe = /<2F>/g;
+
+    assert.equal(badEncodingRe.test(body.toString()), false);
  });
 });

--- a/src/shims/cheerio-query.js
+++ b/src/shims/cheerio-query.js
--- a/src/shims/iconv-lite.js
+++ b/src/shims/iconv-lite.js
@@ -0,0 +1,8 @@
+// this is a shim for the browser build;
+// iconv-lite doubles build size, and we
+// don't need it for already rendered text
+const iconv = {
+  encodingExists: () => false,
+};
+
+export default iconv;
--- a/src/utils/text/constants.js
+++ b/src/utils/text/constants.js
@@ -20,3 +20,5 @@ export const HAS_ALPHA_RE = /[a-z]/i;

 export const IS_ALPHA_RE = /^[a-z]+$/i;
 export const IS_DIGIT_RE = /^[0-9]+$/i;
+
+export const ENCODING_RE = /charset=([\w-]+)\b/;
--- a/src/utils/text/get-encoding.js
+++ b/src/utils/text/get-encoding.js
@@ -0,0 +1,12 @@
+import { ENCODING_RE } from './constants';
+
+// check a string for encoding; this is
+// used in our fetchResource function to
+// ensure correctly encoded responses
+export default function getEncoding(str) {
+  if (ENCODING_RE.test(str)) {
+    return ENCODING_RE.exec(str)[1];
+  }
+
+  return null;
+}
--- a/src/utils/text/get-encoding.test.js
+++ b/src/utils/text/get-encoding.test.js
@@ -0,0 +1,15 @@
+import assert from 'assert';
+
+import getEncoding from './get-encoding';
+
+describe('getEncoding(str)', () => {
+  it('returns the encoding as a string', () => {
+    const contentType = 'text/html; charset=iso-8859-15';
+    assert.equal(getEncoding(contentType), 'iso-8859-15');
+  });
+
+  it('returns null if no encoding found', () => {
+    const contentType = 'text/html';
+    assert.equal(getEncoding(contentType), null);
+  });
+});
--- a/src/utils/text/index.js
+++ b/src/utils/text/index.js
@@ -5,3 +5,4 @@ export { default as removeAnchor } from './remove-anchor';
 export { default as articleBaseUrl } from './article-base-url';
 export { default as hasSentenceEnd } from './has-sentence-end';
 export { default as excerptContent } from './excerpt-content';
+export { default as getEncoding } from './get-encoding';
--- a/yarn.lock
+++ b/yarn.lock
@@ -2539,7 +2539,7 @@ https-browserify@~0.0.0:
  version "0.0.1"
  resolved "https://registry.yarnpkg.com/https-browserify/-/https-browserify-0.0.1.tgz#3f91365cabe60b77ed0ebba24b454e3e09d95a82"

-iconv-lite@^0.4.13:
+iconv-lite, iconv-lite@^0.4.13:
  version "0.4.15"
  resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.15.tgz#fe265a218ac6a57cfe854927e9d04c19825eddeb"