feat: some basic error handling for bad urls

pull/5/head
Adam Pash 8 years ago
parent 9f0c075de4
commit bf13b38a9b

162
dist/iris.js vendored

@ -2,9 +2,9 @@
function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }
var URL = _interopDefault(require('url'));
var babelPolyfill = require('babel-polyfill');
var cheerio = _interopDefault(require('cheerio'));
var URL = _interopDefault(require('url'));
var request = _interopDefault(require('request'));
var stringDirection = _interopDefault(require('string-direction'));
var validUrl = _interopDefault(require('valid-url'));
@ -13,6 +13,50 @@ var wuzzy = _interopDefault(require('wuzzy'));
var difflib = _interopDefault(require('difflib'));
var ellipsize = _interopDefault(require('ellipsize'));
// regenerator bookkeeping: pre-mark `range` as a generator so it can be
// wrapped below via `_marked[0]`.
var _marked = [range].map(regeneratorRuntime.mark);

// Generator compiled by Babel's regenerator transform into an explicit
// state machine. While `start <= end` it yields the value of `start += 1`,
// i.e. the increment happens BEFORE the yield, so the values produced are
// start+1, start+2, ..., end+1.
// NOTE(review): if the intent was to yield start..end, the increment should
// follow the yield — confirm against the original ES6 source.
function range() {
  // Manual default-parameter handling (pre-ES2015 output): both arguments
  // default to 1 when omitted or undefined.
  var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
  var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
  return regeneratorRuntime.wrap(function range$(_context) {
    while (1) {
      switch (_context.prev = _context.next) {
        case 0:
          // Loop guard: exit to case 5 (stop) once start exceeds end.
          if (!(start <= end)) {
            _context.next = 5;
            break;
          }
          _context.next = 3;
          // Yield point: evaluates to the already-incremented start.
          return start += 1;
        case 3:
          // Jump back to the loop guard.
          _context.next = 0;
          break;
        case 5:
        case "end":
          return _context.stop();
      }
    }
  }, _marked[0], this);
}
// extremely simple url validation as a first step
function validateUrl(_ref) {
  // A parsed URL without a hostname is not something we can fetch.
  return Boolean(_ref.hostname);
}
// Shared error payloads that are RETURNED (not thrown) to callers, so they
// can branch on `result.error` and compare against these objects by identity.
var Errors = {
  badUrl: {
    error: true,
    // NOTE(review): key is `messages` (plural) but holds a single string —
    // likely meant `message`; confirm consumers before renaming.
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
  }
};
// Headers sent with every outbound fetch; identifies this client to origin
// servers via a Readability User-Agent string.
var REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
};
@ -185,14 +229,15 @@ function validateResponse(response) {
// unicode content for HTML, with charset conversion.
var fetchResource = (function () {
var _ref2 = asyncToGenerator(regeneratorRuntime.mark(function _callee(url) {
var parsedUrl, options, _ref3, response, body;
var _ref2 = asyncToGenerator(regeneratorRuntime.mark(function _callee(url, parsedUrl) {
var options, _ref3, response, body;
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
parsedUrl = URL.parse(encodeURI(url));
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
options = {
url: parsedUrl,
headers: _extends({}, REQUEST_HEADERS),
@ -222,7 +267,7 @@ var fetchResource = (function () {
case 12:
_context.prev = 12;
_context.t0 = _context['catch'](7);
return _context.abrupt('return', _context.t0);
return _context.abrupt('return', Errors.badUrl);
case 15:
case 'end':
@ -232,7 +277,7 @@ var fetchResource = (function () {
}, _callee, this, [[7, 12]]);
}));
function fetchResource(_x2) {
function fetchResource(_x2, _x3) {
return _ref2.apply(this, arguments);
}
@ -313,7 +358,7 @@ var Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
create: function create(url, preparedResponse) {
create: function create(url, preparedResponse, parsedUrl) {
var _this = this;
return asyncToGenerator(regeneratorRuntime.mark(function _callee() {
@ -345,15 +390,23 @@ var Resource = {
case 6:
_context.next = 8;
return fetchResource(url);
return fetchResource(url, parsedUrl);
case 8:
result = _context.sent;
case 9:
if (!result.error) {
_context.next = 11;
break;
}
return _context.abrupt('return', result);
case 11:
return _context.abrupt('return', _this.generateDoc(result));
case 10:
case 12:
case 'end':
return _context.stop();
}
@ -911,7 +964,7 @@ var TwitterExtractor = {
},
date_published: {
selectors: ['.tweet.permalink-tweet .metadata']
selectors: ['.permalink-tweet ._timestamp[data-time-ms]']
}
};
@ -2256,6 +2309,8 @@ var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// CLEAN DEK CONSTANTS
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i');

// CLEAN DATE PUBLISHED CONSTANTS
// 13-digit (millisecond) and 10-digit (second) unix timestamps.
// The patterns match digits only, so the case-insensitive flag the
// originals carried was a no-op and has been dropped.
var MS_DATE_STRING = /^\d{13}$/;
var SEC_DATE_STRING = /^\d{10}$/;
// Strips a leading "published:" label before date parsing.
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Normalization helpers for "4 pm" / "4 p.m." style times.
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
@ -2315,6 +2370,11 @@ function cleanDateString(dateString) {
// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
// If string is in milliseconds or seconds, convert to int
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
dateString = parseInt(dateString, 10);
}
var date = moment(new Date(dateString));
if (!date.isValid()) {
@ -3367,35 +3427,6 @@ function scoreExtraneousLinks(href) {
return 0;
}
// Duplicate copy of the `range` generator (this diff removes it in favor of
// the hoisted copy near the top of the bundle).
var _marked = [range].map(regeneratorRuntime.mark);

// Babel/regenerator-compiled generator. Yields `start += 1` on each pass
// while `start <= end`, so values are start+1 ... end+1 (increment happens
// before the yield).
function range() {
  // Pre-ES2015 default parameters: both default to 1.
  var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
  var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
  return regeneratorRuntime.wrap(function range$(_context) {
    while (1) {
      switch (_context.prev = _context.next) {
        case 0:
          // When start exceeds end, jump to case 5 and stop the generator.
          if (!(start <= end)) {
            _context.next = 5;
            break;
          }
          _context.next = 3;
          // Yield point: the incremented start.
          return start += 1;
        case 3:
          // Back to the loop guard.
          _context.next = 0;
          break;
        case 5:
        case "end":
          return _context.stop();
      }
    }
  }, _marked[0], this);
}
// Builds a "class id" signature string for a link element; missing
// attributes contribute an empty string, so the separator space is
// always present.
function makeSig$1($link) {
  var classAttr = $link.attr('class') || '';
  var idAttr = $link.attr('id') || '';
  return classAttr + ' ' + idAttr;
}
@ -3788,9 +3819,10 @@ var GenericExtractor = {
}
};
function getExtractor(url) {
var parsedUrl = URL.parse(url);
var hostname = parsedUrl.hostname;
function getExtractor(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(url);
var _parsedUrl = parsedUrl;
var hostname = _parsedUrl.hostname;
var baseDomain = hostname.split('.').slice(-2).join('.');
@ -4060,7 +4092,7 @@ var Iris = {
var opts = arguments.length <= 2 || arguments[2] === undefined ? {} : arguments[2];
return asyncToGenerator(regeneratorRuntime.mark(function _callee() {
var _ref, _ref$fetchAllPages, fetchAllPages, Extractor, $, metaCache, result, _result, title, next_page_url;
var _ref, _ref$fetchAllPages, fetchAllPages, parsedUrl, Extractor, $, metaCache, result, _result, title, next_page_url;
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
@ -4069,16 +4101,34 @@ var Iris = {
_ref = opts || true;
_ref$fetchAllPages = _ref.fetchAllPages;
fetchAllPages = _ref$fetchAllPages === undefined ? true : _ref$fetchAllPages;
Extractor = getExtractor(url);
parsedUrl = URL.parse(url);
console.log('Using extractor for ' + Extractor.domain);
if (validateUrl(parsedUrl)) {
_context.next = 6;
break;
}
_context.next = 7;
return Resource.create(url, html);
return _context.abrupt('return', Errors.badUrl);
case 7:
case 6:
Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);
_context.next = 9;
return Resource.create(url, html, parsedUrl);
case 9:
$ = _context.sent;
if (!$.error) {
_context.next = 12;
break;
}
return _context.abrupt('return', $);
case 12:
html = $.html();
// Cached value of every meta name in our document.
@ -4086,7 +4136,7 @@ var Iris = {
metaCache = $('meta').map(function (_, node) {
return $(node).attr('name');
}).toArray();
result = RootExtractor.extract(Extractor, { url: url, html: html, $: $, metaCache: metaCache });
result = RootExtractor.extract(Extractor, { url: url, html: html, $: $, metaCache: metaCache, parsedUrl: parsedUrl });
_result = result;
title = _result.title;
next_page_url = _result.next_page_url;
@ -4094,11 +4144,11 @@ var Iris = {
// Fetch more pages if next_page_url found
if (!(fetchAllPages && next_page_url)) {
_context.next = 20;
_context.next = 24;
break;
}
_context.next = 17;
_context.next = 21;
return collectAllPages({
Extractor: Extractor,
next_page_url: next_page_url,
@ -4110,21 +4160,21 @@ var Iris = {
url: url
});
case 17:
case 21:
result = _context.sent;
_context.next = 21;
_context.next = 25;
break;
case 20:
case 24:
result = _extends({}, result, {
total_pages: 1,
rendered_pages: 1
});
case 21:
case 25:
return _context.abrupt('return', result);
case 22:
case 26:
case 'end':
return _context.stop();
}

2
dist/iris.js.map vendored

File diff suppressed because one or more lines are too long

@ -26,7 +26,7 @@ export function cleanDateString(dateString) {
export default function cleanDatePublished(dateString) {
// If string is in milliseconds or seconds, convert to int
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
dateString = parseInt(dateString, 10)
dateString = parseInt(dateString, 10);
}
let date = moment(new Date(dateString));

@ -3,8 +3,8 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
export default function getExtractor(url) {
const parsedUrl = URL.parse(url);
export default function getExtractor(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(url);
const { hostname } = parsedUrl;
const baseDomain = hostname.split('.').slice(-2).join('.');

@ -1,4 +1,10 @@
import URL from 'url';
import Resource from 'resource';
import {
validateUrl,
Errors,
} from 'utils';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
@ -6,17 +12,30 @@ import collectAllPages from 'extractors/collect-all-pages';
const Iris = {
async parse(url, html, opts = {}) {
const { fetchAllPages = true } = opts || true;
const Extractor = getExtractor(url);
const parsedUrl = URL.parse(url);
if (!validateUrl(parsedUrl)) {
return Errors.badUrl;
}
const Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);
const $ = await Resource.create(url, html);
const $ = await Resource.create(url, html, parsedUrl);
// If we found an error creating the resource, return that error
if ($.error) {
return $;
}
html = $.html();
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache });
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache, parsedUrl });
const { title, next_page_url } = result;
// Fetch more pages if next_page_url found

@ -1,10 +1,23 @@
import assert from 'assert';
import { Errors } from 'utils';
import Iris from './iris';
describe('Iris', () => {
describe('parse(url)', function test() {
this.timeout(1000000);
it('returns an error if a malformed url is passed', async function() {
const error = await Iris.parse('foo.com');
assert.equal(error, Errors.badUrl);
});
it('returns an error if a bad url is passed', async function() {
const error = await Iris.parse('foo.com');
assert.equal(error, Errors.badUrl);
});
it('does the whole thing', async function() {
const result = await Iris.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220');

@ -17,7 +17,7 @@ const Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
async create(url, preparedResponse) {
async create(url, preparedResponse, parsedUrl) {
let result;
if (preparedResponse) {
@ -32,8 +32,13 @@ const Resource = {
result = { body: preparedResponse, response: validResponse };
} else {
result = await fetchResource(url);
result = await fetchResource(url, parsedUrl);
}
if (result.error) {
return result;
}
return this.generateDoc(result);
},

@ -1,4 +1,5 @@
import assert from 'assert';
import { Errors } from 'utils';
import Resource from './index';
@ -11,6 +12,13 @@ describe('Resource', () => {
assert.equal(typeof $, 'function');
});
it('returns an error message if the url is malformed', (async) () => {
const url = 'http://nytimes.com/500';
const error = await Resource.create(url);
assert.equal(error, Errors.badUrl);
});
});
describe('generateDoc({ body, response })', () => {

@ -2,6 +2,7 @@ import 'babel-polyfill';
import URL from 'url';
import request from 'request';
import { Errors } from 'utils';
import {
REQUEST_HEADERS,
@ -75,8 +76,8 @@ export function baseDomain({ host }) {
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
export default async function fetchResource(url) {
const parsedUrl = URL.parse(encodeURI(url));
export default async function fetchResource(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
const options = {
url: parsedUrl,
@ -99,6 +100,6 @@ export default async function fetchResource(url) {
validateResponse(response);
return { body, response };
} catch (e) {
return e;
return Errors.badUrl;
}
}

@ -10,6 +10,19 @@ import { MAX_CONTENT_LENGTH } from './constants';
describe('fetchResource(url)', function test() {
this.timeout(1000000);
it('returns appropriate json for bad url', (async) () => {
const url = 'http://www.nytimes.com/500';
const { error } = await fetchResource(url);
assert.equal(error, true);
});
it('fetches nyt', (async) () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
});
it('fetches domains', (async) () => {
const url = 'http://theconcourse.deadspin.com/1786177057';
const { body } = await fetchResource(url);

@ -0,0 +1,8 @@
// Canonical error payloads, returned (not thrown) so callers can check
// `result.error` and compare against these objects by identity
// (e.g. `assert.equal(error, Errors.badUrl)` in the tests).
const Errors = {
  badUrl: {
    error: true,
    // NOTE(review): key is `messages` (plural) but holds one string —
    // probably intended `message`; verify consumers before renaming.
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.',
  },
};
export default Errors;

@ -1 +1,3 @@
export { default as range } from './range';
export { default as validateUrl } from './validate-url';
export { default as Errors } from './errors';

@ -0,0 +1,6 @@
// extremely simple url validation as a first step
export default function validateUrl({ hostname }) {
  // No hostname on the parsed URL means there is nothing to fetch.
  return Boolean(hostname);
}

@ -0,0 +1,20 @@
import assert from 'assert';
import URL from 'url';
import validateUrl from './validate-url';
// Unit tests for validateUrl. Node's legacy url.parse treats a bare
// 'example.com' (no scheme) as a path, leaving hostname null — so it is
// rejected; adding an explicit scheme produces a hostname and passes.
describe('validateUrl(parsedUrl)', () => {
  it('returns false if url is not valid', () => {
    // Scheme-less input: url.parse yields no hostname.
    const url = URL.parse('example.com');
    const valid = validateUrl(url);
    assert.equal(valid, false);
  });
  it('returns true if url is valid', () => {
    const url = URL.parse('http://example.com');
    const valid = validateUrl(url);
    assert.equal(valid, true);
  });
});
Loading…
Cancel
Save