feat: some basic error handling for bad urls

pull/5/head
Adam Pash 8 years ago
parent 9f0c075de4
commit bf13b38a9b

162
dist/iris.js vendored

@ -2,9 +2,9 @@
function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }
var URL = _interopDefault(require('url'));
var babelPolyfill = require('babel-polyfill');
var cheerio = _interopDefault(require('cheerio'));
var URL = _interopDefault(require('url'));
var request = _interopDefault(require('request'));
var stringDirection = _interopDefault(require('string-direction'));
var validUrl = _interopDefault(require('valid-url'));
@ -13,6 +13,50 @@ var wuzzy = _interopDefault(require('wuzzy'));
var difflib = _interopDefault(require('difflib'));
var ellipsize = _interopDefault(require('ellipsize'));
// regenerator bookkeeping: pre-mark `range` as a generator so it can be
// wrapped below via `_marked[0]`.
var _marked = [range].map(regeneratorRuntime.mark);

// Generator compiled by Babel's regenerator transform into an explicit
// state machine. While `start <= end` it yields the value of `start += 1`,
// i.e. the increment happens BEFORE the yield, so the values produced are
// start+1, start+2, ..., end+1.
// NOTE(review): if the intent was to yield start..end, the increment should
// follow the yield — confirm against the original ES6 source.
function range() {
  // Manual default-parameter handling (pre-ES2015 output): both arguments
  // default to 1 when omitted or undefined.
  var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
  var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
  return regeneratorRuntime.wrap(function range$(_context) {
    while (1) {
      switch (_context.prev = _context.next) {
        case 0:
          // Loop guard: exit to case 5 (stop) once start exceeds end.
          if (!(start <= end)) {
            _context.next = 5;
            break;
          }
          _context.next = 3;
          // Yield point: evaluates to the already-incremented start.
          return start += 1;
        case 3:
          // Jump back to the loop guard.
          _context.next = 0;
          break;
        case 5:
        case "end":
          return _context.stop();
      }
    }
  }, _marked[0], this);
}
// extremely simple url validation as a first step
function validateUrl(_ref) {
  // A parsed URL without a hostname is not something we can fetch.
  return Boolean(_ref.hostname);
}
// Shared error payloads that are RETURNED (not thrown) to callers, so they
// can branch on `result.error` and compare against these objects by identity.
var Errors = {
  badUrl: {
    error: true,
    // NOTE(review): key is `messages` (plural) but holds a single string —
    // likely meant `message`; confirm consumers before renaming.
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
  }
};
// Headers sent with every outbound fetch; identifies this client to origin
// servers via a Readability User-Agent string.
var REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
};
@ -185,14 +229,15 @@ function validateResponse(response) {
// unicode content for HTML, with charset conversion.
var fetchResource = (function () {
var _ref2 = asyncToGenerator(regeneratorRuntime.mark(function _callee(url) {
var parsedUrl, options, _ref3, response, body;
var _ref2 = asyncToGenerator(regeneratorRuntime.mark(function _callee(url, parsedUrl) {
var options, _ref3, response, body;
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
parsedUrl = URL.parse(encodeURI(url));
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
options = {
url: parsedUrl,
headers: _extends({}, REQUEST_HEADERS),
@ -222,7 +267,7 @@ var fetchResource = (function () {
case 12:
_context.prev = 12;
_context.t0 = _context['catch'](7);
return _context.abrupt('return', _context.t0);
return _context.abrupt('return', Errors.badUrl);
case 15:
case 'end':
@ -232,7 +277,7 @@ var fetchResource = (function () {
}, _callee, this, [[7, 12]]);
}));
function fetchResource(_x2) {
function fetchResource(_x2, _x3) {
return _ref2.apply(this, arguments);
}
@ -313,7 +358,7 @@ var Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
create: function create(url, preparedResponse) {
create: function create(url, preparedResponse, parsedUrl) {
var _this = this;
return asyncToGenerator(regeneratorRuntime.mark(function _callee() {
@ -345,15 +390,23 @@ var Resource = {
case 6:
_context.next = 8;
return fetchResource(url);
return fetchResource(url, parsedUrl);
case 8:
result = _context.sent;
case 9:
if (!result.error) {
_context.next = 11;
break;
}
return _context.abrupt('return', result);
case 11:
return _context.abrupt('return', _this.generateDoc(result));
case 10:
case 12:
case 'end':
return _context.stop();
}
@ -911,7 +964,7 @@ var TwitterExtractor = {
},
date_published: {
selectors: ['.tweet.permalink-tweet .metadata']
selectors: ['.permalink-tweet ._timestamp[data-time-ms]']
}
};
@ -2256,6 +2309,8 @@ var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// CLEAN DEK CONSTANTS
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i');

// CLEAN DATE PUBLISHED CONSTANTS
// 13-digit (millisecond) and 10-digit (second) unix timestamps.
// The patterns match digits only, so the case-insensitive flag the
// originals carried was a no-op and has been dropped.
var MS_DATE_STRING = /^\d{13}$/;
var SEC_DATE_STRING = /^\d{10}$/;
// Strips a leading "published:" label before date parsing.
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Normalization helpers for "4 pm" / "4 p.m." style times.
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
@ -2315,6 +2370,11 @@ function cleanDateString(dateString) {
// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
// If string is in milliseconds or seconds, convert to int
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
dateString = parseInt(dateString, 10);
}
var date = moment(new Date(dateString));
if (!date.isValid()) {
@ -3367,35 +3427,6 @@ function scoreExtraneousLinks(href) {
return 0;
}
// Duplicate copy of the `range` generator (this diff removes it in favor of
// the hoisted copy near the top of the bundle).
var _marked = [range].map(regeneratorRuntime.mark);

// Babel/regenerator-compiled generator. Yields `start += 1` on each pass
// while `start <= end`, so values are start+1 ... end+1 (increment happens
// before the yield).
function range() {
  // Pre-ES2015 default parameters: both default to 1.
  var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
  var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
  return regeneratorRuntime.wrap(function range$(_context) {
    while (1) {
      switch (_context.prev = _context.next) {
        case 0:
          // When start exceeds end, jump to case 5 and stop the generator.
          if (!(start <= end)) {
            _context.next = 5;
            break;
          }
          _context.next = 3;
          // Yield point: the incremented start.
          return start += 1;
        case 3:
          // Back to the loop guard.
          _context.next = 0;
          break;
        case 5:
        case "end":
          return _context.stop();
      }
    }
  }, _marked[0], this);
}
// Builds a "class id" signature string for a link element; missing
// attributes contribute an empty string, so the separator space is
// always present.
function makeSig$1($link) {
  var classAttr = $link.attr('class') || '';
  var idAttr = $link.attr('id') || '';
  return classAttr + ' ' + idAttr;
}
@ -3788,9 +3819,10 @@ var GenericExtractor = {
}
};
function getExtractor(url) {
var parsedUrl = URL.parse(url);
var hostname = parsedUrl.hostname;
function getExtractor(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(url);
var _parsedUrl = parsedUrl;
var hostname = _parsedUrl.hostname;
var baseDomain = hostname.split('.').slice(-2).join('.');
@ -4060,7 +4092,7 @@ var Iris = {
var opts = arguments.length <= 2 || arguments[2] === undefined ? {} : arguments[2];
return asyncToGenerator(regeneratorRuntime.mark(function _callee() {
var _ref, _ref$fetchAllPages, fetchAllPages, Extractor, $, metaCache, result, _result, title, next_page_url;
var _ref, _ref$fetchAllPages, fetchAllPages, parsedUrl, Extractor, $, metaCache, result, _result, title, next_page_url;
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
@ -4069,16 +4101,34 @@ var Iris = {
_ref = opts || true;
_ref$fetchAllPages = _ref.fetchAllPages;
fetchAllPages = _ref$fetchAllPages === undefined ? true : _ref$fetchAllPages;
Extractor = getExtractor(url);
parsedUrl = URL.parse(url);
console.log('Using extractor for ' + Extractor.domain);
if (validateUrl(parsedUrl)) {
_context.next = 6;
break;
}
_context.next = 7;
return Resource.create(url, html);
return _context.abrupt('return', Errors.badUrl);
case 7:
case 6:
Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);
_context.next = 9;
return Resource.create(url, html, parsedUrl);
case 9:
$ = _context.sent;
if (!$.error) {
_context.next = 12;
break;
}
return _context.abrupt('return', $);
case 12:
html = $.html();
// Cached value of every meta name in our document.
@ -4086,7 +4136,7 @@ var Iris = {
metaCache = $('meta').map(function (_, node) {
return $(node).attr('name');
}).toArray();
result = RootExtractor.extract(Extractor, { url: url, html: html, $: $, metaCache: metaCache });
result = RootExtractor.extract(Extractor, { url: url, html: html, $: $, metaCache: metaCache, parsedUrl: parsedUrl });
_result = result;
title = _result.title;
next_page_url = _result.next_page_url;
@ -4094,11 +4144,11 @@ var Iris = {
// Fetch more pages if next_page_url found
if (!(fetchAllPages && next_page_url)) {
_context.next = 20;
_context.next = 24;
break;
}
_context.next = 17;
_context.next = 21;
return collectAllPages({
Extractor: Extractor,
next_page_url: next_page_url,
@ -4110,21 +4160,21 @@ var Iris = {
url: url
});
case 17:
case 21:
result = _context.sent;
_context.next = 21;
_context.next = 25;
break;
case 20:
case 24:
result = _extends({}, result, {
total_pages: 1,
rendered_pages: 1
});
case 21:
case 25:
return _context.abrupt('return', result);
case 22:
case 26:
case 'end':
return _context.stop();
}

2
dist/iris.js.map vendored

File diff suppressed because one or more lines are too long

@ -26,7 +26,7 @@ export function cleanDateString(dateString) {
export default function cleanDatePublished(dateString) {
// If string is in milliseconds or seconds, convert to int
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
dateString = parseInt(dateString, 10)
dateString = parseInt(dateString, 10);
}
let date = moment(new Date(dateString));

@ -3,8 +3,8 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
export default function getExtractor(url) {
const parsedUrl = URL.parse(url);
export default function getExtractor(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(url);
const { hostname } = parsedUrl;
const baseDomain = hostname.split('.').slice(-2).join('.');

@ -1,4 +1,10 @@
import URL from 'url';
import Resource from 'resource';
import {
validateUrl,
Errors,
} from 'utils';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
@ -6,17 +12,30 @@ import collectAllPages from 'extractors/collect-all-pages';
const Iris = {
async parse(url, html, opts = {}) {
const { fetchAllPages = true } = opts || true;
const Extractor = getExtractor(url);
const parsedUrl = URL.parse(url);
if (!validateUrl(parsedUrl)) {
return Errors.badUrl;
}
const Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);
const $ = await Resource.create(url, html);
const $ = await Resource.create(url, html, parsedUrl);
// If we found an error creating the resource, return that error
if ($.error) {
return $;
}
html = $.html();
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache });
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache, parsedUrl });
const { title, next_page_url } = result;
// Fetch more pages if next_page_url found

@ -1,10 +1,23 @@
import assert from 'assert';
import { Errors } from 'utils';
import Iris from './iris';
describe('Iris', () => {
describe('parse(url)', function test() {
this.timeout(1000000);
it('returns an error if a malformed url is passed', async function() {
const error = await Iris.parse('foo.com');
assert.equal(error, Errors.badUrl);
});
it('returns an error if a bad url is passed', async function() {
const error = await Iris.parse('foo.com');
assert.equal(error, Errors.badUrl);
});
it('does the whole thing', async function() {
const result = await Iris.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220');

@ -17,7 +17,7 @@ const Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
async create(url, preparedResponse) {
async create(url, preparedResponse, parsedUrl) {
let result;
if (preparedResponse) {
@ -32,8 +32,13 @@ const Resource = {
result = { body: preparedResponse, response: validResponse };
} else {
result = await fetchResource(url);
result = await fetchResource(url, parsedUrl);
}
if (result.error) {
return result;
}
return this.generateDoc(result);
},

@ -1,4 +1,5 @@
import assert from 'assert';
import { Errors } from 'utils';
import Resource from './index';
@ -11,6 +12,13 @@ describe('Resource', () => {
assert.equal(typeof $, 'function');
});
it('returns an error message if the url is malformed', (async) () => {
const url = 'http://nytimes.com/500';
const error = await Resource.create(url);
assert.equal(error, Errors.badUrl);
});
});
describe('generateDoc({ body, response })', () => {

@ -2,6 +2,7 @@ import 'babel-polyfill';
import URL from 'url';
import request from 'request';
import { Errors } from 'utils';
import {
REQUEST_HEADERS,
@ -75,8 +76,8 @@ export function baseDomain({ host }) {
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
export default async function fetchResource(url) {
const parsedUrl = URL.parse(encodeURI(url));
export default async function fetchResource(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
const options = {
url: parsedUrl,
@ -99,6 +100,6 @@ export default async function fetchResource(url) {
validateResponse(response);
return { body, response };
} catch (e) {
return e;
return Errors.badUrl;
}
}

@ -10,6 +10,19 @@ import { MAX_CONTENT_LENGTH } from './constants';
describe('fetchResource(url)', function test() {
this.timeout(1000000);
it('returns appropriate json for bad url', (async) () => {
const url = 'http://www.nytimes.com/500';
const { error } = await fetchResource(url);
assert.equal(error, true);
});
it('fetches nyt', (async) () => {
const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
const { body } = await fetchResource(url);
assert.equal(typeof body, 'object');
});
it('fetches domains', (async) () => {
const url = 'http://theconcourse.deadspin.com/1786177057';
const { body } = await fetchResource(url);

@ -0,0 +1,8 @@
// Canonical error payloads, returned (not thrown) so callers can check
// `result.error` and compare against these objects by identity
// (e.g. `assert.equal(error, Errors.badUrl)` in the tests).
const Errors = {
  badUrl: {
    error: true,
    // NOTE(review): key is `messages` (plural) but holds one string —
    // probably intended `message`; verify consumers before renaming.
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.',
  },
};
export default Errors;

@ -1 +1,3 @@
export { default as range } from './range';
export { default as validateUrl } from './validate-url';
export { default as Errors } from './errors';

@ -0,0 +1,6 @@
// extremely simple url validation as a first step
export default function validateUrl({ hostname }) {
  // No hostname on the parsed URL means there is nothing to fetch.
  return Boolean(hostname);
}

@ -0,0 +1,20 @@
import assert from 'assert';
import URL from 'url';
import validateUrl from './validate-url';
// Unit tests for validateUrl. Node's legacy url.parse treats a bare
// 'example.com' (no scheme) as a path, leaving hostname null — so it is
// rejected; adding an explicit scheme produces a hostname and passes.
describe('validateUrl(parsedUrl)', () => {
  it('returns false if url is not valid', () => {
    // Scheme-less input: url.parse yields no hostname.
    const url = URL.parse('example.com');
    const valid = validateUrl(url);
    assert.equal(valid, false);
  });
  it('returns true if url is valid', () => {
    const url = URL.parse('http://example.com');
    const valid = validateUrl(url);
    assert.equal(valid, true);
  });
});
Loading…
Cancel
Save