You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/resource/utils/fetch-resource.js

115 lines
3.2 KiB
JavaScript

import URL from 'url';
import request from 'request';
import { Errors } from 'utils';
import {
REQUEST_HEADERS,
FETCH_TIMEOUT,
BAD_CONTENT_TYPES_RE,
MAX_CONTENT_LENGTH,
} from './constants';
// Promise adapter around the callback-style `request` client.
// Resolves with `{ body, response }` when the callback reports no error;
// rejects with the error the callback was given otherwise.
function get(options) {
  const executor = (fulfil, fail) => {
    const onComplete = (err, response, body) => {
      if (err) {
        fail(err);
        return;
      }
      fulfil({ body, response });
    };

    request(options, onComplete);
  };

  return new Promise(executor);
}
// Evaluate a response to ensure it's something we should be keeping.
// "Valid" here means only that we have found no reason to bail out of
// further processing of this url.
//
// response    - response object; this function reads `statusCode`,
//               `error` and `headers` from it.
// parseNon2xx - when true, a response whose status code is outside the
//               2xx range is still accepted (default: false).
//
// Returns true when every check passes.
// Throws an Error when there is no status code, when the status is not
// 2xx and parseNon2xx is false, when the content-type matches
// BAD_CONTENT_TYPES_RE, or when content-length exceeds MAX_CONTENT_LENGTH.
export function validateResponse(response, parseNon2xx = false) {
  const { statusCode } = response;

  // No status code at all: there is no usable response to inspect.
  if (!statusCode) {
    throw new Error(
      `Unable to fetch content. Original exception was ${response.error}`
    );
  }

  // Judge success by the numeric status code alone. The reason phrase
  // (statusMessage) is free-form text that may be absent (e.g. nock in
  // tests) or differ from 'OK' on a perfectly good response, so it is
  // deliberately not consulted. Any 2xx code counts as success, matching
  // the "non-2xx" wording of the parameter and the error message below.
  const isSuccess = statusCode >= 200 && statusCode < 300;
  if (!isSuccess && !parseNon2xx) {
    throw new Error(
      `Resource returned a response status code of ${statusCode} and resource was instructed to reject non-2xx level status codes.`
    );
  }

  const {
    'content-type': contentType,
    'content-length': contentLength,
  } = response.headers;

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error(
      `Content-type for this resource was ${contentType} and is not allowed.`
    );
  }

  // Check that the content length is below maximum. The header value is
  // compared with `>` as-is; a missing header never trips this check.
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error(
      `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`
    );
  }

  return true;
}
// Returns the last two dot-separated labels of `host`, re-joined with a
// dot: 'blog.example.com' -> 'example.com'. A host with two labels or
// fewer comes back unchanged. No public-suffix handling is attempted, so
// 'www.bbc.co.uk' yields 'co.uk'.
export function baseDomain({ host }) {
  const labels = host.split('.');
  const firstKept = Math.max(labels.length - 2, 0);
  const trailing = labels.slice(firstKept);

  return trailing.join('.');
}
// Fetch a URL and hand back the raw response together with its body.
//
// url       - address to fetch; only used (after encodeURI) when
//             `parsedUrl` is not supplied.
// parsedUrl - optional pre-parsed URL object; its `href` is what gets
//             requested.
//
// Resolves with `{ body, response }` when validateResponse accepts the
// response, or with `Errors.badUrl` when validateResponse throws. A
// rejection from `get` itself is not caught here and propagates to the
// caller.
//
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
export default async function fetchResource(url, parsedUrl) {
  const target = parsedUrl || URL.parse(encodeURI(url));

  const { response, body } = await get({
    url: target.href,
    // Copy so the shared header constant is never handed out directly.
    headers: { ...REQUEST_HEADERS },
    timeout: FETCH_TIMEOUT,
    // Leave encoding unset; fixes issues with gzipped responses.
    encoding: null,
    // Accept cookies.
    jar: true,
    // Accept and decode gzip.
    gzip: true,
    // Follow any redirect.
    followAllRedirects: true,
  });

  try {
    validateResponse(response);
  } catch (validationError) {
    // Any validation failure is reported as the generic bad-URL result.
    return Errors.badUrl;
  }

  return { body, response };
}