You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/resource/index.js

96 lines
2.8 KiB
JavaScript

import cheerio from 'cheerio';
import iconv from 'iconv-lite';
import { getEncoding } from 'utils/text';
import { fetchResource } from './utils';
import { normalizeMetaTags, convertLazyLoadedImages, clean } from './utils/dom';
const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param preparedResponse: If set (truthy), use as the document body
  //                          rather than attempting to fetch it
  //                          ourselves. Expects a string.
  // :param parsedUrl: Passed through untouched to fetchResource
  //                   (presumably the pre-parsed form of `url` --
  //                   confirm against fetchResource).
  // :param headers: Custom headers to be included in the request.
  // :returns: A promise for the document produced by generateDoc, or,
  //           when the result carries an `error`, that same result
  //           object with `failed` set to true.
  async create(url, preparedResponse, parsedUrl, headers = {}) {
    let result;

    if (preparedResponse) {
      // Wrap the caller-supplied markup in a synthetic successful
      // response so it flows through the same path as a fetched one.
      const validResponse = {
        statusMessage: 'OK',
        statusCode: 200,
        headers: {
          'content-type': 'text/html',
          'content-length': 500,
        },
      };

      result = {
        body: preparedResponse,
        response: validResponse,
        // Skips the iconv decoding step in encodeDoc.
        alreadyDecoded: true,
      };
    } else {
      result = await fetchResource(url, parsedUrl, headers);
    }

    if (result.error) {
      result.failed = true;
      return result;
    }

    return this.generateDoc(result);
  },

  // Turn a fetch result into a loaded, cleaned-up document.
  //
  // :param body: The document content (handed to encodeDoc as-is).
  // :param response: Response object; only `headers['content-type']`
  //                  is read here (a missing header is treated as '').
  // :param alreadyDecoded: When true, `body` is loaded without decoding.
  // :returns: The loaded document after normalizeMetaTags,
  //           convertLazyLoadedImages and clean have been applied,
  //           in that order.
  // :throws Error: If the content type mentions neither 'html' nor
  //                'text', or if the loaded document root has no children.
  generateDoc({ body: content, response, alreadyDecoded = false }) {
    const { 'content-type': contentType = '' } = response.headers;

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    if (!contentType.includes('html') && !contentType.includes('text')) {
      throw new Error('Content does not appear to be text.');
    }

    let $ = this.encodeDoc({ content, contentType, alreadyDecoded });

    if ($.root().children().length === 0) {
      throw new Error('No children, likely a bad parse.');
    }

    $ = normalizeMetaTags($);
    $ = convertLazyLoadedImages($);
    $ = clean($);

    return $;
  },

  // Decode `content` and load it with cheerio.
  //
  // The content is first decoded with the encoding derived from the
  // `contentType` header. If the document's own <meta> tags declare a
  // different encoding, the content is decoded and loaded again with
  // that one.
  //
  // :param content: Document content; a ready-to-load string when
  //                 `alreadyDecoded` is true, otherwise whatever
  //                 iconv.decode accepts.
  // :param contentType: Value of the content-type header.
  // :param alreadyDecoded: When true, load `content` directly.
  // :returns: The cheerio-loaded document.
  encodeDoc({ content, contentType, alreadyDecoded = false }) {
    if (alreadyDecoded) {
      return cheerio.load(content);
    }

    const encoding = getEncoding(contentType);
    let decodedContent = iconv.decode(content, encoding);
    let $ = cheerio.load(decodedContent);

    // after first cheerio.load, check to see if encoding matches.
    // The case-insensitive attribute flag (`i`) is only used when
    // cheerio.browser is falsy (presumably the browser build's selector
    // engine does not support it -- confirm).
    const contentTypeSelector = cheerio.browser
      ? 'meta[http-equiv=content-type]'
      : 'meta[http-equiv=content-type i]';
    const metaContentType =
      $(contentTypeSelector).attr('content') ||
      $('meta[charset]').attr('charset');
    const properEncoding = getEncoding(metaContentType);

    // Encoding labels are compared case-insensitively: 'UTF-8' and
    // 'utf-8' name the same codec, so a difference in case alone must
    // not trigger a second (identical) decode and parse.
    const encodingsDiffer =
      String(properEncoding).toLowerCase() !== String(encoding).toLowerCase();

    // if encodings in the header/body dont match, use the one in the body
    if (metaContentType && encodingsDiffer) {
      decodedContent = iconv.decode(content, properEncoding);
      $ = cheerio.load(decodedContent);
    }

    return $;
  },
};
export default Resource;