You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/resource/index.js

69 lines
1.7 KiB
JavaScript

import cheerio from 'cheerio';
import { fetchResource } from './utils';
import {
normalizeMetaTags,
convertLazyLoadedImages,
clean,
} from './utils/dom';
const Resource = {
  // Create a Resource.
  //
  // :param url: The URL for the document we should retrieve.
  // :param preparedResponse: If set (truthy), use as the response body
  //                          rather than attempting to fetch it
  //                          ourselves. Expects a string.
  // :param parsedUrl: Forwarded to fetchResource together with url;
  //                   presumably the already-parsed form of url
  //                   (TODO confirm against fetchResource).
  // :returns: The document built by generateDoc. If the fetched result
  //           carries an `error`, that result object is returned
  //           instead, with `failed` set to true.
  async create(url, preparedResponse, parsedUrl) {
    let result;

    if (preparedResponse) {
      // Wrap the caller-supplied body in a stand-in successful response
      // so it goes through the same generateDoc checks as a fetched one.
      const validResponse = {
        statusMessage: 'OK',
        statusCode: 200,
        headers: {
          'content-type': 'text/html',
          'content-length': 500,
        },
      };

      result = { body: preparedResponse, response: validResponse };
    } else {
      result = await fetchResource(url, parsedUrl);
    }

    if (result.error) {
      result.failed = true;
      return result;
    }

    return this.generateDoc(result);
  },

  // Build a cleaned-up cheerio document from a fetched result.
  //
  // :param body: The raw markup to parse.
  // :param response: The response object; only
  //                  response.headers['content-type'] is read.
  // :returns: The cheerio instance after normalizeMetaTags,
  //           convertLazyLoadedImages and clean have been applied.
  // :throws Error: If the content type is missing or mentions neither
  //                'html' nor 'text', or if the parsed document has no
  //                children.
  generateDoc({ body: content, response }) {
    const { 'content-type': rawContentType } = response.headers;

    // A missing (or non-string) header is treated as "not text" so the
    // caller gets the descriptive error below rather than a TypeError.
    // Media types are case-insensitive, so compare in lower case.
    const contentType =
      typeof rawContentType === 'string' ? rawContentType.toLowerCase() : '';

    // TODO: Implement is_text function from
    // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57
    if (!contentType.includes('html') && !contentType.includes('text')) {
      throw new Error('Content does not appear to be text.');
    }

    let $ = cheerio.load(content, { normalizeWhitespace: true });

    if ($.root().children().length === 0) {
      throw new Error('No children, likely a bad parse.');
    }

    $ = normalizeMetaTags($);
    $ = convertLazyLoadedImages($);
    $ = clean($);

    return $;
  },
};
export default Resource;