You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
129 lines
3.2 KiB
JavaScript
129 lines
3.2 KiB
JavaScript
import URL from 'url';
|
|
import cheerio from 'cheerio';
|
|
import TurndownService from 'turndown';
|
|
|
|
import Resource from 'resource';
|
|
import { validateUrl } from 'utils';
|
|
import addCustomExtractor from 'extractors/add-extractor';
|
|
import getExtractor from 'extractors/get-extractor';
|
|
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
|
|
import collectAllPages from 'extractors/collect-all-pages';
|
|
|
|
/**
 * Public entry point of the parser: turns a URL (and optionally pre-fetched
 * HTML) into an extraction result, and exposes a few helpers for working
 * with resources and custom extractors.
 */
const Parser = {
  /**
   * Extract content from a page.
   *
   * @param {string} [url] - Address of the page. When it is falsy and
   *   `cheerio.browser` is truthy, `window.location.href` is used instead.
   * @param {Object} [options]
   * @param {string} [options.html] - Markup handed to `Resource.create`.
   *   When absent it is read back from the created resource via `$.html()`.
   * @param {boolean} [options.fetchAllPages=true] - When true and the
   *   extraction result carries a `next_page_url`, `collectAllPages` is
   *   called to gather the remaining pages.
   * @param {boolean} [options.fallback=true] - Forwarded to
   *   `RootExtractor.extract`.
   * @param {string} [options.contentType='html'] - Forwarded to
   *   `RootExtractor.extract`. `'markdown'` additionally runs
   *   `result.content` through Turndown; `'text'` replaces it with
   *   `$.text($(result.content))`.
   * @param {Object} [options.headers={}] - Forwarded to `Resource.create`.
   * @param {*} [options.extend] - When truthy, passed to
   *   `selectExtendedTypes`; the returned fields are spread over the result.
   * @param {*} [options.customExtractor] - When truthy, registered through
   *   `addCustomExtractor` before the extractor for `url` is looked up.
   * @returns {Promise<Object>} One of: the invalid-URL error object
   *   (`{ error: true, message }`), the value returned by `Resource.create`
   *   when its `failed` flag is truthy, or the extraction result.
   */
  async parse(url, { html, ...opts } = {}) {
    const {
      fetchAllPages = true,
      fallback = true,
      contentType = 'html',
      headers = {},
      extend,
      customExtractor,
    } = opts;

    // If no url was passed and this is the browser version, use the current
    // page: its location as the url and its markup as the html.
    if (!url && cheerio.browser) {
      url = window.location.href; // eslint-disable-line no-undef
      html = html || cheerio.html();
    }

    // `URL.parse` throws a TypeError for anything that is not a string
    // (e.g. a missing url outside the browser). Treat those inputs like any
    // other invalid URL so callers always get the error object below rather
    // than a rejected promise.
    const parsedUrl = typeof url === 'string' ? URL.parse(url) : null;

    if (!parsedUrl || !validateUrl(parsedUrl)) {
      return {
        error: true,
        message:
          'The url parameter passed does not look like a valid URL. Please check your URL and try again.',
      };
    }

    const $ = await Resource.create(url, html, parsedUrl, headers);

    // If we found an error creating the resource, return that error
    if ($.failed) {
      return $;
    }

    // Add custom extractor via cli.
    if (customExtractor) {
      addCustomExtractor(customExtractor);
    }

    const Extractor = getExtractor(url, parsedUrl, $);

    // If html still has not been set (i.e., only a url was passed to
    // Parser.parse), take it from the resource that was just created.
    if (!html) {
      html = $.html();
    }

    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek
    const metaCache = $('meta')
      .map((_, node) => $(node).attr('name'))
      .toArray();

    let extendedTypes = {};
    if (extend) {
      extendedTypes = selectExtendedTypes(extend, { $, url, html });
    }

    let result = RootExtractor.extract(Extractor, {
      url,
      html,
      $,
      metaCache,
      parsedUrl,
      fallback,
      contentType,
    });

    const { title, next_page_url } = result;

    // Fetch more pages if next_page_url found
    if (fetchAllPages && next_page_url) {
      result = await collectAllPages({
        Extractor,
        next_page_url,
        html,
        $,
        metaCache,
        result,
        title,
        url,
      });
    } else {
      // Single-page result: record the page counts directly.
      result = {
        ...result,
        total_pages: 1,
        rendered_pages: 1,
      };
    }

    // Convert the extracted content to the requested output format.
    if (contentType === 'markdown') {
      const turndownService = new TurndownService();
      result.content = turndownService.turndown(result.content);
    } else if (contentType === 'text') {
      result.content = $.text($(result.content));
    }

    // Extended fields are spread last, so they win on key collisions.
    return { ...result, ...extendedTypes };
  },

  // True when running against the browser build of cheerio.
  browser: !!cheerio.browser,

  /**
   * A convenience method for getting a resource to work with,
   * e.g., for the custom extractor generator.
   *
   * @param {string} url - Passed straight to `Resource.create`.
   * @returns {*} Whatever `Resource.create(url)` returns.
   */
  fetchResource(url) {
    return Resource.create(url);
  },

  /**
   * Register a custom extractor.
   *
   * @param {*} extractor - Passed straight to `addCustomExtractor`.
   * @returns {*} Whatever `addCustomExtractor(extractor)` returns.
   */
  addExtractor(extractor) {
    return addCustomExtractor(extractor);
  },
};
|
|
|
|
export default Parser;
|