You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
103 lines
2.3 KiB
JavaScript
103 lines
2.3 KiB
JavaScript
import URL from 'url';
|
|
import cheerio from 'cheerio';
|
|
|
|
import Resource from 'resource';
|
|
import {
|
|
validateUrl,
|
|
Errors,
|
|
} from 'utils';
|
|
import getExtractor from 'extractors/get-extractor';
|
|
import RootExtractor from 'extractors/root-extractor';
|
|
import collectAllPages from 'extractors/collect-all-pages';
|
|
|
|
/**
 * Public entry point of the parser: turns a URL (and optionally its HTML)
 * into an extraction result, and exposes a helper for fetching a resource.
 */
const Mercury = {
  /**
   * Extract content from the page at `url`.
   *
   * @param {string} [url] - Address of the page. May be omitted in the
   *   browser build, where the current `window.location.href` is used.
   * @param {string} [html] - Markup for the page. When falsy, the markup is
   *   taken from the document returned by `Resource.create`.
   * @param {Object} [opts] - Optional settings.
   * @param {boolean} [opts.fetchAllPages=true] - When true and the extracted
   *   result has a `next_page_url`, the result is handed to
   *   `collectAllPages` and its return value is used instead.
   * @param {boolean} [opts.fallback=true] - Forwarded unchanged to
   *   `RootExtractor.extract`.
   * @returns {Promise<Object>} One of:
   *   - `Errors.badUrl` when no url is available or `validateUrl` rejects it;
   *   - the value returned by `Resource.create` when it is flagged `failed`;
   *   - the extraction result (with `total_pages` / `rendered_pages` set to 1
   *     when no further pages are collected).
   */
  async parse(url, html, opts = {}) {
    // `opts = {}` only covers `undefined`; tolerate an explicit `null` too.
    const { fetchAllPages = true, fallback = true } = opts || {};

    // Work on locals so the caller-visible parameters are never reassigned.
    let pageUrl = url;
    let pageHtml = html;

    // If no url was passed and this is the browser version, fall back to
    // the current location and load the html from the current page.
    if (!pageUrl && cheerio.browser) {
      pageUrl = window.location.href; // eslint-disable-line no-undef
      pageHtml = pageHtml || cheerio.html();
    }

    // URL.parse throws on a missing argument; report it like any other
    // unusable url instead of rejecting with a TypeError.
    if (pageUrl == null) {
      return Errors.badUrl;
    }

    const parsedUrl = URL.parse(pageUrl);

    if (!validateUrl(parsedUrl)) {
      return Errors.badUrl;
    }

    const Extractor = getExtractor(pageUrl, parsedUrl);

    const $ = await Resource.create(pageUrl, pageHtml, parsedUrl);

    // If we found an error creating the resource, return that error.
    if ($.failed) {
      return $;
    }

    // If html still has not been set (i.e., only a url was passed),
    // take it from the document produced by Resource.create.
    if (!pageHtml) {
      pageHtml = $.html();
    }

    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek.
    const metaCache = $('meta')
      .map((_, node) => $(node).attr('name'))
      .toArray();

    const result = RootExtractor.extract(Extractor, {
      url: pageUrl,
      html: pageHtml,
      $,
      metaCache,
      parsedUrl,
      fallback,
    });

    const { title, next_page_url } = result;

    // Fetch more pages if next_page_url was found.
    if (fetchAllPages && next_page_url) {
      return collectAllPages({
        Extractor,
        next_page_url,
        html: pageHtml,
        $,
        metaCache,
        result,
        title,
        url: pageUrl,
      });
    }

    // Single-page result: record that exactly one page was rendered.
    return {
      ...result,
      total_pages: 1,
      rendered_pages: 1,
    };
  },

  // True when the bundled cheerio reports that it is the browser build.
  browser: !!cheerio.browser,

  /**
   * A convenience method for getting a resource to work with,
   * e.g., for a custom extractor generator.
   *
   * @param {string} url - Address passed straight to `Resource.create`.
   * @returns {Promise<*>} Whatever `Resource.create(url)` resolves to.
   */
  async fetchResource(url) {
    return Resource.create(url);
  },
};
|
|
|
|
export default Mercury;
|