You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
1.6 KiB
JavaScript
69 lines
1.6 KiB
JavaScript
import URL from 'url';
|
|
|
|
import Resource from 'resource';
|
|
import {
|
|
validateUrl,
|
|
Errors,
|
|
} from 'utils';
|
|
import getExtractor from 'extractors/get-extractor';
|
|
import RootExtractor from 'extractors/root-extractor';
|
|
import collectAllPages from 'extractors/collect-all-pages';
|
|
|
|
const Mercury = {
|
|
async parse(url, html, opts = {}) {
|
|
const { fetchAllPages = true } = opts || true;
|
|
|
|
const parsedUrl = URL.parse(url);
|
|
|
|
if (!validateUrl(parsedUrl)) {
|
|
return Errors.badUrl;
|
|
}
|
|
|
|
const Extractor = getExtractor(url, parsedUrl);
|
|
// console.log(`Using extractor for ${Extractor.domain}`);
|
|
|
|
const $ = await Resource.create(url, html, parsedUrl);
|
|
|
|
// If we found an error creating the resource, return that error
|
|
if ($.error) {
|
|
return $;
|
|
}
|
|
|
|
html = $.html();
|
|
|
|
// Cached value of every meta name in our document.
|
|
// Used when extracting title/author/date_published/dek
|
|
const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
|
|
|
|
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache, parsedUrl });
|
|
const { title, next_page_url } = result;
|
|
|
|
// Fetch more pages if next_page_url found
|
|
if (fetchAllPages && next_page_url) {
|
|
result = await collectAllPages(
|
|
{
|
|
Extractor,
|
|
next_page_url,
|
|
html,
|
|
$,
|
|
metaCache,
|
|
result,
|
|
title,
|
|
url,
|
|
}
|
|
);
|
|
} else {
|
|
result = {
|
|
...result,
|
|
total_pages: 1,
|
|
rendered_pages: 1,
|
|
};
|
|
}
|
|
|
|
return result;
|
|
},
|
|
|
|
};
|
|
|
|
export default Mercury;
|