// mercury-parser/src/mercury.js

import URL from 'url';
import cheerio from 'cheerio';
import TurndownService from 'turndown';
import Resource from 'resource';
import { validateUrl } from 'utils';
import addCustomExtractor from 'extractors/add-extractor';
import getExtractor from 'extractors/get-extractor';
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
/**
 * Public parser API: parse a page into structured fields, fetch a raw
 * resource, or register a custom extractor.
 */
const Parser = {
  /**
   * Load a page (or use the supplied HTML) and run the matching extractor
   * over it.
   *
   * @param {string} [url] - Address of the page. May be omitted in the
   *   browser build, where the current location and document are used.
   * @param {Object} [options]
   * @param {string} [options.html] - Pre-fetched markup, forwarded to
   *   Resource.create together with the URL.
   * @param {boolean} [options.fetchAllPages=true] - Follow `next_page_url`
   *   through collectAllPages when the extractor reports one.
   * @param {boolean} [options.fallback=true] - Forwarded to
   *   RootExtractor.extract.
   * @param {string} [options.contentType='html'] - 'html', 'markdown' or
   *   'text'; selects the post-processing applied to `content`.
   * @param {Object} [options.headers={}] - Forwarded to Resource.create.
   * @param {Object} [options.extend] - Extra selectors handed to
   *   selectExtendedTypes; the resulting fields are merged over the result.
   * @param {Object} [options.customExtractor] - Extractor registered through
   *   addCustomExtractor before the extractor lookup.
   * @returns {Promise<Object>} The extraction result merged with any extended
   *   fields; `{ error: true, message }` when the URL is not usable; or the
   *   value returned by Resource.create when it is flagged `failed`.
   */
  async parse(url, { html, ...opts } = {}) {
    const {
      fetchAllPages = true,
      fallback = true,
      contentType = 'html',
      headers = {},
      extend,
      customExtractor,
    } = opts;

    // if no url was passed and this is the browser version,
    // set url to window.location.href and load the html
    // from the current page
    if (!url && cheerio.browser) {
      url = window.location.href; // eslint-disable-line no-undef
      html = html || cheerio.html();
    }

    // URL.parse throws on non-string input, so a missing or non-string url
    // is reported through the same error object as any other invalid URL.
    const parsedUrl = typeof url === 'string' ? URL.parse(url) : null;

    if (!parsedUrl || !validateUrl(parsedUrl)) {
      return {
        error: true,
        message:
          'The url parameter passed does not look like a valid URL. Please check your URL and try again.',
      };
    }

    const $ = await Resource.create(url, html, parsedUrl, headers);

    // If we found an error creating the resource, return that error
    if ($.failed) {
      return $;
    }

    // Add custom extractor via cli.
    if (customExtractor) {
      addCustomExtractor(customExtractor);
    }

    const Extractor = getExtractor(url, parsedUrl, $);

    // if html still has not been set (i.e., url passed to Parser.parse),
    // set html from the response of Resource.create
    if (!html) {
      html = $.html();
    }

    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek
    const metaCache = $('meta')
      .map((_, node) => $(node).attr('name'))
      .toArray();

    let extendedTypes = {};
    if (extend) {
      extendedTypes = selectExtendedTypes(extend, { $, url, html });
    }

    let result = RootExtractor.extract(Extractor, {
      url,
      html,
      $,
      metaCache,
      parsedUrl,
      fallback,
      contentType,
    });

    const { title, next_page_url } = result;

    // Fetch more pages if next_page_url found
    if (fetchAllPages && next_page_url) {
      result = await collectAllPages({
        Extractor,
        next_page_url,
        html,
        $,
        metaCache,
        result,
        title,
        url,
      });
    } else {
      result = {
        ...result,
        total_pages: 1,
        rendered_pages: 1,
      };
    }

    if (contentType === 'markdown') {
      // Turndown throws on null/undefined input; when no content was
      // extracted, leave the field untouched instead of rejecting.
      if (result.content != null) {
        const turndownService = new TurndownService();
        result.content = turndownService.turndown(result.content);
      }
    } else if (contentType === 'text') {
      result.content = $.text($(result.content));
    }

    // Extended fields are spread last so they win over same-named fields.
    return { ...result, ...extendedTypes };
  },

  // True when running against the browser build of cheerio.
  browser: !!cheerio.browser,

  /**
   * A convenience method for getting a resource to work with, e.g., for the
   * custom extractor generator.
   *
   * @param {string} url - Address handed to Resource.create.
   * @returns {*} Whatever Resource.create returns for that URL.
   */
  fetchResource(url) {
    return Resource.create(url);
  },

  /**
   * Register a custom extractor.
   *
   * @param {Object} extractor - Extractor definition passed to
   *   addCustomExtractor.
   * @returns {*} The return value of addCustomExtractor.
   */
  addExtractor(extractor) {
    return addCustomExtractor(extractor);
  },
};
export default Parser;