Mirror of https://github.com/postlight/mercury-parser (synced 2024-11-17 03:25:31 +00:00)
refactor: page collection

commit 67296691c2 (parent b325a4acdd)
TODO.md (7 changed lines)

@@ -1,4 +1,11 @@
TODO:
- Complete response:
  - add canonicalUrl
  - add excerpt
  - add domain
  - add word count
  - add total pages
  - add rendered pages
- Test if .is method is faster than regex methods

DONE:
package.json

@@ -6,7 +6,7 @@
  "scripts": {
    "start": "node ./build",
    "lint": "eslint src/** --fix",
    "build": "eslint src/** && rollup -c",
    "build": "eslint src/** --fix && rollup -c",
    "test_build": "rollup -c",
    "test": "./test-runner"
  },
src/extractors/collect-all-pages.js (new file, 56 lines)

@@ -0,0 +1,56 @@
import 'babel-polyfill';
import { removeAnchor } from 'utils/text';
import Resource from 'resource';

import Iris from '../iris';

export default async function collectAllPages(
  {
    nextPageUrl,
    html,
    $,
    metaCache,
    result,
    Extractor,
    title,
    url,
  }
) {
  let pages = 2;
  const previousUrls = [removeAnchor(url)];
  // If we've gone over 26 pages, something has
  // likely gone wrong.
  while (nextPageUrl && pages < 26) {
    $ = await Resource.create(nextPageUrl);
    html = $.html();

    const extractorOpts = {
      url: nextPageUrl,
      html,
      $,
      metaCache,
      contentOnly: true,
      extractedTitle: title,
      previousUrls,
    };

    const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);

    previousUrls.push(nextPageUrl);
    result = {
      ...result,
      content: `
        ${result.content}
        <hr>
        <h4>Page ${pages}</h4>
        ${nextPageResult.content}
      `,
    };

    nextPageUrl = nextPageResult.nextPageUrl;

    pages += 1;
  }

  return result;
}
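For orientation, a minimal sketch (not part of the commit) of how the new helper is meant to be driven, mirroring the call site in src/iris.js further down. The URL is a placeholder, RootExtractor.extract stands in for Iris.runExtraction (which simply wraps it), and the aliased import paths are assumed to resolve through the project's existing module configuration:

import Resource from 'resource';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';

// Hypothetical driver: extract page one, then chase next-page links.
async function parseWithAllPages(url) {
  // Fetch and load the first page, as Iris.parse does.
  const $ = await Resource.create(url);
  const html = $.html();
  const Extractor = getExtractor(url);
  const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();

  // First-page extraction; Iris.runExtraction is a thin wrapper around this call.
  const result = RootExtractor.extract(Extractor, { url, html, $, metaCache });
  const { title, nextPageUrl } = result;
  if (!nextPageUrl) return result;

  // Append each subsequent page's content to the page-one result.
  return collectAllPages({
    nextPageUrl, html, $, metaCache, result, Extractor, title, url,
  });
}

// parseWithAllPages('https://example.com/multi-page-article/').then(console.log);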
src/extractors/collect-all-pages.test.js (new file, 12 lines)
@@ -0,0 +1,12 @@
// import assert from 'assert';
// import fs from 'fs';
// import cheerio from 'cheerio';
//
// import collectAllPages from './collect-all-pages';
//
// describe('collectAllPages(opts)', () => {
//   it('fetches additional pages', () => {
//     const html = fs.readFileSync('./fixtures/ars.html');
//     const $ = cheerio.load(html);
//   });
// });
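The spec ships entirely commented out. As a purely hypothetical starting point for filling it in (assuming the ./fixtures/ars.html fixture exists and the suite runs under a mocha-style runner, as the commented imports suggest), the fixture could at least be loaded and sanity-checked; a real multi-page test would additionally need to stub the network fetches behind Resource.create:

import assert from 'assert';
import fs from 'fs';
import cheerio from 'cheerio';

describe('collectAllPages(opts)', () => {
  it('loads the first-page fixture', () => {
    // 'utf8' so cheerio receives a string rather than a Buffer.
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
    const $ = cheerio.load(html);

    // Placeholder assertion until the multi-page behaviour itself is exercised.
    assert.ok($('body').length > 0);
  });
});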
src/iris.js (61 changed lines)
@@ -1,34 +1,33 @@
import Resource from 'resource';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
import { removeAnchor } from 'utils/text';
import collectAllPages from 'extractors/collect-all-pages';

const Iris = {
  async parse(url, html, opts = {}) {
    const { fetchAllPages = true } = opts || true;
    const $ = await Resource.create(url, html);
    html = $.html();

    const Extractor = getExtractor(url);
    console.log(`Using extractor for ${Extractor.domain}`);

    const $ = await Resource.create(url, html);
    html = $.html();

    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek
    const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();

    const extractorOpts = { url, html, $, metaCache };
    let result = RootExtractor.extract(Extractor, extractorOpts);
    let result = this.runExtraction(Extractor, { url, html, $, metaCache });
    const { title, nextPageUrl } = result;

    if (fetchAllPages && nextPageUrl) {
      result = await this.collectAllPages(
      result = await collectAllPages(
        {
          Extractor,
          nextPageUrl,
          html,
          $,
          metaCache,
          result,
          Extractor,
          title,
          url,
        }
@@ -38,50 +37,10 @@ const Iris = {
    return result;
  },

  async collectAllPages({
    nextPageUrl,
    html,
    $,
    metaCache,
    result,
    Extractor,
    title,
    url,
  }) {
    let pages = 2;
    const previousUrls = [removeAnchor(url)];
    while (nextPageUrl && pages < 26) {
      $ = await Resource.create(nextPageUrl);
      html = $.html();
      const extractorOpts = { url: nextPageUrl, html, $, metaCache };
      const nextPageResult = RootExtractor.extract(
        Extractor,
        {
          ...extractorOpts,
          url: nextPageUrl,
          contentOnly: true,
          extractedTitle: title,
          previousUrls,
        }
      );

      previousUrls.push(nextPageUrl);
      result = {
        ...result,
        content: `
          ${result.content}
          <hr>
          <h4>Page ${pages}</h4>
          ${nextPageResult.content}
        `,
      };

      nextPageUrl = nextPageResult.nextPageUrl;

      pages += 1;
    }
    return result;
  runExtraction(Extractor, opts) {
    return RootExtractor.extract(Extractor, opts);
  },

};

export default Iris;
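Taken together, a small hypothetical driver for the refactored flow looks like this; the URL and the relative import path are placeholders, and Resource.create is assumed to fetch the page itself when no pre-fetched HTML is passed to parse:

import Iris from './src/iris';

// fetchAllPages defaults to true, so a multi-page article comes back with each
// additional page's content appended after an <hr> / <h4>Page N</h4> divider.
Iris.parse('https://example.com/multi-page-article/')
  .then((result) => {
    console.log(result.title);
    console.log(result.content.length);
  })
  .catch((err) => console.error(err));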