refactor: page collection

Adam Pash 2016-09-14 11:12:28 -04:00
parent b325a4acdd
commit 67296691c2
5 changed files with 86 additions and 52 deletions

TODO.md View File

@@ -1,4 +1,11 @@
 TODO:
+- Complete response:
+  - add canonicalUrl
+  - add excerpt
+  - add domain
+  - add word count
+  - add total pages
+  - add rendered pages
 - Test if .is method is faster than regex methods
 DONE:
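
An editorial aside on the ".is versus regex" TODO item above: a throwaway micro-benchmark along these lines would answer it. Everything below (the markup, the selector, the regex, the iteration count) is hypothetical scaffolding, not code from this repo.

    import cheerio from 'cheerio';

    // Hypothetical fixture; a real comparison would use the project's
    // page fixtures and the selectors the scoring code actually checks.
    const $ = cheerio.load('<div><p class="body">some text</p></div>');
    const node = $('p').get(0);
    const PARAGRAPH_RE = /^p$/i; // regex stand-in for the 'p' selector

    console.time('.is');
    for (let i = 0; i < 100000; i += 1) {
      $(node).is('p');
    }
    console.timeEnd('.is');

    console.time('regex');
    for (let i = 0; i < 100000; i += 1) {
      PARAGRAPH_RE.test(node.name);
    }
    console.timeEnd('regex');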

package.json View File

@@ -6,7 +6,7 @@
   "scripts": {
     "start": "node ./build",
     "lint": "eslint src/** --fix",
-    "build": "eslint src/** && rollup -c",
+    "build": "eslint src/** --fix && rollup -c",
     "test_build": "rollup -c",
     "test": "./test-runner"
   },

src/extractors/collect-all-pages.js View File

@@ -0,0 +1,56 @@
+import 'babel-polyfill';
+
+import { removeAnchor } from 'utils/text';
+import Resource from 'resource';
+import Iris from '../iris';
+
+export default async function collectAllPages(
+  {
+    nextPageUrl,
+    html,
+    $,
+    metaCache,
+    result,
+    Extractor,
+    title,
+    url,
+  }
+) {
+  let pages = 2;
+  const previousUrls = [removeAnchor(url)];
+
+  // If we've gone over 26 pages, something has
+  // likely gone wrong.
+  while (nextPageUrl && pages < 26) {
+    $ = await Resource.create(nextPageUrl);
+    html = $.html();
+
+    const extractorOpts = {
+      url: nextPageUrl,
+      html,
+      $,
+      metaCache,
+      contentOnly: true,
+      extractedTitle: title,
+      previousUrls,
+    };
+
+    const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);
+
+    previousUrls.push(nextPageUrl);
+    result = {
+      ...result,
+      content: `
+        ${result.content}
+        <hr>
+        <h4>Page ${pages}</h4>
+        ${nextPageResult.content}
+      `,
+    };
+
+    nextPageUrl = nextPageResult.nextPageUrl;
+    pages += 1;
+  }
+
+  return result;
+}
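
As an aside, the merge step in the loop above is easiest to see with concrete values. A minimal sketch, with hypothetical page bodies standing in for real extractions:

    // Hypothetical inputs for one pass through the merge.
    const result = { title: 'Example', content: '<p>First page body</p>' };
    const nextPageResult = { content: '<p>Second page body</p>' };
    const pages = 2;

    const merged = {
      ...result,
      content: `
        ${result.content}
        <hr>
        <h4>Page ${pages}</h4>
        ${nextPageResult.content}
      `,
    };
    // merged.content now holds both bodies, separated by an <hr> and
    // a "Page 2" heading; merged.title is carried over unchanged.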

src/extractors/collect-all-pages.test.js View File

@@ -0,0 +1,12 @@
+// import assert from 'assert';
+// import fs from 'fs';
+// import cheerio from 'cheerio';
+//
+// import collectAllPages from './collect-all-pages';
+//
+// describe('collectAllPages(opts)', () => {
+//   it('fetches additional pages', () => {
+//     const html = fs.readFileSync('./fixtures/ars.html');
+//     const $ = cheerio.load(html);
+//   });
+// });
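
Until that fixture-based test is filled in, one assertion that needs no network access would exercise the early-exit path: when there is no nextPageUrl, the while loop never runs and the original result comes straight back. A sketch, assuming the project's mocha-style runner:

    import assert from 'assert';

    import collectAllPages from './collect-all-pages';

    describe('collectAllPages(opts)', () => {
      it('returns the result unchanged when there is no next page', async () => {
        const result = { content: '<p>Only page</p>' };

        const collected = await collectAllPages({
          nextPageUrl: null,
          result,
          url: 'http://example.com/article', // placeholder URL
        });

        assert.equal(collected, result);
      });
    });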

src/iris.js View File

@@ -1,34 +1,33 @@
 import Resource from 'resource';
 import getExtractor from 'extractors/get-extractor';
 import RootExtractor from 'extractors/root-extractor';
-import { removeAnchor } from 'utils/text';
+import collectAllPages from 'extractors/collect-all-pages';
 
 const Iris = {
   async parse(url, html, opts = {}) {
     const { fetchAllPages = true } = opts || true;
-    const $ = await Resource.create(url, html);
-    html = $.html();
-
     const Extractor = getExtractor(url);
     console.log(`Using extractor for ${Extractor.domain}`);
 
+    const $ = await Resource.create(url, html);
+    html = $.html();
+
     // Cached value of every meta name in our document.
     // Used when extracting title/author/date_published/dek
     const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
 
-    const extractorOpts = { url, html, $, metaCache };
-    let result = RootExtractor.extract(Extractor, extractorOpts);
+    let result = this.runExtraction(Extractor, { url, html, $, metaCache });
     const { title, nextPageUrl } = result;
 
     if (fetchAllPages && nextPageUrl) {
-      result = await this.collectAllPages(
+      result = await collectAllPages(
         {
+          Extractor,
           nextPageUrl,
           html,
           $,
           metaCache,
           result,
-          Extractor,
           title,
           url,
         }
@@ -38,50 +37,10 @@ const Iris = {
 
     return result;
   },
 
-  async collectAllPages({
-    nextPageUrl,
-    html,
-    $,
-    metaCache,
-    result,
-    Extractor,
-    title,
-    url,
-  }) {
-    let pages = 2;
-    const previousUrls = [removeAnchor(url)];
-
-    while (nextPageUrl && pages < 26) {
-      $ = await Resource.create(nextPageUrl);
-      html = $.html();
-      const extractorOpts = { url: nextPageUrl, html, $, metaCache };
-      const nextPageResult = RootExtractor.extract(
-        Extractor,
-        {
-          ...extractorOpts,
-          url: nextPageUrl,
-          contentOnly: true,
-          extractedTitle: title,
-          previousUrls,
-        }
-      );
-      previousUrls.push(nextPageUrl);
-      result = {
-        ...result,
-        content: `
-          ${result.content}
-          <hr>
-          <h4>Page ${pages}</h4>
-          ${nextPageResult.content}
-        `,
-      };
-      nextPageUrl = nextPageResult.nextPageUrl;
-      pages += 1;
-    }
-
-    return result;
+  runExtraction(Extractor, opts) {
+    return RootExtractor.extract(Extractor, opts);
   },
 };
 
 export default Iris;
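
For reference, the public entry point after this refactor would be driven roughly like this; the URLs are placeholders, and fetchAllPages is spelled out even though it defaults to true:

    import Iris from './src/iris';

    // Fetch the page, run the matching extractor, and follow
    // nextPageUrl links, concatenating every page into content.
    Iris.parse('http://example.com/multi-page-article')
      .then(result => console.log(result.title, result.nextPageUrl));

    // Opt out of multi-page collection entirely.
    Iris.parse('http://example.com/multi-page-article', null, { fetchAllPages: false })
      .then(result => console.log(result.content));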