refactor: page collection
parent
b325a4acdd
commit
67296691c2
@ -0,0 +1,56 @@
|
||||
import 'babel-polyfill';

import { removeAnchor } from 'utils/text';
import Resource from 'resource';

import Iris from '../iris';
/**
 * Fetch each subsequent page of a multi-page article and append its
 * extracted content to `result.content`, separated by an `<hr>` and a
 * per-page `<h4>` header.
 *
 * @param {Object} opts
 * @param {string} opts.nextPageUrl - URL of page 2 as detected on page 1;
 *   falsy means there is nothing to collect and `result` is returned as-is.
 * @param {string} opts.html - raw markup of the current page (superseded
 *   on each iteration).
 * @param {Object} opts.$ - cheerio-style handle for the current page
 *   (superseded on each iteration).
 * @param {Object} opts.metaCache - cached meta tags, passed through to
 *   every per-page extraction.
 * @param {Object} opts.result - extraction result for page 1; a new,
 *   augmented result object is returned.
 * @param {Object} opts.Extractor - site-specific extractor run on each page.
 * @param {string} opts.title - title extracted from page 1, reused as the
 *   extracted title for later pages.
 * @param {string} opts.url - canonical URL of page 1.
 * @param {number} [opts.maxPages=26] - safety cap on total pages collected;
 *   generalized from the previously hard-coded limit, default preserves
 *   the original behavior.
 * @returns {Promise<Object>} result whose `content` concatenates all pages.
 */
export default async function collectAllPages(
  {
    nextPageUrl,
    html,
    $,
    metaCache,
    result,
    Extractor,
    title,
    url,
    maxPages = 26,
  }
) {
  // Page 1 was already extracted by the caller; we start collecting at 2.
  let pages = 2;
  // Track visited URLs (anchor-stripped) so extraction can avoid revisits.
  const previousUrls = [removeAnchor(url)];

  // If we've gone over maxPages, something has likely gone wrong.
  while (nextPageUrl && pages < maxPages) {
    $ = await Resource.create(nextPageUrl);
    html = $.html();

    const extractorOpts = {
      url: nextPageUrl,
      html,
      $,
      metaCache,
      contentOnly: true, // only the content field is needed for later pages
      extractedTitle: title,
      previousUrls,
    };

    // NOTE(review): runExtraction is not awaited — assumes it is
    // synchronous. If it returns a Promise, `.content`/`.nextPageUrl`
    // below would be undefined; confirm against the Iris implementation.
    const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);

    previousUrls.push(nextPageUrl);
    result = {
      ...result,
      content: `
        ${result.content}
        <hr>
        <h4>Page ${pages}</h4>
        ${nextPageResult.content}
      `,
    };

    nextPageUrl = nextPageResult.nextPageUrl;

    pages += 1;
  }

  return result;
}
|
@ -0,0 +1,12 @@
// import assert from 'assert';
// import fs from 'fs';
// import cheerio from 'cheerio';
//
// import collectAllPages from './collect-all-pages';
//
// describe('collectAllPages(opts)', () => {
//   it('fetches additional pages', () => {
//     const html = fs.readFileSync('./fixtures/ars.html');
//     const $ = cheerio.load(html);
//   });
// });
Loading…
Reference in New Issue