diff --git a/TODO.md b/TODO.md
index 6f38ba0d..3a94a6dc 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,4 +1,11 @@
 TODO:
+- Complete response:
+  - add canonicalUrl
+  - add excerpt
+  - add domain
+  - add word count
+  - add total pages
+  - add rendered pages
 - Test if .is method is faster than regex methods
 
 DONE:
diff --git a/package.json b/package.json
index 89312cad..5b29845d 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
   "scripts": {
     "start": "node ./build",
     "lint": "eslint src/** --fix",
-    "build": "eslint src/** && rollup -c",
+    "build": "eslint src/** --fix && rollup -c",
     "test_build": "rollup -c",
     "test": "./test-runner"
   },
diff --git a/src/extractors/collect-all-pages.js b/src/extractors/collect-all-pages.js
new file mode 100644
index 00000000..4ba503c5
--- /dev/null
+++ b/src/extractors/collect-all-pages.js
@@ -0,0 +1,56 @@
+import 'babel-polyfill';
+import { removeAnchor } from 'utils/text';
+import Resource from 'resource';
+
+import Iris from '../iris';
+
+export default async function collectAllPages(
+  {
+    nextPageUrl,
+    html,
+    $,
+    metaCache,
+    result,
+    Extractor,
+    title,
+    url,
+  }
+) {
+  let pages = 2;
+  const previousUrls = [removeAnchor(url)];
+  // If we've gone over 26 pages, something has
+  // likely gone wrong.
+  while (nextPageUrl && pages < 26) {
+    $ = await Resource.create(nextPageUrl);
+    html = $.html();
+
+    const extractorOpts = {
+      url: nextPageUrl,
+      html,
+      $,
+      metaCache,
+      contentOnly: true,
+      extractedTitle: title,
+      previousUrls,
+    };
+
+    const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);
+
+    previousUrls.push(nextPageUrl);
+    result = {
+      ...result,
+      content: `
+        ${result.content}
+        <hr>
+        <h4>Page ${pages}</h4>
+        ${nextPageResult.content}
+      `,
+    };
+
+    nextPageUrl = nextPageResult.nextPageUrl;
+
+    pages += 1;
+  }
+
+  return result;
+}
diff --git a/src/extractors/collect-all-pages.test.js b/src/extractors/collect-all-pages.test.js
new file mode 100644
index 00000000..e6f9ea5b
--- /dev/null
+++ b/src/extractors/collect-all-pages.test.js
@@ -0,0 +1,12 @@
+// import assert from 'assert';
+// import fs from 'fs';
+// import cheerio from 'cheerio';
+//
+// import collectAllPages from './collect-all-pages';
+//
+// describe('collectAllPages(opts)', () => {
+//   it('fetches additional pages', () => {
+//     const html = fs.readFileSync('./fixtures/ars.html');
+//     const $ = cheerio.load(html);
+//   });
+// });
diff --git a/src/iris.js b/src/iris.js
index e9594ef4..c6f54d62 100644
--- a/src/iris.js
+++ b/src/iris.js
@@ -1,34 +1,33 @@
 import Resource from 'resource';
 import getExtractor from 'extractors/get-extractor';
 import RootExtractor from 'extractors/root-extractor';
-import { removeAnchor } from 'utils/text';
+import collectAllPages from 'extractors/collect-all-pages';
 
 const Iris = {
   async parse(url, html, opts = {}) {
     const { fetchAllPages = true } = opts || true;
 
-    const $ = await Resource.create(url, html);
-    html = $.html();
-
     const Extractor = getExtractor(url);
     console.log(`Using extractor for ${Extractor.domain}`);
 
+    const $ = await Resource.create(url, html);
+    html = $.html();
+
     // Cached value of every meta name in our document.
     // Used when extracting title/author/date_published/dek
     const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
 
-    const extractorOpts = { url, html, $, metaCache };
-    let result = RootExtractor.extract(Extractor, extractorOpts);
+    let result = this.runExtraction(Extractor, { url, html, $, metaCache });
     const { title, nextPageUrl } = result;
 
     if (fetchAllPages && nextPageUrl) {
-      result = await this.collectAllPages(
+      result = await collectAllPages(
         {
+          Extractor,
           nextPageUrl,
           html,
           $,
           metaCache,
           result,
-          Extractor,
           title,
           url,
         }
       )
@@ -38,50 +37,10 @@ const Iris = {
     return result;
   },
 
-  async collectAllPages({
-    nextPageUrl,
-    html,
-    $,
-    metaCache,
-    result,
-    Extractor,
-    title,
-    url,
-  }) {
-    let pages = 2;
-    const previousUrls = [removeAnchor(url)];
-    while (nextPageUrl && pages < 26) {
-      $ = await Resource.create(nextPageUrl);
-      html = $.html();
-      const extractorOpts = { url: nextPageUrl, html, $, metaCache };
-      const nextPageResult = RootExtractor.extract(
-        Extractor,
-        {
-          ...extractorOpts,
-          url: nextPageUrl,
-          contentOnly: true,
-          extractedTitle: title,
-          previousUrls,
-        }
-      );
-
-      previousUrls.push(nextPageUrl);
-      result = {
-        ...result,
-        content: `
-          ${result.content}
-          <hr>
-          <h4>Page ${pages}</h4>
-          ${nextPageResult.content}
-        `,
-      };
-
-      nextPageUrl = nextPageResult.nextPageUrl;
-
-      pages += 1;
-    }
-    return result;
+  runExtraction(Extractor, opts) {
+    return RootExtractor.extract(Extractor, opts);
   },
+
 };
 
 export default Iris;
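
Note: a minimal usage sketch of the refactored flow, not part of the diff. It assumes Iris is consumed as the default export of src/iris and uses a hypothetical article URL. With fetchAllPages left at its default of true, parse() now delegates paging to collectAllPages(), which calls back into Iris.runExtraction for each subsequent page.

    import Iris from './src/iris';

    // Hypothetical multi-page article URL.
    const url = 'http://example.com/article/1';

    // Default behavior: follow result.nextPageUrl and concatenate every
    // page's content, each appended page preceded by an <hr> and an
    // <h4>Page N</h4> marker (the loop stops after page 25).
    Iris.parse(url).then((result) => {
      console.log(result.title);
      console.log(result.content); // all pages stitched together
    });

    // Opting out returns only the first page, leaving nextPageUrl on the
    // result for the caller to follow manually.
    Iris.parse(url, null, { fetchAllPages: false }).then((result) => {
      console.log(result.nextPageUrl);
    });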
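
Note: the new test file ships fully commented out. Below is a rough sketch of where it appears to be headed, assuming the project's mocha-style describe/it setup and the ./fixtures/ars.html fixture it references; the Extractor/result stubs, URLs, and assertion are illustrative only, and a real run would also need Resource.create stubbed so the next-page fetch doesn't hit the network.

    import assert from 'assert';
    import fs from 'fs';
    import cheerio from 'cheerio';

    import collectAllPages from './collect-all-pages';

    describe('collectAllPages(opts)', () => {
      it('fetches additional pages', async () => {
        const html = fs.readFileSync('./fixtures/ars.html').toString();
        const $ = cheerio.load(html);

        // Illustrative shapes mirroring what Iris.parse passes in;
        // the URLs are hypothetical.
        const result = await collectAllPages({
          nextPageUrl: 'http://example.com/article/2',
          html,
          $,
          metaCache: [],
          result: { content: '<p>Page one.</p>', title: 'Example' },
          Extractor: { domain: 'example.com' },
          title: 'Example',
          url: 'http://example.com/article/1',
        });

        // Appended pages are labeled with an <h4>Page N</h4> marker.
        assert.ok(result.content.indexOf('Page 2') !== -1);
      });
    });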