diff --git a/TODO.md b/TODO.md
index 6f38ba0d..3a94a6dc 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,4 +1,11 @@
TODO:
+- Complete response:
+ - add canonicalUrl
+ - add excerpt
+ - add domain
+ - add word count
+ - add total pages
+ - add rendered pages
- Test if .is method is faster than regex methods
DONE:
diff --git a/package.json b/package.json
index 89312cad..5b29845d 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
"scripts": {
"start": "node ./build",
"lint": "eslint src/** --fix",
- "build": "eslint src/** && rollup -c",
+ "build": "eslint src/** --fix && rollup -c",
"test_build": "rollup -c",
"test": "./test-runner"
},
diff --git a/src/extractors/collect-all-pages.js b/src/extractors/collect-all-pages.js
new file mode 100644
index 00000000..4ba503c5
--- /dev/null
+++ b/src/extractors/collect-all-pages.js
@@ -0,0 +1,56 @@
+import 'babel-polyfill';
+import { removeAnchor } from 'utils/text';
+import Resource from 'resource';
+
+import Iris from '../iris';
+
+export default async function collectAllPages(
+ {
+ nextPageUrl,
+ html,
+ $,
+ metaCache,
+ result,
+ Extractor,
+ title,
+ url,
+ }
+) {
+ let pages = 2;
+ const previousUrls = [removeAnchor(url)];
+ // Stop before fetching what would be the 26th page (pages starts
+ // at 2): more than ~25 pages likely means next-page detection broke.
+ while (nextPageUrl && pages < 26) {
+ $ = await Resource.create(nextPageUrl);
+ html = $.html();
+
+ const extractorOpts = {
+ url: nextPageUrl,
+ html,
+ $,
+ metaCache,
+ contentOnly: true,
+ extractedTitle: title,
+ previousUrls,
+ };
+
+ const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);
+
+ previousUrls.push(nextPageUrl);
+ result = {
+ ...result,
+ content: `
+ ${result.content}
+
+ Page ${pages}
+ ${nextPageResult.content}
+ `,
+ };
+
+ nextPageUrl = nextPageResult.nextPageUrl;
+
+ pages += 1;
+ }
+
+ return result;
+}
diff --git a/src/extractors/collect-all-pages.test.js b/src/extractors/collect-all-pages.test.js
new file mode 100644
index 00000000..e6f9ea5b
--- /dev/null
+++ b/src/extractors/collect-all-pages.test.js
@@ -0,0 +1,12 @@
+// import assert from 'assert';
+// import fs from 'fs';
+// import cheerio from 'cheerio';
+//
+// import collectAllPages from './collect-all-pages';
+//
+// describe('collectAllPages(opts)', () => {
+// it('fetches additional pages', () => {
+// const html = fs.readFileSync('./fixtures/ars.html');
+// const $ = cheerio.load(html);
+// });
+// });
diff --git a/src/iris.js b/src/iris.js
index e9594ef4..c6f54d62 100644
--- a/src/iris.js
+++ b/src/iris.js
@@ -1,34 +1,33 @@
import Resource from 'resource';
import getExtractor from 'extractors/get-extractor';
import RootExtractor from 'extractors/root-extractor';
-import { removeAnchor } from 'utils/text';
+import collectAllPages from 'extractors/collect-all-pages';
const Iris = {
async parse(url, html, opts = {}) {
const { fetchAllPages = true } = opts || true;
- const $ = await Resource.create(url, html);
- html = $.html();
-
const Extractor = getExtractor(url);
console.log(`Using extractor for ${Extractor.domain}`);
+ const $ = await Resource.create(url, html);
+ html = $.html();
+
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
- const extractorOpts = { url, html, $, metaCache };
- let result = RootExtractor.extract(Extractor, extractorOpts);
+ let result = this.runExtraction(Extractor, { url, html, $, metaCache });
const { title, nextPageUrl } = result;
if (fetchAllPages && nextPageUrl) {
- result = await this.collectAllPages(
+ result = await collectAllPages(
{
+ Extractor,
nextPageUrl,
html,
$,
metaCache,
result,
- Extractor,
title,
url,
}
@@ -38,50 +37,10 @@ const Iris = {
return result;
},
- async collectAllPages({
- nextPageUrl,
- html,
- $,
- metaCache,
- result,
- Extractor,
- title,
- url,
- }) {
- let pages = 2;
- const previousUrls = [removeAnchor(url)];
- while (nextPageUrl && pages < 26) {
- $ = await Resource.create(nextPageUrl);
- html = $.html();
- const extractorOpts = { url: nextPageUrl, html, $, metaCache };
- const nextPageResult = RootExtractor.extract(
- Extractor,
- {
- ...extractorOpts,
- url: nextPageUrl,
- contentOnly: true,
- extractedTitle: title,
- previousUrls,
- }
- );
-
- previousUrls.push(nextPageUrl);
- result = {
- ...result,
- content: `
- ${result.content}
-
- Page ${pages}
- ${nextPageResult.content}
- `,
- };
-
- nextPageUrl = nextPageResult.nextPageUrl;
-
- pages += 1;
- }
- return result;
+ runExtraction(Extractor, opts) {
+ return RootExtractor.extract(Extractor, opts);
},
+
};
export default Iris;