From 7e2a34945f2b394a5ba4075826ab60d1cc4b3aea Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Tue, 13 Sep 2016 15:22:27 -0400 Subject: [PATCH] chore: refactored and linted --- .eslintignore | 1 + .eslintrc | 39 ++ package.json | 13 +- score-move | 21 + src/cleaners/author.js | 4 +- src/cleaners/author.test.js | 20 +- src/cleaners/constants.js | 38 +- src/cleaners/content.js | 30 +- src/cleaners/content.test.js | 48 +- src/cleaners/date-published.js | 32 +- src/cleaners/date-published.test.js | 59 +- src/cleaners/dek.js | 13 +- src/cleaners/dek.test.js | 62 +- src/cleaners/fixtures/html.js | 10 +- src/cleaners/index.js | 30 +- src/cleaners/lead-image-url.js | 10 +- src/cleaners/lead-image-url.test.js | 24 +- src/cleaners/resolve-split-title.js | 106 ++-- src/cleaners/resolve-split-title.test.js | 35 +- src/cleaners/title.js | 15 +- src/cleaners/title.test.js | 37 +- src/extractors/all.js | 11 +- src/extractors/constants.js | 2 +- src/extractors/custom/blogspot.com/index.js | 16 +- src/extractors/custom/nymag.com/index.js | 26 +- src/extractors/custom/wikipedia.org/index.js | 10 +- src/extractors/generic/author/constants.js | 30 +- src/extractors/generic/author/extractor.js | 39 +- .../generic/author/extractor.test.js | 52 +- .../generic/author/fixtures/html.js | 10 +- .../generic/content/extract-best-node.js | 21 +- .../generic/content/extract-best-node.test.js | 28 +- src/extractors/generic/content/extractor.js | 69 +- .../generic/content/extractor.test.js | 34 +- .../generic/content/scoring/add-score.js | 14 +- .../generic/content/scoring/add-score.test.js | 35 +- .../generic/content/scoring/add-to-parent.js | 8 +- .../content/scoring/add-to-parent.test.js | 27 +- .../generic/content/scoring/constants.js | 490 +++++++------- .../content/scoring/find-top-candidate.js | 108 +--- .../scoring/find-top-candidate.test.js | 68 +- .../content/scoring/fixtures/get-weight.js | 46 +- .../generic/content/scoring/fixtures/html.js | 4 +- .../content/scoring/get-or-init-score.js | 23 +- .../content/scoring/get-or-init-score.test.js | 78 +-- .../generic/content/scoring/get-score.js | 2 +- .../generic/content/scoring/get-score.test.js | 35 +- .../generic/content/scoring/get-weight.js | 27 +- .../content/scoring/get-weight.test.js | 95 ++- .../generic/content/scoring/index.js | 24 +- .../generic/content/scoring/merge-siblings.js | 79 +++ .../generic/content/scoring/score-commas.js | 2 +- .../content/scoring/score-commas.test.js | 26 +- .../generic/content/scoring/score-content.js | 124 ++-- .../content/scoring/score-content.test.js | 58 +- .../generic/content/scoring/score-length.js | 19 +- .../content/scoring/score-length.test.js | 31 +- .../generic/content/scoring/score-node.js | 18 +- .../content/scoring/score-node.test.js | 125 ++-- .../content/scoring/score-paragraph.js | 18 +- .../content/scoring/score-paragraph.test.js | 62 +- .../generic/content/scoring/set-score.js | 5 +- .../generic/content/scoring/set-score.test.js | 27 +- .../generic/date-published/constants.js | 46 +- .../generic/date-published/extractor.js | 39 +- .../generic/date-published/extractor.test.js | 92 ++- .../generic/date-published/fixtures/html.js | 8 +- src/extractors/generic/dek/extractor.js | 31 +- src/extractors/generic/dek/extractor.test.js | 22 +- src/extractors/generic/index.js | 50 +- src/extractors/generic/index.test.js | 49 +- .../generic/lead-image-url/constants.js | 82 +-- .../generic/lead-image-url/extractor.js | 87 ++- .../generic/lead-image-url/extractor.test.js | 60 +- .../generic/lead-image-url/fixtures/html.js | 
12 +- .../generic/lead-image-url/score-image.js | 86 +-- .../lead-image-url/score-image.test.js | 163 +++-- .../generic/next-page-url/extractor.js | 45 +- .../generic/next-page-url/extractor.test.js | 40 +- .../next-page-url/scoring/constants.js | 42 +- .../next-page-url/scoring/score-links.js | 306 ++------- .../next-page-url/scoring/score-links.test.js | 235 +------ .../next-page-url/scoring/utils/index.js | 10 + .../scoring/utils/score-base-url.js | 11 + .../scoring/utils/score-base-url.test.js | 23 + .../scoring/utils/score-by-parents.js | 52 ++ .../scoring/utils/score-by-parents.test.js | 35 + .../scoring/utils/score-cap-links.js | 19 + .../scoring/utils/score-cap-links.test.js | 18 + .../scoring/utils/score-extraneous-links.js | 10 + .../utils/score-extraneous-links.test.js | 18 + .../scoring/utils/score-link-text.js | 30 + .../scoring/utils/score-link-text.test.js | 22 + .../scoring/utils/score-next-link-text.js | 10 + .../utils/score-next-link-text.test.js | 18 + .../scoring/utils/score-page-in-link.js | 10 + .../scoring/utils/score-page-in-link.test.js | 18 + .../scoring/utils/score-prev-link.js | 11 + .../scoring/utils/score-prev-link.test.js | 18 + .../scoring/utils/score-similarity.js | 23 + .../scoring/utils/score-similarity.test.js | 22 + .../scoring/utils/should-score.js | 55 ++ .../scoring/utils/should-score.test.js | 28 + src/extractors/generic/title/constants.js | 22 +- src/extractors/generic/title/extractor.js | 41 +- .../generic/title/extractor.test.js | 50 +- src/extractors/generic/title/fixtures/html.js | 12 +- src/extractors/generic/title/utils/index.js | 4 +- src/extractors/get-extractor.js | 14 +- src/extractors/get-extractor.test.js | 24 +- src/extractors/index.js | 4 +- src/extractors/root-extractor.js | 210 +++--- src/extractors/root-extractor.test.js | 151 ++--- src/iris.js | 135 ++-- src/iris.test.js | 43 +- src/resource/index.js | 100 ++- src/resource/index.test.js | 62 +- src/resource/utils/constants.js | 31 +- src/resource/utils/dom/clean.js | 24 +- src/resource/utils/dom/clean.test.js | 32 +- src/resource/utils/dom/constants.js | 6 +- .../utils/dom/convert-lazy-loaded-images.js | 14 +- .../dom/convert-lazy-loaded-images.test.js | 48 +- src/resource/utils/dom/index.js | 6 +- src/resource/utils/dom/normalize-meta-tags.js | 30 +- .../utils/dom/normalize-meta-tags.test.js | 32 +- src/resource/utils/fetch-resource.js | 107 ++-- src/resource/utils/fetch-resource.test.js | 116 ++-- src/resource/utils/index.js | 2 +- src/test-helpers.js | 7 +- src/utils/dom/brs-to-ps.js | 18 +- src/utils/dom/brs-to-ps.test.js | 60 +- src/utils/dom/clean-attributes.js | 43 +- src/utils/dom/clean-attributes.test.js | 32 +- src/utils/dom/clean-h-ones.js | 12 +- src/utils/dom/clean-h-ones.test.js | 35 +- src/utils/dom/clean-headers.js | 35 +- src/utils/dom/clean-headers.test.js | 41 +- src/utils/dom/clean-images.js | 38 +- src/utils/dom/clean-images.test.js | 43 +- src/utils/dom/clean-tags.js | 137 ++-- src/utils/dom/clean-tags.test.js | 92 ++- src/utils/dom/constants.js | 596 ++++++++---------- src/utils/dom/convert-node-to.js | 6 +- src/utils/dom/convert-node-to.test.js | 24 +- src/utils/dom/convert-to-paragraphs.js | 65 +- src/utils/dom/convert-to-paragraphs.test.js | 27 +- src/utils/dom/extract-from-meta.js | 64 +- src/utils/dom/extract-from-meta.test.js | 36 +- src/utils/dom/extract-from-selectors.js | 81 +-- src/utils/dom/extract-from-selectors.test.js | 45 +- .../dom/fixtures/extract-from-selectors.js | 12 +- src/utils/dom/fixtures/html.js | 46 +- 
src/utils/dom/fixtures/node-is-sufficient.js | 6 +- src/utils/dom/index.js | 42 +- src/utils/dom/is-wordpress.js | 4 +- src/utils/dom/is-wordpress.test.js | 30 +- src/utils/dom/link-density.js | 25 +- src/utils/dom/link-density.test.js | 39 +- src/utils/dom/make-links-absolute.js | 22 +- src/utils/dom/make-links-absolute.test.js | 56 +- src/utils/dom/node-is-sufficient.js | 2 +- src/utils/dom/node-is-sufficient.test.js | 32 +- src/utils/dom/paragraphize.js | 35 +- src/utils/dom/paragraphize.test.js | 43 +- src/utils/dom/remove-empty.js | 14 +- src/utils/dom/remove-empty.test.js | 44 +- src/utils/dom/rewrite-top-level.js | 8 +- src/utils/dom/rewrite-top-level.test.js | 22 +- src/utils/dom/strip-junk-tags.js | 8 +- src/utils/dom/strip-junk-tags.test.js | 24 +- src/utils/dom/strip-tags.js | 4 +- src/utils/dom/strip-tags.test.js | 24 +- src/utils/dom/strip-unlikely-candidates.js | 31 +- .../dom/strip-unlikely-candidates.test.js | 54 +- src/utils/dom/within-comment.js | 12 +- src/utils/dom/within-comment.test.js | 28 +- src/utils/index.js | 2 +- src/utils/range.js | 2 +- src/utils/text/article-base-url.js | 80 +-- src/utils/text/article-base-url.test.js | 23 +- src/utils/text/constants.js | 8 +- src/utils/text/extract-from-url.js | 10 +- src/utils/text/extract-from-url.test.js | 31 +- src/utils/text/fixtures/html.js | 46 +- src/utils/text/has-sentence-end.js | 7 + src/utils/text/index.js | 12 +- src/utils/text/normalize-spaces.js | 4 +- src/utils/text/normalize-spaces.test.js | 21 +- src/utils/text/page-num-from-url.js | 10 +- src/utils/text/page-num-from-url.test.js | 58 +- src/utils/text/remove-anchor.js | 2 +- src/utils/text/remove-anchor.test.js | 23 +- 193 files changed, 4177 insertions(+), 4315 deletions(-) create mode 100644 .eslintignore create mode 100644 .eslintrc create mode 100755 score-move create mode 100644 src/extractors/generic/content/scoring/merge-siblings.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/index.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-base-url.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-base-url.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-by-parents.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-cap-links.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-link-text.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-link-text.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-prev-link.test.js create mode 100644 
src/extractors/generic/next-page-url/scoring/utils/score-similarity.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/score-similarity.test.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/should-score.js create mode 100644 src/extractors/generic/next-page-url/scoring/utils/should-score.test.js create mode 100644 src/utils/text/has-sentence-end.js diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 00000000..b0c38463 --- /dev/null +++ b/.eslintignore @@ -0,0 +1 @@ +**/fixtures/* diff --git a/.eslintrc b/.eslintrc new file mode 100644 index 00000000..6011b843 --- /dev/null +++ b/.eslintrc @@ -0,0 +1,39 @@ +// Use this file as a starting point for your project's .eslintrc. +// Copy this file, and add rule overrides as needed. +{ + "parser": "babel-eslint", + "extends": "airbnb", + "plugins": [ + "babel" + ], + "globals": { + /* mocha */ + "describe": true, + "it": true + }, + "rules": { + "no-param-reassign": 0, + /* TODO fix this; this should work w/import/resolver below, but doesn't */ + "import/no-extraneous-dependencies": 0, + "import/no-unresolved": 0, + "no-control-regex": 0, + "import/prefer-default-export": 0, + "generator-star-spacing": 0, + "babel/generator-star-spacing": 0, + "func-names": 0, + "no-useless-escape": 0, + "no-confusing-arrow": 0 + }, + "settings": { + "import/resolver": { + "babel-module": { + "extensions": [".js"] + } + } + }, + "parserOptions":{ + "ecmaFeatures": { + "experimentalObjectRestSpread": true + } + } +} diff --git a/package.json b/package.json index aa8aaac1..d0103ef1 100644 --- a/package.json +++ b/package.json @@ -5,14 +5,17 @@ "main": "index.js", "scripts": { "start": "node ./build", - "build": "rollup -c", + "lint": "eslint src/**", + "build": "eslint src/** && rollup -c", "test": "./test-runner" }, "author": "", "license": "ISC", "devDependencies": { + "babel-eslint": "^6.1.2", "babel-plugin-external-helpers": "^6.8.0", "babel-plugin-module-alias": "^1.6.0", + "babel-plugin-module-resolver": "^2.2.0", "babel-plugin-transform-async-to-generator": "^6.8.0", "babel-plugin-transform-es2015-destructuring": "^6.9.0", "babel-plugin-transform-object-rest-spread": "^6.8.0", @@ -21,6 +24,14 @@ "babel-preset-es2015-rollup": "^1.2.0", "babel-register": "^6.11.6", "babelrc-rollup": "^3.0.0", + "eslint": "^3.5.0", + "eslint-config-airbnb": "^11.1.0", + "eslint-import-resolver-babel-module": "^2.0.1", + "eslint-plugin-async": "^0.1.1", + "eslint-plugin-babel": "^3.3.0", + "eslint-plugin-import": "^1.15.0", + "eslint-plugin-jsx-a11y": "^2.2.2", + "eslint-plugin-react": "^6.2.1", "mocha": "^3.0.2", "rollup": "^0.34.13", "rollup-plugin-babel": "^2.6.1", diff --git a/score-move b/score-move new file mode 100755 index 00000000..50182a1a --- /dev/null +++ b/score-move @@ -0,0 +1,21 @@ +#!/usr/local/bin/fish + +set file $argv[1] +set function $argv[2] + +touch src/extractors/generic/next-page-url/scoring/utils/index.js +touch src/extractors/generic/next-page-url/scoring/utils/$file.js +touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js + +echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js +echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js +echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js +echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js +echo "export { default as $function } from './$file'" >> 
src/extractors/generic/next-page-url/scoring/utils/index.js + +echo "Now make it a default export" +echo "Move it to its file" +echo "Move its tests to its test file" +echo "import in score-links" +echo "Test it." + diff --git a/src/cleaners/author.js b/src/cleaners/author.js index 58cff5b0..f0a67096 100644 --- a/src/cleaners/author.js +++ b/src/cleaners/author.js @@ -1,7 +1,7 @@ -import { CLEAN_AUTHOR_RE } from './constants' +import { CLEAN_AUTHOR_RE } from './constants'; // Take an author string (like 'By David Smith ') and clean it to // just the name(s): 'David Smith'. export default function cleanAuthor(author) { - return author.replace(CLEAN_AUTHOR_RE, '$2').trim() + return author.replace(CLEAN_AUTHOR_RE, '$2').trim(); } diff --git a/src/cleaners/author.test.js b/src/cleaners/author.test.js index 4407effb..e7e881e3 100644 --- a/src/cleaners/author.test.js +++ b/src/cleaners/author.test.js @@ -1,21 +1,21 @@ -import assert from 'assert' +import assert from 'assert'; -import cleanAuthor from './author' +import cleanAuthor from './author'; describe('cleanAuthor(author)', () => { it('removes the By from an author string', () => { - const author = cleanAuthor('By Bob Dylan') + const author = cleanAuthor('By Bob Dylan'); - assert.equal(author, 'Bob Dylan') - }) + assert.equal(author, 'Bob Dylan'); + }); it('trims trailing whitespace and line breaks', () => { const text = ` written by Bob Dylan - ` - const author = cleanAuthor(text) + `; + const author = cleanAuthor(text); - assert.equal(author, 'Bob Dylan') - }) -}) + assert.equal(author, 'Bob Dylan'); + }); +}); diff --git a/src/cleaners/constants.js b/src/cleaners/constants.js index 54557dc7..c2b4dd50 100644 --- a/src/cleaners/constants.js +++ b/src/cleaners/constants.js @@ -1,9 +1,9 @@ // CLEAN AUTHOR CONSTANTS -export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i +export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)', // CLEAN DEK CONSTANTS -export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i') +export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks. // From most distinct to least distinct. // @@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i') // However, these tags often have SEO-specific junk in them that's not // header-worthy like a dek is. Excerpt material at best. export const DEK_META_TAGS = [ -] +]; // An ordered list of Selectors to find likely article deks. From // most explicit to least explicit. @@ -23,18 +23,36 @@ export const DEK_META_TAGS = [ // detrimental to the aesthetics of an article. 
export const DEK_SELECTORS = [ '.entry-summary', -] +]; // CLEAN DATE PUBLISHED CONSTANTS -export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i -export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i -export const TIME_MERIDIAN_DOTS_RE = /\.m\./i -export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}(\s?[ap]\.?m\.?)?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig +export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i; +export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i; +export const TIME_MERIDIAN_DOTS_RE = /\.m\./i; +const months = [ + 'jan', + 'feb', + 'mar', + 'apr', + 'may', + 'jun', + 'jul', + 'aug', + 'sep', + 'oct', + 'nov', + 'dec', +]; +const allMonths = months.join('|'); +const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?'; +const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}'; +export const SPLIT_DATE_STRING = + new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig'); // CLEAN TITLE CONSTANTS // A regular expression that will match separating characters on a // title, that usually denote breadcrumbs or something similar. -export const TITLE_SPLITTERS_RE = /(: | - | \| )/g +export const TITLE_SPLITTERS_RE = /(: | - | \| )/g; export const DOMAIN_ENDINGS_RE = - new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g') + new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g'); diff --git a/src/cleaners/content.js b/src/cleaners/content.js index 76d7310a..00cd939f 100644 --- a/src/cleaners/content.js +++ b/src/cleaners/content.js @@ -8,54 +8,52 @@ import { rewriteTopLevel, stripJunkTags, makeLinksAbsolute, -} from 'utils/dom' - -import { convertNodeTo } from 'utils/dom' +} from 'utils/dom'; // Clean our article content, returning a new, cleaned node. export default function extractCleanNode( article, { $, - cleanConditionally=true, - title='', - url='', + cleanConditionally = true, + title = '', + url = '', } ) { // Rewrite the tag name to div if it's a top level node like body or // html to avoid later complications with multiple body tags. - rewriteTopLevel(article, $) + rewriteTopLevel(article, $); // Drop small images and spacer images - cleanImages(article, $) + cleanImages(article, $); // Drop certain tags like <aside>, etc // This is -mostly- for cleanliness, not security. - stripJunkTags(article, $) + stripJunkTags(article, $); // H1 tags are typically the article title, which should be extracted // by the title extractor instead. If there's less than 3 of them (<3), // strip them. Otherwise, turn 'em into H2s. - cleanHOnes(article, $) + cleanHOnes(article, $); // Clean headers - cleanHeaders(article, $, title) + cleanHeaders(article, $, title); // Make links absolute - makeLinksAbsolute(article, $, url) + makeLinksAbsolute(article, $, url); // Remove style or align attributes - cleanAttributes(article, $) + cleanAttributes(article); // We used to clean UL's and OL's here, but it was leading to // too many in-article lists being removed. Consider a better // way to detect menus particularly and remove them. 
- cleanTags(article, $, cleanConditionally) + cleanTags(article, $, cleanConditionally); // Remove empty paragraph nodes - removeEmpty(article, $) + removeEmpty(article, $); - return article + return article; } // headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6') // for header in headers: diff --git a/src/cleaners/content.test.js b/src/cleaners/content.test.js index 68f6f346..c12b7bbf 100644 --- a/src/cleaners/content.test.js +++ b/src/cleaners/content.test.js @@ -1,32 +1,32 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' +import assert from 'assert'; +import cheerio from 'cheerio'; +import fs from 'fs'; -import extractCleanNode from './content' -import extractBestNode from 'extractors/generic/content/extract-best-node' +import extractBestNode from 'extractors/generic/content/extract-best-node'; +import extractCleanNode from './content'; describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => { - it("cleans cruft out of a DOM node", () => { - const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') - let $ = cheerio.load(html) + it('cleans cruft out of a DOM node', () => { + const html = fs.readFileSync('./fixtures/wired.html', 'utf-8'); + const $ = cheerio.load(html); const opts = { - stripUnlikelyCandidates: true, - weightNodes: true, - cleanConditionally: true, - } + stripUnlikelyCandidates: true, + weightNodes: true, + cleanConditionally: true, + }; - const bestNode = extractBestNode($, opts) - let result = $.html(bestNode) - // console.log(result) - // console.log(result.length) - const cleanNode = extractCleanNode(bestNode, { $, opts }) - result = $.html(cleanNode) - // console.log(result.length) - // console.log(result) - // console.log(bestNode.html()) + const bestNode = extractBestNode($, opts); + // let result = $.html(bestNode); + // // console.log(result) + // // console.log(result.length) + const cleanNode = extractCleanNode(bestNode, { $, opts }); + // result = $.html(cleanNode); + // // console.log(result.length) + // // console.log(result) + // // console.log(bestNode.html()) - assert.equal($(bestNode).text().length, 2687) - }) -}) + assert.equal($(cleanNode).text().length, 2687); + }); +}); diff --git a/src/cleaners/date-published.js b/src/cleaners/date-published.js index 7347222d..c8e196e2 100644 --- a/src/cleaners/date-published.js +++ b/src/cleaners/date-published.js @@ -1,4 +1,4 @@ -import moment from 'moment' +import moment from 'moment'; // Is there a compelling reason to use moment here? // Mostly only being used for the isValid() method, // but could just check for 'Invalid Date' string. @@ -7,27 +7,27 @@ import { CLEAN_DATE_STRING_RE, SPLIT_DATE_STRING, TIME_MERIDIAN_SPACE_RE, - TIME_MERIDIAN_DOTS_RE -} from './constants' + TIME_MERIDIAN_DOTS_RE, +} from './constants'; + +export function cleanDateString(dateString) { + return (dateString.match(SPLIT_DATE_STRING) || []) + .join(' ') + .replace(TIME_MERIDIAN_DOTS_RE, 'm') + .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3') + .replace(CLEAN_DATE_STRING_RE, '$1') + .trim(); +} // Take a date published string, and hopefully return a date out of // it. Return none if we fail. export default function cleanDatePublished(dateString) { - let date = moment(new Date(dateString)) + let date = moment(new Date(dateString)); if (!date.isValid()) { - dateString = cleanDateString(dateString) - date = moment(new Date(dateString)) + dateString = cleanDateString(dateString); + date = moment(new Date(dateString)); } - return date.isValid() ? 
date.toISOString() : null -} - -export function cleanDateString(dateString) { - return (dateString.match(SPLIT_DATE_STRING) || []) - .join(' ') - .replace(TIME_MERIDIAN_DOTS_RE, 'm') - .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3') - .replace(CLEAN_DATE_STRING_RE, '$1') - .trim() + return date.isValid() ? date.toISOString() : null; } diff --git a/src/cleaners/date-published.test.js b/src/cleaners/date-published.test.js index 4c254a28..fe955d8a 100644 --- a/src/cleaners/date-published.test.js +++ b/src/cleaners/date-published.test.js @@ -1,67 +1,62 @@ -import assert from 'assert' +import assert from 'assert'; import { default as cleanDatePublished, cleanDateString, -} from './date-published' +} from './date-published'; describe('cleanDatePublished(dateString)', () => { it('returns a date object', () => { - const datePublished = cleanDatePublished('published: 1/1/2020') + const datePublished = cleanDatePublished('published: 1/1/2020'); assert.equal( datePublished, new Date('1/1/2020').toISOString() - ) - }) + ); + }); it('returns null if date is invalid', () => { - const datePublished = cleanDatePublished('blargh') + const datePublished = cleanDatePublished('blargh'); - assert.equal(datePublished, null) - }) - -}) + assert.equal(datePublished, null); + }); +}); describe('cleanDateString(dateString)', () => { it('removes "published" text from an datePublished string', () => { - const datePublished = cleanDateString('published: 1/1/2020') + const datePublished = cleanDateString('published: 1/1/2020'); - assert.equal(datePublished, '1/1/2020') - }) + assert.equal(datePublished, '1/1/2020'); + }); it('trims whitespace', () => { - const datePublished = cleanDateString(' 1/1/2020 ') + const datePublished = cleanDateString(' 1/1/2020 '); - assert.equal(datePublished, '1/1/2020') - }) + assert.equal(datePublished, '1/1/2020'); + }); it('puts a space b/w a time and am/pm', () => { // The JS date parser is forgiving, but // it needs am/pm separated from a time - const date1 = cleanDateString('1/1/2020 8:30am') - assert.equal(date1, '1/1/2020 8:30 am') + const date1 = cleanDateString('1/1/2020 8:30am'); + assert.equal(date1, '1/1/2020 8:30 am'); - const date2 = cleanDateString('8:30PM 1/1/2020') - assert.equal(date2, '8:30 PM 1/1/2020') - }) + const date2 = cleanDateString('8:30PM 1/1/2020'); + assert.equal(date2, '8:30 PM 1/1/2020'); + }); it('cleans the dots from a.m. or p.m.', () => { // The JS date parser is forgiving, but // it needs a.m./p.m. 
without dots - const date1 = cleanDateString('1/1/2020 8:30 a.m.') - assert.equal(date1, '1/1/2020 8:30 am') - }) + const date1 = cleanDateString('1/1/2020 8:30 a.m.'); + assert.equal(date1, '1/1/2020 8:30 am'); + }); it('can handle some tough timestamps', () => { // The JS date parser is forgiving, but // it needs am/pm separated from a time - const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.') - assert.equal(date1, '15 Apr 2016 10:59') - - const date2 = cleanDateString('8:30PM 1/1/2020') - assert.equal(date2, '8:30 PM 1/1/2020') - }) - -}) + const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.'); + assert.equal(date1, '15 Apr 2016 10:59'); + }); +}); diff --git a/src/cleaners/dek.js b/src/cleaners/dek.js index 7ddbdf65..8686120b 100644 --- a/src/cleaners/dek.js +++ b/src/cleaners/dek.js @@ -1,17 +1,18 @@ -import { TEXT_LINK_RE } from './constants' -import { stripTags } from 'utils/dom' +import { stripTags } from 'utils/dom'; + +import { TEXT_LINK_RE } from './constants'; // Take a dek HTML fragment, and return the cleaned version of it. // Return None if the dek wasn't good enough. export default function cleanDek(dek, { $ }) { // Sanity check that we didn't get too short or long of a dek. - if (dek.length > 1000 || dek.length < 5) return null + if (dek.length > 1000 || dek.length < 5) return null; - const dekText = stripTags(dek, $) + const dekText = stripTags(dek, $); // Plain text links shouldn't exist in the dek. If we have some, it's // not a good dek - bail. - if (TEXT_LINK_RE.test(dekText)) return null + if (TEXT_LINK_RE.test(dekText)) return null; - return dekText.trim() + return dekText.trim(); } diff --git a/src/cleaners/dek.test.js b/src/cleaners/dek.test.js index eaa4fc2f..39aef99e 100644 --- a/src/cleaners/dek.test.js +++ b/src/cleaners/dek.test.js @@ -1,52 +1,50 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import { - default as cleanDek, - cleanDekString, -} from './dek' +import cleanDek from './dek'; describe('cleanDek(dekString, { $ })', () => { it('returns null if the dek is < 5 chars', () => { - const $ = cheerio.load('<div></div>') - assert.equal(cleanDek('Hi', { $ }), null) - }) + const $ = cheerio.load('<div></div>'); + assert.equal(cleanDek('Hi', { $ }), null); + }); it('returns null if the dek is > 1000 chars', () => { - const $ = cheerio.load('<div></div>') + const $ = cheerio.load('<div></div>'); const longDek = // generate a string that is 1,280 chars - [0,1,2,3,4,5,6].reduce((acc, i) => - acc += acc, '0123456789' - ) - assert.equal(cleanDek(longDek, { $ }), null) - }) + [0, 1, 2, 3, 4, 5, 6].reduce((acc) => { + acc += acc; + return acc; + }, '0123456789'); + assert.equal(cleanDek(longDek, { $ }), null); + }); it('strip html tags from the dek', () => { - const $ = cheerio.load('<div></div>') - const dek = 'This is a <em>very</em> important dek.' 
+ const $ = cheerio.load('<div></div>'); + const dek = 'This is a <em>very</em> important dek.'; - assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.') - }) + assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.'); + }); it('returns null if dek contains plain text link', () => { - const $ = cheerio.load('<div></div>') - const dek = 'This has this link http://example.com/foo/bar' + const $ = cheerio.load('<div></div>'); + const dek = 'This has this link http://example.com/foo/bar'; - assert.equal(cleanDek(dek, { $ }), null) - }) + assert.equal(cleanDek(dek, { $ }), null); + }); it('returns a normal dek as is', () => { - const $ = cheerio.load('<div></div>') - const dek = 'This is the dek' + const $ = cheerio.load('<div></div>'); + const dek = 'This is the dek'; - assert.equal(cleanDek(dek, { $ }), dek) - }) + assert.equal(cleanDek(dek, { $ }), dek); + }); it('cleans extra whitespace', () => { - const $ = cheerio.load('<div></div>') - const dek = ' This is the dek ' + const $ = cheerio.load('<div></div>'); + const dek = ' This is the dek '; - assert.equal(cleanDek(dek, { $ }), 'This is the dek') - }) -}) + assert.equal(cleanDek(dek, { $ }), 'This is the dek'); + }); +}); diff --git a/src/cleaners/fixtures/html.js b/src/cleaners/fixtures/html.js index f7b776a9..a75cd793 100644 --- a/src/cleaners/fixtures/html.js +++ b/src/cleaners/fixtures/html.js @@ -1,5 +1,5 @@ const HTML = { - docWithH1: `<div><h1>This Is the Real Title</h1></div>`, + docWithH1: '<div><h1>This Is the Real Title</h1></div>', docWith2H1s: ` <div> <h1>This Is the Real Title</h1> @@ -7,9 +7,9 @@ const HTML = { </div> `, docWithTagsInH1: { - before: `<div><h1>This Is the <em>Real</em> Title</h1></div>`, - after: `This Is the Real Title` + before: '<div><h1>This Is the <em>Real</em> Title</h1></div>', + after: 'This Is the Real Title', }, -} +}; -export default HTML +export default HTML; diff --git a/src/cleaners/index.js b/src/cleaners/index.js index 11439970..ce1ab34c 100644 --- a/src/cleaners/index.js +++ b/src/cleaners/index.js @@ -1,9 +1,9 @@ -import cleanAuthor from './author' -import cleanImage from './lead-image-url' -import cleanDek from './dek' -import cleanDatePublished from './date-published' -import cleanContent from './content' -import cleanTitle from './title' +import cleanAuthor from './author'; +import cleanImage from './lead-image-url'; +import cleanDek from './dek'; +import cleanDatePublished from './date-published'; +import cleanContent from './content'; +import cleanTitle from './title'; const Cleaners = { author: cleanAuthor, @@ -12,15 +12,15 @@ const Cleaners = { datePublished: cleanDatePublished, content: cleanContent, title: cleanTitle, -} +}; -export default Cleaners +export default Cleaners; -export { cleanAuthor } -export { cleanImage } -export { cleanDek } -export { cleanDatePublished } -export { cleanContent } -export { cleanTitle } -export { default as resolveSplitTitle } from './resolve-split-title' +export { cleanAuthor }; +export { cleanImage }; +export { cleanDek }; +export { cleanDatePublished }; +export { cleanContent }; +export { cleanTitle }; +export { default as resolveSplitTitle } from './resolve-split-title'; diff --git a/src/cleaners/lead-image-url.js b/src/cleaners/lead-image-url.js index f33cd914..a61d11ef 100644 --- a/src/cleaners/lead-image-url.js +++ b/src/cleaners/lead-image-url.js @@ -1,10 +1,10 @@ -import validUrl from 'valid-url' +import validUrl from 'valid-url'; export default function clean(leadImageUrl) { - leadImageUrl = leadImageUrl.trim() + 
leadImageUrl = leadImageUrl.trim(); if (validUrl.isWebUri(leadImageUrl)) { - return leadImageUrl - } else { - return null + return leadImageUrl; } + + return null; } diff --git a/src/cleaners/lead-image-url.test.js b/src/cleaners/lead-image-url.test.js index 0ff85abe..90632c58 100644 --- a/src/cleaners/lead-image-url.test.js +++ b/src/cleaners/lead-image-url.test.js @@ -1,20 +1,20 @@ -import assert from 'assert' +import assert from 'assert'; -import clean from './lead-image-url' +import clean from './lead-image-url'; describe('clean(leadImageUrl)', () => { it('returns the url if valid', () => { - const url = 'https://example.com' - assert.equal(clean(url), url) - }) + const url = 'https://example.com'; + assert.equal(clean(url), url); + }); it('returns null if the url is not valid', () => { - const url = 'this is not a valid url' - assert.equal(clean(url), null) - }) + const url = 'this is not a valid url'; + assert.equal(clean(url), null); + }); it('trims whitespace', () => { - const url = ' https://example.com/foo/bar.jpg' - assert.equal(clean(url), url.trim()) - }) -}) + const url = ' https://example.com/foo/bar.jpg'; + assert.equal(clean(url), url.trim()); + }); +}); diff --git a/src/cleaners/resolve-split-title.js b/src/cleaners/resolve-split-title.js index 9654140d..7c393bca 100644 --- a/src/cleaners/resolve-split-title.js +++ b/src/cleaners/resolve-split-title.js @@ -1,34 +1,11 @@ -import URL from 'url' -import 'babel-polyfill' -import wuzzy from 'wuzzy' +import URL from 'url'; +import 'babel-polyfill'; +import wuzzy from 'wuzzy'; import { TITLE_SPLITTERS_RE, DOMAIN_ENDINGS_RE, -} from './constants' - -// Given a title with separators in it (colons, dashes, etc), -// resolve whether any of the segments should be removed. -export default function resolveSplitTitle(title, url='') { - // Splits while preserving splitters, like: - // ['The New New York', ' - ', 'The Washington Post'] - title = title - - let splitTitle = title.split(TITLE_SPLITTERS_RE) - if (splitTitle.length === 1) { - return title - } - - let newTitle = extractBreadcrumbTitle(splitTitle, title) - if (newTitle) return newTitle - - newTitle = cleanDomainFromTitle(splitTitle, url) - if (newTitle) return newTitle - - // Fuzzy ratio didn't find anything, so this title is probably legit. - // Just return it all. - return title -} +} from './constants'; function extractBreadcrumbTitle(splitTitle, text) { // This must be a very breadcrumbed title, like: @@ -38,40 +15,40 @@ function extractBreadcrumbTitle(splitTitle, text) { // Look to see if we can find a breadcrumb splitter that happens // more than once. If we can, we'll be able to better pull out // the title. - const termCounts = splitTitle.reduce((acc, text) => { - acc[text] = acc[text] ? acc[text] + 1 : 1 - return acc - }, {}) + const termCounts = splitTitle.reduce((acc, titleText) => { + acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1; + return acc; + }, {}); const [maxTerm, termCount] = Reflect.ownKeys(termCounts) .reduce((acc, key) => { if (acc[1] < termCounts[key]) { - return [key, termCounts[key]] - } else { - return acc + return [key, termCounts[key]]; } - }, [0, 0]) + + return acc; + }, [0, 0]); // We found a splitter that was used more than once, so it // is probably the breadcrumber. Split our title on that instead. // Note: max_term should be <= 4 characters, so that " >> " // will match, but nothing longer than that. 
if (termCount >= 2 && maxTerm.length <= 4) { - splitTitle = text.split(maxTerm) + splitTitle = text.split(maxTerm); } - const splitEnds = [splitTitle[0], splitTitle.slice(-1)] - const longestEnd = splitEnds.reduce((acc, end) => { - return acc.length > end.length ? acc : end - }, '') + const splitEnds = [splitTitle[0], splitTitle.slice(-1)]; + const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, ''); if (longestEnd.length > 10) { - return longestEnd - } else { - return text + return longestEnd; } + + return text; } + + return null; } function cleanDomainFromTitle(splitTitle, url) { @@ -81,20 +58,43 @@ function cleanDomainFromTitle(splitTitle, url) { // // Strip out the big TLDs - it just makes the matching a bit more // accurate. Not the end of the world if it doesn't strip right. - const { host } = URL.parse(url) - const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '') + const { host } = URL.parse(url); + const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, ''); - const startSlug = splitTitle[0].toLowerCase().replace(' ', '') - const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain) + const startSlug = splitTitle[0].toLowerCase().replace(' ', ''); + const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain); - if (startSlugRatio > .4 && startSlug.length > 5) { - return splitTitle.slice(2).join('') + if (startSlugRatio > 0.4 && startSlug.length > 5) { + return splitTitle.slice(2).join(''); } - const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '') - const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain) + const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', ''); + const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain); - if (endSlugRatio > .4 && endSlug.length >= 5) { - return splitTitle.slice(0, -2).join('') + if (endSlugRatio > 0.4 && endSlug.length >= 5) { + return splitTitle.slice(0, -2).join(''); } + + return null; +} + +// Given a title with separators in it (colons, dashes, etc), +// resolve whether any of the segments should be removed. +export default function resolveSplitTitle(title, url = '') { + // Splits while preserving splitters, like: + // ['The New New York', ' - ', 'The Washington Post'] + const splitTitle = title.split(TITLE_SPLITTERS_RE); + if (splitTitle.length === 1) { + return title; + } + + let newTitle = extractBreadcrumbTitle(splitTitle, title); + if (newTitle) return newTitle; + + newTitle = cleanDomainFromTitle(splitTitle, url); + if (newTitle) return newTitle; + + // Fuzzy ratio didn't find anything, so this title is probably legit. + // Just return it all. 
+ return title; } diff --git a/src/cleaners/resolve-split-title.test.js b/src/cleaners/resolve-split-title.test.js index 871d1191..5fee794c 100644 --- a/src/cleaners/resolve-split-title.test.js +++ b/src/cleaners/resolve-split-title.test.js @@ -1,32 +1,31 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; -import { resolveSplitTitle } from './index' +import { resolveSplitTitle } from './index'; describe('resolveSplitTitle(text)', () => { it('does nothing if title not splittable', () => { - const title = "This Is a Normal Title" + const title = 'This Is a Normal Title'; - assert.equal(resolveSplitTitle(title), title) - }) + assert.equal(resolveSplitTitle(title), title); + }); it('extracts titles from breadcrumb-like titles', () => { - const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com" + const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com'; - assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ") - }) + assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth '); + }); it('cleans domains from titles at the front', () => { - const title = "NYTimes - The Best Gadgets on Earth" - const url = "https://www.nytimes.com/bits/blog/etc/" + const title = 'NYTimes - The Best Gadgets on Earth'; + const url = 'https://www.nytimes.com/bits/blog/etc/'; - assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") - }) + assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); + }); it('cleans domains from titles at the back', () => { - const title = "The Best Gadgets on Earth | NYTimes" - const url = "https://www.nytimes.com/bits/blog/etc/" + const title = 'The Best Gadgets on Earth | NYTimes'; + const url = 'https://www.nytimes.com/bits/blog/etc/'; - assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") - }) -}) + assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); + }); +}); diff --git a/src/cleaners/title.js b/src/cleaners/title.js index 9328becd..a1fd2f9a 100644 --- a/src/cleaners/title.js +++ b/src/cleaners/title.js @@ -1,25 +1,26 @@ -import { TITLE_SPLITTERS_RE } from './constants' -import { resolveSplitTitle } from './index' -import { stripTags } from 'utils/dom' +import { stripTags } from 'utils/dom'; + +import { TITLE_SPLITTERS_RE } from './constants'; +import { resolveSplitTitle } from './index'; export default function cleanTitle(title, { url, $ }) { // If title has |, :, or - in it, see if // we can clean it up. if (TITLE_SPLITTERS_RE.test(title)) { - title = resolveSplitTitle(title, url) + title = resolveSplitTitle(title, url); } // Final sanity check that we didn't get a crazy title. 
// if (title.length > 150 || title.length < 15) { if (title.length > 150) { // If we did, return h1 from the document if it exists - const h1 = $('h1') + const h1 = $('h1'); if (h1.length === 1) { - title = h1.text() + title = h1.text(); } } // strip any html tags in the title text - return stripTags(title, $).trim() + return stripTags(title, $).trim(); } diff --git a/src/cleaners/title.test.js b/src/cleaners/title.test.js index c8a0c7a5..c99d3d05 100644 --- a/src/cleaners/title.test.js +++ b/src/cleaners/title.test.js @@ -1,8 +1,8 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { cleanTitle } from './index' +import HTML from './fixtures/html'; +import { cleanTitle } from './index'; describe('cleanTitle(title, { url, $ })', () => { it('uses a single h1 if the title is too short or too long', () => { @@ -10,28 +10,27 @@ describe('cleanTitle(title, { url, $ })', () => { // const $ = cheerio.load(HTML.docWithH1) // // assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text()) - }) + }); it('only uses h1 if there is only one on the page', () => { - const title = "Too Short" - const $ = cheerio.load(HTML.docWith2H1s) + const title = 'Too Short'; + const $ = cheerio.load(HTML.docWith2H1s); - assert.equal(cleanTitle(title, { url: '', $ }), title) - }) + assert.equal(cleanTitle(title, { url: '', $ }), title); + }); it('removes HTML tags from titles', () => { - const $ = cheerio.load(HTML.docWithTagsInH1.before) - const title = $('h1').html() + const $ = cheerio.load(HTML.docWithTagsInH1.before); + const title = $('h1').html(); - assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after) - }) + assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after); + }); it('trims extraneous spaces', () => { - const title = " This Is a Great Title That You'll Love " - const $ = cheerio.load(HTML.docWithTagsInH1.before) + const title = " This Is a Great Title That You'll Love "; + const $ = cheerio.load(HTML.docWithTagsInH1.before); - assert.equal(cleanTitle(title, { url: '', $ }), title.trim()) - }) - -}) + assert.equal(cleanTitle(title, { url: '', $ }), title.trim()); + }); +}); diff --git a/src/extractors/all.js b/src/extractors/all.js index 399c2466..6b26f28f 100644 --- a/src/extractors/all.js +++ b/src/extractors/all.js @@ -1,12 +1,11 @@ -import GenericExtractor from './generic' -import NYMagExtractor from './custom/nymag.com' -import BloggerExtractor from './custom/blogspot.com' -import WikipediaExtractor from './custom/wikipedia.org' +import NYMagExtractor from './custom/nymag.com'; +import BloggerExtractor from './custom/blogspot.com'; +import WikipediaExtractor from './custom/wikipedia.org'; const Extractors = { 'nymag.com': NYMagExtractor, 'blogspot.com': BloggerExtractor, 'wikipedia.org': WikipediaExtractor, -} +}; -export default Extractors +export default Extractors; diff --git a/src/extractors/constants.js b/src/extractors/constants.js index b6fc067b..f490a68e 100644 --- a/src/extractors/constants.js +++ b/src/extractors/constants.js @@ -1 +1 @@ -export const ATTR_RE = /\[([\w-]+)\]/ +export const ATTR_RE = /\[([\w-]+)\]/; diff --git a/src/extractors/custom/blogspot.com/index.js b/src/extractors/custom/blogspot.com/index.js index 20a294ae..8fa5a8a8 100644 --- a/src/extractors/custom/blogspot.com/index.js +++ b/src/extractors/custom/blogspot.com/index.js @@ -14,27 +14,27 @@ const BloggerExtractor = { // Convert the noscript tag to a div 
transforms: { - 'noscript': 'div' + noscript: 'div', }, }, author: { selectors: [ - '.post-author-name' - ] + '.post-author-name', + ], }, title: { selectors: [ 'h2.title', - ] + ], }, datePublished: { selectors: [ 'span.publishdate', - ] - } -} + ], + }, +}; -export default BloggerExtractor +export default BloggerExtractor; diff --git a/src/extractors/custom/nymag.com/index.js b/src/extractors/custom/nymag.com/index.js index d96a4bc6..c7622191 100644 --- a/src/extractors/custom/nymag.com/index.js +++ b/src/extractors/custom/nymag.com/index.js @@ -22,37 +22,39 @@ const NYMagExtractor = { // the transformation. transforms: { // Convert h1s to h2s - 'h1': 'h2', + h1: 'h2', // Convert lazy-loaded noscript images to figures - 'noscript': ($node) => { - const $children = $node.children() + noscript: ($node) => { + const $children = $node.children(); if ($children.length === 1 && $children.get(0).tagName === 'img') { - return 'figure' + return 'figure'; } - } - } + + return null; + }, + }, }, title: { selectors: [ 'h1.headline-primary', 'h1', - ] + ], }, author: { selectors: [ '.by-authors', - ] + ], }, datePublished: { selectors: [ 'time.article-timestamp[datetime]', 'time.article-timestamp', - ] - } -} + ], + }, +}; -export default NYMagExtractor +export default NYMagExtractor; diff --git a/src/extractors/custom/wikipedia.org/index.js b/src/extractors/custom/wikipedia.org/index.js index 73c07aca..a30ce35b 100644 --- a/src/extractors/custom/wikipedia.org/index.js +++ b/src/extractors/custom/wikipedia.org/index.js @@ -8,7 +8,7 @@ const WikipediaExtractor = { // transform top infobox to an image with caption transforms: { '.infobox img': ($node) => { - $node.parents('.infobox').prepend($node) + $node.parents('.infobox').prepend($node); }, '.infobox caption': 'figcaption', '.infobox': 'figure', @@ -28,15 +28,15 @@ const WikipediaExtractor = { title: { selectors: [ 'h2.title', - ] + ], }, datePublished: { selectors: [ '#footer-info-lastmod', - ] + ], }, -} +}; -export default WikipediaExtractor +export default WikipediaExtractor; diff --git a/src/extractors/generic/author/constants.js b/src/extractors/generic/author/constants.js index 942b101c..3b0b8d94 100644 --- a/src/extractors/generic/author/constants.js +++ b/src/extractors/generic/author/constants.js @@ -5,22 +5,22 @@ // Note: "author" is too often the -developer- of the page, so it is not // added here. export const AUTHOR_META_TAGS = [ - 'byl', - 'clmst', - 'dc.author', - 'dcsext.author', - 'dc.creator', - 'rbauthors', - 'authors', -] + 'byl', + 'clmst', + 'dc.author', + 'dcsext.author', + 'dc.creator', + 'rbauthors', + 'authors', +]; -export const AUTHOR_MAX_LENGTH = 300 +export const AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From // most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string -// exists in the className, which is not as accurate as .className (which +// exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. The // speed tradeoff is worth the accuracy hit. export const AUTHOR_SELECTORS = [ @@ -47,12 +47,12 @@ export const AUTHOR_SELECTORS = [ '.articleauthor', '.ArticleAuthor', '.byline', -] +]; // An ordered list of Selectors to find likely article authors, with // regular expression for content. 
-const byline_re = /^[\n\s]*By/i +const bylineRe = /^[\n\s]*By/i; export const BYLINE_SELECTORS_RE = [ - ['#byline', byline_re], - ['.byline', byline_re], -] + ['#byline', bylineRe], + ['.byline', bylineRe], +]; diff --git a/src/extractors/generic/author/extractor.js b/src/extractors/generic/author/extractor.js index 240d6e75..5a7c6cf3 100644 --- a/src/extractors/generic/author/extractor.js +++ b/src/extractors/generic/author/extractor.js @@ -1,49 +1,48 @@ +import { cleanAuthor } from 'cleaners'; +import { + extractFromMeta, + extractFromSelectors, +} from 'utils/dom'; + import { AUTHOR_META_TAGS, AUTHOR_MAX_LENGTH, AUTHOR_SELECTORS, BYLINE_SELECTORS_RE, -} from './constants' - -import { cleanAuthor } from 'cleaners' - -import { - extractFromMeta, - extractFromSelectors -} from 'utils/dom' +} from './constants'; const GenericAuthorExtractor = { extract({ $, metaCache }) { - let author + let author; // First, check to see if we have a matching // meta tag that we can make use of. - author = extractFromMeta($, AUTHOR_META_TAGS, metaCache) + author = extractFromMeta($, AUTHOR_META_TAGS, metaCache); if (author && author.length < AUTHOR_MAX_LENGTH) { - return cleanAuthor(author) + return cleanAuthor(author); } // Second, look through our selectors looking for potential authors. - author = extractFromSelectors($, AUTHOR_SELECTORS, 2) + author = extractFromSelectors($, AUTHOR_SELECTORS, 2); if (author && author.length < AUTHOR_MAX_LENGTH) { - return cleanAuthor(author) + return cleanAuthor(author); } // Last, use our looser regular-expression based selectors for // potential authors. for (const [selector, regex] of BYLINE_SELECTORS_RE) { - const node = $(selector) + const node = $(selector); if (node.length === 1) { - const text = node.text() + const text = node.text(); if (regex.test(text)) { - return cleanAuthor(text) + return cleanAuthor(text); } } } - return null - } -} + return null; + }, +}; -export default GenericAuthorExtractor +export default GenericAuthorExtractor; diff --git a/src/extractors/generic/author/extractor.test.js b/src/extractors/generic/author/extractor.test.js index f1df9107..fa522cf9 100644 --- a/src/extractors/generic/author/extractor.test.js +++ b/src/extractors/generic/author/extractor.test.js @@ -1,46 +1,46 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import GenericAuthorExtractor from './extractor' +import HTML from './fixtures/html'; +import GenericAuthorExtractor from './extractor'; describe('GenericAuthorExtractor', () => { describe('extract($, cachedMeta)', () => { it('extracts author from meta tags', () => { - const $ = cheerio.load(HTML.authorMeta.test) + const $ = cheerio.load(HTML.authorMeta.test); const result = GenericAuthorExtractor.extract( - { $, metaCache: ["dc.author", "something-else"] } - ) + { $, metaCache: ['dc.author', 'something-else'] } + ); - assert.equal(result, HTML.authorMeta.result) - }) + assert.equal(result, HTML.authorMeta.result); + }); it('extracts author from author selectors', () => { - const $ = cheerio.load(HTML.authorSelectors.test) + const $ = cheerio.load(HTML.authorSelectors.test); const result = GenericAuthorExtractor.extract( - { $, metaCache: ["dc.author", "something-else"] } - ) + { $, metaCache: ['dc.author', 'something-else'] } + ); - assert.equal(result, HTML.authorSelectors.result) - }) + assert.equal(result, HTML.authorSelectors.result); + }); it('extracts author with regex selectors', () => { - const $ 
= cheerio.load(HTML.authorRegSelectors.test) + const $ = cheerio.load(HTML.authorRegSelectors.test); const result = GenericAuthorExtractor.extract( - { $, metaCache: ["dc.author", "something-else"] } - ) + { $, metaCache: ['dc.author', 'something-else'] } + ); - assert.equal(result, HTML.authorRegSelectors.result) - }) + assert.equal(result, HTML.authorRegSelectors.result); + }); it('returns null if no author found', () => { - const $ = cheerio.load('<div></div>') + const $ = cheerio.load('<div></div>'); const result = GenericAuthorExtractor.extract( - { $, metaCache: ["dc.author", "something-else"] } - ) + { $, metaCache: ['dc.author', 'something-else'] } + ); - assert.equal(result, null) - }) - }) -}) + assert.equal(result, null); + }); + }); +}); diff --git a/src/extractors/generic/author/fixtures/html.js b/src/extractors/generic/author/fixtures/html.js index 499a0588..84ed985d 100644 --- a/src/extractors/generic/author/fixtures/html.js +++ b/src/extractors/generic/author/fixtures/html.js @@ -5,7 +5,7 @@ const HTML = { <meta name="dc.author" value="Adam" /> </html> `, - result: `Adam` + result: 'Adam', }, authorSelectors: { test: ` @@ -15,7 +15,7 @@ const HTML = { </div> </div> `, - result: `Adam` + result: 'Adam', }, authorRegSelectors: { test: ` @@ -25,8 +25,8 @@ const HTML = { </div> </div> `, - result: `Adam` + result: 'Adam', }, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/content/extract-best-node.js b/src/extractors/generic/content/extract-best-node.js index 10138124..2ac4b662 100644 --- a/src/extractors/generic/content/extract-best-node.js +++ b/src/extractors/generic/content/extract-best-node.js @@ -1,11 +1,12 @@ -import { - scoreContent, - findTopCandidate, -} from './scoring' import { stripUnlikelyCandidates, convertToParagraphs, -} from 'utils/dom' +} from 'utils/dom'; + +import { + scoreContent, + findTopCandidate, +} from './scoring'; // Using a variety of scoring techniques, extract the content most // likely to be article text. 
@@ -26,12 +27,12 @@ export default function extractBestNode($, opts) { if (opts.stripUnlikelyCandidates) { - $ = stripUnlikelyCandidates($) + $ = stripUnlikelyCandidates($); } - $ = convertToParagraphs($) - $ = scoreContent($, opts.weightNodes) - const $topCandidate = findTopCandidate($) + $ = convertToParagraphs($); + $ = scoreContent($, opts.weightNodes); + const $topCandidate = findTopCandidate($); - return $topCandidate + return $topCandidate; } diff --git a/src/extractors/generic/content/extract-best-node.test.js b/src/extractors/generic/content/extract-best-node.test.js index 2dc4829a..9b083099 100644 --- a/src/extractors/generic/content/extract-best-node.test.js +++ b/src/extractors/generic/content/extract-best-node.test.js @@ -1,24 +1,26 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' +import assert from 'assert'; +import cheerio from 'cheerio'; +import fs from 'fs'; // import HTML from './fixtures/html' -import extractBestNode from './extract-best-node' +import extractBestNode from './extract-best-node'; describe('extractBestNode($, flags)', () => { - it("scores the dom nodes and returns the best option", () => { - const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8') + it('scores the dom nodes and returns the best option', () => { + const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8'); const opts = { - stripUnlikelyCandidates: true, - weightNodes: true, - } + stripUnlikelyCandidates: true, + weightNodes: true, + }; - let $ = cheerio.load(html) + const $ = cheerio.load(html); - const bestNode = extractBestNode($, opts) + const bestNode = extractBestNode($, opts); + + assert.equal(typeof bestNode, 'object'); // console.log(bestNode.html()) // assert.equal($(bestNode).text().length, 3652) - }) -}) + }); +}); diff --git a/src/extractors/generic/content/extractor.js b/src/extractors/generic/content/extractor.js index a0f137f9..4d8108b6 100644 --- a/src/extractors/generic/content/extractor.js +++ b/src/extractors/generic/content/extractor.js @@ -1,10 +1,11 @@ -import cheerio from 'cheerio' -import 'babel-polyfill' +import cheerio from 'cheerio'; +import 'babel-polyfill'; -import extractBestNode from './extract-best-node' -import { nodeIsSufficient } from 'utils/dom' -import { cleanContent } from 'cleaners' -import { normalizeSpaces } from 'utils/text' +import { nodeIsSufficient } from 'utils/dom'; +import { cleanContent } from 'cleaners'; +import { normalizeSpaces } from 'utils/text'; + +import extractBestNode from './extract-best-node'; const GenericContentExtractor = { defaultOpts: { @@ -33,46 +34,44 @@ const GenericContentExtractor = { // cleanConditionally: Clean the node to return of some // superfluous content. Things like forms, ads, etc. extract({ $, html, title, url }, opts) { - opts = { ...this.defaultOpts, ...opts } + opts = { ...this.defaultOpts, ...opts }; - $ = $ || cheerio.load(html) + $ = $ || cheerio.load(html); // Cascade through our extraction-specific opts in an ordered fashion, // turning them off as we try to extract content. - let node = this.getContentNode($, title, url, opts) + let node = this.getContentNode($, title, url, opts); if (nodeIsSufficient(node)) { - return this.cleanAndReturnNode(node, $) - } else { - // We didn't succeed on first pass, one by one disable our - // extraction opts and try again. 
- for (const key of Reflect.ownKeys(opts).filter(key => opts[key] === true)) { - opts[key] = false - $ = cheerio.load(html) - - node = this.getContentNode($, title, url, opts) - - if (nodeIsSufficient(node)) { - break - } - } + return this.cleanAndReturnNode(node, $); + } - return this.cleanAndReturnNode(node, $) + // We didn't succeed on first pass, one by one disable our + // extraction opts and try again. + for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) { + opts[key] = false; + $ = cheerio.load(html); + + node = this.getContentNode($, title, url, opts); + + if (nodeIsSufficient(node)) { + break; + } } - return this.cleanAndReturnNode(node, $) + return this.cleanAndReturnNode(node, $); }, // Get node given current options getContentNode($, title, url, opts) { return cleanContent( extractBestNode($, opts), - { - $, - cleanConditionally: opts.cleanConditionally, - title, - url, - }) + { + $, + cleanConditionally: opts.cleanConditionally, + title, + url, + }); }, // Once we got here, either we're at our last-resort node, or @@ -80,10 +79,10 @@ // move forward. cleanAndReturnNode(node, $) { if (!node) { - return null + return null; } - return normalizeSpaces($.html(node)) + return normalizeSpaces($.html(node)); // if return_type == "html": // return normalize_spaces(node_to_html(node)) // else: // return node }, -} +}; -export default GenericContentExtractor +export default GenericContentExtractor; diff --git a/src/extractors/generic/content/extractor.test.js b/src/extractors/generic/content/extractor.test.js index 90a1dc52..168faf9e 100644 --- a/src/extractors/generic/content/extractor.test.js +++ b/src/extractors/generic/content/extractor.test.js @@ -1,16 +1,15 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' +import assert from 'assert'; +import fs from 'fs'; -import { clean } from 'test-helpers' +import { clean } from 'test-helpers'; -import GenericContentExtractor from './extractor' +import GenericContentExtractor from './extractor'; -describe('GenericContentExtractor', function() { - this.timeout(1000000) +describe('GenericContentExtractor', function () { + this.timeout(1000000); describe('extract($, html, opts)', () => { - it("extracts html and returns the article", () => { - const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') + it('extracts html and returns the article', () => { + const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8'); // Array.from(range(1, 100)).map((i) => { // console.log(i) @@ -20,15 +19,10 @@ describe('GenericContentExtractor', function () { // }) const result = clean(GenericContentExtractor.extract( { $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' } - )) - // console.log(result) - }) - }) -}) - + )); -function* range(start = 1, end = 1) { - while (start <= end) { - yield start++ - } -} + assert.equal(typeof result, 'string'); + // console.log(result) + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/add-score.js b/src/extractors/generic/content/scoring/add-score.js index 065b4a5c..35564da4 100644 --- a/src/extractors/generic/content/scoring/add-score.js +++ b/src/extractors/generic/content/scoring/add-score.js @@ -1,15 +1,15 @@ import { getOrInitScore, setScore, -} from './index' +} from './index'; export default function addScore($node, $, amount) { try { - const score = getOrInitScore($node, $) + amount - setScore($node, $, score) - } catch(e) { - 
console.debug(e) - } finally { - return $node + const score = getOrInitScore($node, $) + amount; + setScore($node, $, score); + } catch (e) { + // Ignoring; error occurs in scoreNode } + + return $node; } diff --git a/src/extractors/generic/content/scoring/add-score.test.js b/src/extractors/generic/content/scoring/add-score.test.js index e04e97e1..e9ec341e 100644 --- a/src/extractors/generic/content/scoring/add-score.test.js +++ b/src/extractors/generic/content/scoring/add-score.test.js @@ -1,28 +1,27 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; import { addScore, getScore, -} from './index' +} from './index'; describe('Scoring utils', () => { describe('addScore(node, $, amount)', () => { - it(`adds the specified amount to a node's score`, () => { - const $ = cheerio.load('<p score="25">Foo</p>') - let $node = $('p').first() + it('adds the specified amount to a node\'s score', () => { + const $ = cheerio.load('<p score="25">Foo</p>'); + let $node = $('p').first(); - $node = addScore($node, $, 25) - assert.equal(getScore($node), 50) - }) + $node = addScore($node, $, 25); + assert.equal(getScore($node), 50); + }); - it(`adds score if score not yet set (assumes score is 0)`, () => { - const $ = cheerio.load('<p>Foo</p>') - let $node = $('p').first() + it('adds score if score not yet set (assumes score is 0)', () => { + const $ = cheerio.load('<p>Foo</p>'); + let $node = $('p').first(); - $node = addScore($node, $, 25) - assert.equal(getScore($node), 25) - }) - - }) -}) + $node = addScore($node, $, 25); + assert.equal(getScore($node), 25); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/add-to-parent.js b/src/extractors/generic/content/scoring/add-to-parent.js index ae02b0fa..420dc805 100644 --- a/src/extractors/generic/content/scoring/add-to-parent.js +++ b/src/extractors/generic/content/scoring/add-to-parent.js @@ -1,11 +1,11 @@ -import { addScore } from './index' +import { addScore } from './index'; // Adds 1/4 of a child's score to its parent export default function addToParent(node, $, score) { - const parent = node.parent() + const parent = node.parent(); if (parent) { - addScore(parent, $, score * .25) + addScore(parent, $, score * 0.25); } - return node + return node; } diff --git a/src/extractors/generic/content/scoring/add-to-parent.test.js b/src/extractors/generic/content/scoring/add-to-parent.test.js index 610643bb..aa062975 100644 --- a/src/extractors/generic/content/scoring/add-to-parent.test.js +++ b/src/extractors/generic/content/scoring/add-to-parent.test.js @@ -1,24 +1,23 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; import { addToParent, getScore, -} from './index' +} from './index'; describe('Scoring utils', () => { describe('addToParent(node, $, amount)', () => { - it(`adds 1/4 of a node's score it its parent`, () => { - const html = '<div score="25"><p score="40">Foo</p></div>' - const $ = cheerio.load(html) - let $node = $('p').first() + it('adds 1/4 of a node\'s score to its parent', () => { + const html = '<div score="25"><p score="40">Foo</p></div>'; + const $ = cheerio.load(html); + let $node = $('p').first(); - $node = addToParent($node, $, 40) + $node = addToParent($node, $, 40); - assert.equal(getScore($node.parent()), 35) - assert.equal(getScore($node), 40) - }) - }) - -}) + assert.equal(getScore($node.parent()), 35); + assert.equal(getScore($node), 40); + }); + }); +}); diff --git
a/src/extractors/generic/content/scoring/constants.js b/src/extractors/generic/content/scoring/constants.js index d02b4d5d..27b68e14 100644 --- a/src/extractors/generic/content/scoring/constants.js +++ b/src/extractors/generic/content/scoring/constants.js @@ -1,49 +1,49 @@ -//// CONTENT FETCHING CONSTANTS //// +// // CONTENT FETCHING CONSTANTS //// // A list of strings that can be considered unlikely candidates when // extracting content from a resource. These strings are joined together // and then tested for existence using re:test, so may contain simple, // non-pipe style regular expression queries if necessary. export const UNLIKELY_CANDIDATES_BLACKLIST = [ - 'ad-break', - 'adbox', - 'advert', - 'addthis', - 'agegate', - 'aux', - 'blogger-labels', - 'combx', - 'comment', - 'conversation', - 'disqus', - 'entry-unrelated', - 'extra', - 'foot', - 'form', - 'header', - 'hidden', - 'loader', - 'login', // Note: This can hit 'blogindex'. - 'menu', - 'meta', - 'nav', - 'pager', - 'pagination', - 'predicta', // readwriteweb inline ad box - 'presence_control_external', // lifehacker.com container full of false positives - 'popup', - 'printfriendly', - 'related', - 'remove', - 'remark', - 'rss', - 'share', - 'shoutbox', - 'sidebar', - 'sociable', - 'sponsor', - 'tools' -] + 'ad-break', + 'adbox', + 'advert', + 'addthis', + 'agegate', + 'aux', + 'blogger-labels', + 'combx', + 'comment', + 'conversation', + 'disqus', + 'entry-unrelated', + 'extra', + 'foot', + 'form', + 'header', + 'hidden', + 'loader', + 'login', // Note: This can hit 'blogindex'. + 'menu', + 'meta', + 'nav', + 'pager', + 'pagination', + 'predicta', // readwriteweb inline ad box + 'presence_control_external', // lifehacker.com container full of false positives + 'popup', + 'printfriendly', + 'related', + 'remove', + 'remark', + 'rss', + 'share', + 'shoutbox', + 'sidebar', + 'sociable', + 'sponsor', + 'tools', +]; // A list of strings that can be considered LIKELY candidates when // extracting content from a resource. Essentially, the inverse of the @@ -57,56 +57,56 @@ export const UNLIKELY_CANDIDATES_BLACKLIST = [ // re:test, so may contain simple, non-pipe style regular expression queries // if necessary. export const UNLIKELY_CANDIDATES_WHITELIST = [ - 'and', - 'article', - 'body', - 'blogindex', - 'column', - 'content', - 'entry-content-asset', - 'format', // misuse of form - 'hfeed', - 'hentry', - 'hatom', - 'main', - 'page', - 'posts', - 'shadow' -] + 'and', + 'article', + 'body', + 'blogindex', + 'column', + 'content', + 'entry-content-asset', + 'format', // misuse of form + 'hfeed', + 'hentry', + 'hatom', + 'main', + 'page', + 'posts', + 'shadow', +]; // A list of tags which, if found inside, should cause a <div /> to NOT // be turned into a paragraph tag. Shallow div tags without these elements // should be turned into <p /> tags. export const DIV_TO_P_BLOCK_TAGS = [ - 'a', - 'blockquote', - 'dl', - 'div', - 'img', - 'p', - 'pre', - 'table', -].join(',') + 'a', + 'blockquote', + 'dl', + 'div', + 'img', + 'p', + 'pre', + 'table', +].join(','); // A list of tags that should be ignored when trying to find the top candidate // for a document. 
export const NON_TOP_CANDIDATE_TAGS = [ - 'br', - 'b', - 'i', - 'label', - 'hr', - 'area', - 'base', - 'basefont', - 'input', - 'img', - 'link', - 'meta', -] + 'br', + 'b', + 'i', + 'label', + 'hr', + 'area', + 'base', + 'basefont', + 'input', + 'img', + 'link', + 'meta', +]; export const NON_TOP_CANDIDATE_TAGS_RE = - new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i') + new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i'); // A list of selectors that specify, very clearly, either hNews or other // very content-specific style content, like Blogger templates. @@ -118,53 +118,15 @@ export const HNEWS_CONTENT_SELECTORS = [ ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body'], -] -// export const HNEWS_CONTENT_SELECTORS = [ -// { -// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'), -// must_exist: { -// classes: ['hentry', 'entry-content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'), -// must_exist: { -// classes: ['entry', 'entry-content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'), -// must_exist: { -// classes: ['entry', 'entry_content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'), -// must_exist: { -// classes: ['post', 'post-body'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'), -// must_exist: { -// classes: ['post', 'post_body'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'), -// must_exist: { -// classes: ['post', 'postbody'], -// } -// }, -// ] +]; export const PHOTO_HINTS = [ - 'figure', - 'photo', - 'image', - 'caption' -] -export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') + 'figure', + 'photo', + 'image', + 'caption', +]; +export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being @@ -172,175 +134,175 @@ export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') // // TODO: Perhaps have these scale based on their odds of being quality? export const POSITIVE_SCORE_HINTS = [ - 'article', - 'articlecontent', - 'instapaper_body', - 'blog', - 'body', - 'content', - 'entry-content-asset', - 'entry', - 'hentry', - 'main', - 'Normal', - 'page', - 'pagination', - 'permalink', - 'post', - 'story', - 'text', - '[-_]copy', //usatoday - '\Bcopy' -] + 'article', + 'articlecontent', + 'instapaper_body', + 'blog', + 'body', + 'content', + 'entry-content-asset', + 'entry', + 'hentry', + 'main', + 'Normal', + 'page', + 'pagination', + 'permalink', + 'post', + 'story', + 'text', + '[-_]copy', // usatoday + '\Bcopy', +]; // The above list, joined into a matching regular expression -export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i') +export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // Readability publisher-specific guidelines -export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i') +export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? 
export const NEGATIVE_SCORE_HINTS = [ - 'adbox', - 'advert', - 'author', - 'bio', - 'bookmark', - 'bottom', - 'byline', - 'clear', - 'com-', - 'combx', - 'comment', - 'comment\B', - 'contact', - 'copy', - 'credit', - 'crumb', - 'date', - 'deck', - 'excerpt', - 'featured', //tnr.com has a featured_content which throws us off - 'foot', - 'footer', - 'footnote', - 'graf', - 'head', - 'info', - 'infotext', //newscientist.com copyright - 'instapaper_ignore', - 'jump', - 'linebreak', - 'link', - 'masthead', - 'media', - 'meta', - 'modal', - 'outbrain', //slate.com junk - 'promo', - 'pr_', // autoblog - press release - 'related', - 'respond', - 'roundcontent', //lifehacker restricted content warning - 'scroll', - 'secondary', - 'share', - 'shopping', - 'shoutbox', - 'side', - 'sidebar', - 'sponsor', - 'stamp', - 'sub', - 'summary', - 'tags', - 'tools', - 'widget' -] + 'adbox', + 'advert', + 'author', + 'bio', + 'bookmark', + 'bottom', + 'byline', + 'clear', + 'com-', + 'combx', + 'comment', + 'comment\B', + 'contact', + 'copy', + 'credit', + 'crumb', + 'date', + 'deck', + 'excerpt', + 'featured', // tnr.com has a featured_content which throws us off + 'foot', + 'footer', + 'footnote', + 'graf', + 'head', + 'info', + 'infotext', // newscientist.com copyright + 'instapaper_ignore', + 'jump', + 'linebreak', + 'link', + 'masthead', + 'media', + 'meta', + 'modal', + 'outbrain', // slate.com junk + 'promo', + 'pr_', // autoblog - press release + 'related', + 'respond', + 'roundcontent', // lifehacker restricted content warning + 'scroll', + 'secondary', + 'share', + 'shopping', + 'shoutbox', + 'side', + 'sidebar', + 'sponsor', + 'stamp', + 'sub', + 'summary', + 'tags', + 'tools', + 'widget', +]; // The above list, joined into a matching regular expression -export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i') +export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // Match a digit. Pretty clear. -export const DIGIT_RE = new RegExp('[0-9]') +export const DIGIT_RE = new RegExp('[0-9]'); // Match 2 or more consecutive <br> tags -export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i') +export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i'); // Match 1 BR tag. -export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i') +export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i'); // A list of all of the block level tags known in HTML5 and below. 
Taken from // http://bit.ly/qneNIT export const BLOCK_LEVEL_TAGS = [ - 'article', - 'aside', - 'blockquote', - 'body', - 'br', - 'button', - 'canvas', - 'caption', - 'col', - 'colgroup', - 'dd', - 'div', - 'dl', - 'dt', - 'embed', - 'fieldset', - 'figcaption', - 'figure', - 'footer', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'header', - 'hgroup', - 'hr', - 'li', - 'map', - 'object', - 'ol', - 'output', - 'p', - 'pre', - 'progress', - 'section', - 'table', - 'tbody', - 'textarea', - 'tfoot', - 'th', - 'thead', - 'tr', - 'ul', - 'video', -] -export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i') + 'article', + 'aside', + 'blockquote', + 'body', + 'br', + 'button', + 'canvas', + 'caption', + 'col', + 'colgroup', + 'dd', + 'div', + 'dl', + 'dt', + 'embed', + 'fieldset', + 'figcaption', + 'figure', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'header', + 'hgroup', + 'hr', + 'li', + 'map', + 'object', + 'ol', + 'output', + 'p', + 'pre', + 'progress', + 'section', + 'table', + 'tbody', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'tr', + 'ul', + 'video', +]; +export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i'); // The removal is implemented as a blacklist and whitelist, this test finds // blacklisted elements that aren't whitelisted. We do this all in one // expression-both because it's only one pass, and because this skips the // serialization for whitelisted nodes. -const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|') -export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i') +const candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|'); +export const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i'); -const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|') -export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i') +const candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|'); +export const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i'); -export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i') +export const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i'); -export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i') -export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i') -export const BAD_TAGS = new RegExp('^(address|form)$', 'i') +export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i'); +export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i'); +export const BAD_TAGS = new RegExp('^(address|form)$', 'i'); -export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i') +export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i'); diff --git a/src/extractors/generic/content/scoring/find-top-candidate.js b/src/extractors/generic/content/scoring/find-top-candidate.js index fef60946..3a434cb5 100644 --- a/src/extractors/generic/content/scoring/find-top-candidate.js +++ b/src/extractors/generic/content/scoring/find-top-candidate.js @@ -1,115 +1,35 @@ -import { NON_TOP_CANDIDATE_TAGS_RE } from './constants' -import { getScore } from './index' -import { - textLength, - linkDensity -} from 'utils/dom' +import { NON_TOP_CANDIDATE_TAGS_RE } from './constants'; +import { getScore } from './index'; +import mergeSiblings from './merge-siblings'; // After we've calculated scores, loop through all of the possible // candidate nodes we found 
and find the one with the highest score. export default function findTopCandidate($) { - let $candidate, topScore = 0 + let $candidate; + let topScore = 0; $('*[score]').each((index, node) => { - const $node = $(node) + const $node = $(node); // Ignore tags like BR, HR, etc if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) { - return + return; } - const score = getScore($node) + const score = getScore($node); if (score > topScore) { - topScore = score - $candidate = $node + topScore = score; + $candidate = $node; } - }) + }); // If we don't have a candidate, return the body // or whatever the first element is if (!$candidate) { - return $('body') || $('*').first() + return $('body') || $('*').first(); } - $candidate = mergeSiblings($candidate, topScore, $) + $candidate = mergeSiblings($candidate, topScore, $); - return $candidate -} - -// Now that we have a top_candidate, look through the siblings of -// it to see if any of them are decently scored. If they are, they -// may be split parts of the content (Like two divs, a preamble and -// a body.) Example: -// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14 -export function mergeSiblings($candidate, topScore, $) { - if (!$candidate.parent().length) { - return $candidate - } - - const siblingScoreThreshold = Math.max(10, topScore * 0.2) - let wrappingDiv = $('<div></div>') - - $candidate.parent().children().each((index, child) => { - const $child = $(child) - // Ignore tags like BR, HR, etc - if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) { - return - } - - const childScore = getScore($child) - if (childScore) { - if ($child === $candidate) { - wrappingDiv.append($child) - } else { - let contentBonus = 0 - // extract to scoreLinkDensity() TODO - const density = linkDensity($child) - - // If sibling has a very low link density, - // give it a small bonus - if (density < .05) { - contentBonus = contentBonus + 20 - } - - // If sibling has a high link density, - // give it a penalty - if (density >= 0.5) { - contentBonus = contentBonus - 20 - } - - // If sibling node has the same class as - // candidate, give it a bonus - if ($child.attr('class') === $candidate.attr('class')) { - contentBonus = contentBonus + topScore * .2 - } - - const newScore = getScore($child) + contentBonus - - if (newScore >= siblingScoreThreshold) { - return wrappingDiv.append($child) - } else if (child.tagName === 'p') { - const childContentLength = textLength($child.text()) - - if (childContentLength > 80 && density < .25) { - return wrappingDiv.append($child) - } else if (childContentLength <= 80 && density === 0 && - hasSentenceEnd(childContent)) { - - return wrappingDiv.append($child) - } - } - } - } - - }) - - return wrappingDiv -} - -// TODO Extract into util - AP -// Given a string, return True if it appears to have an ending sentence -// within it, false otherwise. 
-const SENTENCE_END_RE = new RegExp('\.( |$)') -function hasSentenceEnd(text) { - return SENTENCE_END_RE.test(text) + return $candidate; } diff --git a/src/extractors/generic/content/scoring/find-top-candidate.test.js b/src/extractors/generic/content/scoring/find-top-candidate.test.js index 1903678c..599987cf 100644 --- a/src/extractors/generic/content/scoring/find-top-candidate.test.js +++ b/src/extractors/generic/content/scoring/find-top-candidate.test.js @@ -1,58 +1,58 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' +import assert from 'assert'; +import cheerio from 'cheerio'; +import fs from 'fs'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; import { getScore, findTopCandidate, - scoreContent -} from './index' + scoreContent, +} from './index'; describe('findTopCandidate($)', () => { - it("finds the top candidate from simple case", () => { - const $ = cheerio.load(HTML.findDom1) + it('finds the top candidate from simple case', () => { + const $ = cheerio.load(HTML.findDom1); - const $$topCandidate = findTopCandidate($) + const $$topCandidate = findTopCandidate($); - assert.equal(getScore($$topCandidate), 100) - }) + assert.equal(getScore($$topCandidate), 100); + }); - it("finds the top candidate from a nested case", () => { - const $ = cheerio.load(HTML.findDom2) + it('finds the top candidate from a nested case', () => { + const $ = cheerio.load(HTML.findDom2); - const $$topCandidate = findTopCandidate($) + const $$topCandidate = findTopCandidate($); // this is wrapped in a div so checking // the score of the first child - assert.equal(getScore($$topCandidate.children().first()), 50) - }) + assert.equal(getScore($$topCandidate.children().first()), 50); + }); - it("ignores tags like BR", () => { - const $ = cheerio.load(HTML.findDom3) + it('ignores tags like BR', () => { + const $ = cheerio.load(HTML.findDom3); - const $topCandidate = findTopCandidate($) + const $topCandidate = findTopCandidate($); - assert.equal(getScore($topCandidate), 50) - }) + assert.equal(getScore($topCandidate), 50); + }); - it("returns BODY if no candidates found", () => { - const $ = cheerio.load(HTML.topBody) + it('returns BODY if no candidates found', () => { + const $ = cheerio.load(HTML.topBody); - const $topCandidate = findTopCandidate($) + const $topCandidate = findTopCandidate($); - assert.equal($topCandidate.get(0).tagName, 'body') - }) + assert.equal($topCandidate.get(0).tagName, 'body'); + }); - it("appends a sibling with a good enough score", () => { - const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') + it('appends a sibling with a good enough score', () => { + const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8'); - let $ = cheerio.load(html) - $ = scoreContent($) + let $ = cheerio.load(html); + $ = scoreContent($); - const $topCandidate = findTopCandidate($) - assert.equal($($topCandidate).text().length, 3652) - }) -}) + const $topCandidate = findTopCandidate($); + assert.equal($($topCandidate).text().length, 3652); + }); +}); diff --git a/src/extractors/generic/content/scoring/fixtures/get-weight.js b/src/extractors/generic/content/scoring/fixtures/get-weight.js index 88025271..97dc6148 100644 --- a/src/extractors/generic/content/scoring/fixtures/get-weight.js +++ b/src/extractors/generic/content/scoring/fixtures/get-weight.js @@ -237,7 +237,7 @@ const HTML = { `, after: ` <div><div><div><p><a href="">Wow how about that</a></p></div></div></div> - ` + `, }, // cleanImages @@ -252,7 +252,7 @@ const HTML = { 
<div> <img width="50"> </div> - ` + `, }, cleanHeight: { before: ` @@ -264,7 +264,7 @@ const HTML = { <div> <img width="50"> </div> - ` + `, }, cleanSpacer: { before: ` @@ -279,7 +279,7 @@ const HTML = { <img src="/foo/bar/baz/normal.png"> <p>Some text</p> </div> - ` + `, }, // stripJunkTags stripsJunk: { @@ -298,7 +298,7 @@ const HTML = { <div> <p>What an article</p> </div> - ` + `, }, // stripHOnes @@ -314,7 +314,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, convertThreeHOnes: { before: ` @@ -334,7 +334,7 @@ const HTML = { <p>What do you think?</p> <h2>Can you believe it?!</h2> </div> - ` + `, }, // cleanAttributes @@ -348,7 +348,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, removeAlign: { before: ` @@ -360,7 +360,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, // removeEmpty @@ -375,7 +375,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, doNotRemoveBr: { before: ` @@ -392,7 +392,7 @@ const HTML = { <div></div> <p>What do you think?</p> </div> - ` + `, }, doNotNested: { before: ` @@ -409,7 +409,7 @@ const HTML = { <p><img src="foo/bar.jpg" /></p> <p>What do you think?</p> </div> - ` + `, }, // cleanConditionally @@ -433,7 +433,7 @@ const HTML = { </p> <p>What do you think?</p> </div> - ` + `, }, removeTooManyInputs: { before: ` @@ -467,7 +467,7 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, removeShortNoImg: { before: ` @@ -490,7 +490,7 @@ const HTML = { <img src="asdf"> </div> </div> - ` + `, }, linkDensityHigh: { @@ -527,7 +527,7 @@ const HTML = { <li>Keep this one</li> </ul> </div> - ` + `, }, goodScoreTooDense: { before: ` @@ -567,7 +567,7 @@ const HTML = { <li>Keep this one</li> </ul> </div> - ` + `, }, previousEndsInColon: { before: ` @@ -608,7 +608,7 @@ const HTML = { <p>What do you think?</p> </div> `, - after: `What do you think?` + after: 'What do you think?', }, // cleanHeaders @@ -627,7 +627,7 @@ const HTML = { <h2>Keep me</h2> <p>What do you think?</p> </div> - ` + `, }, cleanTitleMatch: { before: ` @@ -642,7 +642,7 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, dropWithNegativeWeight: { before: ` @@ -657,8 +657,8 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/content/scoring/fixtures/html.js b/src/extractors/generic/content/scoring/fixtures/html.js index 3021ce2f..fb2e2801 100644 --- a/src/extractors/generic/content/scoring/fixtures/html.js +++ b/src/extractors/generic/content/scoring/fixtures/html.js @@ -82,6 +82,6 @@ const HTML = { </article> <body> `, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/content/scoring/get-or-init-score.js b/src/extractors/generic/content/scoring/get-or-init-score.js index 8a63f55b..ccf127e4 100644 --- a/src/extractors/generic/content/scoring/get-or-init-score.js +++ b/src/extractors/generic/content/scoring/get-or-init-score.js @@ -3,27 +3,26 @@ import { scoreNode, getWeight, addToParent, -} from './index' +} from './index'; // gets and returns the score if it exists // if not, initializes a score based on // the node's tag type -export default function getOrInitScore($node, $, weightNodes=true) { - let score = getScore($node) +export default function getOrInitScore($node, $, weightNodes = true) { + let score = getScore($node); if (score) { - return score - } else { - score = scoreNode($node) + return 
score; + } - if (weightNodes) { - score = score + getWeight($node) - } + score = scoreNode($node); - addToParent($node, $, score) + if (weightNodes) { + score += getWeight($node); } - return score -} + addToParent($node, $, score); + return score; +} diff --git a/src/extractors/generic/content/scoring/get-or-init-score.test.js b/src/extractors/generic/content/scoring/get-or-init-score.test.js index f2545cef..889b008a 100644 --- a/src/extractors/generic/content/scoring/get-or-init-score.test.js +++ b/src/extractors/generic/content/scoring/get-or-init-score.test.js @@ -1,61 +1,61 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; import { getOrInitScore, getScore, -} from './index' +} from './index'; describe('getOrInitScore(node, $)', () => { describe('when score set', () => { - it(`returns score if node's score already set`, () => { - const html = '<p score="40">Foo</p>' - const $ = cheerio.load(html) - const node = $('p').first() + it('returns score if node\'s score already set', () => { + const html = '<p score="40">Foo</p>'; + const $ = cheerio.load(html); + const node = $('p').first(); - const score = getOrInitScore(node, $) + const score = getOrInitScore(node, $); - assert.equal(score, 40) - }) - }) + assert.equal(score, 40); + }); + }); describe('when no score set', () => { - it(`returns 0 if no class/id and text < 25 chars`, () => { - const html = '<p>Foo</p>' - const $ = cheerio.load(html) - const node = $('p').first() + it('returns 0 if no class/id and text < 25 chars', () => { + const html = '<p>Foo</p>'; + const $ = cheerio.load(html); + const node = $('p').first(); - const score = getOrInitScore(node, $) + const score = getOrInitScore(node, $); - assert.equal(score, 0) - }) + assert.equal(score, 0); + }); - it(`returns score if no class/id and has commas/length`, () => { - const $ = cheerio.load(HTML.score19) - const node = $('p').first() + it('returns score if no class/id and has commas/length', () => { + const $ = cheerio.load(HTML.score19); + const node = $('p').first(); - const score = getOrInitScore(node, $) + const score = getOrInitScore(node, $); - assert.equal(score, 19) - }) + assert.equal(score, 19); + }); - it(`returns greater score if weighted class/id is set`, () => { - const $ = cheerio.load(HTML.score44) - const node = $('p').first() + it('returns greater score if weighted class/id is set', () => { + const $ = cheerio.load(HTML.score44); + const node = $('p').first(); - const score = getOrInitScore(node, $) + const score = getOrInitScore(node, $); - assert.equal(score, 44) - }) + assert.equal(score, 44); + }); - it(`gives 1/4 of its score to its parent`, () => { - const $ = cheerio.load(HTML.score44Parent) - const node = $('p').first() + it('gives 1/4 of its score to its parent', () => { + const $ = cheerio.load(HTML.score44Parent); + const node = $('p').first(); - const score = getOrInitScore(node, $) + getOrInitScore(node, $); - assert.equal(getScore(node.parent()), 16) - }) - }) -}) + assert.equal(getScore(node.parent()), 16); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/get-score.js b/src/extractors/generic/content/scoring/get-score.js index 933a91a0..ff1d4fe5 100644 --- a/src/extractors/generic/content/scoring/get-score.js +++ b/src/extractors/generic/content/scoring/get-score.js @@ -2,5 +2,5 @@ // the node's score attribute // returns null if no score set export default function 
getScore($node) { - return parseFloat($node.attr('score')) || null + return parseFloat($node.attr('score')) || null; } diff --git a/src/extractors/generic/content/scoring/get-score.test.js b/src/extractors/generic/content/scoring/get-score.test.js index 33774c4b..87bf422f 100644 --- a/src/extractors/generic/content/scoring/get-score.test.js +++ b/src/extractors/generic/content/scoring/get-score.test.js @@ -1,25 +1,22 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import { getScore } from './index' +import { getScore } from './index'; describe('Scoring utils', () => { describe('getScore($node)', () => { - it("returns null if the node has no score set", () => { - const $ = cheerio.load('<p>Foo</p>') - const $node = $('p').first() - assert.equal(getScore($node), null) - }) + it('returns null if the node has no score set', () => { + const $ = cheerio.load('<p>Foo</p>'); + const $node = $('p').first(); + assert.equal(getScore($node), null); + }); - it("returns 25 if the node has a score attr of 25", () => { - const $ = cheerio.load('<p score="25">Foo</p>') - const $node = $('p').first() - assert.equal(typeof getScore($node), 'number') - assert.equal(getScore($node), 25) - }) - - }) - - -}) + it('returns 25 if the node has a score attr of 25', () => { + const $ = cheerio.load('<p score="25">Foo</p>'); + const $node = $('p').first(); + assert.equal(typeof getScore($node), 'number'); + assert.equal(getScore($node), 25); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/get-weight.js b/src/extractors/generic/content/scoring/get-weight.js index 4f187a86..5dbd67b4 100644 --- a/src/extractors/generic/content/scoring/get-weight.js +++ b/src/extractors/generic/content/scoring/get-weight.js @@ -3,42 +3,42 @@ import { POSITIVE_SCORE_RE, PHOTO_HINTS_RE, READABILITY_ASSET, -} from './constants' +} from './constants'; // Get the score of a node based on its className and id. 
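// A few illustrative weights, with invented markup (the hint lists are the
// POSITIVE_SCORE_RE / NEGATIVE_SCORE_RE / PHOTO_HINTS_RE constants above):
//   <div id="article">              ->  25 (positive id)
//   <div class="sidebar share">     -> -25 (negative class, no id)
//   <div id="entry" class="figure"> ->  25 + 10 = 35 (positive id + photo hint)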
export default function getWeight(node) { - const classes = node.attr('class') - const id = node.attr('id') - let score = 0 + const classes = node.attr('class'); + const id = node.attr('id'); + let score = 0; if (id) { // if id exists, try to score on both positive and negative if (POSITIVE_SCORE_RE.test(id)) { - score = score + 25 + score += 25; } if (NEGATIVE_SCORE_RE.test(id)) { - score = score - 25 + score -= 25; } } if (classes) { - if (score == 0) { + if (score === 0) { // if classes exist and id did not contribute to score // try to score on both positive and negative if (POSITIVE_SCORE_RE.test(classes)) { - score = score + 25 + score += 25; } if (NEGATIVE_SCORE_RE.test(classes)) { - score = score - 25 + score -= 25; } } - // even if score has been set by id, add score for + // even if score has been set by id, add score for // possible photo matches // "try to keep photos if we can" if (PHOTO_HINTS_RE.test(classes)) { - score = score + 10 + score += 10; } // add 25 if class matches entry-content-asset, @@ -46,11 +46,10 @@ export default function getWeight(node) { // Readability publisher guidelines // https://www.readability.com/developers/guidelines if (READABILITY_ASSET.test(classes)) { - score = score + 25 + score += 25; } - } - return score + return score; } diff --git a/src/extractors/generic/content/scoring/get-weight.test.js b/src/extractors/generic/content/scoring/get-weight.test.js index 74a42422..a12a3df3 100644 --- a/src/extractors/generic/content/scoring/get-weight.test.js +++ b/src/extractors/generic/content/scoring/get-weight.test.js @@ -1,59 +1,58 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/get-weight' +import HTML from './fixtures/get-weight'; import { - getWeight -} from './index' + getWeight, +} from './index'; describe('Generic Extractor Utils', () => { describe('getWeight(node)', () => { - it("returns a score of 25 if node has positive id", () => { - const $ = cheerio.load(HTML.positiveId) - assert.equal(getWeight($('div')), 25) - }) - - it("returns a score of -25 if node has negative id", () => { - const $ = cheerio.load(HTML.negativeId) - assert.equal(getWeight($('div')), -25) - }) - - it("returns a score of 25 if node has positive class", () => { - const $ = cheerio.load(HTML.positiveClass) - assert.equal(getWeight($('div')), 25) - }) - - it("returns a score of -25 if node has negative class", () => { - const $ = cheerio.load(HTML.negativeClass) - assert.equal(getWeight($('div')), -25) - }) - - it("returns a score of 25 if node has both positive id and class", () => { - const $ = cheerio.load(HTML.positiveIdAndClass) - assert.equal(getWeight($('div')), 25) - }) - - it("returns a score of 25 if node has pos id and neg class", () => { + it('returns a score of 25 if node has positive id', () => { + const $ = cheerio.load(HTML.positiveId); + assert.equal(getWeight($('div')), 25); + }); + + it('returns a score of -25 if node has negative id', () => { + const $ = cheerio.load(HTML.negativeId); + assert.equal(getWeight($('div')), -25); + }); + + it('returns a score of 25 if node has positive class', () => { + const $ = cheerio.load(HTML.positiveClass); + assert.equal(getWeight($('div')), 25); + }); + + it('returns a score of -25 if node has negative class', () => { + const $ = cheerio.load(HTML.negativeClass); + assert.equal(getWeight($('div')), -25); + }); + + it('returns a score of 25 if node has both positive id and class', () => { + const $ = 
cheerio.load(HTML.positiveIdAndClass); + assert.equal(getWeight($('div')), 25); + }); + + it('returns a score of 25 if node has pos id and neg class', () => { // is this really wanted? id="entry" class="adbox" // should get positive score? - const $ = cheerio.load(HTML.positiveIdNegClass) - assert.equal(getWeight($('div')), 25) - }) + const $ = cheerio.load(HTML.positiveIdNegClass); + assert.equal(getWeight($('div')), 25); + }); - it("returns a score of 10 if node has pos img class", () => { - const $ = cheerio.load(HTML.positivePhotoClass) - assert.equal(getWeight($('div')), 10) - }) + it('returns a score of 10 if node has pos img class', () => { + const $ = cheerio.load(HTML.positivePhotoClass); + assert.equal(getWeight($('div')), 10); + }); - it("returns a score of 35 if node has pos id pos img class", () => { - const $ = cheerio.load(HTML.positiveIdAndPhoto) - assert.equal(getWeight($('div')), 35) - }) + it('returns a score of 35 if node has pos id pos img class', () => { + const $ = cheerio.load(HTML.positiveIdAndPhoto); + assert.equal(getWeight($('div')), 35); + }); it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => { - const $ = cheerio.load(HTML.entryContentAsset) - assert.equal(getWeight($('div')), 50) - }) - - }) -}) + const $ = cheerio.load(HTML.entryContentAsset); + assert.equal(getWeight($('div')), 50); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/index.js b/src/extractors/generic/content/scoring/index.js index 12a8a1f1..209cdc66 100644 --- a/src/extractors/generic/content/scoring/index.js +++ b/src/extractors/generic/content/scoring/index.js @@ -1,13 +1,13 @@ // Scoring -export { default as getWeight } from './get-weight' -export { default as getScore } from './get-score' -export { default as scoreCommas } from './score-commas' -export { default as scoreLength } from './score-length' -export { default as scoreParagraph } from './score-paragraph' -export { default as setScore } from './set-score' -export { default as addScore } from './add-score' -export { default as addToParent } from './add-to-parent' -export { default as getOrInitScore } from './get-or-init-score' -export { default as scoreNode } from './score-node' -export { default as scoreContent } from './score-content' -export { default as findTopCandidate } from './find-top-candidate' +export { default as getWeight } from './get-weight'; +export { default as getScore } from './get-score'; +export { default as scoreCommas } from './score-commas'; +export { default as scoreLength } from './score-length'; +export { default as scoreParagraph } from './score-paragraph'; +export { default as setScore } from './set-score'; +export { default as addScore } from './add-score'; +export { default as addToParent } from './add-to-parent'; +export { default as getOrInitScore } from './get-or-init-score'; +export { default as scoreNode } from './score-node'; +export { default as scoreContent } from './score-content'; +export { default as findTopCandidate } from './find-top-candidate'; diff --git a/src/extractors/generic/content/scoring/merge-siblings.js b/src/extractors/generic/content/scoring/merge-siblings.js new file mode 100644 index 00000000..a2c61560 --- /dev/null +++ b/src/extractors/generic/content/scoring/merge-siblings.js @@ -0,0 +1,79 @@ +import { + textLength, + linkDensity, +} from 'utils/dom'; +import { hasSentenceEnd } from 'utils/text'; + +import { NON_TOP_CANDIDATE_TAGS_RE } from './constants'; +import { getScore } from './index'; + +// Now that we have a 
top_candidate, look through the siblings of +// it to see if any of them are decently scored. If they are, they +// may be split parts of the content (Like two divs, a preamble and +// a body.) Example: +// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14 +export default function mergeSiblings($candidate, topScore, $) { + if (!$candidate.parent().length) { + return $candidate; + } + + const siblingScoreThreshold = Math.max(10, topScore * 0.2); + const wrappingDiv = $('<div></div>'); + + $candidate.parent().children().each((index, child) => { + const $child = $(child); + // Ignore tags like BR, HR, etc + if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) { + return null; + } + + const childScore = getScore($child); + if (childScore) { + if ($child === $candidate) { + wrappingDiv.append($child); + } else { + let contentBonus = 0; + // extract to scoreLinkDensity() TODO + const density = linkDensity($child); + + // If sibling has a very low link density, + // give it a small bonus + if (density < 0.05) { + contentBonus += 20; + } + + // If sibling has a high link density, + // give it a penalty + if (density >= 0.5) { + contentBonus -= 20; + } + + // If sibling node has the same class as + // candidate, give it a bonus + if ($child.attr('class') === $candidate.attr('class')) { + contentBonus += topScore * 0.2; + } + + const newScore = getScore($child) + contentBonus; + + if (newScore >= siblingScoreThreshold) { + return wrappingDiv.append($child); + } else if (child.tagName === 'p') { + const childContent = $child.text(); + const childContentLength = textLength(childContent); + + if (childContentLength > 80 && density < 0.25) { + return wrappingDiv.append($child); + } else if (childContentLength <= 80 && density === 0 && + hasSentenceEnd(childContent)) { + return wrappingDiv.append($child); + } + } + } + } + + return null; + }); + + return wrappingDiv; +} diff --git a/src/extractors/generic/content/scoring/score-commas.js b/src/extractors/generic/content/scoring/score-commas.js index e9e98de7..69dc7236 100644 --- a/src/extractors/generic/content/scoring/score-commas.js +++ b/src/extractors/generic/content/scoring/score-commas.js @@ -1,5 +1,5 @@ // return 1 for every comma in text export default function scoreCommas(text) { - return (text.match(/,/g) || []).length + return (text.match(/,/g) || []).length; } diff --git a/src/extractors/generic/content/scoring/score-commas.test.js b/src/extractors/generic/content/scoring/score-commas.test.js index 8aaf6af6..d360342a 100644 --- a/src/extractors/generic/content/scoring/score-commas.test.js +++ b/src/extractors/generic/content/scoring/score-commas.test.js @@ -1,20 +1,18 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; -import { scoreCommas } from './index' +import { scoreCommas } from './index'; describe('Scoring utils', () => { describe('scoreCommas(text)', () => { - it(`returns 0 if text has no commas`, () => { - assert.equal(scoreCommas("Foo bar"), 0) - }) - - it(`returns a point for every comma in the text`, () => { - assert.equal(scoreCommas('Foo, bar'), 1) - assert.equal(scoreCommas('Foo, bar, baz'), 2) - assert.equal(scoreCommas('Foo, bar, baz, bat'), 3) - }) - }) -}) + it('returns 0 if text has no commas', () => { + assert.equal(scoreCommas('Foo bar'), 0); + }); + it('returns a point for every comma in the text', () => { + assert.equal(scoreCommas('Foo, bar'), 1); + assert.equal(scoreCommas('Foo, bar, baz'), 2); + assert.equal(scoreCommas('Foo, bar, baz, bat'), 3); + }); + }); +}); 
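To make the sibling-merge arithmetic in the new merge-siblings.js concrete, here is a small worked sketch. The scores, class names, and import paths are assumptions for illustration, and linkDensity is assumed to return 0 for a node containing no links:

import cheerio from 'cheerio';
import { getScore } from 'extractors/generic/content/scoring';
import mergeSiblings from 'extractors/generic/content/scoring/merge-siblings';

// The candidate scored 100, so siblingScoreThreshold = Math.max(10, 100 * 0.2) = 20.
// The sibling scores only 15, but a link density below 0.05 adds 20 and the
// shared class adds topScore * 0.2 = 20, so 15 + 40 = 55 >= 20: it gets merged.
const $ = cheerio.load(`
  <div>
    <div class="article" score="100">Main body of the story</div>
    <div class="article" score="15">A preamble that belongs with it</div>
  </div>`);
const $candidate = $('div[score="100"]').first();
const $merged = mergeSiblings($candidate, getScore($candidate), $);
// $merged is a fresh <div> wrapping both scored siblings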
diff --git a/src/extractors/generic/content/scoring/score-content.js b/src/extractors/generic/content/scoring/score-content.js index 250512f1..4b1731ec 100644 --- a/src/extractors/generic/content/scoring/score-content.js +++ b/src/extractors/generic/content/scoring/score-content.js @@ -1,119 +1,69 @@ -import { HNEWS_CONTENT_SELECTORS } from './constants' +import { convertNodeTo } from 'utils/dom'; +import { HNEWS_CONTENT_SELECTORS } from './constants'; import { scoreNode, setScore, getOrInitScore, addScore, -} from './index' +} from './index'; -import { convertNodeTo } from 'utils/dom' - -// score content. Parents get the full value of their children's -// content score, grandparents half -export default function scoreContent($, weightNodes=true) { - - // First, look for special hNews based selectors and give them a big - // boost, if they exist - HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => { - $(`${parentSelector} ${childSelector}`).each((index, node) => { - addScore($(node).parent(parentSelector), $, 80) - }) - }) +function convertSpans($node, $) { + if ($node.get(0)) { + const { tagName } = $node.get(0); - scorePs($, weightNodes) + if (tagName === 'span') { + // convert spans to divs + convertNodeTo($node, $, 'div'); + } + } +} - return $ +function addScoreTo($node, $, score) { + if ($node) { + convertSpans($node, $); + addScore($node, $, score); + } } function scorePs($, weightNodes) { $('p, pre').toArray().map((node) => { // The raw score for this paragraph, before we add any parent/child // scores. - let $node = $(node) - $node = setScore($node, $, getOrInitScore($node, $, weightNodes)) + let $node = $(node); + $node = setScore($node, $, getOrInitScore($node, $, weightNodes)); - return $node + return $node; }).forEach(($node) => { // The parent scoring has to be done in a separate loop // because otherwise scoring the parent overwrites // the score added to the child // Add the individual content score to the parent node - const rawScore = scoreNode($node) + const rawScore = scoreNode($node); - const $parent = $node.parent() - addScoreTo($parent, $, rawScore, weightNodes) + const $parent = $node.parent(); + addScoreTo($parent, $, rawScore, weightNodes); if ($parent) { // Add half of the individual content score to the // grandparent - addScoreTo($parent.parent(), $, rawScore/2, weightNodes) + addScoreTo($parent.parent(), $, rawScore / 2, weightNodes); } - - }) + }); } -function convertSpans($node, $) { - if ($node.get(0)) { - const { tagName } = $node.get(0) +// score content. Parents get the full value of their children's +// content score, grandparents half +export default function scoreContent($, weightNodes = true) { + // First, look for special hNews based selectors and give them a big + // boost, if they exist + HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => { + $(`${parentSelector} ${childSelector}`).each((index, node) => { + addScore($(node).parent(parentSelector), $, 80); + }); + }); - if (tagName === 'span') { - // convert spans to divs - convertNodeTo($node, $, 'div') - } - } -} + scorePs($, weightNodes); -function addScoreTo($node, $, score, weightNodes) { - if ($node) { - convertSpans($node, $) - addScore($node, $, score) - } + return $; } - - - // def _score_content(self, doc, weight_nodes=True): - // for selector in constants.HNEWS_CONTENT_SELECTORS: - // # Not self.resource.extract_by_selector because our doc is a copy - // # of the resource doc. 
- // nodes = extract_by_selector(doc, selector, - // AttribMap(doc)) - // for node in nodes: - // self._add_score(node, 80) - // - // paras = doc.xpath('.//p | .//pre') - // - // # If we don't have any paragraphs at all, we can't score based on - // # paragraphs, so return without modifying anything else. - // if len(paras) == 0: - // return doc - // - // for para in paras: - // # Don't score invalid tags - // if not isinstance(para.tag, basestring): - // continue - // - // # The raw score for this paragraph, before we add any parent/child - // # scores. - // raw_score = self._score_node(para) - // self._set_score(para, self._get_score(para, weight_nodes)) - // - // parent = para.getparent() - // if parent is not None: - // if parent.tag == 'span': - // parent.tag = 'div' - // - // # Add the individual content score to the parent node - // self._add_score(parent, raw_score, weight_nodes=weight_nodes) - // - // grandparent = parent.getparent() - // if grandparent is not None: - // if grandparent.tag == 'span': - // grandparent.tag = 'div' - // - // # Add half of the individual content score to the - // # grandparent - // gp_score = raw_score / 2.0 - // self._add_score(grandparent, gp_score, weight_nodes=weight_nodes) - // - // return doc diff --git a/src/extractors/generic/content/scoring/score-content.test.js b/src/extractors/generic/content/scoring/score-content.test.js index 05b547ec..df75285c 100644 --- a/src/extractors/generic/content/scoring/score-content.test.js +++ b/src/extractors/generic/content/scoring/score-content.test.js @@ -1,47 +1,45 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' +import assert from 'assert'; +import cheerio from 'cheerio'; +import fs from 'fs'; -import { clean } from 'test-helpers' -import HTML from './fixtures/html' +import HTML from './fixtures/html'; import { scoreContent, getScore, -} from './index' +} from './index'; // TODO: Walk through these and sanity check my scores // Commented out scores were what I expected, but I was also // probably missing something when calculating describe('scoreContent($, weightNodes)', () => { - it("loves hNews content", () => { - const $ = cheerio.load(HTML.hNews.before) - const result = scoreContent($).html() + it('loves hNews content', () => { + const $ = cheerio.load(HTML.hNews.before); + scoreContent($).html(); - assert.equal(getScore($('div').first()), 140) - }) + assert.equal(getScore($('div').first()), 140); + }); - it("is so-so about non-hNews content", () => { - const $ = cheerio.load(HTML.nonHNews.before) - const result = scoreContent($).html() + it('is so-so about non-hNews content', () => { + const $ = cheerio.load(HTML.nonHNews.before); + scoreContent($).html(); - assert.equal(getScore($('div').first()), 65) - }) + assert.equal(getScore($('div').first()), 65); + }); - it("scores this Wired article the same", () => { - const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') - const $ = cheerio.load(html) - const result = scoreContent($).html() + it('scores this Wired article the same', () => { + const html = fs.readFileSync('./fixtures/wired.html', 'utf-8'); + const $ = cheerio.load(html); + scoreContent($).html(); - assert.equal(getScore($('article').first()), 65.5) - }) + assert.equal(getScore($('article').first()), 65.5); + }); - it("scores this Vulture article", () => { - const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') - let $ = cheerio.load(html) - $ = scoreContent($) + it('scores this Vulture article', () => { + const html = 
fs.readFileSync('./fixtures/vulture.html', 'utf-8'); + let $ = cheerio.load(html); + $ = scoreContent($); - assert.equal($('p[score]').length, 62) - }) - -}) + assert.equal($('p[score]').length, 62); + }); +}); diff --git a/src/extractors/generic/content/scoring/score-length.js b/src/extractors/generic/content/scoring/score-length.js index 4c8e7467..a00632d0 100644 --- a/src/extractors/generic/content/scoring/score-length.js +++ b/src/extractors/generic/content/scoring/score-length.js @@ -1,11 +1,10 @@ -const idkRe = new RegExp('^(p|pre)$', 'i') +const idkRe = new RegExp('^(p|pre)$', 'i'); -export default function scoreLength(textLength, tagName='p') { - let score - const chunks = textLength / 50 +export default function scoreLength(textLength, tagName = 'p') { + const chunks = textLength / 50; if (chunks > 0) { - let lengthBonus + let lengthBonus; // No idea why p or pre are being tamped down here // but just following the source for now @@ -13,14 +12,14 @@ export default function scoreLength(textLength, tagName='p') { // since this is only being called from the context // of scoreParagraph if (idkRe.test(tagName)) { - lengthBonus = chunks - 2 + lengthBonus = chunks - 2; } else { - lengthBonus = chunks - 1.25 + lengthBonus = chunks - 1.25; } - return Math.min(Math.max(lengthBonus, 0), 3) - } else { - return 0 + return Math.min(Math.max(lengthBonus, 0), 3); } + + return 0; } diff --git a/src/extractors/generic/content/scoring/score-length.test.js b/src/extractors/generic/content/scoring/score-length.test.js index 9a008b28..13384f5d 100644 --- a/src/extractors/generic/content/scoring/score-length.test.js +++ b/src/extractors/generic/content/scoring/score-length.test.js @@ -1,22 +1,21 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; -import { scoreLength } from './index' +import { scoreLength } from './index'; describe('Scoring utils', () => { describe('scoreLength(textLength, tagName)', () => { - it(`returns 0 if length < 50 chars`, () => { - assert.equal(scoreLength(30), 0) - }) + it('returns 0 if length < 50 chars', () => { + assert.equal(scoreLength(30), 0); + }); - it(`returns varying scores but maxes out at 3`, () => { - assert.equal(scoreLength(150), 1) - assert.equal(scoreLength(199), 1.98) - assert.equal(scoreLength(200), 2) - assert.equal(scoreLength(250), 3) - assert.equal(scoreLength(500), 3) - assert.equal(scoreLength(1500), 3) - }) - }) -}) + it('returns varying scores but maxes out at 3', () => { + assert.equal(scoreLength(150), 1); + assert.equal(scoreLength(199), 1.98); + assert.equal(scoreLength(200), 2); + assert.equal(scoreLength(250), 3); + assert.equal(scoreLength(500), 3); + assert.equal(scoreLength(1500), 3); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/score-node.js b/src/extractors/generic/content/scoring/score-node.js index f3ec9a24..1e34aa58 100644 --- a/src/extractors/generic/content/scoring/score-node.js +++ b/src/extractors/generic/content/scoring/score-node.js @@ -1,29 +1,29 @@ -import { scoreParagraph } from './index' +import { scoreParagraph } from './index'; import { PARAGRAPH_SCORE_TAGS, CHILD_CONTENT_TAGS, BAD_TAGS, -} from './constants' +} from './constants'; // Score an individual node. Has some smarts for paragraphs, otherwise // just scores based on tag. export default function scoreNode($node) { - const { tagName } = $node.get(0) + const { tagName } = $node.get(0); // TODO: Consider ordering by most likely. 
// E.g., if divs are a more common tag on a page, // Could save doing that regex test on every node – AP if (PARAGRAPH_SCORE_TAGS.test(tagName)) { - return scoreParagraph($node) + return scoreParagraph($node); } else if (tagName === 'div') { - return 5 + return 5; } else if (CHILD_CONTENT_TAGS.test(tagName)) { - return 3 + return 3; } else if (BAD_TAGS.test(tagName)) { - return -3 + return -3; } else if (tagName === 'th') { - return -5 + return -5; } - return 0 + return 0; } diff --git a/src/extractors/generic/content/scoring/score-node.test.js b/src/extractors/generic/content/scoring/score-node.test.js index 9a6fe540..363639df 100644 --- a/src/extractors/generic/content/scoring/score-node.test.js +++ b/src/extractors/generic/content/scoring/score-node.test.js @@ -1,95 +1,94 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; import { scoreNode, scoreParagraph, -} from './index' +} from './index'; describe('scoreNode(node)', () => { - it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { - const html = '<p><em>Foo</em> bar</p>' - const $ = cheerio.load(html) - let node = $('p').first() + it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { + const html = '<p><em>Foo</em> bar</p>'; + const $ = cheerio.load(html); + const node = $('p').first(); - const score = scoreNode(node) - const pScore = scoreParagraph(node) + const score = scoreNode(node); + const pScore = scoreParagraph(node); - assert.equal(score, pScore) - assert.equal(score, 0) - }) + assert.equal(score, pScore); + assert.equal(score, 0); + }); - it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { - const $ = cheerio.load(HTML.score1) - let node = $('p').first() + it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { + const $ = cheerio.load(HTML.score1); + const node = $('p').first(); - const score = scoreNode(node) - const pScore = scoreParagraph(node) + const score = scoreNode(node); + const pScore = scoreParagraph(node); - assert.equal(score, pScore) - assert.equal(score, 1) + assert.equal(score, pScore); + assert.equal(score, 1); + }); - }) + it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { + const $ = cheerio.load(HTML.score3); + const node = $('p').first(); - it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { - const $ = cheerio.load(HTML.score3) - let node = $('p').first() + const score = scoreNode(node); + const pScore = scoreParagraph(node); - const score = scoreNode(node) - const pScore = scoreParagraph(node) + assert.equal(score, pScore); + assert.equal(score, 3); + }); - assert.equal(score, pScore) - assert.equal(score, 3) - }) + it('scores P, LI, SPAN, and PRE using scoreParagraph', () => { + const $ = cheerio.load(HTML.score19); + const node = $('p').first(); - it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { - const $ = cheerio.load(HTML.score19) - let node = $('p').first() + const score = scoreNode(node); + const pScore = scoreParagraph(node); - const score = scoreNode(node) - const pScore = scoreParagraph(node) + assert.equal(score, pScore); + assert.equal(score, 19); + }); - assert.equal(score, pScore) - assert.equal(score, 19) - }) + it('scores divs with 5', () => { + const $ = cheerio.load(HTML.divScore5); + const node = $('div').first(); - it(`scores divs with 5`, () => { - const $ = cheerio.load(HTML.divScore5) - let node = $('div').first() + const score = scoreNode(node); - const score = 
scoreNode(node) + assert.equal(score, 5); + }); - assert.equal(score, 5) - }) + it('scores the blockquote family with 3', () => { + const $ = cheerio.load(HTML.blockquoteScore3); + const node = $('blockquote').first(); - it(`scores the blockquote family with 3`, () => { - const $ = cheerio.load(HTML.blockquoteScore3) - let node = $('blockquote').first() + const score = scoreNode(node); - const score = scoreNode(node) + assert.equal(score, 3); + }); - assert.equal(score, 3) - }) + it('scores a form with negative 3', () => { + const $ = cheerio.load(HTML.formScoreNeg3); + const node = $('form').first(); - it(`scores a form with negative 3`, () => { - const $ = cheerio.load(HTML.formScoreNeg3) - let node = $('form').first() + const score = scoreNode(node); - const score = scoreNode(node) + assert.equal(score, -3); + }); - assert.equal(score, -3) - }) + it('scores a TH element with negative 5', () => { + const $ = cheerio.load(HTML.thScoreNeg5); + const node = $('th').first(); - it(`scores a TH element with negative 5`, () => { - const $ = cheerio.load(HTML.thScoreNeg5) - let node = $('th').first() + const score = scoreNode(node); - const score = scoreNode(node) - - assert.equal(score, -5) - }) -}) + assert.equal(score, -5); + }); +}); diff --git a/src/extractors/generic/content/scoring/score-paragraph.js b/src/extractors/generic/content/scoring/score-paragraph.js index c1447634..65d6ff1f 100644 --- a/src/extractors/generic/content/scoring/score-paragraph.js +++ b/src/extractors/generic/content/scoring/score-paragraph.js @@ -1,35 +1,35 @@ import { scoreCommas, scoreLength, -} from './index' +} from './index'; // Score a paragraph using various methods. Things like number of // commas, etc. Higher is better. export default function scoreParagraph(node) { - let score = 1 - const text = node.text().trim() - const textLength = text.length + let score = 1; + const text = node.text().trim(); + const textLength = text.length; // If this paragraph is less than 25 characters, don't count it. if (textLength < 25) { - return 0 + return 0; } // Add points for any commas within this paragraph - score = score + scoreCommas(text) + score += scoreCommas(text); // For every 50 characters in this paragraph, add another point. Up // to 3 points. - score = score + scoreLength(textLength) + score += scoreLength(textLength); // Articles can end with short paragraphs when people are being clever // but they can also end with short paragraphs setting up lists of junk // that we strip. This negative tweaks junk setup paragraphs just below // the cutoff threshold. 
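// Illustrative walk-through (an editor's sketch, not part of this patch): a 120-character paragraph containing two commas scores 1 (base) + 2 (scoreCommas) + 2 (scoreLength: one point per full 50 characters, capped at 3) = 5; if the same text ended in ':', the penalty below would drop it to 4.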
if (text.slice(-1) === ':') { - score = score - 1 + score -= 1; } - return score + return score; } diff --git a/src/extractors/generic/content/scoring/score-paragraph.test.js b/src/extractors/generic/content/scoring/score-paragraph.test.js index 7631e4b2..06aa9c0a 100644 --- a/src/extractors/generic/content/scoring/score-paragraph.test.js +++ b/src/extractors/generic/content/scoring/score-paragraph.test.js @@ -1,48 +1,48 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; import { scoreParagraph, -} from './index' +} from './index'; describe('Scoring utils', () => { describe('scoreParagraph(node)', () => { - it(`returns 0 if text is less than 25 chars`, () => { - const html = '<p><em>Foo</em> bar</p>' - const $ = cheerio.load(html) - let node = $('p').first() + it('returns 0 if text is less than 25 chars', () => { + const html = '<p><em>Foo</em> bar</p>'; + const $ = cheerio.load(html); + const node = $('p').first(); - const score = scoreParagraph(node) + const score = scoreParagraph(node); - assert.equal(score, 0) - }) + assert.equal(score, 0); + }); - it(`returns 1 if text is > 25 chars and has 0 commas`, () => { - const $ = cheerio.load(HTML.score1) - let node = $('p').first() + it('returns 1 if text is > 25 chars and has 0 commas', () => { + const $ = cheerio.load(HTML.score1); + const node = $('p').first(); - const score = scoreParagraph(node) + const score = scoreParagraph(node); - assert.equal(score, 1) - }) + assert.equal(score, 1); + }); - it(`returns 3 if text is > 25 chars and has 2 commas`, () => { - const $ = cheerio.load(HTML.score3) - let node = $('p').first() + it('returns 3 if text is > 25 chars and has 2 commas', () => { + const $ = cheerio.load(HTML.score3); + const node = $('p').first(); - const score = scoreParagraph(node) + const score = scoreParagraph(node); - assert.equal(score, 3) - }) + assert.equal(score, 3); + }); - it(`returns 19 if text has 15 commas, ~600 chars`, () => { - const $ = cheerio.load(HTML.score19) - let node = $('p').first() + it('returns 19 if text has 15 commas, ~600 chars', () => { + const $ = cheerio.load(HTML.score19); + const node = $('p').first(); - const score = scoreParagraph(node) + const score = scoreParagraph(node); - assert.equal(score, 19) - }) - }) -}) + assert.equal(score, 19); + }); + }); +}); diff --git a/src/extractors/generic/content/scoring/set-score.js b/src/extractors/generic/content/scoring/set-score.js index 1b0f74ea..ece2051c 100644 --- a/src/extractors/generic/content/scoring/set-score.js +++ b/src/extractors/generic/content/scoring/set-score.js @@ -1,7 +1,6 @@ export default function setScore($node, $, score) { - $node.attr('score', score) - return $node + $node.attr('score', score); + return $node; } - diff --git a/src/extractors/generic/content/scoring/set-score.test.js b/src/extractors/generic/content/scoring/set-score.test.js index f4701820..405b2e59 100644 --- a/src/extractors/generic/content/scoring/set-score.test.js +++ b/src/extractors/generic/content/scoring/set-score.test.js @@ -1,23 +1,22 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; import { setScore, - getScore -} from './index' + getScore, +} from './index'; describe('Scoring utils', () => { - describe('setScore(node, $, amount)', () => { it("sets the specified amount as the node's score", () => { - const $ = cheerio.load('<p>Foo</p>') 
- let $node = $('p').first() + const $ = cheerio.load('<p>Foo</p>'); + let $node = $('p').first(); - const newScore = 25 - $node = setScore($node, $, newScore) + const newScore = 25; + $node = setScore($node, $, newScore); - const score = getScore($node) - assert(score, newScore) - }) - }) -}) + const score = getScore($node); + assert.equal(score, newScore); + }); + }); +}); diff --git a/src/extractors/generic/date-published/constants.js b/src/extractors/generic/date-published/constants.js index 6fb61412..25bfe0da 100644 --- a/src/extractors/generic/date-published/constants.js +++ b/src/extractors/generic/date-published/constants.js @@ -3,23 +3,23 @@ // should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. export const DATE_PUBLISHED_META_TAGS = [ - 'article:published_time', - 'displaydate', - 'dc.date', - 'dc.date.issued', - 'rbpubdate', - 'publish_date', - 'pub_date', - 'pagedate', - 'pubdate', - 'revision_date', - 'doc_date', - 'date_created', - 'content_create_date', - 'lastmodified', - 'created', - 'date' -] + 'article:published_time', + 'displaydate', + 'dc.date', + 'dc.date.issued', + 'rbpubdate', + 'publish_date', + 'pub_date', + 'pagedate', + 'pubdate', + 'revision_date', + 'doc_date', + 'date_created', + 'content_create_date', + 'lastmodified', + 'created', + 'date', +]; // An ordered list of XPath Selectors to find // likely date published dates. From most explicit @@ -42,20 +42,20 @@ export const DATE_PUBLISHED_SELECTORS = [ '#story .datetime', '.dateline', '.pubdate', -] +]; // An ordered list of compiled regular expressions to find likely date // published dates from the URL. These should always have the first // reference be a date string that is parseable by dateutil.parser.parse const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'; export const DATE_PUBLISHED_URL_RES = [ // /2012/01/27/ but not /2012/01/293 - new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), + new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733 // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i, // 2012-01-27 - new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), + new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/ - new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i') -] + new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'), +]; diff --git a/src/extractors/generic/date-published/extractor.js b/src/extractors/generic/date-published/extractor.js index b20771f3..d3240c08 100644 --- a/src/extractors/generic/date-published/extractor.js +++ b/src/extractors/generic/date-published/extractor.js @@ -1,37 +1,36 @@ +import { cleanDatePublished } from 'cleaners'; +import { + extractFromMeta, + extractFromSelectors, +} from 'utils/dom'; +import { extractFromUrl } from 'utils/text'; + import { DATE_PUBLISHED_META_TAGS, DATE_PUBLISHED_SELECTORS, DATE_PUBLISHED_URL_RES, -} from './constants' - -import { cleanDatePublished } from 'cleaners' - -import { - extractFromMeta, - extractFromSelectors, -} from 'utils/dom' -import { extractFromUrl } from 'utils/text' +} from './constants'; const GenericDatePublishedExtractor = { extract({ $, url, metaCache }) { - let datePublished + let datePublished; // First, check to see if we have a matching meta tag // that we can make use of. 
// Don't try cleaning tags from this string - datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false) - if(datePublished) return cleanDatePublished(datePublished) + datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false); + if (datePublished) return cleanDatePublished(datePublished); // Second, look through our selectors looking for potential // date_published's. - datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS) - if(datePublished) return cleanDatePublished(datePublished) + datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS); + if (datePublished) return cleanDatePublished(datePublished); // Lastly, look to see if a dately string exists in the URL - datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES) - if(datePublished) return cleanDatePublished(datePublished) + datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES); + if (datePublished) return cleanDatePublished(datePublished); - return null - } -} + return null; + }, +}; -export default GenericDatePublishedExtractor +export default GenericDatePublishedExtractor; diff --git a/src/extractors/generic/date-published/extractor.test.js b/src/extractors/generic/date-published/extractor.test.js index 7ba1ca85..a62aad38 100644 --- a/src/extractors/generic/date-published/extractor.test.js +++ b/src/extractors/generic/date-published/extractor.test.js @@ -1,97 +1,95 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import moment from 'moment' +import assert from 'assert'; +import cheerio from 'cheerio'; +import moment from 'moment'; -import HTML from './fixtures/html' -import GenericDatePublishedExtractor from './extractor' +import HTML from './fixtures/html'; +import GenericDatePublishedExtractor from './extractor'; describe('GenericDatePublishedExtractor', () => { describe('extract($, metaCache)', () => { it('extracts datePublished from meta tags', () => { - const $ = cheerio.load(HTML.datePublishedMeta.test) - const metaCache = ["displaydate", "something-else"] + const $ = cheerio.load(HTML.datePublishedMeta.test); + const metaCache = ['displaydate', 'something-else']; const result = GenericDatePublishedExtractor.extract( { $, url: '', metaCache } - ) + ); - assert.equal( + assert.equal( result, HTML.datePublishedMeta.result.toISOString() - ) - }) + ); + }); it('extracts datePublished from selectors', () => { - const $ = cheerio.load(HTML.datePublishedSelectors.test) - const metaCache = [] + const $ = cheerio.load(HTML.datePublishedSelectors.test); + const metaCache = []; const result = GenericDatePublishedExtractor.extract( { $, url: '', metaCache } - ) + ); - assert.equal( + assert.equal( result, HTML.datePublishedMeta.result.toISOString() - ) - }) + ); + }); it('extracts from url formatted /2012/08/01/etc', () => { - const $ = cheerio.load('<div></div>') - const metaCache = [] - const url = 'https://example.com/2012/08/01/this-is-good' + const $ = cheerio.load('<div></div>'); + const metaCache = []; + const url = 'https://example.com/2012/08/01/this-is-good'; const result = GenericDatePublishedExtractor.extract( { $, url, metaCache } - ) + ); - assert.equal( + assert.equal( result, new Date('2012/08/01').toISOString() - ) - }) + ); + }); it('extracts from url formatted /2020-01-01', () => { - const $ = cheerio.load('<div></div>') - const metaCache = [] - const url = 'https://example.com/2020-01-01/this-is-good' + const $ = cheerio.load('<div></div>'); + const metaCache = []; + const url = 'https://example.com/2020-01-01/this-is-good'; const 
result = GenericDatePublishedExtractor.extract( { $, url, metaCache } - ) + ); - assert.equal( + assert.equal( result, moment(new Date('2020-01-01')).toISOString() - ) - }) + ); + }); it('extracts from url formatted /2020/jan/01', () => { - const $ = cheerio.load('<div></div>') - const metaCache = [] - const url = 'https://example.com/2020/jan/01/this-is-good' + const $ = cheerio.load('<div></div>'); + const metaCache = []; + const url = 'https://example.com/2020/jan/01/this-is-good'; const result = GenericDatePublishedExtractor.extract( { $, url, metaCache } - ) + ); - assert.equal( + assert.equal( result, new Date('2020/jan/01').toISOString() - ) - }) + ); + }); it('returns null if no date can be found', () => { - const $ = cheerio.load('<div></div>') - const metaCache = [] + const $ = cheerio.load('<div></div>'); + const metaCache = []; const result = GenericDatePublishedExtractor.extract( { $, url: '', metaCache } - ) - - assert.equal(result, null) - }) - - }) -}) + ); + assert.equal(result, null); + }); + }); +}); diff --git a/src/extractors/generic/date-published/fixtures/html.js b/src/extractors/generic/date-published/fixtures/html.js index 01b721d3..573fba1f 100644 --- a/src/extractors/generic/date-published/fixtures/html.js +++ b/src/extractors/generic/date-published/fixtures/html.js @@ -7,7 +7,7 @@ const HTML = { </head> </html> `, - result: new Date('1/1/2020 8:30 (EST)') + result: new Date('1/1/2020 8:30 (EST)'), }, datePublishedSelectors: { test: ` @@ -19,8 +19,8 @@ const HTML = { </head> </div> `, - result: new Date('1/1/2020 8:30 am (EST)') + result: new Date('1/1/2020 8:30 am (EST)'), }, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/dek/extractor.js b/src/extractors/generic/dek/extractor.js index aee71b40..5cf9d1f3 100644 --- a/src/extractors/generic/dek/extractor.js +++ b/src/extractors/generic/dek/extractor.js @@ -1,27 +1,28 @@ -import { - DEK_META_TAGS, - DEK_SELECTORS, - DEK_URL_RES, -} from './constants' +// import { +// DEK_META_TAGS, +// DEK_SELECTORS, +// DEK_URL_RES, +// } from './constants'; -import { cleanDek } from 'cleaners' +// import { cleanDek } from 'cleaners'; -import { - extractFromMeta, - extractFromSelectors, -} from 'utils/dom' +// import { +// extractFromMeta, +// extractFromSelectors, +// } from 'utils/dom'; // Currently there is only one selector for // deks. We should simply return null here // until we have a more robust generic option. // Below is the original source for this, for reference. 
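// Illustrative usage (an editor's sketch, not part of this patch): with generic dek extraction stubbed out, every call resolves to null, which is what the test suite expects:
//
//   import cheerio from 'cheerio';
//   import GenericDekExtractor from './extractor';
//
//   const $ = cheerio.load('<div></div>');
//   GenericDekExtractor.extract({ $, metaCache: [] }); // => null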
const GenericDekExtractor = { - extract({ $, content, metaCache }) { - return null - } -} + // extract({ $, content, metaCache }) { + extract() { + return null; + }, +}; -export default GenericDekExtractor +export default GenericDekExtractor; // def extract_dek(self): // # First, check to see if we have a matching meta tag that we can make diff --git a/src/extractors/generic/dek/extractor.test.js b/src/extractors/generic/dek/extractor.test.js index 221b29ea..b1a1709e 100644 --- a/src/extractors/generic/dek/extractor.test.js +++ b/src/extractors/generic/dek/extractor.test.js @@ -1,20 +1,18 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; // import HTML from './fixtures/html' -import GenericDekExtractor from './extractor' +import GenericDekExtractor from './extractor'; describe('GenericDekExtractor', () => { describe('extract({ $, metaCache })', () => { - it('returns null if no dek can be found', () => { - const $ = cheerio.load('<div></div>') - const metaCache = [] + const $ = cheerio.load('<div></div>'); + const metaCache = []; const result = - GenericDekExtractor.extract({ $, metaCache }) - - assert.equal(result, null) - }) + GenericDekExtractor.extract({ $, metaCache }); - }) -}) + assert.equal(result, null); + }); + }); +}); diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js index 0a3ed9c5..90c39bf3 100644 --- a/src/extractors/generic/index.js +++ b/src/extractors/generic/index.js @@ -1,50 +1,50 @@ -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import GenericContentExtractor from './content/extractor' -import GenericTitleExtractor from './title/extractor' -import GenericAuthorExtractor from './author/extractor' -import GenericDatePublishedExtractor from './date-published/extractor' -import GenericDekExtractor from './dek/extractor' -import GenericLeadImageUrlExtractor from './lead-image-url/extractor' -import GenericNextPageUrlExtractor from './next-page-url/extractor' +import GenericContentExtractor from './content/extractor'; +import GenericTitleExtractor from './title/extractor'; +import GenericAuthorExtractor from './author/extractor'; +import GenericDatePublishedExtractor from './date-published/extractor'; +import GenericDekExtractor from './dek/extractor'; +import GenericLeadImageUrlExtractor from './lead-image-url/extractor'; +import GenericNextPageUrlExtractor from './next-page-url/extractor'; const GenericExtractor = { // This extractor is the default for all domains domain: '*', title: GenericTitleExtractor.extract, - datePublished : GenericDatePublishedExtractor.extract, + datePublished: GenericDatePublishedExtractor.extract, author: GenericAuthorExtractor.extract, content: GenericContentExtractor.extract.bind(GenericContentExtractor), leadImageUrl: GenericLeadImageUrlExtractor.extract, dek: GenericDekExtractor.extract, nextPageUrl: GenericNextPageUrlExtractor.extract, - extract: function(options) { - let { html } = options + extract(options) { + const { html } = options; if (html) { - const $ = cheerio.load(html) - options.$ = $ + const $ = cheerio.load(html); + options.$ = $; } - const title = this.title(options) - const datePublished = this.datePublished(options) - const author = this.author(options) - const content = this.content({ ...options, title }) - const leadImageUrl = this.leadImageUrl(options) - const dek = this.dek(options) - const nextPageUrl = this.nextPageUrl(options) + const title = this.title(options); + const datePublished = 
this.datePublished(options); + const author = this.author(options); + const content = this.content({ ...options, title }); + const leadImageUrl = this.leadImageUrl(options); + const dek = this.dek(options); + const nextPageUrl = this.nextPageUrl(options); return { title, author, - datePublished: datePublished ? datePublished : null, + datePublished: datePublished || null, dek, leadImageUrl, content, nextPageUrl, - } - } -} + }; + }, +}; -export default GenericExtractor +export default GenericExtractor; diff --git a/src/extractors/generic/index.test.js b/src/extractors/generic/index.test.js index 4d463593..7893b2c4 100644 --- a/src/extractors/generic/index.test.js +++ b/src/extractors/generic/index.test.js @@ -1,14 +1,12 @@ -import assert from 'assert' -import fs from 'fs' +import assert from 'assert'; +import fs from 'fs'; -import { clean } from 'test-helpers' - -import GenericExtractor from './index' +import GenericExtractor from './index'; describe('GenericExtractor', () => { describe('extract(opts)', () => { - it("extracts this old LA Times article", () => { - const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') + it('extracts this old LA Times article', () => { + const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8'); const { title, @@ -16,23 +14,23 @@ describe('GenericExtractor', () => { datePublished, dek, } = GenericExtractor.extract( - { url: "http://latimes.com", html, metaCache: [] } - ) + { url: 'http://latimes.com', html, metaCache: [] } + ); - assert.equal(author, null) + assert.equal(author, null); assert.equal( title, 'California appears poised to be first to ban power-guzzling big-screen TVs' - ) + ); assert.equal( datePublished, '2009-10-14T04:00:00.000Z' - ) - assert.equal(dek, null) - }) + ); + assert.equal(dek, null); + }); - it("extracts html and returns the article title", () => { - const html = fs.readFileSync('../fixtures/wired.html', 'utf-8') + it('extracts html and returns the article title', () => { + const html = fs.readFileSync('../fixtures/wired.html', 'utf-8'); const { author, @@ -40,18 +38,17 @@ describe('GenericExtractor', () => { datePublished, dek, } = GenericExtractor.extract( - { url: "http://wired.com", html, metaCache: [] } - ) + { url: 'http://wired.com', html, metaCache: [] } + ); - assert.equal(author, 'Eric Adams') + assert.equal(author, 'Eric Adams'); assert.equal( title, 'Airplane Tires Don’t Explode on Landing Because They Are Pumped!' - ) - assert.equal(datePublished, null) - assert.equal(dek, null) - }) - - }) -}) + ); + assert.equal(datePublished, null); + assert.equal(dek, null); + }); + }); +}); diff --git a/src/extractors/generic/lead-image-url/constants.js b/src/extractors/generic/lead-image-url/constants.js index 07556786..8f70ae81 100644 --- a/src/extractors/generic/lead-image-url/constants.js +++ b/src/extractors/generic/lead-image-url/constants.js @@ -2,52 +2,52 @@ // All attributes should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. 
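// Illustrative matches (an editor's sketch, not part of this patch): the hint lists below compile into simple alternation regexes, so a URL like 'http://example.com/wp-content/uploads/photo.jpg' hits the positive hints ('wp-content', 'upload', 'photo'), while 'http://example.com/icons/spacer.gif' hits the negative hints ('icon', 'spacer') as well as the GIF_RE extension check.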
export const LEAD_IMAGE_URL_META_TAGS = [ - 'og:image', - 'twitter:image', - 'image_src', -] + 'og:image', + 'twitter:image', + 'image_src', +]; export const LEAD_IMAGE_URL_SELECTORS = [ 'link[rel=image_src]', -] +]; export const POSITIVE_LEAD_IMAGE_URL_HINTS = [ - 'upload', - 'wp-content', - 'large', - 'photo', - 'wp-image', -] -export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') + 'upload', + 'wp-content', + 'large', + 'photo', + 'wp-image', +]; +export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i'); export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [ - 'spacer', - 'sprite', - 'blank', - 'throbber', - 'gradient', - 'tile', - 'bg', - 'background', - 'icon', - 'social', - 'header', - 'hdr', - 'advert', - 'spinner', - 'loader', - 'loading', - 'default', - 'rating', - 'share', - 'facebook', - 'twitter', - 'theme', - 'promo', - 'ads', - 'wp-includes', -] -export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') + 'spacer', + 'sprite', + 'blank', + 'throbber', + 'gradient', + 'tile', + 'bg', + 'background', + 'icon', + 'social', + 'header', + 'hdr', + 'advert', + 'spinner', + 'loader', + 'loading', + 'default', + 'rating', + 'share', + 'facebook', + 'twitter', + 'theme', + 'promo', + 'ads', + 'wp-includes', +]; +export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i'); -export const GIF_RE = /\.gif(\?.*)?$/i -export const JPG_RE = /\.jpe?g(\?.*)?$/i +export const GIF_RE = /\.gif(\?.*)?$/i; +export const JPG_RE = /\.jpe?g(\?.*)?$/i; diff --git a/src/extractors/generic/lead-image-url/extractor.js b/src/extractors/generic/lead-image-url/extractor.js index 73074b77..13200df2 100644 --- a/src/extractors/generic/lead-image-url/extractor.js +++ b/src/extractors/generic/lead-image-url/extractor.js @@ -1,14 +1,12 @@ -import 'babel-polyfill' +import 'babel-polyfill'; + +import { extractFromMeta } from 'utils/dom'; +import { cleanImage } from 'cleaners'; import { LEAD_IMAGE_URL_META_TAGS, LEAD_IMAGE_URL_SELECTORS, -} from './constants' - -import { - extractFromMeta, - extractFromSelectors -} from 'utils/dom' +} from './constants'; import { scoreImageUrl, @@ -17,9 +15,7 @@ import { scoreBySibling, scoreByDimensions, scoreByPosition, -} from './score-image' - -import { cleanImage } from 'cleaners' +} from './score-image'; // Given a resource, try to find the lead image URL from within // it. Like content and next page extraction, uses a scoring system @@ -31,86 +27,87 @@ import { cleanImage } from 'cleaners' // * weird aspect ratio const GenericLeadImageUrlExtractor = { extract({ $, content, metaCache }) { - let imageUrl, cleanUrl + let cleanUrl; // Check to see if we have a matching meta tag that we can make use of. // Moving this higher because common practice is now to use large // images on things like Open Graph or Twitter cards. // images usually have for things like Open Graph. - imageUrl = + const imageUrl = extractFromMeta( $, LEAD_IMAGE_URL_META_TAGS, metaCache, false - ) + ); if (imageUrl) { - cleanUrl = cleanImage(imageUrl) + cleanUrl = cleanImage(imageUrl); - if (cleanUrl) return cleanUrl + if (cleanUrl) return cleanUrl; } // Next, try to find the "best" image via the content. // We'd rather not have to fetch each image and check dimensions, // so try to do some analysis and determine them instead. 
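// Illustrative walk-through (an editor's sketch, not part of this patch): an <img src="http://example.com/upload/lead.jpg" alt="Lead" /> nested in a <figure> would start at 20 (positive URL hint) + 10 (JPG_RE) from scoreImageUrl, then pick up +5 from scoreAttr (alt attribute) and +25 from scoreByParents (figure ancestor) before the sibling, dimension, and position scores are added below.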
- const imgs = $('img', content).toArray() - let imgScores = {} + const imgs = $('img', content).toArray(); + const imgScores = {}; imgs.forEach((img, index) => { - const $img = $(img) - const src = $img.attr('src') + const $img = $(img); + const src = $img.attr('src'); - if (!src) return + if (!src) return; - let score = scoreImageUrl(src) - score = score + scoreAttr($img) - score = score + scoreByParents($img) - score = score + scoreBySibling($img) - score = score + scoreByDimensions($img) - score = score + scoreByPosition(imgs, index) + let score = scoreImageUrl(src); + score += scoreAttr($img); + score += scoreByParents($img); + score += scoreBySibling($img); + score += scoreByDimensions($img); + score += scoreByPosition(imgs, index); - imgScores[src] = score - }) + imgScores[src] = score; + }); const [topUrl, topScore] = Reflect.ownKeys(imgScores).reduce((acc, key) => imgScores[key] > acc[1] ? [key, imgScores[key]] : acc - , [null, 0]) + , [null, 0]); if (topScore > 0) { - cleanUrl = cleanImage(topUrl) + cleanUrl = cleanImage(topUrl); - if (cleanUrl) return cleanUrl + if (cleanUrl) return cleanUrl; } // If nothing else worked, check to see if there are any really // probable nodes in the doc, like <link rel="image_src" />. for (const selector of LEAD_IMAGE_URL_SELECTORS) { - const $node = $(selector).first() - const src = $node.attr('src') + const $node = $(selector).first(); + const src = $node.attr('src'); if (src) { - cleanUrl = cleanImage(src) - if (cleanUrl) return cleanUrl + cleanUrl = cleanImage(src); + if (cleanUrl) return cleanUrl; } - const href = $node.attr('href') + const href = $node.attr('href'); if (href) { - cleanUrl = cleanImage(href) - if (cleanUrl) return cleanUrl + cleanUrl = cleanImage(href); + if (cleanUrl) return cleanUrl; } - const value = $node.attr('value') + const value = $node.attr('value'); if (value) { - cleanUrl = cleanImage(value) - if (cleanUrl) return cleanUrl + cleanUrl = cleanImage(value); + if (cleanUrl) return cleanUrl; } } + return null; }, -} +}; -export default GenericLeadImageUrlExtractor +export default GenericLeadImageUrlExtractor; // def extract(self): // """ @@ -182,7 +179,7 @@ export default GenericLeadImageUrlExtractor // if sibling is not None: // if sibling.tag == 'figcaption': // img_score += 25 -// +// // sib_sig = ' '.join([sibling.get('id', ''), // sibling.get('class', '')]).lower() // if 'caption' in sib_sig: @@ -215,7 +212,7 @@ export default GenericLeadImageUrlExtractor // // if img_width and img_height and not 'sprite' in img_path: // area = img_width * img_height -// +// // if area < 5000: # Smaller than 50x100 // logger.debug('Image with small area found. 
Subtracting 100.') // img_score -= 100 diff --git a/src/extractors/generic/lead-image-url/extractor.test.js b/src/extractors/generic/lead-image-url/extractor.test.js index 459a4c66..1acd314e 100644 --- a/src/extractors/generic/lead-image-url/extractor.test.js +++ b/src/extractors/generic/lead-image-url/extractor.test.js @@ -1,62 +1,62 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; -import GenericLeadImageUrlExtractor from './extractor' +import GenericLeadImageUrlExtractor from './extractor'; describe('GenericLeadImageUrlExtractor', () => { describe('extract({ $, content, metaCache })', () => { it('returns og:image first', () => { - const $ = cheerio.load(HTML.og.test) - const content = $('*').first() - const metaCache = ['og:image'] + const $ = cheerio.load(HTML.og.test); + const content = $('*').first(); + const metaCache = ['og:image']; const result = GenericLeadImageUrlExtractor.extract( { $, content, metaCache } - ) + ); - assert.equal(result, HTML.og.result) - }) + assert.equal(result, HTML.og.result); + }); it('returns twitter:image', () => { - const $ = cheerio.load(HTML.twitter.test) - const content = $('*').first() - const metaCache = ['twitter:image'] + const $ = cheerio.load(HTML.twitter.test); + const content = $('*').first(); + const metaCache = ['twitter:image']; const result = GenericLeadImageUrlExtractor.extract( { $, content, metaCache } - ) + ); - assert.equal(result, HTML.twitter.result) - }) + assert.equal(result, HTML.twitter.result); + }); it('finds images based on scoring', () => { - const $ = cheerio.load(HTML.scoring.test) - const content = $('*').first() - const metaCache = [] + const $ = cheerio.load(HTML.scoring.test); + const content = $('*').first(); + const metaCache = []; const result = GenericLeadImageUrlExtractor.extract( { $, content, metaCache } - ) + ); - assert.equal(result, HTML.scoring.result) - }) + assert.equal(result, HTML.scoring.result); + }); it('returns image based on selectors', () => { - const $ = cheerio.load(HTML.selectors.test) - const content = $('*').first() - const metaCache = [] + const $ = cheerio.load(HTML.selectors.test); + const content = $('*').first(); + const metaCache = []; const result = GenericLeadImageUrlExtractor.extract( { $, content, metaCache } - ) + ); - assert.equal(result, HTML.selectors.result) - }) - }) -}) + assert.equal(result, HTML.selectors.result); + }); + }); +}); diff --git a/src/extractors/generic/lead-image-url/fixtures/html.js b/src/extractors/generic/lead-image-url/fixtures/html.js index 2af7b519..917c5fb8 100644 --- a/src/extractors/generic/lead-image-url/fixtures/html.js +++ b/src/extractors/generic/lead-image-url/fixtures/html.js @@ -7,7 +7,7 @@ const HTML = { </head> </html> `, - result: `http://example.com/lead.jpg` + result: 'http://example.com/lead.jpg', }, twitter: { test: ` @@ -17,7 +17,7 @@ const HTML = { </head> </html> `, - result: `http://example.com/lead.jpg` + result: 'http://example.com/lead.jpg', }, scoring: { test: ` @@ -27,7 +27,7 @@ const HTML = { <img src="http://example.com/upload/whateverpic.png" /> </div> `, - result: `http://example.com/upload/goodpic.jpg` + result: 'http://example.com/upload/goodpic.jpg', }, selectors: { test: ` @@ -35,8 +35,8 @@ const HTML = { <link rel="image_src" href="http://example.com/upload/goodpic.jpg"> </div> `, - result: `http://example.com/upload/goodpic.jpg` + result: 
'http://example.com/upload/goodpic.jpg', }, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/lead-image-url/score-image.js b/src/extractors/generic/lead-image-url/score-image.js index c55ae028..d22b983d 100644 --- a/src/extractors/generic/lead-image-url/score-image.js +++ b/src/extractors/generic/lead-image-url/score-image.js @@ -3,123 +3,123 @@ import { NEGATIVE_LEAD_IMAGE_URL_HINTS_RE, GIF_RE, JPG_RE, -} from './constants' +} from './constants'; -import { PHOTO_HINTS_RE } from '../content/scoring/constants' +import { PHOTO_HINTS_RE } from '../content/scoring/constants'; + +function getSig($node) { + return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`; +} // Scores image urls based on a variety of heuristics. export function scoreImageUrl(url) { - url = url.trim() - let score = 0 + url = url.trim(); + let score = 0; if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { - score = score + 20 + score += 20; } if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { - score = score - 20 + score -= 20; } // TODO: We might want to consider removing this as // gifs are much more common/popular than they once were if (GIF_RE.test(url)) { - score = score - 10 + score -= 10; } if (JPG_RE.test(url)) { - score = score + 10 + score += 10; } // PNGs are neutral. - return score + return score; } // Alt attribute usually means non-presentational image. export function scoreAttr($img) { if ($img.attr('alt')) { - return 5 - } else { - return 0 + return 5; } + + return 0; } // Look through our parent and grandparent for figure-like // container elements, give a bonus if we find them export function scoreByParents($img) { - let score = 0 - const $figParent = $img.parents('figure').first() + let score = 0; + const $figParent = $img.parents('figure').first(); if ($figParent.length === 1) { - score = score + 25 + score += 25; } - const $parent = $img.parent() - let $gParent + const $parent = $img.parent(); + let $gParent; if ($parent.length === 1) { - $gParent = $parent.parent() + $gParent = $parent.parent(); } - [$parent, $gParent].forEach($node => { + [$parent, $gParent].forEach(($node) => { if (PHOTO_HINTS_RE.test(getSig($node))) { - score = score + 15 + score += 15; } - }) + }); - return score + return score; } // Look at our immediate sibling and see if it looks like it's a // caption. Bonus if so. 
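// Illustrative example (an editor's sketch, not part of this patch): in '<img /><figcaption>Wow</figcaption>' the image's next sibling is a figcaption, earning the +25 below; a sibling with a caption-like class or id (matching PHOTO_HINTS_RE) earns +15 instead.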
export function scoreBySibling($img) { - let score = 0 - const $sibling = $img.next() - const sibling = $sibling.get(0) + let score = 0; + const $sibling = $img.next(); + const sibling = $sibling.get(0); if (sibling && sibling.tagName === 'figcaption') { - score = score + 25 + score += 25; } if (PHOTO_HINTS_RE.test(getSig($sibling))) { - score = score + 15 + score += 15; } - return score + return score; } export function scoreByDimensions($img) { - let score = 0 + let score = 0; - const width = parseFloat($img.attr('width')) - const height = parseFloat($img.attr('height')) - const src = $img.attr('src') + const width = parseFloat($img.attr('width')); + const height = parseFloat($img.attr('height')); + const src = $img.attr('src'); // Penalty for skinny images if (width && width <= 50) { - score = score - 50 + score -= 50; } // Penalty for short images if (height && height <= 50) { - score = score - 50 + score -= 50; } if (width && height && !src.includes('sprite')) { - const area = width * height + const area = width * height; if (area < 5000) { // Smaller than 50 x 100 - score = score - 100 + score -= 100; } else { - score = score + Math.round(area/1000) + score += Math.round(area / 1000); } } - return score + return score; } export function scoreByPosition($imgs, index) { - return $imgs.length/2 - index -} - -function getSig($node) { - return `${$node.attr('class') || ''} ${$node.attr('id') || ''}` + return ($imgs.length / 2) - index; } diff --git a/src/extractors/generic/lead-image-url/score-image.test.js b/src/extractors/generic/lead-image-url/score-image.test.js index c17d71a6..58ffa59e 100644 --- a/src/extractors/generic/lead-image-url/score-image.test.js +++ b/src/extractors/generic/lead-image-url/score-image.test.js @@ -1,5 +1,5 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; import { scoreImageUrl, @@ -8,61 +8,61 @@ import { scoreBySibling, scoreByDimensions, scoreByPosition, -} from './score-image' +} from './score-image'; describe('scoreImageUrlUrl(url)', () => { it('gets 20 points for a positive lead img hint', () => { - const url = 'http://example.com/upload/img.png' + const url = 'http://example.com/upload/img.png'; - assert.equal(scoreImageUrl(url), 20) - }) + assert.equal(scoreImageUrl(url), 20); + }); it('loses 20 points for a negative lead img hint', () => { - const url = 'http://example.com/sprite/foo/bar.png' + const url = 'http://example.com/sprite/foo/bar.png'; - assert.equal(scoreImageUrl(url), -20) - }) + assert.equal(scoreImageUrl(url), -20); + }); it('loses 10 points for a gif', () => { - const url = 'http://example.com/foo/bar.gif' + const url = 'http://example.com/foo/bar.gif'; - assert.equal(scoreImageUrl(url), -10) + assert.equal(scoreImageUrl(url), -10); - const url2 = 'http://example.com/foogif/bar' + const url2 = 'http://example.com/foogif/bar'; - assert.equal(scoreImageUrl(url2), 0) - }) + assert.equal(scoreImageUrl(url2), 0); + }); it('gains 10 points for a jpg', () => { - const url = 'http://example.com/foo/bar.jpg' - assert.equal(scoreImageUrl(url), 10) + const url = 'http://example.com/foo/bar.jpg'; + assert.equal(scoreImageUrl(url), 10); - const url2 = 'http://example.com/foo/bar.jpeg' - assert.equal(scoreImageUrl(url2), 10) + const url2 = 'http://example.com/foo/bar.jpeg'; + assert.equal(scoreImageUrl(url2), 10); - const url3 = 'http://example.com/foojpg/bar' - assert.equal(scoreImageUrl(url3), 0) + const url3 = 'http://example.com/foojpg/bar'; + 
assert.equal(scoreImageUrl(url3), 0); - const url4 = 'http://example.com/foo.jpg?bar=baz' - assert.equal(scoreImageUrl(url4), 10) - }) -}) + const url4 = 'http://example.com/foo.jpg?bar=baz'; + assert.equal(scoreImageUrl(url4), 10); + }); +}); describe('scoreAttr($img)', () => { it('gets 5 points if the img node has an alt attribute', () => { - const $ = cheerio.load('<div><img alt="Wow" /></div>') - const $img = $('img').first() + const $ = cheerio.load('<div><img alt="Wow" /></div>'); + const $img = $('img').first(); - assert.equal(scoreAttr($img), 5) - }) + assert.equal(scoreAttr($img), 5); + }); it('gets 0 points if the img node has no alt attribute', () => { - const $ = cheerio.load('<div><img /></div>') - const $img = $('img').first() + const $ = cheerio.load('<div><img /></div>'); + const $img = $('img').first(); - assert.equal(scoreAttr($img), 0) - }) -}) + assert.equal(scoreAttr($img), 0); + }); +}); describe('scoreByParents($img)', () => { it('gets 25 points if it has a figure parent', () => { @@ -74,18 +74,18 @@ describe('scoreByParents($img)', () => { </div> </figure> </div>` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByParents($img), 25) - }) + assert.equal(scoreByParents($img), 25); + }); it('gets 0 points if the img has no figure parent', () => { - const $ = cheerio.load('<div><img /></div>') - const $img = $('img').first() + const $ = cheerio.load('<div><img /></div>'); + const $img = $('img').first(); - assert.equal(scoreByParents($img), 0) - }) + assert.equal(scoreByParents($img), 0); + }); it('gets 15 points if parent or gparent has photo hints', () => { const $ = cheerio.load( @@ -96,12 +96,12 @@ describe('scoreByParents($img)', () => { </div> </div> </div>` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByParents($img), 15) - }) -}) + assert.equal(scoreByParents($img), 15); + }); +}); describe('scoreBySibling($img)', () => { it('gets 25 points if its sibling is figcaption', () => { @@ -112,11 +112,11 @@ describe('scoreBySibling($img)', () => { <figcaption>Wow</figcaption> </div> ` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreBySibling($img), 25) - }) + assert.equal(scoreBySibling($img), 25); + }); it('gets 15 points if its sibling has photo hints', () => { const $ = cheerio.load( @@ -128,12 +128,12 @@ describe('scoreBySibling($img)', () => { </div> </div> </div>` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreBySibling($img), 15) - }) -}) + assert.equal(scoreBySibling($img), 15); + }); +}); describe('scoreByDimensions($img)', () => { it('penalizes skinny images', () => { @@ -143,11 +143,11 @@ describe('scoreByDimensions($img)', () => { <img width="10" /> </div> ` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByDimensions($img), -50) - }) + assert.equal(scoreByDimensions($img), -50); + }); it('penalizes short images', () => { const $ = cheerio.load( @@ -156,11 +156,11 @@ describe('scoreByDimensions($img)', () => { <img height="10" /> </div> ` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByDimensions($img), -50) - }) + assert.equal(scoreByDimensions($img), -50); + }); it('ignores sprites', () => { const $ = cheerio.load( @@ -169,11 +169,11 @@ describe('scoreByDimensions($img)', () => { <img src="/sprite/etc/foo.png" width="1000" height="1000" /> </div> ` - ) - const $img = 
$('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByDimensions($img), 0) - }) + assert.equal(scoreByDimensions($img), 0); + }); it('penalizes images with small areas', () => { const $ = cheerio.load( @@ -182,11 +182,11 @@ describe('scoreByDimensions($img)', () => { <img src="/etc/foo.png" width="60" height="60" /> </div> ` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByDimensions($img), -100) - }) + assert.equal(scoreByDimensions($img), -100); + }); it('prefers the largest images', () => { const $ = cheerio.load( @@ -195,13 +195,12 @@ describe('scoreByDimensions($img)', () => { <img src="/etc/foo.png" width="1000" height="1000" /> </div> ` - ) - const $img = $('img').first() + ); + const $img = $('img').first(); - assert.equal(scoreByDimensions($img), 1000) - }) - -}) + assert.equal(scoreByDimensions($img), 1000); + }); +}); describe('scoreByPosition($imgs, index)', () => { it('gives higher scores to images that come first', () => { @@ -216,10 +215,10 @@ describe('scoreByPosition($imgs, index)', () => { <img width="10" /> </div> ` - ) - const $imgs = $('img') + ); + const $imgs = $('img'); - assert.equal(scoreByPosition($imgs, 0), 3) - }) -}) + assert.equal(scoreByPosition($imgs, 0), 3); + }); +}); diff --git a/src/extractors/generic/next-page-url/extractor.js b/src/extractors/generic/next-page-url/extractor.js index 7b927245..dee033b5 100644 --- a/src/extractors/generic/next-page-url/extractor.js +++ b/src/extractors/generic/next-page-url/extractor.js @@ -1,25 +1,22 @@ -import 'babel-polyfill' -import URL from 'url' +import 'babel-polyfill'; +import URL from 'url'; import { - pageNumFromUrl, articleBaseUrl, removeAnchor, -} from 'utils/text' -import scoreLinks from './scoring/score-links' +} from 'utils/text'; +import scoreLinks from './scoring/score-links'; // Looks for and returns next page url // for multi-page articles const GenericNextPageUrlExtractor = { - extract({ $, url, parsedUrl, previousUrls=[] }) { - parsedUrl = parsedUrl || URL.parse(url) + extract({ $, url, parsedUrl, previousUrls = [] }) { + parsedUrl = parsedUrl || URL.parse(url); - const currentPageNum = pageNumFromUrl(url) - const articleUrl = removeAnchor(url) - const baseUrl = articleBaseUrl(url, parsedUrl) - const { host } = parsedUrl + const articleUrl = removeAnchor(url); + const baseUrl = articleBaseUrl(url, parsedUrl); - const links = $('a[href]').toArray() + const links = $('a[href]').toArray(); const scoredLinks = scoreLinks({ links, @@ -27,28 +24,28 @@ const GenericNextPageUrlExtractor = { baseUrl, parsedUrl, $, - previousUrls - }) + previousUrls, + }); // If no links were scored, return null - if (!scoredLinks) return null + if (!scoredLinks) return null; // now that we've scored all possible pages, // find the biggest one. const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => { - const scoredLink = scoredLinks[link] - return scoredLink.score > acc.score ? scoredLink : acc - }, { score: -100 }) + const scoredLink = scoredLinks[link]; + return scoredLink.score > acc.score ? scoredLink : acc; + }, { score: -100 }); // If the score is less than 50, we're not confident enough to use it, // so we fail. 
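// Illustrative thresholds (an editor's sketch, not part of this patch): a best candidate scoring 62 is returned as the next-page URL below, while a best candidate scoring 35 falls under the cutoff and the article is treated as single-page (null).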
if (topPage.score >= 50) { - return topPage.href - } else { - return null + return topPage.href; } - } -} + return null; + }, +}; -export default GenericNextPageUrlExtractor + +export default GenericNextPageUrlExtractor; diff --git a/src/extractors/generic/next-page-url/extractor.test.js b/src/extractors/generic/next-page-url/extractor.test.js index 68767c64..55ddebb8 100644 --- a/src/extractors/generic/next-page-url/extractor.test.js +++ b/src/extractors/generic/next-page-url/extractor.test.js @@ -1,34 +1,34 @@ -import assert from 'assert' -import fs from 'fs' -import cheerio from 'cheerio' +import assert from 'assert'; +import fs from 'fs'; +import cheerio from 'cheerio'; -import GenericNextPageUrlExtractor from './extractor' +import GenericNextPageUrlExtractor from './extractor'; describe('GenericNextPageUrlExtractor', () => { it('returns most likely next page url', () => { - const html = fs.readFileSync('./fixtures/ars.html', 'utf8') - const $ = cheerio.load(html) - const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' - const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2' + const html = fs.readFileSync('./fixtures/ars.html', 'utf8'); + const $ = cheerio.load(html); + const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; + const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'; const nextPage = GenericNextPageUrlExtractor.extract({ $, - url - }) + url, + }); - assert.equal(nextPage, next) - }) + assert.equal(nextPage, next); + }); it('returns null if there is no likely next page', () => { - const html = `<div><p>HI</p></div>` - const $ = cheerio.load(html) - const url = 'http://example.com/foo/bar' + const html = '<div><p>HI</p></div>'; + const $ = cheerio.load(html); + const url = 'http://example.com/foo/bar'; const nextPage = GenericNextPageUrlExtractor.extract({ $, - url - }) + url, + }); - assert.equal(nextPage, null) - }) -}) + assert.equal(nextPage, null); + }); +}); diff --git a/src/extractors/generic/next-page-url/scoring/constants.js b/src/extractors/generic/next-page-url/scoring/constants.js index 35c97666..a2f8cb9b 100644 --- a/src/extractors/generic/next-page-url/scoring/constants.js +++ b/src/extractors/generic/next-page-url/scoring/constants.js @@ -1,38 +1,38 @@ -export const DIGIT_RE = /\d/ +export const DIGIT_RE = /\d/; // A list of words that, if found in link text or URLs, likely mean that // this link is not a next page link. export const EXTRANEOUS_LINK_HINTS = [ - 'print', - 'archive', - 'comment', - 'discuss', - 'e-mail', - 'email', - 'share', - 'reply', - 'all', - 'login', - 'sign', - 'single', - 'adx', - 'entry-unrelated' -] -export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i') + 'print', + 'archive', + 'comment', + 'discuss', + 'e-mail', + 'email', + 'share', + 'reply', + 'all', + 'login', + 'sign', + 'single', + 'adx', + 'entry-unrelated', +]; +export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i'); // Match any link text/classname/id that looks like it could mean the next // page. Things like: next, continue, >, >>, » but not >|, »| as those can // mean last page. 
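// Illustrative matches (an editor's sketch, not part of this patch): 'next', 'Next page', 'continue', '>' and '»' all match NEXT_LINK_TEXT_RE below, while '>|' and '»|' do not, because the ([^|]|$) group requires the arrow to be followed by a non-pipe character or the end of the string.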
-export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i') +export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i'); // Match any link text/classname/id that looks like it is an end link: things // like "first", "last", "end", etc. -export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i') +export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous // page. -export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i') +export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match any phrase that looks like it could be page, or paging, or pagination -export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i') +export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i'); diff --git a/src/extractors/generic/next-page-url/scoring/score-links.js b/src/extractors/generic/next-page-url/scoring/score-links.js index 87333bb1..0fb2e753 100644 --- a/src/extractors/generic/next-page-url/scoring/score-links.js +++ b/src/extractors/generic/next-page-url/scoring/score-links.js @@ -1,27 +1,32 @@ -import 'babel-polyfill' -import URL from 'url' -import difflib from 'difflib' +import 'babel-polyfill'; +import URL from 'url'; -import { range } from 'utils' -import { isWordpress } from 'utils/dom' +import { isWordpress } from 'utils/dom'; import { removeAnchor, pageNumFromUrl, -} from 'utils/text' -import { - DIGIT_RE, - NEXT_LINK_TEXT_RE, - PREV_LINK_TEXT_RE, - EXTRANEOUS_LINK_HINTS_RE, - CAP_LINK_TEXT_RE, - PAGE_RE, -} from './constants' +} from 'utils/text'; import { - NEGATIVE_SCORE_RE, - POSITIVE_SCORE_RE, -} from 'utils/dom/constants' -import { IS_DIGIT_RE } from 'utils/text/constants' + scoreSimilarity, + scoreLinkText, + scorePageInLink, + scoreExtraneousLinks, + scoreByParents, + scorePrevLink, + shouldScore, + scoreBaseUrl, + scoreCapLinks, + scoreNextLinkText, +} from './utils'; + +export function makeBaseRegex(baseUrl) { + return new RegExp(`^${baseUrl}`, 'i'); +} + +function makeSig($link, linkText) { + return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`; +} export default function scoreLinks({ links, @@ -29,11 +34,11 @@ export default function scoreLinks({ baseUrl, parsedUrl, $, - previousUrls=[] + previousUrls = [], }) { - parsedUrl = parsedUrl || URL.parse(articleUrl) - const baseRegex = makeBaseRegex(baseUrl) - const isWp = isWordpress($) + parsedUrl = parsedUrl || URL.parse(articleUrl); + const baseRegex = makeBaseRegex(baseUrl); + const isWp = isWordpress($); // Loop through all links, looking for hints that they may be next-page // links. Things like having "page" in their textContent, className or @@ -46,12 +51,12 @@ export default function scoreLinks({ // Remove any anchor data since we don't do a good job // standardizing URLs (it's hard), we're going to do // some checking with and without a trailing slash - let href = removeAnchor(link.attribs.href) - const $link = $(link) - const linkText = $link.text() + const href = removeAnchor(link.attribs.href); + const $link = $(link); + const linkText = $link.text(); if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) { - return possiblePages + return possiblePages; } // ## PASSED THE FIRST-PASS TESTS. Start scoring. 
## @@ -60,242 +65,29 @@ export default function scoreLinks({ score: 0, linkText, href, - } + }; } else { - possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}` - } - - const possiblePage = possiblePages[href] - const linkData = makeSig($link, linkText) - const pageNum = pageNumFromUrl(href) - - let score = scoreBaseUrl(href, baseRegex) - score = score + scoreNextLinkText(linkData) - score = score + scoreCapLinks(linkData) - score = score + scorePrevLink(linkData) - score = score + scoreByParents($link) - score = score + scoreExtraneousLinks(href) - score = score + scorePageInLink(pageNum, isWp) - score = score + scoreLinkText(linkText, pageNum) - score = score + scoreSimilarity(score, articleUrl, href) - - possiblePage.score = score - - return possiblePages - }, {}) - - return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages -} - -export function makeBaseRegex(baseUrl) { - return new RegExp(`^${baseUrl}`, 'i') -} - -export function scoreSimilarity(score, articleUrl, href) { - // Do this last and only if we have a real candidate, because it's - // potentially expensive computationally. Compare the link to this - // URL using difflib to get the % similarity of these URLs. On a - // sliding scale, subtract points from this link based on - // similarity. - if (score > 0) { - const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio() - // Subtract .1 from diff_percent when calculating modifier, - // which means that if it's less than 10% different, we give a - // bonus instead. Ex: - // 3% different = +17.5 points - // 10% different = 0 points - // 20% different = -25 points - const diffPercent = 1.0 - similarity - const diffModifier = -(250 * (diffPercent - 0.2)) - return score + diffModifier - } - - return 0 -} - -export function scoreLinkText(linkText, pageNum) { - // If the link text can be parsed as a number, give it a minor - // bonus, with a slight bias towards lower numbered pages. This is - // so that pages that might not have 'next' in their text can still - // get scored, and sorted properly by score. - let score = 0 - - if (IS_DIGIT_RE.test(linkText.trim())) { - const linkTextAsNum = parseInt(linkText) - // If it's the first page, we already got it on the first call. - // Give it a negative score. Otherwise, up to page 10, give a - // small bonus. - if (linkTextAsNum < 2) { - score = -30 - } else { - score = Math.max(0, 10 - linkTextAsNum) - } - - // If it appears that the current page number is greater than - // this links page number, it's a very bad sign. Give it a big - // penalty. - if (pageNum && pageNum >= linkTextAsNum) { - score = score - 50 - } - } - - return score -} - -export function scorePageInLink(pageNum, isWp) { - // page in the link = bonus. Intentionally ignore wordpress because - // their ?p=123 link style gets caught by this even though it means - // separate documents entirely. - if (pageNum && !isWp) { - return 50 - } - - return 0 -} - -export function scoreExtraneousLinks(href) { - // If the URL itself contains extraneous values, give a penalty. - if (EXTRANEOUS_LINK_HINTS_RE.test(href)) { - return -25 - } - - return 0 -} - -export function scoreByParents($link) { - // If a parent node contains paging-like classname or id, give a - // bonus. Additionally, if a parent_node contains bad content - // (like 'sponsor'), give a penalty. 
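// Illustrative example (an editor's sketch, not part of this patch): a link inside <div class="pagination"> picks up the +25 PAGE_RE bonus below, while one inside <div class="comment"> takes the -25 penalty, since 'comment' matches both the negative-score and extraneous-link hints without any positive hint to offset it.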
- let $parent = $link.parent() - let positiveMatch = false - let negativeMatch = false - let score = 0 - - Array.from(range(0, 4)).forEach((_) => { - if ($parent.length === 0) { - return + possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`; } - const parentData = makeSig($parent, ' ') - - // If we have 'page' or 'paging' in our data, that's a good - // sign. Add a bonus. - if (!positiveMatch && PAGE_RE.test(parentData)) { - positiveMatch = true - score = score + 25 - } + const possiblePage = possiblePages[href]; + const linkData = makeSig($link, linkText); + const pageNum = pageNumFromUrl(href); - // If we have 'comment' or something in our data, and - // we don't have something like 'content' as well, that's - // a bad sign. Give a penalty. - if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData) - && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) { - if (!POSITIVE_SCORE_RE.test(parentData)) { - negativeMatch = true - score = score - 25 - } - } - - $parent = $parent.parent() - }) - - return score -} + let score = scoreBaseUrl(href, baseRegex); + score += scoreNextLinkText(linkData); + score += scoreCapLinks(linkData); + score += scorePrevLink(linkData); + score += scoreByParents($link); + score += scoreExtraneousLinks(href); + score += scorePageInLink(pageNum, isWp); + score += scoreLinkText(linkText, pageNum); + score += scoreSimilarity(score, articleUrl, href); -export function scorePrevLink(linkData) { - // If the link has something like "previous", its definitely - // an old link, skip it. - if (PREV_LINK_TEXT_RE.test(linkData)) { - return -200 - } + possiblePage.score = score; - return 0 -} - -export function scoreCapLinks(linkData) { - // Cap links are links like "last", etc. - if (CAP_LINK_TEXT_RE.test(linkData)) { - // If we found a link like "last", but we've already seen that - // this link is also "next", it's fine. If it's not been - // previously marked as "next", then it's probably bad. - // Penalize. - if (NEXT_LINK_TEXT_RE.test(linkData)) { - return -65 - } - } - - return 0 -} - -export function scoreNextLinkText(linkData) { - // Things like "next", ">>", etc. - if (NEXT_LINK_TEXT_RE.test(linkData)) { - return 50 - } - - return 0 -} + return possiblePages; + }, {}); -export function scoreBaseUrl(href, baseRegex) { - // If the baseUrl isn't part of this URL, penalize this - // link. It could still be the link, but the odds are lower. - // Example: - // http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html - if (!baseRegex.test(href)) { - return -25 - } - - return 0 -} - -export function shouldScore( - href, - articleUrl, - baseUrl, - parsedUrl, - linkText, - previousUrls -) { - // skip if we've already fetched this url - if(previousUrls.find((url) => href === url) !== undefined) { - return false - } - - // If we've already parsed this URL, or the URL matches the base - // URL, or is empty, skip it. - if (!href || href === articleUrl || href === baseUrl) { - return false - } - - const { hostname } = parsedUrl - const { hostname: linkHost } = URL.parse(href) - - // Domain mismatch. - if (linkHost !== hostname) { - return false - } - - // If href doesn't contain a digit after removing the base URL, - // it's certainly not the next page. - const fragment = href.replace(baseUrl, '') - if (!DIGIT_RE.test(fragment)) { - return false - } - - // This link has extraneous content (like "comment") in its link - // text, so we skip it. 
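// Illustrative rejections (an editor's sketch, not part of this patch): link text such as 'Share this article' or 'Login to comment' fails the extraneous-hints test below, and anything longer than 25 characters (a headline-style link) is skipped as well.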
- if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) { - return false - } - - // Next page link text is never long, skip if it is too long. - if (linkText.length > 25) { - return false - } - - return true -} - -function makeSig($link, linkText) { - return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}` + return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages; } diff --git a/src/extractors/generic/next-page-url/scoring/score-links.test.js b/src/extractors/generic/next-page-url/scoring/score-links.test.js index 909cd614..52f939bb 100644 --- a/src/extractors/generic/next-page-url/scoring/score-links.test.js +++ b/src/extractors/generic/next-page-url/scoring/score-links.test.js @@ -1,239 +1,42 @@ -import assert from 'assert' -import cheerio from 'cheerio' -import fs from 'fs' -import URL from 'url' +import assert from 'assert'; +import cheerio from 'cheerio'; +import fs from 'fs'; -import scoreLinks from './score-links' -import { - makeBaseRegex, - scoreBaseUrl, - scoreNextLinkText, - scoreCapLinks, - scorePrevLink, - scoreByParents, - scoreExtraneousLinks, - scorePageInLink, - scoreLinkText, - scoreSimilarity, - shouldScore, -} from './score-links' +import scoreLinks from './score-links'; describe('scoreLinks(links)', () => { it('returns an object of scored links', () => { - const html = fs.readFileSync('./fixtures/ars.html', 'utf8') + const html = fs.readFileSync('./fixtures/ars.html', 'utf8'); - const $ = cheerio.load(html) - const links = $('a[href]').toArray() - const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' + const $ = cheerio.load(html); + const links = $('a[href]').toArray(); + const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; const scoredPages = scoreLinks({ links, articleUrl: url, baseUrl: 'http://arstechnica.com', $, - }) + }); - assert.equal(typeof scoredPages, 'object') - }) + assert.equal(typeof scoredPages, 'object'); + }); it('returns null if no possible pages', () => { - const html = `<div><p>Hello wow</p></div>` + const html = '<div><p>Hello wow</p></div>'; - const $ = cheerio.load(html) - const links = $('a[href]').toArray() - const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' + const $ = cheerio.load(html); + const links = $('a[href]').toArray(); + const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; const scoredPages = scoreLinks({ links, articleUrl: url, baseUrl: 'http://arstechnica.com', $, - }) + }); - assert.equal(scoredPages, null) - }) -}) + assert.equal(scoredPages, null); + }); +}); -describe('scoreBaseUrl(href, baseRegex)', () => { - it('returns -25 if url does not contain the base url', () => { - const baseUrl = 'http://example.com/foo/bar' - const badUrl = 'http://foo.com/foo/bar' - const baseRegex = makeBaseRegex(baseUrl) - - assert.equal(scoreBaseUrl(badUrl, baseRegex), -25) - }) - - it('returns 0 if url contains the base url', () => { - const baseUrl = 'http://example.com/foo/bar' - const badUrl = 'http://example.com/foo/bar/bat' - const baseRegex = makeBaseRegex(baseUrl) - - assert.equal(scoreBaseUrl(badUrl, baseRegex), 0) - }) -}) - -describe('scoreNextLinkText(linkData)', () => { - it('returns 50 if contains common next link text', () => { - const linkData = "foo bar Next page" - - assert.equal(scoreNextLinkText(linkData), 50) - }) - - it('returns 0 if does not contain 
common next link text', () => { - const linkData = "foo bar WOW GREAT" - - assert.equal(scoreNextLinkText(linkData), 0) - }) -}) - -describe('scoreCapLinks(linkData)', () => { - it('returns -65 if cap link with next link text', () => { - const linkData = "foo next Last page" - - assert.equal(scoreCapLinks(linkData), -65) - }) - - it('returns 0 if does not match a cap link', () => { - const linkData = "foo bar WOW GREAT" - - assert.equal(scoreCapLinks(linkData), 0) - }) -}) - -describe('scorePrevLink(linkData)', () => { - it('returns -200 if link matches previous text', () => { - const linkData = "foo next previous page" - - assert.equal(scorePrevLink(linkData), -200) - }) - - it('returns 0 if does not match a prev link', () => { - const linkData = "foo bar WOW GREAT" - - assert.equal(scoreCapLinks(linkData), 0) - }) -}) - -describe('scoreByParents($link)', () => { - it('returns 25 if parent sig looks like a page', () => { - const html = ` - <div> - <div class="next-page"> - <a href="blah">Next page</a> - </div> - </div> - ` - const $ = cheerio.load(html) - const $link = $('a').first() - - assert.equal(scoreByParents($link), 25) - }) - - it('returns -25 if parent sig looks like a comment', () => { - const html = ` - <div> - <div class="comment"> - <a href="blah">Next page</a> - </div> - </div> - ` - const $ = cheerio.load(html) - const $link = $('a').first() - - assert.equal(scoreByParents($link), -25) - }) - -}) - -describe('scoreExtraneousLinks(href)', () => { - it('returns -25 if link matches extraneous text', () => { - const url = "http://example.com/email-link" - - assert.equal(scoreExtraneousLinks(url), -25) - }) - - it('returns 0 if does not match extraneous text', () => { - const url = "http://example.com/asdf" - - assert.equal(scoreExtraneousLinks(url), 0) - }) -}) - -describe('scorePageInLink(pageNum, isWp)', () => { - it('returns 50 if link contains a page num', () => { - assert.equal(scorePageInLink(1, false), 50) - }) - - it('returns 0 if link contains no page num', () => { - assert.equal(scorePageInLink(null, false), 0) - }) - - it('returns 0 if page is wordpress', () => { - assert.equal(scorePageInLink(10, true), 0) - }) - -}) - -describe('scoreLinkText(linkText)', () => { - it('returns 8 if link contains the num 2', () => { - assert.equal(scoreLinkText('2', 0), 8) - }) - - it('returns 5 if link contains the num 5', () => { - assert.equal(scoreLinkText('5', 0), 5) - }) - - it('returns -30 if link contains the number 1', () => { - assert.equal(scoreLinkText('1', 0), -30) - }) - - it('penalizes -50 if pageNum is >= link text as num', () => { - assert.equal(scoreLinkText('4', 5), -44) - }) - -}) - -describe('scoreSimilarity(score, articleUrl, href)', () => { - it('returns a similarity bonus based on current score', () => { - const articleUrl = 'http://example.com/foo/bar' - const href = 'http://example.com/foo/bar/2' - const score = 25 - assert.equal( - Math.round(scoreSimilarity(score, articleUrl, href)), - 66 - ) - }) - - it('returns 0 is current score <= 0', () => { - const articleUrl = 'http://example.com/foo/bar' - const href = 'http://example.com/foo/bar/2' - const score = 0 - assert.equal(scoreSimilarity(score, articleUrl, href), 0) - }) - -}) - -describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => { - it('returns false if href has already been fetched', () => { - const previousUrls = [ 'http://example.com/foo/bar/2' ] - const href = 'http://example.com/foo/bar/2' - const parsedUrl = URL.parse(href) - - assert.equal( - 
shouldScore(href, '', '', parsedUrl, '', previousUrls), - false - ) - }) - - it('returns true if href has not been fetched', () => { - const previousUrls = [ 'http://example.com/foo/bar' ] - const href = 'http://example.com/foo/bar/2' - const parsedUrl = URL.parse(href) - - assert.equal( - shouldScore(href, '', '', parsedUrl, '', previousUrls), - true - ) - }) - -}) diff --git a/src/extractors/generic/next-page-url/scoring/utils/index.js b/src/extractors/generic/next-page-url/scoring/utils/index.js new file mode 100644 index 00000000..f697f5cb --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/index.js @@ -0,0 +1,10 @@ +export { default as scoreSimilarity } from './score-similarity'; +export { default as scoreLinkText } from './score-link-text'; +export { default as scorePageInLink } from './score-page-in-link'; +export { default as scoreExtraneousLinks } from './score-extraneous-links'; +export { default as scoreByParents } from './score-by-parents'; +export { default as scorePrevLink } from './score-prev-link'; +export { default as shouldScore } from './should-score'; +export { default as scoreBaseUrl } from './score-base-url'; +export { default as scoreNextLinkText } from './score-next-link-text'; +export { default as scoreCapLinks } from './score-cap-links'; diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-base-url.js b/src/extractors/generic/next-page-url/scoring/utils/score-base-url.js new file mode 100644 index 00000000..f78446ab --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/score-base-url.js @@ -0,0 +1,11 @@ +export default function scoreBaseUrl(href, baseRegex) { + // If the baseUrl isn't part of this URL, penalize this + // link. It could still be the link, but the odds are lower. 
+  // Example:
+  // http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
+  if (!baseRegex.test(href)) {
+    return -25;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-base-url.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-base-url.test.js
new file mode 100644
index 00000000..3b3eb6be
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-base-url.test.js
@@ -0,0 +1,23 @@
+import assert from 'assert';
+
+import scoreBaseUrl from './score-base-url';
+import { makeBaseRegex } from '../score-links';
+
+describe('scoreBaseUrl(href, baseRegex)', () => {
+  it('returns -25 if url does not contain the base url', () => {
+    const baseUrl = 'http://example.com/foo/bar';
+    const badUrl = 'http://foo.com/foo/bar';
+    const baseRegex = makeBaseRegex(baseUrl);
+
+    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25);
+  });
+
+  it('returns 0 if url contains the base url', () => {
+    const baseUrl = 'http://example.com/foo/bar';
+    const badUrl = 'http://example.com/foo/bar/bat';
+    const baseRegex = makeBaseRegex(baseUrl);
+
+    assert.equal(scoreBaseUrl(badUrl, baseRegex), 0);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js b/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js
new file mode 100644
index 00000000..faa81cc6
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js
@@ -0,0 +1,52 @@
+import { range } from 'utils';
+import {
+  NEGATIVE_SCORE_RE,
+  POSITIVE_SCORE_RE,
+  PAGE_RE,
+} from 'utils/dom/constants';
+import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
+
+function makeSig($link) {
+  return `${$link.attr('class') || ''} ${$link.attr('id') || ''}`;
+}
+
+export default function scoreByParents($link) {
+  // If a parent node contains paging-like classname or id, give a
+  // bonus. Additionally, if a parent_node contains bad content
+  // (like 'sponsor'), give a penalty.
+  let $parent = $link.parent();
+  let positiveMatch = false;
+  let negativeMatch = false;
+  let score = 0;
+
+  Array.from(range(0, 4)).forEach(() => {
+    if ($parent.length === 0) {
+      return;
+    }
+
+    const parentData = makeSig($parent);
+
+    // If we have 'page' or 'paging' in our data, that's a good
+    // sign. Add a bonus.
+    if (!positiveMatch && PAGE_RE.test(parentData)) {
+      positiveMatch = true;
+      score += 25;
+    }
+
+    // If we have 'comment' or something in our data, and
+    // we don't have something like 'content' as well, that's
+    // a bad sign. Give a penalty.
+ if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData) + && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) { + if (!POSITIVE_SCORE_RE.test(parentData)) { + negativeMatch = true; + score -= 25; + } + } + + $parent = $parent.parent(); + }); + + return score; +} + diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.test.js new file mode 100644 index 00000000..63c0921f --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/score-by-parents.test.js @@ -0,0 +1,35 @@ +import assert from 'assert'; +import cheerio from 'cheerio'; + +import scoreByParents from './score-by-parents'; + +describe('scoreByParents($link)', () => { + it('returns 25 if parent sig looks like a page', () => { + const html = ` + <div> + <div class="next-page"> + <a href="blah">Next page</a> + </div> + </div> + `; + const $ = cheerio.load(html); + const $link = $('a').first(); + + assert.equal(scoreByParents($link), 25); + }); + + it('returns -25 if parent sig looks like a comment', () => { + const html = ` + <div> + <div class="comment"> + <a href="blah">Next page</a> + </div> + </div> + `; + const $ = cheerio.load(html); + const $link = $('a').first(); + + assert.equal(scoreByParents($link), -25); + }); +}); + diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js b/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js new file mode 100644 index 00000000..37e81af3 --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js @@ -0,0 +1,19 @@ +import { + NEXT_LINK_TEXT_RE, + CAP_LINK_TEXT_RE, +} from '../constants'; + +export default function scoreCapLinks(linkData) { + // Cap links are links like "last", etc. + if (CAP_LINK_TEXT_RE.test(linkData)) { + // If we found a link like "last", but we've already seen that + // this link is also "next", it's fine. If it's not been + // previously marked as "next", then it's probably bad. + // Penalize. + if (NEXT_LINK_TEXT_RE.test(linkData)) { + return -65; + } + } + + return 0; +} diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.test.js new file mode 100644 index 00000000..98f62d44 --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/score-cap-links.test.js @@ -0,0 +1,18 @@ +import assert from 'assert'; + +import scoreCapLinks from './score-cap-links'; + +describe('scoreCapLinks(linkData)', () => { + it('returns -65 if cap link with next link text', () => { + const linkData = 'foo next Last page'; + + assert.equal(scoreCapLinks(linkData), -65); + }); + + it('returns 0 if does not match a cap link', () => { + const linkData = 'foo bar WOW GREAT'; + + assert.equal(scoreCapLinks(linkData), 0); + }); +}); + diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js b/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js new file mode 100644 index 00000000..042b3fb3 --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js @@ -0,0 +1,10 @@ +import { EXTRANEOUS_LINK_HINTS_RE } from '../constants'; + +export default function scoreExtraneousLinks(href) { + // If the URL itself contains extraneous values, give a penalty. 
+  if (EXTRANEOUS_LINK_HINTS_RE.test(href)) {
+    return -25;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.test.js
new file mode 100644
index 00000000..f85d3f17
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.test.js
@@ -0,0 +1,18 @@
+import assert from 'assert';
+
+import scoreExtraneousLinks from './score-extraneous-links';
+
+describe('scoreExtraneousLinks(href)', () => {
+  it('returns -25 if link matches extraneous text', () => {
+    const url = 'http://example.com/email-link';
+
+    assert.equal(scoreExtraneousLinks(url), -25);
+  });
+
+  it('returns 0 if does not match extraneous text', () => {
+    const url = 'http://example.com/asdf';
+
+    assert.equal(scoreExtraneousLinks(url), 0);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-link-text.js b/src/extractors/generic/next-page-url/scoring/utils/score-link-text.js
new file mode 100644
index 00000000..4280fb76
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-link-text.js
@@ -0,0 +1,30 @@
+import { IS_DIGIT_RE } from 'utils/text/constants';
+
+export default function scoreLinkText(linkText, pageNum) {
+  // If the link text can be parsed as a number, give it a minor
+  // bonus, with a slight bias towards lower numbered pages. This is
+  // so that pages that might not have 'next' in their text can still
+  // get scored, and sorted properly by score.
+  let score = 0;
+
+  if (IS_DIGIT_RE.test(linkText.trim())) {
+    const linkTextAsNum = parseInt(linkText, 10);
+    // If it's the first page, we already got it on the first call.
+    // Give it a negative score. Otherwise, up to page 10, give a
+    // small bonus.
+    if (linkTextAsNum < 2) {
+      score = -30;
+    } else {
+      score = Math.max(0, 10 - linkTextAsNum);
+    }
+
+    // If it appears that the current page number is greater than
+    // this link's page number, it's a very bad sign. Give it a big
+    // penalty.
+    if (pageNum && pageNum >= linkTextAsNum) {
+      score -= 50;
+    }
+  }
+
+  return score;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-link-text.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-link-text.test.js
new file mode 100644
index 00000000..d8cbcd12
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-link-text.test.js
@@ -0,0 +1,22 @@
+import assert from 'assert';
+
+import scoreLinkText from './score-link-text';
+
+describe('scoreLinkText(linkText)', () => {
+  it('returns 8 if link contains the num 2', () => {
+    assert.equal(scoreLinkText('2', 0), 8);
+  });
+
+  it('returns 5 if link contains the num 5', () => {
+    assert.equal(scoreLinkText('5', 0), 5);
+  });
+
+  it('returns -30 if link contains the number 1', () => {
+    assert.equal(scoreLinkText('1', 0), -30);
+  });
+
+  it('penalizes -50 if pageNum is >= link text as num', () => {
+    assert.equal(scoreLinkText('4', 5), -44);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js b/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js
new file mode 100644
index 00000000..c92a7ff0
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js
@@ -0,0 +1,10 @@
+import { NEXT_LINK_TEXT_RE } from '../constants';
+
+export default function scoreNextLinkText(linkData) {
+  // Things like "next", ">>", etc.
+  if (NEXT_LINK_TEXT_RE.test(linkData)) {
+    return 50;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.test.js
new file mode 100644
index 00000000..dd38dd96
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.test.js
@@ -0,0 +1,18 @@
+import assert from 'assert';
+
+import scoreNextLinkText from './score-next-link-text';
+
+describe('scoreNextLinkText(linkData)', () => {
+  it('returns 50 if contains common next link text', () => {
+    const linkData = 'foo bar Next page';
+
+    assert.equal(scoreNextLinkText(linkData), 50);
+  });
+
+  it('returns 0 if does not contain common next link text', () => {
+    const linkData = 'foo bar WOW GREAT';
+
+    assert.equal(scoreNextLinkText(linkData), 0);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js b/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js
new file mode 100644
index 00000000..cb334ad2
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js
@@ -0,0 +1,10 @@
+export default function scorePageInLink(pageNum, isWp) {
+  // page in the link = bonus. Intentionally ignore wordpress because
+  // their ?p=123 link style gets caught by this even though it means
+  // separate documents entirely.
+  if (pageNum && !isWp) {
+    return 50;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.test.js
new file mode 100644
index 00000000..18f0a047
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.test.js
@@ -0,0 +1,18 @@
+import assert from 'assert';
+
+import scorePageInLink from './score-page-in-link';
+
+describe('scorePageInLink(pageNum, isWp)', () => {
+  it('returns 50 if link contains a page num', () => {
+    assert.equal(scorePageInLink(1, false), 50);
+  });
+
+  it('returns 0 if link contains no page num', () => {
+    assert.equal(scorePageInLink(null, false), 0);
+  });
+
+  it('returns 0 if page is wordpress', () => {
+    assert.equal(scorePageInLink(10, true), 0);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js b/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js
new file mode 100644
index 00000000..f98368b3
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js
@@ -0,0 +1,11 @@
+import { PREV_LINK_TEXT_RE } from '../constants';
+
+export default function scorePrevLink(linkData) {
+  // If the link has something like "previous", it's definitely
+  // an old link, skip it.
+  if (PREV_LINK_TEXT_RE.test(linkData)) {
+    return -200;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.test.js
new file mode 100644
index 00000000..f13843b7
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-prev-link.test.js
@@ -0,0 +1,18 @@
+import assert from 'assert';
+
+import scorePrevLink from './score-prev-link';
+
+describe('scorePrevLink(linkData)', () => {
+  it('returns -200 if link matches previous text', () => {
+    const linkData = 'foo next previous page';
+
+    assert.equal(scorePrevLink(linkData), -200);
+  });
+
+  it('returns 0 if does not match a prev link', () => {
+    const linkData = 'foo bar WOW GREAT';
+
+    assert.equal(scorePrevLink(linkData), 0);
+  });
+});
+
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-similarity.js b/src/extractors/generic/next-page-url/scoring/utils/score-similarity.js
new file mode 100644
index 00000000..a42d1ac6
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-similarity.js
@@ -0,0 +1,23 @@
+import difflib from 'difflib';
+
+export default function scoreSimilarity(score, articleUrl, href) {
+  // Do this last and only if we have a real candidate, because it's
+  // potentially expensive computationally. Compare the link to this
+  // URL using difflib to get the % similarity of these URLs. On a
+  // sliding scale, subtract points from this link based on
+  // similarity.
+  if (score > 0) {
+    const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
+    // Subtract 0.2 from diffPercent when calculating the modifier,
+    // which means that if the URLs are less than 20% different, we
+    // give a bonus instead. Ex:
+    //  4% different = +40 points
+    // 20% different = 0 points
+    // 30% different = -25 points
+    const diffPercent = 1.0 - similarity;
+    const diffModifier = -(250 * (diffPercent - 0.2));
+    return score + diffModifier;
+  }
+
+  return 0;
+}
diff --git a/src/extractors/generic/next-page-url/scoring/utils/score-similarity.test.js b/src/extractors/generic/next-page-url/scoring/utils/score-similarity.test.js
new file mode 100644
index 00000000..402fc7d5
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/score-similarity.test.js
@@ -0,0 +1,22 @@
+import assert from 'assert';
+
+import scoreSimilarity from './score-similarity';
+
+describe('scoreSimilarity(score, articleUrl, href)', () => {
+  it('returns a similarity bonus based on current score', () => {
+    const articleUrl = 'http://example.com/foo/bar';
+    const href = 'http://example.com/foo/bar/2';
+    const score = 25;
+    assert.equal(
+      Math.round(scoreSimilarity(score, articleUrl, href)),
+      66
+    );
+  });
+
+  it('returns 0 if current score <= 0', () => {
+    const articleUrl = 'http://example.com/foo/bar';
+    const href = 'http://example.com/foo/bar/2';
+    const score = 0;
+    assert.equal(scoreSimilarity(score, articleUrl, href), 0);
+  });
+});
diff --git a/src/extractors/generic/next-page-url/scoring/utils/should-score.js b/src/extractors/generic/next-page-url/scoring/utils/should-score.js
new file mode 100644
index 00000000..a162d0e4
--- /dev/null
+++ b/src/extractors/generic/next-page-url/scoring/utils/should-score.js
@@ -0,0 +1,55 @@
+import URL from 'url';
+
+import {
+  DIGIT_RE,
+  EXTRANEOUS_LINK_HINTS_RE,
+} from '../constants';
+
+export default function shouldScore(
+  href,
+  articleUrl,
+  baseUrl,
+  parsedUrl,
+  linkText,
+  previousUrls
+) {
+  // skip if we've already fetched this url
+  if (previousUrls.find(url => href === url) !== undefined) {
+    return false;
+  }
+
+  // If we've already parsed this URL, or the URL matches the base
+  // URL, or is empty, skip it.
+  if (!href || href === articleUrl || href === baseUrl) {
+    return false;
+  }
+
+  const { hostname } = parsedUrl;
+  const { hostname: linkHost } = URL.parse(href);
+
+  // Domain mismatch.
+  if (linkHost !== hostname) {
+    return false;
+  }
+
+  // If href doesn't contain a digit after removing the base URL,
+  // it's certainly not the next page.
+  const fragment = href.replace(baseUrl, '');
+  if (!DIGIT_RE.test(fragment)) {
+    return false;
+  }
+
+  // This link has extraneous content (like "comment") in its link
+  // text, so we skip it.
+  if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
+    return false;
+  }
+
+  // Next page link text is never long, skip if it is too long.
+ if (linkText.length > 25) { + return false; + } + + return true; +} + diff --git a/src/extractors/generic/next-page-url/scoring/utils/should-score.test.js b/src/extractors/generic/next-page-url/scoring/utils/should-score.test.js new file mode 100644 index 00000000..17e81f32 --- /dev/null +++ b/src/extractors/generic/next-page-url/scoring/utils/should-score.test.js @@ -0,0 +1,28 @@ +import assert from 'assert'; +import URL from 'url'; + +import shouldScore from './should-score'; + +describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => { + it('returns false if href has already been fetched', () => { + const previousUrls = ['http://example.com/foo/bar/2']; + const href = 'http://example.com/foo/bar/2'; + const parsedUrl = URL.parse(href); + + assert.equal( + shouldScore(href, '', '', parsedUrl, '', previousUrls), + false + ); + }); + + it('returns true if href has not been fetched', () => { + const previousUrls = ['http://example.com/foo/bar']; + const href = 'http://example.com/foo/bar/2'; + const parsedUrl = URL.parse(href); + + assert.equal( + shouldScore(href, '', '', parsedUrl, '', previousUrls), + true + ); + }); +}); diff --git a/src/extractors/generic/title/constants.js b/src/extractors/generic/title/constants.js index e990d3c9..ebfa949a 100644 --- a/src/extractors/generic/title/constants.js +++ b/src/extractors/generic/title/constants.js @@ -6,24 +6,24 @@ // attributes should be lowercase for faster case-insensitive matching. From // most distinct to least distinct. export const STRONG_TITLE_META_TAGS = [ - 'tweetmeme-title', - 'dc.title', - 'rbtitle', - 'headline', - 'title', -] + 'tweetmeme-title', + 'dc.title', + 'rbtitle', + 'headline', + 'title', +]; // og:title is weak because it typically contains context that we don't like, // for example the source site's name. Gotta get that brand into facebook! export const WEAK_TITLE_META_TAGS = [ - 'og:title', -] + 'og:title', +]; // An ordered list of XPath Selectors to find likely article titles. From // most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string -// exists in the className, which is not as accurate as .className (which +// exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. The // speed tradeoff is worth the accuracy hit. 
export const STRONG_TITLE_SELECTORS = [ @@ -33,7 +33,7 @@ export const STRONG_TITLE_SELECTORS = [ 'h1.article', '.instapaper_title', '#meebo-title', -] +]; export const WEAK_TITLE_SELECTORS = [ 'article h1', @@ -51,4 +51,4 @@ export const WEAK_TITLE_SELECTORS = [ 'h1', 'html head title', 'title', -] +]; diff --git a/src/extractors/generic/title/extractor.js b/src/extractors/generic/title/extractor.js index 2d64a291..c783e165 100644 --- a/src/extractors/generic/title/extractor.js +++ b/src/extractors/generic/title/extractor.js @@ -1,40 +1,41 @@ +import { cleanTitle } from 'cleaners'; +import { + extractFromMeta, + extractFromSelectors, +} from 'utils/dom'; + import { STRONG_TITLE_META_TAGS, WEAK_TITLE_META_TAGS, STRONG_TITLE_SELECTORS, - WEAK_TITLE_SELECTORS -} from './constants' -import { cleanTitle } from 'cleaners' -import { - extractFromMeta, - extractFromSelectors -} from 'utils/dom' + WEAK_TITLE_SELECTORS, +} from './constants'; const GenericTitleExtractor = { extract({ $, url, metaCache }) { // First, check to see if we have a matching meta tag that we can make // use of that is strongly associated with the headline. - let title + let title; - title = extractFromMeta($, STRONG_TITLE_META_TAGS, metaCache) - if (title) return cleanTitle(title, { url, $ }) + title = extractFromMeta($, STRONG_TITLE_META_TAGS, metaCache); + if (title) return cleanTitle(title, { url, $ }); // Second, look through our content selectors for the most likely // article title that is strongly associated with the headline. - title = extractFromSelectors($, STRONG_TITLE_SELECTORS) - if (title) return cleanTitle(title, { url, $ }) + title = extractFromSelectors($, STRONG_TITLE_SELECTORS); + if (title) return cleanTitle(title, { url, $ }); // Third, check for weaker meta tags that may match. - title = extractFromMeta($, WEAK_TITLE_META_TAGS, metaCache) - if (title) return cleanTitle(title, { url, $ }) + title = extractFromMeta($, WEAK_TITLE_META_TAGS, metaCache); + if (title) return cleanTitle(title, { url, $ }); // Last, look for weaker selector tags that may match. 
- title = extractFromSelectors($, WEAK_TITLE_SELECTORS) - if (title) return cleanTitle(title, { url, $ }) + title = extractFromSelectors($, WEAK_TITLE_SELECTORS); + if (title) return cleanTitle(title, { url, $ }); // If no matches, return an empty string - return "" - } -} + return ''; + }, +}; -export default GenericTitleExtractor +export default GenericTitleExtractor; diff --git a/src/extractors/generic/title/extractor.test.js b/src/extractors/generic/title/extractor.test.js index 14180de2..641b9602 100644 --- a/src/extractors/generic/title/extractor.test.js +++ b/src/extractors/generic/title/extractor.test.js @@ -1,45 +1,45 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import GenericTitleExtractor from './extractor' +import HTML from './fixtures/html'; +import GenericTitleExtractor from './extractor'; describe('GenericTitleExtractor', () => { describe('extract({ $, url, cachedMeta })', () => { it('extracts strong meta title tags', () => { - const $ = cheerio.load(HTML.dcTitle.test) + const $ = cheerio.load(HTML.dcTitle.test); const result = GenericTitleExtractor.extract( - { $, url: '', metaCache: ["dc.title", "something-else"] } - ) + { $, url: '', metaCache: ['dc.title', 'something-else'] } + ); - assert.equal(result, HTML.dcTitle.result) - }) + assert.equal(result, HTML.dcTitle.result); + }); it('pulls title from selectors lacking string meta', () => { - const $ = cheerio.load(HTML.strongTitleSelector.test) + const $ = cheerio.load(HTML.strongTitleSelector.test); const result = GenericTitleExtractor.extract( - { $, url: '', metaCache: ["og:title", "something-else"] } - ) + { $, url: '', metaCache: ['og:title', 'something-else'] } + ); - assert.equal(result, HTML.ogTitle.result) - }) + assert.equal(result, HTML.ogTitle.result); + }); it('then falls back to weak meta title tags', () => { - const $ = cheerio.load(HTML.ogTitle.test) + const $ = cheerio.load(HTML.ogTitle.test); const result = GenericTitleExtractor.extract( - { $, url: '', metaCache: ["og:title", "something-else"] } - ) + { $, url: '', metaCache: ['og:title', 'something-else'] } + ); - assert.equal(result, HTML.ogTitle.result) - }) - }) + assert.equal(result, HTML.ogTitle.result); + }); + }); it('then falls back to weak selectors', () => { - const $ = cheerio.load(HTML.weakTitleSelector.test) + const $ = cheerio.load(HTML.weakTitleSelector.test); const result = GenericTitleExtractor.extract( { $, url: '', metaCache: [] } - ) + ); - assert.equal(result, HTML.weakTitleSelector.result) - }) -}) + assert.equal(result, HTML.weakTitleSelector.result); + }); +}); diff --git a/src/extractors/generic/title/fixtures/html.js b/src/extractors/generic/title/fixtures/html.js index 2fe64329..87db7387 100644 --- a/src/extractors/generic/title/fixtures/html.js +++ b/src/extractors/generic/title/fixtures/html.js @@ -5,7 +5,7 @@ const HTML = { <meta name="dc.title" value="This Is the Title Okay" /> <html> `, - result: `This Is the Title Okay` + result: 'This Is the Title Okay', }, ogTitle: { test: ` @@ -13,7 +13,7 @@ const HTML = { <meta name="og:title" value="This Is the Title Okay" /> <html> `, - result: `This Is the Title Okay` + result: 'This Is the Title Okay', }, strongTitleSelector: { test: ` @@ -23,7 +23,7 @@ const HTML = { </article> <html> `, - result: `This Is the Title Okay` + result: 'This Is the Title Okay', }, weakTitleSelector: { test: ` @@ -33,8 +33,8 @@ const HTML = { </head> <html> `, - result: `This Is the 
Weak Title Okay` + result: 'This Is the Weak Title Okay', }, -} +}; -export default HTML +export default HTML; diff --git a/src/extractors/generic/title/utils/index.js b/src/extractors/generic/title/utils/index.js index 0abe45b2..6de53910 100644 --- a/src/extractors/generic/title/utils/index.js +++ b/src/extractors/generic/title/utils/index.js @@ -1,2 +1,2 @@ -export { default as cleanTitle } from './clean-title' -export { default as resolveSplitTitle } from './resolve-split-title' +export { default as cleanTitle } from './clean-title'; +export { default as resolveSplitTitle } from './resolve-split-title'; diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js index e69d9e7a..4df1de96 100644 --- a/src/extractors/get-extractor.js +++ b/src/extractors/get-extractor.js @@ -1,12 +1,12 @@ -import URL from 'url' +import URL from 'url'; -import Extractors from './all' -import GenericExtractor from './generic' +import Extractors from './all'; +import GenericExtractor from './generic'; export default function getExtractor(url) { - const parsedUrl = URL.parse(url) - const { hostname } = parsedUrl - const baseDomain = hostname.split('.').slice(-2).join('.') + const parsedUrl = URL.parse(url); + const { hostname } = parsedUrl; + const baseDomain = hostname.split('.').slice(-2).join('.'); - return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor + return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor; } diff --git a/src/extractors/get-extractor.test.js b/src/extractors/get-extractor.test.js index b278775c..0586a9bd 100644 --- a/src/extractors/get-extractor.test.js +++ b/src/extractors/get-extractor.test.js @@ -1,23 +1,23 @@ -import assert from 'assert' +import assert from 'assert'; -import getExtractor from './get-extractor' +import getExtractor from './get-extractor'; describe('getExtractor(url)', () => { it('returns GenericExtractor if no custom extractor is found', () => { - const extractor = getExtractor('http://example.com') + const extractor = getExtractor('http://example.com'); - assert.equal(extractor.domain, '*') - }) + assert.equal(extractor.domain, '*'); + }); it('returns a custom extractor if found', () => { - const extractor = getExtractor('https://nymag.com') + const extractor = getExtractor('https://nymag.com'); - assert.equal(extractor.domain, 'nymag.com') - }) + assert.equal(extractor.domain, 'nymag.com'); + }); it('falls back to base domain if subdomain not found', () => { - const extractor = getExtractor('https://googleblog.blogspot.com') + const extractor = getExtractor('https://googleblog.blogspot.com'); - assert.equal(extractor.domain, 'blogspot.com') - }) -}) + assert.equal(extractor.domain, 'blogspot.com'); + }); +}); diff --git a/src/extractors/index.js b/src/extractors/index.js index fc6b1891..49b128a6 100644 --- a/src/extractors/index.js +++ b/src/extractors/index.js @@ -1,5 +1,5 @@ const Extractor = { -} +}; -export default Extractor +export default Extractor; diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 85e22b61..51c32f04 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -1,76 +1,61 @@ -import 'babel-polyfill' +import 'babel-polyfill'; -import GenericExtractor from './generic' -import Cleaners from 'cleaners' -import { convertNodeTo, stripTags } from 'utils/dom' -import { ATTR_RE } from './constants' +import Cleaners from 'cleaners'; +import { convertNodeTo } from 'utils/dom'; +import GenericExtractor from './generic'; +import { 
ATTR_RE } from './constants'; -const RootExtractor = { - extract(extractor=GenericExtractor, opts) { - const { $, contentOnly, extractedTitle } = opts - // This is the generic extractor. Run its extract method - if (extractor.domain === '*') return extractor.extract(opts) - - opts = { - ...opts, - extractor - } +// Remove elements by an array of selectors +export function cleanBySelectors($content, $, { clean }) { + if (!clean) return null; - if (contentOnly) { - const content = extract({ - ...opts, type: 'content', extractHtml: true, title: extractedTitle - }) - return { - content - } - } else { - const title = extract({ ...opts, type: 'title' }) - const datePublished = extract({ ...opts, type: 'datePublished' }) - const author = extract({ ...opts, type: 'author' }) - const nextPageUrl = extract({ ...opts, type: 'nextPageUrl' }) - const content = extract({ - ...opts, type: 'content', extractHtml: true, title - }) - const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content }) - const dek = extract({ ...opts, type: 'dek', content }) - return { - title, - content, - author, - datePublished, - leadImageUrl, - dek, - } - } + $(clean.join(','), $content).remove(); - } + return $content; } -function extract(opts) { - const { type, extractor } = opts +// Transform matching elements +export function transformElements($content, $, { transforms }) { + if (!transforms) return null; + + Reflect.ownKeys(transforms).forEach((key) => { + const $matches = $(key, $content); + const value = transforms[key]; - // If nothing matches the selector, - // run the Generic extraction - return select({ ...opts, extractionOpts: extractor[type] }) || - GenericExtractor[type](opts) + // If value is a string, convert directly + if (typeof value === 'string') { + $matches.each((index, node) => { + convertNodeTo($(node), $, transforms[key]); + }); + } else if (typeof value === 'function') { + // If value is function, apply function to node + $matches.each((index, node) => { + const result = value($(node), $); + // If function returns a string, convert node to that value + if (typeof result === 'string') { + convertNodeTo($(node), $, result); + } + }); + } + }); + + return $content; } export function select(opts) { - const { $, type, extractionOpts, extractHtml=false } = opts + const { $, type, extractionOpts, extractHtml = false } = opts; // Skip if there's not extraction for this type - if (!extractionOpts) return + if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia // contributors), return the string - if (typeof extractionOpts === 'string') return extractionOpts + if (typeof extractionOpts === 'string') return extractionOpts; - const { selectors } = extractionOpts + const { selectors } = extractionOpts; - const matchingSelector = selectors.find((selector) => { - return $(selector).length === 1 - }) + const matchingSelector = selectors.find(selector => $(selector).length === 1); - if (!matchingSelector) return + if (!matchingSelector) return null; // Declaring result; will contain either // text or html, which will be cleaned @@ -79,69 +64,80 @@ export function select(opts) { // If the selector type requests html as its return type // transform and clean the element with provided selectors if (extractHtml) { - let $content = $(matchingSelector) + let $content = $(matchingSelector); // Wrap in div so transformation can take place on root element - $content.wrap($('<div></div>')) - $content = $content.parent() + $content.wrap($('<div></div>')); + $content = 
$content.parent(); - $content = transformElements($content, $, extractionOpts) - $content = cleanBySelectors($content, $, extractionOpts) + $content = transformElements($content, $, extractionOpts); + $content = cleanBySelectors($content, $, extractionOpts); - $content = Cleaners[type]($content, opts) + $content = Cleaners[type]($content, opts); - return $.html($content) + return $.html($content); + } + // if selector includes an attr (e.g., img[src]), + // extract the attr + const attr = matchingSelector.match(ATTR_RE); + let result; + + if (attr) { + result = $(matchingSelector).attr(attr[1]); } else { - // if selector includes an attr (e.g., img[src]), - // extract the attr - const attr = matchingSelector.match(ATTR_RE) - let result - - if (attr) { - result = $(matchingSelector).attr(attr[1]) - } else { - // otherwise use the text of the node - result = $(matchingSelector).text() - } - return Cleaners[type](result, opts) + // otherwise use the text of the node + result = $(matchingSelector).text(); } + return Cleaners[type](result, opts); } -// Remove elements by an array of selectors -export function cleanBySelectors($content, $, { clean }) { - if (!clean) return - - $(clean.join(','), $content).remove() +function extractResult(opts) { + const { type, extractor } = opts; - return $content + // If nothing matches the selector, + // run the Generic extraction + return select({ ...opts, extractionOpts: extractor[type] }) || + GenericExtractor[type](opts); } -// Transform matching elements -export function transformElements($content, $, { transforms }) { - if (!transforms) return +const RootExtractor = { + extract(extractor = GenericExtractor, opts) { + const { contentOnly, extractedTitle } = opts; + // This is the generic extractor. Run its extract method + if (extractor.domain === '*') return extractor.extract(opts); - Reflect.ownKeys(transforms).forEach((key) => { - const $matches = $(key, $content) - const value = transforms[key] + opts = { + ...opts, + extractor, + }; - // If value is a string, convert directly - if (typeof value === 'string') { - $matches.each((index, node) => { - convertNodeTo($(node), $, transforms[key]) - }) - } else if (typeof value === 'function') { - // If value is function, apply function to node - $matches.each((index, node) => { - const result = value($(node), $) - // If function returns a string, convert node to that value - if (typeof result === 'string') { - convertNodeTo($(node), $, result) - } - }) + if (contentOnly) { + const content = extractResult({ + ...opts, type: 'content', extractHtml: true, title: extractedTitle, + }); + return { + content, + }; } - }) - - return $content -} - -export default RootExtractor + const title = extractResult({ ...opts, type: 'title' }); + const datePublished = extractResult({ ...opts, type: 'datePublished' }); + const author = extractResult({ ...opts, type: 'author' }); + const nextPageUrl = extractResult({ ...opts, type: 'nextPageUrl' }); + const content = extractResult({ + ...opts, type: 'content', extractHtml: true, title, + }); + const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content }); + const dek = extractResult({ ...opts, type: 'dek', content }); + return { + title, + content, + author, + datePublished, + leadImageUrl, + dek, + nextPageUrl, + }; + }, +}; + +export default RootExtractor; diff --git a/src/extractors/root-extractor.test.js b/src/extractors/root-extractor.test.js index 61185902..347b1637 100644 --- a/src/extractors/root-extractor.test.js +++ 
b/src/extractors/root-extractor.test.js @@ -1,40 +1,36 @@ -import assert from 'assert' -import fs from 'fs' -import cheerio from 'cheerio' +import assert from 'assert'; +import fs from 'fs'; +import cheerio from 'cheerio'; -import RootExtractor from './root-extractor' -import { select } from './root-extractor' +import { assertClean } from 'test-helpers'; import { + default as RootExtractor, + select, cleanBySelectors, - transformElements -} from './root-extractor' + transformElements, +} from './root-extractor'; -import GenericExtractor from './generic' -import NYMagExtractor from './custom/nymag.com' +import NYMagExtractor from './custom/nymag.com'; describe('RootExtractor', () => { it('extracts based on custom selectors', () => { - const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html' - const html = fs.readFileSync('./src/extractors/custom/nymag.com/fixtures/test.html', 'utf8') - const $ = cheerio.load(html) + const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'; + const html = fs.readFileSync('./src/extractors/custom/nymag.com/fixtures/test.html', 'utf8'); + const $ = cheerio.load(html); const { title, - content, - author, - datePublished, - leadImageUrl, } = RootExtractor.extract( NYMagExtractor, { url, html, $, metaCache: [] } - ) + ); - assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation') - }) -}) + assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'); + }); +}); describe('cleanBySelectors($content, $, { clean })', () => { it('removes provided selectors from the content', () => { - const opts = { clean: ['.ad', '.share'] } + const opts = { clean: ['.ad', '.share'] }; const html = ` <div> <div class="body"> @@ -42,16 +38,16 @@ describe('cleanBySelectors($content, $, { clean })', () => { <p>This is some good content</p> <div class="ad">Advertisement!</div> </div> - </div>` - const $ = cheerio.load(html) + </div>`; + const $ = cheerio.load(html); - let $content = $('.body') - $content = cleanBySelectors($content, $, opts) + let $content = $('.body'); + $content = cleanBySelectors($content, $, opts); - assert.equal($content.find('.ad').length, 0) - assert.equal($content.find('.share').length, 0) - }) -}) + assert.equal($content.find('.ad').length, 0); + assert.equal($content.find('.share').length, 0); + }); +}); describe('transformElements($content, $, { transforms })', () => { it('performs a simple transformation on matched elements', () => { @@ -63,12 +59,12 @@ describe('transformElements($content, $, { transforms })', () => { <h1>WOW BIG TITLE</h1> </div> </div> - ` + `; const opts = { - transforms: { 'h1': 'h2' } - } - const $ = cheerio.load(html) - let $content = $('.body') + transforms: { h1: 'h2' }, + }; + const $ = cheerio.load(html); + let $content = $('.body'); const after = ` <div class="body"> @@ -76,11 +72,11 @@ describe('transformElements($content, $, { transforms })', () => { <p>Here are some words</p> <h2>WOW BIG TITLE</h2> </div> - ` + `; - $content = transformElements($content, $, opts) - assertClean($.html($content), after) - }) + $content = transformElements($content, $, opts); + assertClean($.html($content), after); + }); it('performs a complex transformation on matched elements', () => { const html = ` @@ -95,19 +91,21 @@ describe('transformElements($content, $, { transforms 
})', () => { <p>Here are some words</p> </div> </div> - ` + `; const opts = { transforms: { - 'noscript': ($node) => { - const $children = $node.children() + noscript: ($node) => { + const $children = $node.children(); if ($children.length === 1 && $children.get(0).tagName === 'img') { - return 'figure' + return 'figure'; } - } - } - } - const $ = cheerio.load(html) - let $content = $('.body') + + return null; + }, + }, + }; + const $ = cheerio.load(html); + let $content = $('.body'); const after = ` <div class="body"> @@ -119,58 +117,49 @@ describe('transformElements($content, $, { transforms })', () => { </noscript> <p>Here are some words</p> </div> - ` + `; - $content = transformElements($content, $, opts) - assertClean($.html($content), after) - }) -}) + $content = transformElements($content, $, opts); + assertClean($.html($content), after); + }); +}); describe('select(opts)', () => { - it(`returns a node's text with a simple selector`, () => { + it('returns a node\'s text with a simple selector', () => { const html = ` <div><div class="author">Bob</div></div> - ` - const $ = cheerio.load(html) + `; + const $ = cheerio.load(html); const opts = { type: 'author', $, extractionOpts: { - selectors: ['.author'] - } - } + selectors: ['.author'], + }, + }; - const result = select(opts) - assert.equal(result, 'Bob') - }) + const result = select(opts); + assert.equal(result, 'Bob'); + }); - it(`returns a node's attr with a attr selector`, () => { + it('returns a node\'s attr with a attr selector', () => { const html = ` <div> <time datetime="2016-09-07T05:07:59-04:00"> September 7, 2016 </time> </div> - ` - const $ = cheerio.load(html) + `; + const $ = cheerio.load(html); const opts = { type: 'datePublished', $, extractionOpts: { - selectors: ['time[datetime]'] - } - } - - const result = select(opts) - assert.equal(result, '2016-09-07T09:07:59.000Z') - }) -}) - -function clean(string) { - return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ') -} - -function assertClean(a, b) { - assert.equal(clean(a), clean(b)) -} - + selectors: ['time[datetime]'], + }, + }; + + const result = select(opts); + assert.equal(result, '2016-09-07T09:07:59.000Z'); + }); +}); diff --git a/src/iris.js b/src/iris.js index e0e9b23a..e9594ef4 100644 --- a/src/iris.js +++ b/src/iris.js @@ -1,80 +1,87 @@ -import fs from 'fs' - -import Resource from 'resource' -import getExtractor from 'extractors/get-extractor' -import RootExtractor from 'extractors/root-extractor' -import { removeAnchor } from 'utils/text' +import Resource from 'resource'; +import getExtractor from 'extractors/get-extractor'; +import RootExtractor from 'extractors/root-extractor'; +import { removeAnchor } from 'utils/text'; const Iris = { - parse: async function(url, html, opts={}) { - const { fetchAllPages=true } = opts || true - let $ = await Resource.create(url, html) - html = $.html() + async parse(url, html, opts = {}) { + const { fetchAllPages = true } = opts || true; + const $ = await Resource.create(url, html); + html = $.html(); - const Extractor = getExtractor(url) - console.log(`Using extractor for ${Extractor.domain}`) + const Extractor = getExtractor(url); + console.log(`Using extractor for ${Extractor.domain}`); // Cached value of every meta name in our document. 
// Used when extracting title/author/date_published/dek - const metaCache = $('meta').map((_, node) => { - return $(node).attr('name') - }).toArray() + const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray(); - let extractorOpts = { url, html, $, metaCache } - let result = RootExtractor.extract(Extractor, extractorOpts) - let { nextPageUrl, title } = result + const extractorOpts = { url, html, $, metaCache }; + let result = RootExtractor.extract(Extractor, extractorOpts); + const { title, nextPageUrl } = result; if (fetchAllPages && nextPageUrl) { - result = await collectAllPages({ nextPageUrl, html, $, metaCache, result, Extractor, title, url }) + result = await this.collectAllPages( + { + nextPageUrl, + html, + $, + metaCache, + result, + Extractor, + title, + url, + } + ); } - return result - } -} + return result; + }, -async function collectAllPages({ - nextPageUrl, - html, - $, - metaCache, - result, - Extractor, - title, - url -}) { - let pages = 2 - let previousUrls = [removeAnchor(url)] - while (nextPageUrl && pages < 26) { - $ = await Resource.create(nextPageUrl) - html = $.html() - let extractorOpts = { url: nextPageUrl, html, $, metaCache } - let nextPageResult = RootExtractor.extract( - Extractor, - { - ...extractorOpts, - url: nextPageUrl, - contentOnly: true, - extractedTitle: title, - previousUrls - } - ) + async collectAllPages({ + nextPageUrl, + html, + $, + metaCache, + result, + Extractor, + title, + url, + }) { + let pages = 2; + const previousUrls = [removeAnchor(url)]; + while (nextPageUrl && pages < 26) { + $ = await Resource.create(nextPageUrl); + html = $.html(); + const extractorOpts = { url: nextPageUrl, html, $, metaCache }; + const nextPageResult = RootExtractor.extract( + Extractor, + { + ...extractorOpts, + url: nextPageUrl, + contentOnly: true, + extractedTitle: title, + previousUrls, + } + ); - previousUrls.push(nextPageUrl) - result = { - ...result, - content: ` - ${result.content} - <hr> - <h4>Page ${pages}</h4> - ${nextPageResult.content} - ` - } + previousUrls.push(nextPageUrl); + result = { + ...result, + content: ` + ${result.content} + <hr> + <h4>Page ${pages}</h4> + ${nextPageResult.content} + `, + }; - nextPageUrl = nextPageResult.nextPageUrl + nextPageUrl = nextPageResult.nextPageUrl; - pages = pages + 1 - } - return result -} + pages += 1; + } + return result; + }, +}; -export default Iris +export default Iris; diff --git a/src/iris.test.js b/src/iris.test.js index 1dc0a986..39da5c35 100644 --- a/src/iris.test.js +++ b/src/iris.test.js @@ -1,46 +1,49 @@ -import assert from 'assert' +import assert from 'assert'; -import Iris from './iris' +import Iris from './iris'; -describe('Iris', function() { - describe('parse(url)', function() { - this.timeout(1000000) +describe('Iris', () => { + describe('parse(url)', function test() { + this.timeout(1000000); it('does the whole thing', async function() { - const result = await Iris.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220') + const result = await Iris.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220'); + assert.equal(typeof result, 'object'); // console.log(result) - }) + }); it('does blogger', async function() { - const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html') + const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html'); + assert.equal(typeof result, 'object'); // console.log(result) - }) + }); it('does wikipedia', 
async function() { - const result = await Iris.parse('https://en.wikipedia.org/wiki/Brihadeeswarar_Temple_fire') + const result = await Iris.parse('https://en.wikipedia.org/wiki/Brihadeeswarar_Temple_fire'); + assert.equal(typeof result, 'object'); // console.log(result) - }) + }); it('does the nyt', async function() { - const result = await Iris.parse('http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0') + const result = await Iris.parse('http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0'); + assert.equal(typeof result, 'object'); // console.log(result) - }) + }); it('does ars pagination', async function() { - const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' + const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; const result = await Iris.parse( url, null, { fetchAllPages: true } - ) + ); // console.log(result) - assert.equal(result.nextPageUrl, `${url}2`) + assert.equal(result.nextPageUrl, `${url}2`); // console.log(result.content) - }) - - }) -}) + }); + }); +}); diff --git a/src/resource/index.js b/src/resource/index.js index 172e2e38..1a3d6a86 100644 --- a/src/resource/index.js +++ b/src/resource/index.js @@ -1,72 +1,70 @@ -import 'babel-polyfill' +import 'babel-polyfill'; -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import { fetchResource } from './utils' +import { fetchResource } from './utils'; import { normalizeMetaTags, convertLazyLoadedImages, clean, -} from './utils/dom' +} from './utils/dom'; const Resource = { // Create a Resource. // // :param url: The URL for the document we should retrieve. - // :param parseNon2xx: If true, attempt to parse non-200 level - // resources. Default is false. // :param response: If set, use as the response rather than // attempting to fetch it ourselves. Expects a // string. 
- create: async function(url, preparedResponse, parseNon2xx=false) { - let result + async create(url, preparedResponse) { + let result; if (preparedResponse) { const validResponse = { - statusMessage: "OK", + statusMessage: 'OK', statusCode: 200, headers: { - "content-type": 'text/html', - "content-length": 500, - } - } + 'content-type': 'text/html', + 'content-length': 500, + }, + }; - result = { body: preparedResponse, response: validResponse } + result = { body: preparedResponse, response: validResponse }; } else { - result = await fetchResource(url) + result = await fetchResource(url); } - return this.generateDoc(result) + return this.generateDoc(result); }, generateDoc({ body: content, response }) { - const { "content-type": contentType } = response.headers + const { 'content-type': contentType } = response.headers; // TODO: Implement is_text function from // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57 if (!contentType.includes('html') && !contentType.includes('text')) { - throw new Error(`Content does not appear to be text.`) + throw new Error('Content does not appear to be text.'); } - let $ = cheerio.load(content, { normalizeWhitespace: true }) + let $ = cheerio.load(content, { normalizeWhitespace: true }); if ($.root().children().length === 0) { - throw new Error(`No children, likely a bad parse.`) + throw new Error('No children, likely a bad parse.'); } - $ = normalizeMetaTags($) - $ = convertLazyLoadedImages($) - $ = clean($) + $ = normalizeMetaTags($); + $ = convertLazyLoadedImages($); + $ = clean($); - return $ - } -} + return $; + }, +}; -export default Resource +export default Resource; // def __init__(self, url, parse_non_2xx=False, response=None): // """ Create a Resource. -// +// // :param url: The URL for the document we should retrieve. // :param parse_non_2xx: If True, attempt to parse non-200 level // resources. If False, raise a RetrievalFailed @@ -128,14 +126,14 @@ export default Resource // """ A Resource is a wrapper class for an HTTP resource. Provides // functionality to fetch a resource as well as a handful of shortcut // methods to run xpath efficiently on HTML, etc. -// +// // Uses requests and lxml internally for fetching and querying. // """ // // // def __init__(self, url, parse_non_2xx=False, response=None): // """ Create a Resource. -// +// // :param url: The URL for the document we should retrieve. // :param parse_non_2xx: If True, attempt to parse non-200 level // resources. If False, raise a RetrievalFailed @@ -164,20 +162,20 @@ export default Resource // as though it has already fetched the content. Useful for using // Resource objects without having to do a GET. // """ -// +// // if type(content) != unicode: // raise TypeError("Provided content must be unicode.") // // if headers is None: // headers = {} -// +// // try: // utf8_content = content.encode('utf-8', 'strict') // except UnicodeDecodeError: // logger.warning("Unable to encode content for url %s. 
Content " // "should be unicode and encodeable at this point.") // utf8_content = content.encode('utf-8', 'replace') -// +// // mocked_response_dict = { // "cookies": {}, // "_content": utf8_content, @@ -214,7 +212,7 @@ export default Resource // mocked_response = requests.Response() // for k, v in mocked_response_dict.items(): // setattr(mocked_response, k, v) -// +// // return Resource( // url = url, // response = mocked_response @@ -225,13 +223,13 @@ export default Resource // def url(self): // return self._url // -// +// // @url.setter // def url(self, value): // parsed_url = urlparse(value) // if parsed_url.scheme not in ('http', 'https'): // raise ValueError("Resource only allows HTTP and HTTPS urls.") -// +// // if not parsed_url.netloc: // raise ValueError("Relative URLs are not allowed.") // @@ -311,21 +309,21 @@ export default Resource // def is_plaintext(self): // if 'text/plain' in self.content_type: // return True -// +// // return False -// +// // @property // def is_image(self): // if 'image' in self.content_type: // return True -// +// // return False -// +// // @property // def is_pdf(self): // if 'pdf' in self.content_type: // return True -// +// // return False // // _lxml_doc = None @@ -342,7 +340,7 @@ export default Resource // """ Generate an XPath Evaluator for this doc. """ // if self._docxp is None: // self._docxp = XPathEvaluator(self.doc) -// +// // return self._docxp // // _redocxp = None @@ -350,7 +348,7 @@ export default Resource // def redocxp(self): // """ Generate an XPath Evaluator for this doc, that includes the RE // namespace for regular expression matching. -// +// // """ // if self._redocxp is None: // _rens = {'re':'http://exslt.org/regular-expressions'} @@ -365,7 +363,7 @@ export default Resource // not is_text(self.content[:512])): // raise ValueError("Content does not appear to be text.") // -// +// // # Remove useless carriage returns which get parsed as otherwise // content = re.sub(r'(\n\r|\r\n)', '\n', self.content) // @@ -376,16 +374,16 @@ export default Resource // // // -// +// // if len(self._lxml_doc.getchildren()) == 0: // stats.increment('iris.resource.encoding.no_children') // raise ValueError("No children, likely a bad parse.") // // // # Sometimes, lxml (or BeautifulSoup) will wrap the whole document -// # in an extra html tag. This screws up a whole bunch of things in +// # in an extra html tag. This screws up a whole bunch of things in // # the parsing process. If this is the case, reset the doc to the -// # ACTUAL root of the doc. +// # ACTUAL root of the doc. // # Sample cases: // # * Strange Doctype causing issues: http://bit.ly/IATz0B // # * Messy markup causing double HTML tags: http://bit.ly/IGOq4o @@ -417,7 +415,7 @@ export default Resource // a.attrib['rel'] = ' '.join(rel_attribs) // else: // a.attrib['rel'] = 'nofollow' -// +// // # Re-relativize anchor links // anchor_link_xpath = ("//a[starts-with(@href, '%s#')]" % // self.url.replace("'", "%27")) @@ -430,16 +428,16 @@ export default Resource // def attrib_map(self): // """ Create an AttribMap object for fast checking of class/id existence // in the document. Used in association with extract_by_selector. -// +// // """ // if self._attrib_map is None: // self._attrib_map = AttribMap(self.doc) -// +// // return self._attrib_map // // // def extract_by_selector(self, selector): // " Shortcut to run extract_by_selector on our doc with our AttribMap. 
" // return ebs(self.doc, selector, self.attrib_map, self.docxp) -// +// // diff --git a/src/resource/index.test.js b/src/resource/index.test.js index ff7d09e6..5b4baf3b 100644 --- a/src/resource/index.test.js +++ b/src/resource/index.test.js @@ -1,58 +1,58 @@ -import assert from 'assert' +import assert from 'assert'; -import Resource from './index' +import Resource from './index'; describe('Resource', () => { - describe('create(url)', function() { - this.timeout(10000) - it('fetches the page and returns a cheerio object', async () => { - const url = 'http://theconcourse.deadspin.com/1786177057' - const $ = await Resource.create(url) + describe('create(url)', function test() { + this.timeout(10000); + it('fetches the page and returns a cheerio object', (async) () => { + const url = 'http://theconcourse.deadspin.com/1786177057'; + const $ = await Resource.create(url); - // console.log($.html()) - }) - }) + assert.equal(typeof $, 'function'); + }); + }); describe('generateDoc({ body, response })', () => { it('returns a cheerio object if valid', () => { - const response = { headers: { "content-type": "text/html" } } + const response = { headers: { 'content-type': 'text/html' } }; - const body = `<div><p>Hi</p></div>` - const $ = Resource.generateDoc({ body, response }) + const body = '<div><p>Hi</p></div>'; + const $ = Resource.generateDoc({ body, response }); - assert.equal($.html(), body) - }) + assert.equal($.html(), body); + }); it('throws an error if the content is not text', () => { const response = { headers: { - "content-type": "foo" - } - } - const body = '' + 'content-type': 'foo', + }, + }; + const body = ''; assert.throws( () => { - Resource.generateDoc({ body, response }) + Resource.generateDoc({ body, response }); }, /content does not appear to be text/i - ) - }) + ); + }); it('throws an error if the content has no children', () => { const response = { headers: { - "content-type": "html" - } - } - const body = `` + 'content-type': 'html', + }, + }; + const body = ''; assert.throws( () => { - Resource.generateDoc({ body, response }) + Resource.generateDoc({ body, response }); }, /no children/i - ) - }) - }) -}) + ); + }); + }); +}); diff --git a/src/resource/utils/constants.js b/src/resource/utils/constants.js index 39c379da..4b0daf13 100644 --- a/src/resource/utils/constants.js +++ b/src/resource/utils/constants.js @@ -1,36 +1,35 @@ export const REQUEST_HEADERS = { - "User-Agent": "Readability - http://readability.com/about/", -} + 'User-Agent': 'Readability - http://readability.com/about/', +}; // The number of milliseconds to attempt to fetch a resource before timing out. -export const FETCH_TIMEOUT = 10000 +export const FETCH_TIMEOUT = 10000; // Content types that we do not extract content from const BAD_CONTENT_TYPES = [ - 'audio/mpeg', - 'image/gif', - 'image/jpeg', - 'image/jpg', -] - -export const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i') + 'audio/mpeg', + 'image/gif', + 'image/jpeg', + 'image/jpg', +]; +export const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i'); // Use this setting as the maximum size an article can be // for us to attempt parsing. Defaults to 5 MB. -export const MAX_CONTENT_LENGTH = 5242880 +export const MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off // Proxying is not currently enabled in Python source // so not implementing logic in port. 
-export const PROXY_DOMAINS = false
+export const PROXY_DOMAINS = false;

 export const REQUESTS_PROXIES = {
-  'http': 'http://38.98.105.139:33333',
-  'https': 'http://38.98.105.139:33333',
-}
+  http: 'http://38.98.105.139:33333',
+  https: 'http://38.98.105.139:33333',
+};

 export const DOMAINS_TO_PROXY = [
   'nih.gov',
   'gutenberg.org',
-]
+];
diff --git a/src/resource/utils/dom/clean.js b/src/resource/utils/dom/clean.js
index 5ab2d80e..13f3b5b7 100644
--- a/src/resource/utils/dom/clean.js
+++ b/src/resource/utils/dom/clean.js
@@ -1,17 +1,21 @@
-import { TAGS_TO_REMOVE } from './constants'
-export default function clean($) {
-  $(TAGS_TO_REMOVE).remove()
-
-  $ = cleanComments($)
-  return $
-}
+import { TAGS_TO_REMOVE } from './constants';

 function isComment(index, node) {
-  return node.type === 'comment'
+  return node.type === 'comment';
 }

 function cleanComments($) {
-  $.root().find('*').contents().filter(isComment).remove()
+  $.root().find('*')
+    .contents()
+    .filter(isComment)
+    .remove();
+
+  return $;
+}
+
+export default function clean($) {
+  $(TAGS_TO_REMOVE).remove();

-  return $
+  $ = cleanComments($);
+  return $;
 }
diff --git a/src/resource/utils/dom/clean.test.js b/src/resource/utils/dom/clean.test.js
index 947b2067..86b99b5d 100644
--- a/src/resource/utils/dom/clean.test.js
+++ b/src/resource/utils/dom/clean.test.js
@@ -1,27 +1,27 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';

-import clean from './clean'
+import clean from './clean';

 describe('clean($)', () => {
   it('removes script elements', () => {
-    const html = `<div><script>alert('hi')</script></div>`
-    const $ = cheerio.load(html)
+    const html = '<div><script>alert(\'hi\')</script></div>';
+    const $ = cheerio.load(html);

-    assert.equal(clean($).html(), '<div></div>')
-  })
+    assert.equal(clean($).html(), '<div></div>');
+  });

   it('removes style elements', () => {
-    const html = `<div><style>foo: {color: red;}</style></div>`
-    const $ = cheerio.load(html)
+    const html = '<div><style>foo: {color: red;}</style></div>';
+    const $ = cheerio.load(html);

-    assert.equal(clean($).html(), '<div></div>')
-  })
+    assert.equal(clean($).html(), '<div></div>');
+  });

   it('removes comments', () => {
-    const html = `<div>HI <!-- This is a comment --></div>`
-    const $ = cheerio.load(html)
+    const html = '<div>HI <!-- This is a comment --></div>';
+    const $ = cheerio.load(html);

-    assert.equal(clean($).html(), '<div>HI </div>')
-  })
-})
+    assert.equal(clean($).html(), '<div>HI </div>');
+  });
+});
diff --git a/src/resource/utils/dom/constants.js b/src/resource/utils/dom/constants.js
index 3332172d..129efaf0 100644
--- a/src/resource/utils/dom/constants.js
+++ b/src/resource/utils/dom/constants.js
@@ -1,8 +1,8 @@
-export const IS_LINK = new RegExp('https?://', 'i')
-export const IS_IMAGE = new RegExp('\.(png|gif|jpe?g)', 'i')
+export const IS_LINK = new RegExp('https?://', 'i');
+// Note: the dot must be double-escaped inside a string literal so the
+// RegExp receives a literal "." rather than "match any character".
+export const IS_IMAGE = new RegExp('\\.(png|gif|jpe?g)', 'i');

 export const TAGS_TO_REMOVE = [
   'script',
   'style',
   'form',
-].join(',')
+].join(',');
diff --git a/src/resource/utils/dom/convert-lazy-loaded-images.js b/src/resource/utils/dom/convert-lazy-loaded-images.js
index 8b757a9a..11c67fbc 100644
--- a/src/resource/utils/dom/convert-lazy-loaded-images.js
+++ b/src/resource/utils/dom/convert-lazy-loaded-images.js
@@ -1,9 +1,9 @@
-import 'babel-polyfill'
+import 'babel-polyfill';

 import {
   IS_LINK,
   IS_IMAGE,
-} from './constants'
+} from './constants';

 // Convert all instances of images with
potentially // lazy loaded images into normal images. @@ -13,14 +13,14 @@ import { export default function convertLazyLoadedImages($) { $('img').each((_, img) => { Reflect.ownKeys(img.attribs).forEach((attr) => { - const value = img.attribs[attr] + const value = img.attribs[attr]; if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) { - $(img).attr('src', value) + $(img).attr('src', value); } - }) - }) + }); + }); - return $ + return $; } diff --git a/src/resource/utils/dom/convert-lazy-loaded-images.test.js b/src/resource/utils/dom/convert-lazy-loaded-images.test.js index 386777fc..615888f6 100644 --- a/src/resource/utils/dom/convert-lazy-loaded-images.test.js +++ b/src/resource/utils/dom/convert-lazy-loaded-images.test.js @@ -1,44 +1,44 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import convertLazyLoadedImages from './convert-lazy-loaded-images' +import convertLazyLoadedImages from './convert-lazy-loaded-images'; describe('convertLazyLoadedImages($)', () => { it('moves image links to src if placed in another attribute', () => { - const html = `<img data-src="http://example.com/foo.jpg">` - const $ = cheerio.load(html) + const html = '<img data-src="http://example.com/foo.jpg">'; + const $ = cheerio.load(html); - const result = convertLazyLoadedImages($).html() + const result = convertLazyLoadedImages($).html(); - assert.equal(result, `<img data-src="http://example.com/foo.jpg" src="http://example.com/foo.jpg">`) - }) + assert.equal(result, '<img data-src="http://example.com/foo.jpg" src="http://example.com/foo.jpg">'); + }); it('does nothing when value is not a link', () => { // This is far from perfect, since a relative url could // be perfectly correct. 
- const html = `<img data-src="foo.jpg">` - const $ = cheerio.load(html) + const html = '<img data-src="foo.jpg">'; + const $ = cheerio.load(html); - const result = convertLazyLoadedImages($).html() + const result = convertLazyLoadedImages($).html(); - assert.equal(result, `<img data-src="foo.jpg">`) - }) + assert.equal(result, '<img data-src="foo.jpg">'); + }); it('does nothing when value is not an image', () => { - const html = `<img data-src="http://example.com">` - const $ = cheerio.load(html) + const html = '<img data-src="http://example.com">'; + const $ = cheerio.load(html); - const result = convertLazyLoadedImages($).html() + const result = convertLazyLoadedImages($).html(); - assert.equal(result, `<img data-src="http://example.com">`) - }) + assert.equal(result, '<img data-src="http://example.com">'); + }); it('does not change a correct img with src', () => { - const html = `<img src="http://example.com/foo.jpg">` - const $ = cheerio.load(html) + const html = '<img src="http://example.com/foo.jpg">'; + const $ = cheerio.load(html); - const result = convertLazyLoadedImages($).html() + const result = convertLazyLoadedImages($).html(); - assert.equal(result, `<img src="http://example.com/foo.jpg">`) - }) -}) + assert.equal(result, '<img src="http://example.com/foo.jpg">'); + }); +}); diff --git a/src/resource/utils/dom/index.js b/src/resource/utils/dom/index.js index 42def32a..1a74439d 100644 --- a/src/resource/utils/dom/index.js +++ b/src/resource/utils/dom/index.js @@ -1,3 +1,3 @@ -export { default as normalizeMetaTags } from './normalize-meta-tags' -export { default as convertLazyLoadedImages } from './convert-lazy-loaded-images' -export { default as clean } from './clean' +export { default as normalizeMetaTags } from './normalize-meta-tags'; +export { default as convertLazyLoadedImages } from './convert-lazy-loaded-images'; +export { default as clean } from './clean'; diff --git a/src/resource/utils/dom/normalize-meta-tags.js b/src/resource/utils/dom/normalize-meta-tags.js index c937b871..994fa0b7 100644 --- a/src/resource/utils/dom/normalize-meta-tags.js +++ b/src/resource/utils/dom/normalize-meta-tags.js @@ -1,3 +1,15 @@ +function convertMetaProp($, from, to) { + $(`meta[${from}]`).each((_, node) => { + const $node = $(node); + + const value = $node.attr(from); + $node.attr(to, value); + $node.removeAttr(from); + }); + + return $; +} + // For ease of use in extracting from meta tags, // replace the "content" attribute on meta tags with the // "value" attribute. @@ -6,19 +18,7 @@ // querying later. See, e.g., og or twitter meta tags. 
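// For example (illustrative markup, not from the fixtures), a tag like
//
//   <meta property="og:title" content="Foo">
//
// comes out of normalizeMetaTags (below) as
//
//   <meta value="Foo" name="og:title">
//
// so extractors can query every meta tag uniformly by name/value.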
export default function normalizeMetaTags($) { - $ = convertMetaProp($, 'content', 'value') - $ = convertMetaProp($, 'property', 'name') - return $ -} - -function convertMetaProp($, from, to) { - $(`meta[${from}]`).each((_, node) => { - const $node = $(node) - - const value = $node.attr(from) - $node.attr(to, value) - $node.removeAttr(from) - }) - - return $ + $ = convertMetaProp($, 'content', 'value'); + $ = convertMetaProp($, 'property', 'name'); + return $; } diff --git a/src/resource/utils/dom/normalize-meta-tags.test.js b/src/resource/utils/dom/normalize-meta-tags.test.js index 8a8bceb6..3b4959a9 100644 --- a/src/resource/utils/dom/normalize-meta-tags.test.js +++ b/src/resource/utils/dom/normalize-meta-tags.test.js @@ -1,28 +1,28 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import normalizeMetaTags from './normalize-meta-tags' +import normalizeMetaTags from './normalize-meta-tags'; describe('normalizeMetaTags($)', () => { it('replaces "content" attributes with "value"', () => { - const html = `<html><meta name="foo" content="bar"></html>` - const test = `<html><meta name="foo" value="bar"></html>` + const html = '<html><meta name="foo" content="bar"></html>'; + const test = '<html><meta name="foo" value="bar"></html>'; - const $ = cheerio.load(html) + const $ = cheerio.load(html); - const result = normalizeMetaTags($).html() + const result = normalizeMetaTags($).html(); - assert.equal(result, test) - }) + assert.equal(result, test); + }); it('replaces "property" attributes with "name"', () => { - const html = `<html><meta property="foo" value="bar"></html>` - const test = `<html><meta value="bar" name="foo"></html>` + const html = '<html><meta property="foo" value="bar"></html>'; + const test = '<html><meta value="bar" name="foo"></html>'; - const $ = cheerio.load(html) + const $ = cheerio.load(html); - const result = normalizeMetaTags($).html() + const result = normalizeMetaTags($).html(); - assert.equal(result, test) - }) -}) + assert.equal(result, test); + }); +}); diff --git a/src/resource/utils/fetch-resource.js b/src/resource/utils/fetch-resource.js index fefc283e..4a3d691c 100644 --- a/src/resource/utils/fetch-resource.js +++ b/src/resource/utils/fetch-resource.js @@ -1,55 +1,25 @@ -import 'babel-polyfill' +import 'babel-polyfill'; -import URL from 'url' -import request from 'request' +import URL from 'url'; +import request from 'request'; import { REQUEST_HEADERS, FETCH_TIMEOUT, BAD_CONTENT_TYPES_RE, MAX_CONTENT_LENGTH, -} from './constants' +} from './constants'; -// Set our response attribute to the result of fetching our URL. -// TODO: This should gracefully handle timeouts and raise the -// proper exceptions on the many failure cases of HTTP. -// TODO: Ensure we are not fetching something enormous. Always return -// unicode content for HTML, with charset conversion. 
- -export default async function fetchResource(url) { - const parsedUrl = URL.parse(url) - - const options = { - url: parsedUrl, - headers: { ...REQUEST_HEADERS }, - timeout: FETCH_TIMEOUT, - // Don't set encoding; this fixes issues - // w/gzipped responses - encoding: null, - // Accept cookies - jar: true, - } - - const { response, body } = await get(options) - - try { - validateResponse(response) - return { body, response } - } catch(e) { - return e - } -} - -function get(options){ - return new Promise(function(resolve, reject){ - request(options, function(err, response, body){ - if(err){ - reject(err) +function get(options) { + return new Promise((resolve, reject) => { + request(options, (err, response, body) => { + if (err) { + reject(err); } else { - resolve({ body, response }) + resolve({ body, response }); } - }) - }) + }); + }); } // Evaluate a response to ensure it's something we should be keeping. @@ -57,45 +27,74 @@ function get(options){ // not. Validation here means that we haven't found reason to bail from // further processing of this url. -export function validateResponse(response, parseNon2xx=false) { +export function validateResponse(response, parseNon2xx = false) { // Check if we got a valid status code - if (response.statusMessage !== "OK") { + if (response.statusMessage !== 'OK') { if (!response.statusCode) { throw new Error( `Unable to fetch content. Original exception was ${response.error}` - ) + ); } else if (!parseNon2xx) { throw new Error( `Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.` - ) + ); } - } const { - "content-type": contentType, - "content-length": contentLength - } = response.headers + 'content-type': contentType, + 'content-length': contentLength, + } = response.headers; // Check that the content is not in BAD_CONTENT_TYPES if (BAD_CONTENT_TYPES_RE.test(contentType)) { throw new Error( `Content-type for this resource was ${contentType} and is not allowed.` - ) + ); } // Check that the content length is below maximum if (contentLength > MAX_CONTENT_LENGTH) { throw new Error( `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.` - ) + ); } - return true + return true; } // Grabs the last two pieces of the URL and joins them back together // This is to get the 'livejournal.com' from 'erotictrains.livejournal.com' export function baseDomain({ host }) { - return host.split('.').slice(-2).join('.') + return host.split('.').slice(-2).join('.'); +} + +// Set our response attribute to the result of fetching our URL. +// TODO: This should gracefully handle timeouts and raise the +// proper exceptions on the many failure cases of HTTP. +// TODO: Ensure we are not fetching something enormous. Always return +// unicode content for HTML, with charset conversion. 
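// A sketch of the intended call site (assumed usage; note that on a failed
// validation this function currently returns the Error rather than
// throwing it, so callers should check for that):
//
//   import fetchResource from './fetch-resource';
//
//   async function example(url) {
//     const result = await fetchResource(url);
//     if (result instanceof Error) {
//       return null; // validateResponse rejected the response
//     }
//     const { body, response } = result;
//     return { body, contentType: response.headers['content-type'] };
//   }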
+
+export default async function fetchResource(url) {
+  const parsedUrl = URL.parse(url);
+
+  const options = {
+    url: parsedUrl,
+    headers: { ...REQUEST_HEADERS },
+    timeout: FETCH_TIMEOUT,
+    // Don't set encoding; this fixes issues
+    // w/gzipped responses
+    encoding: null,
+    // Accept cookies
+    jar: true,
+  };
+
+  const { response, body } = await get(options);
+
+  try {
+    validateResponse(response);
+    return { body, response };
+  } catch (e) {
+    return e;
+  }
 }
diff --git a/src/resource/utils/fetch-resource.test.js b/src/resource/utils/fetch-resource.test.js
index 94442d5d..f3b3c398 100644
--- a/src/resource/utils/fetch-resource.test.js
+++ b/src/resource/utils/fetch-resource.test.js
@@ -1,118 +1,118 @@
-import assert from 'assert'
-import URL from 'url'
+import assert from 'assert';
+import URL from 'url';

 import {
   default as fetchResource,
   baseDomain,
   validateResponse,
-} from './fetch-resource'
-import { MAX_CONTENT_LENGTH } from './constants'
+} from './fetch-resource';
+import { MAX_CONTENT_LENGTH } from './constants';

-describe('fetchResource(url)', function() {
-  this.timeout(1000000)
-  it('fetches domains', async () => {
-    const url = 'http://theconcourse.deadspin.com/1786177057'
-    const { body, response } = await fetchResource(url)
+describe('fetchResource(url)', function test() {
+  this.timeout(1000000);
+  it('fetches domains', async () => {
+    const url = 'http://theconcourse.deadspin.com/1786177057';
+    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object')
-  })
+    assert.equal(typeof body, 'object');
+  });

-  it('fetches nyt', async () => {
-    const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0'
-    const { body, response } = await fetchResource(url)
+  it('fetches nyt', async () => {
+    const url = 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';
+    const { body } = await fetchResource(url);

-    assert.equal(typeof body, 'object')
-  })
-})
+    assert.equal(typeof body, 'object');
+  });
+});

 describe('validateResponse(response)', () => {
   it('validates a response object', () => {
     const validResponse = {
-      statusMessage: "OK",
+      statusMessage: 'OK',
       statusCode: 200,
       headers: {
-        "content-type": 'text/html',
-        "content-length": 500,
-      }
-    }
+        'content-type': 'text/html',
+        'content-length': 500,
+      },
+    };

-    assert.equal(validateResponse(validResponse), true)
-  })
+    assert.equal(validateResponse(validResponse), true);
+  });

   it('throws an error if there is no status code', () => {
     const invalidResponse = {
-    }
+    };

     assert.throws(
       () => {
-        validateResponse(invalidResponse)
+        validateResponse(invalidResponse);
       },
       /unable to fetch content/i
-    )
-  })
+    );
+  });

   it('throws an error if response code is not 2xx', () => {
     const invalidResponse = {
       statusCode: 500,
-    }
+    };

     assert.throws(
       () => {
-        validateResponse(invalidResponse)
+        validateResponse(invalidResponse);
       },
       /instructed to reject non-2xx/i
-    )
-  })
+    );
+  });

   it('throws an error if response has bad content-type', () => {
     const invalidResponse = {
-      statusMessage: "OK",
+      statusMessage: 'OK',
       statusCode: 200,
       headers: {
-        "content-type": 'image/gif',
-        "content-length": 500,
-      }
-    }
+        'content-type': 'image/gif',
+        'content-length': 500,
+      },
+    };

     assert.throws(
       () => {
-        validateResponse(invalidResponse)
+        validateResponse(invalidResponse);
       },
       /content-type for this resource/i
-    )
-  })
+    );
+  });

   it('throws an error if response length is > max', () => {
     const invalidResponse = {
-      statusMessage:
"OK", + statusMessage: 'OK', statusCode: 200, headers: { - "content-type": 'text/html', - "content-length": MAX_CONTENT_LENGTH + 1, - } - } + 'content-type': 'text/html', + 'content-length': MAX_CONTENT_LENGTH + 1, + }, + }; assert.throws( () => { - validateResponse(invalidResponse) + validateResponse(invalidResponse); }, /Content for this resource was too large/i - ) - }) -}) + ); + }); +}); describe('baseDomain(parsedUrl)', () => { it('returns the base domain, excluding subdomain', () => { - const url = 'https://www.npmjs.com/package/request#streaming' - const parsedUrl = URL.parse(url) + const url = 'https://www.npmjs.com/package/request#streaming'; + const parsedUrl = URL.parse(url); - assert.equal(baseDomain(parsedUrl), 'npmjs.com') - }) + assert.equal(baseDomain(parsedUrl), 'npmjs.com'); + }); it('returns the base domain as is if no subdomain', () => { - const url = 'https://npmjs.com/package/request#streaming' - const parsedUrl = URL.parse(url) + const url = 'https://npmjs.com/package/request#streaming'; + const parsedUrl = URL.parse(url); - assert.equal(baseDomain(parsedUrl), 'npmjs.com') - }) -}) + assert.equal(baseDomain(parsedUrl), 'npmjs.com'); + }); +}); diff --git a/src/resource/utils/index.js b/src/resource/utils/index.js index c41fd72f..b98d4710 100644 --- a/src/resource/utils/index.js +++ b/src/resource/utils/index.js @@ -1 +1 @@ -export { default as fetchResource } from './fetch-resource' +export { default as fetchResource } from './fetch-resource'; diff --git a/src/test-helpers.js b/src/test-helpers.js index 9598b25e..1a12cec2 100644 --- a/src/test-helpers.js +++ b/src/test-helpers.js @@ -1,11 +1,10 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; export function clean(string) { - return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ') + return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' '); } export function assertClean(a, b) { - assert.equal(clean(a), clean(b)) + assert.equal(clean(a), clean(b)); } diff --git a/src/utils/dom/brs-to-ps.js b/src/utils/dom/brs-to-ps.js index 1bcd6901..7ad53513 100644 --- a/src/utils/dom/brs-to-ps.js +++ b/src/utils/dom/brs-to-ps.js @@ -1,4 +1,4 @@ -import { paragraphize } from './index' +import { paragraphize } from './index'; // ## NOTES: // Another good candidate for refactoring/optimizing. 
@@ -11,19 +11,19 @@ import { paragraphize } from './index' // :param $: A cheerio object export default function brsToPs($) { - let collapsing = false + let collapsing = false; $('br').each((index, element) => { - let nextElement = $(element).next().get(0) + const nextElement = $(element).next().get(0); if (nextElement && nextElement.tagName === 'br') { - collapsing = true - $(element).remove() + collapsing = true; + $(element).remove(); } else if (collapsing) { - collapsing = false + collapsing = false; // $(element).replaceWith('<p />') - paragraphize(element, $, true) + paragraphize(element, $, true); } - }) + }); - return $ + return $; } diff --git a/src/utils/dom/brs-to-ps.test.js b/src/utils/dom/brs-to-ps.test.js index 6e537567..5c3eaadc 100644 --- a/src/utils/dom/brs-to-ps.test.js +++ b/src/utils/dom/brs-to-ps.test.js @@ -1,39 +1,37 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import { assertClean } from 'test-helpers' -import HTML from './fixtures/html' -import brsToPs from './brs-to-ps' +import { assertClean } from 'test-helpers'; +import HTML from './fixtures/html'; +import brsToPs from './brs-to-ps'; function assertBeforeAndAfter(key, fn) { - const $ = cheerio.load(HTML[key].before) - assertClean(fn($).html(), HTML[key].after) + const $ = cheerio.load(HTML[key].before); + assertClean(fn($).html(), HTML[key].after); } describe('Generic Extractor Utils', () => { describe('brsToPs(node)', () => { - - it("does nothing when no BRs present", () => { - const $ = cheerio.load(HTML.positiveId) - assert.equal(brsToPs($).html(), HTML.positiveId) - }) - - it("does nothing when a single BR is present", () => { - assertBeforeAndAfter('singleBr', brsToPs) - }) - - it("converts double BR tags to an empty P tag", () => { - assertBeforeAndAfter('doubleBrs', brsToPs) - }) - - it("converts several BR tags to an empty P tag", () => { - assertBeforeAndAfter('severalBrs', brsToPs) - }) - - it("converts BR tags in a P tag into a P containing inline children", () => { - assertBeforeAndAfter('brsInP', brsToPs) - }) - - }) -}) + it('does nothing when no BRs present', () => { + const $ = cheerio.load(HTML.positiveId); + assert.equal(brsToPs($).html(), HTML.positiveId); + }); + + it('does nothing when a single BR is present', () => { + assertBeforeAndAfter('singleBr', brsToPs); + }); + + it('converts double BR tags to an empty P tag', () => { + assertBeforeAndAfter('doubleBrs', brsToPs); + }); + + it('converts several BR tags to an empty P tag', () => { + assertBeforeAndAfter('severalBrs', brsToPs); + }); + + it('converts BR tags in a P tag into a P containing inline children', () => { + assertBeforeAndAfter('brsInP', brsToPs); + }); + }); +}); diff --git a/src/utils/dom/clean-attributes.js b/src/utils/dom/clean-attributes.js index bc6913c7..5440654b 100644 --- a/src/utils/dom/clean-attributes.js +++ b/src/utils/dom/clean-attributes.js @@ -1,34 +1,29 @@ -import 'babel-polyfill' +import 'babel-polyfill'; -import { - REMOVE_ATTR_SELECTORS, - REMOVE_ATTR_LIST, - REMOVE_ATTRS, - WHITELIST_ATTRS_RE, -} from './constants' +import { WHITELIST_ATTRS_RE } from './constants'; -// Remove attributes like style or align -export default function cleanAttributes($article, $) { - removeAllButWhitelist($article, $) - - return $ -} - -function removeAllButWhitelist($article, $) { +function removeAllButWhitelist($article) { // $('*', article).each((index, node) => { $article.find('*').each((index, node) => { node.attribs = 
Reflect.ownKeys(node.attribs).reduce((acc, attr) => { if (WHITELIST_ATTRS_RE.test(attr)) { - return { ...acc, [attr]: node.attribs[attr] } - } else { - return acc + return { ...acc, [attr]: node.attribs[attr] }; } - }, {}) - }) + + return acc; + }, {}); + }); } -function removeAttrs(article, $) { - REMOVE_ATTRS.forEach((attr) => { - $(`[${attr}]`, article).removeAttr(attr) - }) +// function removeAttrs(article, $) { +// REMOVE_ATTRS.forEach((attr) => { +// $(`[${attr}]`, article).removeAttr(attr); +// }); +// } + +// Remove attributes like style or align +export default function cleanAttributes($article) { + removeAllButWhitelist($article); + + return $article; } diff --git a/src/utils/dom/clean-attributes.test.js b/src/utils/dom/clean-attributes.test.js index 1021cf7a..2bceb198 100644 --- a/src/utils/dom/clean-attributes.test.js +++ b/src/utils/dom/clean-attributes.test.js @@ -1,24 +1,22 @@ -import cheerio from 'cheerio' -import assert from 'assert' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { cleanAttributes } from './index' +import HTML from './fixtures/html'; +import { cleanAttributes } from './index'; describe('cleanAttributes($)', () => { - it("removes style attributes from nodes", () => { - let $ = cheerio.load(HTML.removeStyle.before) + it('removes style attributes from nodes', () => { + const $ = cheerio.load(HTML.removeStyle.before); - let result = cleanAttributes($('*').first(), $) - assertClean(result.html(), HTML.removeStyle.after) - }) + const result = cleanAttributes($('*').first()); + assertClean($.html(result), HTML.removeStyle.after); + }); - it("removes align attributes from nodes", () => { - let $ = cheerio.load(HTML.removeAlign.before) + it('removes align attributes from nodes', () => { + const $ = cheerio.load(HTML.removeAlign.before); - let result = cleanAttributes($('*').first(), $) - assertClean(result.html(), HTML.removeAlign.after) - }) - -}) + const result = cleanAttributes($('*').first()); + assertClean($.html(result), HTML.removeAlign.after); + }); +}); diff --git a/src/utils/dom/clean-h-ones.js b/src/utils/dom/clean-h-ones.js index b6702832..e8710295 100644 --- a/src/utils/dom/clean-h-ones.js +++ b/src/utils/dom/clean-h-ones.js @@ -1,18 +1,18 @@ -import { convertNodeTo } from 'utils/dom' +import { convertNodeTo } from 'utils/dom'; // H1 tags are typically the article title, which should be extracted // by the title extractor instead. If there's less than 3 of them (<3), // strip them. Otherwise, turn 'em into H2s. 
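// An illustrative sketch of both branches below (sample markup, not from
// the fixtures):
//
//   const $ = cheerio.load('<div><h1>Title</h1><p>Body</p></div>');
//   cleanHOnes($('div').first(), $);
//   // Fewer than 3 h1s: they are removed -> '<div><p>Body</p></div>'
//
//   // With three or more h1s in the article, each one would instead be
//   // converted to an h2 via convertNodeTo.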
export default function cleanHOnes(article, $) { // const hOnes = $.find('h1') - const $hOnes = $('h1', article) + const $hOnes = $('h1', article); if ($hOnes.length < 3) { - $hOnes.each((index, node) => $(node).remove()) + $hOnes.each((index, node) => $(node).remove()); } else { $hOnes.each((index, node) => { - convertNodeTo($(node), $, 'h2') - }) + convertNodeTo($(node), $, 'h2'); + }); } - return $ + return $; } diff --git a/src/utils/dom/clean-h-ones.test.js b/src/utils/dom/clean-h-ones.test.js index 30effef0..86393df0 100644 --- a/src/utils/dom/clean-h-ones.test.js +++ b/src/utils/dom/clean-h-ones.test.js @@ -1,28 +1,23 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { cleanHOnes } from './index' +import HTML from './fixtures/html'; +import { cleanHOnes } from './index'; describe('cleanHOnes($)', () => { - it("removes H1s if there are less than 3 of them", () => { - let $ = cheerio.load(HTML.removeTwoHOnes.before) - - let result = cleanHOnes($('*').first(), $) - assertClean(result.html(), HTML.removeTwoHOnes.after) - }) - - it("converts H1s to H2s if there are 3 or more of them", () => { - let $ = cheerio.load(HTML.convertThreeHOnes.before) - - let result = cleanHOnes($('*').first(), $) - assertClean(result.html(), HTML.convertThreeHOnes.after) - }) - -}) + it('removes H1s if there are less than 3 of them', () => { + const $ = cheerio.load(HTML.removeTwoHOnes.before); + const result = cleanHOnes($('*').first(), $); + assertClean(result.html(), HTML.removeTwoHOnes.after); + }); + it('converts H1s to H2s if there are 3 or more of them', () => { + const $ = cheerio.load(HTML.convertThreeHOnes.before); + const result = cleanHOnes($('*').first(), $); + assertClean(result.html(), HTML.convertThreeHOnes.after); + }); +}); diff --git a/src/utils/dom/clean-headers.js b/src/utils/dom/clean-headers.js index bc7d5f58..2db0ddd2 100644 --- a/src/utils/dom/clean-headers.js +++ b/src/utils/dom/clean-headers.js @@ -1,39 +1,32 @@ -import { HEADER_TAG_LIST } from './constants' -import { normalizeSpaces } from '../text' -import { getWeight } from 'extractors/generic/content/scoring' +import { getWeight } from 'extractors/generic/content/scoring'; -export default function cleanHeaders($article, $, title='') { +import { HEADER_TAG_LIST } from './constants'; +import { normalizeSpaces } from '../text'; + +export default function cleanHeaders($article, $, title = '') { $(HEADER_TAG_LIST, $article).each((index, header) => { - const $header = $(header) + const $header = $(header); // Remove any headers that appear before all other p tags in the // document. This probably means that it was part of the title, a // subtitle or something else extraneous like a datestamp or byline, // all of which should be handled by other metadata handling. if ($($header, $article).prevAll('p').length === 0) { - return $header.remove() + return $header.remove(); } // Remove any headers that match the title exactly. if (normalizeSpaces($(header).text()) === title) { - return $header.remove() + return $header.remove(); } // If this header has a negative weight, it's probably junk. // Get rid of it. if (getWeight($(header)) < 0) { - return $header.remove() + return $header.remove(); } - }) - return $ + + return $header; + }); + + return $; } - // # If this header has a negative weight, it's probably junk. - // # Get rid of it. 
- // if self.get_weight(header) < 0: - // drop_header = True - // - // if drop_header: - // try: - // header.drop_tree() - // except AssertionError: - // # No parent exists for this node, so just blank it out. - // header.text = '' diff --git a/src/utils/dom/clean-headers.test.js b/src/utils/dom/clean-headers.test.js index 5fff337d..7c9867f3 100644 --- a/src/utils/dom/clean-headers.test.js +++ b/src/utils/dom/clean-headers.test.js @@ -1,31 +1,30 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { cleanHeaders } from './index' +import HTML from './fixtures/html'; +import { cleanHeaders } from './index'; describe('cleanHeaders(article, $)', () => { - it("parses html and returns the article", () => { - let $ = cheerio.load(HTML.cleanFirstHeds.before) + it('parses html and returns the article', () => { + const $ = cheerio.load(HTML.cleanFirstHeds.before); - let result = cleanHeaders($('*').first(), $) - assertClean(result.html(), HTML.cleanFirstHeds.after) - }) + const result = cleanHeaders($('*').first(), $); + assertClean(result.html(), HTML.cleanFirstHeds.after); + }); - it("removes headers when the header text matches the title", () => { - let $ = cheerio.load(HTML.cleanTitleMatch.before) + it('removes headers when the header text matches the title', () => { + const $ = cheerio.load(HTML.cleanTitleMatch.before); - let result = cleanHeaders($('*').first(), $, 'Title Match') - assertClean(result.html(), HTML.cleanTitleMatch.after) - }) + const result = cleanHeaders($('*').first(), $, 'Title Match'); + assertClean(result.html(), HTML.cleanTitleMatch.after); + }); - it("removes headers with a negative weight", () => { - let $ = cheerio.load(HTML.dropWithNegativeWeight.before) + it('removes headers with a negative weight', () => { + const $ = cheerio.load(HTML.dropWithNegativeWeight.before); - let result = cleanHeaders($('*').first(), $) - assertClean(result.html(), HTML.dropWithNegativeWeight.after) - }) -}) + const result = cleanHeaders($('*').first(), $); + assertClean(result.html(), HTML.dropWithNegativeWeight.after); + }); +}); diff --git a/src/utils/dom/clean-images.js b/src/utils/dom/clean-images.js index 2a089f32..42f2a0ff 100644 --- a/src/utils/dom/clean-images.js +++ b/src/utils/dom/clean-images.js @@ -1,41 +1,41 @@ -import { SPACER_RE } from './constants' - -export default function cleanImages($article, $) { - $article.find('img').each((index, img) => { - const $img = $(img) - - cleanForHeight($img, $) - removeSpacers($img, $) - }) - - return $ -} +import { SPACER_RE } from './constants'; function cleanForHeight($img, $) { - const height = parseInt($img.attr('height')) - const width = parseInt($img.attr('width')) || 20 + const height = parseInt($img.attr('height'), 10); + const width = parseInt($img.attr('width'), 10) || 20; // Remove images that explicitly have very small heights or // widths, because they are most likely shims or icons, // which aren't very useful for reading. if ((height || 20) < 10 || width < 10) { - $img.remove() + $img.remove(); } else if (height) { // Don't ever specify a height on images, so that we can // scale with respect to width without screwing up the // aspect ratio. 
- $img.removeAttr('height') + $img.removeAttr('height'); } - return $ + return $; } // Cleans out images where the source string matches transparent/spacer/etc // TODO This seems very aggressive - AP function removeSpacers($img, $) { if (SPACER_RE.test($img.attr('src'))) { - $img.remove() + $img.remove(); } - return $ + return $; +} + +export default function cleanImages($article, $) { + $article.find('img').each((index, img) => { + const $img = $(img); + + cleanForHeight($img, $); + removeSpacers($img, $); + }); + + return $; } diff --git a/src/utils/dom/clean-images.test.js b/src/utils/dom/clean-images.test.js index f71f8df4..c616c59c 100644 --- a/src/utils/dom/clean-images.test.js +++ b/src/utils/dom/clean-images.test.js @@ -1,33 +1,30 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { cleanImages } from './index' +import HTML from './fixtures/html'; +import { cleanImages } from './index'; describe('cleanImages($)', () => { - it("removes images with small heights/widths", () => { - let $ = cheerio.load(HTML.cleanSmallImages.before) + it('removes images with small heights/widths', () => { + const $ = cheerio.load(HTML.cleanSmallImages.before); - let result = cleanImages($('*').first(), $) - assertClean(result.html(), HTML.cleanSmallImages.after) - }) + const result = cleanImages($('*').first(), $); + assertClean(result.html(), HTML.cleanSmallImages.after); + }); - it("removes height attribute from images that remain", () => { - let $ = cheerio.load(HTML.cleanHeight.before) + it('removes height attribute from images that remain', () => { + const $ = cheerio.load(HTML.cleanHeight.before); - let result = cleanImages($('*').first(), $) - assertClean(result.html(), HTML.cleanHeight.after) - }) - - it("removes spacer/transparent images", () => { - let $ = cheerio.load(HTML.cleanSpacer.before) - - let result = cleanImages($('*').first(), $) - assertClean(result.html(), HTML.cleanSpacer.after) - }) -}) + const result = cleanImages($('*').first(), $); + assertClean(result.html(), HTML.cleanHeight.after); + }); + it('removes spacer/transparent images', () => { + const $ = cheerio.load(HTML.cleanSpacer.before); + const result = cleanImages($('*').first(), $); + assertClean(result.html(), HTML.cleanSpacer.after); + }); +}); diff --git a/src/utils/dom/clean-tags.js b/src/utils/dom/clean-tags.js index 3c461457..71828fca 100644 --- a/src/utils/dom/clean-tags.js +++ b/src/utils/dom/clean-tags.js @@ -1,104 +1,109 @@ -import { CLEAN_CONDITIONALLY_TAGS } from './constants' import { getScore, setScore, getOrInitScore, scoreCommas, -} from 'extractors/generic/content/scoring' +} from 'extractors/generic/content/scoring'; -import { normalizeSpaces } from '../text' - -import { linkDensity } from './index' - -// Given an article, clean it of some superfluous content specified by -// tags. Things like forms, ads, etc. -// -// Tags is an array of tag name's to search through. (like div, form, -// etc) -// -// Return this same doc. 
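// A worked example of the heuristics implemented in removeUnlessContent
// below (sample markup; the thresholds are the ones in this file, and the
// inner div is assumed to score non-negatively):
//
//   const $ = cheerio.load(
//     '<div><div><input><input><input><p>Subscribe!</p></div></div>'
//   );
//   cleanTags($('div').first(), $);
//   // The inner div has few commas, and 3 inputs vs. 1 <p> tag
//   // (inputCount > pCount / 3), so it is dropped as a probable form.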
-export default function cleanTags($article, $) { - $(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => { - const $node = $(node) - let weight = getScore($node) - if (!weight) { - weight = getOrInitScore($node, $) - setScore($node, $, weight) - } - - // drop node if its weight is < 0 - if (weight < 0) { - $node.remove() - } else { - // deteremine if node seems like content - removeUnlessContent($node, $, weight) - } - }) - - return $ -} +import { CLEAN_CONDITIONALLY_TAGS } from './constants'; +import { normalizeSpaces } from '../text'; +import { linkDensity } from './index'; function removeUnlessContent($node, $, weight) { // Explicitly save entry-content-asset tags, which are // noted as valuable in the Publisher guidelines. For now // this works everywhere. We may want to consider making // this less of a sure-thing later. - if ($node.hasClass('entry-content-asset')) { - return - } + if ($node.hasClass('entry-content-asset')) { + return; + } - const content = normalizeSpaces($node.text()) + const content = normalizeSpaces($node.text()); - if (scoreCommas(content) < 10) { - const pCount = $('p', $node).length - const inputCount = $('input', $node).length + if (scoreCommas(content) < 10) { + const pCount = $('p', $node).length; + const inputCount = $('input', $node).length; // Looks like a form, too many inputs. - if (inputCount > (pCount / 3)) { - return $node.remove() - } + if (inputCount > (pCount / 3)) { + $node.remove(); + return; + } - const contentLength = content.length - const imgCount = $('img', $node).length + const contentLength = content.length; + const imgCount = $('img', $node).length; // Content is too short, and there are no images, so // this is probably junk content. - if (contentLength < 25 && imgCount === 0) { - return $node.remove() - } + if (contentLength < 25 && imgCount === 0) { + $node.remove(); + return; + } - const density = linkDensity($node) + const density = linkDensity($node); // Too high of link density, is probably a menu or // something similar. // console.log(weight, density, contentLength) - if (weight < 25 && density > 0.2 && contentLength > 75) { - return $node.remove() - } + if (weight < 25 && density > 0.2 && contentLength > 75) { + $node.remove(); + return; + } // Too high of a link density, despite the score being // high. - if (weight >= 25 && density > 0.5) { + if (weight >= 25 && density > 0.5) { // Don't remove the node if it's a list and the // previous sibling starts with a colon though. That // means it's probably content. - const tagName = $node.get(0).tagName - const nodeIsList = tagName === 'ol' || tagName === 'ul' - if (nodeIsList) { - const previousNode = $node.prev() - if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') { - return - } + const tagName = $node.get(0).tagName; + const nodeIsList = tagName === 'ol' || tagName === 'ul'; + if (nodeIsList) { + const previousNode = $node.prev(); + if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') { + return; } - - return $node.remove() } - const scriptCount = $('script', $node).length + $node.remove(); + return; + } + + const scriptCount = $('script', $node).length; // Too many script tags, not enough content. - if (scriptCount > 0 && contentLength < 150) { - return $node.remove() - } + if (scriptCount > 0 && contentLength < 150) { + $node.remove(); + return; + } + } +} + +// Given an article, clean it of some superfluous content specified by +// tags. Things like forms, ads, etc. 
+
+// Tags is an array of tag names to search through. (like div, form,
+// etc)
+//
+// Return this same doc.
+export default function cleanTags($article, $) {
+  $(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => {
+    const $node = $(node);
+    let weight = getScore($node);
+    if (!weight) {
+      weight = getOrInitScore($node, $);
+      setScore($node, $, weight);
+    }
+
+    // drop node if its weight is < 0
+    if (weight < 0) {
+      $node.remove();
+    } else {
+      // determine if node seems like content
+      removeUnlessContent($node, $, weight);
+    }
+  });
+
+  return $;
+}
diff --git a/src/utils/dom/clean-tags.test.js b/src/utils/dom/clean-tags.test.js
index ac89d0fe..2bf22981 100644
--- a/src/utils/dom/clean-tags.test.js
+++ b/src/utils/dom/clean-tags.test.js
@@ -1,70 +1,66 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import cheerio from 'cheerio';

-import HTML from './fixtures/html'
-import { assertClean } from 'test-helpers'
+import { assertClean } from 'test-helpers';

-import { cleanTags } from './index'
+import HTML from './fixtures/html';
+import { cleanTags } from './index';

 describe('cleanTags($)', () => {
-  it("drops a matching node with a negative score", () => {
-    let $ = cheerio.load(HTML.dropNegativeScore.before)
+  it('drops a matching node with a negative score', () => {
+    const $ = cheerio.load(HTML.dropNegativeScore.before);

-    let result = cleanTags($('*').first(), $)
-    assertClean(result.html(), HTML.dropNegativeScore.after)
-  })
+    const result = cleanTags($('*').first(), $);
+    assertClean(result.html(), HTML.dropNegativeScore.after);
+  });

-  it("removes a node with too many inputs", () => {
-    let $ = cheerio.load(HTML.removeTooManyInputs.before)
+  it('removes a node with too many inputs', () => {
+    const $ = cheerio.load(HTML.removeTooManyInputs.before);

-    let result = cleanTags($('*').first(), $)
-    $('[score]').each((i, e) => $(e).removeAttr('score'))
+    const result = cleanTags($('*').first(), $);
+    $('[score]').each((i, e) => $(e).removeAttr('score'));

-    assertClean(result.html(), HTML.removeTooManyInputs.after)
-  })
+    assertClean(result.html(), HTML.removeTooManyInputs.after);
+  });

-  it("removes a div with no images and very little text", () => {
-    let $ = cheerio.load(HTML.removeShortNoImg.before)
+  it('removes a div with no images and very little text', () => {
+    const $ = cheerio.load(HTML.removeShortNoImg.before);

-    let result = cleanTags($('*').first(), $)
-    $('[score]').each((i, e) => $(e).removeAttr('score'))
+    const result = cleanTags($('*').first(), $);
+    $('[score]').each((i, e) => $(e).removeAttr('score'));

-    assertClean(result.html(), HTML.removeShortNoImg.after)
-  })
+    assertClean(result.html(), HTML.removeShortNoImg.after);
+  });

-  it("removes a node with a link density that is too high", () => {
-    let $ = cheerio.load(HTML.linkDensityHigh.before)
+  it('removes a node with a link density that is too high', () => {
+    const $ = cheerio.load(HTML.linkDensityHigh.before);

-    let result = cleanTags($('*').first(), $)
-    $('[score]').each((i, e) => $(e).removeAttr('score'))
+    const result = cleanTags($('*').first(), $);
+    $('[score]').each((i, e) => $(e).removeAttr('score'));

-    assertClean(result.html(), HTML.linkDensityHigh.after)
-  })
+    assertClean(result.html(), HTML.linkDensityHigh.after);
+  });

-  it("removes a node with a good score but link density > 0.5", () => {
-    let $ = cheerio.load(HTML.linkDensityHigh.before)
+  it('removes a node with a good score but link density > 0.5', () => {
+    const $ = cheerio.load(HTML.linkDensityHigh.before);

-    let
result = cleanTags($('*').first(), $) - $('[score]').each((i, e) => $(e).removeAttr('score')) + const result = cleanTags($('*').first(), $); + $('[score]').each((i, e) => $(e).removeAttr('score')); - assertClean(result.html(), HTML.linkDensityHigh.after) - }) + assertClean(result.html(), HTML.linkDensityHigh.after); + }); - it("keeps node with a good score but link density > 0.5 if preceding text ends in colon", () => { - let $ = cheerio.load(HTML.previousEndsInColon.before) + it('keeps node with a good score but link density > 0.5 if preceding text ends in colon', () => { + const $ = cheerio.load(HTML.previousEndsInColon.before); - let result = cleanTags($('*').first(), $) - assertClean(result.html(), HTML.previousEndsInColon.before) - }) + const result = cleanTags($('*').first(), $); + assertClean(result.html(), HTML.previousEndsInColon.before); + }); - it("keeps anything with a class of entry-content-asset", () => { - let $ = cheerio.load(HTML.cleanEntryContentAsset.before) - - let result = cleanTags($('*').first(), $) - assertClean(result.html(), HTML.cleanEntryContentAsset.before) - }) - - -}) + it('keeps anything with a class of entry-content-asset', () => { + const $ = cheerio.load(HTML.cleanEntryContentAsset.before); + const result = cleanTags($('*').first(), $); + assertClean(result.html(), HTML.cleanEntryContentAsset.before); + }); +}); diff --git a/src/utils/dom/constants.js b/src/utils/dom/constants.js index 8a88522d..3ef18c2d 100644 --- a/src/utils/dom/constants.js +++ b/src/utils/dom/constants.js @@ -1,86 +1,84 @@ // Spacer images to be removed -export const SPACER_RE = new RegExp("trans|transparent|spacer|blank", "i") +export const SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i'); // A list of tags to strip from the output if we encounter them. 
export const STRIP_OUTPUT_TAGS = [ - 'title', - 'script', - 'noscript', - 'link', - 'style', - 'hr', - 'embed', - 'iframe', - 'object', -] + 'title', + 'script', + 'noscript', + 'link', + 'style', + 'hr', + 'embed', + 'iframe', + 'object', +]; // cleanAttributes -export const REMOVE_ATTRS = ['style', 'align'] -export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`) -export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',') -export const WHITELIST_ATTRS = ['src', 'href', 'class', 'id', 'score'] -export const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i') +export const REMOVE_ATTRS = ['style', 'align']; +export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`); +export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(','); +export const WHITELIST_ATTRS = ['src', 'href', 'class', 'id', 'score']; +export const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i'); // removeEmpty -export const REMOVE_EMPTY_TAGS = ['p'] -export const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',') +export const REMOVE_EMPTY_TAGS = ['p']; +export const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(','); // cleanTags -export const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div'].join(',') +export const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div'].join(','); // cleanHeaders -const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'] -export const HEADER_TAG_LIST = HEADER_TAGS.join(',') +const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']; +export const HEADER_TAG_LIST = HEADER_TAGS.join(','); - - -//// CONTENT FETCHING CONSTANTS //// +// // CONTENT FETCHING CONSTANTS //// // A list of strings that can be considered unlikely candidates when // extracting content from a resource. These strings are joined together // and then tested for existence using re:test, so may contain simple, // non-pipe style regular expression queries if necessary. export const UNLIKELY_CANDIDATES_BLACKLIST = [ - 'ad-break', - 'adbox', - 'advert', - 'addthis', - 'agegate', - 'aux', - 'blogger-labels', - 'combx', - 'comment', - 'conversation', - 'disqus', - 'entry-unrelated', - 'extra', - 'foot', - 'form', - 'header', - 'hidden', - 'loader', - 'login', // Note: This can hit 'blogindex'. - 'menu', - 'meta', - 'nav', - 'pager', - 'pagination', - 'predicta', // readwriteweb inline ad box - 'presence_control_external', // lifehacker.com container full of false positives - 'popup', - 'printfriendly', - 'related', - 'remove', - 'remark', - 'rss', - 'share', - 'shoutbox', - 'sidebar', - 'sociable', - 'sponsor', - 'tools' -] + 'ad-break', + 'adbox', + 'advert', + 'addthis', + 'agegate', + 'aux', + 'blogger-labels', + 'combx', + 'comment', + 'conversation', + 'disqus', + 'entry-unrelated', + 'extra', + 'foot', + 'form', + 'header', + 'hidden', + 'loader', + 'login', // Note: This can hit 'blogindex'. + 'menu', + 'meta', + 'nav', + 'pager', + 'pagination', + 'predicta', // readwriteweb inline ad box + 'presence_control_external', // lifehacker.com container full of false positives + 'popup', + 'printfriendly', + 'related', + 'remove', + 'remark', + 'rss', + 'share', + 'shoutbox', + 'sidebar', + 'sociable', + 'sponsor', + 'tools', +]; // A list of strings that can be considered LIKELY candidates when // extracting content from a resource. 
Essentially, the inverse of the @@ -94,56 +92,56 @@ export const UNLIKELY_CANDIDATES_BLACKLIST = [ // re:test, so may contain simple, non-pipe style regular expression queries // if necessary. export const UNLIKELY_CANDIDATES_WHITELIST = [ - 'and', - 'article', - 'body', - 'blogindex', - 'column', - 'content', - 'entry-content-asset', - 'format', // misuse of form - 'hfeed', - 'hentry', - 'hatom', - 'main', - 'page', - 'posts', - 'shadow' -] + 'and', + 'article', + 'body', + 'blogindex', + 'column', + 'content', + 'entry-content-asset', + 'format', // misuse of form + 'hfeed', + 'hentry', + 'hatom', + 'main', + 'page', + 'posts', + 'shadow', +]; // A list of tags which, if found inside, should cause a <div /> to NOT // be turned into a paragraph tag. Shallow div tags without these elements // should be turned into <p /> tags. export const DIV_TO_P_BLOCK_TAGS = [ - 'a', - 'blockquote', - 'dl', - 'div', - 'img', - 'p', - 'pre', - 'table', -].join(',') + 'a', + 'blockquote', + 'dl', + 'div', + 'img', + 'p', + 'pre', + 'table', +].join(','); // A list of tags that should be ignored when trying to find the top candidate // for a document. export const NON_TOP_CANDIDATE_TAGS = [ - 'br', - 'b', - 'i', - 'label', - 'hr', - 'area', - 'base', - 'basefont', - 'input', - 'img', - 'link', - 'meta', -] + 'br', + 'b', + 'i', + 'label', + 'hr', + 'area', + 'base', + 'basefont', + 'input', + 'img', + 'link', + 'meta', +]; export const NON_TOP_CANDIDATE_TAGS_RE = - new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i') + new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i'); // A list of selectors that specify, very clearly, either hNews or other // very content-specific style content, like Blogger templates. @@ -155,53 +153,15 @@ export const HNEWS_CONTENT_SELECTORS = [ ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body'], -] -// export const HNEWS_CONTENT_SELECTORS = [ -// { -// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'), -// must_exist: { -// classes: ['hentry', 'entry-content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'), -// must_exist: { -// classes: ['entry', 'entry-content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'), -// must_exist: { -// classes: ['entry', 'entry_content'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'), -// must_exist: { -// classes: ['post', 'post-body'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'), -// must_exist: { -// classes: ['post', 'post_body'], -// } -// }, -// { -// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'), -// must_exist: { -// classes: ['post', 'postbody'], -// } -// }, -// ] +]; export const PHOTO_HINTS = [ - 'figure', - 'photo', - 'image', - 'caption' -] -export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') + 'figure', + 'photo', + 'image', + 'caption', +]; +export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being @@ -209,233 +169,215 @@ export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') // // TODO: Perhaps have these scale based on their odds of being quality? 
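// For example, once joined into POSITIVE_SCORE_RE and NEGATIVE_SCORE_RE
// below, these hints are tested against a node's className and id
// (sample checks; illustrative only):
//
//   POSITIVE_SCORE_RE.test('entry-content'); // true  -> weight goes up
//   NEGATIVE_SCORE_RE.test('sidebar');       // true  -> weight goes down
//   POSITIVE_SCORE_RE.test('wrapper');       // false -> no signal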
export const POSITIVE_SCORE_HINTS = [ - 'article', - 'articlecontent', - 'instapaper_body', - 'blog', - 'body', - 'content', - 'entry-content-asset', - 'entry', - 'hentry', - 'main', - 'Normal', - 'page', - 'pagination', - 'permalink', - 'post', - 'story', - 'text', - '[-_]copy', //usatoday - '\Bcopy' -] + 'article', + 'articlecontent', + 'instapaper_body', + 'blog', + 'body', + 'content', + 'entry-content-asset', + 'entry', + 'hentry', + 'main', + 'Normal', + 'page', + 'pagination', + 'permalink', + 'post', + 'story', + 'text', + '[-_]copy', // usatoday + '\\Bcopy', +]; // The above list, joined into a matching regular expression -export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i') +export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // Readability publisher-specific guidelines -export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i') +export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? export const NEGATIVE_SCORE_HINTS = [ - 'adbox', - 'advert', - 'author', - 'bio', - 'bookmark', - 'bottom', - 'byline', - 'clear', - 'com-', - 'combx', - 'comment', - 'comment\B', - 'contact', - 'copy', - 'credit', - 'crumb', - 'date', - 'deck', - 'excerpt', - 'featured', //tnr.com has a featured_content which throws us off - 'foot', - 'footer', - 'footnote', - 'graf', - 'head', - 'info', - 'infotext', //newscientist.com copyright - 'instapaper_ignore', - 'jump', - 'linebreak', - 'link', - 'masthead', - 'media', - 'meta', - 'modal', - 'outbrain', //slate.com junk - 'promo', - 'pr_', // autoblog - press release - 'related', - 'respond', - 'roundcontent', //lifehacker restricted content warning - 'scroll', - 'secondary', - 'share', - 'shopping', - 'shoutbox', - 'side', - 'sidebar', - 'sponsor', - 'stamp', - 'sub', - 'summary', - 'tags', - 'tools', - 'widget' -] + 'adbox', + 'advert', + 'author', + 'bio', + 'bookmark', + 'bottom', + 'byline', + 'clear', + 'com-', + 'combx', + 'comment', + 'comment\\B', + 'contact', + 'copy', + 'credit', + 'crumb', + 'date', + 'deck', + 'excerpt', + 'featured', // tnr.com has a featured_content which throws us off + 'foot', + 'footer', + 'footnote', + 'graf', + 'head', + 'info', + 'infotext', // newscientist.com copyright + 'instapaper_ignore', + 'jump', + 'linebreak', + 'link', + 'masthead', + 'media', + 'meta', + 'modal', + 'outbrain', // slate.com junk + 'promo', + 'pr_', // autoblog - press release + 'related', + 'respond', + 'roundcontent', // lifehacker restricted content warning + 'scroll', + 'secondary', + 'share', + 'shopping', + 'shoutbox', + 'side', + 'sidebar', + 'sponsor', + 'stamp', + 'sub', + 'summary', + 'tags', + 'tools', + 'widget', +]; // The above list, joined into a matching regular expression -export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i') +export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // XPath to try to determine if a page is wordpress. Not always successful. -export const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]' +export const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'; // Match a digit. Pretty clear.
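For a sense of how these hint lists behave once they are joined into POSITIVE_SCORE_RE and NEGATIVE_SCORE_RE, here is a minimal sketch of the className/id test the scoring code performs. The +/-25 weights are illustrative placeholders, not necessarily the values the scorer actually uses:

// Illustrative sketch only: weigh a node by its class/id string using
// the regexes built from the hint lists above. The relative import
// assumes a sibling module, as elsewhere in this patch.
import { POSITIVE_SCORE_RE, NEGATIVE_SCORE_RE } from './constants';

function sketchWeight(classAndId) {
  let weight = 0;
  if (POSITIVE_SCORE_RE.test(classAndId)) weight += 25; // content-like hints
  if (NEGATIVE_SCORE_RE.test(classAndId)) weight -= 25; // chrome-like hints
  return weight;
}

sketchWeight('post hentry'); // 25
sketchWeight('sidebar widget'); // -25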
-export const DIGIT_RE = new RegExp('[0-9]') +export const DIGIT_RE = new RegExp('[0-9]'); // A list of words that, if found in link text or URLs, likely mean that // this link is not a next page link. export const EXTRANEOUS_LINK_HINTS = [ - 'print', - 'archive', - 'comment', - 'discuss', - 'e-mail', - 'email', - 'share', - 'reply', - 'all', - 'login', - 'sign', - 'single', - 'adx', - 'entry-unrelated' -] -export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i') - -// An expression that looks to try to find the page digit within a URL, if -// it exists. -// Matches: -// page=1 -// pg=1 -// p=1 -// paging=12 -// pag=7 -// pagination/1 -// paging/88 -// pa/83 -// p/11 -// -// Does not match: -// pg=102 -// page:2 -// DISABLING FOR NOW TODO AP -// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i') + 'print', + 'archive', + 'comment', + 'discuss', + 'e-mail', + 'email', + 'share', + 'reply', + 'all', + 'login', + 'sign', + 'single', + 'adx', + 'entry-unrelated', +]; +export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i'); // Match any phrase that looks like it could be page, or paging, or pagination -export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i') +export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i'); // Match any link text/classname/id that looks like it could mean the next // page. Things like: next, continue, >, >>, » but not >|, »| as those can // mean last page. -export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i') +// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i'); +export const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i; // Match any link text/classname/id that looks like it is an end link: things // like "first", "last", "end", etc. -export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i') +export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous // page. -export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i') +export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match 2 or more consecutive <br> tags -export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i') +export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i'); // Match 1 BR tag. -export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i') +export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i'); // A list of all of the block level tags known in HTML5 and below. 
Taken from // http://bit.ly/qneNIT export const BLOCK_LEVEL_TAGS = [ - 'article', - 'aside', - 'blockquote', - 'body', - 'br', - 'button', - 'canvas', - 'caption', - 'col', - 'colgroup', - 'dd', - 'div', - 'dl', - 'dt', - 'embed', - 'fieldset', - 'figcaption', - 'figure', - 'footer', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'header', - 'hgroup', - 'hr', - 'li', - 'map', - 'object', - 'ol', - 'output', - 'p', - 'pre', - 'progress', - 'section', - 'table', - 'tbody', - 'textarea', - 'tfoot', - 'th', - 'thead', - 'tr', - 'ul', - 'video', -] -export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i') + 'article', + 'aside', + 'blockquote', + 'body', + 'br', + 'button', + 'canvas', + 'caption', + 'col', + 'colgroup', + 'dd', + 'div', + 'dl', + 'dt', + 'embed', + 'fieldset', + 'figcaption', + 'figure', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'header', + 'hgroup', + 'hr', + 'li', + 'map', + 'object', + 'ol', + 'output', + 'p', + 'pre', + 'progress', + 'section', + 'table', + 'tbody', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'tr', + 'ul', + 'video', +]; +export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i'); // The removal is implemented as a blacklist and whitelist, this test finds // blacklisted elements that aren't whitelisted. We do this all in one // expression-both because it's only one pass, and because this skips the // serialization for whitelisted nodes. -const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|') -export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i') +const candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|'); +export const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i'); -const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|') -export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i') +const candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|'); +export const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i'); -export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i') +export const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i'); -export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i') -export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i') -export const BAD_TAGS = new RegExp('^(address|form)$', 'i') +export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i'); +export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i'); +export const BAD_TAGS = new RegExp('^(address|form)$', 'i'); -export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i') +export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i'); diff --git a/src/utils/dom/convert-node-to.js b/src/utils/dom/convert-node-to.js index a654e604..25f7d853 100644 --- a/src/utils/dom/convert-node-to.js +++ b/src/utils/dom/convert-node-to.js @@ -1,4 +1,4 @@ -export default function convertNodeTo($node, $, tag='p') { - $node.replaceWith(`<${tag}>${$node.contents()}</${tag}>`) - return $ +export default function convertNodeTo($node, $, tag = 'p') { + $node.replaceWith(`<${tag}>${$node.contents()}</${tag}>`); + return $; } diff --git a/src/utils/dom/convert-node-to.test.js b/src/utils/dom/convert-node-to.test.js index 3ec97828..1a22b8e1 100644 --- a/src/utils/dom/convert-node-to.test.js +++ b/src/utils/dom/convert-node-to.test.js @@ -1,20 +1,18 @@ -import 
assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import convertNodeTo from './convert-node-to' +import convertNodeTo from './convert-node-to'; describe('convertNodeTo(node, $)', () => { it('takes a node and converts it to a diff tag', () => { - const html = '<div>Should become a p</div>' - const $ = cheerio.load(html) - const node = $('div').first() + const html = '<div>Should become a p</div>'; + const $ = cheerio.load(html); + const node = $('div').first(); - const result = convertNodeTo(node, $).html() - const after = '<p>Should become a p</p>' - - assert.equal(result, after) - }) - -}) + const result = convertNodeTo(node, $).html(); + const after = '<p>Should become a p</p>'; + assert.equal(result, after); + }); +}); diff --git a/src/utils/dom/convert-to-paragraphs.js b/src/utils/dom/convert-to-paragraphs.js index b7302755..30bcd0a8 100644 --- a/src/utils/dom/convert-to-paragraphs.js +++ b/src/utils/dom/convert-to-paragraphs.js @@ -1,48 +1,49 @@ -import { convertNodeTo } from 'utils/dom' +import { convertNodeTo } from 'utils/dom'; -import { brsToPs } from './index' -import { DIV_TO_P_BLOCK_TAGS } from './constants' -// Loop through the provided doc, and convert any p-like elements to -// actual paragraph tags. -// -// Things fitting this criteria: -// * Multiple consecutive <br /> tags. -// * <div /> tags without block level elements inside of them -// * <span /> tags who are not children of <p /> or <div /> tags. -// -// :param $: A cheerio object to search -// :return cheerio object with new p elements -// (By-reference mutation, though. Returned just for convenience.) - -export default function convertToParagraphs($) { - $ = brsToPs($) - $ = convertDivs($) - $ = convertSpans($) - - return $ -} +import { brsToPs } from './index'; +import { DIV_TO_P_BLOCK_TAGS } from './constants'; function convertDivs($) { $('div').each((index, div) => { - const $div = $(div) + const $div = $(div); const convertable = $div.children() - .not(DIV_TO_P_BLOCK_TAGS).length == 0 + .not(DIV_TO_P_BLOCK_TAGS).length === 0; if (convertable) { - convertNodeTo($div, $, 'p') + convertNodeTo($div, $, 'p'); } - }) + }); - return $ + return $; } function convertSpans($) { $('span').each((index, span) => { - const $span = $(span) - const convertable = $span.parents('p, div').length == 0 + const $span = $(span); + const convertable = $span.parents('p, div').length === 0; if (convertable) { - convertNodeTo($span, $, 'p') + convertNodeTo($span, $, 'p'); } - }) + }); + + return $; +} + +// Loop through the provided doc, and convert any p-like elements to +// actual paragraph tags. +// +// Things fitting this criteria: +// * Multiple consecutive <br /> tags. +// * <div /> tags without block level elements inside of them +// * <span /> tags who are not children of <p /> or <div /> tags. +// +// :param $: A cheerio object to search +// :return cheerio object with new p elements +// (By-reference mutation, though. Returned just for convenience.) 
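Concretely, the conversion that comment describes behaves roughly like this (an illustrative sketch, ignoring the html/body wrapper cheerio adds around loaded fragments):

// Sketch: shallow divs and top-level spans become paragraphs.
import cheerio from 'cheerio';
import convertToParagraphs from './convert-to-paragraphs';

const $ = cheerio.load('<div>Shallow text div</div><span>Loose span</span>');
convertToParagraphs($);
// $.html() now contains:
// <p>Shallow text div</p><p>Loose span</p>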
+ +export default function convertToParagraphs($) { + $ = brsToPs($); + $ = convertDivs($); + $ = convertSpans($); - return $ + return $; } diff --git a/src/utils/dom/convert-to-paragraphs.test.js b/src/utils/dom/convert-to-paragraphs.test.js index c0ab25e9..45b7870f 100644 --- a/src/utils/dom/convert-to-paragraphs.test.js +++ b/src/utils/dom/convert-to-paragraphs.test.js @@ -1,25 +1,20 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import { assertClean } from 'test-helpers' -import HTML from './fixtures/html' +import { assertClean } from 'test-helpers'; +import HTML from './fixtures/html'; -import convertToParagraphs from './convert-to-paragraphs' +import convertToParagraphs from './convert-to-paragraphs'; function assertBeforeAndAfter(key, fn) { - const $ = cheerio.load(HTML[key].before) - assertClean(fn($).html(), HTML[key].after) + const $ = cheerio.load(HTML[key].before); + assertClean(fn($).html(), HTML[key].after); } describe('Generic Extractor Utils', () => { describe('convertToParagraphs($)', () => { - - it("performs all conversions", () => { - assertBeforeAndAfter('convertToParagraphs', convertToParagraphs) - }) - - }) - -}) - + it('performs all conversions', () => { + assertBeforeAndAfter('convertToParagraphs', convertToParagraphs); + }); + }); +}); diff --git a/src/utils/dom/extract-from-meta.js b/src/utils/dom/extract-from-meta.js index 41953e21..6962e7a0 100644 --- a/src/utils/dom/extract-from-meta.js +++ b/src/utils/dom/extract-from-meta.js @@ -1,59 +1,47 @@ -import { stripTags } from 'utils/dom' +import { stripTags } from 'utils/dom'; // Given a node type to search for, and a list of meta tag names to // search for, find a meta tag associated. -// metaNames can be an array of strings of an array of three-element -// arrays that will define the attributes to select from the meta -// elements. E.g., ['og:image', 'property', 'content'] will search -// $('meta[property=og:image]').attr('content'). -// -// Default is $('meta[name=og:image]').attr(value) export default function extractFromMeta( $, metaNames, cachedNames, - cleanTags=true, + cleanTags = true ) { - const foundNames = metaNames.filter(name => { - return cachedNames.indexOf(name) !== -1 - }) + const foundNames = metaNames.filter(name => cachedNames.indexOf(name) !== -1); - for (let name of foundNames) { - let type, value + for (const name of foundNames) { + const type = 'name'; + const value = 'value'; - type = 'name' - value = 'value' - - const nodes = $(`meta[${type}="${name}"]`) + const nodes = $(`meta[${type}="${name}"]`); // Get the unique value of every matching node, in case there // are two meta tags with the same name and value. // Remove empty values. const values = nodes.map((index, node) => $(node).attr(value)) - .toArray() - .filter(text => text !== '') - - // If we have more than one value for the same name, we have a - // conflict and can't trust any of them. Skip this name. If we have - // zero, that means our meta tags had no values. Skip this name - // also. - if (values.length !== 1) { - continue + .toArray() + .filter(text => text !== ''); + + // If we have more than one value for the same name, we have a + // conflict and can't trust any of them. Skip this name. If we have + // zero, that means our meta tags had no values. Skip this name + // also. + if (values.length === 1) { + let metaValue; + // Meta values that contain HTML should be stripped, as they + // weren't subject to cleaning previously. 
+      if (cleanTags) {
+        metaValue = stripTags(values[0], $);
+      } else {
+        metaValue = values[0];
+      }
+
+      return metaValue;
     }
-
-    let metaValue
-    // Meta values that contain HTML should be stripped, as they
-    // weren't subject to cleaning previously.
-    if (cleanTags) {
-      metaValue = stripTags(values[0], $)
-    } else {
-      metaValue = values[0]
-    }
-
-    return metaValue
   }

   // If nothing is found, return null
-  return null
+  return null;
 }
diff --git a/src/utils/dom/extract-from-meta.test.js b/src/utils/dom/extract-from-meta.test.js index 5c631a97..700e9880 100644 --- a/src/utils/dom/extract-from-meta.test.js +++ b/src/utils/dom/extract-from-meta.test.js @@ -1,37 +1,35 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/extract-from-selectors' -import { extractFromMeta } from './index' +import HTML from './fixtures/extract-from-selectors'; +import { extractFromMeta } from './index'; describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => { it('extracts an arbitrary meta tag by name', () => { - const $ = cheerio.load(HTML.metaFoo.test) + const $ = cheerio.load(HTML.metaFoo.test); const result = extractFromMeta( $, ['foo', 'baz'], ['foo', 'bat'] - ) + ); - assert.equal(result, HTML.metaFoo.result) - }) + assert.equal(result, HTML.metaFoo.result); + }); it('returns nothing if a meta name is duplicated', () => { - const $ = cheerio.load(HTML.metaDupes.test) + const $ = cheerio.load(HTML.metaDupes.test); const result = extractFromMeta( $, ['foo', 'baz'], ['foo', 'bat'] - ) + ); - assert.equal(result, HTML.metaDupes.result) - }) + assert.equal(result, HTML.metaDupes.result); + }); it('ignores duplicate meta names with empty values', () => { - const $ = cheerio.load(HTML.metaEmptyDupes.test) + const $ = cheerio.load(HTML.metaEmptyDupes.test); const result = extractFromMeta( $, ['foo', 'baz'], ['foo', 'bat'] - ) - - assert.equal(result, HTML.metaEmptyDupes.result) - }) - -}) + ); + assert.equal(result, HTML.metaEmptyDupes.result); + }); }); diff --git a/src/utils/dom/extract-from-selectors.js b/src/utils/dom/extract-from-selectors.js index ebe22ee6..c88123b5 100644 --- a/src/utils/dom/extract-from-selectors.js +++ b/src/utils/dom/extract-from-selectors.js @@ -1,69 +1,50 @@ -import { withinComment } from 'utils/dom' +import { withinComment } from 'utils/dom'; + +function isGoodNode($node, maxChildren) { + // If it has a number of children, it's more likely a container + // element. Skip it. + if ($node.children().length > maxChildren) { + return false; + } + // If it looks to be within a comment, skip it. + if (withinComment($node)) { + return false; + } + + return true; +} + // Given a list of selectors, find content that may // be extractable from the document. This is for flat // meta-information, like author, title, date published, etc. export default function extractFromSelectors( $, selectors, - maxChildren=1, - textOnly=true + maxChildren = 1, + textOnly = true ) { for (const selector of selectors) { - const nodes = $(selector) + const nodes = $(selector); // If we didn't get exactly one of this selector, this may be // a list of articles or comments. Skip it. if (nodes.length === 1) { - const $node = $(nodes[0]) + const $node = $(nodes[0]); - // If it has a number of children, it's more likely a container - // element. Skip it. - if ($node.children().length > maxChildren) { - continue - } - // If it looks to be within a comment, skip it.
- if (withinComment($node, $)) { - continue - } - - let content - if (textOnly) { - content = $node.text() - } else { - content = $node.html() - } + if (isGoodNode($node, maxChildren)) { + let content; + if (textOnly) { + content = $node.text(); + } else { + content = $node.html(); + } - if (content) { - return content + if (content) { + return content; + } } } } - return null + return null; } -// def extract_from_selectors(self, node_type, selectors, use_re=False, -// max_children=1, text_only=True): -// for selector in selectors: -// if type(selector) == str: -// if use_re: -// nodes = self.resource.redocxp(selector) -// else: -// nodes = self.resource.docxp(selector) -// else: -// nodes = self.resource.extract_by_selector(selector) -// -// if len(nodes) == 1: -// # If it looks to be within a comment, skip it. -// if dom.within_comment(node): -// continue -// -// if text_only: -// inner_content = dom.inner_text(node) -// else: -// inner_content = dom.inner_html(node) -// -// clean_value = self._clean(node_type, inner_content) -// if clean_value: -// return clean_value -// -// return None diff --git a/src/utils/dom/extract-from-selectors.test.js b/src/utils/dom/extract-from-selectors.test.js index cd375bff..a43871be 100644 --- a/src/utils/dom/extract-from-selectors.test.js +++ b/src/utils/dom/extract-from-selectors.test.js @@ -1,39 +1,36 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/extract-from-selectors' -import extractFromSelectors from './extract-from-selectors' +import HTML from './fixtures/extract-from-selectors'; +import extractFromSelectors from './extract-from-selectors'; describe('extractFromSelectors($, selectors, maxChildren, textOnly)', () => { it('extracts an arbitrary node by selector', () => { - const $ = cheerio.load(HTML.simpleSelector.test) - const result = extractFromSelectors($, ['.author']) + const $ = cheerio.load(HTML.simpleSelector.test); + const result = extractFromSelectors($, ['.author']); - assert.equal(result, HTML.simpleSelector.result) - }) + assert.equal(result, HTML.simpleSelector.result); + }); it('ignores comments', () => { - const $ = cheerio.load(HTML.insideComment.test) - const result = extractFromSelectors($, ['.author']) + const $ = cheerio.load(HTML.insideComment.test); + const result = extractFromSelectors($, ['.author']); - assert.equal(result, HTML.insideComment.result) - }) + assert.equal(result, HTML.insideComment.result); + }); it('skips a selector if it matches multiple nodes', () => { - const $ = cheerio.load(HTML.multiMatch.test) - const result = extractFromSelectors($, ['.author']) + const $ = cheerio.load(HTML.multiMatch.test); + const result = extractFromSelectors($, ['.author']); - assert.equal(result, HTML.multiMatch.result) - }) + assert.equal(result, HTML.multiMatch.result); + }); it('skips a node with too many children', () => { - const $ = cheerio.load(HTML.manyChildren.test) - const result = extractFromSelectors($, ['.author']) - - assert.equal(result, HTML.manyChildren.result) - }) - -}) - + const $ = cheerio.load(HTML.manyChildren.test); + const result = extractFromSelectors($, ['.author']); + assert.equal(result, HTML.manyChildren.result); + }); +}); diff --git a/src/utils/dom/fixtures/extract-from-selectors.js b/src/utils/dom/fixtures/extract-from-selectors.js index dfeb7c40..f95c2a8d 100644 --- a/src/utils/dom/fixtures/extract-from-selectors.js +++ b/src/utils/dom/fixtures/extract-from-selectors.js @@ -5,7 +5,7 @@ 
const HTML = { <html> <meta name="foo" value="bar" /> </html>`, - result: `bar`, + result: 'bar', }, metaDupes: { test: ` @@ -21,14 +21,14 @@ const HTML = { <meta name="foo" value="bar" /> <meta name="foo" value="" /> </html>`, - result: `bar`, + result: 'bar', }, custom: { test: ` <html> <meta property="foo" content="bar" /> </html>`, - result: `bar`, + result: 'bar', }, // extractFromSelectors @@ -37,7 +37,7 @@ const HTML = { <html> <div class="author">Adam</div> </html>`, - result: `Adam`, + result: 'Adam', }, insideComment: { test: ` @@ -70,6 +70,6 @@ const HTML = { </html>`, result: null, }, -} +}; -export default HTML +export default HTML; diff --git a/src/utils/dom/fixtures/html.js b/src/utils/dom/fixtures/html.js index 88025271..97dc6148 100644 --- a/src/utils/dom/fixtures/html.js +++ b/src/utils/dom/fixtures/html.js @@ -237,7 +237,7 @@ const HTML = { `, after: ` <div><div><div><p><a href="">Wow how about that</a></p></div></div></div> - ` + `, }, // cleanImages @@ -252,7 +252,7 @@ const HTML = { <div> <img width="50"> </div> - ` + `, }, cleanHeight: { before: ` @@ -264,7 +264,7 @@ const HTML = { <div> <img width="50"> </div> - ` + `, }, cleanSpacer: { before: ` @@ -279,7 +279,7 @@ const HTML = { <img src="/foo/bar/baz/normal.png"> <p>Some text</p> </div> - ` + `, }, // stripJunkTags stripsJunk: { @@ -298,7 +298,7 @@ const HTML = { <div> <p>What an article</p> </div> - ` + `, }, // stripHOnes @@ -314,7 +314,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, convertThreeHOnes: { before: ` @@ -334,7 +334,7 @@ const HTML = { <p>What do you think?</p> <h2>Can you believe it?!</h2> </div> - ` + `, }, // cleanAttributes @@ -348,7 +348,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, removeAlign: { before: ` @@ -360,7 +360,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, // removeEmpty @@ -375,7 +375,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, doNotRemoveBr: { before: ` @@ -392,7 +392,7 @@ const HTML = { <div></div> <p>What do you think?</p> </div> - ` + `, }, doNotNested: { before: ` @@ -409,7 +409,7 @@ const HTML = { <p><img src="foo/bar.jpg" /></p> <p>What do you think?</p> </div> - ` + `, }, // cleanConditionally @@ -433,7 +433,7 @@ const HTML = { </p> <p>What do you think?</p> </div> - ` + `, }, removeTooManyInputs: { before: ` @@ -467,7 +467,7 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, removeShortNoImg: { before: ` @@ -490,7 +490,7 @@ const HTML = { <img src="asdf"> </div> </div> - ` + `, }, linkDensityHigh: { @@ -527,7 +527,7 @@ const HTML = { <li>Keep this one</li> </ul> </div> - ` + `, }, goodScoreTooDense: { before: ` @@ -567,7 +567,7 @@ const HTML = { <li>Keep this one</li> </ul> </div> - ` + `, }, previousEndsInColon: { before: ` @@ -608,7 +608,7 @@ const HTML = { <p>What do you think?</p> </div> `, - after: `What do you think?` + after: 'What do you think?', }, // cleanHeaders @@ -627,7 +627,7 @@ const HTML = { <h2>Keep me</h2> <p>What do you think?</p> </div> - ` + `, }, cleanTitleMatch: { before: ` @@ -642,7 +642,7 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, dropWithNegativeWeight: { before: ` @@ -657,8 +657,8 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, -} +}; -export default HTML +export default HTML; diff --git a/src/utils/dom/fixtures/node-is-sufficient.js b/src/utils/dom/fixtures/node-is-sufficient.js index 9e8a6594..522336b8 100644 --- 
a/src/utils/dom/fixtures/node-is-sufficient.js +++ b/src/utils/dom/fixtures/node-is-sufficient.js @@ -12,7 +12,7 @@ const HTML = { Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m </p> </div> - ` -} + `, +}; -export default HTML +export default HTML; diff --git a/src/utils/dom/index.js b/src/utils/dom/index.js index 4208e0c5..2ddde9f9 100644 --- a/src/utils/dom/index.js +++ b/src/utils/dom/index.js @@ -1,22 +1,22 @@ // DOM manipulation -export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates' -export { default as brsToPs } from './brs-to-ps' -export { default as paragraphize } from './paragraphize' -export { default as convertToParagraphs } from './convert-to-paragraphs' -export { default as convertNodeTo } from './convert-node-to' -export { default as cleanImages } from './clean-images' -export { default as stripJunkTags } from './strip-junk-tags' -export { default as cleanHOnes } from './clean-h-ones' -export { default as cleanAttributes } from './clean-attributes' -export { default as removeEmpty } from './remove-empty' -export { default as cleanTags } from './clean-tags' -export { default as cleanHeaders } from './clean-headers' -export { default as rewriteTopLevel } from './rewrite-top-level' -export { default as makeLinksAbsolute } from './make-links-absolute' -export { textLength, linkDensity } from './link-density' -export { default as extractFromMeta } from './extract-from-meta' -export { default as extractFromSelectors } from './extract-from-selectors' -export { default as stripTags } from './strip-tags' -export { default as withinComment } from './within-comment' -export { default as nodeIsSufficient } from './node-is-sufficient' -export { default as isWordpress } from './is-wordpress' +export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'; +export { default as brsToPs } from './brs-to-ps'; +export { default as paragraphize } from './paragraphize'; +export { default as convertToParagraphs } from './convert-to-paragraphs'; +export { default as convertNodeTo } from './convert-node-to'; +export { default as cleanImages } from './clean-images'; +export { default as stripJunkTags } from './strip-junk-tags'; +export { default as cleanHOnes } from './clean-h-ones'; +export { default as cleanAttributes } from './clean-attributes'; +export { default as removeEmpty } from './remove-empty'; +export { default as cleanTags } from './clean-tags'; +export { default as cleanHeaders } from './clean-headers'; +export { default as rewriteTopLevel } from './rewrite-top-level'; +export { default as makeLinksAbsolute } from './make-links-absolute'; +export { textLength, linkDensity } from './link-density'; +export { default as extractFromMeta } from './extract-from-meta'; +export { default as extractFromSelectors } from './extract-from-selectors'; +export { default as stripTags } from './strip-tags'; +export { default as withinComment } from './within-comment'; +export { default as nodeIsSufficient } from './node-is-sufficient'; +export { default as isWordpress } from './is-wordpress'; diff --git a/src/utils/dom/is-wordpress.js b/src/utils/dom/is-wordpress.js index a5ad38fb..38a07c15 100644 --- a/src/utils/dom/is-wordpress.js +++ b/src/utils/dom/is-wordpress.js @@ -1,5 +1,5 @@ -import { IS_WP_SELECTOR } from './constants' +import { IS_WP_SELECTOR } from './constants'; export default function isWordpress($) { - return $(IS_WP_SELECTOR).length > 0 + return $(IS_WP_SELECTOR).length > 0; } diff --git 
a/src/utils/dom/is-wordpress.test.js b/src/utils/dom/is-wordpress.test.js index 1f6bf1f8..7d52ea74 100644 --- a/src/utils/dom/is-wordpress.test.js +++ b/src/utils/dom/is-wordpress.test.js @@ -1,7 +1,7 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import isWordpress from './is-wordpress' +import isWordpress from './is-wordpress'; describe('isWordpress($)', () => { it('returns false if a site is not generated by wordpress', () => { @@ -11,10 +11,10 @@ describe('isWordpress($)', () => { <meta name="generator" value="whatever"> <head> </html> - ` - let $ = cheerio.load(html) + `; + let $ = cheerio.load(html); - assert.equal(isWordpress($), false) + assert.equal(isWordpress($), false); const html2 = ` <html> @@ -22,11 +22,11 @@ describe('isWordpress($)', () => { <meta name="foo" value="bar"> <head> </html> - ` - $ = cheerio.load(html) + `; + $ = cheerio.load(html2); - assert.equal(isWordpress($), false) - }) + assert.equal(isWordpress($), false); + }); it('returns true if a site is generated by wordpress', () => { const html = ` @@ -35,9 +35,9 @@ describe('isWordpress($)', () => { <meta name="generator" value="WordPress 4.7-alpha-38592"> <head> </html> - ` - const $ = cheerio.load(html) + `; + const $ = cheerio.load(html); - assert.equal(isWordpress($), true) - }) -}) + assert.equal(isWordpress($), true); + }); +}); diff --git a/src/utils/dom/link-density.js b/src/utils/dom/link-density.js index 2fd87ecc..a3e44da1 100644 --- a/src/utils/dom/link-density.js +++ b/src/utils/dom/link-density.js @@ -1,23 +1,24 @@ + +export function textLength(text) { + return text.trim() + .replace(/\s+/g, ' ') + .length; +} + // Determines what percentage of the text // in a node is link text // Takes a node, returns a float export function linkDensity($node) { - const totalTextLength = textLength($node.text()) + const totalTextLength = textLength($node.text()); - const linkText = $node.find('a').text() - const linkLength = textLength(linkText) + const linkText = $node.find('a').text(); + const linkLength = textLength(linkText); if (totalTextLength > 0) { - return linkLength / totalTextLength + return linkLength / totalTextLength; } else if (totalTextLength === 0 && linkLength > 0) { - return 1 - } else { - return 0 + return 1; } -} -export function textLength(text) { - return text.trim() - .replace(/\s+/g, ' ') - .length + return 0; } diff --git a/src/utils/dom/link-density.test.js b/src/utils/dom/link-density.test.js index 0e95efd3..48e25689 100644 --- a/src/utils/dom/link-density.test.js +++ b/src/utils/dom/link-density.test.js @@ -1,34 +1,33 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/html' +import HTML from './fixtures/html'; -import { linkDensity } from './index' +import { linkDensity } from './index'; describe('linkDensity($)', () => { - it("returns 0.5 if half of the text is a link", () => { - const $ = cheerio.load(HTML.linkDensity5) + it('returns 0.5 if half of the text is a link', () => { + const $ = cheerio.load(HTML.linkDensity5); - const density = linkDensity($('div').first(), $) + const density = linkDensity($('div').first(), $); - assert.equal(density, 0.5) - }) + assert.equal(density, 0.5); + }); - it("returns 1 if all of the text is a link", () => { - const $ = cheerio.load(HTML.linkDensity1) + it('returns 1 if all of the text is a link', () => { + const $ = cheerio.load(HTML.linkDensity1); - const density = 
linkDensity($('div').first(), $) + const density = linkDensity($('div').first(), $); - assert.equal(density, 1) - }) + assert.equal(density, 1); + }); it("returns 0 if there's no text", () => { - const $ = cheerio.load(HTML.linkDensity0) + const $ = cheerio.load(HTML.linkDensity0); - const density = linkDensity($('div').first()) + const density = linkDensity($('div').first()); - assert.equal(density, 0) - }) - -}) + assert.equal(density, 0); + }); +}); diff --git a/src/utils/dom/make-links-absolute.js b/src/utils/dom/make-links-absolute.js index 6221d44e..48cbb7a4 100644 --- a/src/utils/dom/make-links-absolute.js +++ b/src/utils/dom/make-links-absolute.js @@ -1,17 +1,17 @@ -import URL from 'url' +import URL from 'url'; -export default function makeLinksAbsolute($content, $, url) { - ['href', 'src'].forEach(attr => absolutize($, url, attr, $content)) +function absolutize($, rootUrl, attr, $content) { + $(`[${attr}]`, $content).each((_, node) => { + const url = node.attribs[attr]; + const absoluteUrl = URL.resolve(rootUrl, url); - // console.log($content.html()) - return $content + node.attribs[attr] = absoluteUrl; + }); } -function absolutize($, rootUrl, attr, $content) { - $(`[${attr}]`, $content).each((_, node) => { - const url = node.attribs[attr] - const absoluteUrl = URL.resolve(rootUrl, url) +export default function makeLinksAbsolute($content, $, url) { + ['href', 'src'].forEach(attr => absolutize($, url, attr, $content)); - node.attribs[attr] = absoluteUrl - }) + // console.log($content.html()) + return $content; } diff --git a/src/utils/dom/make-links-absolute.test.js b/src/utils/dom/make-links-absolute.test.js index 61849615..277b829c 100644 --- a/src/utils/dom/make-links-absolute.test.js +++ b/src/utils/dom/make-links-absolute.test.js @@ -1,46 +1,46 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import makeLinksAbsolute from './make-links-absolute' +import makeLinksAbsolute from './make-links-absolute'; describe('makeLinksAbsolute($)', () => { it('makes relative #hrefs absolute', () => { - const html = `<div><a href="#foo">bar</a></div>` - const $ = cheerio.load(html) - const $content = $('*').first() + const html = '<div><a href="#foo">bar</a></div>'; + const $ = cheerio.load(html); + const $content = $('*').first(); - const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')) + const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')); - assert.equal(result, `<div><a href="http://example.com/#foo">bar</a></div>`) - }) + assert.equal(result, '<div><a href="http://example.com/#foo">bar</a></div>'); + }); it('makes relative ./relative paths absolute', () => { - const html = `<div><a href="foo/bar">bar</a></div>` - const $ = cheerio.load(html) - const $content = $('*').first() + const html = '<div><a href="foo/bar">bar</a></div>'; + const $ = cheerio.load(html); + const $content = $('*').first(); - const result = $.html(makeLinksAbsolute($content, $, 'http://example.com/baz/bat')) + const result = $.html(makeLinksAbsolute($content, $, 'http://example.com/baz/bat')); - assert.equal(result, `<div><a href="http://example.com/baz/foo/bar">bar</a></div>`) - }) + assert.equal(result, '<div><a href="http://example.com/baz/foo/bar">bar</a></div>'); + }); it('makes relative /root/paths absolute', () => { - const html = `<div><a href="/foo/bar">bar</a></div>` - const $ = cheerio.load(html) - const $content = $('*').first() + const html = '<div><a href="/foo/bar">bar</a></div>'; + 
const $ = cheerio.load(html); + const $content = $('*').first(); - const result = $.html(makeLinksAbsolute($content, $, 'http://example.com/baz/bat')) + const result = $.html(makeLinksAbsolute($content, $, 'http://example.com/baz/bat')); - assert.equal(result, `<div><a href="http://example.com/foo/bar">bar</a></div>`) - }) + assert.equal(result, '<div><a href="http://example.com/foo/bar">bar</a></div>'); + }); it('makes relative srcs absolute', () => { - const html = `<div><img src="#foo"></div>` - const $ = cheerio.load(html) - const $content = $('*').first() + const html = '<div><img src="#foo"></div>'; + const $ = cheerio.load(html); + const $content = $('*').first(); - const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')) + const result = $.html(makeLinksAbsolute($content, $, 'http://example.com')); - assert.equal(result, `<div><img src="http://example.com/#foo"></div>`) - }) -}) + assert.equal(result, '<div><img src="http://example.com/#foo"></div>'); + }); +}); diff --git a/src/utils/dom/node-is-sufficient.js b/src/utils/dom/node-is-sufficient.js index 517918f1..5f1e62d3 100644 --- a/src/utils/dom/node-is-sufficient.js +++ b/src/utils/dom/node-is-sufficient.js @@ -3,5 +3,5 @@ // return: boolean export default function nodeIsSufficient($node) { - return $node.text().trim().length >= 100 + return $node.text().trim().length >= 100; } diff --git a/src/utils/dom/node-is-sufficient.test.js b/src/utils/dom/node-is-sufficient.test.js index 4d6943f4..e666f1f6 100644 --- a/src/utils/dom/node-is-sufficient.test.js +++ b/src/utils/dom/node-is-sufficient.test.js @@ -1,21 +1,21 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import HTML from './fixtures/node-is-sufficient' -import nodeIsSufficient from './node-is-sufficient' +import HTML from './fixtures/node-is-sufficient'; +import nodeIsSufficient from './node-is-sufficient'; describe('Utils', () => { describe('nodeIsSufficient(node)', () => { - it("returns false if node text length < 100 chars", () => { - const $ = cheerio.load(HTML.tooShort) - const sufficient = nodeIsSufficient($.root()) - assert.equal(sufficient, false) - }) + it('returns false if node text length < 100 chars', () => { + const $ = cheerio.load(HTML.tooShort); + const sufficient = nodeIsSufficient($.root()); + assert.equal(sufficient, false); + }); - it("returns true if node text length > 100 chars", () => { - const $ = cheerio.load(HTML.longEnough) - const sufficient = nodeIsSufficient($.root()) - assert.equal(sufficient, true) - }) - }) -}) + it('returns true if node text length > 100 chars', () => { + const $ = cheerio.load(HTML.longEnough); + const sufficient = nodeIsSufficient($.root()); + assert.equal(sufficient, true); + }); + }); +}); diff --git a/src/utils/dom/paragraphize.js b/src/utils/dom/paragraphize.js index 10306005..eb6ff1fb 100644 --- a/src/utils/dom/paragraphize.js +++ b/src/utils/dom/paragraphize.js @@ -1,4 +1,4 @@ -import { BLOCK_LEVEL_TAGS_RE } from './constants' +import { BLOCK_LEVEL_TAGS_RE } from './constants'; // Given a node, turn it into a P if it is not already a P, and // make sure it conforms to the constraints of a P tag (I.E. 
does @@ -11,32 +11,25 @@ import { BLOCK_LEVEL_TAGS_RE } from './constants' // :param $: The cheerio object to handle dom manipulation // :param br: Whether or not the passed node is a br -export default function paragraphize(node, $, br=false) { - const $node = $(node) +export default function paragraphize(node, $, br = false) { + const $node = $(node); if (br) { - let sibling = node.nextSibling - let p = $('<p></p>') + let sibling = node.nextSibling; + const p = $('<p></p>'); // while the next node is text or not a block level element // append it to a new p node - while (true) { - if (!sibling || (sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) { - break - } - - let nextSibling = sibling.nextSibling - $(sibling).appendTo(p) - sibling = nextSibling + while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) { + const nextSibling = sibling.nextSibling; + $(sibling).appendTo(p); + sibling = nextSibling; } - $node.replaceWith(p) - $node.remove() - return $ - } else { - // Not currently implemented. May not need to; can leverage - // cheerio's loader/htmlparser2 to format invalid html - // (e.g., nested p tags) - return $ + $node.replaceWith(p); + $node.remove(); + return $; } + + return $; } diff --git a/src/utils/dom/paragraphize.test.js b/src/utils/dom/paragraphize.test.js index f549ab04..a40656f2 100644 --- a/src/utils/dom/paragraphize.test.js +++ b/src/utils/dom/paragraphize.test.js @@ -1,36 +1,33 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import { clean } from 'test-helpers' -import HTML from './fixtures/html' +import { clean } from 'test-helpers'; +import HTML from './fixtures/html'; import { - paragraphize -} from './index' + paragraphize, +} from './index'; describe('Generic Extractor Utils', () => { describe('paragraphize(node)', () => { - - it("conversts a BR into P and moves inline contents to P tag after current parent", () => { - const $ = cheerio.load(HTML.paragraphize.before) - let node = $('br').get(0) + it('converts a BR into P and moves inline contents to P tag after current parent', () => { + const $ = cheerio.load(HTML.paragraphize.before); + const node = $('br').get(0); // note: result here is not valid html; will handle elsewhere - let result = paragraphize(node, $, true).html() + const result = paragraphize(node, $, true).html(); - assert.equal(clean(result), clean(HTML.paragraphize.after)) - }) + assert.equal(clean(result), clean(HTML.paragraphize.after)); + }); - it("conversts a BR into P and stops when block element hit", () => { - const $ = cheerio.load(HTML.paragraphizeBlock.before) - let node = $('br').get(0) + it('converts a BR into P and stops when block element hit', () => { + const $ = cheerio.load(HTML.paragraphizeBlock.before); + const node = $('br').get(0); // note: result here is not valid html; will handle elsewhere - let result = paragraphize(node, $, true).html() - - assert.equal(clean(result), clean(HTML.paragraphizeBlock.after)) - }) - - }) -}) + const result = paragraphize(node, $, true).html(); + assert.equal(clean(result), clean(HTML.paragraphizeBlock.after)); + }); }); }); diff --git a/src/utils/dom/remove-empty.js b/src/utils/dom/remove-empty.js index d527f5d3..b4d24729 100644 --- a/src/utils/dom/remove-empty.js +++ b/src/utils/dom/remove-empty.js @@ -1,12 +1,10 @@ -import { REMOVE_EMPTY_SELECTORS } from './constants' +import { REMOVE_EMPTY_TAGS } from './constants'; export default function removeEmpty($article, $) { -
// $(REMOVE_EMPTY_SELECTORS, $article).remove() + $article.find(REMOVE_EMPTY_TAGS.join(',')).each((index, p) => { + const $p = $(p); + if ($p.text().trim() === '') $p.remove(); + }); - $article.find('p').each((index, p) => { - const $p = $(p) - if ($p.text().trim() === '') $p.remove() - }) - - return $ + return $; } diff --git a/src/utils/dom/remove-empty.test.js b/src/utils/dom/remove-empty.test.js index d99fb150..7c7ab911 100644 --- a/src/utils/dom/remove-empty.test.js +++ b/src/utils/dom/remove-empty.test.js @@ -1,33 +1,31 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { removeEmpty } from './index' +import HTML from './fixtures/html'; +import { removeEmpty } from './index'; describe('removeEmpty($)', () => { - it("removes empty P tags", () => { - let $ = cheerio.load(HTML.removeEmptyP.before) + it('removes empty P tags', () => { + const $ = cheerio.load(HTML.removeEmptyP.before); - let result = removeEmpty($('*').first(), $) - assertClean(result.html(), HTML.removeEmptyP.after) - }) + const result = removeEmpty($('*').first(), $); + assertClean(result.html(), HTML.removeEmptyP.after); + }); - it("removes P tags with only space", () => { - const html = `<div><p> </p></div>` - let $ = cheerio.load(html) + it('removes P tags with only space', () => { + const html = '<div><p> </p></div>'; + const $ = cheerio.load(html); - let result = removeEmpty($('*').first(), $) - assertClean(result.html(), `<div></div>`) - }) + const result = removeEmpty($('*').first(), $); + assertClean(result.html(), '<div></div>'); + }); - it("does not remove empty DIV tags", () => { - let $ = cheerio.load(HTML.removeEmptyP.before) + it('does not remove empty DIV tags', () => { + const $ = cheerio.load(HTML.removeEmptyP.before); - let result = removeEmpty($('*').first(), $) - assertClean(result.html(), HTML.removeEmptyP.after) - }) - -}) + const result = removeEmpty($('*').first(), $); + assertClean(result.html(), HTML.removeEmptyP.after); + }); +}); diff --git a/src/utils/dom/rewrite-top-level.js b/src/utils/dom/rewrite-top-level.js index aad28c28..c1cd641d 100644 --- a/src/utils/dom/rewrite-top-level.js +++ b/src/utils/dom/rewrite-top-level.js @@ -1,4 +1,4 @@ -import { convertNodeTo } from 'utils/dom' +import { convertNodeTo } from 'utils/dom'; // Rewrite the tag name to div if it's a top level node like body or // html to avoid later complications with multiple body tags. 
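The effect, sketched on a tiny document (illustrative only; attribute handling aside):

// Sketch: top-level html and body tags are rewritten to divs.
import cheerio from 'cheerio';
import rewriteTopLevel from './rewrite-top-level';

const $ = cheerio.load('<html><body><p>Hello</p></body></html>');
rewriteTopLevel($('html').first(), $);
// $.html() now contains:
// <div><div><p>Hello</p></div></div>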
@@ -6,8 +6,8 @@ export default function rewriteTopLevel(article, $) { // I'm not using context here because // it's problematic when converting the // top-level/root node - AP - $ = convertNodeTo($('html'), $, 'div') - $ = convertNodeTo($('body'), $, 'div') + $ = convertNodeTo($('html'), $, 'div'); + $ = convertNodeTo($('body'), $, 'div'); - return $ + return $; } diff --git a/src/utils/dom/rewrite-top-level.test.js b/src/utils/dom/rewrite-top-level.test.js index 0813d42f..60ce8c5e 100644 --- a/src/utils/dom/rewrite-top-level.test.js +++ b/src/utils/dom/rewrite-top-level.test.js @@ -1,18 +1,16 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import rewriteTopLevel from './rewrite-top-level' +import HTML from './fixtures/html'; +import rewriteTopLevel from './rewrite-top-level'; describe('rewriteTopLevel(node, $)', () => { - it("turns html and body tags into divs", () => { - let $ = cheerio.load(HTML.rewriteHTMLBody.before) - - let result = rewriteTopLevel($('html').first(), $) - assertClean(result.html(), HTML.rewriteHTMLBody.after) - }) -}) + it('turns html and body tags into divs', () => { + const $ = cheerio.load(HTML.rewriteHTMLBody.before); + const result = rewriteTopLevel($('html').first(), $); + assertClean(result.html(), HTML.rewriteHTMLBody.after); + }); +}); diff --git a/src/utils/dom/strip-junk-tags.js b/src/utils/dom/strip-junk-tags.js index cad729a3..8d5423c5 100644 --- a/src/utils/dom/strip-junk-tags.js +++ b/src/utils/dom/strip-junk-tags.js @@ -1,9 +1,9 @@ import { - STRIP_OUTPUT_TAGS -} from './constants' + STRIP_OUTPUT_TAGS, +} from './constants'; export default function stripJunkTags(article, $) { - $(STRIP_OUTPUT_TAGS.join(','), article).remove() + $(STRIP_OUTPUT_TAGS.join(','), article).remove(); - return $ + return $; } diff --git a/src/utils/dom/strip-junk-tags.test.js b/src/utils/dom/strip-junk-tags.test.js index c75509fe..0925cf44 100644 --- a/src/utils/dom/strip-junk-tags.test.js +++ b/src/utils/dom/strip-junk-tags.test.js @@ -1,20 +1,16 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import cheerio from 'cheerio'; -import HTML from './fixtures/html' -import { assertClean } from 'test-helpers' +import { assertClean } from 'test-helpers'; -import { stripJunkTags } from './index' +import HTML from './fixtures/html'; +import { stripJunkTags } from './index'; describe('stripJunkTags($)', () => { - it("strips script and other junk tags", () => { - let $ = cheerio.load(HTML.stripsJunk.before) - - let result = stripJunkTags($('*').first(), $) - assertClean(result.html(), HTML.stripsJunk.after) - }) - -}) - + it('strips script and other junk tags', () => { + const $ = cheerio.load(HTML.stripsJunk.before); + const result = stripJunkTags($('*').first(), $); + assertClean(result.html(), HTML.stripsJunk.after); + }); +}); diff --git a/src/utils/dom/strip-tags.js b/src/utils/dom/strip-tags.js index 8602b245..9a77495d 100644 --- a/src/utils/dom/strip-tags.js +++ b/src/utils/dom/strip-tags.js @@ -2,6 +2,6 @@ export default function stripTags(text, $) { // Wrapping text in html element prevents errors when text // has no html - const cleanText = $(`<span>${text}</span>`).text() - return cleanText === '' ? text : cleanText + const cleanText = $(`<span>${text}</span>`).text(); + return cleanText === '' ? 
text : cleanText; } diff --git a/src/utils/dom/strip-tags.test.js b/src/utils/dom/strip-tags.test.js index 7de57090..92a0b042 100644 --- a/src/utils/dom/strip-tags.test.js +++ b/src/utils/dom/strip-tags.test.js @@ -1,22 +1,22 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import stripTags from './strip-tags' +import stripTags from './strip-tags'; describe('stripTags(title, $)', () => { it('strips tags from a string of text', () => { - const $ = cheerio.load('<div></div>') + const $ = cheerio.load('<div></div>'); - const result = stripTags('What a <em>Wonderful</em> Day', $) + const result = stripTags('What a <em>Wonderful</em> Day', $); - assert.equal(result, 'What a Wonderful Day') - }) + assert.equal(result, 'What a Wonderful Day'); + }); it('returns the original text if no tags found', () => { - const $ = cheerio.load('<div></div>') + const $ = cheerio.load('<div></div>'); - const result = stripTags('What a Wonderful Day', $) + const result = stripTags('What a Wonderful Day', $); - assert.equal(result, 'What a Wonderful Day') - }) -}) + assert.equal(result, 'What a Wonderful Day'); + }); +}); diff --git a/src/utils/dom/strip-unlikely-candidates.js b/src/utils/dom/strip-unlikely-candidates.js index 1c47eae7..4b94a467 100644 --- a/src/utils/dom/strip-unlikely-candidates.js +++ b/src/utils/dom/strip-unlikely-candidates.js @@ -1,8 +1,7 @@ import { CANDIDATES_WHITELIST, CANDIDATES_BLACKLIST, - UNLIKELY_RE, -} from './constants' +} from './constants'; // ## NOTES: // This is a working first pass, but if/when we start optimizing @@ -18,21 +17,19 @@ export default function stripUnlikelyCandidates($) { // // :param $: a cheerio object to strip nodes from // :return $: the cleaned cheerio object - $('*').not('a').each(function(index, node) { - const $node = $(node) - const classes = $node.attr('class') - const id = $node.attr('id') - if (!id && !classes) { - return - } else { - const classAndId = `${classes || ''} ${id || ''}` - if (CANDIDATES_WHITELIST.test(classAndId)) { - return - } else if (CANDIDATES_BLACKLIST.test(classAndId)) { - return $node.remove() - } + $('*').not('a').each((index, node) => { + const $node = $(node); + const classes = $node.attr('class'); + const id = $node.attr('id'); + if (!id && !classes) return; + + const classAndId = `${classes || ''} ${id || ''}`; + if (CANDIDATES_WHITELIST.test(classAndId)) { + return; + } else if (CANDIDATES_BLACKLIST.test(classAndId)) { + $node.remove(); } - }) + }); - return $ + return $; } diff --git a/src/utils/dom/strip-unlikely-candidates.test.js b/src/utils/dom/strip-unlikely-candidates.test.js index cc07149a..58cbb55d 100644 --- a/src/utils/dom/strip-unlikely-candidates.test.js +++ b/src/utils/dom/strip-unlikely-candidates.test.js @@ -1,36 +1,34 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; +import cheerio from 'cheerio'; -import { assertClean } from 'test-helpers' -import HTML from './fixtures/html' -import stripUnlikelyCandidates from './strip-unlikely-candidates' +import { assertClean } from 'test-helpers'; +import HTML from './fixtures/html'; +import stripUnlikelyCandidates from './strip-unlikely-candidates'; function assertBeforeAndAfter(key, fn) { - const $ = cheerio.load(HTML[key].before) - assertClean(fn($).html(), HTML[key].after) + const $ = cheerio.load(HTML[key].before); + assertClean(fn($).html(), HTML[key].after); } describe('Generic Extractor Utils', () => { 
describe('stripUnlikelyCandidates(node)', () => { - it("returns original doc if no matches found", () => { - const $ = cheerio.load(HTML.noMatches) - const stripped = stripUnlikelyCandidates($) - assert.equal(stripped.html(), HTML.noMatches) - }) - - it("strips unlikely matches from the doc", () => { - assertBeforeAndAfter('whitelistMatch', stripUnlikelyCandidates) - }) - - it("keeps likely matches even when they also match the blacklist", () => { - assertBeforeAndAfter('whiteAndBlack', stripUnlikelyCandidates) - }) - - it("removed likely matches when inside blacklist node", () => { - assertBeforeAndAfter('whiteInsideBlack', stripUnlikelyCandidates) - }) - - - }) -}) + it('returns original doc if no matches found', () => { + const $ = cheerio.load(HTML.noMatches); + const stripped = stripUnlikelyCandidates($); + assert.equal(stripped.html(), HTML.noMatches); + }); + + it('strips unlikely matches from the doc', () => { + assertBeforeAndAfter('whitelistMatch', stripUnlikelyCandidates); + }); + + it('keeps likely matches even when they also match the blacklist', () => { + assertBeforeAndAfter('whiteAndBlack', stripUnlikelyCandidates); + }); + + it('removes likely matches when inside blacklist node', () => { + assertBeforeAndAfter('whiteInsideBlack', stripUnlikelyCandidates); + }); + }); }); diff --git a/src/utils/dom/within-comment.js b/src/utils/dom/within-comment.js index 496f9e73..e65423e8 100644 --- a/src/utils/dom/within-comment.js +++ b/src/utils/dom/within-comment.js @@ -1,9 +1,9 @@ -export default function withinComment($node, $) { - const parents = $node.parents().toArray() +export default function withinComment($node) { + const parents = $node.parents().toArray(); const commentParent = parents.find((parent) => { - const classAndId = `${parent.attribs['class']} ${parent.attribs['id']}` - return classAndId.includes('comment') - }) + const classAndId = `${parent.attribs.class} ${parent.attribs.id}`; + return classAndId.includes('comment'); + }); - return commentParent !== undefined + return commentParent !== undefined; } diff --git a/src/utils/dom/within-comment.test.js b/src/utils/dom/within-comment.test.js index 70a2e0fc..cc2511ae 100644 --- a/src/utils/dom/within-comment.test.js +++ b/src/utils/dom/within-comment.test.js @@ -1,33 +1,33 @@ -import cheerio from 'cheerio' -import assert from 'assert' +import cheerio from 'cheerio'; +import assert from 'assert'; -import withinComment from './within-comment' +import withinComment from './within-comment'; -describe('withinComment(node, $)', () => { +describe('withinComment(node)', () => { it('returns false if its parent is not a comment', () => { const $ = cheerio.load(`<div> <div> <div class="author">Adam</div> </div> - </div>`) - assert.equal(withinComment($('.author').first(), $), false) - }) + </div>`); + assert.equal(withinComment($('.author').first()), false); + }); it('returns true if its parent has a class of comment', () => { const $ = cheerio.load(`<div class="comments"> <div> <div class="author">Adam</div> </div> - </div>`) - assert.equal(withinComment($('.author').first(), $), true) - }) + </div>`); + assert.equal(withinComment($('.author').first()), true); + }); it('returns true if its parent has an id of comment', () => { const $ = cheerio.load(`<div id="comment"> <div> <div class="author">Adam</div> </div> - </div>`) - assert.equal(withinComment($('.author').first(), $), true) - }) -}) + </div>`); + assert.equal(withinComment($('.author').first()), true); + }); }); diff --git a/src/utils/index.js
diff --git a/src/utils/index.js b/src/utils/index.js
index 33322bd8..53eee9fb 100644
--- a/src/utils/index.js
+++ b/src/utils/index.js
@@ -1 +1 @@
-export { default as range } from './range'
+export { default as range } from './range';
diff --git a/src/utils/range.js b/src/utils/range.js
index 53aef275..c835c7b1 100644
--- a/src/utils/range.js
+++ b/src/utils/range.js
@@ -1,5 +1,6 @@
 export default function* range(start = 1, end = 1) {
   while (start <= end) {
-    yield start++
+    yield start;
+    start += 1;
   }
 }
diff --git a/src/utils/text/article-base-url.js b/src/utils/text/article-base-url.js
index 97197506..214f91d6 100644
--- a/src/utils/text/article-base-url.js
+++ b/src/utils/text/article-base-url.js
@@ -1,34 +1,60 @@
-import URL from 'url'
+import URL from 'url';
 
 import {
   HAS_ALPHA_RE,
   IS_ALPHA_RE,
   IS_DIGIT_RE,
   PAGE_IN_HREF_RE,
-} from './constants'
+} from './constants';
+
+function isGoodSegment(segment, index, firstSegmentHasLetters) {
+  let goodSegment = true;
+
+  // If this is purely a number, and it's the first or second
+  // url_segment, it's probably a page number. Remove it.
+  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
+    goodSegment = false;
+  }
+
+  // If this is the first url_segment and it's just "index",
+  // remove it
+  if (index === 0 && segment.toLowerCase() === 'index') {
+    goodSegment = false;
+  }
+
+  // If our first or second url_segment is smaller than 3 characters,
+  // and the first url_segment had no alphas, remove it.
+  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
+    goodSegment = false;
+  }
+
+  return goodSegment;
+}
 
 // Take a URL, and return the article base of said URL. That is, no
 // pagination data exists in it. Useful for comparing to other links
 // that might have pagination data within them.
-export default function articleBaseUrl(url, parsedUrl) {
-  parsedUrl = parsedUrl || URL.parse(url)
-  const { protocol, host, path } = parsedUrl
+export default function articleBaseUrl(url, parsed) {
+  const parsedUrl = parsed || URL.parse(url);
+  const { protocol, host, path } = parsedUrl;
 
-  let firstSegmentHasLetters = false
+  let firstSegmentHasLetters = false;
   const cleanedSegments = path.split('/')
     .reverse()
-    .reduce((acc, segment, index) => {
+    .reduce((acc, rawSegment, index) => {
+      let segment = rawSegment;
+
       // Split off and save anything that looks like a file type.
       if (segment.includes('.')) {
-        const [ possibleSegment, fileExt ] = segment.split('.')
+        const [possibleSegment, fileExt] = segment.split('.');
         if (IS_ALPHA_RE.test(fileExt)) {
-          segment = possibleSegment
+          segment = possibleSegment;
         }
       }
 
       // If our first or second segment has anything looking like a page
       // number, remove it.
       if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
-        segment = segment.replace(PAGE_IN_HREF_RE, '')
+        segment = segment.replace(PAGE_IN_HREF_RE, '');
       }
 
       // If we're on the first segment, check to see if we have any
@@ -36,40 +62,16 @@ export default function articleBaseUrl(url, parsedUrl) {
       // the URL, and this will be helpful to determine if we're on a URL
      // segment that looks like "/2/" for example.
       if (index === 0) {
-        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment)
+        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
       }
 
       // If it's not marked for deletion, push it to cleaned_segments.
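Two quick sanity checks on the hunks above, written out because both are behavior-sensitive. For range, the rewrite has to yield before incrementing; `yield start += 1` would increment first and shift every value by one. For articleBaseUrl, here is the paginated case already covered by its test suite:

    import range from '../range';          // src/utils/range.js
    import articleBaseUrl from './article-base-url';

    [...range(1, 3)]; // => [1, 2, 3]

    // 'page=10' is emptied by PAGE_IN_HREF_RE, then dropped as a bad segment:
    articleBaseUrl('http://example.com/foo/bar/wow-cool/page=10');
    // => 'http://example.com/foo/bar/wow-cool'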
if (isGoodSegment(segment, index, firstSegmentHasLetters)) { - acc.push(segment) + acc.push(segment); } - return acc - }, []) - - return `${protocol}//${host}${cleanedSegments.reverse().join('/')}` -} - -function isGoodSegment(segment, index, firstSegmentHasLetters) { - let goodSegment = true - - // If this is purely a number, and it's the first or second - // url_segment, it's probably a page number. Remove it. - if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) { - goodSegment = true - } - - // If this is the first url_segment and it's just "index", - // remove it - if (index === 0 && segment.toLowerCase() === 'index') { - goodSegment = false - } - - // If our first or second url_segment is smaller than 3 characters, - // and the first url_segment had no alphas, remove it. - if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) { - goodSegment = false - } + return acc; + }, []); - return goodSegment + return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`; } diff --git a/src/utils/text/article-base-url.test.js b/src/utils/text/article-base-url.test.js index 5b31c736..88fbaceb 100644 --- a/src/utils/text/article-base-url.test.js +++ b/src/utils/text/article-base-url.test.js @@ -1,21 +1,20 @@ -import assert from 'assert' -import cheerio from 'cheerio' +import assert from 'assert'; -import articleBaseUrl from './article-base-url' +import articleBaseUrl from './article-base-url'; describe('articleBaseUrl(url, parsedUrl)', () => { it('returns the base url of a paginated url', () => { - const url = "http://example.com/foo/bar/wow-cool/page=10" - const cleaned = "http://example.com/foo/bar/wow-cool" + const url = 'http://example.com/foo/bar/wow-cool/page=10'; + const cleaned = 'http://example.com/foo/bar/wow-cool'; - assert.equal(articleBaseUrl(url), cleaned) - }) + assert.equal(articleBaseUrl(url), cleaned); + }); it('returns same url if url has no pagination info', () => { - const url = "http://example.com/foo/bar/wow-cool/" - const cleaned = "http://example.com/foo/bar/wow-cool" + const url = 'http://example.com/foo/bar/wow-cool/'; + const cleaned = 'http://example.com/foo/bar/wow-cool'; - assert.equal(articleBaseUrl(url), cleaned) - }) -}) + assert.equal(articleBaseUrl(url), cleaned); + }); +}); diff --git a/src/utils/text/constants.js b/src/utils/text/constants.js index 3e53fde8..905f87d1 100644 --- a/src/utils/text/constants.js +++ b/src/utils/text/constants.js @@ -14,9 +14,9 @@ // Does not match: // pg=102 // page:2 -export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|\/)([0-9]{1,3})', 'i') +export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i'); -export const HAS_ALPHA_RE = /[a-z]/i +export const HAS_ALPHA_RE = /[a-z]/i; -export const IS_ALPHA_RE = /^[a-z]+$/i -export const IS_DIGIT_RE = /^[0-9]+$/i +export const IS_ALPHA_RE = /^[a-z]+$/i; +export const IS_DIGIT_RE = /^[0-9]+$/i; diff --git a/src/utils/text/extract-from-url.js b/src/utils/text/extract-from-url.js index 8422a7ed..35fb6df5 100644 --- a/src/utils/text/extract-from-url.js +++ b/src/utils/text/extract-from-url.js @@ -4,12 +4,10 @@ // string to be cleaned. // Only used for date_published currently. 
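Usage matches the date-published cleaner, the only current caller; the regex here is the one from the adjacent test file, with the date captured in group 1:

    import extractFromUrl from './extract-from-url';

    const DATE_PATH_RE = new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/');

    extractFromUrl('https://example.com/2012/08/01/this-is-good', [DATE_PATH_RE]);
    // => '2012/08/01'
    extractFromUrl('https://example.com/this-is-good', [DATE_PATH_RE]);
    // => null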
export default function extractFromUrl(url, regexList) { - const matchRe = regexList.find((re) => { - return re.test(url) - }) + const matchRe = regexList.find(re => re.test(url)); if (matchRe) { - return matchRe.exec(url)[1] - } else { - return null + return matchRe.exec(url)[1]; } + + return null; } diff --git a/src/utils/text/extract-from-url.test.js b/src/utils/text/extract-from-url.test.js index f43c2152..ad6c14da 100644 --- a/src/utils/text/extract-from-url.test.js +++ b/src/utils/text/extract-from-url.test.js @@ -1,26 +1,25 @@ -import cheerio from 'cheerio' -import assert from 'assert' +import assert from 'assert'; -import extractFromUrl from './extract-from-url' +import extractFromUrl from './extract-from-url'; describe('extractFromUrl(url)', () => { it('extracts datePublished from url', () => { - const url = 'https://example.com/2012/08/01/this-is-good' + const url = 'https://example.com/2012/08/01/this-is-good'; const regexList = [ - new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/') - ] - const result = extractFromUrl(url, regexList) + new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/'), + ]; + const result = extractFromUrl(url, regexList); - assert.equal(result, '2012/08/01') - }) + assert.equal(result, '2012/08/01'); + }); it('returns null if nothing found', () => { - const url = 'https://example.com/this-is-good' + const url = 'https://example.com/this-is-good'; const regexList = [ - new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/') - ] - const result = extractFromUrl(url, regexList) + new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/'), + ]; + const result = extractFromUrl(url, regexList); - assert.equal(result, null) - }) -}) + assert.equal(result, null); + }); +}); diff --git a/src/utils/text/fixtures/html.js b/src/utils/text/fixtures/html.js index 88025271..97dc6148 100644 --- a/src/utils/text/fixtures/html.js +++ b/src/utils/text/fixtures/html.js @@ -237,7 +237,7 @@ const HTML = { `, after: ` <div><div><div><p><a href="">Wow how about that</a></p></div></div></div> - ` + `, }, // cleanImages @@ -252,7 +252,7 @@ const HTML = { <div> <img width="50"> </div> - ` + `, }, cleanHeight: { before: ` @@ -264,7 +264,7 @@ const HTML = { <div> <img width="50"> </div> - ` + `, }, cleanSpacer: { before: ` @@ -279,7 +279,7 @@ const HTML = { <img src="/foo/bar/baz/normal.png"> <p>Some text</p> </div> - ` + `, }, // stripJunkTags stripsJunk: { @@ -298,7 +298,7 @@ const HTML = { <div> <p>What an article</p> </div> - ` + `, }, // stripHOnes @@ -314,7 +314,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, convertThreeHOnes: { before: ` @@ -334,7 +334,7 @@ const HTML = { <p>What do you think?</p> <h2>Can you believe it?!</h2> </div> - ` + `, }, // cleanAttributes @@ -348,7 +348,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, removeAlign: { before: ` @@ -360,7 +360,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, // removeEmpty @@ -375,7 +375,7 @@ const HTML = { <div> <p>What do you think?</p> </div> - ` + `, }, doNotRemoveBr: { before: ` @@ -392,7 +392,7 @@ const HTML = { <div></div> <p>What do you think?</p> </div> - ` + `, }, doNotNested: { before: ` @@ -409,7 +409,7 @@ const HTML = { <p><img src="foo/bar.jpg" /></p> <p>What do you think?</p> </div> - ` + `, }, // cleanConditionally @@ -433,7 +433,7 @@ const HTML = { </p> <p>What do you think?</p> </div> - ` + `, }, removeTooManyInputs: { before: ` @@ -467,7 +467,7 @@ const HTML = { <p>What do you think?</p> <p>What do you think?</p> </div> - ` + `, }, removeShortNoImg: { before: ` @@ -490,7 +490,7 @@ const HTML = 
{
       <img src="asdf">
     </div>
   </div>
-  `
+  `,
   },
 
   linkDensityHigh: {
@@ -527,7 +527,7 @@ const HTML = {
       <li>Keep this one</li>
     </ul>
   </div>
-  `
+  `,
   },
   goodScoreTooDense: {
     before: `
@@ -567,7 +567,7 @@ const HTML = {
       <li>Keep this one</li>
     </ul>
   </div>
-  `
+  `,
   },
   previousEndsInColon: {
     before: `
@@ -608,7 +608,7 @@ const HTML = {
       <p>What do you think?</p>
     </div>
     `,
-    after: `What do you think?`
+    after: 'What do you think?',
   },
 
   // cleanHeaders
@@ -627,7 +627,7 @@ const HTML = {
       <h2>Keep me</h2>
       <p>What do you think?</p>
     </div>
-  `
+  `,
   },
   cleanTitleMatch: {
     before: `
@@ -642,7 +642,7 @@ const HTML = {
       <p>What do you think?</p>
       <p>What do you think?</p>
     </div>
-  `
+  `,
   },
   dropWithNegativeWeight: {
     before: `
@@ -657,8 +657,8 @@ const HTML = {
       <p>What do you think?</p>
       <p>What do you think?</p>
     </div>
-  `
+  `,
   },
-}
+};
 
-export default HTML
+export default HTML;
diff --git a/src/utils/text/has-sentence-end.js b/src/utils/text/has-sentence-end.js
new file mode 100644
index 00000000..73a947c8
--- /dev/null
+++ b/src/utils/text/has-sentence-end.js
@@ -0,0 +1,7 @@
+// Given a string, return true if it appears to contain the end of a
+// sentence, false otherwise.
+const SENTENCE_END_RE = new RegExp('\\.( |$)');
+export default function hasSentenceEnd(text) {
+  return SENTENCE_END_RE.test(text);
+}
+
diff --git a/src/utils/text/index.js b/src/utils/text/index.js
index 5f0d759a..8b94553b 100644
--- a/src/utils/text/index.js
+++ b/src/utils/text/index.js
@@ -1,6 +1,6 @@
-export { default as normalizeSpaces } from './normalize-spaces'
-export { default as extractFromUrl } from './extract-from-url'
-export { default as pageNumFromUrl } from './page-num-from-url'
-export { default as removeAnchor } from './remove-anchor'
-export { default as articleBaseUrl } from './article-base-url'
-
+export { default as normalizeSpaces } from './normalize-spaces';
+export { default as extractFromUrl } from './extract-from-url';
+export { default as pageNumFromUrl } from './page-num-from-url';
+export { default as removeAnchor } from './remove-anchor';
+export { default as articleBaseUrl } from './article-base-url';
+export { default as hasSentenceEnd } from './has-sentence-end';
diff --git a/src/utils/text/normalize-spaces.js b/src/utils/text/normalize-spaces.js
index 4bf5f862..130d9bd5 100644
--- a/src/utils/text/normalize-spaces.js
+++ b/src/utils/text/normalize-spaces.js
@@ -1,5 +1,5 @@
-const NORMALIZE_RE = /\s{2,}/
+const NORMALIZE_RE = /\s{2,}/g;
 
 export default function normalizeSpaces(text) {
-  return text.replace(NORMALIZE_RE, ' ').trim()
+  return text.replace(NORMALIZE_RE, ' ').trim();
 }
diff --git a/src/utils/text/normalize-spaces.test.js b/src/utils/text/normalize-spaces.test.js
index 886938a3..184f37ab 100644
--- a/src/utils/text/normalize-spaces.test.js
+++ b/src/utils/text/normalize-spaces.test.js
@@ -1,16 +1,15 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';
 
-import HTML from './fixtures/html'
+import HTML from './fixtures/html';
 
-import { normalizeSpaces } from './index'
+import { normalizeSpaces } from './index';
 
 describe('normalizeSpaces(text)', () => {
-  it("normalizes spaces from text", () => {
-    let $ = cheerio.load(HTML.normalizeSpaces.before)
+  it('normalizes spaces from text', () => {
+    const $ = cheerio.load(HTML.normalizeSpaces.before);
 
-    let result = normalizeSpaces($('*').first().text())
-    assert.equal(result, HTML.normalizeSpaces.after)
-  })
-
-})
+    const result = normalizeSpaces($('*').first().text());
+    assert.equal(result, HTML.normalizeSpaces.after);
+  });
+});
diff --git a/src/utils/text/page-num-from-url.js b/src/utils/text/page-num-from-url.js
index e2a8f9b1..ba62713d 100644
--- a/src/utils/text/page-num-from-url.js
+++ b/src/utils/text/page-num-from-url.js
@@ -1,12 +1,12 @@
-import { PAGE_IN_HREF_RE } from './constants'
+import { PAGE_IN_HREF_RE } from './constants';
 
 export default function pageNumFromUrl(url) {
-  const matches = url.match(PAGE_IN_HREF_RE)
-  if (!matches) return null
+  const matches = url.match(PAGE_IN_HREF_RE);
+  if (!matches) return null;
 
-  const pageNum = parseInt(matches[6])
+  const pageNum = parseInt(matches[6], 10);
 
   // Return pageNum < 100, otherwise
   // return null
-  return pageNum < 100 ? pageNum : null
+  return pageNum < 100 ? pageNum : null;
 }
diff --git a/src/utils/text/page-num-from-url.test.js b/src/utils/text/page-num-from-url.test.js
index e7b29869..d516475f 100644
--- a/src/utils/text/page-num-from-url.test.js
+++ b/src/utils/text/page-num-from-url.test.js
@@ -1,45 +1,45 @@
-import assert from 'assert'
+import assert from 'assert';
 
-import pageNumFromUrl from './page-num-from-url'
+import pageNumFromUrl from './page-num-from-url';
 
 describe('pageNumFromUrl(url)', () => {
   it('returns null if there is no page num in the url', () => {
-    const url1 = "http://example.com"
-    assert.equal(pageNumFromUrl(url1), null)
+    const url1 = 'http://example.com';
+    assert.equal(pageNumFromUrl(url1), null);
 
-    const url2 = "http://example.com/?pg=102"
-    assert.equal(pageNumFromUrl(url2), null)
+    const url2 = 'http://example.com/?pg=102';
+    assert.equal(pageNumFromUrl(url2), null);
 
-    const url3 = "http://example.com/?page:102"
-    assert.equal(pageNumFromUrl(url3), null)
-  })
+    const url3 = 'http://example.com/?page:102';
+    assert.equal(pageNumFromUrl(url3), null);
+  });
 
   it('returns a page num if one matches the url', () => {
-    const url1 = "http://example.com/foo?page=1"
-    assert.equal(pageNumFromUrl(url1), 1)
+    const url1 = 'http://example.com/foo?page=1';
+    assert.equal(pageNumFromUrl(url1), 1);
 
-    const url2 = "http://example.com/foo?pg=1"
-    assert.equal(pageNumFromUrl(url2), 1)
+    const url2 = 'http://example.com/foo?pg=1';
+    assert.equal(pageNumFromUrl(url2), 1);
 
-    const url3 = "http://example.com/foo?p=1"
-    assert.equal(pageNumFromUrl(url3), 1)
+    const url3 = 'http://example.com/foo?p=1';
+    assert.equal(pageNumFromUrl(url3), 1);
 
-    const url4 = "http://example.com/foo?paging=1"
-    assert.equal(pageNumFromUrl(url4), 1)
+    const url4 = 'http://example.com/foo?paging=1';
+    assert.equal(pageNumFromUrl(url4), 1);
 
-    const url5 = "http://example.com/foo?pag=1"
-    assert.equal(pageNumFromUrl(url5), 1)
+    const url5 = 'http://example.com/foo?pag=1';
+    assert.equal(pageNumFromUrl(url5), 1);
 
-    const url6 = "http://example.com/foo?pagination/1"
-    assert.equal(pageNumFromUrl(url6), 1)
+    const url6 = 'http://example.com/foo?pagination/1';
+    assert.equal(pageNumFromUrl(url6), 1);
 
-    const url7 = "http://example.com/foo?paging/88"
-    assert.equal(pageNumFromUrl(url7), 88)
+    const url7 = 'http://example.com/foo?paging/88';
+    assert.equal(pageNumFromUrl(url7), 88);
 
-    const url8 = "http://example.com/foo?pa/88"
-    assert.equal(pageNumFromUrl(url8), 88)
+    const url8 = 'http://example.com/foo?pa/88';
+    assert.equal(pageNumFromUrl(url8), 88);
 
-    const url9 = "http://example.com/foo?p/88"
-    assert.equal(pageNumFromUrl(url9), 88)
-  })
-})
+    const url9 = 'http://example.com/foo?p/88';
+    assert.equal(pageNumFromUrl(url9), 88);
+  });
+});
diff --git a/src/utils/text/remove-anchor.js b/src/utils/text/remove-anchor.js
index 9c4ea793..d7569ac7 100644
--- a/src/utils/text/remove-anchor.js
+++ b/src/utils/text/remove-anchor.js
@@ -1,3 +1,3 @@
 export default function removeAnchor(url) {
-  return url.split('#')[0].replace(/\/$/, '')
+  return url.split('#')[0].replace(/\/$/, '');
 }
diff --git a/src/utils/text/remove-anchor.test.js b/src/utils/text/remove-anchor.test.js
index ab27eff4..53932e02 100644
--- a/src/utils/text/remove-anchor.test.js
+++ b/src/utils/text/remove-anchor.test.js
@@ -1,21 +1,20 @@
-import assert from 'assert'
+import assert from 'assert';
 
-import removeAnchor from './remove-anchor'
+import removeAnchor from './remove-anchor';
 
 describe('removeAnchor(url)', () => {
   it('returns a url w/out #anchor', () => {
-    const url = "http://example.com/foo/bar/wow-cool/page=10/#wow"
-    const cleaned = "http://example.com/foo/bar/wow-cool/page=10"
+    const url = 'http://example.com/foo/bar/wow-cool/page=10/#wow';
+    const cleaned = 'http://example.com/foo/bar/wow-cool/page=10';
 
-    assert.equal(removeAnchor(url), cleaned)
-  })
+    assert.equal(removeAnchor(url), cleaned);
+  });
 
   it('returns same url if url has no anchor found', () => {
-    const url = "http://example.com/foo/bar/wow-cool"
-    const cleaned = "http://example.com/foo/bar/wow-cool"
-
-    assert.equal(removeAnchor(url), cleaned)
-  })
-})
+    const url = 'http://example.com/foo/bar/wow-cool';
+    const cleaned = 'http://example.com/foo/bar/wow-cool';
+    assert.equal(removeAnchor(url), cleaned);
+  });
+});
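Finally, a sketch tying the text utilities in this patch together. The hasSentenceEnd results assume the escaped dot in the new regex, the fully collapsed spaces assume the global flag on NORMALIZE_RE, and the import path is abbreviated for the example:

    import {
      articleBaseUrl,
      hasSentenceEnd,
      normalizeSpaces,
      pageNumFromUrl,
      removeAnchor,
    } from './src/utils/text';

    const url = 'http://example.com/foo/bar/wow-cool/page=10/#wow';
    const noAnchor = removeAnchor(url);
    // => 'http://example.com/foo/bar/wow-cool/page=10'
    pageNumFromUrl(noAnchor);  // => 10
    articleBaseUrl(noAnchor);  // => 'http://example.com/foo/bar/wow-cool'

    hasSentenceEnd('A whole sentence. And another.'); // => true
    hasSentenceEnd('a-url-segment');                  // => false
    normalizeSpaces('too  many   spaces');            // => 'too many spaces'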