diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..b0c38463
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1 @@
+**/fixtures/*
diff --git a/.eslintrc b/.eslintrc
new file mode 100644
index 00000000..6011b843
--- /dev/null
+++ b/.eslintrc
@@ -0,0 +1,39 @@
+// Use this file as a starting point for your project's .eslintrc.
+// Copy this file, and add rule overrides as needed.
+{
+ "parser": "babel-eslint",
+ "extends": "airbnb",
+ "plugins": [
+ "babel"
+ ],
+ "globals": {
+ /* mocha */
+ "describe",
+ "it"
+ },
+ "rules": {
+ "no-param-reassign": 0,
+ /* TODO fix this; this should work w/import/resolver below, but doesn't */
+ "import/no-extraneous-dependencies": 0,
+ "import/no-unresolved": 0,
+ "no-control-regex": 0,
+ "import/prefer-default-export": 0,
+ "generator-star-spacing": 0,
+ "babel/generator-star-spacing": 0,
+ "func-names": 0,
+ "no-useless-escape": 0,
+ "no-confusing-arrow": 0,
+ },
+ "settings": {
+ "import/resolver": {
+ "babel-module": {
+ "extensions": [".js"]
+ }
+ }
+ },
+ "parserOptions":{
+ "ecmaFeatures": {
+ "experimentalObjectRestSpread": true
+ }
+ }
+}
diff --git a/package.json b/package.json
index aa8aaac1..d0103ef1 100644
--- a/package.json
+++ b/package.json
@@ -5,14 +5,17 @@
"main": "index.js",
"scripts": {
"start": "node ./build",
- "build": "rollup -c",
+ "lint": "eslint src/**",
+ "build": "eslint src/** && rollup -c",
"test": "./test-runner"
},
"author": "",
"license": "ISC",
"devDependencies": {
+ "babel-eslint": "^6.1.2",
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0",
+ "babel-plugin-module-resolver": "^2.2.0",
"babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0",
@@ -21,6 +24,14 @@
"babel-preset-es2015-rollup": "^1.2.0",
"babel-register": "^6.11.6",
"babelrc-rollup": "^3.0.0",
+ "eslint": "^3.5.0",
+ "eslint-config-airbnb": "^11.1.0",
+ "eslint-import-resolver-babel-module": "^2.0.1",
+ "eslint-plugin-async": "^0.1.1",
+ "eslint-plugin-babel": "^3.3.0",
+ "eslint-plugin-import": "^1.15.0",
+ "eslint-plugin-jsx-a11y": "^2.2.2",
+ "eslint-plugin-react": "^6.2.1",
"mocha": "^3.0.2",
"rollup": "^0.34.13",
"rollup-plugin-babel": "^2.6.1",
diff --git a/score-move b/score-move
new file mode 100755
index 00000000..50182a1a
--- /dev/null
+++ b/score-move
@@ -0,0 +1,21 @@
+#!/usr/local/bin/fish
+
+set file $argv[1]
+set function $argv[2]
+
+touch src/extractors/generic/next-page-url/scoring/utils/index.js
+touch src/extractors/generic/next-page-url/scoring/utils/$file.js
+touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js
+
+echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js
+echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
+echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
+echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
+echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js
+
+echo "Now make it a default export"
+echo "Move it to its file"
+echo "Move its tests to its test file"
+echo "import in score-links"
+echo "Test it."
+
diff --git a/src/cleaners/author.js b/src/cleaners/author.js
index 58cff5b0..f0a67096 100644
--- a/src/cleaners/author.js
+++ b/src/cleaners/author.js
@@ -1,7 +1,7 @@
-import { CLEAN_AUTHOR_RE } from './constants'
+import { CLEAN_AUTHOR_RE } from './constants';
// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
export default function cleanAuthor(author) {
- return author.replace(CLEAN_AUTHOR_RE, '$2').trim()
+ return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
}
diff --git a/src/cleaners/author.test.js b/src/cleaners/author.test.js
index 4407effb..e7e881e3 100644
--- a/src/cleaners/author.test.js
+++ b/src/cleaners/author.test.js
@@ -1,21 +1,21 @@
-import assert from 'assert'
+import assert from 'assert';
-import cleanAuthor from './author'
+import cleanAuthor from './author';
describe('cleanAuthor(author)', () => {
it('removes the By from an author string', () => {
- const author = cleanAuthor('By Bob Dylan')
+ const author = cleanAuthor('By Bob Dylan');
- assert.equal(author, 'Bob Dylan')
- })
+ assert.equal(author, 'Bob Dylan');
+ });
it('trims trailing whitespace and line breaks', () => {
const text = `
written by
Bob Dylan
- `
- const author = cleanAuthor(text)
+ `;
+ const author = cleanAuthor(text);
- assert.equal(author, 'Bob Dylan')
- })
-})
+ assert.equal(author, 'Bob Dylan');
+ });
+});
diff --git a/src/cleaners/constants.js b/src/cleaners/constants.js
index 54557dc7..c2b4dd50 100644
--- a/src/cleaners/constants.js
+++ b/src/cleaners/constants.js
@@ -1,9 +1,9 @@
// CLEAN AUTHOR CONSTANTS
-export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
+export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
-export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
+export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
@@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
-]
+];
// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
@@ -23,18 +23,36 @@ export const DEK_META_TAGS = [
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
-]
+];
// CLEAN DATE PUBLISHED CONSTANTS
-export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i
-export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i
-export const TIME_MERIDIAN_DOTS_RE = /\.m\./i
-export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}(\s?[ap]\.?m\.?)?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig
+export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
+export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
+export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
+const months = [
+ 'jan',
+ 'feb',
+ 'mar',
+ 'apr',
+ 'may',
+ 'jun',
+ 'jul',
+ 'aug',
+ 'sep',
+ 'oct',
+ 'nov',
+ 'dec',
+];
+const allMonths = months.join('|');
+const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
+const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
+export const SPLIT_DATE_STRING =
+ new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');
// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
-export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
+export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;
export const DOMAIN_ENDINGS_RE =
- new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g')
+ new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g');
diff --git a/src/cleaners/content.js b/src/cleaners/content.js
index 76d7310a..00cd939f 100644
--- a/src/cleaners/content.js
+++ b/src/cleaners/content.js
@@ -8,54 +8,52 @@ import {
rewriteTopLevel,
stripJunkTags,
makeLinksAbsolute,
-} from 'utils/dom'
-
-import { convertNodeTo } from 'utils/dom'
+} from 'utils/dom';
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(
article,
{
$,
- cleanConditionally=true,
- title='',
- url='',
+ cleanConditionally = true,
+ title = '',
+ url = '',
}
) {
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
- rewriteTopLevel(article, $)
+ rewriteTopLevel(article, $);
// Drop small images and spacer images
- cleanImages(article, $)
+ cleanImages(article, $);
  // Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security.
- stripJunkTags(article, $)
+ stripJunkTags(article, $);
// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
- cleanHOnes(article, $)
+ cleanHOnes(article, $);
// Clean headers
- cleanHeaders(article, $, title)
+ cleanHeaders(article, $, title);
// Make links absolute
- makeLinksAbsolute(article, $, url)
+ makeLinksAbsolute(article, $, url);
// Remove style or align attributes
- cleanAttributes(article, $)
+ cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
- cleanTags(article, $, cleanConditionally)
+ cleanTags(article, $, cleanConditionally);
// Remove empty paragraph nodes
- removeEmpty(article, $)
+ removeEmpty(article, $);
- return article
+ return article;
}
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers:
diff --git a/src/cleaners/content.test.js b/src/cleaners/content.test.js
index 68f6f346..c12b7bbf 100644
--- a/src/cleaners/content.test.js
+++ b/src/cleaners/content.test.js
@@ -1,32 +1,32 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
-import fs from 'fs'
+import assert from 'assert';
+import cheerio from 'cheerio';
+import fs from 'fs';
-import extractCleanNode from './content'
-import extractBestNode from 'extractors/generic/content/extract-best-node'
+import extractBestNode from 'extractors/generic/content/extract-best-node';
+import extractCleanNode from './content';
describe('extractCleanNode(article, { $, cleanConditionally, title })', () => {
- it("cleans cruft out of a DOM node", () => {
- const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
- let $ = cheerio.load(html)
+ it('cleans cruft out of a DOM node', () => {
+ const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
+ const $ = cheerio.load(html);
const opts = {
- stripUnlikelyCandidates: true,
- weightNodes: true,
- cleanConditionally: true,
- }
+ stripUnlikelyCandidates: true,
+ weightNodes: true,
+ cleanConditionally: true,
+ };
- const bestNode = extractBestNode($, opts)
- let result = $.html(bestNode)
- // console.log(result)
- // console.log(result.length)
- const cleanNode = extractCleanNode(bestNode, { $, opts })
- result = $.html(cleanNode)
- // console.log(result.length)
- // console.log(result)
- // console.log(bestNode.html())
+ const bestNode = extractBestNode($, opts);
+ // let result = $.html(bestNode);
+ // // console.log(result)
+ // // console.log(result.length)
+ const cleanNode = extractCleanNode(bestNode, { $, opts });
+ // result = $.html(cleanNode);
+ // // console.log(result.length)
+ // // console.log(result)
+ // // console.log(bestNode.html())
- assert.equal($(bestNode).text().length, 2687)
- })
-})
+ assert.equal($(cleanNode).text().length, 2687);
+ });
+});
diff --git a/src/cleaners/date-published.js b/src/cleaners/date-published.js
index 7347222d..c8e196e2 100644
--- a/src/cleaners/date-published.js
+++ b/src/cleaners/date-published.js
@@ -1,4 +1,4 @@
-import moment from 'moment'
+import moment from 'moment';
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
@@ -7,27 +7,27 @@ import {
CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING,
TIME_MERIDIAN_SPACE_RE,
- TIME_MERIDIAN_DOTS_RE
-} from './constants'
+ TIME_MERIDIAN_DOTS_RE,
+} from './constants';
+
+export function cleanDateString(dateString) {
+ return (dateString.match(SPLIT_DATE_STRING) || [])
+ .join(' ')
+ .replace(TIME_MERIDIAN_DOTS_RE, 'm')
+ .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
+ .replace(CLEAN_DATE_STRING_RE, '$1')
+ .trim();
+}
// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
export default function cleanDatePublished(dateString) {
- let date = moment(new Date(dateString))
+ let date = moment(new Date(dateString));
if (!date.isValid()) {
- dateString = cleanDateString(dateString)
- date = moment(new Date(dateString))
+ dateString = cleanDateString(dateString);
+ date = moment(new Date(dateString));
}
- return date.isValid() ? date.toISOString() : null
-}
-
-export function cleanDateString(dateString) {
- return (dateString.match(SPLIT_DATE_STRING) || [])
- .join(' ')
- .replace(TIME_MERIDIAN_DOTS_RE, 'm')
- .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
- .replace(CLEAN_DATE_STRING_RE, '$1')
- .trim()
+ return date.isValid() ? date.toISOString() : null;
}
diff --git a/src/cleaners/date-published.test.js b/src/cleaners/date-published.test.js
index 4c254a28..fe955d8a 100644
--- a/src/cleaners/date-published.test.js
+++ b/src/cleaners/date-published.test.js
@@ -1,67 +1,62 @@
-import assert from 'assert'
+import assert from 'assert';
import {
default as cleanDatePublished,
cleanDateString,
-} from './date-published'
+} from './date-published';
describe('cleanDatePublished(dateString)', () => {
it('returns a date object', () => {
- const datePublished = cleanDatePublished('published: 1/1/2020')
+ const datePublished = cleanDatePublished('published: 1/1/2020');
assert.equal(
datePublished,
new Date('1/1/2020').toISOString()
- )
- })
+ );
+ });
it('returns null if date is invalid', () => {
- const datePublished = cleanDatePublished('blargh')
+ const datePublished = cleanDatePublished('blargh');
- assert.equal(datePublished, null)
- })
-
-})
+ assert.equal(datePublished, null);
+ });
+});
describe('cleanDateString(dateString)', () => {
it('removes "published" text from an datePublished string', () => {
- const datePublished = cleanDateString('published: 1/1/2020')
+ const datePublished = cleanDateString('published: 1/1/2020');
- assert.equal(datePublished, '1/1/2020')
- })
+ assert.equal(datePublished, '1/1/2020');
+ });
it('trims whitespace', () => {
- const datePublished = cleanDateString(' 1/1/2020 ')
+ const datePublished = cleanDateString(' 1/1/2020 ');
- assert.equal(datePublished, '1/1/2020')
- })
+ assert.equal(datePublished, '1/1/2020');
+ });
it('puts a space b/w a time and am/pm', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
- const date1 = cleanDateString('1/1/2020 8:30am')
- assert.equal(date1, '1/1/2020 8:30 am')
+ const date1 = cleanDateString('1/1/2020 8:30am');
+ assert.equal(date1, '1/1/2020 8:30 am');
- const date2 = cleanDateString('8:30PM 1/1/2020')
- assert.equal(date2, '8:30 PM 1/1/2020')
- })
+ const date2 = cleanDateString('8:30PM 1/1/2020');
+ assert.equal(date2, '8:30 PM 1/1/2020');
+ });
it('cleans the dots from a.m. or p.m.', () => {
// The JS date parser is forgiving, but
// it needs a.m./p.m. without dots
- const date1 = cleanDateString('1/1/2020 8:30 a.m.')
- assert.equal(date1, '1/1/2020 8:30 am')
- })
+ const date1 = cleanDateString('1/1/2020 8:30 a.m.');
+ assert.equal(date1, '1/1/2020 8:30 am');
+ });
it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
- const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.')
- assert.equal(date1, '15 Apr 2016 10:59')
-
- const date2 = cleanDateString('8:30PM 1/1/2020')
- assert.equal(date2, '8:30 PM 1/1/2020')
- })
-
-})
+ const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
+ assert.equal(date1, '15 Apr 2016 10:59');
+ });
+});
diff --git a/src/cleaners/dek.js b/src/cleaners/dek.js
index 7ddbdf65..8686120b 100644
--- a/src/cleaners/dek.js
+++ b/src/cleaners/dek.js
@@ -1,17 +1,18 @@
-import { TEXT_LINK_RE } from './constants'
-import { stripTags } from 'utils/dom'
+import { stripTags } from 'utils/dom';
+
+import { TEXT_LINK_RE } from './constants';
// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.
export default function cleanDek(dek, { $ }) {
// Sanity check that we didn't get too short or long of a dek.
- if (dek.length > 1000 || dek.length < 5) return null
+ if (dek.length > 1000 || dek.length < 5) return null;
- const dekText = stripTags(dek, $)
+ const dekText = stripTags(dek, $);
// Plain text links shouldn't exist in the dek. If we have some, it's
// not a good dek - bail.
- if (TEXT_LINK_RE.test(dekText)) return null
+ if (TEXT_LINK_RE.test(dekText)) return null;
- return dekText.trim()
+ return dekText.trim();
}
diff --git a/src/cleaners/dek.test.js b/src/cleaners/dek.test.js
index eaa4fc2f..39aef99e 100644
--- a/src/cleaners/dek.test.js
+++ b/src/cleaners/dek.test.js
@@ -1,52 +1,50 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';
-import {
- default as cleanDek,
- cleanDekString,
-} from './dek'
+import cleanDek from './dek';
describe('cleanDek(dekString, { $ })', () => {
it('returns null if the dek is < 5 chars', () => {
- const $ = cheerio.load('')
- assert.equal(cleanDek('Hi', { $ }), null)
- })
+ const $ = cheerio.load('');
+ assert.equal(cleanDek('Hi', { $ }), null);
+ });
it('returns null if the dek is > 1000 chars', () => {
- const $ = cheerio.load('')
+ const $ = cheerio.load('');
const longDek =
// generate a string that is 1,280 chars
- [0,1,2,3,4,5,6].reduce((acc, i) =>
- acc += acc, '0123456789'
- )
- assert.equal(cleanDek(longDek, { $ }), null)
- })
+ [0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
+ acc += acc;
+ return acc;
+ }, '0123456789');
+ assert.equal(cleanDek(longDek, { $ }), null);
+ });
  it('strips html tags from the dek', () => {
- const $ = cheerio.load('')
- const dek = 'This is a very important dek.'
+ const $ = cheerio.load('');
+ const dek = 'This is a very important dek.';
- assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.')
- })
+ assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.');
+ });
it('returns null if dek contains plain text link', () => {
- const $ = cheerio.load('')
- const dek = 'This has this link http://example.com/foo/bar'
+ const $ = cheerio.load('');
+ const dek = 'This has this link http://example.com/foo/bar';
- assert.equal(cleanDek(dek, { $ }), null)
- })
+ assert.equal(cleanDek(dek, { $ }), null);
+ });
it('returns a normal dek as is', () => {
- const $ = cheerio.load('')
- const dek = 'This is the dek'
+ const $ = cheerio.load('');
+ const dek = 'This is the dek';
- assert.equal(cleanDek(dek, { $ }), dek)
- })
+ assert.equal(cleanDek(dek, { $ }), dek);
+ });
it('cleans extra whitespace', () => {
- const $ = cheerio.load('')
- const dek = ' This is the dek '
+ const $ = cheerio.load('');
+ const dek = ' This is the dek ';
- assert.equal(cleanDek(dek, { $ }), 'This is the dek')
- })
-})
+ assert.equal(cleanDek(dek, { $ }), 'This is the dek');
+ });
+});
diff --git a/src/cleaners/fixtures/html.js b/src/cleaners/fixtures/html.js
index f7b776a9..a75cd793 100644
--- a/src/cleaners/fixtures/html.js
+++ b/src/cleaners/fixtures/html.js
@@ -1,5 +1,5 @@
const HTML = {
- docWithH1: `This Is the Real Title
`,
+ docWithH1: 'This Is the Real Title
',
docWith2H1s: `
This Is the Real Title
@@ -7,9 +7,9 @@ const HTML = {
`,
docWithTagsInH1: {
- before: `This Is the Real Title
`,
- after: `This Is the Real Title`
+ before: 'This Is the Real Title
',
+ after: 'This Is the Real Title',
},
-}
+};
-export default HTML
+export default HTML;
diff --git a/src/cleaners/index.js b/src/cleaners/index.js
index 11439970..ce1ab34c 100644
--- a/src/cleaners/index.js
+++ b/src/cleaners/index.js
@@ -1,9 +1,9 @@
-import cleanAuthor from './author'
-import cleanImage from './lead-image-url'
-import cleanDek from './dek'
-import cleanDatePublished from './date-published'
-import cleanContent from './content'
-import cleanTitle from './title'
+import cleanAuthor from './author';
+import cleanImage from './lead-image-url';
+import cleanDek from './dek';
+import cleanDatePublished from './date-published';
+import cleanContent from './content';
+import cleanTitle from './title';
const Cleaners = {
author: cleanAuthor,
@@ -12,15 +12,15 @@ const Cleaners = {
datePublished: cleanDatePublished,
content: cleanContent,
title: cleanTitle,
-}
+};
-export default Cleaners
+export default Cleaners;
-export { cleanAuthor }
-export { cleanImage }
-export { cleanDek }
-export { cleanDatePublished }
-export { cleanContent }
-export { cleanTitle }
-export { default as resolveSplitTitle } from './resolve-split-title'
+export { cleanAuthor };
+export { cleanImage };
+export { cleanDek };
+export { cleanDatePublished };
+export { cleanContent };
+export { cleanTitle };
+export { default as resolveSplitTitle } from './resolve-split-title';
diff --git a/src/cleaners/lead-image-url.js b/src/cleaners/lead-image-url.js
index f33cd914..a61d11ef 100644
--- a/src/cleaners/lead-image-url.js
+++ b/src/cleaners/lead-image-url.js
@@ -1,10 +1,10 @@
-import validUrl from 'valid-url'
+import validUrl from 'valid-url';
export default function clean(leadImageUrl) {
- leadImageUrl = leadImageUrl.trim()
+ leadImageUrl = leadImageUrl.trim();
if (validUrl.isWebUri(leadImageUrl)) {
- return leadImageUrl
- } else {
- return null
+ return leadImageUrl;
}
+
+ return null;
}
diff --git a/src/cleaners/lead-image-url.test.js b/src/cleaners/lead-image-url.test.js
index 0ff85abe..90632c58 100644
--- a/src/cleaners/lead-image-url.test.js
+++ b/src/cleaners/lead-image-url.test.js
@@ -1,20 +1,20 @@
-import assert from 'assert'
+import assert from 'assert';
-import clean from './lead-image-url'
+import clean from './lead-image-url';
describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => {
- const url = 'https://example.com'
- assert.equal(clean(url), url)
- })
+ const url = 'https://example.com';
+ assert.equal(clean(url), url);
+ });
it('returns null if the url is not valid', () => {
- const url = 'this is not a valid url'
- assert.equal(clean(url), null)
- })
+ const url = 'this is not a valid url';
+ assert.equal(clean(url), null);
+ });
it('trims whitespace', () => {
- const url = ' https://example.com/foo/bar.jpg'
- assert.equal(clean(url), url.trim())
- })
-})
+ const url = ' https://example.com/foo/bar.jpg';
+ assert.equal(clean(url), url.trim());
+ });
+});
diff --git a/src/cleaners/resolve-split-title.js b/src/cleaners/resolve-split-title.js
index 9654140d..7c393bca 100644
--- a/src/cleaners/resolve-split-title.js
+++ b/src/cleaners/resolve-split-title.js
@@ -1,34 +1,11 @@
-import URL from 'url'
-import 'babel-polyfill'
-import wuzzy from 'wuzzy'
+import URL from 'url';
+import 'babel-polyfill';
+import wuzzy from 'wuzzy';
import {
TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE,
-} from './constants'
-
-// Given a title with separators in it (colons, dashes, etc),
-// resolve whether any of the segments should be removed.
-export default function resolveSplitTitle(title, url='') {
- // Splits while preserving splitters, like:
- // ['The New New York', ' - ', 'The Washington Post']
- title = title
-
- let splitTitle = title.split(TITLE_SPLITTERS_RE)
- if (splitTitle.length === 1) {
- return title
- }
-
- let newTitle = extractBreadcrumbTitle(splitTitle, title)
- if (newTitle) return newTitle
-
- newTitle = cleanDomainFromTitle(splitTitle, url)
- if (newTitle) return newTitle
-
- // Fuzzy ratio didn't find anything, so this title is probably legit.
- // Just return it all.
- return title
-}
+} from './constants';
function extractBreadcrumbTitle(splitTitle, text) {
// This must be a very breadcrumbed title, like:
@@ -38,40 +15,40 @@ function extractBreadcrumbTitle(splitTitle, text) {
// Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out
// the title.
- const termCounts = splitTitle.reduce((acc, text) => {
- acc[text] = acc[text] ? acc[text] + 1 : 1
- return acc
- }, {})
+ const termCounts = splitTitle.reduce((acc, titleText) => {
+ acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
+ return acc;
+ }, {});
const [maxTerm, termCount] =
Reflect.ownKeys(termCounts)
.reduce((acc, key) => {
if (acc[1] < termCounts[key]) {
- return [key, termCounts[key]]
- } else {
- return acc
+ return [key, termCounts[key]];
}
- }, [0, 0])
+
+ return acc;
+ }, [0, 0]);
// We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that.
if (termCount >= 2 && maxTerm.length <= 4) {
- splitTitle = text.split(maxTerm)
+ splitTitle = text.split(maxTerm);
}
- const splitEnds = [splitTitle[0], splitTitle.slice(-1)]
- const longestEnd = splitEnds.reduce((acc, end) => {
- return acc.length > end.length ? acc : end
- }, '')
+ const splitEnds = [splitTitle[0], splitTitle.slice(-1)];
+ const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');
if (longestEnd.length > 10) {
- return longestEnd
- } else {
- return text
+ return longestEnd;
}
+
+ return text;
}
+
+ return null;
}
function cleanDomainFromTitle(splitTitle, url) {
@@ -81,20 +58,43 @@ function cleanDomainFromTitle(splitTitle, url) {
//
// Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right.
- const { host } = URL.parse(url)
- const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '')
+ const { host } = URL.parse(url);
+ const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
- const startSlug = splitTitle[0].toLowerCase().replace(' ', '')
- const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain)
+ const startSlug = splitTitle[0].toLowerCase().replace(' ', '');
+ const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
- if (startSlugRatio > .4 && startSlug.length > 5) {
- return splitTitle.slice(2).join('')
+ if (startSlugRatio > 0.4 && startSlug.length > 5) {
+ return splitTitle.slice(2).join('');
}
- const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
- const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
+ const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
+ const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
- if (endSlugRatio > .4 && endSlug.length >= 5) {
- return splitTitle.slice(0, -2).join('')
+ if (endSlugRatio > 0.4 && endSlug.length >= 5) {
+ return splitTitle.slice(0, -2).join('');
}
+
+ return null;
+}
+
+// Given a title with separators in it (colons, dashes, etc),
+// resolve whether any of the segments should be removed.
+export default function resolveSplitTitle(title, url = '') {
+ // Splits while preserving splitters, like:
+ // ['The New New York', ' - ', 'The Washington Post']
+ const splitTitle = title.split(TITLE_SPLITTERS_RE);
+ if (splitTitle.length === 1) {
+ return title;
+ }
+
+ let newTitle = extractBreadcrumbTitle(splitTitle, title);
+ if (newTitle) return newTitle;
+
+ newTitle = cleanDomainFromTitle(splitTitle, url);
+ if (newTitle) return newTitle;
+
+ // Fuzzy ratio didn't find anything, so this title is probably legit.
+ // Just return it all.
+ return title;
}
diff --git a/src/cleaners/resolve-split-title.test.js b/src/cleaners/resolve-split-title.test.js
index 871d1191..5fee794c 100644
--- a/src/cleaners/resolve-split-title.test.js
+++ b/src/cleaners/resolve-split-title.test.js
@@ -1,32 +1,31 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
-import { resolveSplitTitle } from './index'
+import { resolveSplitTitle } from './index';
describe('resolveSplitTitle(text)', () => {
it('does nothing if title not splittable', () => {
- const title = "This Is a Normal Title"
+ const title = 'This Is a Normal Title';
- assert.equal(resolveSplitTitle(title), title)
- })
+ assert.equal(resolveSplitTitle(title), title);
+ });
it('extracts titles from breadcrumb-like titles', () => {
- const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com"
+ const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com';
- assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ")
- })
+ assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth ');
+ });
it('cleans domains from titles at the front', () => {
- const title = "NYTimes - The Best Gadgets on Earth"
- const url = "https://www.nytimes.com/bits/blog/etc/"
+ const title = 'NYTimes - The Best Gadgets on Earth';
+ const url = 'https://www.nytimes.com/bits/blog/etc/';
- assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
- })
+ assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
+ });
it('cleans domains from titles at the back', () => {
- const title = "The Best Gadgets on Earth | NYTimes"
- const url = "https://www.nytimes.com/bits/blog/etc/"
+ const title = 'The Best Gadgets on Earth | NYTimes';
+ const url = 'https://www.nytimes.com/bits/blog/etc/';
- assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
- })
-})
+ assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
+ });
+});
diff --git a/src/cleaners/title.js b/src/cleaners/title.js
index 9328becd..a1fd2f9a 100644
--- a/src/cleaners/title.js
+++ b/src/cleaners/title.js
@@ -1,25 +1,26 @@
-import { TITLE_SPLITTERS_RE } from './constants'
-import { resolveSplitTitle } from './index'
-import { stripTags } from 'utils/dom'
+import { stripTags } from 'utils/dom';
+
+import { TITLE_SPLITTERS_RE } from './constants';
+import { resolveSplitTitle } from './index';
export default function cleanTitle(title, { url, $ }) {
// If title has |, :, or - in it, see if
// we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) {
- title = resolveSplitTitle(title, url)
+ title = resolveSplitTitle(title, url);
}
// Final sanity check that we didn't get a crazy title.
// if (title.length > 150 || title.length < 15) {
if (title.length > 150) {
// If we did, return h1 from the document if it exists
- const h1 = $('h1')
+ const h1 = $('h1');
if (h1.length === 1) {
- title = h1.text()
+ title = h1.text();
}
}
// strip any html tags in the title text
- return stripTags(title, $).trim()
+ return stripTags(title, $).trim();
}
diff --git a/src/cleaners/title.test.js b/src/cleaners/title.test.js
index c8a0c7a5..c99d3d05 100644
--- a/src/cleaners/title.test.js
+++ b/src/cleaners/title.test.js
@@ -1,8 +1,8 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';
-import HTML from './fixtures/html'
-import { cleanTitle } from './index'
+import HTML from './fixtures/html';
+import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => {
it('uses a single h1 if the title is too short or too long', () => {
@@ -10,28 +10,27 @@ describe('cleanTitle(title, { url, $ })', () => {
// const $ = cheerio.load(HTML.docWithH1)
//
// assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
- })
+ });
it('only uses h1 if there is only one on the page', () => {
- const title = "Too Short"
- const $ = cheerio.load(HTML.docWith2H1s)
+ const title = 'Too Short';
+ const $ = cheerio.load(HTML.docWith2H1s);
- assert.equal(cleanTitle(title, { url: '', $ }), title)
- })
+ assert.equal(cleanTitle(title, { url: '', $ }), title);
+ });
it('removes HTML tags from titles', () => {
- const $ = cheerio.load(HTML.docWithTagsInH1.before)
- const title = $('h1').html()
+ const $ = cheerio.load(HTML.docWithTagsInH1.before);
+ const title = $('h1').html();
- assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after)
- })
+ assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
+ });
it('trims extraneous spaces', () => {
- const title = " This Is a Great Title That You'll Love "
- const $ = cheerio.load(HTML.docWithTagsInH1.before)
+ const title = " This Is a Great Title That You'll Love ";
+ const $ = cheerio.load(HTML.docWithTagsInH1.before);
- assert.equal(cleanTitle(title, { url: '', $ }), title.trim())
- })
-
-})
+ assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
+ });
+});
diff --git a/src/extractors/all.js b/src/extractors/all.js
index 399c2466..6b26f28f 100644
--- a/src/extractors/all.js
+++ b/src/extractors/all.js
@@ -1,12 +1,11 @@
-import GenericExtractor from './generic'
-import NYMagExtractor from './custom/nymag.com'
-import BloggerExtractor from './custom/blogspot.com'
-import WikipediaExtractor from './custom/wikipedia.org'
+import NYMagExtractor from './custom/nymag.com';
+import BloggerExtractor from './custom/blogspot.com';
+import WikipediaExtractor from './custom/wikipedia.org';
const Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
-}
+};
-export default Extractors
+export default Extractors;
diff --git a/src/extractors/constants.js b/src/extractors/constants.js
index b6fc067b..f490a68e 100644
--- a/src/extractors/constants.js
+++ b/src/extractors/constants.js
@@ -1 +1 @@
-export const ATTR_RE = /\[([\w-]+)\]/
+export const ATTR_RE = /\[([\w-]+)\]/;
diff --git a/src/extractors/custom/blogspot.com/index.js b/src/extractors/custom/blogspot.com/index.js
index 20a294ae..8fa5a8a8 100644
--- a/src/extractors/custom/blogspot.com/index.js
+++ b/src/extractors/custom/blogspot.com/index.js
@@ -14,27 +14,27 @@ const BloggerExtractor = {
// Convert the noscript tag to a div
transforms: {
- 'noscript': 'div'
+ noscript: 'div',
},
},
author: {
selectors: [
- '.post-author-name'
- ]
+ '.post-author-name',
+ ],
},
title: {
selectors: [
'h2.title',
- ]
+ ],
},
datePublished: {
selectors: [
'span.publishdate',
- ]
- }
-}
+ ],
+ },
+};
-export default BloggerExtractor
+export default BloggerExtractor;
diff --git a/src/extractors/custom/nymag.com/index.js b/src/extractors/custom/nymag.com/index.js
index d96a4bc6..c7622191 100644
--- a/src/extractors/custom/nymag.com/index.js
+++ b/src/extractors/custom/nymag.com/index.js
@@ -22,37 +22,39 @@ const NYMagExtractor = {
// the transformation.
transforms: {
// Convert h1s to h2s
- 'h1': 'h2',
+ h1: 'h2',
// Convert lazy-loaded noscript images to figures
- 'noscript': ($node) => {
- const $children = $node.children()
+ noscript: ($node) => {
+ const $children = $node.children();
if ($children.length === 1 && $children.get(0).tagName === 'img') {
- return 'figure'
+ return 'figure';
}
- }
- }
+
+ return null;
+ },
+ },
},
title: {
selectors: [
'h1.headline-primary',
'h1',
- ]
+ ],
},
author: {
selectors: [
'.by-authors',
- ]
+ ],
},
datePublished: {
selectors: [
'time.article-timestamp[datetime]',
'time.article-timestamp',
- ]
- }
-}
+ ],
+ },
+};
-export default NYMagExtractor
+export default NYMagExtractor;
diff --git a/src/extractors/custom/wikipedia.org/index.js b/src/extractors/custom/wikipedia.org/index.js
index 73c07aca..a30ce35b 100644
--- a/src/extractors/custom/wikipedia.org/index.js
+++ b/src/extractors/custom/wikipedia.org/index.js
@@ -8,7 +8,7 @@ const WikipediaExtractor = {
// transform top infobox to an image with caption
transforms: {
'.infobox img': ($node) => {
- $node.parents('.infobox').prepend($node)
+ $node.parents('.infobox').prepend($node);
},
'.infobox caption': 'figcaption',
'.infobox': 'figure',
@@ -28,15 +28,15 @@ const WikipediaExtractor = {
title: {
selectors: [
'h2.title',
- ]
+ ],
},
datePublished: {
selectors: [
'#footer-info-lastmod',
- ]
+ ],
},
-}
+};
-export default WikipediaExtractor
+export default WikipediaExtractor;
diff --git a/src/extractors/generic/author/constants.js b/src/extractors/generic/author/constants.js
index 942b101c..3b0b8d94 100644
--- a/src/extractors/generic/author/constants.js
+++ b/src/extractors/generic/author/constants.js
@@ -5,22 +5,22 @@
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
export const AUTHOR_META_TAGS = [
- 'byl',
- 'clmst',
- 'dc.author',
- 'dcsext.author',
- 'dc.creator',
- 'rbauthors',
- 'authors',
-]
+ 'byl',
+ 'clmst',
+ 'dc.author',
+ 'dcsext.author',
+ 'dc.creator',
+ 'rbauthors',
+ 'authors',
+];
-export const AUTHOR_MAX_LENGTH = 300
+export const AUTHOR_MAX_LENGTH = 300;
// An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
-// exists in the className, which is not as accurate as .className (which
+// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
export const AUTHOR_SELECTORS = [
@@ -47,12 +47,12 @@ export const AUTHOR_SELECTORS = [
'.articleauthor',
'.ArticleAuthor',
'.byline',
-]
+];
// An ordered list of Selectors to find likely article authors, with
// regular expression for content.
-const byline_re = /^[\n\s]*By/i
+const bylineRe = /^[\n\s]*By/i;
export const BYLINE_SELECTORS_RE = [
- ['#byline', byline_re],
- ['.byline', byline_re],
-]
+ ['#byline', bylineRe],
+ ['.byline', bylineRe],
+];
diff --git a/src/extractors/generic/author/extractor.js b/src/extractors/generic/author/extractor.js
index 240d6e75..5a7c6cf3 100644
--- a/src/extractors/generic/author/extractor.js
+++ b/src/extractors/generic/author/extractor.js
@@ -1,49 +1,48 @@
+import { cleanAuthor } from 'cleaners';
+import {
+ extractFromMeta,
+ extractFromSelectors,
+} from 'utils/dom';
+
import {
AUTHOR_META_TAGS,
AUTHOR_MAX_LENGTH,
AUTHOR_SELECTORS,
BYLINE_SELECTORS_RE,
-} from './constants'
-
-import { cleanAuthor } from 'cleaners'
-
-import {
- extractFromMeta,
- extractFromSelectors
-} from 'utils/dom'
+} from './constants';
const GenericAuthorExtractor = {
extract({ $, metaCache }) {
- let author
+ let author;
// First, check to see if we have a matching
// meta tag that we can make use of.
- author = extractFromMeta($, AUTHOR_META_TAGS, metaCache)
+ author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
if (author && author.length < AUTHOR_MAX_LENGTH) {
- return cleanAuthor(author)
+ return cleanAuthor(author);
}
// Second, look through our selectors looking for potential authors.
- author = extractFromSelectors($, AUTHOR_SELECTORS, 2)
+ author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
if (author && author.length < AUTHOR_MAX_LENGTH) {
- return cleanAuthor(author)
+ return cleanAuthor(author);
}
// Last, use our looser regular-expression based selectors for
// potential authors.
for (const [selector, regex] of BYLINE_SELECTORS_RE) {
- const node = $(selector)
+ const node = $(selector);
if (node.length === 1) {
- const text = node.text()
+ const text = node.text();
if (regex.test(text)) {
- return cleanAuthor(text)
+ return cleanAuthor(text);
}
}
}
- return null
- }
-}
+ return null;
+ },
+};
-export default GenericAuthorExtractor
+export default GenericAuthorExtractor;
diff --git a/src/extractors/generic/author/extractor.test.js b/src/extractors/generic/author/extractor.test.js
index f1df9107..fa522cf9 100644
--- a/src/extractors/generic/author/extractor.test.js
+++ b/src/extractors/generic/author/extractor.test.js
@@ -1,46 +1,46 @@
-import assert from 'assert'
-import cheerio from 'cheerio'
+import assert from 'assert';
+import cheerio from 'cheerio';
-import HTML from './fixtures/html'
-import GenericAuthorExtractor from './extractor'
+import HTML from './fixtures/html';
+import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => {
- const $ = cheerio.load(HTML.authorMeta.test)
+ const $ = cheerio.load(HTML.authorMeta.test);
const result = GenericAuthorExtractor.extract(
- { $, metaCache: ["dc.author", "something-else"] }
- )
+ { $, metaCache: ['dc.author', 'something-else'] }
+ );
- assert.equal(result, HTML.authorMeta.result)
- })
+ assert.equal(result, HTML.authorMeta.result);
+ });
it('extracts author from author selectors', () => {
- const $ = cheerio.load(HTML.authorSelectors.test)
+ const $ = cheerio.load(HTML.authorSelectors.test);
const result = GenericAuthorExtractor.extract(
- { $, metaCache: ["dc.author", "something-else"] }
- )
+ { $, metaCache: ['dc.author', 'something-else'] }
+ );
- assert.equal(result, HTML.authorSelectors.result)
- })
+ assert.equal(result, HTML.authorSelectors.result);
+ });
it('extracts author with regex selectors', () => {
- const $ = cheerio.load(HTML.authorRegSelectors.test)
+ const $ = cheerio.load(HTML.authorRegSelectors.test);
const result = GenericAuthorExtractor.extract(
- { $, metaCache: ["dc.author", "something-else"] }
- )
+ { $, metaCache: ['dc.author', 'something-else'] }
+ );
- assert.equal(result, HTML.authorRegSelectors.result)
- })
+ assert.equal(result, HTML.authorRegSelectors.result);
+ });
it('returns null if no author found', () => {
- const $ = cheerio.load('')
+ const $ = cheerio.load('');
const result = GenericAuthorExtractor.extract(
- { $, metaCache: ["dc.author", "something-else"] }
- )
+ { $, metaCache: ['dc.author', 'something-else'] }
+ );
- assert.equal(result, null)
- })
- })
-})
+ assert.equal(result, null);
+ });
+ });
+});
diff --git a/src/extractors/generic/author/fixtures/html.js b/src/extractors/generic/author/fixtures/html.js
index 499a0588..84ed985d 100644
--- a/src/extractors/generic/author/fixtures/html.js
+++ b/src/extractors/generic/author/fixtures/html.js
@@ -5,7 +5,7 @@ const HTML = {