chore: refactored and linted

pull/1/head
Adam Pash 8 years ago
parent 9906bd36a4
commit 7e2a34945f

@ -0,0 +1 @@
**/fixtures/*

@ -0,0 +1,39 @@
// Use this file as a starting point for your project's .eslintrc.
// Copy this file, and add rule overrides as needed.
{
"parser": "babel-eslint",
"extends": "airbnb",
"plugins": [
"babel"
],
"globals": {
/* mocha */
"describe",
"it"
},
"rules": {
"no-param-reassign": 0,
/* TODO fix this; this should work w/import/resolver below, but doesn't */
"import/no-extraneous-dependencies": 0,
"import/no-unresolved": 0,
"no-control-regex": 0,
"import/prefer-default-export": 0,
"generator-star-spacing": 0,
"babel/generator-star-spacing": 0,
"func-names": 0,
"no-useless-escape": 0,
"no-confusing-arrow": 0,
},
"settings": {
"import/resolver": {
"babel-module": {
"extensions": [".js"]
}
}
},
"parserOptions":{
"ecmaFeatures": {
"experimentalObjectRestSpread": true
}
}
}

@ -5,14 +5,17 @@
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"start": "node ./build", "start": "node ./build",
"build": "rollup -c", "lint": "eslint src/**",
"build": "eslint src/** && rollup -c",
"test": "./test-runner" "test": "./test-runner"
}, },
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"devDependencies": { "devDependencies": {
"babel-eslint": "^6.1.2",
"babel-plugin-external-helpers": "^6.8.0", "babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0", "babel-plugin-module-alias": "^1.6.0",
"babel-plugin-module-resolver": "^2.2.0",
"babel-plugin-transform-async-to-generator": "^6.8.0", "babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0", "babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0", "babel-plugin-transform-object-rest-spread": "^6.8.0",
@ -21,6 +24,14 @@
"babel-preset-es2015-rollup": "^1.2.0", "babel-preset-es2015-rollup": "^1.2.0",
"babel-register": "^6.11.6", "babel-register": "^6.11.6",
"babelrc-rollup": "^3.0.0", "babelrc-rollup": "^3.0.0",
"eslint": "^3.5.0",
"eslint-config-airbnb": "^11.1.0",
"eslint-import-resolver-babel-module": "^2.0.1",
"eslint-plugin-async": "^0.1.1",
"eslint-plugin-babel": "^3.3.0",
"eslint-plugin-import": "^1.15.0",
"eslint-plugin-jsx-a11y": "^2.2.2",
"eslint-plugin-react": "^6.2.1",
"mocha": "^3.0.2", "mocha": "^3.0.2",
"rollup": "^0.34.13", "rollup": "^0.34.13",
"rollup-plugin-babel": "^2.6.1", "rollup-plugin-babel": "^2.6.1",

@ -0,0 +1,21 @@
#!/usr/bin/env fish
# Scaffold a new utility module + test under next-page-url/scoring/utils.
# Usage: ./scaffold.fish <file> <functionName>
#   <file>     basename of the new module (no extension)
#   <function> name of the function it will export
set file $argv[1]
set function $argv[2]
touch src/extractors/generic/next-page-url/scoring/utils/index.js
touch src/extractors/generic/next-page-url/scoring/utils/$file.js
touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js
# Seed the test file with the standard assert/import boilerplate
# (first echo truncates with >, the rest append with >>).
echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
# Re-export the new function from the barrel index.
echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js
# Remaining manual steps for the author.
echo "Now make it a default export"
echo "Move it to its file"
echo "Move its tests to its test file"
echo "import in score-links"
echo "Test it."

@ -1,7 +1,7 @@
import { CLEAN_AUTHOR_RE } from './constants' import { CLEAN_AUTHOR_RE } from './constants';
// Take an author string (like 'By David Smith ') and clean it to // Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'. // just the name(s): 'David Smith'.
export default function cleanAuthor(author) { export default function cleanAuthor(author) {
return author.replace(CLEAN_AUTHOR_RE, '$2').trim() return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
} }

@ -1,21 +1,21 @@
import assert from 'assert' import assert from 'assert';
import cleanAuthor from './author' import cleanAuthor from './author';
describe('cleanAuthor(author)', () => { describe('cleanAuthor(author)', () => {
it('removes the By from an author string', () => { it('removes the By from an author string', () => {
const author = cleanAuthor('By Bob Dylan') const author = cleanAuthor('By Bob Dylan');
assert.equal(author, 'Bob Dylan') assert.equal(author, 'Bob Dylan');
}) });
it('trims trailing whitespace and line breaks', () => { it('trims trailing whitespace and line breaks', () => {
const text = ` const text = `
written by written by
Bob Dylan Bob Dylan
` `;
const author = cleanAuthor(text) const author = cleanAuthor(text);
assert.equal(author, 'Bob Dylan') assert.equal(author, 'Bob Dylan');
}) });
}) });

@ -1,9 +1,9 @@
// CLEAN AUTHOR CONSTANTS // CLEAN AUTHOR CONSTANTS
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)', // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS // CLEAN DEK CONSTANTS
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i') export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks. // An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct. // From most distinct to least distinct.
// //
@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// However, these tags often have SEO-specific junk in them that's not // However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best. // header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [ export const DEK_META_TAGS = [
] ];
// An ordered list of Selectors to find likely article deks. From // An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit. // most explicit to least explicit.
@ -23,18 +23,36 @@ export const DEK_META_TAGS = [
// detrimental to the aesthetics of an article. // detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [ export const DEK_SELECTORS = [
'.entry-summary', '.entry-summary',
] ];
// CLEAN DATE PUBLISHED CONSTANTS // CLEAN DATE PUBLISHED CONSTANTS
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}(\s?[ap]\.?m\.?)?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig const months = [
'jan',
'feb',
'mar',
'apr',
'may',
'jun',
'jul',
'aug',
'sep',
'oct',
'nov',
'dec',
];
const allMonths = months.join('|');
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
export const SPLIT_DATE_STRING =
new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');
// CLEAN TITLE CONSTANTS // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a // A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar. // title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;
export const DOMAIN_ENDINGS_RE = export const DOMAIN_ENDINGS_RE =
new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g') new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g');

@ -8,54 +8,52 @@ import {
rewriteTopLevel, rewriteTopLevel,
stripJunkTags, stripJunkTags,
makeLinksAbsolute, makeLinksAbsolute,
} from 'utils/dom' } from 'utils/dom';
import { convertNodeTo } from 'utils/dom'
// Clean our article content, returning a new, cleaned node. // Clean our article content, returning a new, cleaned node.
export default function extractCleanNode( export default function extractCleanNode(
article, article,
{ {
$, $,
cleanConditionally=true, cleanConditionally = true,
title='', title = '',
url='', url = '',
} }
) { ) {
// Rewrite the tag name to div if it's a top level node like body or // Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags. // html to avoid later complications with multiple body tags.
rewriteTopLevel(article, $) rewriteTopLevel(article, $);
// Drop small images and spacer images // Drop small images and spacer images
cleanImages(article, $) cleanImages(article, $);
// Drop certain tags like <title>, etc // Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security. // This is -mostly- for cleanliness, not security.
stripJunkTags(article, $) stripJunkTags(article, $);
// H1 tags are typically the article title, which should be extracted // H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3), // by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s. // strip them. Otherwise, turn 'em into H2s.
cleanHOnes(article, $) cleanHOnes(article, $);
// Clean headers // Clean headers
cleanHeaders(article, $, title) cleanHeaders(article, $, title);
// Make links absolute // Make links absolute
makeLinksAbsolute(article, $, url) makeLinksAbsolute(article, $, url);
// Remove style or align attributes // Remove style or align attributes
cleanAttributes(article, $) cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better // too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them. // way to detect menus particularly and remove them.
cleanTags(article, $, cleanConditionally) cleanTags(article, $, cleanConditionally);
// Remove empty paragraph nodes // Remove empty paragraph nodes
removeEmpty(article, $) removeEmpty(article, $);
return article return article;
} }
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6') // headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers: // for header in headers:

@ -1,32 +1,32 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import extractCleanNode from './content' import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractBestNode from 'extractors/generic/content/extract-best-node' import extractCleanNode from './content';
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => { describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => { it('cleans cruft out of a DOM node', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
let $ = cheerio.load(html) const $ = cheerio.load(html);
const opts = { const opts = {
stripUnlikelyCandidates: true, stripUnlikelyCandidates: true,
weightNodes: true, weightNodes: true,
cleanConditionally: true, cleanConditionally: true,
} };
const bestNode = extractBestNode($, opts) const bestNode = extractBestNode($, opts);
let result = $.html(bestNode) // let result = $.html(bestNode);
// console.log(result) // // console.log(result)
// console.log(result.length) // // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts }) const cleanNode = extractCleanNode(bestNode, { $, opts });
result = $.html(cleanNode) // result = $.html(cleanNode);
// console.log(result.length) // // console.log(result.length)
// console.log(result) // // console.log(result)
// console.log(bestNode.html()) // // console.log(bestNode.html())
assert.equal($(bestNode).text().length, 2687) assert.equal($(cleanNode).text().length, 2687);
}) });
}) });

@ -1,4 +1,4 @@
import moment from 'moment' import moment from 'moment';
// Is there a compelling reason to use moment here? // Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method, // Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string. // but could just check for 'Invalid Date' string.
@ -7,27 +7,27 @@ import {
CLEAN_DATE_STRING_RE, CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING, SPLIT_DATE_STRING,
TIME_MERIDIAN_SPACE_RE, TIME_MERIDIAN_SPACE_RE,
TIME_MERIDIAN_DOTS_RE TIME_MERIDIAN_DOTS_RE,
} from './constants' } from './constants';
// Normalize a messy date string down to its parseable parts:
// keep only recognizable date/time tokens, collapse "a.m."/"p.m."
// dots to "am"/"pm", insert the space the JS date parser needs
// between a time and its meridian, and strip a leading
// "published:" label before trimming surrounding whitespace.
export function cleanDateString(dateString) {
  const dateTokens = dateString.match(SPLIT_DATE_STRING) || [];
  const joined = dateTokens.join(' ');
  return joined
    .replace(TIME_MERIDIAN_DOTS_RE, 'm')
    .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
    .replace(CLEAN_DATE_STRING_RE, '$1')
    .trim();
}
// Take a date published string, and hopefully return a date out of // Take a date published string, and hopefully return a date out of
// it. Return none if we fail. // it. Return none if we fail.
export default function cleanDatePublished(dateString) { export default function cleanDatePublished(dateString) {
let date = moment(new Date(dateString)) let date = moment(new Date(dateString));
if (!date.isValid()) { if (!date.isValid()) {
dateString = cleanDateString(dateString) dateString = cleanDateString(dateString);
date = moment(new Date(dateString)) date = moment(new Date(dateString));
} }
return date.isValid() ? date.toISOString() : null return date.isValid() ? date.toISOString() : null;
}
export function cleanDateString(dateString) {
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim()
} }

@ -1,67 +1,62 @@
import assert from 'assert' import assert from 'assert';
import { import {
default as cleanDatePublished, default as cleanDatePublished,
cleanDateString, cleanDateString,
} from './date-published' } from './date-published';
describe('cleanDatePublished(dateString)', () => { describe('cleanDatePublished(dateString)', () => {
it('returns a date object', () => { it('returns a date object', () => {
const datePublished = cleanDatePublished('published: 1/1/2020') const datePublished = cleanDatePublished('published: 1/1/2020');
assert.equal( assert.equal(
datePublished, datePublished,
new Date('1/1/2020').toISOString() new Date('1/1/2020').toISOString()
) );
}) });
it('returns null if date is invalid', () => { it('returns null if date is invalid', () => {
const datePublished = cleanDatePublished('blargh') const datePublished = cleanDatePublished('blargh');
assert.equal(datePublished, null) assert.equal(datePublished, null);
}) });
});
})
describe('cleanDateString(dateString)', () => { describe('cleanDateString(dateString)', () => {
it('removes "published" text from an datePublished string', () => { it('removes "published" text from an datePublished string', () => {
const datePublished = cleanDateString('published: 1/1/2020') const datePublished = cleanDateString('published: 1/1/2020');
assert.equal(datePublished, '1/1/2020') assert.equal(datePublished, '1/1/2020');
}) });
it('trims whitespace', () => { it('trims whitespace', () => {
const datePublished = cleanDateString(' 1/1/2020 ') const datePublished = cleanDateString(' 1/1/2020 ');
assert.equal(datePublished, '1/1/2020') assert.equal(datePublished, '1/1/2020');
}) });
it('puts a space b/w a time and am/pm', () => { it('puts a space b/w a time and am/pm', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs am/pm separated from a time // it needs am/pm separated from a time
const date1 = cleanDateString('1/1/2020 8:30am') const date1 = cleanDateString('1/1/2020 8:30am');
assert.equal(date1, '1/1/2020 8:30 am') assert.equal(date1, '1/1/2020 8:30 am');
const date2 = cleanDateString('8:30PM 1/1/2020') const date2 = cleanDateString('8:30PM 1/1/2020');
assert.equal(date2, '8:30 PM 1/1/2020') assert.equal(date2, '8:30 PM 1/1/2020');
}) });
it('cleans the dots from a.m. or p.m.', () => { it('cleans the dots from a.m. or p.m.', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs a.m./p.m. without dots // it needs a.m./p.m. without dots
const date1 = cleanDateString('1/1/2020 8:30 a.m.') const date1 = cleanDateString('1/1/2020 8:30 a.m.');
assert.equal(date1, '1/1/2020 8:30 am') assert.equal(date1, '1/1/2020 8:30 am');
}) });
it('can handle some tough timestamps', () => { it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs am/pm separated from a time // it needs am/pm separated from a time
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.') const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
assert.equal(date1, '15 Apr 2016 10:59') assert.equal(date1, '15 Apr 2016 10:59');
});
const date2 = cleanDateString('8:30PM 1/1/2020') });
assert.equal(date2, '8:30 PM 1/1/2020')
})
})

@ -1,17 +1,18 @@
import { TEXT_LINK_RE } from './constants' import { stripTags } from 'utils/dom';
import { stripTags } from 'utils/dom'
import { TEXT_LINK_RE } from './constants';
// Take a dek HTML fragment, and return the cleaned version of it. // Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough. // Return None if the dek wasn't good enough.
export default function cleanDek(dek, { $ }) { export default function cleanDek(dek, { $ }) {
// Sanity check that we didn't get too short or long of a dek. // Sanity check that we didn't get too short or long of a dek.
if (dek.length > 1000 || dek.length < 5) return null if (dek.length > 1000 || dek.length < 5) return null;
const dekText = stripTags(dek, $) const dekText = stripTags(dek, $);
// Plain text links shouldn't exist in the dek. If we have some, it's // Plain text links shouldn't exist in the dek. If we have some, it's
// not a good dek - bail. // not a good dek - bail.
if (TEXT_LINK_RE.test(dekText)) return null if (TEXT_LINK_RE.test(dekText)) return null;
return dekText.trim() return dekText.trim();
} }

@ -1,52 +1,50 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import cleanDek from './dek';
default as cleanDek,
cleanDekString,
} from './dek'
describe('cleanDek(dekString, { $ })', () => { describe('cleanDek(dekString, { $ })', () => {
it('returns null if the dek is < 5 chars', () => { it('returns null if the dek is < 5 chars', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
assert.equal(cleanDek('Hi', { $ }), null) assert.equal(cleanDek('Hi', { $ }), null);
}) });
it('returns null if the dek is > 1000 chars', () => { it('returns null if the dek is > 1000 chars', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const longDek = const longDek =
// generate a string that is 1,280 chars // generate a string that is 1,280 chars
[0,1,2,3,4,5,6].reduce((acc, i) => [0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
acc += acc, '0123456789' acc += acc;
) return acc;
assert.equal(cleanDek(longDek, { $ }), null) }, '0123456789');
}) assert.equal(cleanDek(longDek, { $ }), null);
});
it('strip html tags from the dek', () => { it('strip html tags from the dek', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This is a <em>very</em> important dek.' const dek = 'This is a <em>very</em> important dek.';
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.') assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.');
}) });
it('returns null if dek contains plain text link', () => { it('returns null if dek contains plain text link', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This has this link http://example.com/foo/bar' const dek = 'This has this link http://example.com/foo/bar';
assert.equal(cleanDek(dek, { $ }), null) assert.equal(cleanDek(dek, { $ }), null);
}) });
it('returns a normal dek as is', () => { it('returns a normal dek as is', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This is the dek' const dek = 'This is the dek';
assert.equal(cleanDek(dek, { $ }), dek) assert.equal(cleanDek(dek, { $ }), dek);
}) });
it('cleans extra whitespace', () => { it('cleans extra whitespace', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = ' This is the dek ' const dek = ' This is the dek ';
assert.equal(cleanDek(dek, { $ }), 'This is the dek') assert.equal(cleanDek(dek, { $ }), 'This is the dek');
}) });
}) });

@ -1,5 +1,5 @@
const HTML = { const HTML = {
docWithH1: `<div><h1>This Is the Real Title</h1></div>`, docWithH1: '<div><h1>This Is the Real Title</h1></div>',
docWith2H1s: ` docWith2H1s: `
<div> <div>
<h1>This Is the Real Title</h1> <h1>This Is the Real Title</h1>
@ -7,9 +7,9 @@ const HTML = {
</div> </div>
`, `,
docWithTagsInH1: { docWithTagsInH1: {
before: `<div><h1>This Is the <em>Real</em> Title</h1></div>`, before: '<div><h1>This Is the <em>Real</em> Title</h1></div>',
after: `This Is the Real Title` after: 'This Is the Real Title',
}, },
} };
export default HTML export default HTML;

@ -1,9 +1,9 @@
import cleanAuthor from './author' import cleanAuthor from './author';
import cleanImage from './lead-image-url' import cleanImage from './lead-image-url';
import cleanDek from './dek' import cleanDek from './dek';
import cleanDatePublished from './date-published' import cleanDatePublished from './date-published';
import cleanContent from './content' import cleanContent from './content';
import cleanTitle from './title' import cleanTitle from './title';
const Cleaners = { const Cleaners = {
author: cleanAuthor, author: cleanAuthor,
@ -12,15 +12,15 @@ const Cleaners = {
datePublished: cleanDatePublished, datePublished: cleanDatePublished,
content: cleanContent, content: cleanContent,
title: cleanTitle, title: cleanTitle,
} };
export default Cleaners export default Cleaners;
export { cleanAuthor } export { cleanAuthor };
export { cleanImage } export { cleanImage };
export { cleanDek } export { cleanDek };
export { cleanDatePublished } export { cleanDatePublished };
export { cleanContent } export { cleanContent };
export { cleanTitle } export { cleanTitle };
export { default as resolveSplitTitle } from './resolve-split-title' export { default as resolveSplitTitle } from './resolve-split-title';

@ -1,10 +1,10 @@
import validUrl from 'valid-url' import validUrl from 'valid-url';
export default function clean(leadImageUrl) { export default function clean(leadImageUrl) {
leadImageUrl = leadImageUrl.trim() leadImageUrl = leadImageUrl.trim();
if (validUrl.isWebUri(leadImageUrl)) { if (validUrl.isWebUri(leadImageUrl)) {
return leadImageUrl return leadImageUrl;
} else {
return null
} }
return null;
} }

@ -1,20 +1,20 @@
import assert from 'assert' import assert from 'assert';
import clean from './lead-image-url' import clean from './lead-image-url';
describe('clean(leadImageUrl)', () => { describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => { it('returns the url if valid', () => {
const url = 'https://example.com' const url = 'https://example.com';
assert.equal(clean(url), url) assert.equal(clean(url), url);
}) });
it('returns null if the url is not valid', () => { it('returns null if the url is not valid', () => {
const url = 'this is not a valid url' const url = 'this is not a valid url';
assert.equal(clean(url), null) assert.equal(clean(url), null);
}) });
it('trims whitespace', () => { it('trims whitespace', () => {
const url = ' https://example.com/foo/bar.jpg' const url = ' https://example.com/foo/bar.jpg';
assert.equal(clean(url), url.trim()) assert.equal(clean(url), url.trim());
}) });
}) });

@ -1,34 +1,11 @@
import URL from 'url' import URL from 'url';
import 'babel-polyfill' import 'babel-polyfill';
import wuzzy from 'wuzzy' import wuzzy from 'wuzzy';
import { import {
TITLE_SPLITTERS_RE, TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE, DOMAIN_ENDINGS_RE,
} from './constants' } from './constants';
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url='') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
title = title
let splitTitle = title.split(TITLE_SPLITTERS_RE)
if (splitTitle.length === 1) {
return title
}
let newTitle = extractBreadcrumbTitle(splitTitle, title)
if (newTitle) return newTitle
newTitle = cleanDomainFromTitle(splitTitle, url)
if (newTitle) return newTitle
// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
return title
}
function extractBreadcrumbTitle(splitTitle, text) { function extractBreadcrumbTitle(splitTitle, text) {
// This must be a very breadcrumbed title, like: // This must be a very breadcrumbed title, like:
@ -38,40 +15,40 @@ function extractBreadcrumbTitle(splitTitle, text) {
// Look to see if we can find a breadcrumb splitter that happens // Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out // more than once. If we can, we'll be able to better pull out
// the title. // the title.
const termCounts = splitTitle.reduce((acc, text) => { const termCounts = splitTitle.reduce((acc, titleText) => {
acc[text] = acc[text] ? acc[text] + 1 : 1 acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
return acc return acc;
}, {}) }, {});
const [maxTerm, termCount] = const [maxTerm, termCount] =
Reflect.ownKeys(termCounts) Reflect.ownKeys(termCounts)
.reduce((acc, key) => { .reduce((acc, key) => {
if (acc[1] < termCounts[key]) { if (acc[1] < termCounts[key]) {
return [key, termCounts[key]] return [key, termCounts[key]];
} else {
return acc
} }
}, [0, 0])
return acc;
}, [0, 0]);
// We found a splitter that was used more than once, so it // We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead. // is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> " // Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that. // will match, but nothing longer than that.
if (termCount >= 2 && maxTerm.length <= 4) { if (termCount >= 2 && maxTerm.length <= 4) {
splitTitle = text.split(maxTerm) splitTitle = text.split(maxTerm);
} }
const splitEnds = [splitTitle[0], splitTitle.slice(-1)] const splitEnds = [splitTitle[0], splitTitle.slice(-1)];
const longestEnd = splitEnds.reduce((acc, end) => { const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');
return acc.length > end.length ? acc : end
}, '')
if (longestEnd.length > 10) { if (longestEnd.length > 10) {
return longestEnd return longestEnd;
} else {
return text
} }
return text;
} }
return null;
} }
function cleanDomainFromTitle(splitTitle, url) { function cleanDomainFromTitle(splitTitle, url) {
@ -81,20 +58,43 @@ function cleanDomainFromTitle(splitTitle, url) {
// //
// Strip out the big TLDs - it just makes the matching a bit more // Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right. // accurate. Not the end of the world if it doesn't strip right.
const { host } = URL.parse(url) const { host } = URL.parse(url);
const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '') const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
const startSlug = splitTitle[0].toLowerCase().replace(' ', '') const startSlug = splitTitle[0].toLowerCase().replace(' ', '');
const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain) const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
if (startSlugRatio > .4 && startSlug.length > 5) { if (startSlugRatio > 0.4 && startSlug.length > 5) {
return splitTitle.slice(2).join('') return splitTitle.slice(2).join('');
} }
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '') const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain) const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
if (endSlugRatio > .4 && endSlug.length >= 5) { if (endSlugRatio > 0.4 && endSlug.length >= 5) {
return splitTitle.slice(0, -2).join('') return splitTitle.slice(0, -2).join('');
} }
return null;
}
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url = '') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
const splitTitle = title.split(TITLE_SPLITTERS_RE);
if (splitTitle.length === 1) {
return title;
}
let newTitle = extractBreadcrumbTitle(splitTitle, title);
if (newTitle) return newTitle;
newTitle = cleanDomainFromTitle(splitTitle, url);
if (newTitle) return newTitle;
// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
return title;
} }

@ -1,32 +1,31 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { resolveSplitTitle } from './index' import { resolveSplitTitle } from './index';
describe('resolveSplitTitle(text)', () => { describe('resolveSplitTitle(text)', () => {
it('does nothing if title not splittable', () => { it('does nothing if title not splittable', () => {
const title = "This Is a Normal Title" const title = 'This Is a Normal Title';
assert.equal(resolveSplitTitle(title), title) assert.equal(resolveSplitTitle(title), title);
}) });
it('extracts titles from breadcrumb-like titles', () => { it('extracts titles from breadcrumb-like titles', () => {
const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com" const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com';
assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ") assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth ');
}) });
it('cleans domains from titles at the front', () => { it('cleans domains from titles at the front', () => {
const title = "NYTimes - The Best Gadgets on Earth" const title = 'NYTimes - The Best Gadgets on Earth';
const url = "https://www.nytimes.com/bits/blog/etc/" const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
}) });
it('cleans domains from titles at the back', () => { it('cleans domains from titles at the back', () => {
const title = "The Best Gadgets on Earth | NYTimes" const title = 'The Best Gadgets on Earth | NYTimes';
const url = "https://www.nytimes.com/bits/blog/etc/" const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
}) });
}) });

@ -1,25 +1,26 @@
import { TITLE_SPLITTERS_RE } from './constants' import { stripTags } from 'utils/dom';
import { resolveSplitTitle } from './index'
import { stripTags } from 'utils/dom' import { TITLE_SPLITTERS_RE } from './constants';
import { resolveSplitTitle } from './index';
export default function cleanTitle(title, { url, $ }) { export default function cleanTitle(title, { url, $ }) {
// If title has |, :, or - in it, see if // If title has |, :, or - in it, see if
// we can clean it up. // we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) { if (TITLE_SPLITTERS_RE.test(title)) {
title = resolveSplitTitle(title, url) title = resolveSplitTitle(title, url);
} }
// Final sanity check that we didn't get a crazy title. // Final sanity check that we didn't get a crazy title.
// if (title.length > 150 || title.length < 15) { // if (title.length > 150 || title.length < 15) {
if (title.length > 150) { if (title.length > 150) {
// If we did, return h1 from the document if it exists // If we did, return h1 from the document if it exists
const h1 = $('h1') const h1 = $('h1');
if (h1.length === 1) { if (h1.length === 1) {
title = h1.text() title = h1.text();
} }
} }
// strip any html tags in the title text // strip any html tags in the title text
return stripTags(title, $).trim() return stripTags(title, $).trim();
} }

@ -1,8 +1,8 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { cleanTitle } from './index' import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => { describe('cleanTitle(title, { url, $ })', () => {
it('uses a single h1 if the title is too short or too long', () => { it('uses a single h1 if the title is too short or too long', () => {
@ -10,28 +10,27 @@ describe('cleanTitle(title, { url, $ })', () => {
// const $ = cheerio.load(HTML.docWithH1) // const $ = cheerio.load(HTML.docWithH1)
// //
// assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text()) // assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
}) });
it('only uses h1 if there is only one on the page', () => { it('only uses h1 if there is only one on the page', () => {
const title = "Too Short" const title = 'Too Short';
const $ = cheerio.load(HTML.docWith2H1s) const $ = cheerio.load(HTML.docWith2H1s);
assert.equal(cleanTitle(title, { url: '', $ }), title) assert.equal(cleanTitle(title, { url: '', $ }), title);
}) });
it('removes HTML tags from titles', () => { it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before) const $ = cheerio.load(HTML.docWithTagsInH1.before);
const title = $('h1').html() const title = $('h1').html();
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after) assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
}) });
it('trims extraneous spaces', () => { it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love " const title = " This Is a Great Title That You'll Love ";
const $ = cheerio.load(HTML.docWithTagsInH1.before) const $ = cheerio.load(HTML.docWithTagsInH1.before);
assert.equal(cleanTitle(title, { url: '', $ }), title.trim()) assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
}) });
});
})

@ -1,12 +1,11 @@
import GenericExtractor from './generic' import NYMagExtractor from './custom/nymag.com';
import NYMagExtractor from './custom/nymag.com' import BloggerExtractor from './custom/blogspot.com';
import BloggerExtractor from './custom/blogspot.com' import WikipediaExtractor from './custom/wikipedia.org';
import WikipediaExtractor from './custom/wikipedia.org'
const Extractors = { const Extractors = {
'nymag.com': NYMagExtractor, 'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor, 'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor, 'wikipedia.org': WikipediaExtractor,
} };
export default Extractors export default Extractors;

@ -1 +1 @@
export const ATTR_RE = /\[([\w-]+)\]/ export const ATTR_RE = /\[([\w-]+)\]/;

@ -14,27 +14,27 @@ const BloggerExtractor = {
// Convert the noscript tag to a div // Convert the noscript tag to a div
transforms: { transforms: {
'noscript': 'div' noscript: 'div',
}, },
}, },
author: { author: {
selectors: [ selectors: [
'.post-author-name' '.post-author-name',
] ],
}, },
title: { title: {
selectors: [ selectors: [
'h2.title', 'h2.title',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'span.publishdate', 'span.publishdate',
] ],
} },
} };
export default BloggerExtractor export default BloggerExtractor;

@ -22,37 +22,39 @@ const NYMagExtractor = {
// the transformation. // the transformation.
transforms: { transforms: {
// Convert h1s to h2s // Convert h1s to h2s
'h1': 'h2', h1: 'h2',
// Convert lazy-loaded noscript images to figures // Convert lazy-loaded noscript images to figures
'noscript': ($node) => { noscript: ($node) => {
const $children = $node.children() const $children = $node.children();
if ($children.length === 1 && $children.get(0).tagName === 'img') { if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure' return 'figure';
} }
}
} return null;
},
},
}, },
title: { title: {
selectors: [ selectors: [
'h1.headline-primary', 'h1.headline-primary',
'h1', 'h1',
] ],
}, },
author: { author: {
selectors: [ selectors: [
'.by-authors', '.by-authors',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'time.article-timestamp[datetime]', 'time.article-timestamp[datetime]',
'time.article-timestamp', 'time.article-timestamp',
] ],
} },
} };
export default NYMagExtractor export default NYMagExtractor;

@ -8,7 +8,7 @@ const WikipediaExtractor = {
// transform top infobox to an image with caption // transform top infobox to an image with caption
transforms: { transforms: {
'.infobox img': ($node) => { '.infobox img': ($node) => {
$node.parents('.infobox').prepend($node) $node.parents('.infobox').prepend($node);
}, },
'.infobox caption': 'figcaption', '.infobox caption': 'figcaption',
'.infobox': 'figure', '.infobox': 'figure',
@ -28,15 +28,15 @@ const WikipediaExtractor = {
title: { title: {
selectors: [ selectors: [
'h2.title', 'h2.title',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'#footer-info-lastmod', '#footer-info-lastmod',
] ],
}, },
} };
export default WikipediaExtractor export default WikipediaExtractor;

@ -5,22 +5,22 @@
// Note: "author" is too often the -developer- of the page, so it is not // Note: "author" is too often the -developer- of the page, so it is not
// added here. // added here.
export const AUTHOR_META_TAGS = [ export const AUTHOR_META_TAGS = [
'byl', 'byl',
'clmst', 'clmst',
'dc.author', 'dc.author',
'dcsext.author', 'dcsext.author',
'dc.creator', 'dc.creator',
'rbauthors', 'rbauthors',
'authors', 'authors',
] ];
export const AUTHOR_MAX_LENGTH = 300 export const AUTHOR_MAX_LENGTH = 300;
// An ordered list of XPath Selectors to find likely article authors. From // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit. // most explicit to least explicit.
// //
// Note - this does not use classes like CSS. This checks to see if the string // Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which // exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The // splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit. // speed tradeoff is worth the accuracy hit.
export const AUTHOR_SELECTORS = [ export const AUTHOR_SELECTORS = [
@ -47,12 +47,12 @@ export const AUTHOR_SELECTORS = [
'.articleauthor', '.articleauthor',
'.ArticleAuthor', '.ArticleAuthor',
'.byline', '.byline',
] ];
// An ordered list of Selectors to find likely article authors, with // An ordered list of Selectors to find likely article authors, with
// regular expression for content. // regular expression for content.
const byline_re = /^[\n\s]*By/i const bylineRe = /^[\n\s]*By/i;
export const BYLINE_SELECTORS_RE = [ export const BYLINE_SELECTORS_RE = [
['#byline', byline_re], ['#byline', bylineRe],
['.byline', byline_re], ['.byline', bylineRe],
] ];

@ -1,49 +1,48 @@
import { cleanAuthor } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { import {
AUTHOR_META_TAGS, AUTHOR_META_TAGS,
AUTHOR_MAX_LENGTH, AUTHOR_MAX_LENGTH,
AUTHOR_SELECTORS, AUTHOR_SELECTORS,
BYLINE_SELECTORS_RE, BYLINE_SELECTORS_RE,
} from './constants' } from './constants';
import { cleanAuthor } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
const GenericAuthorExtractor = { const GenericAuthorExtractor = {
extract({ $, metaCache }) { extract({ $, metaCache }) {
let author let author;
// First, check to see if we have a matching // First, check to see if we have a matching
// meta tag that we can make use of. // meta tag that we can make use of.
author = extractFromMeta($, AUTHOR_META_TAGS, metaCache) author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
if (author && author.length < AUTHOR_MAX_LENGTH) { if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author) return cleanAuthor(author);
} }
// Second, look through our selectors looking for potential authors. // Second, look through our selectors looking for potential authors.
author = extractFromSelectors($, AUTHOR_SELECTORS, 2) author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
if (author && author.length < AUTHOR_MAX_LENGTH) { if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author) return cleanAuthor(author);
} }
// Last, use our looser regular-expression based selectors for // Last, use our looser regular-expression based selectors for
// potential authors. // potential authors.
for (const [selector, regex] of BYLINE_SELECTORS_RE) { for (const [selector, regex] of BYLINE_SELECTORS_RE) {
const node = $(selector) const node = $(selector);
if (node.length === 1) { if (node.length === 1) {
const text = node.text() const text = node.text();
if (regex.test(text)) { if (regex.test(text)) {
return cleanAuthor(text) return cleanAuthor(text);
} }
} }
} }
return null return null;
} },
} };
export default GenericAuthorExtractor export default GenericAuthorExtractor;

@ -1,46 +1,46 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor' import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => { describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => { describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => { it('extracts author from meta tags', () => {
const $ = cheerio.load(HTML.authorMeta.test) const $ = cheerio.load(HTML.authorMeta.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorMeta.result) assert.equal(result, HTML.authorMeta.result);
}) });
it('extracts author from author selectors', () => { it('extracts author from author selectors', () => {
const $ = cheerio.load(HTML.authorSelectors.test) const $ = cheerio.load(HTML.authorSelectors.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorSelectors.result) assert.equal(result, HTML.authorSelectors.result);
}) });
it('extracts author with regex selectors', () => { it('extracts author with regex selectors', () => {
const $ = cheerio.load(HTML.authorRegSelectors.test) const $ = cheerio.load(HTML.authorRegSelectors.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorRegSelectors.result) assert.equal(result, HTML.authorRegSelectors.result);
}) });
it('returns null if no author found', () => { it('returns null if no author found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, null) assert.equal(result, null);
}) });
}) });
}) });

@ -5,7 +5,7 @@ const HTML = {
<meta name="dc.author" value="Adam" /> <meta name="dc.author" value="Adam" />
</html> </html>
`, `,
result: `Adam` result: 'Adam',
}, },
authorSelectors: { authorSelectors: {
test: ` test: `
@ -15,7 +15,7 @@ const HTML = {
</div> </div>
</div> </div>
`, `,
result: `Adam` result: 'Adam',
}, },
authorRegSelectors: { authorRegSelectors: {
test: ` test: `
@ -25,8 +25,8 @@ const HTML = {
</div> </div>
</div> </div>
`, `,
result: `Adam` result: 'Adam',
}, },
} };
export default HTML export default HTML;

@ -1,11 +1,12 @@
import {
scoreContent,
findTopCandidate,
} from './scoring'
import { import {
stripUnlikelyCandidates, stripUnlikelyCandidates,
convertToParagraphs, convertToParagraphs,
} from 'utils/dom' } from 'utils/dom';
import {
scoreContent,
findTopCandidate,
} from './scoring';
// Using a variety of scoring techniques, extract the content most // Using a variety of scoring techniques, extract the content most
// likely to be article text. // likely to be article text.
@ -26,12 +27,12 @@ export default function extractBestNode($, opts) {
if (opts.stripUnlikelyCandidates) { if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($) $ = stripUnlikelyCandidates($);
} }
$ = convertToParagraphs($) $ = convertToParagraphs($);
$ = scoreContent($, opts.weightNodes) $ = scoreContent($, opts.weightNodes);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
return $topCandidate return $topCandidate;
} }

@ -1,24 +1,26 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
// import HTML from './fixtures/html' // import HTML from './fixtures/html'
import extractBestNode from './extract-best-node' import extractBestNode from './extract-best-node';
describe('extractBestNode($, flags)', () => { describe('extractBestNode($, flags)', () => {
it("scores the dom nodes and returns the best option", () => { it('scores the dom nodes and returns the best option', () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
const opts = { const opts = {
stripUnlikelyCandidates: true, stripUnlikelyCandidates: true,
weightNodes: true, weightNodes: true,
} };
let $ = cheerio.load(html) const $ = cheerio.load(html);
const bestNode = extractBestNode($, opts) const bestNode = extractBestNode($, opts);
assert(typeof bestNode, 'object');
// console.log(bestNode.html()) // console.log(bestNode.html())
// assert.equal($(bestNode).text().length, 3652) // assert.equal($(bestNode).text().length, 3652)
}) });
}) });

@ -1,10 +1,11 @@
import cheerio from 'cheerio' import cheerio from 'cheerio';
import 'babel-polyfill' import 'babel-polyfill';
import extractBestNode from './extract-best-node' import { nodeIsSufficient } from 'utils/dom';
import { nodeIsSufficient } from 'utils/dom' import { cleanContent } from 'cleaners';
import { cleanContent } from 'cleaners' import { normalizeSpaces } from 'utils/text';
import { normalizeSpaces } from 'utils/text'
import extractBestNode from './extract-best-node';
const GenericContentExtractor = { const GenericContentExtractor = {
defaultOpts: { defaultOpts: {
@ -33,46 +34,44 @@ const GenericContentExtractor = {
// cleanConditionally: Clean the node to return of some // cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc. // superfluous content. Things like forms, ads, etc.
extract({ $, html, title, url }, opts) { extract({ $, html, title, url }, opts) {
opts = { ...this.defaultOpts, ...opts } opts = { ...this.defaultOpts, ...opts };
$ = $ || cheerio.load(html) $ = $ || cheerio.load(html);
// Cascade through our extraction-specific opts in an ordered fashion, // Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content. // turning them off as we try to extract content.
let node = this.getContentNode($, title, url, opts) let node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) { if (nodeIsSufficient(node)) {
return this.cleanAndReturnNode(node, $) return this.cleanAndReturnNode(node, $);
} else { }
// We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
for (const key of Reflect.ownKeys(opts).filter(key => opts[key] === true)) {
opts[key] = false
$ = cheerio.load(html)
node = this.getContentNode($, title, url, opts)
if (nodeIsSufficient(node)) {
break
}
}
return this.cleanAndReturnNode(node, $) // We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {
opts[key] = false;
$ = cheerio.load(html);
node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) {
break;
}
} }
return this.cleanAndReturnNode(node, $) return this.cleanAndReturnNode(node, $);
}, },
// Get node given current options // Get node given current options
getContentNode($, title, url, opts) { getContentNode($, title, url, opts) {
return cleanContent( return cleanContent(
extractBestNode($, opts), extractBestNode($, opts),
{ {
$, $,
cleanConditionally: opts.cleanConditionally, cleanConditionally: opts.cleanConditionally,
title, title,
url, url,
}) });
}, },
// Once we got here, either we're at our last-resort node, or // Once we got here, either we're at our last-resort node, or
@ -80,10 +79,10 @@ const GenericContentExtractor = {
// move forward. // move forward.
cleanAndReturnNode(node, $) { cleanAndReturnNode(node, $) {
if (!node) { if (!node) {
return null return null;
} }
return normalizeSpaces($.html(node)) return normalizeSpaces($.html(node));
// if return_type == "html": // if return_type == "html":
// return normalize_spaces(node_to_html(node)) // return normalize_spaces(node_to_html(node))
@ -91,6 +90,6 @@ const GenericContentExtractor = {
// return node // return node
}, },
} };
export default GenericContentExtractor export default GenericContentExtractor;

@ -1,16 +1,15 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import fs from 'fs';
import fs from 'fs'
import { clean } from 'test-helpers' import { clean } from 'test-helpers';
import GenericContentExtractor from './extractor' import GenericContentExtractor from './extractor';
describe('GenericContentExtractor', function() { describe('GenericContentExtractor', function () {
this.timeout(1000000) this.timeout(1000000);
describe('extract($, html, opts)', () => { describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => { it('extracts html and returns the article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
// Array.from(range(1, 100)).map((i) => { // Array.from(range(1, 100)).map((i) => {
// console.log(i) // console.log(i)
@ -20,15 +19,10 @@ describe('GenericContentExtractor', function() {
// }) // })
const result = clean(GenericContentExtractor.extract( const result = clean(GenericContentExtractor.extract(
{ $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' } { $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' }
)) ));
// console.log(result)
})
})
})
function* range(start = 1, end = 1) { assert(typeof result, 'string');
while (start <= end) { // console.log(result)
yield start++ });
} });
} });

@ -1,15 +1,15 @@
import { import {
getOrInitScore, getOrInitScore,
setScore, setScore,
} from './index' } from './index';
export default function addScore($node, $, amount) { export default function addScore($node, $, amount) {
try { try {
const score = getOrInitScore($node, $) + amount const score = getOrInitScore($node, $) + amount;
setScore($node, $, score) setScore($node, $, score);
} catch(e) { } catch (e) {
console.debug(e) // Ignoring; error occurs in scoreNode
} finally {
return $node
} }
return $node;
} }

@ -1,28 +1,27 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
addScore, addScore,
getScore, getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => { describe('addScore(node, $, amount)', () => {
it(`adds the specified amount to a node's score`, () => { it('adds the specified amount to a node\'s score', () => {
const $ = cheerio.load('<p score="25">Foo</p>') const $ = cheerio.load('<p score="25">Foo</p>');
let $node = $('p').first() let $node = $('p').first();
$node = addScore($node, $, 25) $node = addScore($node, $, 25);
assert.equal(getScore($node), 50) assert.equal(getScore($node), 50);
}) });
it(`adds score if score not yet set (assumes score is 0)`, () => { it('adds score if score not yet set (assumes score is 0)', () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first() let $node = $('p').first();
$node = addScore($node, $, 25) $node = addScore($node, $, 25);
assert.equal(getScore($node), 25) assert.equal(getScore($node), 25);
}) });
});
}) });
})

@ -1,11 +1,11 @@
import { addScore } from './index' import { addScore } from './index';
// Adds 1/4 of a child's score to its parent // Adds 1/4 of a child's score to its parent
export default function addToParent(node, $, score) { export default function addToParent(node, $, score) {
const parent = node.parent() const parent = node.parent();
if (parent) { if (parent) {
addScore(parent, $, score * .25) addScore(parent, $, score * 0.25);
} }
return node return node;
} }

@ -1,24 +1,23 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
addToParent, addToParent,
getScore, getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('addToParent(node, $, amount)', () => { describe('addToParent(node, $, amount)', () => {
it(`adds 1/4 of a node's score it its parent`, () => { it('adds 1/4 of a node\'s score it its parent', () => {
const html = '<div score="25"><p score="40">Foo</p></div>' const html = '<div score="25"><p score="40">Foo</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let $node = $('p').first() let $node = $('p').first();
$node = addToParent($node, $, 40) $node = addToParent($node, $, 40);
assert.equal(getScore($node.parent()), 35) assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40) assert.equal(getScore($node), 40);
}) });
}) });
});
})

@ -1,49 +1,49 @@
//// CONTENT FETCHING CONSTANTS //// // // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when // A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together // extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple, // and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary. // non-pipe style regular expression queries if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [ export const UNLIKELY_CANDIDATES_BLACKLIST = [
'ad-break', 'ad-break',
'adbox', 'adbox',
'advert', 'advert',
'addthis', 'addthis',
'agegate', 'agegate',
'aux', 'aux',
'blogger-labels', 'blogger-labels',
'combx', 'combx',
'comment', 'comment',
'conversation', 'conversation',
'disqus', 'disqus',
'entry-unrelated', 'entry-unrelated',
'extra', 'extra',
'foot', 'foot',
'form', 'form',
'header', 'header',
'hidden', 'hidden',
'loader', 'loader',
'login', // Note: This can hit 'blogindex'. 'login', // Note: This can hit 'blogindex'.
'menu', 'menu',
'meta', 'meta',
'nav', 'nav',
'pager', 'pager',
'pagination', 'pagination',
'predicta', // readwriteweb inline ad box 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives 'presence_control_external', // lifehacker.com container full of false positives
'popup', 'popup',
'printfriendly', 'printfriendly',
'related', 'related',
'remove', 'remove',
'remark', 'remark',
'rss', 'rss',
'share', 'share',
'shoutbox', 'shoutbox',
'sidebar', 'sidebar',
'sociable', 'sociable',
'sponsor', 'sponsor',
'tools' 'tools',
] ];
// A list of strings that can be considered LIKELY candidates when // A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the // extracting content from a resource. Essentially, the inverse of the
@ -57,56 +57,56 @@ export const UNLIKELY_CANDIDATES_BLACKLIST = [
// re:test, so may contain simple, non-pipe style regular expression queries // re:test, so may contain simple, non-pipe style regular expression queries
// if necessary. // if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [ export const UNLIKELY_CANDIDATES_WHITELIST = [
'and', 'and',
'article', 'article',
'body', 'body',
'blogindex', 'blogindex',
'column', 'column',
'content', 'content',
'entry-content-asset', 'entry-content-asset',
'format', // misuse of form 'format', // misuse of form
'hfeed', 'hfeed',
'hentry', 'hentry',
'hatom', 'hatom',
'main', 'main',
'page', 'page',
'posts', 'posts',
'shadow' 'shadow',
] ];
// A list of tags which, if found inside, should cause a <div /> to NOT // A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements // be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags. // should be turned into <p /> tags.
export const DIV_TO_P_BLOCK_TAGS = [ export const DIV_TO_P_BLOCK_TAGS = [
'a', 'a',
'blockquote', 'blockquote',
'dl', 'dl',
'div', 'div',
'img', 'img',
'p', 'p',
'pre', 'pre',
'table', 'table',
].join(',') ].join(',');
// A list of tags that should be ignored when trying to find the top candidate // A list of tags that should be ignored when trying to find the top candidate
// for a document. // for a document.
export const NON_TOP_CANDIDATE_TAGS = [ export const NON_TOP_CANDIDATE_TAGS = [
'br', 'br',
'b', 'b',
'i', 'i',
'label', 'label',
'hr', 'hr',
'area', 'area',
'base', 'base',
'basefont', 'basefont',
'input', 'input',
'img', 'img',
'link', 'link',
'meta', 'meta',
] ];
export const NON_TOP_CANDIDATE_TAGS_RE = export const NON_TOP_CANDIDATE_TAGS_RE =
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i') new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');
// A list of selectors that specify, very clearly, either hNews or other // A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates. // very content-specific style content, like Blogger templates.
@ -118,53 +118,15 @@ export const HNEWS_CONTENT_SELECTORS = [
['.post', '.postbody'], ['.post', '.postbody'],
['.post', '.post_body'], ['.post', '.post_body'],
['.post', '.post-body'], ['.post', '.post-body'],
] ];
// export const HNEWS_CONTENT_SELECTORS = [
// {
// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['hentry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['entry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'),
// must_exist: {
// classes: ['entry', 'entry_content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'),
// must_exist: {
// classes: ['post', 'post-body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'),
// must_exist: {
// classes: ['post', 'post_body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'),
// must_exist: {
// classes: ['post', 'postbody'],
// }
// },
// ]
export const PHOTO_HINTS = [ export const PHOTO_HINTS = [
'figure', 'figure',
'photo', 'photo',
'image', 'image',
'caption' 'caption',
] ];
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');
// A list of strings that denote a positive scoring for this content as being // A list of strings that denote a positive scoring for this content as being
@ -172,175 +134,175 @@ export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
// //
// TODO: Perhaps have these scale based on their odds of being quality? // TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [ export const POSITIVE_SCORE_HINTS = [
'article', 'article',
'articlecontent', 'articlecontent',
'instapaper_body', 'instapaper_body',
'blog', 'blog',
'body', 'body',
'content', 'content',
'entry-content-asset', 'entry-content-asset',
'entry', 'entry',
'hentry', 'hentry',
'main', 'main',
'Normal', 'Normal',
'page', 'page',
'pagination', 'pagination',
'permalink', 'permalink',
'post', 'post',
'story', 'story',
'text', 'text',
'[-_]copy', //usatoday '[-_]copy', // usatoday
'\Bcopy' '\Bcopy',
] ];
// The above list, joined into a matching regular expression // The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i') export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');
// Readability publisher-specific guidelines // Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i') export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');
// A list of strings that denote a negative scoring for this content as being // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id. // an article container. Checked against className and id.
// //
// TODO: Perhaps have these scale based on their odds of being quality? // TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [ export const NEGATIVE_SCORE_HINTS = [
'adbox', 'adbox',
'advert', 'advert',
'author', 'author',
'bio', 'bio',
'bookmark', 'bookmark',
'bottom', 'bottom',
'byline', 'byline',
'clear', 'clear',
'com-', 'com-',
'combx', 'combx',
'comment', 'comment',
'comment\B', 'comment\B',
'contact', 'contact',
'copy', 'copy',
'credit', 'credit',
'crumb', 'crumb',
'date', 'date',
'deck', 'deck',
'excerpt', 'excerpt',
'featured', //tnr.com has a featured_content which throws us off 'featured', // tnr.com has a featured_content which throws us off
'foot', 'foot',
'footer', 'footer',
'footnote', 'footnote',
'graf', 'graf',
'head', 'head',
'info', 'info',
'infotext', //newscientist.com copyright 'infotext', // newscientist.com copyright
'instapaper_ignore', 'instapaper_ignore',
'jump', 'jump',
'linebreak', 'linebreak',
'link', 'link',
'masthead', 'masthead',
'media', 'media',
'meta', 'meta',
'modal', 'modal',
'outbrain', //slate.com junk 'outbrain', // slate.com junk
'promo', 'promo',
'pr_', // autoblog - press release 'pr_', // autoblog - press release
'related', 'related',
'respond', 'respond',
'roundcontent', //lifehacker restricted content warning 'roundcontent', // lifehacker restricted content warning
'scroll', 'scroll',
'secondary', 'secondary',
'share', 'share',
'shopping', 'shopping',
'shoutbox', 'shoutbox',
'side', 'side',
'sidebar', 'sidebar',
'sponsor', 'sponsor',
'stamp', 'stamp',
'sub', 'sub',
'summary', 'summary',
'tags', 'tags',
'tools', 'tools',
'widget' 'widget',
] ];
// The above list, joined into a matching regular expression // The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i') export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');
// Match a digit. Pretty clear. // Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]') export const DIGIT_RE = new RegExp('[0-9]');
// Match 2 or more consecutive <br> tags // Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i') export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i');
// Match 1 BR tag. // Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i') export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');
// A list of all of the block level tags known in HTML5 and below. Taken from // A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT // http://bit.ly/qneNIT
export const BLOCK_LEVEL_TAGS = [ export const BLOCK_LEVEL_TAGS = [
'article', 'article',
'aside', 'aside',
'blockquote', 'blockquote',
'body', 'body',
'br', 'br',
'button', 'button',
'canvas', 'canvas',
'caption', 'caption',
'col', 'col',
'colgroup', 'colgroup',
'dd', 'dd',
'div', 'div',
'dl', 'dl',
'dt', 'dt',
'embed', 'embed',
'fieldset', 'fieldset',
'figcaption', 'figcaption',
'figure', 'figure',
'footer', 'footer',
'form', 'form',
'h1', 'h1',
'h2', 'h2',
'h3', 'h3',
'h4', 'h4',
'h5', 'h5',
'h6', 'h6',
'header', 'header',
'hgroup', 'hgroup',
'hr', 'hr',
'li', 'li',
'map', 'map',
'object', 'object',
'ol', 'ol',
'output', 'output',
'p', 'p',
'pre', 'pre',
'progress', 'progress',
'section', 'section',
'table', 'table',
'tbody', 'tbody',
'textarea', 'textarea',
'tfoot', 'tfoot',
'th', 'th',
'thead', 'thead',
'tr', 'tr',
'ul', 'ul',
'video', 'video',
] ];
export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i') export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i');
// The removal is implemented as a blacklist and whitelist, this test finds // The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one // blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the // expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes. // serialization for whitelisted nodes.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|') const candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i') export const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|') const candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i') export const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i') export const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i');
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i') export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i') export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
export const BAD_TAGS = new RegExp('^(address|form)$', 'i') export const BAD_TAGS = new RegExp('^(address|form)$', 'i');
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i') export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');

@ -1,115 +1,35 @@
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants' import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index' import { getScore } from './index';
import { import mergeSiblings from './merge-siblings';
textLength,
linkDensity
} from 'utils/dom'
// After we've calculated scores, loop through all of the possible // After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score. // candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) { export default function findTopCandidate($) {
let $candidate, topScore = 0 let $candidate;
let topScore = 0;
$('*[score]').each((index, node) => { $('*[score]').each((index, node) => {
const $node = $(node) const $node = $(node);
// Ignore tags like BR, HR, etc // Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) { if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
return return;
} }
const score = getScore($node) const score = getScore($node);
if (score > topScore) { if (score > topScore) {
topScore = score topScore = score;
$candidate = $node $candidate = $node;
} }
}) });
// If we don't have a candidate, return the body // If we don't have a candidate, return the body
// or whatever the first element is // or whatever the first element is
if (!$candidate) { if (!$candidate) {
return $('body') || $('*').first() return $('body') || $('*').first();
} }
$candidate = mergeSiblings($candidate, topScore, $) $candidate = mergeSiblings($candidate, topScore, $);
return $candidate return $candidate;
}
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
export function mergeSiblings($candidate, topScore, $) {
if (!$candidate.parent().length) {
return $candidate
}
const siblingScoreThreshold = Math.max(10, topScore * 0.2)
let wrappingDiv = $('<div></div>')
$candidate.parent().children().each((index, child) => {
const $child = $(child)
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
return
}
const childScore = getScore($child)
if (childScore) {
if ($child === $candidate) {
wrappingDiv.append($child)
} else {
let contentBonus = 0
// extract to scoreLinkDensity() TODO
const density = linkDensity($child)
// If sibling has a very low link density,
// give it a small bonus
if (density < .05) {
contentBonus = contentBonus + 20
}
// If sibling has a high link density,
// give it a penalty
if (density >= 0.5) {
contentBonus = contentBonus - 20
}
// If sibling node has the same class as
// candidate, give it a bonus
if ($child.attr('class') === $candidate.attr('class')) {
contentBonus = contentBonus + topScore * .2
}
const newScore = getScore($child) + contentBonus
if (newScore >= siblingScoreThreshold) {
return wrappingDiv.append($child)
} else if (child.tagName === 'p') {
const childContentLength = textLength($child.text())
if (childContentLength > 80 && density < .25) {
return wrappingDiv.append($child)
} else if (childContentLength <= 80 && density === 0 &&
hasSentenceEnd(childContent)) {
return wrappingDiv.append($child)
}
}
}
}
})
return wrappingDiv
}
// TODO Extract into util - AP
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
const SENTENCE_END_RE = new RegExp('\.( |$)')
function hasSentenceEnd(text) {
return SENTENCE_END_RE.test(text)
} }

@ -1,58 +1,58 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
getScore, getScore,
findTopCandidate, findTopCandidate,
scoreContent scoreContent,
} from './index' } from './index';
describe('findTopCandidate($)', () => { describe('findTopCandidate($)', () => {
it("finds the top candidate from simple case", () => { it('finds the top candidate from simple case', () => {
const $ = cheerio.load(HTML.findDom1) const $ = cheerio.load(HTML.findDom1);
const $$topCandidate = findTopCandidate($) const $$topCandidate = findTopCandidate($);
assert.equal(getScore($$topCandidate), 100) assert.equal(getScore($$topCandidate), 100);
}) });
it("finds the top candidate from a nested case", () => { it('finds the top candidate from a nested case', () => {
const $ = cheerio.load(HTML.findDom2) const $ = cheerio.load(HTML.findDom2);
const $$topCandidate = findTopCandidate($) const $$topCandidate = findTopCandidate($);
// this is wrapped in a div so checking // this is wrapped in a div so checking
// the score of the first child // the score of the first child
assert.equal(getScore($$topCandidate.children().first()), 50) assert.equal(getScore($$topCandidate.children().first()), 50);
}) });
it("ignores tags like BR", () => { it('ignores tags like BR', () => {
const $ = cheerio.load(HTML.findDom3) const $ = cheerio.load(HTML.findDom3);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal(getScore($topCandidate), 50) assert.equal(getScore($topCandidate), 50);
}) });
it("returns BODY if no candidates found", () => { it('returns BODY if no candidates found', () => {
const $ = cheerio.load(HTML.topBody) const $ = cheerio.load(HTML.topBody);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal($topCandidate.get(0).tagName, 'body') assert.equal($topCandidate.get(0).tagName, 'body');
}) });
it("appends a sibling with a good enough score", () => { it('appends a sibling with a good enough score', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
let $ = cheerio.load(html) let $ = cheerio.load(html);
$ = scoreContent($) $ = scoreContent($);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal($($topCandidate).text().length, 3652) assert.equal($($topCandidate).text().length, 3652);
}) });
}) });

@ -237,7 +237,7 @@ const HTML = {
`, `,
after: ` after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div> <div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
` `,
}, },
// cleanImages // cleanImages
@ -252,7 +252,7 @@ const HTML = {
<div> <div>
<img width="50"> <img width="50">
</div> </div>
` `,
}, },
cleanHeight: { cleanHeight: {
before: ` before: `
@ -264,7 +264,7 @@ const HTML = {
<div> <div>
<img width="50"> <img width="50">
</div> </div>
` `,
}, },
cleanSpacer: { cleanSpacer: {
before: ` before: `
@ -279,7 +279,7 @@ const HTML = {
<img src="/foo/bar/baz/normal.png"> <img src="/foo/bar/baz/normal.png">
<p>Some text</p> <p>Some text</p>
</div> </div>
` `,
}, },
// stripJunkTags // stripJunkTags
stripsJunk: { stripsJunk: {
@ -298,7 +298,7 @@ const HTML = {
<div> <div>
<p>What an article</p> <p>What an article</p>
</div> </div>
` `,
}, },
// stripHOnes // stripHOnes
@ -314,7 +314,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
convertThreeHOnes: { convertThreeHOnes: {
before: ` before: `
@ -334,7 +334,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<h2>Can you believe it?!</h2> <h2>Can you believe it?!</h2>
</div> </div>
` `,
}, },
// cleanAttributes // cleanAttributes
@ -348,7 +348,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeAlign: { removeAlign: {
before: ` before: `
@ -360,7 +360,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
// removeEmpty // removeEmpty
@ -375,7 +375,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
doNotRemoveBr: { doNotRemoveBr: {
before: ` before: `
@ -392,7 +392,7 @@ const HTML = {
<div></div> <div></div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
doNotNested: { doNotNested: {
before: ` before: `
@ -409,7 +409,7 @@ const HTML = {
<p><img src="foo/bar.jpg" /></p> <p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
// cleanConditionally // cleanConditionally
@ -433,7 +433,7 @@ const HTML = {
</p> </p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeTooManyInputs: { removeTooManyInputs: {
before: ` before: `
@ -467,7 +467,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeShortNoImg: { removeShortNoImg: {
before: ` before: `
@ -490,7 +490,7 @@ const HTML = {
<img src="asdf"> <img src="asdf">
</div> </div>
</div> </div>
` `,
}, },
linkDensityHigh: { linkDensityHigh: {
@ -527,7 +527,7 @@ const HTML = {
<li>Keep this one</li> <li>Keep this one</li>
</ul> </ul>
</div> </div>
` `,
}, },
goodScoreTooDense: { goodScoreTooDense: {
before: ` before: `
@ -567,7 +567,7 @@ const HTML = {
<li>Keep this one</li> <li>Keep this one</li>
</ul> </ul>
</div> </div>
` `,
}, },
previousEndsInColon: { previousEndsInColon: {
before: ` before: `
@ -608,7 +608,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
`, `,
after: `What do you think?` after: 'What do you think?',
}, },
// cleanHeaders // cleanHeaders
@ -627,7 +627,7 @@ const HTML = {
<h2>Keep me</h2> <h2>Keep me</h2>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
cleanTitleMatch: { cleanTitleMatch: {
before: ` before: `
@ -642,7 +642,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
dropWithNegativeWeight: { dropWithNegativeWeight: {
before: ` before: `
@ -657,8 +657,8 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
} };
export default HTML export default HTML;

@ -82,6 +82,6 @@ const HTML = {
</article> </article>
<body> <body>
`, `,
} };
export default HTML export default HTML;

@ -3,27 +3,26 @@ import {
scoreNode, scoreNode,
getWeight, getWeight,
addToParent, addToParent,
} from './index' } from './index';
// gets and returns the score if it exists // gets and returns the score if it exists
// if not, initializes a score based on // if not, initializes a score based on
// the node's tag type // the node's tag type
export default function getOrInitScore($node, $, weightNodes=true) { export default function getOrInitScore($node, $, weightNodes = true) {
let score = getScore($node) let score = getScore($node);
if (score) { if (score) {
return score return score;
} else { }
score = scoreNode($node)
if (weightNodes) { score = scoreNode($node);
score = score + getWeight($node)
}
addToParent($node, $, score) if (weightNodes) {
score += getWeight($node);
} }
return score addToParent($node, $, score);
}
return score;
}

@ -1,61 +1,61 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
getOrInitScore, getOrInitScore,
getScore, getScore,
} from './index' } from './index';
describe('getOrInitScore(node, $)', () => { describe('getOrInitScore(node, $)', () => {
describe('when score set', () => { describe('when score set', () => {
it(`returns score if node's score already set`, () => { it('returns score if node\'s score already set', () => {
const html = '<p score="40">Foo</p>' const html = '<p score="40">Foo</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 40) assert.equal(score, 40);
}) });
}) });
describe('when no score set', () => { describe('when no score set', () => {
it(`returns 0 if no class/id and text < 25 chars`, () => { it('returns 0 if no class/id and text < 25 chars', () => {
const html = '<p>Foo</p>' const html = '<p>Foo</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`returns score if no class/id and has commas/length`, () => { it('returns score if no class/id and has commas/length', () => {
const $ = cheerio.load(HTML.score19) const $ = cheerio.load(HTML.score19);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 19) assert.equal(score, 19);
}) });
it(`returns greater score if weighted class/id is set`, () => { it('returns greater score if weighted class/id is set', () => {
const $ = cheerio.load(HTML.score44) const $ = cheerio.load(HTML.score44);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 44) assert.equal(score, 44);
}) });
it(`gives 1/4 of its score to its parent`, () => { it('gives 1/4 of its score to its parent', () => {
const $ = cheerio.load(HTML.score44Parent) const $ = cheerio.load(HTML.score44Parent);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) getOrInitScore(node, $);
assert.equal(getScore(node.parent()), 16) assert.equal(getScore(node.parent()), 16);
}) });
}) });
}) });

@ -2,5 +2,5 @@
// the node's score attribute // the node's score attribute
// returns null if no score set // returns null if no score set
export default function getScore($node) { export default function getScore($node) {
return parseFloat($node.attr('score')) || null return parseFloat($node.attr('score')) || null;
} }

@ -1,25 +1,22 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { getScore } from './index' import { getScore } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('getScore($node)', () => { describe('getScore($node)', () => {
it("returns null if the node has no score set", () => { it('returns null if the node has no score set', () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
const $node = $('p').first() const $node = $('p').first();
assert.equal(getScore($node), null) assert.equal(getScore($node), null);
}) });
it("returns 25 if the node has a score attr of 25", () => { it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('<p score="25">Foo</p>') const $ = cheerio.load('<p score="25">Foo</p>');
const $node = $('p').first() const $node = $('p').first();
assert.equal(typeof getScore($node), 'number') assert.equal(typeof getScore($node), 'number');
assert.equal(getScore($node), 25) assert.equal(getScore($node), 25);
}) });
});
}) });
})

@ -3,42 +3,42 @@ import {
POSITIVE_SCORE_RE, POSITIVE_SCORE_RE,
PHOTO_HINTS_RE, PHOTO_HINTS_RE,
READABILITY_ASSET, READABILITY_ASSET,
} from './constants' } from './constants';
// Get the score of a node based on its className and id. // Get the score of a node based on its className and id.
export default function getWeight(node) { export default function getWeight(node) {
const classes = node.attr('class') const classes = node.attr('class');
const id = node.attr('id') const id = node.attr('id');
let score = 0 let score = 0;
if (id) { if (id) {
// if id exists, try to score on both positive and negative // if id exists, try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(id)) { if (POSITIVE_SCORE_RE.test(id)) {
score = score + 25 score += 25;
} }
if (NEGATIVE_SCORE_RE.test(id)) { if (NEGATIVE_SCORE_RE.test(id)) {
score = score - 25 score -= 25;
} }
} }
if (classes) { if (classes) {
if (score == 0) { if (score === 0) {
// if classes exist and id did not contribute to score // if classes exist and id did not contribute to score
// try to score on both positive and negative // try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(classes)) { if (POSITIVE_SCORE_RE.test(classes)) {
score = score + 25 score += 25;
} }
if (NEGATIVE_SCORE_RE.test(classes)) { if (NEGATIVE_SCORE_RE.test(classes)) {
score = score - 25 score -= 25;
} }
} }
// even if score has been set by id, add score for // even if score has been set by id, add score for
// possible photo matches // possible photo matches
// "try to keep photos if we can" // "try to keep photos if we can"
if (PHOTO_HINTS_RE.test(classes)) { if (PHOTO_HINTS_RE.test(classes)) {
score = score + 10 score += 10;
} }
// add 25 if class matches entry-content-asset, // add 25 if class matches entry-content-asset,
@ -46,11 +46,10 @@ export default function getWeight(node) {
// Readability publisher guidelines // Readability publisher guidelines
// https://www.readability.com/developers/guidelines // https://www.readability.com/developers/guidelines
if (READABILITY_ASSET.test(classes)) { if (READABILITY_ASSET.test(classes)) {
score = score + 25 score += 25;
} }
} }
return score return score;
} }

@ -1,59 +1,58 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/get-weight' import HTML from './fixtures/get-weight';
import { import {
getWeight getWeight,
} from './index' } from './index';
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => { describe('getWeight(node)', () => {
it("returns a score of 25 if node has positive id", () => { it('returns a score of 25 if node has positive id', () => {
const $ = cheerio.load(HTML.positiveId) const $ = cheerio.load(HTML.positiveId);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of -25 if node has negative id", () => { it('returns a score of -25 if node has negative id', () => {
const $ = cheerio.load(HTML.negativeId) const $ = cheerio.load(HTML.negativeId);
assert.equal(getWeight($('div')), -25) assert.equal(getWeight($('div')), -25);
}) });
it("returns a score of 25 if node has positive class", () => { it('returns a score of 25 if node has positive class', () => {
const $ = cheerio.load(HTML.positiveClass) const $ = cheerio.load(HTML.positiveClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of -25 if node has negative class", () => { it('returns a score of -25 if node has negative class', () => {
const $ = cheerio.load(HTML.negativeClass) const $ = cheerio.load(HTML.negativeClass);
assert.equal(getWeight($('div')), -25) assert.equal(getWeight($('div')), -25);
}) });
it("returns a score of 25 if node has both positive id and class", () => { it('returns a score of 25 if node has both positive id and class', () => {
const $ = cheerio.load(HTML.positiveIdAndClass) const $ = cheerio.load(HTML.positiveIdAndClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of 25 if node has pos id and neg class", () => { it('returns a score of 25 if node has pos id and neg class', () => {
// is this really wanted? id="entry" class="adbox" // is this really wanted? id="entry" class="adbox"
// should get positive score? // should get positive score?
const $ = cheerio.load(HTML.positiveIdNegClass) const $ = cheerio.load(HTML.positiveIdNegClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of 10 if node has pos img class", () => { it('returns a score of 10 if node has pos img class', () => {
const $ = cheerio.load(HTML.positivePhotoClass) const $ = cheerio.load(HTML.positivePhotoClass);
assert.equal(getWeight($('div')), 10) assert.equal(getWeight($('div')), 10);
}) });
it("returns a score of 35 if node has pos id pos img class", () => { it('returns a score of 35 if node has pos id pos img class', () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto) const $ = cheerio.load(HTML.positiveIdAndPhoto);
assert.equal(getWeight($('div')), 35) assert.equal(getWeight($('div')), 35);
}) });
it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => { it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
const $ = cheerio.load(HTML.entryContentAsset) const $ = cheerio.load(HTML.entryContentAsset);
assert.equal(getWeight($('div')), 50) assert.equal(getWeight($('div')), 50);
}) });
});
}) });
})

@ -1,13 +1,13 @@
// Scoring // Scoring
export { default as getWeight } from './get-weight' export { default as getWeight } from './get-weight';
export { default as getScore } from './get-score' export { default as getScore } from './get-score';
export { default as scoreCommas } from './score-commas' export { default as scoreCommas } from './score-commas';
export { default as scoreLength } from './score-length' export { default as scoreLength } from './score-length';
export { default as scoreParagraph } from './score-paragraph' export { default as scoreParagraph } from './score-paragraph';
export { default as setScore } from './set-score' export { default as setScore } from './set-score';
export { default as addScore } from './add-score' export { default as addScore } from './add-score';
export { default as addToParent } from './add-to-parent' export { default as addToParent } from './add-to-parent';
export { default as getOrInitScore } from './get-or-init-score' export { default as getOrInitScore } from './get-or-init-score';
export { default as scoreNode } from './score-node' export { default as scoreNode } from './score-node';
export { default as scoreContent } from './score-content' export { default as scoreContent } from './score-content';
export { default as findTopCandidate } from './find-top-candidate' export { default as findTopCandidate } from './find-top-candidate';

@ -0,0 +1,79 @@
import {
textLength,
linkDensity,
} from 'utils/dom';
import { hasSentenceEnd } from 'utils/text';
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//
// Returns the candidate unchanged when it has no parent; otherwise
// returns a new wrapping <div> containing the candidate plus any
// qualifying siblings.
export default function mergeSiblings($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }

  // A sibling must score at least 20% of the top score (minimum 10)
  // to be merged into the result.
  const siblingScoreThreshold = Math.max(10, topScore * 0.2);
  const wrappingDiv = $('<div></div>');

  $candidate.parent().children().each((index, sibling) => {
    const $sibling = $(sibling);
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(sibling.tagName)) {
      return null;
    }

    const siblingScore = getScore($sibling);
    if (siblingScore) {
      // Compare underlying DOM nodes. Cheerio selections are never
      // reference-equal ($(node) builds a fresh wrapper each call), so
      // `$sibling === $candidate` would always be false and the candidate
      // itself would wrongly be re-scored as one of its own siblings.
      if ($sibling.get(0) === $candidate.get(0)) {
        wrappingDiv.append($sibling);
      } else {
        let contentBonus = 0;
        // extract to scoreLinkDensity() TODO
        const density = linkDensity($sibling);

        // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        }

        // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        }

        // If sibling node has the same class as
        // candidate, give it a bonus
        if ($sibling.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }

        // Reuse the score computed above rather than re-reading it
        // from the node a second time.
        const newScore = siblingScore + contentBonus;

        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($sibling);
        } else if (sibling.tagName === 'p') {
          // Short paragraphs can still qualify if they are link-free
          // and read like a sentence.
          const siblingContent = $sibling.text();
          const siblingContentLength = textLength(siblingContent);

          if (siblingContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($sibling);
          } else if (siblingContentLength <= 80 && density === 0 &&
                     hasSentenceEnd(siblingContent)) {
            return wrappingDiv.append($sibling);
          }
        }
      }
    }

    return null;
  });

  return wrappingDiv;
}

@ -1,5 +1,5 @@
// return 1 for every comma in text // return 1 for every comma in text
export default function scoreCommas(text) { export default function scoreCommas(text) {
return (text.match(/,/g) || []).length return (text.match(/,/g) || []).length;
} }

@ -1,20 +1,18 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { scoreCommas } from './index' import { scoreCommas } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreCommas(text)', () => { describe('scoreCommas(text)', () => {
it(`returns 0 if text has no commas`, () => { it('returns 0 if text has no commas', () => {
assert.equal(scoreCommas("Foo bar"), 0) assert.equal(scoreCommas('Foo bar'), 0);
}) });
it(`returns a point for every comma in the text`, () => {
assert.equal(scoreCommas('Foo, bar'), 1)
assert.equal(scoreCommas('Foo, bar, baz'), 2)
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3)
})
})
})
it('returns a point for every comma in the text', () => {
assert.equal(scoreCommas('Foo, bar'), 1);
assert.equal(scoreCommas('Foo, bar, baz'), 2);
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3);
});
});
});

@ -1,119 +1,69 @@
import { HNEWS_CONTENT_SELECTORS } from './constants' import { convertNodeTo } from 'utils/dom';
import { HNEWS_CONTENT_SELECTORS } from './constants';
import { import {
scoreNode, scoreNode,
setScore, setScore,
getOrInitScore, getOrInitScore,
addScore, addScore,
} from './index' } from './index';
import { convertNodeTo } from 'utils/dom' function convertSpans($node, $) {
if ($node.get(0)) {
// score content. Parents get the full value of their children's const { tagName } = $node.get(0);
// content score, grandparents half
export default function scoreContent($, weightNodes=true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80)
})
})
scorePs($, weightNodes) if (tagName === 'span') {
// convert spans to divs
convertNodeTo($node, $, 'div');
}
}
}
return $ function addScoreTo($node, $, score) {
if ($node) {
convertSpans($node, $);
addScore($node, $, score);
}
} }
function scorePs($, weightNodes) { function scorePs($, weightNodes) {
$('p, pre').toArray().map((node) => { $('p, pre').toArray().map((node) => {
// The raw score for this paragraph, before we add any parent/child // The raw score for this paragraph, before we add any parent/child
// scores. // scores.
let $node = $(node) let $node = $(node);
$node = setScore($node, $, getOrInitScore($node, $, weightNodes)) $node = setScore($node, $, getOrInitScore($node, $, weightNodes));
return $node return $node;
}).forEach(($node) => { }).forEach(($node) => {
// The parent scoring has to be done in a separate loop // The parent scoring has to be done in a separate loop
// because otherwise scoring the parent overwrites // because otherwise scoring the parent overwrites
// the score added to the child // the score added to the child
// Add the individual content score to the parent node // Add the individual content score to the parent node
const rawScore = scoreNode($node) const rawScore = scoreNode($node);
const $parent = $node.parent() const $parent = $node.parent();
addScoreTo($parent, $, rawScore, weightNodes) addScoreTo($parent, $, rawScore, weightNodes);
if ($parent) { if ($parent) {
// Add half of the individual content score to the // Add half of the individual content score to the
// grandparent // grandparent
addScoreTo($parent.parent(), $, rawScore/2, weightNodes) addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
} }
});
})
} }
function convertSpans($node, $) { // score content. Parents get the full value of their children's
if ($node.get(0)) { // content score, grandparents half
const { tagName } = $node.get(0) export default function scoreContent($, weightNodes = true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80);
});
});
if (tagName === 'span') { scorePs($, weightNodes);
// convert spans to divs
convertNodeTo($node, $, 'div')
}
}
}
function addScoreTo($node, $, score, weightNodes) { return $;
if ($node) {
convertSpans($node, $)
addScore($node, $, score)
}
} }
// def _score_content(self, doc, weight_nodes=True):
// for selector in constants.HNEWS_CONTENT_SELECTORS:
// # Not self.resource.extract_by_selector because our doc is a copy
// # of the resource doc.
// nodes = extract_by_selector(doc, selector,
// AttribMap(doc))
// for node in nodes:
// self._add_score(node, 80)
//
// paras = doc.xpath('.//p | .//pre')
//
// # If we don't have any paragraphs at all, we can't score based on
// # paragraphs, so return without modifying anything else.
// if len(paras) == 0:
// return doc
//
// for para in paras:
// # Don't score invalid tags
// if not isinstance(para.tag, basestring):
// continue
//
// # The raw score for this paragraph, before we add any parent/child
// # scores.
// raw_score = self._score_node(para)
// self._set_score(para, self._get_score(para, weight_nodes))
//
// parent = para.getparent()
// if parent is not None:
// if parent.tag == 'span':
// parent.tag = 'div'
//
// # Add the individual content score to the parent node
// self._add_score(parent, raw_score, weight_nodes=weight_nodes)
//
// grandparent = parent.getparent()
// if grandparent is not None:
// if grandparent.tag == 'span':
// grandparent.tag = 'div'
//
// # Add half of the individual content score to the
// # grandparent
// gp_score = raw_score / 2.0
// self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
//
// return doc

@ -1,47 +1,45 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import { clean } from 'test-helpers' import HTML from './fixtures/html';
import HTML from './fixtures/html'
import { import {
scoreContent, scoreContent,
getScore, getScore,
} from './index' } from './index';
// TODO: Walk through these and sanity check my scores // TODO: Walk through these and sanity check my scores
// Commented out scores were what I expected, but I was also // Commented out scores were what I expected, but I was also
// probably missing something when calculating // probably missing something when calculating
describe('scoreContent($, weightNodes)', () => { describe('scoreContent($, weightNodes)', () => {
it("loves hNews content", () => { it('loves hNews content', () => {
const $ = cheerio.load(HTML.hNews.before) const $ = cheerio.load(HTML.hNews.before);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('div').first()), 140) assert.equal(getScore($('div').first()), 140);
}) });
it("is so-so about non-hNews content", () => { it('is so-so about non-hNews content', () => {
const $ = cheerio.load(HTML.nonHNews.before) const $ = cheerio.load(HTML.nonHNews.before);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('div').first()), 65) assert.equal(getScore($('div').first()), 65);
}) });
it("scores this Wired article the same", () => { it('scores this Wired article the same', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('article').first()), 65.5) assert.equal(getScore($('article').first()), 65.5);
}) });
it("scores this Vulture article", () => { it('scores this Vulture article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
let $ = cheerio.load(html) let $ = cheerio.load(html);
$ = scoreContent($) $ = scoreContent($);
assert.equal($('p[score]').length, 62) assert.equal($('p[score]').length, 62);
}) });
});
})

@ -1,11 +1,10 @@
const idkRe = new RegExp('^(p|pre)$', 'i') const idkRe = new RegExp('^(p|pre)$', 'i');
export default function scoreLength(textLength, tagName='p') { export default function scoreLength(textLength, tagName = 'p') {
let score const chunks = textLength / 50;
const chunks = textLength / 50
if (chunks > 0) { if (chunks > 0) {
let lengthBonus let lengthBonus;
// No idea why p or pre are being tamped down here // No idea why p or pre are being tamped down here
// but just following the source for now // but just following the source for now
@ -13,14 +12,14 @@ export default function scoreLength(textLength, tagName='p') {
// since this is only being called from the context // since this is only being called from the context
// of scoreParagraph // of scoreParagraph
if (idkRe.test(tagName)) { if (idkRe.test(tagName)) {
lengthBonus = chunks - 2 lengthBonus = chunks - 2;
} else { } else {
lengthBonus = chunks - 1.25 lengthBonus = chunks - 1.25;
} }
return Math.min(Math.max(lengthBonus, 0), 3) return Math.min(Math.max(lengthBonus, 0), 3);
} else {
return 0
} }
return 0;
} }

@ -1,22 +1,21 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { scoreLength } from './index' import { scoreLength } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreLength(textLength, tagName)', () => { describe('scoreLength(textLength, tagName)', () => {
it(`returns 0 if length < 50 chars`, () => { it('returns 0 if length < 50 chars', () => {
assert.equal(scoreLength(30), 0) assert.equal(scoreLength(30), 0);
}) });
it(`returns varying scores but maxes out at 3`, () => { it('returns varying scores but maxes out at 3', () => {
assert.equal(scoreLength(150), 1) assert.equal(scoreLength(150), 1);
assert.equal(scoreLength(199), 1.98) assert.equal(scoreLength(199), 1.98);
assert.equal(scoreLength(200), 2) assert.equal(scoreLength(200), 2);
assert.equal(scoreLength(250), 3) assert.equal(scoreLength(250), 3);
assert.equal(scoreLength(500), 3) assert.equal(scoreLength(500), 3);
assert.equal(scoreLength(1500), 3) assert.equal(scoreLength(1500), 3);
}) });
}) });
}) });

@ -1,29 +1,29 @@
import { scoreParagraph } from './index' import { scoreParagraph } from './index';
import { import {
PARAGRAPH_SCORE_TAGS, PARAGRAPH_SCORE_TAGS,
CHILD_CONTENT_TAGS, CHILD_CONTENT_TAGS,
BAD_TAGS, BAD_TAGS,
} from './constants' } from './constants';
// Score an individual node. Has some smarts for paragraphs, otherwise // Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag. // just scores based on tag.
export default function scoreNode($node) { export default function scoreNode($node) {
const { tagName } = $node.get(0) const { tagName } = $node.get(0);
// TODO: Consider ordering by most likely. // TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page, // E.g., if divs are a more common tag on a page,
// Could save doing that regex test on every node AP // Could save doing that regex test on every node AP
if (PARAGRAPH_SCORE_TAGS.test(tagName)) { if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
return scoreParagraph($node) return scoreParagraph($node);
} else if (tagName === 'div') { } else if (tagName === 'div') {
return 5 return 5;
} else if (CHILD_CONTENT_TAGS.test(tagName)) { } else if (CHILD_CONTENT_TAGS.test(tagName)) {
return 3 return 3;
} else if (BAD_TAGS.test(tagName)) { } else if (BAD_TAGS.test(tagName)) {
return -3 return -3;
} else if (tagName === 'th') { } else if (tagName === 'th') {
return -5 return -5;
} }
return 0 return 0;
} }

@ -1,95 +1,94 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
scoreNode, scoreNode,
scoreParagraph, scoreParagraph,
} from './index' } from './index';
describe('scoreNode(node)', () => { describe('scoreNode(node)', () => {
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const html = '<p><em>Foo</em> bar</p>' const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let node = $('p').first() const node = $('p').first();
const score = scoreNode(node) const score = scoreNode(node);
const pScore = scoreParagraph(node) const pScore = scoreParagraph(node);
assert.equal(score, pScore) assert.equal(score, pScore);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score1) const $ = cheerio.load(HTML.score1);
let node = $('p').first() const node = $('p').first();
const score = scoreNode(node) const score = scoreNode(node);
const pScore = scoreParagraph(node) const pScore = scoreParagraph(node);
assert.equal(score, pScore) assert.equal(score, pScore);
assert.equal(score, 1) assert.equal(score, 1);
});
}) it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score3);
const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.score3) const pScore = scoreParagraph(node);
let node = $('p').first()
const score = scoreNode(node) assert.equal(score, pScore);
const pScore = scoreParagraph(node) assert.equal(score, 3);
});
assert.equal(score, pScore) it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
assert.equal(score, 3) const $ = cheerio.load(HTML.score19);
}) const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.score19) const pScore = scoreParagraph(node);
let node = $('p').first()
const score = scoreNode(node) assert.equal(score, pScore);
const pScore = scoreParagraph(node) assert.equal(score, 19);
});
assert.equal(score, pScore) it('scores divs with 5', () => {
assert.equal(score, 19) const $ = cheerio.load(HTML.divScore5);
}) const node = $('div').first();
it(`scores divs with 5`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.divScore5)
let node = $('div').first()
const score = scoreNode(node) assert.equal(score, 5);
});
assert.equal(score, 5) it('scores the blockquote family with 3', () => {
}) const $ = cheerio.load(HTML.blockquoteScore3);
const node = $('blockquote').first();
it(`scores the blockquote family with 3`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.blockquoteScore3)
let node = $('blockquote').first()
const score = scoreNode(node) assert.equal(score, 3);
});
assert.equal(score, 3) it('scores a form with negative 3', () => {
}) const $ = cheerio.load(HTML.formScoreNeg3);
const node = $('form').first();
it(`scores a form with negative 3`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.formScoreNeg3)
let node = $('form').first()
const score = scoreNode(node) assert.equal(score, -3);
});
assert.equal(score, -3) it('scores a TH element with negative 5', () => {
}) const $ = cheerio.load(HTML.thScoreNeg5);
const node = $('th').first();
it(`scores a TH element with negative 5`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.thScoreNeg5)
let node = $('th').first()
const score = scoreNode(node) assert.equal(score, -5);
});
assert.equal(score, -5) });
})
})

@ -1,35 +1,35 @@
import { import {
scoreCommas, scoreCommas,
scoreLength, scoreLength,
} from './index' } from './index';
// Score a paragraph using various methods. Things like number of // Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better. // commas, etc. Higher is better.
export default function scoreParagraph(node) { export default function scoreParagraph(node) {
let score = 1 let score = 1;
const text = node.text().trim() const text = node.text().trim();
const textLength = text.length const textLength = text.length;
// If this paragraph is less than 25 characters, don't count it. // If this paragraph is less than 25 characters, don't count it.
if (textLength < 25) { if (textLength < 25) {
return 0 return 0;
} }
// Add points for any commas within this paragraph // Add points for any commas within this paragraph
score = score + scoreCommas(text) score += scoreCommas(text);
// For every 50 characters in this paragraph, add another point. Up // For every 50 characters in this paragraph, add another point. Up
// to 3 points. // to 3 points.
score = score + scoreLength(textLength) score += scoreLength(textLength);
// Articles can end with short paragraphs when people are being clever // Articles can end with short paragraphs when people are being clever
// but they can also end with short paragraphs setting up lists of junk // but they can also end with short paragraphs setting up lists of junk
// that we strip. This negative tweaks junk setup paragraphs just below // that we strip. This negative tweaks junk setup paragraphs just below
// the cutoff threshold. // the cutoff threshold.
if (text.slice(-1) === ':') { if (text.slice(-1) === ':') {
score = score - 1 score -= 1;
} }
return score return score;
} }

@ -1,48 +1,48 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
scoreParagraph, scoreParagraph,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreParagraph(node)', () => { describe('scoreParagraph(node)', () => {
it(`returns 0 if text is less than 25 chars`, () => { it('returns 0 if text is less than 25 chars', () => {
const html = '<p><em>Foo</em> bar</p>' const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`returns 1 if text is > 25 chars and has 0 commas`, () => { it('returns 1 if text is > 25 chars and has 0 commas', () => {
const $ = cheerio.load(HTML.score1) const $ = cheerio.load(HTML.score1);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 1) assert.equal(score, 1);
}) });
it(`returns 3 if text is > 25 chars and has 2 commas`, () => { it('returns 3 if text is > 25 chars and has 2 commas', () => {
const $ = cheerio.load(HTML.score3) const $ = cheerio.load(HTML.score3);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 3) assert.equal(score, 3);
}) });
it(`returns 19 if text has 15 commas, ~600 chars`, () => { it('returns 19 if text has 15 commas, ~600 chars', () => {
const $ = cheerio.load(HTML.score19) const $ = cheerio.load(HTML.score19);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 19) assert.equal(score, 19);
}) });
}) });
}) });

@ -1,7 +1,6 @@
export default function setScore($node, $, score) { export default function setScore($node, $, score) {
$node.attr('score', score) $node.attr('score', score);
return $node return $node;
} }

@ -1,23 +1,22 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
setScore, setScore,
getScore getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => { describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => { it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first() let $node = $('p').first();
const newScore = 25 const newScore = 25;
$node = setScore($node, $, newScore) $node = setScore($node, $, newScore);
const score = getScore($node) const score = getScore($node);
assert(score, newScore) assert(score, newScore);
}) });
}) });
}) });

@ -3,23 +3,23 @@
// should be lowercase for faster case-insensitive matching. // should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct. // From most distinct to least distinct.
export const DATE_PUBLISHED_META_TAGS = [ export const DATE_PUBLISHED_META_TAGS = [
'article:published_time', 'article:published_time',
'displaydate', 'displaydate',
'dc.date', 'dc.date',
'dc.date.issued', 'dc.date.issued',
'rbpubdate', 'rbpubdate',
'publish_date', 'publish_date',
'pub_date', 'pub_date',
'pagedate', 'pagedate',
'pubdate', 'pubdate',
'revision_date', 'revision_date',
'doc_date', 'doc_date',
'date_created', 'date_created',
'content_create_date', 'content_create_date',
'lastmodified', 'lastmodified',
'created', 'created',
'date' 'date',
] ];
// An ordered list of XPath Selectors to find // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit // likely date published dates. From most explicit
@ -42,20 +42,20 @@ export const DATE_PUBLISHED_SELECTORS = [
'#story .datetime', '#story .datetime',
'.dateline', '.dateline',
'.pubdate', '.pubdate',
] ];
// An ordered list of compiled regular expressions to find likely date // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first // published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse // reference be a date string that is parseable by dateutil.parser.parse
const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)' const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
export const DATE_PUBLISHED_URL_RES = [ export const DATE_PUBLISHED_URL_RES = [
// /2012/01/27/ but not /2012/01/293 // /2012/01/27/ but not /2012/01/293
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
// 20120127 or 20120127T but not 2012012733 or 8201201733 // 20120127 or 20120127T but not 2012012733 or 8201201733
// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i, // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
// 2012-01-27 // 2012-01-27
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
// /2012/jan/27/ // /2012/jan/27/
new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i') new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'),
] ];

@ -1,37 +1,36 @@
import { cleanDatePublished } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { extractFromUrl } from 'utils/text';
import { import {
DATE_PUBLISHED_META_TAGS, DATE_PUBLISHED_META_TAGS,
DATE_PUBLISHED_SELECTORS, DATE_PUBLISHED_SELECTORS,
DATE_PUBLISHED_URL_RES, DATE_PUBLISHED_URL_RES,
} from './constants' } from './constants';
import { cleanDatePublished } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom'
import { extractFromUrl } from 'utils/text'
const GenericDatePublishedExtractor = { const GenericDatePublishedExtractor = {
extract({ $, url, metaCache }) { extract({ $, url, metaCache }) {
let datePublished let datePublished;
// First, check to see if we have a matching meta tag // First, check to see if we have a matching meta tag
// that we can make use of. // that we can make use of.
// Don't try cleaning tags from this string // Don't try cleaning tags from this string
datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false) datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
// Second, look through our selectors looking for potential // Second, look through our selectors looking for potential
// date_published's. // date_published's.
datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS) datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
// Lastly, look to see if a dately string exists in the URL // Lastly, look to see if a dately string exists in the URL
datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES) datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
return null return null;
} },
} };
export default GenericDatePublishedExtractor export default GenericDatePublishedExtractor;

@ -1,97 +1,95 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import moment from 'moment' import moment from 'moment';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericDatePublishedExtractor from './extractor' import GenericDatePublishedExtractor from './extractor';
describe('GenericDatePublishedExtractor', () => { describe('GenericDatePublishedExtractor', () => {
describe('extract($, metaCache)', () => { describe('extract($, metaCache)', () => {
it('extracts datePublished from meta tags', () => { it('extracts datePublished from meta tags', () => {
const $ = cheerio.load(HTML.datePublishedMeta.test) const $ = cheerio.load(HTML.datePublishedMeta.test);
const metaCache = ["displaydate", "something-else"] const metaCache = ['displaydate', 'something-else'];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal( assert.equal(
result, result,
HTML.datePublishedMeta.result.toISOString() HTML.datePublishedMeta.result.toISOString()
) );
}) });
it('extracts datePublished from selectors', () => { it('extracts datePublished from selectors', () => {
const $ = cheerio.load(HTML.datePublishedSelectors.test) const $ = cheerio.load(HTML.datePublishedSelectors.test);
const metaCache = [] const metaCache = [];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal( assert.equal(
result, result,
HTML.datePublishedMeta.result.toISOString() HTML.datePublishedMeta.result.toISOString()
) );
}) });
it('extracts from url formatted /2012/08/01/etc', () => { it('extracts from url formatted /2012/08/01/etc', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2012/08/01/this-is-good' const url = 'https://example.com/2012/08/01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
new Date('2012/08/01').toISOString() new Date('2012/08/01').toISOString()
) );
}) });
it('extracts from url formatted /2020-01-01', () => { it('extracts from url formatted /2020-01-01', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2020-01-01/this-is-good' const url = 'https://example.com/2020-01-01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
moment(new Date('2020-01-01')).toISOString() moment(new Date('2020-01-01')).toISOString()
) );
}) });
it('extracts from url formatted /2020/jan/01', () => { it('extracts from url formatted /2020/jan/01', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2020/jan/01/this-is-good' const url = 'https://example.com/2020/jan/01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
new Date('2020/jan/01').toISOString() new Date('2020/jan/01').toISOString()
) );
}) });
it('returns null if no date can be found', () => { it('returns null if no date can be found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal(result, null)
})
})
})
assert.equal(result, null);
});
});
});

@ -7,7 +7,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: new Date('1/1/2020 8:30 (EST)') result: new Date('1/1/2020 8:30 (EST)'),
}, },
datePublishedSelectors: { datePublishedSelectors: {
test: ` test: `
@ -19,8 +19,8 @@ const HTML = {
</head> </head>
</div> </div>
`, `,
result: new Date('1/1/2020 8:30 am (EST)') result: new Date('1/1/2020 8:30 am (EST)'),
}, },
} };
export default HTML export default HTML;

@ -1,27 +1,28 @@
import { // import {
DEK_META_TAGS, // DEK_META_TAGS,
DEK_SELECTORS, // DEK_SELECTORS,
DEK_URL_RES, // DEK_URL_RES,
} from './constants' // } from './constants';
import { cleanDek } from 'cleaners' // import { cleanDek } from 'cleaners';
import { // import {
extractFromMeta, // extractFromMeta,
extractFromSelectors, // extractFromSelectors,
} from 'utils/dom' // } from 'utils/dom';
// Currently there is only one selector for // Currently there is only one selector for
// deks. We should simply return null here // deks. We should simply return null here
// until we have a more robust generic option. // until we have a more robust generic option.
// Below is the original source for this, for reference. // Below is the original source for this, for reference.
const GenericDekExtractor = { const GenericDekExtractor = {
extract({ $, content, metaCache }) { // extract({ $, content, metaCache }) {
return null extract() {
} return null;
} },
};
export default GenericDekExtractor export default GenericDekExtractor;
// def extract_dek(self): // def extract_dek(self):
// # First, check to see if we have a matching meta tag that we can make // # First, check to see if we have a matching meta tag that we can make

@ -1,20 +1,18 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
// import HTML from './fixtures/html' // import HTML from './fixtures/html'
import GenericDekExtractor from './extractor' import GenericDekExtractor from './extractor';
describe('GenericDekExtractor', () => { describe('GenericDekExtractor', () => {
describe('extract({ $, metaCache })', () => { describe('extract({ $, metaCache })', () => {
it('returns null if no dek can be found', () => { it('returns null if no dek can be found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const result = const result =
GenericDekExtractor.extract({ $, metaCache }) GenericDekExtractor.extract({ $, metaCache });
assert.equal(result, null)
})
}) assert.equal(result, null);
}) });
});
});

@ -1,50 +1,50 @@
import cheerio from 'cheerio' import cheerio from 'cheerio';
import GenericContentExtractor from './content/extractor' import GenericContentExtractor from './content/extractor';
import GenericTitleExtractor from './title/extractor' import GenericTitleExtractor from './title/extractor';
import GenericAuthorExtractor from './author/extractor' import GenericAuthorExtractor from './author/extractor';
import GenericDatePublishedExtractor from './date-published/extractor' import GenericDatePublishedExtractor from './date-published/extractor';
import GenericDekExtractor from './dek/extractor' import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor' import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor' import GenericNextPageUrlExtractor from './next-page-url/extractor';
const GenericExtractor = { const GenericExtractor = {
// This extractor is the default for all domains // This extractor is the default for all domains
domain: '*', domain: '*',
title: GenericTitleExtractor.extract, title: GenericTitleExtractor.extract,
datePublished : GenericDatePublishedExtractor.extract, datePublished: GenericDatePublishedExtractor.extract,
author: GenericAuthorExtractor.extract, author: GenericAuthorExtractor.extract,
content: GenericContentExtractor.extract.bind(GenericContentExtractor), content: GenericContentExtractor.extract.bind(GenericContentExtractor),
leadImageUrl: GenericLeadImageUrlExtractor.extract, leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract, dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract, nextPageUrl: GenericNextPageUrlExtractor.extract,
extract: function(options) { extract(options) {
let { html } = options const { html } = options;
if (html) { if (html) {
const $ = cheerio.load(html) const $ = cheerio.load(html);
options.$ = $ options.$ = $;
} }
const title = this.title(options) const title = this.title(options);
const datePublished = this.datePublished(options) const datePublished = this.datePublished(options);
const author = this.author(options) const author = this.author(options);
const content = this.content({ ...options, title }) const content = this.content({ ...options, title });
const leadImageUrl = this.leadImageUrl(options) const leadImageUrl = this.leadImageUrl(options);
const dek = this.dek(options) const dek = this.dek(options);
const nextPageUrl = this.nextPageUrl(options) const nextPageUrl = this.nextPageUrl(options);
return { return {
title, title,
author, author,
datePublished: datePublished ? datePublished : null, datePublished: datePublished || null,
dek, dek,
leadImageUrl, leadImageUrl,
content, content,
nextPageUrl, nextPageUrl,
} };
} },
} };
export default GenericExtractor export default GenericExtractor;

@ -1,14 +1,12 @@
import assert from 'assert' import assert from 'assert';
import fs from 'fs' import fs from 'fs';
import { clean } from 'test-helpers' import GenericExtractor from './index';
import GenericExtractor from './index'
describe('GenericExtractor', () => { describe('GenericExtractor', () => {
describe('extract(opts)', () => { describe('extract(opts)', () => {
it("extracts this old LA Times article", () => { it('extracts this old LA Times article', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
const { const {
title, title,
@ -16,23 +14,23 @@ describe('GenericExtractor', () => {
datePublished, datePublished,
dek, dek,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] } { url: 'http://latimes.com', html, metaCache: [] }
) );
assert.equal(author, null) assert.equal(author, null);
assert.equal( assert.equal(
title, title,
'California appears poised to be first to ban power-guzzling big-screen TVs' 'California appears poised to be first to ban power-guzzling big-screen TVs'
) );
assert.equal( assert.equal(
datePublished, datePublished,
'2009-10-14T04:00:00.000Z' '2009-10-14T04:00:00.000Z'
) );
assert.equal(dek, null) assert.equal(dek, null);
}) });
it("extracts html and returns the article title", () => { it('extracts html and returns the article title', () => {
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8') const html = fs.readFileSync('../fixtures/wired.html', 'utf-8');
const { const {
author, author,
@ -40,18 +38,17 @@ describe('GenericExtractor', () => {
datePublished, datePublished,
dek, dek,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] } { url: 'http://wired.com', html, metaCache: [] }
) );
assert.equal(author, 'Eric Adams') assert.equal(author, 'Eric Adams');
assert.equal( assert.equal(
title, title,
'Airplane Tires Dont Explode on Landing Because They Are Pumped!' 'Airplane Tires Dont Explode on Landing Because They Are Pumped!'
) );
assert.equal(datePublished, null) assert.equal(datePublished, null);
assert.equal(dek, null) assert.equal(dek, null);
}) });
});
}) });
})

@ -2,52 +2,52 @@
// All attributes should be lowercase for faster case-insensitive matching. // All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct. // From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [ export const LEAD_IMAGE_URL_META_TAGS = [
'og:image', 'og:image',
'twitter:image', 'twitter:image',
'image_src', 'image_src',
] ];
export const LEAD_IMAGE_URL_SELECTORS = [ export const LEAD_IMAGE_URL_SELECTORS = [
'link[rel=image_src]', 'link[rel=image_src]',
] ];
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [ export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload', 'upload',
'wp-content', 'wp-content',
'large', 'large',
'photo', 'photo',
'wp-image', 'wp-image',
] ];
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer', 'spacer',
'sprite', 'sprite',
'blank', 'blank',
'throbber', 'throbber',
'gradient', 'gradient',
'tile', 'tile',
'bg', 'bg',
'background', 'background',
'icon', 'icon',
'social', 'social',
'header', 'header',
'hdr', 'hdr',
'advert', 'advert',
'spinner', 'spinner',
'loader', 'loader',
'loading', 'loading',
'default', 'default',
'rating', 'rating',
'share', 'share',
'facebook', 'facebook',
'twitter', 'twitter',
'theme', 'theme',
'promo', 'promo',
'ads', 'ads',
'wp-includes', 'wp-includes',
] ];
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const GIF_RE = /\.gif(\?.*)?$/i export const GIF_RE = /\.gif(\?.*)?$/i;
export const JPG_RE = /\.jpe?g(\?.*)?$/i export const JPG_RE = /\.jpe?g(\?.*)?$/i;

@ -1,14 +1,12 @@
import 'babel-polyfill' import 'babel-polyfill';
import { extractFromMeta } from 'utils/dom';
import { cleanImage } from 'cleaners';
import { import {
LEAD_IMAGE_URL_META_TAGS, LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS, LEAD_IMAGE_URL_SELECTORS,
} from './constants' } from './constants';
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
import { import {
scoreImageUrl, scoreImageUrl,
@ -17,9 +15,7 @@ import {
scoreBySibling, scoreBySibling,
scoreByDimensions, scoreByDimensions,
scoreByPosition, scoreByPosition,
} from './score-image' } from './score-image';
import { cleanImage } from 'cleaners'
// Given a resource, try to find the lead image URL from within // Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system // it. Like content and next page extraction, uses a scoring system
@ -31,86 +27,87 @@ import { cleanImage } from 'cleaners'
// * weird aspect ratio // * weird aspect ratio
const GenericLeadImageUrlExtractor = { const GenericLeadImageUrlExtractor = {
extract({ $, content, metaCache }) { extract({ $, content, metaCache }) {
let imageUrl, cleanUrl let cleanUrl;
// Check to see if we have a matching meta tag that we can make use of. // Check to see if we have a matching meta tag that we can make use of.
// Moving this higher because common practice is now to use large // Moving this higher because common practice is now to use large
// images on things like Open Graph or Twitter cards. // images on things like Open Graph or Twitter cards.
// images usually have for things like Open Graph. // images usually have for things like Open Graph.
imageUrl = const imageUrl =
extractFromMeta( extractFromMeta(
$, $,
LEAD_IMAGE_URL_META_TAGS, LEAD_IMAGE_URL_META_TAGS,
metaCache, metaCache,
false false
) );
if (imageUrl) { if (imageUrl) {
cleanUrl = cleanImage(imageUrl) cleanUrl = cleanImage(imageUrl);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
// Next, try to find the "best" image via the content. // Next, try to find the "best" image via the content.
// We'd rather not have to fetch each image and check dimensions, // We'd rather not have to fetch each image and check dimensions,
// so try to do some analysis and determine them instead. // so try to do some analysis and determine them instead.
const imgs = $('img', content).toArray() const imgs = $('img', content).toArray();
let imgScores = {} const imgScores = {};
imgs.forEach((img, index) => { imgs.forEach((img, index) => {
const $img = $(img) const $img = $(img);
const src = $img.attr('src') const src = $img.attr('src');
if (!src) return if (!src) return;
let score = scoreImageUrl(src) let score = scoreImageUrl(src);
score = score + scoreAttr($img) score += scoreAttr($img);
score = score + scoreByParents($img) score += scoreByParents($img);
score = score + scoreBySibling($img) score += scoreBySibling($img);
score = score + scoreByDimensions($img) score += scoreByDimensions($img);
score = score + scoreByPosition(imgs, index) score += scoreByPosition(imgs, index);
imgScores[src] = score imgScores[src] = score;
}) });
const [topUrl, topScore] = const [topUrl, topScore] =
Reflect.ownKeys(imgScores).reduce((acc, key) => Reflect.ownKeys(imgScores).reduce((acc, key) =>
imgScores[key] > acc[1] ? [key, imgScores[key]] : acc imgScores[key] > acc[1] ? [key, imgScores[key]] : acc
, [null, 0]) , [null, 0]);
if (topScore > 0) { if (topScore > 0) {
cleanUrl = cleanImage(topUrl) cleanUrl = cleanImage(topUrl);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
// If nothing else worked, check to see if there are any really // If nothing else worked, check to see if there are any really
// probable nodes in the doc, like <link rel="image_src" />. // probable nodes in the doc, like <link rel="image_src" />.
for (const selector of LEAD_IMAGE_URL_SELECTORS) { for (const selector of LEAD_IMAGE_URL_SELECTORS) {
const $node = $(selector).first() const $node = $(selector).first();
const src = $node.attr('src') const src = $node.attr('src');
if (src) { if (src) {
cleanUrl = cleanImage(src) cleanUrl = cleanImage(src);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
const href = $node.attr('href') const href = $node.attr('href');
if (href) { if (href) {
cleanUrl = cleanImage(href) cleanUrl = cleanImage(href);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
const value = $node.attr('value') const value = $node.attr('value');
if (value) { if (value) {
cleanUrl = cleanImage(value) cleanUrl = cleanImage(value);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
} }
return null;
}, },
} };
export default GenericLeadImageUrlExtractor export default GenericLeadImageUrlExtractor;
// def extract(self): // def extract(self):
// """ // """
@ -182,7 +179,7 @@ export default GenericLeadImageUrlExtractor
// if sibling is not None: // if sibling is not None:
// if sibling.tag == 'figcaption': // if sibling.tag == 'figcaption':
// img_score += 25 // img_score += 25
// //
// sib_sig = ' '.join([sibling.get('id', ''), // sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower() // sibling.get('class', '')]).lower()
// if 'caption' in sib_sig: // if 'caption' in sib_sig:
@ -215,7 +212,7 @@ export default GenericLeadImageUrlExtractor
// //
// if img_width and img_height and not 'sprite' in img_path: // if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height // area = img_width * img_height
// //
// if area < 5000: # Smaller than 50x100 // if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.') // logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100 // img_score -= 100

@ -1,62 +1,62 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericLeadImageUrlExtractor from './extractor' import GenericLeadImageUrlExtractor from './extractor';
describe('GenericLeadImageUrlExtractor', () => { describe('GenericLeadImageUrlExtractor', () => {
describe('extract({ $, content, metaCache })', () => { describe('extract({ $, content, metaCache })', () => {
it('returns og:image first', () => { it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test) const $ = cheerio.load(HTML.og.test);
const content = $('*').first() const content = $('*').first();
const metaCache = ['og:image'] const metaCache = ['og:image'];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.og.result) assert.equal(result, HTML.og.result);
}) });
it('returns twitter:image', () => { it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test) const $ = cheerio.load(HTML.twitter.test);
const content = $('*').first() const content = $('*').first();
const metaCache = ['twitter:image'] const metaCache = ['twitter:image'];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.twitter.result) assert.equal(result, HTML.twitter.result);
}) });
it('finds images based on scoring', () => { it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test) const $ = cheerio.load(HTML.scoring.test);
const content = $('*').first() const content = $('*').first();
const metaCache = [] const metaCache = [];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.scoring.result) assert.equal(result, HTML.scoring.result);
}) });
it('returns image based on selectors', () => { it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test) const $ = cheerio.load(HTML.selectors.test);
const content = $('*').first() const content = $('*').first();
const metaCache = [] const metaCache = [];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.selectors.result) assert.equal(result, HTML.selectors.result);
}) });
}) });
}) });

@ -7,7 +7,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: `http://example.com/lead.jpg` result: 'http://example.com/lead.jpg',
}, },
twitter: { twitter: {
test: ` test: `
@ -17,7 +17,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: `http://example.com/lead.jpg` result: 'http://example.com/lead.jpg',
}, },
scoring: { scoring: {
test: ` test: `
@ -27,7 +27,7 @@ const HTML = {
<img src="http://example.com/upload/whateverpic.png" /> <img src="http://example.com/upload/whateverpic.png" />
</div> </div>
`, `,
result: `http://example.com/upload/goodpic.jpg` result: 'http://example.com/upload/goodpic.jpg',
}, },
selectors: { selectors: {
test: ` test: `
@ -35,8 +35,8 @@ const HTML = {
<link rel="image_src" href="http://example.com/upload/goodpic.jpg"> <link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div> </div>
`, `,
result: `http://example.com/upload/goodpic.jpg` result: 'http://example.com/upload/goodpic.jpg',
}, },
} };
export default HTML export default HTML;

@ -3,123 +3,123 @@ import {
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE, NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
GIF_RE, GIF_RE,
JPG_RE, JPG_RE,
} from './constants' } from './constants';
import { PHOTO_HINTS_RE } from '../content/scoring/constants' import { PHOTO_HINTS_RE } from '../content/scoring/constants';
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`;
}
// Scores image urls based on a variety of heuristics. // Scores image urls based on a variety of heuristics.
export function scoreImageUrl(url) { export function scoreImageUrl(url) {
url = url.trim() url = url.trim();
let score = 0 let score = 0;
if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score + 20 score += 20;
} }
if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score - 20 score -= 20;
} }
// TODO: We might want to consider removing this as // TODO: We might want to consider removing this as
// gifs are much more common/popular than they once were // gifs are much more common/popular than they once were
if (GIF_RE.test(url)) { if (GIF_RE.test(url)) {
score = score - 10 score -= 10;
} }
if (JPG_RE.test(url)) { if (JPG_RE.test(url)) {
score = score + 10 score += 10;
} }
// PNGs are neutral. // PNGs are neutral.
return score return score;
} }
// Alt attribute usually means non-presentational image. // Alt attribute usually means non-presentational image.
export function scoreAttr($img) { export function scoreAttr($img) {
if ($img.attr('alt')) { if ($img.attr('alt')) {
return 5 return 5;
} else {
return 0
} }
return 0;
} }
// Look through our parent and grandparent for figure-like // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them // container elements, give a bonus if we find them
export function scoreByParents($img) { export function scoreByParents($img) {
let score = 0 let score = 0;
const $figParent = $img.parents('figure').first() const $figParent = $img.parents('figure').first();
if ($figParent.length === 1) { if ($figParent.length === 1) {
score = score + 25 score += 25;
} }
const $parent = $img.parent() const $parent = $img.parent();
let $gParent let $gParent;
if ($parent.length === 1) { if ($parent.length === 1) {
$gParent = $parent.parent() $gParent = $parent.parent();
} }
[$parent, $gParent].forEach($node => { [$parent, $gParent].forEach(($node) => {
if (PHOTO_HINTS_RE.test(getSig($node))) { if (PHOTO_HINTS_RE.test(getSig($node))) {
score = score + 15 score += 15;
} }
}) });
return score return score;
} }
// Look at our immediate sibling and see if it looks like it's a // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so. // caption. Bonus if so.
export function scoreBySibling($img) { export function scoreBySibling($img) {
let score = 0 let score = 0;
const $sibling = $img.next() const $sibling = $img.next();
const sibling = $sibling.get(0) const sibling = $sibling.get(0);
if (sibling && sibling.tagName === 'figcaption') { if (sibling && sibling.tagName === 'figcaption') {
score = score + 25 score += 25;
} }
if (PHOTO_HINTS_RE.test(getSig($sibling))) { if (PHOTO_HINTS_RE.test(getSig($sibling))) {
score = score + 15 score += 15;
} }
return score return score;
} }
export function scoreByDimensions($img) { export function scoreByDimensions($img) {
let score = 0 let score = 0;
const width = parseFloat($img.attr('width')) const width = parseFloat($img.attr('width'));
const height = parseFloat($img.attr('height')) const height = parseFloat($img.attr('height'));
const src = $img.attr('src') const src = $img.attr('src');
// Penalty for skinny images // Penalty for skinny images
if (width && width <= 50) { if (width && width <= 50) {
score = score - 50 score -= 50;
} }
// Penalty for short images // Penalty for short images
if (height && height <= 50) { if (height && height <= 50) {
score = score - 50 score -= 50;
} }
if (width && height && !src.includes('sprite')) { if (width && height && !src.includes('sprite')) {
const area = width * height const area = width * height;
if (area < 5000) { // Smaller than 50 x 100 if (area < 5000) { // Smaller than 50 x 100
score = score - 100 score -= 100;
} else { } else {
score = score + Math.round(area/1000) score += Math.round(area / 1000);
} }
} }
return score return score;
} }
export function scoreByPosition($imgs, index) { export function scoreByPosition($imgs, index) {
return $imgs.length/2 - index return ($imgs.length / 2) - index;
}
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`
} }

@ -1,5 +1,5 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
scoreImageUrl, scoreImageUrl,
@ -8,61 +8,61 @@ import {
scoreBySibling, scoreBySibling,
scoreByDimensions, scoreByDimensions,
scoreByPosition, scoreByPosition,
} from './score-image' } from './score-image';
describe('scoreImageUrlUrl(url)', () => { describe('scoreImageUrlUrl(url)', () => {
it('gets 20 points for a positive lead img hint', () => { it('gets 20 points for a positive lead img hint', () => {
const url = 'http://example.com/upload/img.png' const url = 'http://example.com/upload/img.png';
assert.equal(scoreImageUrl(url), 20) assert.equal(scoreImageUrl(url), 20);
}) });
it('loses 20 points for a negative lead img hint', () => { it('loses 20 points for a negative lead img hint', () => {
const url = 'http://example.com/sprite/foo/bar.png' const url = 'http://example.com/sprite/foo/bar.png';
assert.equal(scoreImageUrl(url), -20) assert.equal(scoreImageUrl(url), -20);
}) });
it('loses 10 points for a gif', () => { it('loses 10 points for a gif', () => {
const url = 'http://example.com/foo/bar.gif' const url = 'http://example.com/foo/bar.gif';
assert.equal(scoreImageUrl(url), -10) assert.equal(scoreImageUrl(url), -10);
const url2 = 'http://example.com/foogif/bar' const url2 = 'http://example.com/foogif/bar';
assert.equal(scoreImageUrl(url2), 0) assert.equal(scoreImageUrl(url2), 0);
}) });
it('gains 10 points for a jpg', () => { it('gains 10 points for a jpg', () => {
const url = 'http://example.com/foo/bar.jpg' const url = 'http://example.com/foo/bar.jpg';
assert.equal(scoreImageUrl(url), 10) assert.equal(scoreImageUrl(url), 10);
const url2 = 'http://example.com/foo/bar.jpeg' const url2 = 'http://example.com/foo/bar.jpeg';
assert.equal(scoreImageUrl(url2), 10) assert.equal(scoreImageUrl(url2), 10);
const url3 = 'http://example.com/foojpg/bar' const url3 = 'http://example.com/foojpg/bar';
assert.equal(scoreImageUrl(url3), 0) assert.equal(scoreImageUrl(url3), 0);
const url4 = 'http://example.com/foo.jpg?bar=baz' const url4 = 'http://example.com/foo.jpg?bar=baz';
assert.equal(scoreImageUrl(url4), 10) assert.equal(scoreImageUrl(url4), 10);
}) });
}) });
describe('scoreAttr($img)', () => { describe('scoreAttr($img)', () => {
it('gets 5 points if the img node has an alt attribute', () => { it('gets 5 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img alt="Wow" /></div>') const $ = cheerio.load('<div><img alt="Wow" /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreAttr($img), 5) assert.equal(scoreAttr($img), 5);
}) });
it('gets 0 points if the img node has an alt attribute', () => { it('gets 0 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img /></div>') const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreAttr($img), 0) assert.equal(scoreAttr($img), 0);
}) });
}) });
describe('scoreByParents($img)', () => { describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => { it('gets 25 points if it has a figure parent', () => {
@ -74,18 +74,18 @@ describe('scoreByParents($img)', () => {
</div> </div>
</figure> </figure>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 25) assert.equal(scoreByParents($img), 25);
}) });
it('gets 0 points if the img has no figure parent', () => { it('gets 0 points if the img has no figure parent', () => {
const $ = cheerio.load('<div><img /></div>') const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 0) assert.equal(scoreByParents($img), 0);
}) });
it('gets 15 points if parent or gparent has photo hints', () => { it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -96,12 +96,12 @@ describe('scoreByParents($img)', () => {
</div> </div>
</div> </div>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 15) assert.equal(scoreByParents($img), 15);
}) });
}) });
describe('scoreBySibling($img)', () => { describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => { it('gets 25 points if its sibling is figcaption', () => {
@ -112,11 +112,11 @@ describe('scoreBySibling($img)', () => {
<figcaption>Wow</figcaption> <figcaption>Wow</figcaption>
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreBySibling($img), 25) assert.equal(scoreBySibling($img), 25);
}) });
it('gets 15 points if its sibling has photo hints', () => { it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -128,12 +128,12 @@ describe('scoreBySibling($img)', () => {
</div> </div>
</div> </div>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreBySibling($img), 15) assert.equal(scoreBySibling($img), 15);
}) });
}) });
describe('scoreByDimensions($img)', () => { describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => { it('penalizes skinny images', () => {
@ -143,11 +143,11 @@ describe('scoreByDimensions($img)', () => {
<img width="10" /> <img width="10" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50) assert.equal(scoreByDimensions($img), -50);
}) });
it('penalizes short images', () => { it('penalizes short images', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -156,11 +156,11 @@ describe('scoreByDimensions($img)', () => {
<img height="10" /> <img height="10" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50) assert.equal(scoreByDimensions($img), -50);
}) });
it('ignores sprites', () => { it('ignores sprites', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -169,11 +169,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/sprite/etc/foo.png" width="1000" height="1000" /> <img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), 0) assert.equal(scoreByDimensions($img), 0);
}) });
it('penalizes images with small areas', () => { it('penalizes images with small areas', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -182,11 +182,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="60" height="60" /> <img src="/etc/foo.png" width="60" height="60" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -100) assert.equal(scoreByDimensions($img), -100);
}) });
it('prefers the largest images', () => { it('prefers the largest images', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -195,13 +195,12 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="1000" height="1000" /> <img src="/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), 1000) assert.equal(scoreByDimensions($img), 1000);
}) });
});
})
describe('scoreByPosition($imgs, index)', () => { describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => { it('gives higher scores to images that come first', () => {
@ -216,10 +215,10 @@ describe('scoreByPosition($imgs, index)', () => {
<img width="10" /> <img width="10" />
</div> </div>
` `
) );
const $imgs = $('img') const $imgs = $('img');
assert.equal(scoreByPosition($imgs, 0), 3) assert.equal(scoreByPosition($imgs, 0), 3);
}) });
}) });

@ -1,25 +1,22 @@
import 'babel-polyfill' import 'babel-polyfill';
import URL from 'url' import URL from 'url';
import { import {
pageNumFromUrl,
articleBaseUrl, articleBaseUrl,
removeAnchor, removeAnchor,
} from 'utils/text' } from 'utils/text';
import scoreLinks from './scoring/score-links' import scoreLinks from './scoring/score-links';
// Looks for and returns next page url // Looks for and returns next page url
// for multi-page articles // for multi-page articles
const GenericNextPageUrlExtractor = { const GenericNextPageUrlExtractor = {
extract({ $, url, parsedUrl, previousUrls=[] }) { extract({ $, url, parsedUrl, previousUrls = [] }) {
parsedUrl = parsedUrl || URL.parse(url) parsedUrl = parsedUrl || URL.parse(url);
const currentPageNum = pageNumFromUrl(url) const articleUrl = removeAnchor(url);
const articleUrl = removeAnchor(url) const baseUrl = articleBaseUrl(url, parsedUrl);
const baseUrl = articleBaseUrl(url, parsedUrl)
const { host } = parsedUrl
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const scoredLinks = scoreLinks({ const scoredLinks = scoreLinks({
links, links,
@ -27,28 +24,28 @@ const GenericNextPageUrlExtractor = {
baseUrl, baseUrl,
parsedUrl, parsedUrl,
$, $,
previousUrls previousUrls,
}) });
// If no links were scored, return null // If no links were scored, return null
if (!scoredLinks) return null if (!scoredLinks) return null;
// now that we've scored all possible pages, // now that we've scored all possible pages,
// find the biggest one. // find the biggest one.
const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => { const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
const scoredLink = scoredLinks[link] const scoredLink = scoredLinks[link];
return scoredLink.score > acc.score ? scoredLink : acc return scoredLink.score > acc.score ? scoredLink : acc;
}, { score: -100 }) }, { score: -100 });
// If the score is less than 50, we're not confident enough to use it, // If the score is less than 50, we're not confident enough to use it,
// so we fail. // so we fail.
if (topPage.score >= 50) { if (topPage.score >= 50) {
return topPage.href return topPage.href;
} else {
return null
} }
}
}
return null;
},
};
export default GenericNextPageUrlExtractor
export default GenericNextPageUrlExtractor;

@ -1,34 +1,34 @@
import assert from 'assert' import assert from 'assert';
import fs from 'fs' import fs from 'fs';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import GenericNextPageUrlExtractor from './extractor' import GenericNextPageUrlExtractor from './extractor';
describe('GenericNextPageUrlExtractor', () => { describe('GenericNextPageUrlExtractor', () => {
it('returns most likely next page url', () => { it('returns most likely next page url', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8') const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2' const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2';
const nextPage = GenericNextPageUrlExtractor.extract({ const nextPage = GenericNextPageUrlExtractor.extract({
$, $,
url url,
}) });
assert.equal(nextPage, next) assert.equal(nextPage, next);
}) });
it('returns null if there is no likely next page', () => { it('returns null if there is no likely next page', () => {
const html = `<div><p>HI</p></div>` const html = '<div><p>HI</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const url = 'http://example.com/foo/bar' const url = 'http://example.com/foo/bar';
const nextPage = GenericNextPageUrlExtractor.extract({ const nextPage = GenericNextPageUrlExtractor.extract({
$, $,
url url,
}) });
assert.equal(nextPage, null) assert.equal(nextPage, null);
}) });
}) });

@ -1,38 +1,38 @@
export const DIGIT_RE = /\d/ export const DIGIT_RE = /\d/;
// A list of words that, if found in link text or URLs, likely mean that // A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link. // this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [ export const EXTRANEOUS_LINK_HINTS = [
'print', 'print',
'archive', 'archive',
'comment', 'comment',
'discuss', 'discuss',
'e-mail', 'e-mail',
'email', 'email',
'share', 'share',
'reply', 'reply',
'all', 'all',
'login', 'login',
'sign', 'sign',
'single', 'single',
'adx', 'adx',
'entry-unrelated' 'entry-unrelated',
] ];
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i') export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');
// Match any link text/classname/id that looks like it could mean the next // Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can // page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page. // mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i') export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i');
// Match any link text/classname/id that looks like it is an end link: things // Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc. // like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i') export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');
// Match any link text/classname/id that looks like it means the previous // Match any link text/classname/id that looks like it means the previous
// page. // page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i') export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');
// Match any phrase that looks like it could be page, or paging, or pagination // Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i') export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');

@ -1,27 +1,32 @@
import 'babel-polyfill' import 'babel-polyfill';
import URL from 'url' import URL from 'url';
import difflib from 'difflib'
import { range } from 'utils' import { isWordpress } from 'utils/dom';
import { isWordpress } from 'utils/dom'
import { import {
removeAnchor, removeAnchor,
pageNumFromUrl, pageNumFromUrl,
} from 'utils/text' } from 'utils/text';
import {
DIGIT_RE,
NEXT_LINK_TEXT_RE,
PREV_LINK_TEXT_RE,
EXTRANEOUS_LINK_HINTS_RE,
CAP_LINK_TEXT_RE,
PAGE_RE,
} from './constants'
import { import {
NEGATIVE_SCORE_RE, scoreSimilarity,
POSITIVE_SCORE_RE, scoreLinkText,
} from 'utils/dom/constants' scorePageInLink,
import { IS_DIGIT_RE } from 'utils/text/constants' scoreExtraneousLinks,
scoreByParents,
scorePrevLink,
shouldScore,
scoreBaseUrl,
scoreCapLinks,
scoreNextLinkText,
} from './utils';
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i');
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`;
}
export default function scoreLinks({ export default function scoreLinks({
links, links,
@ -29,11 +34,11 @@ export default function scoreLinks({
baseUrl, baseUrl,
parsedUrl, parsedUrl,
$, $,
previousUrls=[] previousUrls = [],
}) { }) {
parsedUrl = parsedUrl || URL.parse(articleUrl) parsedUrl = parsedUrl || URL.parse(articleUrl);
const baseRegex = makeBaseRegex(baseUrl) const baseRegex = makeBaseRegex(baseUrl);
const isWp = isWordpress($) const isWp = isWordpress($);
// Loop through all links, looking for hints that they may be next-page // Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or // links. Things like having "page" in their textContent, className or
@ -46,12 +51,12 @@ export default function scoreLinks({
// Remove any anchor data since we don't do a good job // Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do // standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash // some checking with and without a trailing slash
let href = removeAnchor(link.attribs.href) const href = removeAnchor(link.attribs.href);
const $link = $(link) const $link = $(link);
const linkText = $link.text() const linkText = $link.text();
if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) { if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
return possiblePages return possiblePages;
} }
// ## PASSED THE FIRST-PASS TESTS. Start scoring. ## // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
@ -60,242 +65,29 @@ export default function scoreLinks({
score: 0, score: 0,
linkText, linkText,
href, href,
} };
} else { } else {
possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}` possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`;
}
const possiblePage = possiblePages[href]
const linkData = makeSig($link, linkText)
const pageNum = pageNumFromUrl(href)
let score = scoreBaseUrl(href, baseRegex)
score = score + scoreNextLinkText(linkData)
score = score + scoreCapLinks(linkData)
score = score + scorePrevLink(linkData)
score = score + scoreByParents($link)
score = score + scoreExtraneousLinks(href)
score = score + scorePageInLink(pageNum, isWp)
score = score + scoreLinkText(linkText, pageNum)
score = score + scoreSimilarity(score, articleUrl, href)
possiblePage.score = score
return possiblePages
}, {})
return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages
}
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i')
}
export function scoreSimilarity(score, articleUrl, href) {
// Do this last and only if we have a real candidate, because it's
// potentially expensive computationally. Compare the link to this
// URL using difflib to get the % similarity of these URLs. On a
// sliding scale, subtract points from this link based on
// similarity.
if (score > 0) {
const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio()
// Subtract .1 from diff_percent when calculating modifier,
// which means that if it's less than 10% different, we give a
// bonus instead. Ex:
// 3% different = +17.5 points
// 10% different = 0 points
// 20% different = -25 points
const diffPercent = 1.0 - similarity
const diffModifier = -(250 * (diffPercent - 0.2))
return score + diffModifier
}
return 0
}
export function scoreLinkText(linkText, pageNum) {
// If the link text can be parsed as a number, give it a minor
// bonus, with a slight bias towards lower numbered pages. This is
// so that pages that might not have 'next' in their text can still
// get scored, and sorted properly by score.
let score = 0
if (IS_DIGIT_RE.test(linkText.trim())) {
const linkTextAsNum = parseInt(linkText)
// If it's the first page, we already got it on the first call.
// Give it a negative score. Otherwise, up to page 10, give a
// small bonus.
if (linkTextAsNum < 2) {
score = -30
} else {
score = Math.max(0, 10 - linkTextAsNum)
}
// If it appears that the current page number is greater than
// this links page number, it's a very bad sign. Give it a big
// penalty.
if (pageNum && pageNum >= linkTextAsNum) {
score = score - 50
}
}
return score
}
export function scorePageInLink(pageNum, isWp) {
// page in the link = bonus. Intentionally ignore wordpress because
// their ?p=123 link style gets caught by this even though it means
// separate documents entirely.
if (pageNum && !isWp) {
return 50
}
return 0
}
export function scoreExtraneousLinks(href) {
// If the URL itself contains extraneous values, give a penalty.
if (EXTRANEOUS_LINK_HINTS_RE.test(href)) {
return -25
}
return 0
}
export function scoreByParents($link) {
// If a parent node contains paging-like classname or id, give a
// bonus. Additionally, if a parent_node contains bad content
// (like 'sponsor'), give a penalty.
let $parent = $link.parent()
let positiveMatch = false
let negativeMatch = false
let score = 0
Array.from(range(0, 4)).forEach((_) => {
if ($parent.length === 0) {
return
} }
const parentData = makeSig($parent, ' ') const possiblePage = possiblePages[href];
const linkData = makeSig($link, linkText);
// If we have 'page' or 'paging' in our data, that's a good const pageNum = pageNumFromUrl(href);
// sign. Add a bonus.
if (!positiveMatch && PAGE_RE.test(parentData)) {
positiveMatch = true
score = score + 25
}
// If we have 'comment' or something in our data, and let score = scoreBaseUrl(href, baseRegex);
// we don't have something like 'content' as well, that's score += scoreNextLinkText(linkData);
// a bad sign. Give a penalty. score += scoreCapLinks(linkData);
if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData) score += scorePrevLink(linkData);
&& EXTRANEOUS_LINK_HINTS_RE.test(parentData)) { score += scoreByParents($link);
if (!POSITIVE_SCORE_RE.test(parentData)) { score += scoreExtraneousLinks(href);
negativeMatch = true score += scorePageInLink(pageNum, isWp);
score = score - 25 score += scoreLinkText(linkText, pageNum);
} score += scoreSimilarity(score, articleUrl, href);
}
$parent = $parent.parent()
})
return score
}
export function scorePrevLink(linkData) { possiblePage.score = score;
// If the link has something like "previous", its definitely
// an old link, skip it.
if (PREV_LINK_TEXT_RE.test(linkData)) {
return -200
}
return 0 return possiblePages;
} }, {});
export function scoreCapLinks(linkData) {
// Cap links are links like "last", etc.
if (CAP_LINK_TEXT_RE.test(linkData)) {
// If we found a link like "last", but we've already seen that
// this link is also "next", it's fine. If it's not been
// previously marked as "next", then it's probably bad.
// Penalize.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return -65
}
}
return 0
}
export function scoreNextLinkText(linkData) {
// Things like "next", ">>", etc.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return 50
}
return 0
}
export function scoreBaseUrl(href, baseRegex) { return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages;
// If the baseUrl isn't part of this URL, penalize this
// link. It could still be the link, but the odds are lower.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (!baseRegex.test(href)) {
return -25
}
return 0
}
export function shouldScore(
href,
articleUrl,
baseUrl,
parsedUrl,
linkText,
previousUrls
) {
// skip if we've already fetched this url
if(previousUrls.find((url) => href === url) !== undefined) {
return false
}
// If we've already parsed this URL, or the URL matches the base
// URL, or is empty, skip it.
if (!href || href === articleUrl || href === baseUrl) {
return false
}
const { hostname } = parsedUrl
const { hostname: linkHost } = URL.parse(href)
// Domain mismatch.
if (linkHost !== hostname) {
return false
}
// If href doesn't contain a digit after removing the base URL,
// it's certainly not the next page.
const fragment = href.replace(baseUrl, '')
if (!DIGIT_RE.test(fragment)) {
return false
}
// This link has extraneous content (like "comment") in its link
// text, so we skip it.
if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
return false
}
// Next page link text is never long, skip if it is too long.
if (linkText.length > 25) {
return false
}
return true
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`
} }

@ -1,239 +1,42 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import URL from 'url'
import scoreLinks from './score-links' import scoreLinks from './score-links';
import {
makeBaseRegex,
scoreBaseUrl,
scoreNextLinkText,
scoreCapLinks,
scorePrevLink,
scoreByParents,
scoreExtraneousLinks,
scorePageInLink,
scoreLinkText,
scoreSimilarity,
shouldScore,
} from './score-links'
describe('scoreLinks(links)', () => { describe('scoreLinks(links)', () => {
it('returns an object of scored links', () => { it('returns an object of scored links', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8') const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({ const scoredPages = scoreLinks({
links, links,
articleUrl: url, articleUrl: url,
baseUrl: 'http://arstechnica.com', baseUrl: 'http://arstechnica.com',
$, $,
}) });
assert.equal(typeof scoredPages, 'object') assert.equal(typeof scoredPages, 'object');
}) });
it('returns null if no possible pages', () => { it('returns null if no possible pages', () => {
const html = `<div><p>Hello wow</p></div>` const html = '<div><p>Hello wow</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({ const scoredPages = scoreLinks({
links, links,
articleUrl: url, articleUrl: url,
baseUrl: 'http://arstechnica.com', baseUrl: 'http://arstechnica.com',
$, $,
}) });
assert.equal(scoredPages, null) assert.equal(scoredPages, null);
}) });
}) });
describe('scoreBaseUrl(href, baseRegex)', () => {
it('returns -25 if url does not contain the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://foo.com/foo/bar'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), -25)
})
it('returns 0 if url contains the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://example.com/foo/bar/bat'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), 0)
})
})
describe('scoreNextLinkText(linkData)', () => {
it('returns 50 if contains common next link text', () => {
const linkData = "foo bar Next page"
assert.equal(scoreNextLinkText(linkData), 50)
})
it('returns 0 if does not contain common next link text', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreNextLinkText(linkData), 0)
})
})
describe('scoreCapLinks(linkData)', () => {
it('returns -65 if cap link with next link text', () => {
const linkData = "foo next Last page"
assert.equal(scoreCapLinks(linkData), -65)
})
it('returns 0 if does not match a cap link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scorePrevLink(linkData)', () => {
it('returns -200 if link matches previous text', () => {
const linkData = "foo next previous page"
assert.equal(scorePrevLink(linkData), -200)
})
it('returns 0 if does not match a prev link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scoreByParents($link)', () => {
it('returns 25 if parent sig looks like a page', () => {
const html = `
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), 25)
})
it('returns -25 if parent sig looks like a comment', () => {
const html = `
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), -25)
})
})
describe('scoreExtraneousLinks(href)', () => {
it('returns -25 if link matches extraneous text', () => {
const url = "http://example.com/email-link"
assert.equal(scoreExtraneousLinks(url), -25)
})
it('returns 0 if does not match extraneous text', () => {
const url = "http://example.com/asdf"
assert.equal(scoreExtraneousLinks(url), 0)
})
})
describe('scorePageInLink(pageNum, isWp)', () => {
it('returns 50 if link contains a page num', () => {
assert.equal(scorePageInLink(1, false), 50)
})
it('returns 0 if link contains no page num', () => {
assert.equal(scorePageInLink(null, false), 0)
})
it('returns 0 if page is wordpress', () => {
assert.equal(scorePageInLink(10, true), 0)
})
})
describe('scoreLinkText(linkText)', () => {
it('returns 8 if link contains the num 2', () => {
assert.equal(scoreLinkText('2', 0), 8)
})
it('returns 5 if link contains the num 5', () => {
assert.equal(scoreLinkText('5', 0), 5)
})
it('returns -30 if link contains the number 1', () => {
assert.equal(scoreLinkText('1', 0), -30)
})
it('penalizes -50 if pageNum is >= link text as num', () => {
assert.equal(scoreLinkText('4', 5), -44)
})
})
describe('scoreSimilarity(score, articleUrl, href)', () => {
it('returns a similarity bonus based on current score', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 25
assert.equal(
Math.round(scoreSimilarity(score, articleUrl, href)),
66
)
})
it('returns 0 is current score <= 0', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 0
assert.equal(scoreSimilarity(score, articleUrl, href), 0)
})
})
describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
it('returns false if href has already been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar/2' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
false
)
})
it('returns true if href has not been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
true
)
})
})

@ -0,0 +1,10 @@
// Barrel module: re-exports the individual next-page-link scoring
// helpers so score-links.js (and tests) can import them all from the
// single './utils' path.
export { default as scoreSimilarity } from './score-similarity';
export { default as scoreLinkText } from './score-link-text';
export { default as scorePageInLink } from './score-page-in-link';
export { default as scoreExtraneousLinks } from './score-extraneous-links';
export { default as scoreByParents } from './score-by-parents';
export { default as scorePrevLink } from './score-prev-link';
export { default as shouldScore } from './should-score';
export { default as scoreBaseUrl } from './score-base-url';
export { default as scoreNextLinkText } from './score-next-link-text';
export { default as scoreCapLinks } from './score-cap-links';

@ -0,0 +1,11 @@
// Score a candidate href by whether it sits under the article's base
// URL (baseRegex is a case-insensitive `^baseUrl` matcher). A link off
// the base could still be the next page, but the odds are lower, so it
// takes a modest penalty. Example of an on-base multi-page URL:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export default function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}

@ -0,0 +1,23 @@
import assert from 'assert';
import scoreBaseUrl from './score-base-url';
import { makeBaseRegex } from '../score-links';
// Unit tests for scoreBaseUrl. makeBaseRegex builds the
// case-insensitive ^baseUrl matcher the scorer expects.
describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    const badUrl = 'http://foo.com/foo/bar';
    const baseRegex = makeBaseRegex(baseUrl);
    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25);
  });

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    // Renamed from `badUrl`: this URL is the matching (good) case.
    const goodUrl = 'http://example.com/foo/bar/bat';
    const baseRegex = makeBaseRegex(baseUrl);
    assert.equal(scoreBaseUrl(goodUrl, baseRegex), 0);
  });
});

@ -0,0 +1,52 @@
import { range } from 'utils';
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
PAGE_RE,
} from 'utils/dom/constants';
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
// Build a "signature" string from a node's class and id attributes;
// either attribute may be absent, in which case it contributes ''.
function makeSig($link) {
  const className = $link.attr('class') || '';
  const id = $link.attr('id') || '';
  return `${className} ${id}`;
}
// Score a link by its ancestors: walk up the DOM (up to four levels —
// range(0, 4) presumably yields four indices; confirm against utils),
// awarding a one-time bonus for paging-like class/id signatures and a
// one-time penalty for comment-like ones.
//
// @param {Object} $link cheerio-wrapped anchor node
// @returns {number} net parent-based score adjustment
export default function scoreByParents($link) {
  let $parent = $link.parent();
  let positiveMatch = false;
  let negativeMatch = false;
  let score = 0;

  Array.from(range(0, 4)).forEach(() => {
    if ($parent.length === 0) {
      return;
    }

    // makeSig takes a single node argument; the stray ' ' second
    // argument previously passed here was silently ignored.
    const parentData = makeSig($parent);

    // 'page'/'paging'/'pagination' in a parent's class/id is a good
    // sign. Award the bonus at most once.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }

    // 'comment' (or another extraneous hint) without a content-like
    // signal alongside it is a bad sign. Penalize at most once.
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
      && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true;
        score -= 25;
      }
    }

    $parent = $parent.parent();
  });

  return score;
}

@ -0,0 +1,35 @@
import assert from 'assert';
import cheerio from 'cheerio';
import scoreByParents from './score-by-parents';
// Unit tests for scoreByParents: parent-node class names drive the
// bonus/penalty (cheerio provides the DOM traversal).
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
      <div>
        <div class="next-page">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();
    assert.equal(scoreByParents($link), 25);
  });

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
      <div>
        <div class="comment">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();
    assert.equal(scoreByParents($link), -25);
  });
});

@ -0,0 +1,19 @@
import {
NEXT_LINK_TEXT_RE,
CAP_LINK_TEXT_RE,
} from '../constants';
// Score "cap" links — link text like "first", "last", "end" that tends
// to mark an article's boundary pages rather than the next page.
export default function scoreCapLinks(linkData) {
  // Cap links are links like "last", etc.
  if (CAP_LINK_TEXT_RE.test(linkData)) {
    // A link that reads both like a cap link ("last") AND like a
    // "next" link is penalized here.
    // NOTE(review): the inherited comment claimed the opposite (that a
    // cap link also marked "next" is fine), but the code penalizes
    // exactly when NEXT matches, and the unit test pins -65 for
    // 'foo next Last page'. Confirm intent before changing either.
    if (NEXT_LINK_TEXT_RE.test(linkData)) {
      return -65;
    }
  }

  return 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreCapLinks from './score-cap-links';
// Unit tests for scoreCapLinks: -65 only when the text matches both a
// cap word ("Last") and a next word ("next"); otherwise 0.
describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    const linkData = 'foo next Last page';
    assert.equal(scoreCapLinks(linkData), -65);
  });

  it('returns 0 if does not match a cap link', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scoreCapLinks(linkData), 0);
  });
});

@ -0,0 +1,10 @@
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
// Penalize hrefs whose URL contains extraneous hints ('comment',
// 'share', 'print', …) — such links almost never lead to a next page.
export default function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreExtraneousLinks from './score-extraneous-links';
// Unit tests for scoreExtraneousLinks: the URL string itself (not the
// link text) is matched against the extraneous-hint regex.
describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = 'http://example.com/email-link';
    assert.equal(scoreExtraneousLinks(url), -25);
  });

  it('returns 0 if does not match extraneous text', () => {
    const url = 'http://example.com/asdf';
    assert.equal(scoreExtraneousLinks(url), 0);
  });
});

@ -0,0 +1,30 @@
import { IS_DIGIT_RE } from 'utils/text/constants';
// If the link text parses as a (small) number, nudge the score with a
// slight bias toward lower page numbers, so numeric pagination links
// without the word "next" still sort sensibly.
//
// @param {string} linkText visible text of the link
// @param {number|null} pageNum page number parsed from the href, if any
// @returns {number} score adjustment
export default function scoreLinkText(linkText, pageNum) {
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkTextAsNum = parseInt(linkText, 10);

  // Page 1 was already fetched on the first call, so links back to it
  // score negatively; pages 2-10 earn a small, decreasing bonus.
  let score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);

  // A link numbered at or below the current page points backwards —
  // very bad sign, big penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }

  return score;
}

@ -0,0 +1,22 @@
import assert from 'assert';
import scoreLinkText from './score-link-text';
// Unit tests for scoreLinkText: numeric link text earns max(0, 10 - n)
// (n < 2 scores -30), minus 50 when the current pageNum >= n.
// NOTE(review): the describe title omits the second (pageNum) param.
describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8);
  });

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5);
  });

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30);
  });

  it('penalizes -50 if pageNum is >= link text as num', () => {
    assert.equal(scoreLinkText('4', 5), -44);
  });
});

@ -0,0 +1,10 @@
import { NEXT_LINK_TEXT_RE } from '../constants';
// Big bonus for link data that reads like a "next page" control —
// "next", "continue", ">", "»", etc.
export default function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreNextLinkText from './score-next-link-text';
// Unit tests for scoreNextLinkText: matching "next"-style text earns a
// flat 50; anything else is neutral.
describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = 'foo bar Next page';
    assert.equal(scoreNextLinkText(linkData), 50);
  });

  it('returns 0 if does not contain common next link text', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scoreNextLinkText(linkData), 0);
  });
});

@ -0,0 +1,10 @@
// Bonus when the URL itself carries a page number — except on
// WordPress sites, whose ?p=123 URLs use the same shape for entirely
// separate posts.
export default function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePageInLink from './score-page-in-link';
// Unit tests for scorePageInLink: 50 for a page number in the URL,
// except when the page is WordPress (isWp true).
describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50);
  });

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0);
  });

  it('returns 0 if page is wordpress', () => {
    assert.equal(scorePageInLink(10, true), 0);
  });
});

@ -0,0 +1,11 @@
import { PREV_LINK_TEXT_RE } from '../constants';
// Heavy penalty for link data that reads like a "previous" control
// ("prev", "earlier", "older", "<", "«") — it points the wrong way.
export default function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePrevLink from './score-prev-link';
// Unit tests for scorePrevLink: "previous"-style text is ruled out
// with a flat -200; other text is neutral.
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = 'foo next previous page';
    assert.equal(scorePrevLink(linkData), -200);
  });

  it('returns 0 if does not match a prev link', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scorePrevLink(linkData), 0);
  });
});

@ -0,0 +1,23 @@
import difflib from 'difflib';
// Adjust a link's score by how similar its URL is to the article URL.
// Run only for links that already look promising (score > 0), because
// the sequence comparison is computationally expensive.
export default function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();

  // The pivot sits at 20% difference, so URLs less different than that
  // earn a bonus rather than a penalty:
  //   10% different → +25 points
  //   20% different →   0 points
  //   30% different → -25 points
  const diffPercent = 1.0 - similarity;
  const diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save