chore: refactored and linted

pull/1/head
Adam Pash 8 years ago
parent 9906bd36a4
commit 7e2a34945f

@ -0,0 +1 @@
**/fixtures/*

@ -0,0 +1,39 @@
// Use this file as a starting point for your project's .eslintrc.
// Copy this file, and add rule overrides as needed.
{
"parser": "babel-eslint",
"extends": "airbnb",
"plugins": [
"babel"
],
"globals": {
/* mocha */
"describe",
"it"
},
"rules": {
"no-param-reassign": 0,
/* TODO fix this; this should work w/import/resolver below, but doesn't */
"import/no-extraneous-dependencies": 0,
"import/no-unresolved": 0,
"no-control-regex": 0,
"import/prefer-default-export": 0,
"generator-star-spacing": 0,
"babel/generator-star-spacing": 0,
"func-names": 0,
"no-useless-escape": 0,
"no-confusing-arrow": 0,
},
"settings": {
"import/resolver": {
"babel-module": {
"extensions": [".js"]
}
}
},
"parserOptions":{
"ecmaFeatures": {
"experimentalObjectRestSpread": true
}
}
}

@ -5,14 +5,17 @@
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"start": "node ./build", "start": "node ./build",
"build": "rollup -c", "lint": "eslint src/**",
"build": "eslint src/** && rollup -c",
"test": "./test-runner" "test": "./test-runner"
}, },
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"devDependencies": { "devDependencies": {
"babel-eslint": "^6.1.2",
"babel-plugin-external-helpers": "^6.8.0", "babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0", "babel-plugin-module-alias": "^1.6.0",
"babel-plugin-module-resolver": "^2.2.0",
"babel-plugin-transform-async-to-generator": "^6.8.0", "babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0", "babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0", "babel-plugin-transform-object-rest-spread": "^6.8.0",
@ -21,6 +24,14 @@
"babel-preset-es2015-rollup": "^1.2.0", "babel-preset-es2015-rollup": "^1.2.0",
"babel-register": "^6.11.6", "babel-register": "^6.11.6",
"babelrc-rollup": "^3.0.0", "babelrc-rollup": "^3.0.0",
"eslint": "^3.5.0",
"eslint-config-airbnb": "^11.1.0",
"eslint-import-resolver-babel-module": "^2.0.1",
"eslint-plugin-async": "^0.1.1",
"eslint-plugin-babel": "^3.3.0",
"eslint-plugin-import": "^1.15.0",
"eslint-plugin-jsx-a11y": "^2.2.2",
"eslint-plugin-react": "^6.2.1",
"mocha": "^3.0.2", "mocha": "^3.0.2",
"rollup": "^0.34.13", "rollup": "^0.34.13",
"rollup-plugin-babel": "^2.6.1", "rollup-plugin-babel": "^2.6.1",

@ -0,0 +1,21 @@
#!/usr/local/bin/fish
set file $argv[1]
set function $argv[2]
touch src/extractors/generic/next-page-url/scoring/utils/index.js
touch src/extractors/generic/next-page-url/scoring/utils/$file.js
touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js
echo "Now make it a default export"
echo "Move it to its file"
echo "Move its tests to its test file"
echo "import in score-links"
echo "Test it."

@ -1,7 +1,7 @@
import { CLEAN_AUTHOR_RE } from './constants' import { CLEAN_AUTHOR_RE } from './constants';
// Take an author string (like 'By David Smith ') and clean it to // Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'. // just the name(s): 'David Smith'.
export default function cleanAuthor(author) { export default function cleanAuthor(author) {
return author.replace(CLEAN_AUTHOR_RE, '$2').trim() return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
} }

@ -1,21 +1,21 @@
import assert from 'assert' import assert from 'assert';
import cleanAuthor from './author' import cleanAuthor from './author';
describe('cleanAuthor(author)', () => { describe('cleanAuthor(author)', () => {
it('removes the By from an author string', () => { it('removes the By from an author string', () => {
const author = cleanAuthor('By Bob Dylan') const author = cleanAuthor('By Bob Dylan');
assert.equal(author, 'Bob Dylan') assert.equal(author, 'Bob Dylan');
}) });
it('trims trailing whitespace and line breaks', () => { it('trims trailing whitespace and line breaks', () => {
const text = ` const text = `
written by written by
Bob Dylan Bob Dylan
` `;
const author = cleanAuthor(text) const author = cleanAuthor(text);
assert.equal(author, 'Bob Dylan') assert.equal(author, 'Bob Dylan');
}) });
}) });

@ -1,9 +1,9 @@
// CLEAN AUTHOR CONSTANTS // CLEAN AUTHOR CONSTANTS
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)', // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS // CLEAN DEK CONSTANTS
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i') export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks. // An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct. // From most distinct to least distinct.
// //
@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// However, these tags often have SEO-specific junk in them that's not // However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best. // header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [ export const DEK_META_TAGS = [
] ];
// An ordered list of Selectors to find likely article deks. From // An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit. // most explicit to least explicit.
@ -23,18 +23,36 @@ export const DEK_META_TAGS = [
// detrimental to the aesthetics of an article. // detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [ export const DEK_SELECTORS = [
'.entry-summary', '.entry-summary',
] ];
// CLEAN DATE PUBLISHED CONSTANTS // CLEAN DATE PUBLISHED CONSTANTS
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}(\s?[ap]\.?m\.?)?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig const months = [
'jan',
'feb',
'mar',
'apr',
'may',
'jun',
'jul',
'aug',
'sep',
'oct',
'nov',
'dec',
];
const allMonths = months.join('|');
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
export const SPLIT_DATE_STRING =
new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');
// CLEAN TITLE CONSTANTS // CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a // A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar. // title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;
export const DOMAIN_ENDINGS_RE = export const DOMAIN_ENDINGS_RE =
new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g') new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g');

@ -8,9 +8,7 @@ import {
rewriteTopLevel, rewriteTopLevel,
stripJunkTags, stripJunkTags,
makeLinksAbsolute, makeLinksAbsolute,
} from 'utils/dom' } from 'utils/dom';
import { convertNodeTo } from 'utils/dom'
// Clean our article content, returning a new, cleaned node. // Clean our article content, returning a new, cleaned node.
export default function extractCleanNode( export default function extractCleanNode(
@ -24,38 +22,38 @@ export default function extractCleanNode(
) { ) {
// Rewrite the tag name to div if it's a top level node like body or // Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags. // html to avoid later complications with multiple body tags.
rewriteTopLevel(article, $) rewriteTopLevel(article, $);
// Drop small images and spacer images // Drop small images and spacer images
cleanImages(article, $) cleanImages(article, $);
// Drop certain tags like <title>, etc // Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security. // This is -mostly- for cleanliness, not security.
stripJunkTags(article, $) stripJunkTags(article, $);
// H1 tags are typically the article title, which should be extracted // H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3), // by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s. // strip them. Otherwise, turn 'em into H2s.
cleanHOnes(article, $) cleanHOnes(article, $);
// Clean headers // Clean headers
cleanHeaders(article, $, title) cleanHeaders(article, $, title);
// Make links absolute // Make links absolute
makeLinksAbsolute(article, $, url) makeLinksAbsolute(article, $, url);
// Remove style or align attributes // Remove style or align attributes
cleanAttributes(article, $) cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better // too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them. // way to detect menus particularly and remove them.
cleanTags(article, $, cleanConditionally) cleanTags(article, $, cleanConditionally);
// Remove empty paragraph nodes // Remove empty paragraph nodes
removeEmpty(article, $) removeEmpty(article, $);
return article return article;
} }
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6') // headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers: // for header in headers:

@ -1,32 +1,32 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import extractCleanNode from './content' import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractBestNode from 'extractors/generic/content/extract-best-node' import extractCleanNode from './content';
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => { describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => { it('cleans cruft out of a DOM node', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
let $ = cheerio.load(html) const $ = cheerio.load(html);
const opts = { const opts = {
stripUnlikelyCandidates: true, stripUnlikelyCandidates: true,
weightNodes: true, weightNodes: true,
cleanConditionally: true, cleanConditionally: true,
} };
const bestNode = extractBestNode($, opts) const bestNode = extractBestNode($, opts);
let result = $.html(bestNode) // let result = $.html(bestNode);
// console.log(result) // // console.log(result)
// console.log(result.length) // // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts }) const cleanNode = extractCleanNode(bestNode, { $, opts });
result = $.html(cleanNode) // result = $.html(cleanNode);
// console.log(result.length) // // console.log(result.length)
// console.log(result) // // console.log(result)
// console.log(bestNode.html()) // // console.log(bestNode.html())
assert.equal($(bestNode).text().length, 2687) assert.equal($(cleanNode).text().length, 2687);
}) });
}) });

@ -1,4 +1,4 @@
import moment from 'moment' import moment from 'moment';
// Is there a compelling reason to use moment here? // Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method, // Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string. // but could just check for 'Invalid Date' string.
@ -7,27 +7,27 @@ import {
CLEAN_DATE_STRING_RE, CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING, SPLIT_DATE_STRING,
TIME_MERIDIAN_SPACE_RE, TIME_MERIDIAN_SPACE_RE,
TIME_MERIDIAN_DOTS_RE TIME_MERIDIAN_DOTS_RE,
} from './constants' } from './constants';
export function cleanDateString(dateString) {
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim();
}
// Take a date published string, and hopefully return a date out of // Take a date published string, and hopefully return a date out of
// it. Return none if we fail. // it. Return none if we fail.
export default function cleanDatePublished(dateString) { export default function cleanDatePublished(dateString) {
let date = moment(new Date(dateString)) let date = moment(new Date(dateString));
if (!date.isValid()) { if (!date.isValid()) {
dateString = cleanDateString(dateString) dateString = cleanDateString(dateString);
date = moment(new Date(dateString)) date = moment(new Date(dateString));
} }
return date.isValid() ? date.toISOString() : null return date.isValid() ? date.toISOString() : null;
}
export function cleanDateString(dateString) {
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim()
} }

@ -1,67 +1,62 @@
import assert from 'assert' import assert from 'assert';
import { import {
default as cleanDatePublished, default as cleanDatePublished,
cleanDateString, cleanDateString,
} from './date-published' } from './date-published';
describe('cleanDatePublished(dateString)', () => { describe('cleanDatePublished(dateString)', () => {
it('returns a date object', () => { it('returns a date object', () => {
const datePublished = cleanDatePublished('published: 1/1/2020') const datePublished = cleanDatePublished('published: 1/1/2020');
assert.equal( assert.equal(
datePublished, datePublished,
new Date('1/1/2020').toISOString() new Date('1/1/2020').toISOString()
) );
}) });
it('returns null if date is invalid', () => { it('returns null if date is invalid', () => {
const datePublished = cleanDatePublished('blargh') const datePublished = cleanDatePublished('blargh');
assert.equal(datePublished, null) assert.equal(datePublished, null);
}) });
});
})
describe('cleanDateString(dateString)', () => { describe('cleanDateString(dateString)', () => {
it('removes "published" text from an datePublished string', () => { it('removes "published" text from an datePublished string', () => {
const datePublished = cleanDateString('published: 1/1/2020') const datePublished = cleanDateString('published: 1/1/2020');
assert.equal(datePublished, '1/1/2020') assert.equal(datePublished, '1/1/2020');
}) });
it('trims whitespace', () => { it('trims whitespace', () => {
const datePublished = cleanDateString(' 1/1/2020 ') const datePublished = cleanDateString(' 1/1/2020 ');
assert.equal(datePublished, '1/1/2020') assert.equal(datePublished, '1/1/2020');
}) });
it('puts a space b/w a time and am/pm', () => { it('puts a space b/w a time and am/pm', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs am/pm separated from a time // it needs am/pm separated from a time
const date1 = cleanDateString('1/1/2020 8:30am') const date1 = cleanDateString('1/1/2020 8:30am');
assert.equal(date1, '1/1/2020 8:30 am') assert.equal(date1, '1/1/2020 8:30 am');
const date2 = cleanDateString('8:30PM 1/1/2020') const date2 = cleanDateString('8:30PM 1/1/2020');
assert.equal(date2, '8:30 PM 1/1/2020') assert.equal(date2, '8:30 PM 1/1/2020');
}) });
it('cleans the dots from a.m. or p.m.', () => { it('cleans the dots from a.m. or p.m.', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs a.m./p.m. without dots // it needs a.m./p.m. without dots
const date1 = cleanDateString('1/1/2020 8:30 a.m.') const date1 = cleanDateString('1/1/2020 8:30 a.m.');
assert.equal(date1, '1/1/2020 8:30 am') assert.equal(date1, '1/1/2020 8:30 am');
}) });
it('can handle some tough timestamps', () => { it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but // The JS date parser is forgiving, but
// it needs am/pm separated from a time // it needs am/pm separated from a time
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.') const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
assert.equal(date1, '15 Apr 2016 10:59') assert.equal(date1, '15 Apr 2016 10:59');
});
const date2 = cleanDateString('8:30PM 1/1/2020') });
assert.equal(date2, '8:30 PM 1/1/2020')
})
})

@ -1,17 +1,18 @@
import { TEXT_LINK_RE } from './constants' import { stripTags } from 'utils/dom';
import { stripTags } from 'utils/dom'
import { TEXT_LINK_RE } from './constants';
// Take a dek HTML fragment, and return the cleaned version of it. // Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough. // Return None if the dek wasn't good enough.
export default function cleanDek(dek, { $ }) { export default function cleanDek(dek, { $ }) {
// Sanity check that we didn't get too short or long of a dek. // Sanity check that we didn't get too short or long of a dek.
if (dek.length > 1000 || dek.length < 5) return null if (dek.length > 1000 || dek.length < 5) return null;
const dekText = stripTags(dek, $) const dekText = stripTags(dek, $);
// Plain text links shouldn't exist in the dek. If we have some, it's // Plain text links shouldn't exist in the dek. If we have some, it's
// not a good dek - bail. // not a good dek - bail.
if (TEXT_LINK_RE.test(dekText)) return null if (TEXT_LINK_RE.test(dekText)) return null;
return dekText.trim() return dekText.trim();
} }

@ -1,52 +1,50 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import cleanDek from './dek';
default as cleanDek,
cleanDekString,
} from './dek'
describe('cleanDek(dekString, { $ })', () => { describe('cleanDek(dekString, { $ })', () => {
it('returns null if the dek is < 5 chars', () => { it('returns null if the dek is < 5 chars', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
assert.equal(cleanDek('Hi', { $ }), null) assert.equal(cleanDek('Hi', { $ }), null);
}) });
it('returns null if the dek is > 1000 chars', () => { it('returns null if the dek is > 1000 chars', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const longDek = const longDek =
// generate a string that is 1,280 chars // generate a string that is 1,280 chars
[0,1,2,3,4,5,6].reduce((acc, i) => [0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
acc += acc, '0123456789' acc += acc;
) return acc;
assert.equal(cleanDek(longDek, { $ }), null) }, '0123456789');
}) assert.equal(cleanDek(longDek, { $ }), null);
});
it('strip html tags from the dek', () => { it('strip html tags from the dek', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This is a <em>very</em> important dek.' const dek = 'This is a <em>very</em> important dek.';
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.') assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.');
}) });
it('returns null if dek contains plain text link', () => { it('returns null if dek contains plain text link', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This has this link http://example.com/foo/bar' const dek = 'This has this link http://example.com/foo/bar';
assert.equal(cleanDek(dek, { $ }), null) assert.equal(cleanDek(dek, { $ }), null);
}) });
it('returns a normal dek as is', () => { it('returns a normal dek as is', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = 'This is the dek' const dek = 'This is the dek';
assert.equal(cleanDek(dek, { $ }), dek) assert.equal(cleanDek(dek, { $ }), dek);
}) });
it('cleans extra whitespace', () => { it('cleans extra whitespace', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const dek = ' This is the dek ' const dek = ' This is the dek ';
assert.equal(cleanDek(dek, { $ }), 'This is the dek') assert.equal(cleanDek(dek, { $ }), 'This is the dek');
}) });
}) });

@ -1,5 +1,5 @@
const HTML = { const HTML = {
docWithH1: `<div><h1>This Is the Real Title</h1></div>`, docWithH1: '<div><h1>This Is the Real Title</h1></div>',
docWith2H1s: ` docWith2H1s: `
<div> <div>
<h1>This Is the Real Title</h1> <h1>This Is the Real Title</h1>
@ -7,9 +7,9 @@ const HTML = {
</div> </div>
`, `,
docWithTagsInH1: { docWithTagsInH1: {
before: `<div><h1>This Is the <em>Real</em> Title</h1></div>`, before: '<div><h1>This Is the <em>Real</em> Title</h1></div>',
after: `This Is the Real Title` after: 'This Is the Real Title',
}, },
} };
export default HTML export default HTML;

@ -1,9 +1,9 @@
import cleanAuthor from './author' import cleanAuthor from './author';
import cleanImage from './lead-image-url' import cleanImage from './lead-image-url';
import cleanDek from './dek' import cleanDek from './dek';
import cleanDatePublished from './date-published' import cleanDatePublished from './date-published';
import cleanContent from './content' import cleanContent from './content';
import cleanTitle from './title' import cleanTitle from './title';
const Cleaners = { const Cleaners = {
author: cleanAuthor, author: cleanAuthor,
@ -12,15 +12,15 @@ const Cleaners = {
datePublished: cleanDatePublished, datePublished: cleanDatePublished,
content: cleanContent, content: cleanContent,
title: cleanTitle, title: cleanTitle,
} };
export default Cleaners export default Cleaners;
export { cleanAuthor } export { cleanAuthor };
export { cleanImage } export { cleanImage };
export { cleanDek } export { cleanDek };
export { cleanDatePublished } export { cleanDatePublished };
export { cleanContent } export { cleanContent };
export { cleanTitle } export { cleanTitle };
export { default as resolveSplitTitle } from './resolve-split-title' export { default as resolveSplitTitle } from './resolve-split-title';

@ -1,10 +1,10 @@
import validUrl from 'valid-url' import validUrl from 'valid-url';
export default function clean(leadImageUrl) { export default function clean(leadImageUrl) {
leadImageUrl = leadImageUrl.trim() leadImageUrl = leadImageUrl.trim();
if (validUrl.isWebUri(leadImageUrl)) { if (validUrl.isWebUri(leadImageUrl)) {
return leadImageUrl return leadImageUrl;
} else {
return null
} }
return null;
} }

@ -1,20 +1,20 @@
import assert from 'assert' import assert from 'assert';
import clean from './lead-image-url' import clean from './lead-image-url';
describe('clean(leadImageUrl)', () => { describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => { it('returns the url if valid', () => {
const url = 'https://example.com' const url = 'https://example.com';
assert.equal(clean(url), url) assert.equal(clean(url), url);
}) });
it('returns null if the url is not valid', () => { it('returns null if the url is not valid', () => {
const url = 'this is not a valid url' const url = 'this is not a valid url';
assert.equal(clean(url), null) assert.equal(clean(url), null);
}) });
it('trims whitespace', () => { it('trims whitespace', () => {
const url = ' https://example.com/foo/bar.jpg' const url = ' https://example.com/foo/bar.jpg';
assert.equal(clean(url), url.trim()) assert.equal(clean(url), url.trim());
}) });
}) });

@ -1,34 +1,11 @@
import URL from 'url' import URL from 'url';
import 'babel-polyfill' import 'babel-polyfill';
import wuzzy from 'wuzzy' import wuzzy from 'wuzzy';
import { import {
TITLE_SPLITTERS_RE, TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE, DOMAIN_ENDINGS_RE,
} from './constants' } from './constants';
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url='') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
title = title
let splitTitle = title.split(TITLE_SPLITTERS_RE)
if (splitTitle.length === 1) {
return title
}
let newTitle = extractBreadcrumbTitle(splitTitle, title)
if (newTitle) return newTitle
newTitle = cleanDomainFromTitle(splitTitle, url)
if (newTitle) return newTitle
// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
return title
}
function extractBreadcrumbTitle(splitTitle, text) { function extractBreadcrumbTitle(splitTitle, text) {
// This must be a very breadcrumbed title, like: // This must be a very breadcrumbed title, like:
@ -38,40 +15,40 @@ function extractBreadcrumbTitle(splitTitle, text) {
// Look to see if we can find a breadcrumb splitter that happens // Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out // more than once. If we can, we'll be able to better pull out
// the title. // the title.
const termCounts = splitTitle.reduce((acc, text) => { const termCounts = splitTitle.reduce((acc, titleText) => {
acc[text] = acc[text] ? acc[text] + 1 : 1 acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
return acc return acc;
}, {}) }, {});
const [maxTerm, termCount] = const [maxTerm, termCount] =
Reflect.ownKeys(termCounts) Reflect.ownKeys(termCounts)
.reduce((acc, key) => { .reduce((acc, key) => {
if (acc[1] < termCounts[key]) { if (acc[1] < termCounts[key]) {
return [key, termCounts[key]] return [key, termCounts[key]];
} else {
return acc
} }
}, [0, 0])
return acc;
}, [0, 0]);
// We found a splitter that was used more than once, so it // We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead. // is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> " // Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that. // will match, but nothing longer than that.
if (termCount >= 2 && maxTerm.length <= 4) { if (termCount >= 2 && maxTerm.length <= 4) {
splitTitle = text.split(maxTerm) splitTitle = text.split(maxTerm);
} }
const splitEnds = [splitTitle[0], splitTitle.slice(-1)] const splitEnds = [splitTitle[0], splitTitle.slice(-1)];
const longestEnd = splitEnds.reduce((acc, end) => { const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');
return acc.length > end.length ? acc : end
}, '')
if (longestEnd.length > 10) { if (longestEnd.length > 10) {
return longestEnd return longestEnd;
} else {
return text
} }
return text;
} }
return null;
} }
function cleanDomainFromTitle(splitTitle, url) { function cleanDomainFromTitle(splitTitle, url) {
@ -81,20 +58,43 @@ function cleanDomainFromTitle(splitTitle, url) {
// //
// Strip out the big TLDs - it just makes the matching a bit more // Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right. // accurate. Not the end of the world if it doesn't strip right.
const { host } = URL.parse(url) const { host } = URL.parse(url);
const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '') const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
const startSlug = splitTitle[0].toLowerCase().replace(' ', '');
const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
if (startSlugRatio > 0.4 && startSlug.length > 5) {
return splitTitle.slice(2).join('');
}
const startSlug = splitTitle[0].toLowerCase().replace(' ', '') const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain) const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
if (startSlugRatio > .4 && startSlug.length > 5) { if (endSlugRatio > 0.4 && endSlug.length >= 5) {
return splitTitle.slice(2).join('') return splitTitle.slice(0, -2).join('');
} }
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '') return null;
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain) }
if (endSlugRatio > .4 && endSlug.length >= 5) { // Given a title with separators in it (colons, dashes, etc),
return splitTitle.slice(0, -2).join('') // resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url = '') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
const splitTitle = title.split(TITLE_SPLITTERS_RE);
if (splitTitle.length === 1) {
return title;
} }
let newTitle = extractBreadcrumbTitle(splitTitle, title);
if (newTitle) return newTitle;
newTitle = cleanDomainFromTitle(splitTitle, url);
if (newTitle) return newTitle;
// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
return title;
} }

@ -1,32 +1,31 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { resolveSplitTitle } from './index' import { resolveSplitTitle } from './index';
describe('resolveSplitTitle(text)', () => { describe('resolveSplitTitle(text)', () => {
it('does nothing if title not splittable', () => { it('does nothing if title not splittable', () => {
const title = "This Is a Normal Title" const title = 'This Is a Normal Title';
assert.equal(resolveSplitTitle(title), title) assert.equal(resolveSplitTitle(title), title);
}) });
it('extracts titles from breadcrumb-like titles', () => { it('extracts titles from breadcrumb-like titles', () => {
const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com" const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com';
assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ") assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth ');
}) });
it('cleans domains from titles at the front', () => { it('cleans domains from titles at the front', () => {
const title = "NYTimes - The Best Gadgets on Earth" const title = 'NYTimes - The Best Gadgets on Earth';
const url = "https://www.nytimes.com/bits/blog/etc/" const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
}) });
it('cleans domains from titles at the back', () => { it('cleans domains from titles at the back', () => {
const title = "The Best Gadgets on Earth | NYTimes" const title = 'The Best Gadgets on Earth | NYTimes';
const url = "https://www.nytimes.com/bits/blog/etc/" const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth") assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
}) });
}) });

@ -1,25 +1,26 @@
import { TITLE_SPLITTERS_RE } from './constants' import { stripTags } from 'utils/dom';
import { resolveSplitTitle } from './index'
import { stripTags } from 'utils/dom' import { TITLE_SPLITTERS_RE } from './constants';
import { resolveSplitTitle } from './index';
export default function cleanTitle(title, { url, $ }) { export default function cleanTitle(title, { url, $ }) {
// If title has |, :, or - in it, see if // If title has |, :, or - in it, see if
// we can clean it up. // we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) { if (TITLE_SPLITTERS_RE.test(title)) {
title = resolveSplitTitle(title, url) title = resolveSplitTitle(title, url);
} }
// Final sanity check that we didn't get a crazy title. // Final sanity check that we didn't get a crazy title.
// if (title.length > 150 || title.length < 15) { // if (title.length > 150 || title.length < 15) {
if (title.length > 150) { if (title.length > 150) {
// If we did, return h1 from the document if it exists // If we did, return h1 from the document if it exists
const h1 = $('h1') const h1 = $('h1');
if (h1.length === 1) { if (h1.length === 1) {
title = h1.text() title = h1.text();
} }
} }
// strip any html tags in the title text // strip any html tags in the title text
return stripTags(title, $).trim() return stripTags(title, $).trim();
} }

@ -1,8 +1,8 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { cleanTitle } from './index' import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => { describe('cleanTitle(title, { url, $ })', () => {
it('uses a single h1 if the title is too short or too long', () => { it('uses a single h1 if the title is too short or too long', () => {
@ -10,28 +10,27 @@ describe('cleanTitle(title, { url, $ })', () => {
// const $ = cheerio.load(HTML.docWithH1) // const $ = cheerio.load(HTML.docWithH1)
// //
// assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text()) // assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
}) });
it('only uses h1 if there is only one on the page', () => { it('only uses h1 if there is only one on the page', () => {
const title = "Too Short" const title = 'Too Short';
const $ = cheerio.load(HTML.docWith2H1s) const $ = cheerio.load(HTML.docWith2H1s);
assert.equal(cleanTitle(title, { url: '', $ }), title) assert.equal(cleanTitle(title, { url: '', $ }), title);
}) });
it('removes HTML tags from titles', () => { it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before) const $ = cheerio.load(HTML.docWithTagsInH1.before);
const title = $('h1').html() const title = $('h1').html();
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after) assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
}) });
it('trims extraneous spaces', () => { it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love " const title = " This Is a Great Title That You'll Love ";
const $ = cheerio.load(HTML.docWithTagsInH1.before) const $ = cheerio.load(HTML.docWithTagsInH1.before);
assert.equal(cleanTitle(title, { url: '', $ }), title.trim()) assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
}) });
});
})

@ -1,12 +1,11 @@
import GenericExtractor from './generic' import NYMagExtractor from './custom/nymag.com';
import NYMagExtractor from './custom/nymag.com' import BloggerExtractor from './custom/blogspot.com';
import BloggerExtractor from './custom/blogspot.com' import WikipediaExtractor from './custom/wikipedia.org';
import WikipediaExtractor from './custom/wikipedia.org'
const Extractors = { const Extractors = {
'nymag.com': NYMagExtractor, 'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor, 'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor, 'wikipedia.org': WikipediaExtractor,
} };
export default Extractors export default Extractors;

@ -1 +1 @@
export const ATTR_RE = /\[([\w-]+)\]/ export const ATTR_RE = /\[([\w-]+)\]/;

@ -14,27 +14,27 @@ const BloggerExtractor = {
// Convert the noscript tag to a div // Convert the noscript tag to a div
transforms: { transforms: {
'noscript': 'div' noscript: 'div',
}, },
}, },
author: { author: {
selectors: [ selectors: [
'.post-author-name' '.post-author-name',
] ],
}, },
title: { title: {
selectors: [ selectors: [
'h2.title', 'h2.title',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'span.publishdate', 'span.publishdate',
] ],
} },
} };
export default BloggerExtractor export default BloggerExtractor;

@ -22,37 +22,39 @@ const NYMagExtractor = {
// the transformation. // the transformation.
transforms: { transforms: {
// Convert h1s to h2s // Convert h1s to h2s
'h1': 'h2', h1: 'h2',
// Convert lazy-loaded noscript images to figures // Convert lazy-loaded noscript images to figures
'noscript': ($node) => { noscript: ($node) => {
const $children = $node.children() const $children = $node.children();
if ($children.length === 1 && $children.get(0).tagName === 'img') { if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure' return 'figure';
}
}
} }
return null;
},
},
}, },
title: { title: {
selectors: [ selectors: [
'h1.headline-primary', 'h1.headline-primary',
'h1', 'h1',
] ],
}, },
author: { author: {
selectors: [ selectors: [
'.by-authors', '.by-authors',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'time.article-timestamp[datetime]', 'time.article-timestamp[datetime]',
'time.article-timestamp', 'time.article-timestamp',
] ],
} },
} };
export default NYMagExtractor export default NYMagExtractor;

@ -8,7 +8,7 @@ const WikipediaExtractor = {
// transform top infobox to an image with caption // transform top infobox to an image with caption
transforms: { transforms: {
'.infobox img': ($node) => { '.infobox img': ($node) => {
$node.parents('.infobox').prepend($node) $node.parents('.infobox').prepend($node);
}, },
'.infobox caption': 'figcaption', '.infobox caption': 'figcaption',
'.infobox': 'figure', '.infobox': 'figure',
@ -28,15 +28,15 @@ const WikipediaExtractor = {
title: { title: {
selectors: [ selectors: [
'h2.title', 'h2.title',
] ],
}, },
datePublished: { datePublished: {
selectors: [ selectors: [
'#footer-info-lastmod', '#footer-info-lastmod',
] ],
}, },
} };
export default WikipediaExtractor export default WikipediaExtractor;

@ -12,9 +12,9 @@ export const AUTHOR_META_TAGS = [
'dc.creator', 'dc.creator',
'rbauthors', 'rbauthors',
'authors', 'authors',
] ];
export const AUTHOR_MAX_LENGTH = 300 export const AUTHOR_MAX_LENGTH = 300;
// An ordered list of XPath Selectors to find likely article authors. From // An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit. // most explicit to least explicit.
@ -47,12 +47,12 @@ export const AUTHOR_SELECTORS = [
'.articleauthor', '.articleauthor',
'.ArticleAuthor', '.ArticleAuthor',
'.byline', '.byline',
] ];
// An ordered list of Selectors to find likely article authors, with // An ordered list of Selectors to find likely article authors, with
// regular expression for content. // regular expression for content.
const byline_re = /^[\n\s]*By/i const bylineRe = /^[\n\s]*By/i;
export const BYLINE_SELECTORS_RE = [ export const BYLINE_SELECTORS_RE = [
['#byline', byline_re], ['#byline', bylineRe],
['.byline', byline_re], ['.byline', bylineRe],
] ];

@ -1,49 +1,48 @@
import { cleanAuthor } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { import {
AUTHOR_META_TAGS, AUTHOR_META_TAGS,
AUTHOR_MAX_LENGTH, AUTHOR_MAX_LENGTH,
AUTHOR_SELECTORS, AUTHOR_SELECTORS,
BYLINE_SELECTORS_RE, BYLINE_SELECTORS_RE,
} from './constants' } from './constants';
import { cleanAuthor } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
const GenericAuthorExtractor = { const GenericAuthorExtractor = {
extract({ $, metaCache }) { extract({ $, metaCache }) {
let author let author;
// First, check to see if we have a matching // First, check to see if we have a matching
// meta tag that we can make use of. // meta tag that we can make use of.
author = extractFromMeta($, AUTHOR_META_TAGS, metaCache) author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
if (author && author.length < AUTHOR_MAX_LENGTH) { if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author) return cleanAuthor(author);
} }
// Second, look through our selectors looking for potential authors. // Second, look through our selectors looking for potential authors.
author = extractFromSelectors($, AUTHOR_SELECTORS, 2) author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
if (author && author.length < AUTHOR_MAX_LENGTH) { if (author && author.length < AUTHOR_MAX_LENGTH) {
return cleanAuthor(author) return cleanAuthor(author);
} }
// Last, use our looser regular-expression based selectors for // Last, use our looser regular-expression based selectors for
// potential authors. // potential authors.
for (const [selector, regex] of BYLINE_SELECTORS_RE) { for (const [selector, regex] of BYLINE_SELECTORS_RE) {
const node = $(selector) const node = $(selector);
if (node.length === 1) { if (node.length === 1) {
const text = node.text() const text = node.text();
if (regex.test(text)) { if (regex.test(text)) {
return cleanAuthor(text) return cleanAuthor(text);
} }
} }
} }
return null return null;
} },
} };
export default GenericAuthorExtractor export default GenericAuthorExtractor;

@ -1,46 +1,46 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor' import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => { describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => { describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => { it('extracts author from meta tags', () => {
const $ = cheerio.load(HTML.authorMeta.test) const $ = cheerio.load(HTML.authorMeta.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorMeta.result) assert.equal(result, HTML.authorMeta.result);
}) });
it('extracts author from author selectors', () => { it('extracts author from author selectors', () => {
const $ = cheerio.load(HTML.authorSelectors.test) const $ = cheerio.load(HTML.authorSelectors.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorSelectors.result) assert.equal(result, HTML.authorSelectors.result);
}) });
it('extracts author with regex selectors', () => { it('extracts author with regex selectors', () => {
const $ = cheerio.load(HTML.authorRegSelectors.test) const $ = cheerio.load(HTML.authorRegSelectors.test);
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, HTML.authorRegSelectors.result) assert.equal(result, HTML.authorRegSelectors.result);
}) });
it('returns null if no author found', () => { it('returns null if no author found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const result = GenericAuthorExtractor.extract( const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] } { $, metaCache: ['dc.author', 'something-else'] }
) );
assert.equal(result, null) assert.equal(result, null);
}) });
}) });
}) });

@ -5,7 +5,7 @@ const HTML = {
<meta name="dc.author" value="Adam" /> <meta name="dc.author" value="Adam" />
</html> </html>
`, `,
result: `Adam` result: 'Adam',
}, },
authorSelectors: { authorSelectors: {
test: ` test: `
@ -15,7 +15,7 @@ const HTML = {
</div> </div>
</div> </div>
`, `,
result: `Adam` result: 'Adam',
}, },
authorRegSelectors: { authorRegSelectors: {
test: ` test: `
@ -25,8 +25,8 @@ const HTML = {
</div> </div>
</div> </div>
`, `,
result: `Adam` result: 'Adam',
}, },
} };
export default HTML export default HTML;

@ -1,11 +1,12 @@
import {
scoreContent,
findTopCandidate,
} from './scoring'
import { import {
stripUnlikelyCandidates, stripUnlikelyCandidates,
convertToParagraphs, convertToParagraphs,
} from 'utils/dom' } from 'utils/dom';
import {
scoreContent,
findTopCandidate,
} from './scoring';
// Using a variety of scoring techniques, extract the content most // Using a variety of scoring techniques, extract the content most
// likely to be article text. // likely to be article text.
@ -26,12 +27,12 @@ export default function extractBestNode($, opts) {
if (opts.stripUnlikelyCandidates) { if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($) $ = stripUnlikelyCandidates($);
} }
$ = convertToParagraphs($) $ = convertToParagraphs($);
$ = scoreContent($, opts.weightNodes) $ = scoreContent($, opts.weightNodes);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
return $topCandidate return $topCandidate;
} }

@ -1,24 +1,26 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
// import HTML from './fixtures/html' // import HTML from './fixtures/html'
import extractBestNode from './extract-best-node' import extractBestNode from './extract-best-node';
describe('extractBestNode($, flags)', () => { describe('extractBestNode($, flags)', () => {
it("scores the dom nodes and returns the best option", () => { it('scores the dom nodes and returns the best option', () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
const opts = { const opts = {
stripUnlikelyCandidates: true, stripUnlikelyCandidates: true,
weightNodes: true, weightNodes: true,
} };
let $ = cheerio.load(html) const $ = cheerio.load(html);
const bestNode = extractBestNode($, opts) const bestNode = extractBestNode($, opts);
assert(typeof bestNode, 'object');
// console.log(bestNode.html()) // console.log(bestNode.html())
// assert.equal($(bestNode).text().length, 3652) // assert.equal($(bestNode).text().length, 3652)
}) });
}) });

@ -1,10 +1,11 @@
import cheerio from 'cheerio' import cheerio from 'cheerio';
import 'babel-polyfill' import 'babel-polyfill';
import extractBestNode from './extract-best-node' import { nodeIsSufficient } from 'utils/dom';
import { nodeIsSufficient } from 'utils/dom' import { cleanContent } from 'cleaners';
import { cleanContent } from 'cleaners' import { normalizeSpaces } from 'utils/text';
import { normalizeSpaces } from 'utils/text'
import extractBestNode from './extract-best-node';
const GenericContentExtractor = { const GenericContentExtractor = {
defaultOpts: { defaultOpts: {
@ -33,34 +34,32 @@ const GenericContentExtractor = {
// cleanConditionally: Clean the node to return of some // cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc. // superfluous content. Things like forms, ads, etc.
extract({ $, html, title, url }, opts) { extract({ $, html, title, url }, opts) {
opts = { ...this.defaultOpts, ...opts } opts = { ...this.defaultOpts, ...opts };
$ = $ || cheerio.load(html) $ = $ || cheerio.load(html);
// Cascade through our extraction-specific opts in an ordered fashion, // Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content. // turning them off as we try to extract content.
let node = this.getContentNode($, title, url, opts) let node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) { if (nodeIsSufficient(node)) {
return this.cleanAndReturnNode(node, $) return this.cleanAndReturnNode(node, $);
} else { }
// We didn't succeed on first pass, one by one disable our // We didn't succeed on first pass, one by one disable our
// extraction opts and try again. // extraction opts and try again.
for (const key of Reflect.ownKeys(opts).filter(key => opts[key] === true)) { for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {
opts[key] = false opts[key] = false;
$ = cheerio.load(html) $ = cheerio.load(html);
node = this.getContentNode($, title, url, opts) node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) { if (nodeIsSufficient(node)) {
break break;
} }
} }
return this.cleanAndReturnNode(node, $) return this.cleanAndReturnNode(node, $);
}
return this.cleanAndReturnNode(node, $)
}, },
// Get node given current options // Get node given current options
@ -72,7 +71,7 @@ const GenericContentExtractor = {
cleanConditionally: opts.cleanConditionally, cleanConditionally: opts.cleanConditionally,
title, title,
url, url,
}) });
}, },
// Once we got here, either we're at our last-resort node, or // Once we got here, either we're at our last-resort node, or
@ -80,10 +79,10 @@ const GenericContentExtractor = {
// move forward. // move forward.
cleanAndReturnNode(node, $) { cleanAndReturnNode(node, $) {
if (!node) { if (!node) {
return null return null;
} }
return normalizeSpaces($.html(node)) return normalizeSpaces($.html(node));
// if return_type == "html": // if return_type == "html":
// return normalize_spaces(node_to_html(node)) // return normalize_spaces(node_to_html(node))
@ -91,6 +90,6 @@ const GenericContentExtractor = {
// return node // return node
}, },
} };
export default GenericContentExtractor export default GenericContentExtractor;

@ -1,16 +1,15 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import fs from 'fs';
import fs from 'fs'
import { clean } from 'test-helpers' import { clean } from 'test-helpers';
import GenericContentExtractor from './extractor' import GenericContentExtractor from './extractor';
describe('GenericContentExtractor', function () { describe('GenericContentExtractor', function () {
this.timeout(1000000) this.timeout(1000000);
describe('extract($, html, opts)', () => { describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => { it('extracts html and returns the article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
// Array.from(range(1, 100)).map((i) => { // Array.from(range(1, 100)).map((i) => {
// console.log(i) // console.log(i)
@ -20,15 +19,10 @@ describe('GenericContentExtractor', function() {
// }) // })
const result = clean(GenericContentExtractor.extract( const result = clean(GenericContentExtractor.extract(
{ $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' } { $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' }
)) ));
// console.log(result)
})
})
})
function* range(start = 1, end = 1) { assert(typeof result, 'string');
while (start <= end) { // console.log(result)
yield start++ });
} });
} });

@ -1,15 +1,15 @@
import { import {
getOrInitScore, getOrInitScore,
setScore, setScore,
} from './index' } from './index';
export default function addScore($node, $, amount) { export default function addScore($node, $, amount) {
try { try {
const score = getOrInitScore($node, $) + amount const score = getOrInitScore($node, $) + amount;
setScore($node, $, score) setScore($node, $, score);
} catch (e) { } catch (e) {
console.debug(e) // Ignoring; error occurs in scoreNode
} finally {
return $node
} }
return $node;
} }

@ -1,28 +1,27 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
addScore, addScore,
getScore, getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => { describe('addScore(node, $, amount)', () => {
it(`adds the specified amount to a node's score`, () => { it('adds the specified amount to a node\'s score', () => {
const $ = cheerio.load('<p score="25">Foo</p>') const $ = cheerio.load('<p score="25">Foo</p>');
let $node = $('p').first() let $node = $('p').first();
$node = addScore($node, $, 25) $node = addScore($node, $, 25);
assert.equal(getScore($node), 50) assert.equal(getScore($node), 50);
}) });
it(`adds score if score not yet set (assumes score is 0)`, () => { it('adds score if score not yet set (assumes score is 0)', () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first() let $node = $('p').first();
$node = addScore($node, $, 25) $node = addScore($node, $, 25);
assert.equal(getScore($node), 25) assert.equal(getScore($node), 25);
}) });
});
}) });
})

@ -1,11 +1,11 @@
import { addScore } from './index' import { addScore } from './index';
// Adds 1/4 of a child's score to its parent // Adds 1/4 of a child's score to its parent
export default function addToParent(node, $, score) { export default function addToParent(node, $, score) {
const parent = node.parent() const parent = node.parent();
if (parent) { if (parent) {
addScore(parent, $, score * .25) addScore(parent, $, score * 0.25);
} }
return node return node;
} }

@ -1,24 +1,23 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
addToParent, addToParent,
getScore, getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('addToParent(node, $, amount)', () => { describe('addToParent(node, $, amount)', () => {
it(`adds 1/4 of a node's score it its parent`, () => { it('adds 1/4 of a node\'s score it its parent', () => {
const html = '<div score="25"><p score="40">Foo</p></div>' const html = '<div score="25"><p score="40">Foo</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let $node = $('p').first() let $node = $('p').first();
$node = addToParent($node, $, 40) $node = addToParent($node, $, 40);
assert.equal(getScore($node.parent()), 35) assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40) assert.equal(getScore($node), 40);
}) });
}) });
});
})

@ -42,8 +42,8 @@ export const UNLIKELY_CANDIDATES_BLACKLIST = [
'sidebar', 'sidebar',
'sociable', 'sociable',
'sponsor', 'sponsor',
'tools' 'tools',
] ];
// A list of strings that can be considered LIKELY candidates when // A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the // extracting content from a resource. Essentially, the inverse of the
@ -71,8 +71,8 @@ export const UNLIKELY_CANDIDATES_WHITELIST = [
'main', 'main',
'page', 'page',
'posts', 'posts',
'shadow' 'shadow',
] ];
// A list of tags which, if found inside, should cause a <div /> to NOT // A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements // be turned into a paragraph tag. Shallow div tags without these elements
@ -86,7 +86,7 @@ export const DIV_TO_P_BLOCK_TAGS = [
'p', 'p',
'pre', 'pre',
'table', 'table',
].join(',') ].join(',');
// A list of tags that should be ignored when trying to find the top candidate // A list of tags that should be ignored when trying to find the top candidate
// for a document. // for a document.
@ -103,10 +103,10 @@ export const NON_TOP_CANDIDATE_TAGS = [
'img', 'img',
'link', 'link',
'meta', 'meta',
] ];
export const NON_TOP_CANDIDATE_TAGS_RE = export const NON_TOP_CANDIDATE_TAGS_RE =
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i') new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');
// A list of selectors that specify, very clearly, either hNews or other // A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates. // very content-specific style content, like Blogger templates.
@ -118,53 +118,15 @@ export const HNEWS_CONTENT_SELECTORS = [
['.post', '.postbody'], ['.post', '.postbody'],
['.post', '.post_body'], ['.post', '.post_body'],
['.post', '.post-body'], ['.post', '.post-body'],
] ];
// export const HNEWS_CONTENT_SELECTORS = [
// {
// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['hentry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['entry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'),
// must_exist: {
// classes: ['entry', 'entry_content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'),
// must_exist: {
// classes: ['post', 'post-body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'),
// must_exist: {
// classes: ['post', 'post_body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'),
// must_exist: {
// classes: ['post', 'postbody'],
// }
// },
// ]
export const PHOTO_HINTS = [ export const PHOTO_HINTS = [
'figure', 'figure',
'photo', 'photo',
'image', 'image',
'caption' 'caption',
] ];
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i') export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');
// A list of strings that denote a positive scoring for this content as being // A list of strings that denote a positive scoring for this content as being
@ -190,14 +152,14 @@ export const POSITIVE_SCORE_HINTS = [
'story', 'story',
'text', 'text',
'[-_]copy', // usatoday '[-_]copy', // usatoday
'\Bcopy' '\Bcopy',
] ];
// The above list, joined into a matching regular expression // The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i') export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');
// Readability publisher-specific guidelines // Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i') export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');
// A list of strings that denote a negative scoring for this content as being // A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id. // an article container. Checked against className and id.
@ -258,19 +220,19 @@ export const NEGATIVE_SCORE_HINTS = [
'summary', 'summary',
'tags', 'tags',
'tools', 'tools',
'widget' 'widget',
] ];
// The above list, joined into a matching regular expression // The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i') export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');
// Match a digit. Pretty clear. // Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]') export const DIGIT_RE = new RegExp('[0-9]');
// Match 2 or more consecutive <br> tags // Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i') export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i');
// Match 1 BR tag. // Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i') export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');
// A list of all of the block level tags known in HTML5 and below. Taken from // A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT // http://bit.ly/qneNIT
@ -322,25 +284,25 @@ export const BLOCK_LEVEL_TAGS = [
'tr', 'tr',
'ul', 'ul',
'video', 'video',
] ];
export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i') export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i');
// The removal is implemented as a blacklist and whitelist, this test finds // The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one // blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the // expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes. // serialization for whitelisted nodes.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|') const candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i') export const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|') const candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i') export const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i') export const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i');
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i') export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i') export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
export const BAD_TAGS = new RegExp('^(address|form)$', 'i') export const BAD_TAGS = new RegExp('^(address|form)$', 'i');
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i') export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');

@ -1,115 +1,35 @@
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants' import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index' import { getScore } from './index';
import { import mergeSiblings from './merge-siblings';
textLength,
linkDensity
} from 'utils/dom'
// After we've calculated scores, loop through all of the possible // After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score. // candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) { export default function findTopCandidate($) {
let $candidate, topScore = 0 let $candidate;
let topScore = 0;
$('*[score]').each((index, node) => { $('*[score]').each((index, node) => {
const $node = $(node) const $node = $(node);
// Ignore tags like BR, HR, etc // Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) { if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
return return;
} }
const score = getScore($node) const score = getScore($node);
if (score > topScore) { if (score > topScore) {
topScore = score topScore = score;
$candidate = $node $candidate = $node;
} }
}) });
// If we don't have a candidate, return the body // If we don't have a candidate, return the body
// or whatever the first element is // or whatever the first element is
if (!$candidate) { if (!$candidate) {
return $('body') || $('*').first() return $('body') || $('*').first();
} }
$candidate = mergeSiblings($candidate, topScore, $) $candidate = mergeSiblings($candidate, topScore, $);
return $candidate return $candidate;
}
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
export function mergeSiblings($candidate, topScore, $) {
if (!$candidate.parent().length) {
return $candidate
}
const siblingScoreThreshold = Math.max(10, topScore * 0.2)
let wrappingDiv = $('<div></div>')
$candidate.parent().children().each((index, child) => {
const $child = $(child)
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
return
}
const childScore = getScore($child)
if (childScore) {
if ($child === $candidate) {
wrappingDiv.append($child)
} else {
let contentBonus = 0
// extract to scoreLinkDensity() TODO
const density = linkDensity($child)
// If sibling has a very low link density,
// give it a small bonus
if (density < .05) {
contentBonus = contentBonus + 20
}
// If sibling has a high link density,
// give it a penalty
if (density >= 0.5) {
contentBonus = contentBonus - 20
}
// If sibling node has the same class as
// candidate, give it a bonus
if ($child.attr('class') === $candidate.attr('class')) {
contentBonus = contentBonus + topScore * .2
}
const newScore = getScore($child) + contentBonus
if (newScore >= siblingScoreThreshold) {
return wrappingDiv.append($child)
} else if (child.tagName === 'p') {
const childContentLength = textLength($child.text())
if (childContentLength > 80 && density < .25) {
return wrappingDiv.append($child)
} else if (childContentLength <= 80 && density === 0 &&
hasSentenceEnd(childContent)) {
return wrappingDiv.append($child)
}
}
}
}
})
return wrappingDiv
}
// TODO Extract into util - AP
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
const SENTENCE_END_RE = new RegExp('\.( |$)')
function hasSentenceEnd(text) {
return SENTENCE_END_RE.test(text)
} }

@ -1,58 +1,58 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
getScore, getScore,
findTopCandidate, findTopCandidate,
scoreContent scoreContent,
} from './index' } from './index';
describe('findTopCandidate($)', () => { describe('findTopCandidate($)', () => {
it("finds the top candidate from simple case", () => { it('finds the top candidate from simple case', () => {
const $ = cheerio.load(HTML.findDom1) const $ = cheerio.load(HTML.findDom1);
const $$topCandidate = findTopCandidate($) const $$topCandidate = findTopCandidate($);
assert.equal(getScore($$topCandidate), 100) assert.equal(getScore($$topCandidate), 100);
}) });
it("finds the top candidate from a nested case", () => { it('finds the top candidate from a nested case', () => {
const $ = cheerio.load(HTML.findDom2) const $ = cheerio.load(HTML.findDom2);
const $$topCandidate = findTopCandidate($) const $$topCandidate = findTopCandidate($);
// this is wrapped in a div so checking // this is wrapped in a div so checking
// the score of the first child // the score of the first child
assert.equal(getScore($$topCandidate.children().first()), 50) assert.equal(getScore($$topCandidate.children().first()), 50);
}) });
it("ignores tags like BR", () => { it('ignores tags like BR', () => {
const $ = cheerio.load(HTML.findDom3) const $ = cheerio.load(HTML.findDom3);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal(getScore($topCandidate), 50) assert.equal(getScore($topCandidate), 50);
}) });
it("returns BODY if no candidates found", () => { it('returns BODY if no candidates found', () => {
const $ = cheerio.load(HTML.topBody) const $ = cheerio.load(HTML.topBody);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal($topCandidate.get(0).tagName, 'body') assert.equal($topCandidate.get(0).tagName, 'body');
}) });
it("appends a sibling with a good enough score", () => { it('appends a sibling with a good enough score', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
let $ = cheerio.load(html) let $ = cheerio.load(html);
$ = scoreContent($) $ = scoreContent($);
const $topCandidate = findTopCandidate($) const $topCandidate = findTopCandidate($);
assert.equal($($topCandidate).text().length, 3652) assert.equal($($topCandidate).text().length, 3652);
}) });
}) });

@ -237,7 +237,7 @@ const HTML = {
`, `,
after: ` after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div> <div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
` `,
}, },
// cleanImages // cleanImages
@ -252,7 +252,7 @@ const HTML = {
<div> <div>
<img width="50"> <img width="50">
</div> </div>
` `,
}, },
cleanHeight: { cleanHeight: {
before: ` before: `
@ -264,7 +264,7 @@ const HTML = {
<div> <div>
<img width="50"> <img width="50">
</div> </div>
` `,
}, },
cleanSpacer: { cleanSpacer: {
before: ` before: `
@ -279,7 +279,7 @@ const HTML = {
<img src="/foo/bar/baz/normal.png"> <img src="/foo/bar/baz/normal.png">
<p>Some text</p> <p>Some text</p>
</div> </div>
` `,
}, },
// stripJunkTags // stripJunkTags
stripsJunk: { stripsJunk: {
@ -298,7 +298,7 @@ const HTML = {
<div> <div>
<p>What an article</p> <p>What an article</p>
</div> </div>
` `,
}, },
// stripHOnes // stripHOnes
@ -314,7 +314,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
convertThreeHOnes: { convertThreeHOnes: {
before: ` before: `
@ -334,7 +334,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<h2>Can you believe it?!</h2> <h2>Can you believe it?!</h2>
</div> </div>
` `,
}, },
// cleanAttributes // cleanAttributes
@ -348,7 +348,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeAlign: { removeAlign: {
before: ` before: `
@ -360,7 +360,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
// removeEmpty // removeEmpty
@ -375,7 +375,7 @@ const HTML = {
<div> <div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
doNotRemoveBr: { doNotRemoveBr: {
before: ` before: `
@ -392,7 +392,7 @@ const HTML = {
<div></div> <div></div>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
doNotNested: { doNotNested: {
before: ` before: `
@ -409,7 +409,7 @@ const HTML = {
<p><img src="foo/bar.jpg" /></p> <p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
// cleanConditionally // cleanConditionally
@ -433,7 +433,7 @@ const HTML = {
</p> </p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeTooManyInputs: { removeTooManyInputs: {
before: ` before: `
@ -467,7 +467,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
removeShortNoImg: { removeShortNoImg: {
before: ` before: `
@ -490,7 +490,7 @@ const HTML = {
<img src="asdf"> <img src="asdf">
</div> </div>
</div> </div>
` `,
}, },
linkDensityHigh: { linkDensityHigh: {
@ -527,7 +527,7 @@ const HTML = {
<li>Keep this one</li> <li>Keep this one</li>
</ul> </ul>
</div> </div>
` `,
}, },
goodScoreTooDense: { goodScoreTooDense: {
before: ` before: `
@ -567,7 +567,7 @@ const HTML = {
<li>Keep this one</li> <li>Keep this one</li>
</ul> </ul>
</div> </div>
` `,
}, },
previousEndsInColon: { previousEndsInColon: {
before: ` before: `
@ -608,7 +608,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
`, `,
after: `What do you think?` after: 'What do you think?',
}, },
// cleanHeaders // cleanHeaders
@ -627,7 +627,7 @@ const HTML = {
<h2>Keep me</h2> <h2>Keep me</h2>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
cleanTitleMatch: { cleanTitleMatch: {
before: ` before: `
@ -642,7 +642,7 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
dropWithNegativeWeight: { dropWithNegativeWeight: {
before: ` before: `
@ -657,8 +657,8 @@ const HTML = {
<p>What do you think?</p> <p>What do you think?</p>
<p>What do you think?</p> <p>What do you think?</p>
</div> </div>
` `,
}, },
} };
export default HTML export default HTML;

@ -82,6 +82,6 @@ const HTML = {
</article> </article>
<body> <body>
`, `,
} };
export default HTML export default HTML;

@ -3,27 +3,26 @@ import {
scoreNode, scoreNode,
getWeight, getWeight,
addToParent, addToParent,
} from './index' } from './index';
// gets and returns the score if it exists // gets and returns the score if it exists
// if not, initializes a score based on // if not, initializes a score based on
// the node's tag type // the node's tag type
export default function getOrInitScore($node, $, weightNodes = true) { export default function getOrInitScore($node, $, weightNodes = true) {
let score = getScore($node) let score = getScore($node);
if (score) { if (score) {
return score return score;
} else { }
score = scoreNode($node)
score = scoreNode($node);
if (weightNodes) { if (weightNodes) {
score = score + getWeight($node) score += getWeight($node);
} }
addToParent($node, $, score) addToParent($node, $, score);
}
return score return score;
} }

@ -1,61 +1,61 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
getOrInitScore, getOrInitScore,
getScore, getScore,
} from './index' } from './index';
describe('getOrInitScore(node, $)', () => { describe('getOrInitScore(node, $)', () => {
describe('when score set', () => { describe('when score set', () => {
it(`returns score if node's score already set`, () => { it('returns score if node\'s score already set', () => {
const html = '<p score="40">Foo</p>' const html = '<p score="40">Foo</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 40) assert.equal(score, 40);
}) });
}) });
describe('when no score set', () => { describe('when no score set', () => {
it(`returns 0 if no class/id and text < 25 chars`, () => { it('returns 0 if no class/id and text < 25 chars', () => {
const html = '<p>Foo</p>' const html = '<p>Foo</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`returns score if no class/id and has commas/length`, () => { it('returns score if no class/id and has commas/length', () => {
const $ = cheerio.load(HTML.score19) const $ = cheerio.load(HTML.score19);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 19) assert.equal(score, 19);
}) });
it(`returns greater score if weighted class/id is set`, () => { it('returns greater score if weighted class/id is set', () => {
const $ = cheerio.load(HTML.score44) const $ = cheerio.load(HTML.score44);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) const score = getOrInitScore(node, $);
assert.equal(score, 44) assert.equal(score, 44);
}) });
it(`gives 1/4 of its score to its parent`, () => { it('gives 1/4 of its score to its parent', () => {
const $ = cheerio.load(HTML.score44Parent) const $ = cheerio.load(HTML.score44Parent);
const node = $('p').first() const node = $('p').first();
const score = getOrInitScore(node, $) getOrInitScore(node, $);
assert.equal(getScore(node.parent()), 16) assert.equal(getScore(node.parent()), 16);
}) });
}) });
}) });

@ -2,5 +2,5 @@
// the node's score attribute // the node's score attribute
// returns null if no score set // returns null if no score set
export default function getScore($node) { export default function getScore($node) {
return parseFloat($node.attr('score')) || null return parseFloat($node.attr('score')) || null;
} }

@ -1,25 +1,22 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { getScore } from './index' import { getScore } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('getScore($node)', () => { describe('getScore($node)', () => {
it("returns null if the node has no score set", () => { it('returns null if the node has no score set', () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
const $node = $('p').first() const $node = $('p').first();
assert.equal(getScore($node), null) assert.equal(getScore($node), null);
}) });
it("returns 25 if the node has a score attr of 25", () => { it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('<p score="25">Foo</p>') const $ = cheerio.load('<p score="25">Foo</p>');
const $node = $('p').first() const $node = $('p').first();
assert.equal(typeof getScore($node), 'number') assert.equal(typeof getScore($node), 'number');
assert.equal(getScore($node), 25) assert.equal(getScore($node), 25);
}) });
});
}) });
})

@ -3,34 +3,34 @@ import {
POSITIVE_SCORE_RE, POSITIVE_SCORE_RE,
PHOTO_HINTS_RE, PHOTO_HINTS_RE,
READABILITY_ASSET, READABILITY_ASSET,
} from './constants' } from './constants';
// Get the score of a node based on its className and id. // Get the score of a node based on its className and id.
export default function getWeight(node) { export default function getWeight(node) {
const classes = node.attr('class') const classes = node.attr('class');
const id = node.attr('id') const id = node.attr('id');
let score = 0 let score = 0;
if (id) { if (id) {
// if id exists, try to score on both positive and negative // if id exists, try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(id)) { if (POSITIVE_SCORE_RE.test(id)) {
score = score + 25 score += 25;
} }
if (NEGATIVE_SCORE_RE.test(id)) { if (NEGATIVE_SCORE_RE.test(id)) {
score = score - 25 score -= 25;
} }
} }
if (classes) { if (classes) {
if (score == 0) { if (score === 0) {
// if classes exist and id did not contribute to score // if classes exist and id did not contribute to score
// try to score on both positive and negative // try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(classes)) { if (POSITIVE_SCORE_RE.test(classes)) {
score = score + 25 score += 25;
} }
if (NEGATIVE_SCORE_RE.test(classes)) { if (NEGATIVE_SCORE_RE.test(classes)) {
score = score - 25 score -= 25;
} }
} }
@ -38,7 +38,7 @@ export default function getWeight(node) {
// possible photo matches // possible photo matches
// "try to keep photos if we can" // "try to keep photos if we can"
if (PHOTO_HINTS_RE.test(classes)) { if (PHOTO_HINTS_RE.test(classes)) {
score = score + 10 score += 10;
} }
// add 25 if class matches entry-content-asset, // add 25 if class matches entry-content-asset,
@ -46,11 +46,10 @@ export default function getWeight(node) {
// Readability publisher guidelines // Readability publisher guidelines
// https://www.readability.com/developers/guidelines // https://www.readability.com/developers/guidelines
if (READABILITY_ASSET.test(classes)) { if (READABILITY_ASSET.test(classes)) {
score = score + 25 score += 25;
} }
} }
return score return score;
} }

@ -1,59 +1,58 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/get-weight' import HTML from './fixtures/get-weight';
import { import {
getWeight getWeight,
} from './index' } from './index';
describe('Generic Extractor Utils', () => { describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => { describe('getWeight(node)', () => {
it("returns a score of 25 if node has positive id", () => { it('returns a score of 25 if node has positive id', () => {
const $ = cheerio.load(HTML.positiveId) const $ = cheerio.load(HTML.positiveId);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of -25 if node has negative id", () => { it('returns a score of -25 if node has negative id', () => {
const $ = cheerio.load(HTML.negativeId) const $ = cheerio.load(HTML.negativeId);
assert.equal(getWeight($('div')), -25) assert.equal(getWeight($('div')), -25);
}) });
it("returns a score of 25 if node has positive class", () => { it('returns a score of 25 if node has positive class', () => {
const $ = cheerio.load(HTML.positiveClass) const $ = cheerio.load(HTML.positiveClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of -25 if node has negative class", () => { it('returns a score of -25 if node has negative class', () => {
const $ = cheerio.load(HTML.negativeClass) const $ = cheerio.load(HTML.negativeClass);
assert.equal(getWeight($('div')), -25) assert.equal(getWeight($('div')), -25);
}) });
it("returns a score of 25 if node has both positive id and class", () => { it('returns a score of 25 if node has both positive id and class', () => {
const $ = cheerio.load(HTML.positiveIdAndClass) const $ = cheerio.load(HTML.positiveIdAndClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of 25 if node has pos id and neg class", () => { it('returns a score of 25 if node has pos id and neg class', () => {
// is this really wanted? id="entry" class="adbox" // is this really wanted? id="entry" class="adbox"
// should get positive score? // should get positive score?
const $ = cheerio.load(HTML.positiveIdNegClass) const $ = cheerio.load(HTML.positiveIdNegClass);
assert.equal(getWeight($('div')), 25) assert.equal(getWeight($('div')), 25);
}) });
it("returns a score of 10 if node has pos img class", () => { it('returns a score of 10 if node has pos img class', () => {
const $ = cheerio.load(HTML.positivePhotoClass) const $ = cheerio.load(HTML.positivePhotoClass);
assert.equal(getWeight($('div')), 10) assert.equal(getWeight($('div')), 10);
}) });
it("returns a score of 35 if node has pos id pos img class", () => { it('returns a score of 35 if node has pos id pos img class', () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto) const $ = cheerio.load(HTML.positiveIdAndPhoto);
assert.equal(getWeight($('div')), 35) assert.equal(getWeight($('div')), 35);
}) });
it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => { it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
const $ = cheerio.load(HTML.entryContentAsset) const $ = cheerio.load(HTML.entryContentAsset);
assert.equal(getWeight($('div')), 50) assert.equal(getWeight($('div')), 50);
}) });
});
}) });
})

@ -1,13 +1,13 @@
// Scoring // Scoring
export { default as getWeight } from './get-weight' export { default as getWeight } from './get-weight';
export { default as getScore } from './get-score' export { default as getScore } from './get-score';
export { default as scoreCommas } from './score-commas' export { default as scoreCommas } from './score-commas';
export { default as scoreLength } from './score-length' export { default as scoreLength } from './score-length';
export { default as scoreParagraph } from './score-paragraph' export { default as scoreParagraph } from './score-paragraph';
export { default as setScore } from './set-score' export { default as setScore } from './set-score';
export { default as addScore } from './add-score' export { default as addScore } from './add-score';
export { default as addToParent } from './add-to-parent' export { default as addToParent } from './add-to-parent';
export { default as getOrInitScore } from './get-or-init-score' export { default as getOrInitScore } from './get-or-init-score';
export { default as scoreNode } from './score-node' export { default as scoreNode } from './score-node';
export { default as scoreContent } from './score-content' export { default as scoreContent } from './score-content';
export { default as findTopCandidate } from './find-top-candidate' export { default as findTopCandidate } from './find-top-candidate';

@ -0,0 +1,79 @@
import {
textLength,
linkDensity,
} from 'utils/dom';
import { hasSentenceEnd } from 'utils/text';
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//
// Returns the $candidate unchanged when it has no parent; otherwise
// returns a new wrapping <div> containing the candidate plus any
// siblings that scored well enough to be considered part of the content.
export default function mergeSiblings($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }

  // A sibling must score at least 20% of the top score (floor of 10)
  // to be merged in on score alone.
  const siblingScoreThreshold = Math.max(10, topScore * 0.2);
  const wrappingDiv = $('<div></div>');

  $candidate.parent().children().each((index, child) => {
    const $child = $(child);
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
      return null;
    }

    const childScore = getScore($child);
    if (childScore) {
      // NOTE: $(child) creates a fresh cheerio wrapper on every call, so
      // comparing wrappers ($child === $candidate) is always false. Compare
      // the underlying DOM node instead so the candidate itself is always
      // kept, even when topScore is low enough that it would fail the
      // sibling threshold below.
      if (child === $candidate.get(0)) {
        wrappingDiv.append($child);
      } else {
        let contentBonus = 0;
        // extract to scoreLinkDensity() TODO
        const density = linkDensity($child);

        // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        }

        // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        }

        // If sibling node has the same class as
        // candidate, give it a bonus
        if ($child.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }

        const newScore = getScore($child) + contentBonus;

        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($child);
        } else if (child.tagName === 'p') {
          // Short paragraphs can still be merged if they look like real
          // prose: long enough with few links, or short but link-free and
          // ending in a sentence.
          const childContent = $child.text();
          const childContentLength = textLength(childContent);

          if (childContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($child);
          } else if (childContentLength <= 80 && density === 0 &&
                     hasSentenceEnd(childContent)) {
            return wrappingDiv.append($child);
          }
        }
      }
    }

    return null;
  });

  return wrappingDiv;
}

@ -1,5 +1,5 @@
// return 1 for every comma in text // return 1 for every comma in text
export default function scoreCommas(text) { export default function scoreCommas(text) {
return (text.match(/,/g) || []).length return (text.match(/,/g) || []).length;
} }

@ -1,20 +1,18 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { scoreCommas } from './index' import { scoreCommas } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreCommas(text)', () => { describe('scoreCommas(text)', () => {
it(`returns 0 if text has no commas`, () => { it('returns 0 if text has no commas', () => {
assert.equal(scoreCommas("Foo bar"), 0) assert.equal(scoreCommas('Foo bar'), 0);
}) });
it(`returns a point for every comma in the text`, () => {
assert.equal(scoreCommas('Foo, bar'), 1)
assert.equal(scoreCommas('Foo, bar, baz'), 2)
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3)
})
})
})
it('returns a point for every comma in the text', () => {
assert.equal(scoreCommas('Foo, bar'), 1);
assert.equal(scoreCommas('Foo, bar, baz'), 2);
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3);
});
});
});

@ -1,119 +1,69 @@
import { HNEWS_CONTENT_SELECTORS } from './constants' import { convertNodeTo } from 'utils/dom';
import { HNEWS_CONTENT_SELECTORS } from './constants';
import { import {
scoreNode, scoreNode,
setScore, setScore,
getOrInitScore, getOrInitScore,
addScore, addScore,
} from './index' } from './index';
import { convertNodeTo } from 'utils/dom' function convertSpans($node, $) {
if ($node.get(0)) {
// score content. Parents get the full value of their children's const { tagName } = $node.get(0);
// content score, grandparents half
export default function scoreContent($, weightNodes=true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80)
})
})
scorePs($, weightNodes) if (tagName === 'span') {
// convert spans to divs
convertNodeTo($node, $, 'div');
}
}
}
return $ function addScoreTo($node, $, score) {
if ($node) {
convertSpans($node, $);
addScore($node, $, score);
}
} }
function scorePs($, weightNodes) { function scorePs($, weightNodes) {
$('p, pre').toArray().map((node) => { $('p, pre').toArray().map((node) => {
// The raw score for this paragraph, before we add any parent/child // The raw score for this paragraph, before we add any parent/child
// scores. // scores.
let $node = $(node) let $node = $(node);
$node = setScore($node, $, getOrInitScore($node, $, weightNodes)) $node = setScore($node, $, getOrInitScore($node, $, weightNodes));
return $node return $node;
}).forEach(($node) => { }).forEach(($node) => {
// The parent scoring has to be done in a separate loop // The parent scoring has to be done in a separate loop
// because otherwise scoring the parent overwrites // because otherwise scoring the parent overwrites
// the score added to the child // the score added to the child
// Add the individual content score to the parent node // Add the individual content score to the parent node
const rawScore = scoreNode($node) const rawScore = scoreNode($node);
const $parent = $node.parent() const $parent = $node.parent();
addScoreTo($parent, $, rawScore, weightNodes) addScoreTo($parent, $, rawScore, weightNodes);
if ($parent) { if ($parent) {
// Add half of the individual content score to the // Add half of the individual content score to the
// grandparent // grandparent
addScoreTo($parent.parent(), $, rawScore/2, weightNodes) addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
} }
});
})
} }
function convertSpans($node, $) { // score content. Parents get the full value of their children's
if ($node.get(0)) { // content score, grandparents half
const { tagName } = $node.get(0) export default function scoreContent($, weightNodes = true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80);
});
});
if (tagName === 'span') { scorePs($, weightNodes);
// convert spans to divs
convertNodeTo($node, $, 'div')
}
}
}
function addScoreTo($node, $, score, weightNodes) { return $;
if ($node) {
convertSpans($node, $)
addScore($node, $, score)
}
} }
// def _score_content(self, doc, weight_nodes=True):
// for selector in constants.HNEWS_CONTENT_SELECTORS:
// # Not self.resource.extract_by_selector because our doc is a copy
// # of the resource doc.
// nodes = extract_by_selector(doc, selector,
// AttribMap(doc))
// for node in nodes:
// self._add_score(node, 80)
//
// paras = doc.xpath('.//p | .//pre')
//
// # If we don't have any paragraphs at all, we can't score based on
// # paragraphs, so return without modifying anything else.
// if len(paras) == 0:
// return doc
//
// for para in paras:
// # Don't score invalid tags
// if not isinstance(para.tag, basestring):
// continue
//
// # The raw score for this paragraph, before we add any parent/child
// # scores.
// raw_score = self._score_node(para)
// self._set_score(para, self._get_score(para, weight_nodes))
//
// parent = para.getparent()
// if parent is not None:
// if parent.tag == 'span':
// parent.tag = 'div'
//
// # Add the individual content score to the parent node
// self._add_score(parent, raw_score, weight_nodes=weight_nodes)
//
// grandparent = parent.getparent()
// if grandparent is not None:
// if grandparent.tag == 'span':
// grandparent.tag = 'div'
//
// # Add half of the individual content score to the
// # grandparent
// gp_score = raw_score / 2.0
// self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
//
// return doc

@ -1,47 +1,45 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import { clean } from 'test-helpers' import HTML from './fixtures/html';
import HTML from './fixtures/html'
import { import {
scoreContent, scoreContent,
getScore, getScore,
} from './index' } from './index';
// TODO: Walk through these and sanity check my scores // TODO: Walk through these and sanity check my scores
// Commented out scores were what I expected, but I was also // Commented out scores were what I expected, but I was also
// probably missing something when calculating // probably missing something when calculating
describe('scoreContent($, weightNodes)', () => { describe('scoreContent($, weightNodes)', () => {
it("loves hNews content", () => { it('loves hNews content', () => {
const $ = cheerio.load(HTML.hNews.before) const $ = cheerio.load(HTML.hNews.before);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('div').first()), 140) assert.equal(getScore($('div').first()), 140);
}) });
it("is so-so about non-hNews content", () => { it('is so-so about non-hNews content', () => {
const $ = cheerio.load(HTML.nonHNews.before) const $ = cheerio.load(HTML.nonHNews.before);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('div').first()), 65) assert.equal(getScore($('div').first()), 65);
}) });
it("scores this Wired article the same", () => { it('scores this Wired article the same', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const result = scoreContent($).html() scoreContent($).html();
assert.equal(getScore($('article').first()), 65.5) assert.equal(getScore($('article').first()), 65.5);
}) });
it("scores this Vulture article", () => { it('scores this Vulture article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
let $ = cheerio.load(html) let $ = cheerio.load(html);
$ = scoreContent($) $ = scoreContent($);
assert.equal($('p[score]').length, 62) assert.equal($('p[score]').length, 62);
}) });
});
})

@ -1,11 +1,10 @@
const idkRe = new RegExp('^(p|pre)$', 'i') const idkRe = new RegExp('^(p|pre)$', 'i');
export default function scoreLength(textLength, tagName = 'p') { export default function scoreLength(textLength, tagName = 'p') {
let score const chunks = textLength / 50;
const chunks = textLength / 50
if (chunks > 0) { if (chunks > 0) {
let lengthBonus let lengthBonus;
// No idea why p or pre are being tamped down here // No idea why p or pre are being tamped down here
// but just following the source for now // but just following the source for now
@ -13,14 +12,14 @@ export default function scoreLength(textLength, tagName='p') {
// since this is only being called from the context // since this is only being called from the context
// of scoreParagraph // of scoreParagraph
if (idkRe.test(tagName)) { if (idkRe.test(tagName)) {
lengthBonus = chunks - 2 lengthBonus = chunks - 2;
} else { } else {
lengthBonus = chunks - 1.25 lengthBonus = chunks - 1.25;
} }
return Math.min(Math.max(lengthBonus, 0), 3) return Math.min(Math.max(lengthBonus, 0), 3);
} else {
return 0
} }
return 0;
} }

@ -1,22 +1,21 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio'
import { scoreLength } from './index' import { scoreLength } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreLength(textLength, tagName)', () => { describe('scoreLength(textLength, tagName)', () => {
it(`returns 0 if length < 50 chars`, () => { it('returns 0 if length < 50 chars', () => {
assert.equal(scoreLength(30), 0) assert.equal(scoreLength(30), 0);
}) });
it(`returns varying scores but maxes out at 3`, () => { it('returns varying scores but maxes out at 3', () => {
assert.equal(scoreLength(150), 1) assert.equal(scoreLength(150), 1);
assert.equal(scoreLength(199), 1.98) assert.equal(scoreLength(199), 1.98);
assert.equal(scoreLength(200), 2) assert.equal(scoreLength(200), 2);
assert.equal(scoreLength(250), 3) assert.equal(scoreLength(250), 3);
assert.equal(scoreLength(500), 3) assert.equal(scoreLength(500), 3);
assert.equal(scoreLength(1500), 3) assert.equal(scoreLength(1500), 3);
}) });
}) });
}) });

@ -1,29 +1,29 @@
import { scoreParagraph } from './index' import { scoreParagraph } from './index';
import { import {
PARAGRAPH_SCORE_TAGS, PARAGRAPH_SCORE_TAGS,
CHILD_CONTENT_TAGS, CHILD_CONTENT_TAGS,
BAD_TAGS, BAD_TAGS,
} from './constants' } from './constants';
// Score an individual node. Has some smarts for paragraphs, otherwise // Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag. // just scores based on tag.
export default function scoreNode($node) { export default function scoreNode($node) {
const { tagName } = $node.get(0) const { tagName } = $node.get(0);
// TODO: Consider ordering by most likely. // TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page, // E.g., if divs are a more common tag on a page,
// Could save doing that regex test on every node AP // Could save doing that regex test on every node AP
if (PARAGRAPH_SCORE_TAGS.test(tagName)) { if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
return scoreParagraph($node) return scoreParagraph($node);
} else if (tagName === 'div') { } else if (tagName === 'div') {
return 5 return 5;
} else if (CHILD_CONTENT_TAGS.test(tagName)) { } else if (CHILD_CONTENT_TAGS.test(tagName)) {
return 3 return 3;
} else if (BAD_TAGS.test(tagName)) { } else if (BAD_TAGS.test(tagName)) {
return -3 return -3;
} else if (tagName === 'th') { } else if (tagName === 'th') {
return -5 return -5;
} }
return 0 return 0;
} }

@ -1,95 +1,94 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
scoreNode, scoreNode,
scoreParagraph, scoreParagraph,
} from './index' } from './index';
describe('scoreNode(node)', () => { describe('scoreNode(node)', () => {
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const html = '<p><em>Foo</em> bar</p>' const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let node = $('p').first() const node = $('p').first();
const score = scoreNode(node) const score = scoreNode(node);
const pScore = scoreParagraph(node) const pScore = scoreParagraph(node);
assert.equal(score, pScore) assert.equal(score, pScore);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score1) const $ = cheerio.load(HTML.score1);
let node = $('p').first() const node = $('p').first();
const score = scoreNode(node) const score = scoreNode(node);
const pScore = scoreParagraph(node) const pScore = scoreParagraph(node);
assert.equal(score, pScore) assert.equal(score, pScore);
assert.equal(score, 1) assert.equal(score, 1);
});
}) it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score3);
const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.score3) const pScore = scoreParagraph(node);
let node = $('p').first()
const score = scoreNode(node) assert.equal(score, pScore);
const pScore = scoreParagraph(node) assert.equal(score, 3);
});
assert.equal(score, pScore) it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
assert.equal(score, 3) const $ = cheerio.load(HTML.score19);
}) const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.score19) const pScore = scoreParagraph(node);
let node = $('p').first()
const score = scoreNode(node) assert.equal(score, pScore);
const pScore = scoreParagraph(node) assert.equal(score, 19);
});
assert.equal(score, pScore) it('scores divs with 5', () => {
assert.equal(score, 19) const $ = cheerio.load(HTML.divScore5);
}) const node = $('div').first();
it(`scores divs with 5`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.divScore5)
let node = $('div').first()
const score = scoreNode(node) assert.equal(score, 5);
});
assert.equal(score, 5) it('scores the blockquote family with 3', () => {
}) const $ = cheerio.load(HTML.blockquoteScore3);
const node = $('blockquote').first();
it(`scores the blockquote family with 3`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.blockquoteScore3)
let node = $('blockquote').first()
const score = scoreNode(node) assert.equal(score, 3);
});
assert.equal(score, 3) it('scores a form with negative 3', () => {
}) const $ = cheerio.load(HTML.formScoreNeg3);
const node = $('form').first();
it(`scores a form with negative 3`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.formScoreNeg3)
let node = $('form').first()
const score = scoreNode(node) assert.equal(score, -3);
});
assert.equal(score, -3) it('scores a TH element with negative 5', () => {
}) const $ = cheerio.load(HTML.thScoreNeg5);
const node = $('th').first();
it(`scores a TH element with negative 5`, () => { const score = scoreNode(node);
const $ = cheerio.load(HTML.thScoreNeg5)
let node = $('th').first()
const score = scoreNode(node) assert.equal(score, -5);
});
assert.equal(score, -5) });
})
})

@ -1,35 +1,35 @@
import { import {
scoreCommas, scoreCommas,
scoreLength, scoreLength,
} from './index' } from './index';
// Score a paragraph using various methods. Things like number of // Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better. // commas, etc. Higher is better.
export default function scoreParagraph(node) { export default function scoreParagraph(node) {
let score = 1 let score = 1;
const text = node.text().trim() const text = node.text().trim();
const textLength = text.length const textLength = text.length;
// If this paragraph is less than 25 characters, don't count it. // If this paragraph is less than 25 characters, don't count it.
if (textLength < 25) { if (textLength < 25) {
return 0 return 0;
} }
// Add points for any commas within this paragraph // Add points for any commas within this paragraph
score = score + scoreCommas(text) score += scoreCommas(text);
// For every 50 characters in this paragraph, add another point. Up // For every 50 characters in this paragraph, add another point. Up
// to 3 points. // to 3 points.
score = score + scoreLength(textLength) score += scoreLength(textLength);
// Articles can end with short paragraphs when people are being clever // Articles can end with short paragraphs when people are being clever
// but they can also end with short paragraphs setting up lists of junk // but they can also end with short paragraphs setting up lists of junk
// that we strip. This negative tweaks junk setup paragraphs just below // that we strip. This negative tweaks junk setup paragraphs just below
// the cutoff threshold. // the cutoff threshold.
if (text.slice(-1) === ':') { if (text.slice(-1) === ':') {
score = score - 1 score -= 1;
} }
return score return score;
} }

@ -1,48 +1,48 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import { import {
scoreParagraph, scoreParagraph,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('scoreParagraph(node)', () => { describe('scoreParagraph(node)', () => {
it(`returns 0 if text is less than 25 chars`, () => { it('returns 0 if text is less than 25 chars', () => {
const html = '<p><em>Foo</em> bar</p>' const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 0) assert.equal(score, 0);
}) });
it(`returns 1 if text is > 25 chars and has 0 commas`, () => { it('returns 1 if text is > 25 chars and has 0 commas', () => {
const $ = cheerio.load(HTML.score1) const $ = cheerio.load(HTML.score1);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 1) assert.equal(score, 1);
}) });
it(`returns 3 if text is > 25 chars and has 2 commas`, () => { it('returns 3 if text is > 25 chars and has 2 commas', () => {
const $ = cheerio.load(HTML.score3) const $ = cheerio.load(HTML.score3);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 3) assert.equal(score, 3);
}) });
it(`returns 19 if text has 15 commas, ~600 chars`, () => { it('returns 19 if text has 15 commas, ~600 chars', () => {
const $ = cheerio.load(HTML.score19) const $ = cheerio.load(HTML.score19);
let node = $('p').first() const node = $('p').first();
const score = scoreParagraph(node) const score = scoreParagraph(node);
assert.equal(score, 19) assert.equal(score, 19);
}) });
}) });
}) });

@ -1,7 +1,6 @@
export default function setScore($node, $, score) { export default function setScore($node, $, score) {
$node.attr('score', score) $node.attr('score', score);
return $node return $node;
} }

@ -1,23 +1,22 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
setScore, setScore,
getScore getScore,
} from './index' } from './index';
describe('Scoring utils', () => { describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => { describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => { it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>') const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first() let $node = $('p').first();
const newScore = 25 const newScore = 25;
$node = setScore($node, $, newScore) $node = setScore($node, $, newScore);
const score = getScore($node) const score = getScore($node);
assert(score, newScore) assert(score, newScore);
}) });
}) });
}) });

@ -18,8 +18,8 @@ export const DATE_PUBLISHED_META_TAGS = [
'content_create_date', 'content_create_date',
'lastmodified', 'lastmodified',
'created', 'created',
'date' 'date',
] ];
// An ordered list of XPath Selectors to find // An ordered list of XPath Selectors to find
// likely date published dates. From most explicit // likely date published dates. From most explicit
@ -42,12 +42,12 @@ export const DATE_PUBLISHED_SELECTORS = [
'#story .datetime', '#story .datetime',
'.dateline', '.dateline',
'.pubdate', '.pubdate',
] ];
// An ordered list of compiled regular expressions to find likely date // An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first // published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse // reference be a date string that is parseable by dateutil.parser.parse
const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)' const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
export const DATE_PUBLISHED_URL_RES = [ export const DATE_PUBLISHED_URL_RES = [
// /2012/01/27/ but not /2012/01/293 // /2012/01/27/ but not /2012/01/293
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
@ -56,6 +56,6 @@ export const DATE_PUBLISHED_URL_RES = [
// 2012-01-27 // 2012-01-27
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
// /2012/jan/27/ // /2012/jan/27/
new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i') new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'),
] ];

@ -1,37 +1,36 @@
import { cleanDatePublished } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { extractFromUrl } from 'utils/text';
import { import {
DATE_PUBLISHED_META_TAGS, DATE_PUBLISHED_META_TAGS,
DATE_PUBLISHED_SELECTORS, DATE_PUBLISHED_SELECTORS,
DATE_PUBLISHED_URL_RES, DATE_PUBLISHED_URL_RES,
} from './constants' } from './constants';
import { cleanDatePublished } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom'
import { extractFromUrl } from 'utils/text'
const GenericDatePublishedExtractor = { const GenericDatePublishedExtractor = {
extract({ $, url, metaCache }) { extract({ $, url, metaCache }) {
let datePublished let datePublished;
// First, check to see if we have a matching meta tag // First, check to see if we have a matching meta tag
// that we can make use of. // that we can make use of.
// Don't try cleaning tags from this string // Don't try cleaning tags from this string
datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false) datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
// Second, look through our selectors looking for potential // Second, look through our selectors looking for potential
// date_published's. // date_published's.
datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS) datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
// Lastly, look to see if a dately string exists in the URL // Lastly, look to see if a dately string exists in the URL
datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES) datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
if(datePublished) return cleanDatePublished(datePublished) if (datePublished) return cleanDatePublished(datePublished);
return null return null;
} },
} };
export default GenericDatePublishedExtractor export default GenericDatePublishedExtractor;

@ -1,97 +1,95 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import moment from 'moment' import moment from 'moment';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericDatePublishedExtractor from './extractor' import GenericDatePublishedExtractor from './extractor';
describe('GenericDatePublishedExtractor', () => { describe('GenericDatePublishedExtractor', () => {
describe('extract($, metaCache)', () => { describe('extract($, metaCache)', () => {
it('extracts datePublished from meta tags', () => { it('extracts datePublished from meta tags', () => {
const $ = cheerio.load(HTML.datePublishedMeta.test) const $ = cheerio.load(HTML.datePublishedMeta.test);
const metaCache = ["displaydate", "something-else"] const metaCache = ['displaydate', 'something-else'];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal( assert.equal(
result, result,
HTML.datePublishedMeta.result.toISOString() HTML.datePublishedMeta.result.toISOString()
) );
}) });
it('extracts datePublished from selectors', () => { it('extracts datePublished from selectors', () => {
const $ = cheerio.load(HTML.datePublishedSelectors.test) const $ = cheerio.load(HTML.datePublishedSelectors.test);
const metaCache = [] const metaCache = [];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal( assert.equal(
result, result,
HTML.datePublishedMeta.result.toISOString() HTML.datePublishedMeta.result.toISOString()
) );
}) });
it('extracts from url formatted /2012/08/01/etc', () => { it('extracts from url formatted /2012/08/01/etc', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2012/08/01/this-is-good' const url = 'https://example.com/2012/08/01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
new Date('2012/08/01').toISOString() new Date('2012/08/01').toISOString()
) );
}) });
it('extracts from url formatted /2020-01-01', () => { it('extracts from url formatted /2020-01-01', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2020-01-01/this-is-good' const url = 'https://example.com/2020-01-01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
moment(new Date('2020-01-01')).toISOString() moment(new Date('2020-01-01')).toISOString()
) );
}) });
it('extracts from url formatted /2020/jan/01', () => { it('extracts from url formatted /2020/jan/01', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const url = 'https://example.com/2020/jan/01/this-is-good' const url = 'https://example.com/2020/jan/01/this-is-good';
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url, metaCache } { $, url, metaCache }
) );
assert.equal( assert.equal(
result, result,
new Date('2020/jan/01').toISOString() new Date('2020/jan/01').toISOString()
) );
}) });
it('returns null if no date can be found', () => { it('returns null if no date can be found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const result = const result =
GenericDatePublishedExtractor.extract( GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache } { $, url: '', metaCache }
) );
assert.equal(result, null)
})
})
})
assert.equal(result, null);
});
});
});

@ -7,7 +7,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: new Date('1/1/2020 8:30 (EST)') result: new Date('1/1/2020 8:30 (EST)'),
}, },
datePublishedSelectors: { datePublishedSelectors: {
test: ` test: `
@ -19,8 +19,8 @@ const HTML = {
</head> </head>
</div> </div>
`, `,
result: new Date('1/1/2020 8:30 am (EST)') result: new Date('1/1/2020 8:30 am (EST)'),
}, },
} };
export default HTML export default HTML;

@ -1,27 +1,28 @@
import { // import {
DEK_META_TAGS, // DEK_META_TAGS,
DEK_SELECTORS, // DEK_SELECTORS,
DEK_URL_RES, // DEK_URL_RES,
} from './constants' // } from './constants';
import { cleanDek } from 'cleaners' // import { cleanDek } from 'cleaners';
import { // import {
extractFromMeta, // extractFromMeta,
extractFromSelectors, // extractFromSelectors,
} from 'utils/dom' // } from 'utils/dom';
// Currently there is only one selector for // Currently there is only one selector for
// deks. We should simply return null here // deks. We should simply return null here
// until we have a more robust generic option. // until we have a more robust generic option.
// Below is the original source for this, for reference. // Below is the original source for this, for reference.
const GenericDekExtractor = { const GenericDekExtractor = {
extract({ $, content, metaCache }) { // extract({ $, content, metaCache }) {
return null extract() {
} return null;
} },
};
export default GenericDekExtractor export default GenericDekExtractor;
// def extract_dek(self): // def extract_dek(self):
// # First, check to see if we have a matching meta tag that we can make // # First, check to see if we have a matching meta tag that we can make

@ -1,20 +1,18 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
// import HTML from './fixtures/html' // import HTML from './fixtures/html'
import GenericDekExtractor from './extractor' import GenericDekExtractor from './extractor';
describe('GenericDekExtractor', () => { describe('GenericDekExtractor', () => {
describe('extract({ $, metaCache })', () => { describe('extract({ $, metaCache })', () => {
it('returns null if no dek can be found', () => { it('returns null if no dek can be found', () => {
const $ = cheerio.load('<div></div>') const $ = cheerio.load('<div></div>');
const metaCache = [] const metaCache = [];
const result = const result =
GenericDekExtractor.extract({ $, metaCache }) GenericDekExtractor.extract({ $, metaCache });
assert.equal(result, null)
})
}) assert.equal(result, null);
}) });
});
});

@ -1,12 +1,12 @@
import cheerio from 'cheerio' import cheerio from 'cheerio';
import GenericContentExtractor from './content/extractor' import GenericContentExtractor from './content/extractor';
import GenericTitleExtractor from './title/extractor' import GenericTitleExtractor from './title/extractor';
import GenericAuthorExtractor from './author/extractor' import GenericAuthorExtractor from './author/extractor';
import GenericDatePublishedExtractor from './date-published/extractor' import GenericDatePublishedExtractor from './date-published/extractor';
import GenericDekExtractor from './dek/extractor' import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor' import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor' import GenericNextPageUrlExtractor from './next-page-url/extractor';
const GenericExtractor = { const GenericExtractor = {
// This extractor is the default for all domains // This extractor is the default for all domains
@ -19,32 +19,32 @@ const GenericExtractor = {
dek: GenericDekExtractor.extract, dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract, nextPageUrl: GenericNextPageUrlExtractor.extract,
extract: function(options) { extract(options) {
let { html } = options const { html } = options;
if (html) { if (html) {
const $ = cheerio.load(html) const $ = cheerio.load(html);
options.$ = $ options.$ = $;
} }
const title = this.title(options) const title = this.title(options);
const datePublished = this.datePublished(options) const datePublished = this.datePublished(options);
const author = this.author(options) const author = this.author(options);
const content = this.content({ ...options, title }) const content = this.content({ ...options, title });
const leadImageUrl = this.leadImageUrl(options) const leadImageUrl = this.leadImageUrl(options);
const dek = this.dek(options) const dek = this.dek(options);
const nextPageUrl = this.nextPageUrl(options) const nextPageUrl = this.nextPageUrl(options);
return { return {
title, title,
author, author,
datePublished: datePublished ? datePublished : null, datePublished: datePublished || null,
dek, dek,
leadImageUrl, leadImageUrl,
content, content,
nextPageUrl, nextPageUrl,
} };
} },
} };
export default GenericExtractor export default GenericExtractor;

@ -1,14 +1,12 @@
import assert from 'assert' import assert from 'assert';
import fs from 'fs' import fs from 'fs';
import { clean } from 'test-helpers' import GenericExtractor from './index';
import GenericExtractor from './index'
describe('GenericExtractor', () => { describe('GenericExtractor', () => {
describe('extract(opts)', () => { describe('extract(opts)', () => {
it("extracts this old LA Times article", () => { it('extracts this old LA Times article', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
const { const {
title, title,
@ -16,23 +14,23 @@ describe('GenericExtractor', () => {
datePublished, datePublished,
dek, dek,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] } { url: 'http://latimes.com', html, metaCache: [] }
) );
assert.equal(author, null) assert.equal(author, null);
assert.equal( assert.equal(
title, title,
'California appears poised to be first to ban power-guzzling big-screen TVs' 'California appears poised to be first to ban power-guzzling big-screen TVs'
) );
assert.equal( assert.equal(
datePublished, datePublished,
'2009-10-14T04:00:00.000Z' '2009-10-14T04:00:00.000Z'
) );
assert.equal(dek, null) assert.equal(dek, null);
}) });
it("extracts html and returns the article title", () => { it('extracts html and returns the article title', () => {
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8') const html = fs.readFileSync('../fixtures/wired.html', 'utf-8');
const { const {
author, author,
@ -40,18 +38,17 @@ describe('GenericExtractor', () => {
datePublished, datePublished,
dek, dek,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] } { url: 'http://wired.com', html, metaCache: [] }
) );
assert.equal(author, 'Eric Adams') assert.equal(author, 'Eric Adams');
assert.equal( assert.equal(
title, title,
'Airplane Tires Dont Explode on Landing Because They Are Pumped!' 'Airplane Tires Dont Explode on Landing Because They Are Pumped!'
) );
assert.equal(datePublished, null) assert.equal(datePublished, null);
assert.equal(dek, null) assert.equal(dek, null);
}) });
});
}) });
})

@ -5,11 +5,11 @@ export const LEAD_IMAGE_URL_META_TAGS = [
'og:image', 'og:image',
'twitter:image', 'twitter:image',
'image_src', 'image_src',
] ];
export const LEAD_IMAGE_URL_SELECTORS = [ export const LEAD_IMAGE_URL_SELECTORS = [
'link[rel=image_src]', 'link[rel=image_src]',
] ];
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [ export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload', 'upload',
@ -17,8 +17,8 @@ export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'large', 'large',
'photo', 'photo',
'wp-image', 'wp-image',
] ];
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer', 'spacer',
@ -46,8 +46,8 @@ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'promo', 'promo',
'ads', 'ads',
'wp-includes', 'wp-includes',
] ];
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const GIF_RE = /\.gif(\?.*)?$/i export const GIF_RE = /\.gif(\?.*)?$/i;
export const JPG_RE = /\.jpe?g(\?.*)?$/i export const JPG_RE = /\.jpe?g(\?.*)?$/i;

@ -1,14 +1,12 @@
import 'babel-polyfill' import 'babel-polyfill';
import { extractFromMeta } from 'utils/dom';
import { cleanImage } from 'cleaners';
import { import {
LEAD_IMAGE_URL_META_TAGS, LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS, LEAD_IMAGE_URL_SELECTORS,
} from './constants' } from './constants';
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
import { import {
scoreImageUrl, scoreImageUrl,
@ -17,9 +15,7 @@ import {
scoreBySibling, scoreBySibling,
scoreByDimensions, scoreByDimensions,
scoreByPosition, scoreByPosition,
} from './score-image' } from './score-image';
import { cleanImage } from 'cleaners'
// Given a resource, try to find the lead image URL from within // Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system // it. Like content and next page extraction, uses a scoring system
@ -31,86 +27,87 @@ import { cleanImage } from 'cleaners'
// * weird aspect ratio // * weird aspect ratio
const GenericLeadImageUrlExtractor = { const GenericLeadImageUrlExtractor = {
extract({ $, content, metaCache }) { extract({ $, content, metaCache }) {
let imageUrl, cleanUrl let cleanUrl;
// Check to see if we have a matching meta tag that we can make use of. // Check to see if we have a matching meta tag that we can make use of.
// Moving this higher because common practice is now to use large // Moving this higher because common practice is now to use large
// images on things like Open Graph or Twitter cards. // images on things like Open Graph or Twitter cards.
// images usually have for things like Open Graph. // images usually have for things like Open Graph.
imageUrl = const imageUrl =
extractFromMeta( extractFromMeta(
$, $,
LEAD_IMAGE_URL_META_TAGS, LEAD_IMAGE_URL_META_TAGS,
metaCache, metaCache,
false false
) );
if (imageUrl) { if (imageUrl) {
cleanUrl = cleanImage(imageUrl) cleanUrl = cleanImage(imageUrl);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
// Next, try to find the "best" image via the content. // Next, try to find the "best" image via the content.
// We'd rather not have to fetch each image and check dimensions, // We'd rather not have to fetch each image and check dimensions,
// so try to do some analysis and determine them instead. // so try to do some analysis and determine them instead.
const imgs = $('img', content).toArray() const imgs = $('img', content).toArray();
let imgScores = {} const imgScores = {};
imgs.forEach((img, index) => { imgs.forEach((img, index) => {
const $img = $(img) const $img = $(img);
const src = $img.attr('src') const src = $img.attr('src');
if (!src) return if (!src) return;
let score = scoreImageUrl(src) let score = scoreImageUrl(src);
score = score + scoreAttr($img) score += scoreAttr($img);
score = score + scoreByParents($img) score += scoreByParents($img);
score = score + scoreBySibling($img) score += scoreBySibling($img);
score = score + scoreByDimensions($img) score += scoreByDimensions($img);
score = score + scoreByPosition(imgs, index) score += scoreByPosition(imgs, index);
imgScores[src] = score imgScores[src] = score;
}) });
const [topUrl, topScore] = const [topUrl, topScore] =
Reflect.ownKeys(imgScores).reduce((acc, key) => Reflect.ownKeys(imgScores).reduce((acc, key) =>
imgScores[key] > acc[1] ? [key, imgScores[key]] : acc imgScores[key] > acc[1] ? [key, imgScores[key]] : acc
, [null, 0]) , [null, 0]);
if (topScore > 0) { if (topScore > 0) {
cleanUrl = cleanImage(topUrl) cleanUrl = cleanImage(topUrl);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
// If nothing else worked, check to see if there are any really // If nothing else worked, check to see if there are any really
// probable nodes in the doc, like <link rel="image_src" />. // probable nodes in the doc, like <link rel="image_src" />.
for (const selector of LEAD_IMAGE_URL_SELECTORS) { for (const selector of LEAD_IMAGE_URL_SELECTORS) {
const $node = $(selector).first() const $node = $(selector).first();
const src = $node.attr('src') const src = $node.attr('src');
if (src) { if (src) {
cleanUrl = cleanImage(src) cleanUrl = cleanImage(src);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
const href = $node.attr('href') const href = $node.attr('href');
if (href) { if (href) {
cleanUrl = cleanImage(href) cleanUrl = cleanImage(href);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
const value = $node.attr('value') const value = $node.attr('value');
if (value) { if (value) {
cleanUrl = cleanImage(value) cleanUrl = cleanImage(value);
if (cleanUrl) return cleanUrl if (cleanUrl) return cleanUrl;
} }
} }
return null;
}, },
} };
export default GenericLeadImageUrlExtractor export default GenericLeadImageUrlExtractor;
// def extract(self): // def extract(self):
// """ // """

@ -1,62 +1,62 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import HTML from './fixtures/html' import HTML from './fixtures/html';
import GenericLeadImageUrlExtractor from './extractor' import GenericLeadImageUrlExtractor from './extractor';
describe('GenericLeadImageUrlExtractor', () => { describe('GenericLeadImageUrlExtractor', () => {
describe('extract({ $, content, metaCache })', () => { describe('extract({ $, content, metaCache })', () => {
it('returns og:image first', () => { it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test) const $ = cheerio.load(HTML.og.test);
const content = $('*').first() const content = $('*').first();
const metaCache = ['og:image'] const metaCache = ['og:image'];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.og.result) assert.equal(result, HTML.og.result);
}) });
it('returns twitter:image', () => { it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test) const $ = cheerio.load(HTML.twitter.test);
const content = $('*').first() const content = $('*').first();
const metaCache = ['twitter:image'] const metaCache = ['twitter:image'];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.twitter.result) assert.equal(result, HTML.twitter.result);
}) });
it('finds images based on scoring', () => { it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test) const $ = cheerio.load(HTML.scoring.test);
const content = $('*').first() const content = $('*').first();
const metaCache = [] const metaCache = [];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.scoring.result) assert.equal(result, HTML.scoring.result);
}) });
it('returns image based on selectors', () => { it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test) const $ = cheerio.load(HTML.selectors.test);
const content = $('*').first() const content = $('*').first();
const metaCache = [] const metaCache = [];
const result = const result =
GenericLeadImageUrlExtractor.extract( GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache } { $, content, metaCache }
) );
assert.equal(result, HTML.selectors.result) assert.equal(result, HTML.selectors.result);
}) });
}) });
}) });

@ -7,7 +7,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: `http://example.com/lead.jpg` result: 'http://example.com/lead.jpg',
}, },
twitter: { twitter: {
test: ` test: `
@ -17,7 +17,7 @@ const HTML = {
</head> </head>
</html> </html>
`, `,
result: `http://example.com/lead.jpg` result: 'http://example.com/lead.jpg',
}, },
scoring: { scoring: {
test: ` test: `
@ -27,7 +27,7 @@ const HTML = {
<img src="http://example.com/upload/whateverpic.png" /> <img src="http://example.com/upload/whateverpic.png" />
</div> </div>
`, `,
result: `http://example.com/upload/goodpic.jpg` result: 'http://example.com/upload/goodpic.jpg',
}, },
selectors: { selectors: {
test: ` test: `
@ -35,8 +35,8 @@ const HTML = {
<link rel="image_src" href="http://example.com/upload/goodpic.jpg"> <link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div> </div>
`, `,
result: `http://example.com/upload/goodpic.jpg` result: 'http://example.com/upload/goodpic.jpg',
}, },
} };
export default HTML export default HTML;

@ -3,123 +3,123 @@ import {
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE, NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
GIF_RE, GIF_RE,
JPG_RE, JPG_RE,
} from './constants' } from './constants';
import { PHOTO_HINTS_RE } from '../content/scoring/constants' import { PHOTO_HINTS_RE } from '../content/scoring/constants';
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`;
}
// Scores image urls based on a variety of heuristics. // Scores image urls based on a variety of heuristics.
export function scoreImageUrl(url) { export function scoreImageUrl(url) {
url = url.trim() url = url.trim();
let score = 0 let score = 0;
if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score + 20 score += 20;
} }
if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score - 20 score -= 20;
} }
// TODO: We might want to consider removing this as // TODO: We might want to consider removing this as
// gifs are much more common/popular than they once were // gifs are much more common/popular than they once were
if (GIF_RE.test(url)) { if (GIF_RE.test(url)) {
score = score - 10 score -= 10;
} }
if (JPG_RE.test(url)) { if (JPG_RE.test(url)) {
score = score + 10 score += 10;
} }
// PNGs are neutral. // PNGs are neutral.
return score return score;
} }
// Alt attribute usually means non-presentational image. // Alt attribute usually means non-presentational image.
export function scoreAttr($img) { export function scoreAttr($img) {
if ($img.attr('alt')) { if ($img.attr('alt')) {
return 5 return 5;
} else {
return 0
} }
return 0;
} }
// Look through our parent and grandparent for figure-like // Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them // container elements, give a bonus if we find them
export function scoreByParents($img) { export function scoreByParents($img) {
let score = 0 let score = 0;
const $figParent = $img.parents('figure').first() const $figParent = $img.parents('figure').first();
if ($figParent.length === 1) { if ($figParent.length === 1) {
score = score + 25 score += 25;
} }
const $parent = $img.parent() const $parent = $img.parent();
let $gParent let $gParent;
if ($parent.length === 1) { if ($parent.length === 1) {
$gParent = $parent.parent() $gParent = $parent.parent();
} }
[$parent, $gParent].forEach($node => { [$parent, $gParent].forEach(($node) => {
if (PHOTO_HINTS_RE.test(getSig($node))) { if (PHOTO_HINTS_RE.test(getSig($node))) {
score = score + 15 score += 15;
} }
}) });
return score return score;
} }
// Look at our immediate sibling and see if it looks like it's a // Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so. // caption. Bonus if so.
export function scoreBySibling($img) { export function scoreBySibling($img) {
let score = 0 let score = 0;
const $sibling = $img.next() const $sibling = $img.next();
const sibling = $sibling.get(0) const sibling = $sibling.get(0);
if (sibling && sibling.tagName === 'figcaption') { if (sibling && sibling.tagName === 'figcaption') {
score = score + 25 score += 25;
} }
if (PHOTO_HINTS_RE.test(getSig($sibling))) { if (PHOTO_HINTS_RE.test(getSig($sibling))) {
score = score + 15 score += 15;
} }
return score return score;
} }
export function scoreByDimensions($img) { export function scoreByDimensions($img) {
let score = 0 let score = 0;
const width = parseFloat($img.attr('width')) const width = parseFloat($img.attr('width'));
const height = parseFloat($img.attr('height')) const height = parseFloat($img.attr('height'));
const src = $img.attr('src') const src = $img.attr('src');
// Penalty for skinny images // Penalty for skinny images
if (width && width <= 50) { if (width && width <= 50) {
score = score - 50 score -= 50;
} }
// Penalty for short images // Penalty for short images
if (height && height <= 50) { if (height && height <= 50) {
score = score - 50 score -= 50;
} }
if (width && height && !src.includes('sprite')) { if (width && height && !src.includes('sprite')) {
const area = width * height const area = width * height;
if (area < 5000) { // Smaller than 50 x 100 if (area < 5000) { // Smaller than 50 x 100
score = score - 100 score -= 100;
} else { } else {
score = score + Math.round(area/1000) score += Math.round(area / 1000);
} }
} }
return score return score;
} }
export function scoreByPosition($imgs, index) { export function scoreByPosition($imgs, index) {
return $imgs.length/2 - index return ($imgs.length / 2) - index;
}
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`
} }

@ -1,5 +1,5 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import { import {
scoreImageUrl, scoreImageUrl,
@ -8,61 +8,61 @@ import {
scoreBySibling, scoreBySibling,
scoreByDimensions, scoreByDimensions,
scoreByPosition, scoreByPosition,
} from './score-image' } from './score-image';
describe('scoreImageUrlUrl(url)', () => { describe('scoreImageUrlUrl(url)', () => {
it('gets 20 points for a positive lead img hint', () => { it('gets 20 points for a positive lead img hint', () => {
const url = 'http://example.com/upload/img.png' const url = 'http://example.com/upload/img.png';
assert.equal(scoreImageUrl(url), 20) assert.equal(scoreImageUrl(url), 20);
}) });
it('loses 20 points for a negative lead img hint', () => { it('loses 20 points for a negative lead img hint', () => {
const url = 'http://example.com/sprite/foo/bar.png' const url = 'http://example.com/sprite/foo/bar.png';
assert.equal(scoreImageUrl(url), -20) assert.equal(scoreImageUrl(url), -20);
}) });
it('loses 10 points for a gif', () => { it('loses 10 points for a gif', () => {
const url = 'http://example.com/foo/bar.gif' const url = 'http://example.com/foo/bar.gif';
assert.equal(scoreImageUrl(url), -10) assert.equal(scoreImageUrl(url), -10);
const url2 = 'http://example.com/foogif/bar' const url2 = 'http://example.com/foogif/bar';
assert.equal(scoreImageUrl(url2), 0) assert.equal(scoreImageUrl(url2), 0);
}) });
it('gains 10 points for a jpg', () => { it('gains 10 points for a jpg', () => {
const url = 'http://example.com/foo/bar.jpg' const url = 'http://example.com/foo/bar.jpg';
assert.equal(scoreImageUrl(url), 10) assert.equal(scoreImageUrl(url), 10);
const url2 = 'http://example.com/foo/bar.jpeg' const url2 = 'http://example.com/foo/bar.jpeg';
assert.equal(scoreImageUrl(url2), 10) assert.equal(scoreImageUrl(url2), 10);
const url3 = 'http://example.com/foojpg/bar' const url3 = 'http://example.com/foojpg/bar';
assert.equal(scoreImageUrl(url3), 0) assert.equal(scoreImageUrl(url3), 0);
const url4 = 'http://example.com/foo.jpg?bar=baz' const url4 = 'http://example.com/foo.jpg?bar=baz';
assert.equal(scoreImageUrl(url4), 10) assert.equal(scoreImageUrl(url4), 10);
}) });
}) });
describe('scoreAttr($img)', () => { describe('scoreAttr($img)', () => {
it('gets 5 points if the img node has an alt attribute', () => { it('gets 5 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img alt="Wow" /></div>') const $ = cheerio.load('<div><img alt="Wow" /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreAttr($img), 5) assert.equal(scoreAttr($img), 5);
}) });
it('gets 0 points if the img node has an alt attribute', () => { it('gets 0 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img /></div>') const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreAttr($img), 0) assert.equal(scoreAttr($img), 0);
}) });
}) });
describe('scoreByParents($img)', () => { describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => { it('gets 25 points if it has a figure parent', () => {
@ -74,18 +74,18 @@ describe('scoreByParents($img)', () => {
</div> </div>
</figure> </figure>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 25) assert.equal(scoreByParents($img), 25);
}) });
it('gets 0 points if the img has no figure parent', () => { it('gets 0 points if the img has no figure parent', () => {
const $ = cheerio.load('<div><img /></div>') const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 0) assert.equal(scoreByParents($img), 0);
}) });
it('gets 15 points if parent or gparent has photo hints', () => { it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -96,12 +96,12 @@ describe('scoreByParents($img)', () => {
</div> </div>
</div> </div>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByParents($img), 15) assert.equal(scoreByParents($img), 15);
}) });
}) });
describe('scoreBySibling($img)', () => { describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => { it('gets 25 points if its sibling is figcaption', () => {
@ -112,11 +112,11 @@ describe('scoreBySibling($img)', () => {
<figcaption>Wow</figcaption> <figcaption>Wow</figcaption>
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreBySibling($img), 25) assert.equal(scoreBySibling($img), 25);
}) });
it('gets 15 points if its sibling has photo hints', () => { it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -128,12 +128,12 @@ describe('scoreBySibling($img)', () => {
</div> </div>
</div> </div>
</div>` </div>`
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreBySibling($img), 15) assert.equal(scoreBySibling($img), 15);
}) });
}) });
describe('scoreByDimensions($img)', () => { describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => { it('penalizes skinny images', () => {
@ -143,11 +143,11 @@ describe('scoreByDimensions($img)', () => {
<img width="10" /> <img width="10" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50) assert.equal(scoreByDimensions($img), -50);
}) });
it('penalizes short images', () => { it('penalizes short images', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -156,11 +156,11 @@ describe('scoreByDimensions($img)', () => {
<img height="10" /> <img height="10" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50) assert.equal(scoreByDimensions($img), -50);
}) });
it('ignores sprites', () => { it('ignores sprites', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -169,11 +169,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/sprite/etc/foo.png" width="1000" height="1000" /> <img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), 0) assert.equal(scoreByDimensions($img), 0);
}) });
it('penalizes images with small areas', () => { it('penalizes images with small areas', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -182,11 +182,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="60" height="60" /> <img src="/etc/foo.png" width="60" height="60" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), -100) assert.equal(scoreByDimensions($img), -100);
}) });
it('prefers the largest images', () => { it('prefers the largest images', () => {
const $ = cheerio.load( const $ = cheerio.load(
@ -195,13 +195,12 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="1000" height="1000" /> <img src="/etc/foo.png" width="1000" height="1000" />
</div> </div>
` `
) );
const $img = $('img').first() const $img = $('img').first();
assert.equal(scoreByDimensions($img), 1000) assert.equal(scoreByDimensions($img), 1000);
}) });
});
})
describe('scoreByPosition($imgs, index)', () => { describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => { it('gives higher scores to images that come first', () => {
@ -216,10 +215,10 @@ describe('scoreByPosition($imgs, index)', () => {
<img width="10" /> <img width="10" />
</div> </div>
` `
) );
const $imgs = $('img') const $imgs = $('img');
assert.equal(scoreByPosition($imgs, 0), 3) assert.equal(scoreByPosition($imgs, 0), 3);
}) });
}) });

@ -1,25 +1,22 @@
import 'babel-polyfill' import 'babel-polyfill';
import URL from 'url' import URL from 'url';
import { import {
pageNumFromUrl,
articleBaseUrl, articleBaseUrl,
removeAnchor, removeAnchor,
} from 'utils/text' } from 'utils/text';
import scoreLinks from './scoring/score-links' import scoreLinks from './scoring/score-links';
// Looks for and returns next page url // Looks for and returns next page url
// for multi-page articles // for multi-page articles
const GenericNextPageUrlExtractor = { const GenericNextPageUrlExtractor = {
extract({ $, url, parsedUrl, previousUrls = [] }) { extract({ $, url, parsedUrl, previousUrls = [] }) {
parsedUrl = parsedUrl || URL.parse(url) parsedUrl = parsedUrl || URL.parse(url);
const currentPageNum = pageNumFromUrl(url) const articleUrl = removeAnchor(url);
const articleUrl = removeAnchor(url) const baseUrl = articleBaseUrl(url, parsedUrl);
const baseUrl = articleBaseUrl(url, parsedUrl)
const { host } = parsedUrl
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const scoredLinks = scoreLinks({ const scoredLinks = scoreLinks({
links, links,
@ -27,28 +24,28 @@ const GenericNextPageUrlExtractor = {
baseUrl, baseUrl,
parsedUrl, parsedUrl,
$, $,
previousUrls previousUrls,
}) });
// If no links were scored, return null // If no links were scored, return null
if (!scoredLinks) return null if (!scoredLinks) return null;
// now that we've scored all possible pages, // now that we've scored all possible pages,
// find the biggest one. // find the biggest one.
const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => { const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
const scoredLink = scoredLinks[link] const scoredLink = scoredLinks[link];
return scoredLink.score > acc.score ? scoredLink : acc return scoredLink.score > acc.score ? scoredLink : acc;
}, { score: -100 }) }, { score: -100 });
// If the score is less than 50, we're not confident enough to use it, // If the score is less than 50, we're not confident enough to use it,
// so we fail. // so we fail.
if (topPage.score >= 50) { if (topPage.score >= 50) {
return topPage.href return topPage.href;
} else {
return null
}
}
} }
return null;
},
};
export default GenericNextPageUrlExtractor export default GenericNextPageUrlExtractor;

@ -1,34 +1,34 @@
import assert from 'assert' import assert from 'assert';
import fs from 'fs' import fs from 'fs';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import GenericNextPageUrlExtractor from './extractor' import GenericNextPageUrlExtractor from './extractor';
describe('GenericNextPageUrlExtractor', () => { describe('GenericNextPageUrlExtractor', () => {
it('returns most likely next page url', () => { it('returns most likely next page url', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8') const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2' const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2';
const nextPage = GenericNextPageUrlExtractor.extract({ const nextPage = GenericNextPageUrlExtractor.extract({
$, $,
url url,
}) });
assert.equal(nextPage, next) assert.equal(nextPage, next);
}) });
it('returns null if there is no likely next page', () => { it('returns null if there is no likely next page', () => {
const html = `<div><p>HI</p></div>` const html = '<div><p>HI</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const url = 'http://example.com/foo/bar' const url = 'http://example.com/foo/bar';
const nextPage = GenericNextPageUrlExtractor.extract({ const nextPage = GenericNextPageUrlExtractor.extract({
$, $,
url url,
}) });
assert.equal(nextPage, null) assert.equal(nextPage, null);
}) });
}) });

@ -1,4 +1,4 @@
export const DIGIT_RE = /\d/ export const DIGIT_RE = /\d/;
// A list of words that, if found in link text or URLs, likely mean that // A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link. // this link is not a next page link.
@ -16,23 +16,23 @@ export const EXTRANEOUS_LINK_HINTS = [
'sign', 'sign',
'single', 'single',
'adx', 'adx',
'entry-unrelated' 'entry-unrelated',
] ];
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i') export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');
// Match any link text/classname/id that looks like it could mean the next // Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can // page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page. // mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i') export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i');
// Match any link text/classname/id that looks like it is an end link: things // Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc. // like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i') export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');
// Match any link text/classname/id that looks like it means the previous // Match any link text/classname/id that looks like it means the previous
// page. // page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i') export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');
// Match any phrase that looks like it could be page, or paging, or pagination // Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i') export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');

@ -1,27 +1,32 @@
import 'babel-polyfill' import 'babel-polyfill';
import URL from 'url' import URL from 'url';
import difflib from 'difflib'
import { range } from 'utils' import { isWordpress } from 'utils/dom';
import { isWordpress } from 'utils/dom'
import { import {
removeAnchor, removeAnchor,
pageNumFromUrl, pageNumFromUrl,
} from 'utils/text' } from 'utils/text';
import {
DIGIT_RE,
NEXT_LINK_TEXT_RE,
PREV_LINK_TEXT_RE,
EXTRANEOUS_LINK_HINTS_RE,
CAP_LINK_TEXT_RE,
PAGE_RE,
} from './constants'
import { import {
NEGATIVE_SCORE_RE, scoreSimilarity,
POSITIVE_SCORE_RE, scoreLinkText,
} from 'utils/dom/constants' scorePageInLink,
import { IS_DIGIT_RE } from 'utils/text/constants' scoreExtraneousLinks,
scoreByParents,
scorePrevLink,
shouldScore,
scoreBaseUrl,
scoreCapLinks,
scoreNextLinkText,
} from './utils';
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i');
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`;
}
export default function scoreLinks({ export default function scoreLinks({
links, links,
@ -29,11 +34,11 @@ export default function scoreLinks({
baseUrl, baseUrl,
parsedUrl, parsedUrl,
$, $,
previousUrls=[] previousUrls = [],
}) { }) {
parsedUrl = parsedUrl || URL.parse(articleUrl) parsedUrl = parsedUrl || URL.parse(articleUrl);
const baseRegex = makeBaseRegex(baseUrl) const baseRegex = makeBaseRegex(baseUrl);
const isWp = isWordpress($) const isWp = isWordpress($);
// Loop through all links, looking for hints that they may be next-page // Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or // links. Things like having "page" in their textContent, className or
@ -46,12 +51,12 @@ export default function scoreLinks({
// Remove any anchor data since we don't do a good job // Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do // standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash // some checking with and without a trailing slash
let href = removeAnchor(link.attribs.href) const href = removeAnchor(link.attribs.href);
const $link = $(link) const $link = $(link);
const linkText = $link.text() const linkText = $link.text();
if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) { if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
return possiblePages return possiblePages;
} }
// ## PASSED THE FIRST-PASS TESTS. Start scoring. ## // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
@ -60,242 +65,29 @@ export default function scoreLinks({
score: 0, score: 0,
linkText, linkText,
href, href,
} };
} else {
possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`
}
const possiblePage = possiblePages[href]
const linkData = makeSig($link, linkText)
const pageNum = pageNumFromUrl(href)
let score = scoreBaseUrl(href, baseRegex)
score = score + scoreNextLinkText(linkData)
score = score + scoreCapLinks(linkData)
score = score + scorePrevLink(linkData)
score = score + scoreByParents($link)
score = score + scoreExtraneousLinks(href)
score = score + scorePageInLink(pageNum, isWp)
score = score + scoreLinkText(linkText, pageNum)
score = score + scoreSimilarity(score, articleUrl, href)
possiblePage.score = score
return possiblePages
}, {})
return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages
}
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i')
}
export function scoreSimilarity(score, articleUrl, href) {
// Do this last and only if we have a real candidate, because it's
// potentially expensive computationally. Compare the link to this
// URL using difflib to get the % similarity of these URLs. On a
// sliding scale, subtract points from this link based on
// similarity.
if (score > 0) {
const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio()
// Subtract .1 from diff_percent when calculating modifier,
// which means that if it's less than 10% different, we give a
// bonus instead. Ex:
// 3% different = +17.5 points
// 10% different = 0 points
// 20% different = -25 points
const diffPercent = 1.0 - similarity
const diffModifier = -(250 * (diffPercent - 0.2))
return score + diffModifier
}
return 0
}
export function scoreLinkText(linkText, pageNum) {
// If the link text can be parsed as a number, give it a minor
// bonus, with a slight bias towards lower numbered pages. This is
// so that pages that might not have 'next' in their text can still
// get scored, and sorted properly by score.
let score = 0
if (IS_DIGIT_RE.test(linkText.trim())) {
const linkTextAsNum = parseInt(linkText)
// If it's the first page, we already got it on the first call.
// Give it a negative score. Otherwise, up to page 10, give a
// small bonus.
if (linkTextAsNum < 2) {
score = -30
} else { } else {
score = Math.max(0, 10 - linkTextAsNum) possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`;
}
// If it appears that the current page number is greater than
// this links page number, it's a very bad sign. Give it a big
// penalty.
if (pageNum && pageNum >= linkTextAsNum) {
score = score - 50
}
}
return score
}
export function scorePageInLink(pageNum, isWp) {
// page in the link = bonus. Intentionally ignore wordpress because
// their ?p=123 link style gets caught by this even though it means
// separate documents entirely.
if (pageNum && !isWp) {
return 50
}
return 0
}
export function scoreExtraneousLinks(href) {
// If the URL itself contains extraneous values, give a penalty.
if (EXTRANEOUS_LINK_HINTS_RE.test(href)) {
return -25
}
return 0
}
export function scoreByParents($link) {
// If a parent node contains paging-like classname or id, give a
// bonus. Additionally, if a parent_node contains bad content
// (like 'sponsor'), give a penalty.
let $parent = $link.parent()
let positiveMatch = false
let negativeMatch = false
let score = 0
Array.from(range(0, 4)).forEach((_) => {
if ($parent.length === 0) {
return
}
const parentData = makeSig($parent, ' ')
// If we have 'page' or 'paging' in our data, that's a good
// sign. Add a bonus.
if (!positiveMatch && PAGE_RE.test(parentData)) {
positiveMatch = true
score = score + 25
}
// If we have 'comment' or something in our data, and
// we don't have something like 'content' as well, that's
// a bad sign. Give a penalty.
if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
&& EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
if (!POSITIVE_SCORE_RE.test(parentData)) {
negativeMatch = true
score = score - 25
}
}
$parent = $parent.parent()
})
return score
}
export function scorePrevLink(linkData) {
// If the link has something like "previous", its definitely
// an old link, skip it.
if (PREV_LINK_TEXT_RE.test(linkData)) {
return -200
}
return 0
}
export function scoreCapLinks(linkData) {
// Cap links are links like "last", etc.
if (CAP_LINK_TEXT_RE.test(linkData)) {
// If we found a link like "last", but we've already seen that
// this link is also "next", it's fine. If it's not been
// previously marked as "next", then it's probably bad.
// Penalize.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return -65
}
}
return 0
}
export function scoreNextLinkText(linkData) {
// Things like "next", ">>", etc.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return 50
}
return 0
} }
export function scoreBaseUrl(href, baseRegex) { const possiblePage = possiblePages[href];
// If the baseUrl isn't part of this URL, penalize this const linkData = makeSig($link, linkText);
// link. It could still be the link, but the odds are lower. const pageNum = pageNumFromUrl(href);
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (!baseRegex.test(href)) {
return -25
}
return 0 let score = scoreBaseUrl(href, baseRegex);
} score += scoreNextLinkText(linkData);
score += scoreCapLinks(linkData);
score += scorePrevLink(linkData);
score += scoreByParents($link);
score += scoreExtraneousLinks(href);
score += scorePageInLink(pageNum, isWp);
score += scoreLinkText(linkText, pageNum);
score += scoreSimilarity(score, articleUrl, href);
export function shouldScore( possiblePage.score = score;
href,
articleUrl,
baseUrl,
parsedUrl,
linkText,
previousUrls
) {
// skip if we've already fetched this url
if(previousUrls.find((url) => href === url) !== undefined) {
return false
}
// If we've already parsed this URL, or the URL matches the base return possiblePages;
// URL, or is empty, skip it. }, {});
if (!href || href === articleUrl || href === baseUrl) {
return false
}
const { hostname } = parsedUrl return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages;
const { hostname: linkHost } = URL.parse(href)
// Domain mismatch.
if (linkHost !== hostname) {
return false
}
// If href doesn't contain a digit after removing the base URL,
// it's certainly not the next page.
const fragment = href.replace(baseUrl, '')
if (!DIGIT_RE.test(fragment)) {
return false
}
// This link has extraneous content (like "comment") in its link
// text, so we skip it.
if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
return false
}
// Next page link text is never long, skip if it is too long.
if (linkText.length > 25) {
return false
}
return true
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`
} }

@ -1,239 +1,42 @@
import assert from 'assert' import assert from 'assert';
import cheerio from 'cheerio' import cheerio from 'cheerio';
import fs from 'fs' import fs from 'fs';
import URL from 'url'
import scoreLinks from './score-links' import scoreLinks from './score-links';
import {
makeBaseRegex,
scoreBaseUrl,
scoreNextLinkText,
scoreCapLinks,
scorePrevLink,
scoreByParents,
scoreExtraneousLinks,
scorePageInLink,
scoreLinkText,
scoreSimilarity,
shouldScore,
} from './score-links'
describe('scoreLinks(links)', () => { describe('scoreLinks(links)', () => {
it('returns an object of scored links', () => { it('returns an object of scored links', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8') const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html) const $ = cheerio.load(html);
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({ const scoredPages = scoreLinks({
links, links,
articleUrl: url, articleUrl: url,
baseUrl: 'http://arstechnica.com', baseUrl: 'http://arstechnica.com',
$, $,
}) });
assert.equal(typeof scoredPages, 'object') assert.equal(typeof scoredPages, 'object');
}) });
it('returns null if no possible pages', () => { it('returns null if no possible pages', () => {
const html = `<div><p>Hello wow</p></div>` const html = '<div><p>Hello wow</p></div>';
const $ = cheerio.load(html) const $ = cheerio.load(html);
const links = $('a[href]').toArray() const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/' const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({ const scoredPages = scoreLinks({
links, links,
articleUrl: url, articleUrl: url,
baseUrl: 'http://arstechnica.com', baseUrl: 'http://arstechnica.com',
$, $,
}) });
assert.equal(scoredPages, null) assert.equal(scoredPages, null);
}) });
}) });
describe('scoreBaseUrl(href, baseRegex)', () => {
it('returns -25 if url does not contain the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://foo.com/foo/bar'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), -25)
})
it('returns 0 if url contains the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://example.com/foo/bar/bat'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), 0)
})
})
describe('scoreNextLinkText(linkData)', () => {
it('returns 50 if contains common next link text', () => {
const linkData = "foo bar Next page"
assert.equal(scoreNextLinkText(linkData), 50)
})
it('returns 0 if does not contain common next link text', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreNextLinkText(linkData), 0)
})
})
describe('scoreCapLinks(linkData)', () => {
it('returns -65 if cap link with next link text', () => {
const linkData = "foo next Last page"
assert.equal(scoreCapLinks(linkData), -65)
})
it('returns 0 if does not match a cap link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scorePrevLink(linkData)', () => {
it('returns -200 if link matches previous text', () => {
const linkData = "foo next previous page"
assert.equal(scorePrevLink(linkData), -200)
})
it('returns 0 if does not match a prev link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scoreByParents($link)', () => {
it('returns 25 if parent sig looks like a page', () => {
const html = `
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), 25)
})
it('returns -25 if parent sig looks like a comment', () => {
const html = `
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), -25)
})
})
describe('scoreExtraneousLinks(href)', () => {
it('returns -25 if link matches extraneous text', () => {
const url = "http://example.com/email-link"
assert.equal(scoreExtraneousLinks(url), -25)
})
it('returns 0 if does not match extraneous text', () => {
const url = "http://example.com/asdf"
assert.equal(scoreExtraneousLinks(url), 0)
})
})
describe('scorePageInLink(pageNum, isWp)', () => {
it('returns 50 if link contains a page num', () => {
assert.equal(scorePageInLink(1, false), 50)
})
it('returns 0 if link contains no page num', () => {
assert.equal(scorePageInLink(null, false), 0)
})
it('returns 0 if page is wordpress', () => {
assert.equal(scorePageInLink(10, true), 0)
})
})
describe('scoreLinkText(linkText)', () => {
it('returns 8 if link contains the num 2', () => {
assert.equal(scoreLinkText('2', 0), 8)
})
it('returns 5 if link contains the num 5', () => {
assert.equal(scoreLinkText('5', 0), 5)
})
it('returns -30 if link contains the number 1', () => {
assert.equal(scoreLinkText('1', 0), -30)
})
it('penalizes -50 if pageNum is >= link text as num', () => {
assert.equal(scoreLinkText('4', 5), -44)
})
})
describe('scoreSimilarity(score, articleUrl, href)', () => {
it('returns a similarity bonus based on current score', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 25
assert.equal(
Math.round(scoreSimilarity(score, articleUrl, href)),
66
)
})
it('returns 0 is current score <= 0', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 0
assert.equal(scoreSimilarity(score, articleUrl, href), 0)
})
})
describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
it('returns false if href has already been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar/2' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
false
)
})
it('returns true if href has not been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
true
)
})
})

@ -0,0 +1,10 @@
// Barrel module: re-exports each next-page-link scoring heuristic from
// its own file so callers can import them all from one path.
export { default as scoreSimilarity } from './score-similarity';
export { default as scoreLinkText } from './score-link-text';
export { default as scorePageInLink } from './score-page-in-link';
export { default as scoreExtraneousLinks } from './score-extraneous-links';
export { default as scoreByParents } from './score-by-parents';
export { default as scorePrevLink } from './score-prev-link';
export { default as shouldScore } from './should-score';
export { default as scoreBaseUrl } from './score-base-url';
export { default as scoreNextLinkText } from './score-next-link-text';
export { default as scoreCapLinks } from './score-cap-links';

@ -0,0 +1,11 @@
/**
 * Score a candidate link by whether it shares the article's base URL.
 *
 * A link outside the base URL could still be the next page, but the odds
 * are lower, so it is penalized. Example of a paginated URL that *does*
 * share its base:
 * http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
 *
 * @param {string} href candidate link URL
 * @param {RegExp} baseRegex regex matching the article's base URL
 * @returns {number} 0 when the base URL matches, -25 otherwise
 */
export default function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}

@ -0,0 +1,23 @@
import assert from 'assert';
import scoreBaseUrl from './score-base-url';
import { makeBaseRegex } from '../score-links';
// Unit tests for scoreBaseUrl: candidate links that don't share the
// article's base URL are penalized -25; matching links score 0.
describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    const badUrl = 'http://foo.com/foo/bar';
    const baseRegex = makeBaseRegex(baseUrl);

    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25);
  });

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    const badUrl = 'http://example.com/foo/bar/bat';
    const baseRegex = makeBaseRegex(baseUrl);

    assert.equal(scoreBaseUrl(badUrl, baseRegex), 0);
  });
});

@ -0,0 +1,52 @@
import { range } from 'utils';
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
PAGE_RE,
} from 'utils/dom/constants';
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
// Build a textual "signature" for a node from its class and id
// attributes, space-separated; a missing attribute becomes ''.
function makeSig($link) {
  const className = $link.attr('class') || '';
  const id = $link.attr('id') || '';
  return `${className} ${id}`;
}
/**
 * Score a link by the class/id signatures of up to four of its ancestors.
 *
 * A parent whose class/id looks paging-like (PAGE_RE) earns a +25 bonus;
 * a parent that looks like comment/extraneous chrome (NEGATIVE_SCORE_RE
 * and EXTRANEOUS_LINK_HINTS_RE) without anything content-like
 * (POSITIVE_SCORE_RE) earns a -25 penalty. Each adjustment is applied at
 * most once, whichever ancestor triggers it first.
 *
 * @param {Object} $link cheerio-wrapped anchor element
 * @returns {number} summed bonus/penalty in {-25, 0, 25}
 */
export default function scoreByParents($link) {
  let $parent = $link.parent();
  let positiveMatch = false;
  let negativeMatch = false;
  let score = 0;

  Array.from(range(0, 4)).forEach(() => {
    // Ran out of ancestors; nothing further to inspect.
    if ($parent.length === 0) {
      return;
    }

    // NOTE: the original passed a stray second argument (' ') here;
    // makeSig takes a single parameter, so it was dead weight.
    const parentData = makeSig($parent);

    // 'page'/'paging' in the signature is a good sign. Add the bonus once.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }

    // 'comment' or similar, without something like 'content' alongside,
    // is a bad sign. Apply the penalty once.
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
      && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true;
        score -= 25;
      }
    }

    $parent = $parent.parent();
  });

  return score;
}

@ -0,0 +1,35 @@
import assert from 'assert';
import cheerio from 'cheerio';
import scoreByParents from './score-by-parents';
// Unit tests for scoreByParents: paging-like ancestor class names earn a
// bonus; comment-like ancestors earn a penalty.
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
      <div>
        <div class="next-page">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();

    assert.equal(scoreByParents($link), 25);
  });

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
      <div>
        <div class="comment">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();

    assert.equal(scoreByParents($link), -25);
  });
});

@ -0,0 +1,19 @@
import {
NEXT_LINK_TEXT_RE,
CAP_LINK_TEXT_RE,
} from '../constants';
/**
 * Score "cap" links — terminal pagination text like "last".
 *
 * A link that matches both the cap text and the "next" text patterns is
 * penalized -65; any other link scores 0.
 *
 * @param {string} linkData link signature text
 * @returns {number} -65 or 0
 */
export default function scoreCapLinks(linkData) {
  if (!CAP_LINK_TEXT_RE.test(linkData)) {
    return 0;
  }

  // Cap text that also reads as "next" is penalized.
  return NEXT_LINK_TEXT_RE.test(linkData) ? -65 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreCapLinks from './score-cap-links';
// Unit tests for scoreCapLinks: "last"-style text combined with
// "next"-style text is penalized; anything else scores 0.
describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    const linkData = 'foo next Last page';

    assert.equal(scoreCapLinks(linkData), -65);
  });

  it('returns 0 if does not match a cap link', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scoreCapLinks(linkData), 0);
  });
});

@ -0,0 +1,10 @@
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
/**
 * Penalize URLs whose text itself contains extraneous hints
 * (matched by EXTRANEOUS_LINK_HINTS_RE).
 *
 * @param {string} href candidate link URL
 * @returns {number} -25 when extraneous, 0 otherwise
 */
export default function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreExtraneousLinks from './score-extraneous-links';
// Unit tests for scoreExtraneousLinks: URLs containing extraneous hints
// (e.g. "email") are penalized -25; others score 0.
describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = 'http://example.com/email-link';

    assert.equal(scoreExtraneousLinks(url), -25);
  });

  it('returns 0 if does not match extraneous text', () => {
    const url = 'http://example.com/asdf';

    assert.equal(scoreExtraneousLinks(url), 0);
  });
});

@ -0,0 +1,30 @@
import { IS_DIGIT_RE } from 'utils/text/constants';
/**
 * Score purely numeric link text with a small bonus biased toward low
 * page numbers, so numbered pagination links still score and sort
 * sensibly even when their text lacks "next".
 *
 * @param {string} linkText visible text of the link
 * @param {?number} pageNum page number already extracted from the URL,
 *   if any
 * @returns {number} 0 for non-numeric text; otherwise the bonus/penalty
 */
export default function scoreLinkText(linkText, pageNum) {
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkTextAsNum = parseInt(linkText, 10);
  let score;

  if (linkTextAsNum < 2) {
    // Page 1 was already fetched on the first call; score it negatively.
    score = -30;
  } else {
    // Small bonus that shrinks to zero by page 10.
    score = Math.max(0, 10 - linkTextAsNum);
  }

  // If the current page number is at or beyond this link's number, it's
  // almost certainly not the next page. Penalize heavily.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }

  return score;
}

@ -0,0 +1,22 @@
import assert from 'assert';
import scoreLinkText from './score-link-text';
// Unit tests for scoreLinkText: numeric text earns a bonus that shrinks
// with the page number; page 1 and already-visited pages are penalized.
describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8);
  });

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5);
  });

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30);
  });

  it('penalizes -50 if pageNum is >= link text as num', () => {
    // Base bonus 10 - 4 = 6, minus the 50-point penalty = -44.
    assert.equal(scoreLinkText('4', 5), -44);
  });
});

@ -0,0 +1,10 @@
import { NEXT_LINK_TEXT_RE } from '../constants';
/**
 * Award a large bonus to link text like "next", ">>", etc.
 * (matched by NEXT_LINK_TEXT_RE).
 *
 * @param {string} linkData link signature text
 * @returns {number} 50 on a match, 0 otherwise
 */
export default function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreNextLinkText from './score-next-link-text';
// Unit tests for scoreNextLinkText: "next"-style text earns +50;
// anything else scores 0.
describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = 'foo bar Next page';

    assert.equal(scoreNextLinkText(linkData), 50);
  });

  it('returns 0 if does not contain common next link text', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scoreNextLinkText(linkData), 0);
  });
});

@ -0,0 +1,10 @@
/**
 * Bonus for a page number appearing in the link itself.
 *
 * WordPress is intentionally excluded: its ?p=123 link style matches the
 * page-number pattern but identifies entirely separate documents.
 *
 * @param {?number} pageNum page number extracted from the link, if any
 * @param {boolean} isWp whether the site appears to be WordPress
 * @returns {number} 50 when a page number is present (non-WP), else 0
 */
export default function scorePageInLink(pageNum, isWp) {
  if (isWp || !pageNum) {
    return 0;
  }
  return 50;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePageInLink from './score-page-in-link';
// Unit tests for scorePageInLink: a page number in the link earns +50,
// except on WordPress sites where ?p=N means a different document.
describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50);
  });

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0);
  });

  it('returns 0 if page is wordpress', () => {
    assert.equal(scorePageInLink(10, true), 0);
  });
});

@ -0,0 +1,11 @@
import { PREV_LINK_TEXT_RE } from '../constants';
/**
 * Heavily penalize links whose text reads like "previous" — they point
 * at an older page, not the next one.
 *
 * @param {string} linkData link signature text
 * @returns {number} -200 on a match of PREV_LINK_TEXT_RE, 0 otherwise
 */
export default function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePrevLink from './score-prev-link';
// Unit tests for scorePrevLink: "previous"-style text is penalized -200;
// anything else scores 0.
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = 'foo next previous page';

    assert.equal(scorePrevLink(linkData), -200);
  });

  it('returns 0 if does not match a prev link', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scorePrevLink(linkData), 0);
  });
});

@ -0,0 +1,23 @@
import difflib from 'difflib';
/**
 * Adjust a candidate's score by its URL's similarity to the article URL.
 *
 * Runs only for real candidates (score > 0) because the diff is
 * comparatively expensive. The modifier pivots at 20% difference:
 * URLs less than 20% different earn a bonus, more different URLs a
 * penalty. Ex:
 *   10% different = +25 points
 *   20% different =   0 points
 *   30% different = -25 points
 *
 * @param {number} score the link's score so far
 * @param {string} articleUrl URL of the article being paginated
 * @param {string} href candidate link URL
 * @returns {number} adjusted score, or 0 when score <= 0
 */
export default function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
  const diffPercent = 1.0 - similarity;
  const diffModifier = -(250 * (diffPercent - 0.2));

  return score + diffModifier;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save