chore: refactored and linted

pull/1/head
Adam Pash 8 years ago
parent 9906bd36a4
commit 7e2a34945f

@ -0,0 +1 @@
**/fixtures/*

@ -0,0 +1,39 @@
// Use this file as a starting point for your project's .eslintrc.
// Copy this file, and add rule overrides as needed.
{
"parser": "babel-eslint",
"extends": "airbnb",
"plugins": [
"babel"
],
"globals": {
/* mocha */
"describe",
"it"
},
"rules": {
"no-param-reassign": 0,
/* TODO fix this; this should work w/import/resolver below, but doesn't */
"import/no-extraneous-dependencies": 0,
"import/no-unresolved": 0,
"no-control-regex": 0,
"import/prefer-default-export": 0,
"generator-star-spacing": 0,
"babel/generator-star-spacing": 0,
"func-names": 0,
"no-useless-escape": 0,
"no-confusing-arrow": 0,
},
"settings": {
"import/resolver": {
"babel-module": {
"extensions": [".js"]
}
}
},
"parserOptions":{
"ecmaFeatures": {
"experimentalObjectRestSpread": true
}
}
}

@ -5,14 +5,17 @@
"main": "index.js",
"scripts": {
"start": "node ./build",
"build": "rollup -c",
"lint": "eslint src/**",
"build": "eslint src/** && rollup -c",
"test": "./test-runner"
},
"author": "",
"license": "ISC",
"devDependencies": {
"babel-eslint": "^6.1.2",
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0",
"babel-plugin-module-resolver": "^2.2.0",
"babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0",
@ -21,6 +24,14 @@
"babel-preset-es2015-rollup": "^1.2.0",
"babel-register": "^6.11.6",
"babelrc-rollup": "^3.0.0",
"eslint": "^3.5.0",
"eslint-config-airbnb": "^11.1.0",
"eslint-import-resolver-babel-module": "^2.0.1",
"eslint-plugin-async": "^0.1.1",
"eslint-plugin-babel": "^3.3.0",
"eslint-plugin-import": "^1.15.0",
"eslint-plugin-jsx-a11y": "^2.2.2",
"eslint-plugin-react": "^6.2.1",
"mocha": "^3.0.2",
"rollup": "^0.34.13",
"rollup-plugin-babel": "^2.6.1",

@ -0,0 +1,21 @@
#!/usr/bin/env fish
# Scaffold an implementation file, a test file, and an index re-export
# for a new util under next-page-url/scoring/utils.
# Usage: generate-util <file-basename> <exported-function-name>
set file $argv[1]
set function $argv[2]
# Single source of truth for the (long) destination directory.
set utils src/extractors/generic/next-page-url/scoring/utils
touch $utils/index.js
touch $utils/$file.js
touch $utils/$file.test.js
echo "import assert from 'assert';" > $utils/$file.test.js
echo "" >> $utils/$file.test.js
echo "import $function from './$file';" >> $utils/$file.test.js
echo "" >> $utils/$file.test.js
echo "export { default as $function } from './$file'" >> $utils/index.js
# Remind the author of the manual follow-up steps.
echo "Now make it a default export"
echo "Move it to its file"
echo "Move its tests to its test file"
echo "import in score-links"
echo "Test it."

@ -1,7 +1,7 @@
import { CLEAN_AUTHOR_RE } from './constants'
import { CLEAN_AUTHOR_RE } from './constants';
// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
export default function cleanAuthor(author) {
  // CLEAN_AUTHOR_RE captures the name(s) in group 2; keep only that,
  // then trim surrounding whitespace/line breaks.
  return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
}

@ -1,21 +1,21 @@
import assert from 'assert'
import assert from 'assert';
import cleanAuthor from './author'
import cleanAuthor from './author';
describe('cleanAuthor(author)', () => {
  it('removes the By from an author string', () => {
    const author = cleanAuthor('By Bob Dylan');

    assert.equal(author, 'Bob Dylan');
  });

  it('trims trailing whitespace and line breaks', () => {
    const text = `
written by
Bob Dylan
`;
    const author = cleanAuthor(text);

    assert.equal(author, 'Bob Dylan');
  });
});

@ -1,9 +1,9 @@
// CLEAN AUTHOR CONSTANTS
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
]
];
// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
@ -23,18 +23,36 @@ export const DEK_META_TAGS = [
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
]
];
// CLEAN DATE PUBLISHED CONSTANTS
// Strips a leading "published:" label, capturing the rest in group 1.
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
// Inserts a space between a time and a trailing am/pm via '$1 $2 $3'.
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
// Matches the dotted part of "a.m."/"p.m." so it can collapse to "am"/"pm".
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
// Month abbreviations used to pull month tokens out of a date string.
const months = [
  'jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec',
];
const allMonths = months.join('|');
// Time of day like "10:59" or "8:30 a.m.". The meridian dots must be
// escaped (\\.) so they match literal periods — an unescaped `.` in a
// string-built RegExp matches any character (e.g. "8:30axm").
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap]\\.?m\\.?)?';
// Numeric dates like "1/1/2020" or "15-04-16".
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';

// Tokenizes a date string into times, numeric dates, bare number runs,
// and month names, in that order of preference.
export const SPLIT_DATE_STRING =
  new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');
// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;
export const DOMAIN_ENDINGS_RE =
new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g')
new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g');

@ -8,54 +8,52 @@ import {
rewriteTopLevel,
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom'
import { convertNodeTo } from 'utils/dom'
} from 'utils/dom';
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(
  article,
  {
    $,
    cleanConditionally = true,
    title = '',
    url = '',
  }
) {
  // Rewrite the tag name to div if it's a top level node like body or
  // html to avoid later complications with multiple body tags.
  rewriteTopLevel(article, $);

  // Drop small images and spacer images
  cleanImages(article, $);

  // Drop certain tags like <title>, etc
  // This is -mostly- for cleanliness, not security.
  stripJunkTags(article, $);

  // H1 tags are typically the article title, which should be extracted
  // by the title extractor instead. If there's less than 3 of them (<3),
  // strip them. Otherwise, turn 'em into H2s.
  cleanHOnes(article, $);

  // Clean headers
  cleanHeaders(article, $, title);

  // Make links absolute
  makeLinksAbsolute(article, $, url);

  // Remove style or align attributes
  cleanAttributes(article);

  // We used to clean UL's and OL's here, but it was leading to
  // too many in-article lists being removed. Consider a better
  // way to detect menus particularly and remove them.
  cleanTags(article, $, cleanConditionally);

  // Remove empty paragraph nodes
  removeEmpty(article, $);

  return article;
}
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers:

@ -1,32 +1,32 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
import extractCleanNode from './content'
import extractBestNode from 'extractors/generic/content/extract-best-node'
import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractCleanNode from './content';
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
let $ = cheerio.load(html)
it('cleans cruft out of a DOM node', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html);
const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
cleanConditionally: true,
}
stripUnlikelyCandidates: true,
weightNodes: true,
cleanConditionally: true,
};
const bestNode = extractBestNode($, opts)
let result = $.html(bestNode)
// console.log(result)
// console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts })
result = $.html(cleanNode)
// console.log(result.length)
// console.log(result)
// console.log(bestNode.html())
const bestNode = extractBestNode($, opts);
// let result = $.html(bestNode);
// // console.log(result)
// // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts });
// result = $.html(cleanNode);
// // console.log(result.length)
// // console.log(result)
// // console.log(bestNode.html())
assert.equal($(bestNode).text().length, 2687)
})
})
assert.equal($(cleanNode).text().length, 2687);
});
});

@ -1,4 +1,4 @@
import moment from 'moment'
import moment from 'moment';
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
@ -7,27 +7,27 @@ import {
CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING,
TIME_MERIDIAN_SPACE_RE,
TIME_MERIDIAN_DOTS_RE
} from './constants'
TIME_MERIDIAN_DOTS_RE,
} from './constants';
// Normalize a messy date string: keep only date-like tokens, collapse
// "a.m."/"p.m." dots, and separate the time from its meridian.
export function cleanDateString(dateString) {
  return (dateString.match(SPLIT_DATE_STRING) || [])
    .join(' ')
    .replace(TIME_MERIDIAN_DOTS_RE, 'm')
    .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
    .replace(CLEAN_DATE_STRING_RE, '$1')
    .trim();
}

// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
export default function cleanDatePublished(dateString) {
  let date = moment(new Date(dateString));

  if (!date.isValid()) {
    // The raw string didn't parse; retry after stripping non-date cruft.
    dateString = cleanDateString(dateString);
    date = moment(new Date(dateString));
  }

  return date.isValid() ? date.toISOString() : null;
}

@ -1,67 +1,62 @@
import assert from 'assert'
import assert from 'assert';
import {
default as cleanDatePublished,
cleanDateString,
} from './date-published'
} from './date-published';
describe('cleanDatePublished(dateString)', () => {
it('returns a date object', () => {
const datePublished = cleanDatePublished('published: 1/1/2020')
const datePublished = cleanDatePublished('published: 1/1/2020');
assert.equal(
datePublished,
new Date('1/1/2020').toISOString()
)
})
);
});
it('returns null if date is invalid', () => {
const datePublished = cleanDatePublished('blargh')
const datePublished = cleanDatePublished('blargh');
assert.equal(datePublished, null)
})
})
assert.equal(datePublished, null);
});
});
describe('cleanDateString(dateString)', () => {
it('removes "published" text from an datePublished string', () => {
const datePublished = cleanDateString('published: 1/1/2020')
const datePublished = cleanDateString('published: 1/1/2020');
assert.equal(datePublished, '1/1/2020')
})
assert.equal(datePublished, '1/1/2020');
});
it('trims whitespace', () => {
const datePublished = cleanDateString(' 1/1/2020 ')
const datePublished = cleanDateString(' 1/1/2020 ');
assert.equal(datePublished, '1/1/2020')
})
assert.equal(datePublished, '1/1/2020');
});
it('puts a space b/w a time and am/pm', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const date1 = cleanDateString('1/1/2020 8:30am')
assert.equal(date1, '1/1/2020 8:30 am')
const date1 = cleanDateString('1/1/2020 8:30am');
assert.equal(date1, '1/1/2020 8:30 am');
const date2 = cleanDateString('8:30PM 1/1/2020')
assert.equal(date2, '8:30 PM 1/1/2020')
})
const date2 = cleanDateString('8:30PM 1/1/2020');
assert.equal(date2, '8:30 PM 1/1/2020');
});
it('cleans the dots from a.m. or p.m.', () => {
// The JS date parser is forgiving, but
// it needs a.m./p.m. without dots
const date1 = cleanDateString('1/1/2020 8:30 a.m.')
assert.equal(date1, '1/1/2020 8:30 am')
})
const date1 = cleanDateString('1/1/2020 8:30 a.m.');
assert.equal(date1, '1/1/2020 8:30 am');
});
it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.')
assert.equal(date1, '15 Apr 2016 10:59')
const date2 = cleanDateString('8:30PM 1/1/2020')
assert.equal(date2, '8:30 PM 1/1/2020')
})
})
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
assert.equal(date1, '15 Apr 2016 10:59');
});
});

@ -1,17 +1,18 @@
import { TEXT_LINK_RE } from './constants'
import { stripTags } from 'utils/dom'
import { stripTags } from 'utils/dom';
import { TEXT_LINK_RE } from './constants';
// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.
export default function cleanDek(dek, { $ }) {
  // Sanity check that we didn't get too short or long of a dek.
  if (dek.length > 1000 || dek.length < 5) return null;

  const dekText = stripTags(dek, $);

  // Plain text links shouldn't exist in the dek. If we have some, it's
  // not a good dek - bail.
  if (TEXT_LINK_RE.test(dekText)) return null;

  return dekText.trim();
}

@ -1,52 +1,50 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import {
default as cleanDek,
cleanDekString,
} from './dek'
import cleanDek from './dek';
describe('cleanDek(dekString, { $ })', () => {
it('returns null if the dek is < 5 chars', () => {
const $ = cheerio.load('<div></div>')
assert.equal(cleanDek('Hi', { $ }), null)
})
const $ = cheerio.load('<div></div>');
assert.equal(cleanDek('Hi', { $ }), null);
});
it('returns null if the dek is > 1000 chars', () => {
const $ = cheerio.load('<div></div>')
const $ = cheerio.load('<div></div>');
const longDek =
// generate a string that is 1,280 chars
[0,1,2,3,4,5,6].reduce((acc, i) =>
acc += acc, '0123456789'
)
assert.equal(cleanDek(longDek, { $ }), null)
})
[0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
acc += acc;
return acc;
}, '0123456789');
assert.equal(cleanDek(longDek, { $ }), null);
});
it('strip html tags from the dek', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This is a <em>very</em> important dek.'
const $ = cheerio.load('<div></div>');
const dek = 'This is a <em>very</em> important dek.';
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.')
})
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.');
});
it('returns null if dek contains plain text link', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This has this link http://example.com/foo/bar'
const $ = cheerio.load('<div></div>');
const dek = 'This has this link http://example.com/foo/bar';
assert.equal(cleanDek(dek, { $ }), null)
})
assert.equal(cleanDek(dek, { $ }), null);
});
it('returns a normal dek as is', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This is the dek'
const $ = cheerio.load('<div></div>');
const dek = 'This is the dek';
assert.equal(cleanDek(dek, { $ }), dek)
})
assert.equal(cleanDek(dek, { $ }), dek);
});
it('cleans extra whitespace', () => {
const $ = cheerio.load('<div></div>')
const dek = ' This is the dek '
const $ = cheerio.load('<div></div>');
const dek = ' This is the dek ';
assert.equal(cleanDek(dek, { $ }), 'This is the dek')
})
})
assert.equal(cleanDek(dek, { $ }), 'This is the dek');
});
});

@ -1,5 +1,5 @@
const HTML = {
docWithH1: `<div><h1>This Is the Real Title</h1></div>`,
docWithH1: '<div><h1>This Is the Real Title</h1></div>',
docWith2H1s: `
<div>
<h1>This Is the Real Title</h1>
@ -7,9 +7,9 @@ const HTML = {
</div>
`,
docWithTagsInH1: {
before: `<div><h1>This Is the <em>Real</em> Title</h1></div>`,
after: `This Is the Real Title`
before: '<div><h1>This Is the <em>Real</em> Title</h1></div>',
after: 'This Is the Real Title',
},
}
};
export default HTML
export default HTML;

@ -1,9 +1,9 @@
import cleanAuthor from './author'
import cleanImage from './lead-image-url'
import cleanDek from './dek'
import cleanDatePublished from './date-published'
import cleanContent from './content'
import cleanTitle from './title'
import cleanAuthor from './author';
import cleanImage from './lead-image-url';
import cleanDek from './dek';
import cleanDatePublished from './date-published';
import cleanContent from './content';
import cleanTitle from './title';
const Cleaners = {
author: cleanAuthor,
@ -12,15 +12,15 @@ const Cleaners = {
datePublished: cleanDatePublished,
content: cleanContent,
title: cleanTitle,
}
};
export default Cleaners
export default Cleaners;
export { cleanAuthor }
export { cleanImage }
export { cleanDek }
export { cleanDatePublished }
export { cleanContent }
export { cleanTitle }
export { default as resolveSplitTitle } from './resolve-split-title'
export { cleanAuthor };
export { cleanImage };
export { cleanDek };
export { cleanDatePublished };
export { cleanContent };
export { cleanTitle };
export { default as resolveSplitTitle } from './resolve-split-title';

@ -1,10 +1,10 @@
import validUrl from 'valid-url'
import validUrl from 'valid-url';
// Validate a candidate lead-image URL; return the trimmed URL when it
// parses as a valid web URI, otherwise null.
export default function clean(leadImageUrl) {
  leadImageUrl = leadImageUrl.trim();

  if (validUrl.isWebUri(leadImageUrl)) {
    return leadImageUrl;
  }

  return null;
}

@ -1,20 +1,20 @@
import assert from 'assert'
import assert from 'assert';
import clean from './lead-image-url'
import clean from './lead-image-url';
describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => {
const url = 'https://example.com'
assert.equal(clean(url), url)
})
const url = 'https://example.com';
assert.equal(clean(url), url);
});
it('returns null if the url is not valid', () => {
const url = 'this is not a valid url'
assert.equal(clean(url), null)
})
const url = 'this is not a valid url';
assert.equal(clean(url), null);
});
it('trims whitespace', () => {
const url = ' https://example.com/foo/bar.jpg'
assert.equal(clean(url), url.trim())
})
})
const url = ' https://example.com/foo/bar.jpg';
assert.equal(clean(url), url.trim());
});
});

@ -1,34 +1,11 @@
import URL from 'url'
import 'babel-polyfill'
import wuzzy from 'wuzzy'
import URL from 'url';
import 'babel-polyfill';
import wuzzy from 'wuzzy';
import {
TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE,
} from './constants'
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url='') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
title = title
let splitTitle = title.split(TITLE_SPLITTERS_RE)
if (splitTitle.length === 1) {
return title
}
let newTitle = extractBreadcrumbTitle(splitTitle, title)
if (newTitle) return newTitle
newTitle = cleanDomainFromTitle(splitTitle, url)
if (newTitle) return newTitle
// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
return title
}
} from './constants';
function extractBreadcrumbTitle(splitTitle, text) {
// This must be a very breadcrumbed title, like:
@ -38,40 +15,40 @@ function extractBreadcrumbTitle(splitTitle, text) {
// Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out
// the title.
const termCounts = splitTitle.reduce((acc, text) => {
acc[text] = acc[text] ? acc[text] + 1 : 1
return acc
}, {})
const termCounts = splitTitle.reduce((acc, titleText) => {
acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
return acc;
}, {});
const [maxTerm, termCount] =
Reflect.ownKeys(termCounts)
.reduce((acc, key) => {
if (acc[1] < termCounts[key]) {
return [key, termCounts[key]]
} else {
return acc
return [key, termCounts[key]];
}
}, [0, 0])
return acc;
}, [0, 0]);
// We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> "
// will match, but nothing longer than that.
if (termCount >= 2 && maxTerm.length <= 4) {
splitTitle = text.split(maxTerm)
splitTitle = text.split(maxTerm);
}
const splitEnds = [splitTitle[0], splitTitle.slice(-1)]
const longestEnd = splitEnds.reduce((acc, end) => {
return acc.length > end.length ? acc : end
}, '')
const splitEnds = [splitTitle[0], splitTitle.slice(-1)];
const longestEnd = splitEnds.reduce((acc, end) => acc.length > end.length ? acc : end, '');
if (longestEnd.length > 10) {
return longestEnd
} else {
return text
return longestEnd;
}
return text;
}
return null;
}
function cleanDomainFromTitle(splitTitle, url) {
@ -81,20 +58,43 @@ function cleanDomainFromTitle(splitTitle, url) {
//
// Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right.
const { host } = URL.parse(url)
const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '')
const { host } = URL.parse(url);
const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
const startSlug = splitTitle[0].toLowerCase().replace(' ', '')
const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain)
const startSlug = splitTitle[0].toLowerCase().replace(' ', '');
const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
if (startSlugRatio > .4 && startSlug.length > 5) {
return splitTitle.slice(2).join('')
if (startSlugRatio > 0.4 && startSlug.length > 5) {
return splitTitle.slice(2).join('');
}
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
if (endSlugRatio > .4 && endSlug.length >= 5) {
return splitTitle.slice(0, -2).join('')
if (endSlugRatio > 0.4 && endSlug.length >= 5) {
return splitTitle.slice(0, -2).join('');
}
return null;
}
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url = '') {
  // Split while preserving the splitters themselves, like:
  // ['The New New York', ' - ', 'The Washington Post']
  const segments = title.split(TITLE_SPLITTERS_RE);

  // A single segment means there were no separators; nothing to resolve.
  if (segments.length < 2) return title;

  // Prefer a breadcrumb-derived title, then a domain-stripped one.
  // If neither heuristic fires, the title is probably legit as-is.
  return extractBreadcrumbTitle(segments, title) ||
         cleanDomainFromTitle(segments, url) ||
         title;
}

@ -1,32 +1,31 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import { resolveSplitTitle } from './index'
import { resolveSplitTitle } from './index';
describe('resolveSplitTitle(text)', () => {
it('does nothing if title not splittable', () => {
const title = "This Is a Normal Title"
const title = 'This Is a Normal Title';
assert.equal(resolveSplitTitle(title), title)
})
assert.equal(resolveSplitTitle(title), title);
});
it('extracts titles from breadcrumb-like titles', () => {
const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com"
const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com';
assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ")
})
assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth ');
});
it('cleans domains from titles at the front', () => {
const title = "NYTimes - The Best Gadgets on Earth"
const url = "https://www.nytimes.com/bits/blog/etc/"
const title = 'NYTimes - The Best Gadgets on Earth';
const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
})
assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
});
it('cleans domains from titles at the back', () => {
const title = "The Best Gadgets on Earth | NYTimes"
const url = "https://www.nytimes.com/bits/blog/etc/"
const title = 'The Best Gadgets on Earth | NYTimes';
const url = 'https://www.nytimes.com/bits/blog/etc/';
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
})
})
assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
});
});

@ -1,25 +1,26 @@
import { TITLE_SPLITTERS_RE } from './constants'
import { resolveSplitTitle } from './index'
import { stripTags } from 'utils/dom'
import { stripTags } from 'utils/dom';
import { TITLE_SPLITTERS_RE } from './constants';
import { resolveSplitTitle } from './index';
export default function cleanTitle(title, { url, $ }) {
  // If title has |, :, or - in it, see if
  // we can clean it up.
  // NOTE(review): TITLE_SPLITTERS_RE carries the /g flag, so .test()
  // advances lastIndex between calls — consider a non-global copy here.
  if (TITLE_SPLITTERS_RE.test(title)) {
    title = resolveSplitTitle(title, url);
  }

  // Final sanity check that we didn't get a crazy title.
  // if (title.length > 150 || title.length < 15) {
  if (title.length > 150) {
    // If we did, return h1 from the document if it exists
    const h1 = $('h1');
    if (h1.length === 1) {
      title = h1.text();
    }
  }

  // strip any html tags in the title text
  return stripTags(title, $).trim();
}

@ -1,8 +1,8 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import { cleanTitle } from './index'
import HTML from './fixtures/html';
import { cleanTitle } from './index';
describe('cleanTitle(title, { url, $ })', () => {
it('uses a single h1 if the title is too short or too long', () => {
@ -10,28 +10,27 @@ describe('cleanTitle(title, { url, $ })', () => {
// const $ = cheerio.load(HTML.docWithH1)
//
// assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
})
});
it('only uses h1 if there is only one on the page', () => {
const title = "Too Short"
const $ = cheerio.load(HTML.docWith2H1s)
const title = 'Too Short';
const $ = cheerio.load(HTML.docWith2H1s);
assert.equal(cleanTitle(title, { url: '', $ }), title)
})
assert.equal(cleanTitle(title, { url: '', $ }), title);
});
it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before)
const title = $('h1').html()
const $ = cheerio.load(HTML.docWithTagsInH1.before);
const title = $('h1').html();
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after)
})
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after);
});
it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love "
const $ = cheerio.load(HTML.docWithTagsInH1.before)
const title = " This Is a Great Title That You'll Love ";
const $ = cheerio.load(HTML.docWithTagsInH1.before);
assert.equal(cleanTitle(title, { url: '', $ }), title.trim())
})
})
assert.equal(cleanTitle(title, { url: '', $ }), title.trim());
});
});

@ -1,12 +1,11 @@
import GenericExtractor from './generic'
import NYMagExtractor from './custom/nymag.com'
import BloggerExtractor from './custom/blogspot.com'
import WikipediaExtractor from './custom/wikipedia.org'
import NYMagExtractor from './custom/nymag.com';
import BloggerExtractor from './custom/blogspot.com';
import WikipediaExtractor from './custom/wikipedia.org';
const Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
}
};
export default Extractors
export default Extractors;

@ -1 +1 @@
export const ATTR_RE = /\[([\w-]+)\]/
export const ATTR_RE = /\[([\w-]+)\]/;

@ -14,27 +14,27 @@ const BloggerExtractor = {
// Convert the noscript tag to a div
transforms: {
'noscript': 'div'
noscript: 'div',
},
},
author: {
selectors: [
'.post-author-name'
]
'.post-author-name',
],
},
title: {
selectors: [
'h2.title',
]
],
},
datePublished: {
selectors: [
'span.publishdate',
]
}
}
],
},
};
export default BloggerExtractor
export default BloggerExtractor;

@ -22,37 +22,39 @@ const NYMagExtractor = {
// the transformation.
transforms: {
// Convert h1s to h2s
'h1': 'h2',
h1: 'h2',
// Convert lazy-loaded noscript images to figures
'noscript': ($node) => {
const $children = $node.children()
noscript: ($node) => {
const $children = $node.children();
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure'
return 'figure';
}
}
}
return null;
},
},
},
title: {
selectors: [
'h1.headline-primary',
'h1',
]
],
},
author: {
selectors: [
'.by-authors',
]
],
},
datePublished: {
selectors: [
'time.article-timestamp[datetime]',
'time.article-timestamp',
]
}
}
],
},
};
export default NYMagExtractor
export default NYMagExtractor;

@ -8,7 +8,7 @@ const WikipediaExtractor = {
// transform top infobox to an image with caption
transforms: {
'.infobox img': ($node) => {
$node.parents('.infobox').prepend($node)
$node.parents('.infobox').prepend($node);
},
'.infobox caption': 'figcaption',
'.infobox': 'figure',
@ -28,15 +28,15 @@ const WikipediaExtractor = {
title: {
selectors: [
'h2.title',
]
],
},
datePublished: {
selectors: [
'#footer-info-lastmod',
]
],
},
}
};
export default WikipediaExtractor
export default WikipediaExtractor;

@ -5,22 +5,22 @@
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
export const AUTHOR_META_TAGS = [
'byl',
'clmst',
'dc.author',
'dcsext.author',
'dc.creator',
'rbauthors',
'authors',
]
'byl',
'clmst',
'dc.author',
'dcsext.author',
'dc.creator',
'rbauthors',
'authors',
];
export const AUTHOR_MAX_LENGTH = 300
export const AUTHOR_MAX_LENGTH = 300;
// An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
export const AUTHOR_SELECTORS = [
@ -47,12 +47,12 @@ export const AUTHOR_SELECTORS = [
'.articleauthor',
'.ArticleAuthor',
'.byline',
]
];
// An ordered list of Selectors to find likely article authors, with
// regular expression for content.
// Matches strings that begin with "By" (ignoring leading whitespace).
const bylineRe = /^[\n\s]*By/i;

export const BYLINE_SELECTORS_RE = [
  ['#byline', bylineRe],
  ['.byline', bylineRe],
];

@ -1,49 +1,48 @@
import { cleanAuthor } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import {
AUTHOR_META_TAGS,
AUTHOR_MAX_LENGTH,
AUTHOR_SELECTORS,
BYLINE_SELECTORS_RE,
} from './constants'
import { cleanAuthor } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
} from './constants';
const GenericAuthorExtractor = {
  // Extract a likely author, trying progressively looser strategies:
  // meta tags, then known selectors, then regex-gated byline selectors.
  // Returns null when nothing plausible is found.
  extract({ $, metaCache }) {
    let author;

    // First, check to see if we have a matching
    // meta tag that we can make use of.
    author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }

    // Last, use our looser regular-expression based selectors for
    // potential authors.
    for (const [selector, regex] of BYLINE_SELECTORS_RE) {
      const node = $(selector);
      if (node.length === 1) {
        const text = node.text();
        if (regex.test(text)) {
          return cleanAuthor(text);
        }
      }
    }

    return null;
  },
};

export default GenericAuthorExtractor;

@ -1,46 +1,46 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import GenericAuthorExtractor from './extractor'
import HTML from './fixtures/html';
import GenericAuthorExtractor from './extractor';
describe('GenericAuthorExtractor', () => {
describe('extract($, cachedMeta)', () => {
it('extracts author from meta tags', () => {
const $ = cheerio.load(HTML.authorMeta.test)
const $ = cheerio.load(HTML.authorMeta.test);
const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] }
)
{ $, metaCache: ['dc.author', 'something-else'] }
);
assert.equal(result, HTML.authorMeta.result)
})
assert.equal(result, HTML.authorMeta.result);
});
it('extracts author from author selectors', () => {
const $ = cheerio.load(HTML.authorSelectors.test)
const $ = cheerio.load(HTML.authorSelectors.test);
const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] }
)
{ $, metaCache: ['dc.author', 'something-else'] }
);
assert.equal(result, HTML.authorSelectors.result)
})
assert.equal(result, HTML.authorSelectors.result);
});
it('extracts author with regex selectors', () => {
const $ = cheerio.load(HTML.authorRegSelectors.test)
const $ = cheerio.load(HTML.authorRegSelectors.test);
const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] }
)
{ $, metaCache: ['dc.author', 'something-else'] }
);
assert.equal(result, HTML.authorRegSelectors.result)
})
assert.equal(result, HTML.authorRegSelectors.result);
});
it('returns null if no author found', () => {
const $ = cheerio.load('<div></div>')
const $ = cheerio.load('<div></div>');
const result = GenericAuthorExtractor.extract(
{ $, metaCache: ["dc.author", "something-else"] }
)
{ $, metaCache: ['dc.author', 'something-else'] }
);
assert.equal(result, null)
})
})
})
assert.equal(result, null);
});
});
});

@ -5,7 +5,7 @@ const HTML = {
<meta name="dc.author" value="Adam" />
</html>
`,
result: `Adam`
result: 'Adam',
},
authorSelectors: {
test: `
@ -15,7 +15,7 @@ const HTML = {
</div>
</div>
`,
result: `Adam`
result: 'Adam',
},
authorRegSelectors: {
test: `
@ -25,8 +25,8 @@ const HTML = {
</div>
</div>
`,
result: `Adam`
result: 'Adam',
},
}
};
export default HTML
export default HTML;

@ -1,11 +1,12 @@
import {
scoreContent,
findTopCandidate,
} from './scoring'
import {
stripUnlikelyCandidates,
convertToParagraphs,
} from 'utils/dom'
} from 'utils/dom';
import {
scoreContent,
findTopCandidate,
} from './scoring';
// Using a variety of scoring techniques, extract the content most
// likely to be article text.
@ -26,12 +27,12 @@ export default function extractBestNode($, opts) {
if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($)
$ = stripUnlikelyCandidates($);
}
$ = convertToParagraphs($)
$ = scoreContent($, opts.weightNodes)
const $topCandidate = findTopCandidate($)
$ = convertToParagraphs($);
$ = scoreContent($, opts.weightNodes);
const $topCandidate = findTopCandidate($);
return $topCandidate
return $topCandidate;
}

@ -1,24 +1,26 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
// import HTML from './fixtures/html'
import extractBestNode from './extract-best-node'
import extractBestNode from './extract-best-node';
describe('extractBestNode($, flags)', () => {
it("scores the dom nodes and returns the best option", () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8')
it('scores the dom nodes and returns the best option', () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
}
stripUnlikelyCandidates: true,
weightNodes: true,
};
let $ = cheerio.load(html)
const $ = cheerio.load(html);
const bestNode = extractBestNode($, opts)
const bestNode = extractBestNode($, opts);
assert(typeof bestNode, 'object');
// console.log(bestNode.html())
// assert.equal($(bestNode).text().length, 3652)
})
})
});
});

@ -1,10 +1,11 @@
import cheerio from 'cheerio'
import 'babel-polyfill'
import cheerio from 'cheerio';
import 'babel-polyfill';
import extractBestNode from './extract-best-node'
import { nodeIsSufficient } from 'utils/dom'
import { cleanContent } from 'cleaners'
import { normalizeSpaces } from 'utils/text'
import { nodeIsSufficient } from 'utils/dom';
import { cleanContent } from 'cleaners';
import { normalizeSpaces } from 'utils/text';
import extractBestNode from './extract-best-node';
const GenericContentExtractor = {
defaultOpts: {
@ -33,46 +34,44 @@ const GenericContentExtractor = {
// cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
extract({ $, html, title, url }, opts) {
opts = { ...this.defaultOpts, ...opts }
opts = { ...this.defaultOpts, ...opts };
$ = $ || cheerio.load(html)
$ = $ || cheerio.load(html);
// Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
let node = this.getContentNode($, title, url, opts)
let node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) {
return this.cleanAndReturnNode(node, $)
} else {
// We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
for (const key of Reflect.ownKeys(opts).filter(key => opts[key] === true)) {
opts[key] = false
$ = cheerio.load(html)
node = this.getContentNode($, title, url, opts)
if (nodeIsSufficient(node)) {
break
}
}
return this.cleanAndReturnNode(node, $);
}
return this.cleanAndReturnNode(node, $)
// We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {
opts[key] = false;
$ = cheerio.load(html);
node = this.getContentNode($, title, url, opts);
if (nodeIsSufficient(node)) {
break;
}
}
return this.cleanAndReturnNode(node, $)
return this.cleanAndReturnNode(node, $);
},
// Get node given current options
getContentNode($, title, url, opts) {
return cleanContent(
extractBestNode($, opts),
{
$,
cleanConditionally: opts.cleanConditionally,
title,
url,
})
{
$,
cleanConditionally: opts.cleanConditionally,
title,
url,
});
},
// Once we got here, either we're at our last-resort node, or
@ -80,10 +79,10 @@ const GenericContentExtractor = {
// move forward.
cleanAndReturnNode(node, $) {
if (!node) {
return null
return null;
}
return normalizeSpaces($.html(node))
return normalizeSpaces($.html(node));
// if return_type == "html":
// return normalize_spaces(node_to_html(node))
@ -91,6 +90,6 @@ const GenericContentExtractor = {
// return node
},
}
};
export default GenericContentExtractor
export default GenericContentExtractor;

@ -1,16 +1,15 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import fs from 'fs';
import { clean } from 'test-helpers'
import { clean } from 'test-helpers';
import GenericContentExtractor from './extractor'
import GenericContentExtractor from './extractor';
describe('GenericContentExtractor', function() {
this.timeout(1000000)
describe('GenericContentExtractor', function () {
this.timeout(1000000);
describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8')
it('extracts html and returns the article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
// Array.from(range(1, 100)).map((i) => {
// console.log(i)
@ -20,15 +19,10 @@ describe('GenericContentExtractor', function() {
// })
const result = clean(GenericContentExtractor.extract(
{ $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' }
))
// console.log(result)
})
})
})
));
function* range(start = 1, end = 1) {
while (start <= end) {
yield start++
}
}
assert(typeof result, 'string');
// console.log(result)
});
});
});

@ -1,15 +1,15 @@
import {
getOrInitScore,
setScore,
} from './index'
} from './index';
export default function addScore($node, $, amount) {
try {
const score = getOrInitScore($node, $) + amount
setScore($node, $, score)
} catch(e) {
console.debug(e)
} finally {
return $node
const score = getOrInitScore($node, $) + amount;
setScore($node, $, score);
} catch (e) {
// Ignoring; error occurs in scoreNode
}
return $node;
}

@ -1,28 +1,27 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import {
addScore,
getScore,
} from './index'
} from './index';
describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => {
it(`adds the specified amount to a node's score`, () => {
const $ = cheerio.load('<p score="25">Foo</p>')
let $node = $('p').first()
it('adds the specified amount to a node\'s score', () => {
const $ = cheerio.load('<p score="25">Foo</p>');
let $node = $('p').first();
$node = addScore($node, $, 25)
assert.equal(getScore($node), 50)
})
$node = addScore($node, $, 25);
assert.equal(getScore($node), 50);
});
it(`adds score if score not yet set (assumes score is 0)`, () => {
const $ = cheerio.load('<p>Foo</p>')
let $node = $('p').first()
it('adds score if score not yet set (assumes score is 0)', () => {
const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first();
$node = addScore($node, $, 25)
assert.equal(getScore($node), 25)
})
})
})
$node = addScore($node, $, 25);
assert.equal(getScore($node), 25);
});
});
});

@ -1,11 +1,11 @@
import { addScore } from './index'
import { addScore } from './index';
// Adds 1/4 of a child's score to its parent
export default function addToParent(node, $, score) {
const parent = node.parent()
const parent = node.parent();
if (parent) {
addScore(parent, $, score * .25)
addScore(parent, $, score * 0.25);
}
return node
return node;
}

@ -1,24 +1,23 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import {
addToParent,
getScore,
} from './index'
} from './index';
describe('Scoring utils', () => {
describe('addToParent(node, $, amount)', () => {
it(`adds 1/4 of a node's score it its parent`, () => {
const html = '<div score="25"><p score="40">Foo</p></div>'
const $ = cheerio.load(html)
let $node = $('p').first()
it('adds 1/4 of a node\'s score it its parent', () => {
const html = '<div score="25"><p score="40">Foo</p></div>';
const $ = cheerio.load(html);
let $node = $('p').first();
$node = addToParent($node, $, 40)
$node = addToParent($node, $, 40);
assert.equal(getScore($node.parent()), 35)
assert.equal(getScore($node), 40)
})
})
})
assert.equal(getScore($node.parent()), 35);
assert.equal(getScore($node), 40);
});
});
});

@ -1,49 +1,49 @@
//// CONTENT FETCHING CONSTANTS ////
// // CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [
'ad-break',
'adbox',
'advert',
'addthis',
'agegate',
'aux',
'blogger-labels',
'combx',
'comment',
'conversation',
'disqus',
'entry-unrelated',
'extra',
'foot',
'form',
'header',
'hidden',
'loader',
'login', // Note: This can hit 'blogindex'.
'menu',
'meta',
'nav',
'pager',
'pagination',
'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup',
'printfriendly',
'related',
'remove',
'remark',
'rss',
'share',
'shoutbox',
'sidebar',
'sociable',
'sponsor',
'tools'
]
'ad-break',
'adbox',
'advert',
'addthis',
'agegate',
'aux',
'blogger-labels',
'combx',
'comment',
'conversation',
'disqus',
'entry-unrelated',
'extra',
'foot',
'form',
'header',
'hidden',
'loader',
'login', // Note: This can hit 'blogindex'.
'menu',
'meta',
'nav',
'pager',
'pagination',
'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup',
'printfriendly',
'related',
'remove',
'remark',
'rss',
'share',
'shoutbox',
'sidebar',
'sociable',
'sponsor',
'tools',
];
// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
@ -57,56 +57,56 @@ export const UNLIKELY_CANDIDATES_BLACKLIST = [
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [
'and',
'article',
'body',
'blogindex',
'column',
'content',
'entry-content-asset',
'format', // misuse of form
'hfeed',
'hentry',
'hatom',
'main',
'page',
'posts',
'shadow'
]
'and',
'article',
'body',
'blogindex',
'column',
'content',
'entry-content-asset',
'format', // misuse of form
'hfeed',
'hentry',
'hatom',
'main',
'page',
'posts',
'shadow',
];
// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
export const DIV_TO_P_BLOCK_TAGS = [
'a',
'blockquote',
'dl',
'div',
'img',
'p',
'pre',
'table',
].join(',')
'a',
'blockquote',
'dl',
'div',
'img',
'p',
'pre',
'table',
].join(',');
// A list of tags that should be ignored when trying to find the top candidate
// for a document.
export const NON_TOP_CANDIDATE_TAGS = [
'br',
'b',
'i',
'label',
'hr',
'area',
'base',
'basefont',
'input',
'img',
'link',
'meta',
]
'br',
'b',
'i',
'label',
'hr',
'area',
'base',
'basefont',
'input',
'img',
'link',
'meta',
];
export const NON_TOP_CANDIDATE_TAGS_RE =
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i')
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');
// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
@ -118,53 +118,15 @@ export const HNEWS_CONTENT_SELECTORS = [
['.post', '.postbody'],
['.post', '.post_body'],
['.post', '.post-body'],
]
// export const HNEWS_CONTENT_SELECTORS = [
// {
// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['hentry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['entry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'),
// must_exist: {
// classes: ['entry', 'entry_content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'),
// must_exist: {
// classes: ['post', 'post-body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'),
// must_exist: {
// classes: ['post', 'post_body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'),
// must_exist: {
// classes: ['post', 'postbody'],
// }
// },
// ]
];
export const PHOTO_HINTS = [
'figure',
'photo',
'image',
'caption'
]
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
'figure',
'photo',
'image',
'caption',
];
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');
// A list of strings that denote a positive scoring for this content as being
@ -172,175 +134,175 @@ export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [
'article',
'articlecontent',
'instapaper_body',
'blog',
'body',
'content',
'entry-content-asset',
'entry',
'hentry',
'main',
'Normal',
'page',
'pagination',
'permalink',
'post',
'story',
'text',
'[-_]copy', //usatoday
'\Bcopy'
]
'article',
'articlecontent',
'instapaper_body',
'blog',
'body',
'content',
'entry-content-asset',
'entry',
'hentry',
'main',
'Normal',
'page',
'pagination',
'permalink',
'post',
'story',
'text',
'[-_]copy', // usatoday
'\Bcopy',
];
// The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');
// Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i')
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [
'adbox',
'advert',
'author',
'bio',
'bookmark',
'bottom',
'byline',
'clear',
'com-',
'combx',
'comment',
'comment\B',
'contact',
'copy',
'credit',
'crumb',
'date',
'deck',
'excerpt',
'featured', //tnr.com has a featured_content which throws us off
'foot',
'footer',
'footnote',
'graf',
'head',
'info',
'infotext', //newscientist.com copyright
'instapaper_ignore',
'jump',
'linebreak',
'link',
'masthead',
'media',
'meta',
'modal',
'outbrain', //slate.com junk
'promo',
'pr_', // autoblog - press release
'related',
'respond',
'roundcontent', //lifehacker restricted content warning
'scroll',
'secondary',
'share',
'shopping',
'shoutbox',
'side',
'sidebar',
'sponsor',
'stamp',
'sub',
'summary',
'tags',
'tools',
'widget'
]
'adbox',
'advert',
'author',
'bio',
'bookmark',
'bottom',
'byline',
'clear',
'com-',
'combx',
'comment',
'comment\B',
'contact',
'copy',
'credit',
'crumb',
'date',
'deck',
'excerpt',
'featured', // tnr.com has a featured_content which throws us off
'foot',
'footer',
'footnote',
'graf',
'head',
'info',
'infotext', // newscientist.com copyright
'instapaper_ignore',
'jump',
'linebreak',
'link',
'masthead',
'media',
'meta',
'modal',
'outbrain', // slate.com junk
'promo',
'pr_', // autoblog - press release
'related',
'respond',
'roundcontent', // lifehacker restricted content warning
'scroll',
'secondary',
'share',
'shopping',
'shoutbox',
'side',
'sidebar',
'sponsor',
'stamp',
'sub',
'summary',
'tags',
'tools',
'widget',
];
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')
export const DIGIT_RE = new RegExp('[0-9]');
// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i');
// Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i')
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');
// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
export const BLOCK_LEVEL_TAGS = [
'article',
'aside',
'blockquote',
'body',
'br',
'button',
'canvas',
'caption',
'col',
'colgroup',
'dd',
'div',
'dl',
'dt',
'embed',
'fieldset',
'figcaption',
'figure',
'footer',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'header',
'hgroup',
'hr',
'li',
'map',
'object',
'ol',
'output',
'p',
'pre',
'progress',
'section',
'table',
'tbody',
'textarea',
'tfoot',
'th',
'thead',
'tr',
'ul',
'video',
]
export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i')
'article',
'aside',
'blockquote',
'body',
'br',
'button',
'canvas',
'caption',
'col',
'colgroup',
'dd',
'div',
'dl',
'dt',
'embed',
'fieldset',
'figcaption',
'figure',
'footer',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'header',
'hgroup',
'hr',
'li',
'map',
'object',
'ol',
'output',
'p',
'pre',
'progress',
'section',
'table',
'tbody',
'textarea',
'tfoot',
'th',
'thead',
'tr',
'ul',
'video',
];
export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i');
// The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|')
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i')
const candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');
export const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');
const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')
const candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');
export const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');
export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i')
export const UNLIKELY_RE = new RegExp(`!(${candidatesWhitelist})|(${candidatesBlacklist})`, 'i');
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i')
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i')
export const BAD_TAGS = new RegExp('^(address|form)$', 'i')
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');
export const BAD_TAGS = new RegExp('^(address|form)$', 'i');
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i')
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');

@ -1,115 +1,35 @@
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants'
import { getScore } from './index'
import {
textLength,
linkDensity
} from 'utils/dom'
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
import mergeSiblings from './merge-siblings';
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) {
let $candidate, topScore = 0
let $candidate;
let topScore = 0;
$('*[score]').each((index, node) => {
const $node = $(node)
const $node = $(node);
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
return
return;
}
const score = getScore($node)
const score = getScore($node);
if (score > topScore) {
topScore = score
$candidate = $node
topScore = score;
$candidate = $node;
}
})
});
// If we don't have a candidate, return the body
// or whatever the first element is
if (!$candidate) {
return $('body') || $('*').first()
return $('body') || $('*').first();
}
$candidate = mergeSiblings($candidate, topScore, $)
$candidate = mergeSiblings($candidate, topScore, $);
return $candidate
}
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
export function mergeSiblings($candidate, topScore, $) {
if (!$candidate.parent().length) {
return $candidate
}
const siblingScoreThreshold = Math.max(10, topScore * 0.2)
let wrappingDiv = $('<div></div>')
$candidate.parent().children().each((index, child) => {
const $child = $(child)
// Ignore tags like BR, HR, etc
if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
return
}
const childScore = getScore($child)
if (childScore) {
if ($child === $candidate) {
wrappingDiv.append($child)
} else {
let contentBonus = 0
// extract to scoreLinkDensity() TODO
const density = linkDensity($child)
// If sibling has a very low link density,
// give it a small bonus
if (density < .05) {
contentBonus = contentBonus + 20
}
// If sibling has a high link density,
// give it a penalty
if (density >= 0.5) {
contentBonus = contentBonus - 20
}
// If sibling node has the same class as
// candidate, give it a bonus
if ($child.attr('class') === $candidate.attr('class')) {
contentBonus = contentBonus + topScore * .2
}
const newScore = getScore($child) + contentBonus
if (newScore >= siblingScoreThreshold) {
return wrappingDiv.append($child)
} else if (child.tagName === 'p') {
const childContentLength = textLength($child.text())
if (childContentLength > 80 && density < .25) {
return wrappingDiv.append($child)
} else if (childContentLength <= 80 && density === 0 &&
hasSentenceEnd(childContent)) {
return wrappingDiv.append($child)
}
}
}
}
})
return wrappingDiv
}
// TODO Extract into util - AP
// Given a string, return True if it appears to have an ending sentence
// within it, false otherwise.
const SENTENCE_END_RE = new RegExp('\.( |$)')
function hasSentenceEnd(text) {
return SENTENCE_END_RE.test(text)
return $candidate;
}

@ -1,58 +1,58 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import {
getScore,
findTopCandidate,
scoreContent
} from './index'
scoreContent,
} from './index';
describe('findTopCandidate($)', () => {
it("finds the top candidate from simple case", () => {
const $ = cheerio.load(HTML.findDom1)
it('finds the top candidate from simple case', () => {
const $ = cheerio.load(HTML.findDom1);
const $$topCandidate = findTopCandidate($)
const $$topCandidate = findTopCandidate($);
assert.equal(getScore($$topCandidate), 100)
})
assert.equal(getScore($$topCandidate), 100);
});
it("finds the top candidate from a nested case", () => {
const $ = cheerio.load(HTML.findDom2)
it('finds the top candidate from a nested case', () => {
const $ = cheerio.load(HTML.findDom2);
const $$topCandidate = findTopCandidate($)
const $$topCandidate = findTopCandidate($);
// this is wrapped in a div so checking
// the score of the first child
assert.equal(getScore($$topCandidate.children().first()), 50)
})
assert.equal(getScore($$topCandidate.children().first()), 50);
});
it("ignores tags like BR", () => {
const $ = cheerio.load(HTML.findDom3)
it('ignores tags like BR', () => {
const $ = cheerio.load(HTML.findDom3);
const $topCandidate = findTopCandidate($)
const $topCandidate = findTopCandidate($);
assert.equal(getScore($topCandidate), 50)
})
assert.equal(getScore($topCandidate), 50);
});
it("returns BODY if no candidates found", () => {
const $ = cheerio.load(HTML.topBody)
it('returns BODY if no candidates found', () => {
const $ = cheerio.load(HTML.topBody);
const $topCandidate = findTopCandidate($)
const $topCandidate = findTopCandidate($);
assert.equal($topCandidate.get(0).tagName, 'body')
})
assert.equal($topCandidate.get(0).tagName, 'body');
});
it("appends a sibling with a good enough score", () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
it('appends a sibling with a good enough score', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
let $ = cheerio.load(html)
$ = scoreContent($)
let $ = cheerio.load(html);
$ = scoreContent($);
const $topCandidate = findTopCandidate($)
assert.equal($($topCandidate).text().length, 3652)
})
})
const $topCandidate = findTopCandidate($);
assert.equal($($topCandidate).text().length, 3652);
});
});

@ -237,7 +237,7 @@ const HTML = {
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`
`,
},
// cleanImages
@ -252,7 +252,7 @@ const HTML = {
<div>
<img width="50">
</div>
`
`,
},
cleanHeight: {
before: `
@ -264,7 +264,7 @@ const HTML = {
<div>
<img width="50">
</div>
`
`,
},
cleanSpacer: {
before: `
@ -279,7 +279,7 @@ const HTML = {
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`
`,
},
// stripJunkTags
stripsJunk: {
@ -298,7 +298,7 @@ const HTML = {
<div>
<p>What an article</p>
</div>
`
`,
},
// stripHOnes
@ -314,7 +314,7 @@ const HTML = {
<div>
<p>What do you think?</p>
</div>
`
`,
},
convertThreeHOnes: {
before: `
@ -334,7 +334,7 @@ const HTML = {
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`
`,
},
// cleanAttributes
@ -348,7 +348,7 @@ const HTML = {
<div>
<p>What do you think?</p>
</div>
`
`,
},
removeAlign: {
before: `
@ -360,7 +360,7 @@ const HTML = {
<div>
<p>What do you think?</p>
</div>
`
`,
},
// removeEmpty
@ -375,7 +375,7 @@ const HTML = {
<div>
<p>What do you think?</p>
</div>
`
`,
},
doNotRemoveBr: {
before: `
@ -392,7 +392,7 @@ const HTML = {
<div></div>
<p>What do you think?</p>
</div>
`
`,
},
doNotNested: {
before: `
@ -409,7 +409,7 @@ const HTML = {
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`
`,
},
// cleanConditionally
@ -433,7 +433,7 @@ const HTML = {
</p>
<p>What do you think?</p>
</div>
`
`,
},
removeTooManyInputs: {
before: `
@ -467,7 +467,7 @@ const HTML = {
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
`,
},
removeShortNoImg: {
before: `
@ -490,7 +490,7 @@ const HTML = {
<img src="asdf">
</div>
</div>
`
`,
},
linkDensityHigh: {
@ -527,7 +527,7 @@ const HTML = {
<li>Keep this one</li>
</ul>
</div>
`
`,
},
goodScoreTooDense: {
before: `
@ -567,7 +567,7 @@ const HTML = {
<li>Keep this one</li>
</ul>
</div>
`
`,
},
previousEndsInColon: {
before: `
@ -608,7 +608,7 @@ const HTML = {
<p>What do you think?</p>
</div>
`,
after: `What do you think?`
after: 'What do you think?',
},
// cleanHeaders
@ -627,7 +627,7 @@ const HTML = {
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`
`,
},
cleanTitleMatch: {
before: `
@ -642,7 +642,7 @@ const HTML = {
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
`,
},
dropWithNegativeWeight: {
before: `
@ -657,8 +657,8 @@ const HTML = {
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
`,
},
}
};
export default HTML
export default HTML;

@ -82,6 +82,6 @@ const HTML = {
</article>
<body>
`,
}
};
export default HTML
export default HTML;

@ -3,27 +3,26 @@ import {
scoreNode,
getWeight,
addToParent,
} from './index'
} from './index';
// gets and returns the score if it exists
// if not, initializes a score based on
// the node's tag type
export default function getOrInitScore($node, $, weightNodes=true) {
let score = getScore($node)
export default function getOrInitScore($node, $, weightNodes = true) {
let score = getScore($node);
if (score) {
return score
} else {
score = scoreNode($node)
return score;
}
if (weightNodes) {
score = score + getWeight($node)
}
score = scoreNode($node);
addToParent($node, $, score)
if (weightNodes) {
score += getWeight($node);
}
return score
}
addToParent($node, $, score);
return score;
}

@ -1,61 +1,61 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import {
getOrInitScore,
getScore,
} from './index'
} from './index';
describe('getOrInitScore(node, $)', () => {
describe('when score set', () => {
it(`returns score if node's score already set`, () => {
const html = '<p score="40">Foo</p>'
const $ = cheerio.load(html)
const node = $('p').first()
it('returns score if node\'s score already set', () => {
const html = '<p score="40">Foo</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = getOrInitScore(node, $)
const score = getOrInitScore(node, $);
assert.equal(score, 40)
})
})
assert.equal(score, 40);
});
});
describe('when no score set', () => {
it(`returns 0 if no class/id and text < 25 chars`, () => {
const html = '<p>Foo</p>'
const $ = cheerio.load(html)
const node = $('p').first()
it('returns 0 if no class/id and text < 25 chars', () => {
const html = '<p>Foo</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = getOrInitScore(node, $)
const score = getOrInitScore(node, $);
assert.equal(score, 0)
})
assert.equal(score, 0);
});
it(`returns score if no class/id and has commas/length`, () => {
const $ = cheerio.load(HTML.score19)
const node = $('p').first()
it('returns score if no class/id and has commas/length', () => {
const $ = cheerio.load(HTML.score19);
const node = $('p').first();
const score = getOrInitScore(node, $)
const score = getOrInitScore(node, $);
assert.equal(score, 19)
})
assert.equal(score, 19);
});
it(`returns greater score if weighted class/id is set`, () => {
const $ = cheerio.load(HTML.score44)
const node = $('p').first()
it('returns greater score if weighted class/id is set', () => {
const $ = cheerio.load(HTML.score44);
const node = $('p').first();
const score = getOrInitScore(node, $)
const score = getOrInitScore(node, $);
assert.equal(score, 44)
})
assert.equal(score, 44);
});
it(`gives 1/4 of its score to its parent`, () => {
const $ = cheerio.load(HTML.score44Parent)
const node = $('p').first()
it('gives 1/4 of its score to its parent', () => {
const $ = cheerio.load(HTML.score44Parent);
const node = $('p').first();
const score = getOrInitScore(node, $)
getOrInitScore(node, $);
assert.equal(getScore(node.parent()), 16)
})
})
})
assert.equal(getScore(node.parent()), 16);
});
});
});

@ -2,5 +2,5 @@
// the node's score attribute
// returns null if no score set
export default function getScore($node) {
return parseFloat($node.attr('score')) || null
return parseFloat($node.attr('score')) || null;
}

@ -1,25 +1,22 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import { getScore } from './index'
import { getScore } from './index';
describe('Scoring utils', () => {
describe('getScore($node)', () => {
it("returns null if the node has no score set", () => {
const $ = cheerio.load('<p>Foo</p>')
const $node = $('p').first()
assert.equal(getScore($node), null)
})
it('returns null if the node has no score set', () => {
const $ = cheerio.load('<p>Foo</p>');
const $node = $('p').first();
assert.equal(getScore($node), null);
});
it("returns 25 if the node has a score attr of 25", () => {
const $ = cheerio.load('<p score="25">Foo</p>')
const $node = $('p').first()
assert.equal(typeof getScore($node), 'number')
assert.equal(getScore($node), 25)
})
})
})
it('returns 25 if the node has a score attr of 25', () => {
const $ = cheerio.load('<p score="25">Foo</p>');
const $node = $('p').first();
assert.equal(typeof getScore($node), 'number');
assert.equal(getScore($node), 25);
});
});
});

@ -3,42 +3,42 @@ import {
POSITIVE_SCORE_RE,
PHOTO_HINTS_RE,
READABILITY_ASSET,
} from './constants'
} from './constants';
// Get the score of a node based on its className and id.
export default function getWeight(node) {
const classes = node.attr('class')
const id = node.attr('id')
let score = 0
const classes = node.attr('class');
const id = node.attr('id');
let score = 0;
if (id) {
// if id exists, try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(id)) {
score = score + 25
score += 25;
}
if (NEGATIVE_SCORE_RE.test(id)) {
score = score - 25
score -= 25;
}
}
if (classes) {
if (score == 0) {
if (score === 0) {
// if classes exist and id did not contribute to score
// try to score on both positive and negative
if (POSITIVE_SCORE_RE.test(classes)) {
score = score + 25
score += 25;
}
if (NEGATIVE_SCORE_RE.test(classes)) {
score = score - 25
score -= 25;
}
}
// even if score has been set by id, add score for
// even if score has been set by id, add score for
// possible photo matches
// "try to keep photos if we can"
if (PHOTO_HINTS_RE.test(classes)) {
score = score + 10
score += 10;
}
// add 25 if class matches entry-content-asset,
@ -46,11 +46,10 @@ export default function getWeight(node) {
// Readability publisher guidelines
// https://www.readability.com/developers/guidelines
if (READABILITY_ASSET.test(classes)) {
score = score + 25
score += 25;
}
}
return score
return score;
}

@ -1,59 +1,58 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/get-weight'
import HTML from './fixtures/get-weight';
import {
getWeight
} from './index'
getWeight,
} from './index';
describe('Generic Extractor Utils', () => {
describe('getWeight(node)', () => {
it("returns a score of 25 if node has positive id", () => {
const $ = cheerio.load(HTML.positiveId)
assert.equal(getWeight($('div')), 25)
})
it("returns a score of -25 if node has negative id", () => {
const $ = cheerio.load(HTML.negativeId)
assert.equal(getWeight($('div')), -25)
})
it("returns a score of 25 if node has positive class", () => {
const $ = cheerio.load(HTML.positiveClass)
assert.equal(getWeight($('div')), 25)
})
it("returns a score of -25 if node has negative class", () => {
const $ = cheerio.load(HTML.negativeClass)
assert.equal(getWeight($('div')), -25)
})
it("returns a score of 25 if node has both positive id and class", () => {
const $ = cheerio.load(HTML.positiveIdAndClass)
assert.equal(getWeight($('div')), 25)
})
it("returns a score of 25 if node has pos id and neg class", () => {
it('returns a score of 25 if node has positive id', () => {
const $ = cheerio.load(HTML.positiveId);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative id', () => {
const $ = cheerio.load(HTML.negativeId);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has positive class', () => {
const $ = cheerio.load(HTML.positiveClass);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of -25 if node has negative class', () => {
const $ = cheerio.load(HTML.negativeClass);
assert.equal(getWeight($('div')), -25);
});
it('returns a score of 25 if node has both positive id and class', () => {
const $ = cheerio.load(HTML.positiveIdAndClass);
assert.equal(getWeight($('div')), 25);
});
it('returns a score of 25 if node has pos id and neg class', () => {
// is this really wanted? id="entry" class="adbox"
// should get positive score?
const $ = cheerio.load(HTML.positiveIdNegClass)
assert.equal(getWeight($('div')), 25)
})
const $ = cheerio.load(HTML.positiveIdNegClass);
assert.equal(getWeight($('div')), 25);
});
it("returns a score of 10 if node has pos img class", () => {
const $ = cheerio.load(HTML.positivePhotoClass)
assert.equal(getWeight($('div')), 10)
})
it('returns a score of 10 if node has pos img class', () => {
const $ = cheerio.load(HTML.positivePhotoClass);
assert.equal(getWeight($('div')), 10);
});
it("returns a score of 35 if node has pos id pos img class", () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto)
assert.equal(getWeight($('div')), 35)
})
it('returns a score of 35 if node has pos id pos img class', () => {
const $ = cheerio.load(HTML.positiveIdAndPhoto);
assert.equal(getWeight($('div')), 35);
});
it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
const $ = cheerio.load(HTML.entryContentAsset)
assert.equal(getWeight($('div')), 50)
})
})
})
const $ = cheerio.load(HTML.entryContentAsset);
assert.equal(getWeight($('div')), 50);
});
});
});

@ -1,13 +1,13 @@
// Scoring
export { default as getWeight } from './get-weight'
export { default as getScore } from './get-score'
export { default as scoreCommas } from './score-commas'
export { default as scoreLength } from './score-length'
export { default as scoreParagraph } from './score-paragraph'
export { default as setScore } from './set-score'
export { default as addScore } from './add-score'
export { default as addToParent } from './add-to-parent'
export { default as getOrInitScore } from './get-or-init-score'
export { default as scoreNode } from './score-node'
export { default as scoreContent } from './score-content'
export { default as findTopCandidate } from './find-top-candidate'
export { default as getWeight } from './get-weight';
export { default as getScore } from './get-score';
export { default as scoreCommas } from './score-commas';
export { default as scoreLength } from './score-length';
export { default as scoreParagraph } from './score-paragraph';
export { default as setScore } from './set-score';
export { default as addScore } from './add-score';
export { default as addToParent } from './add-to-parent';
export { default as getOrInitScore } from './get-or-init-score';
export { default as scoreNode } from './score-node';
export { default as scoreContent } from './score-content';
export { default as findTopCandidate } from './find-top-candidate';

@ -0,0 +1,79 @@
import {
textLength,
linkDensity,
} from 'utils/dom';
import { hasSentenceEnd } from 'utils/text';
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
// Now that we have a top candidate, look through its siblings to see
// if any of them are decently scored. If they are, they may be split
// parts of the content (like two divs, a preamble and a body). Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//
// @param {Object} $candidate - cheerio selection for the top candidate node
// @param {number} topScore   - the top candidate's content score
// @param {Object} $          - the cheerio document handle
// @returns {Object} a new wrapping <div> containing the candidate plus any
//                   qualifying siblings, or $candidate unchanged when it has
//                   no parent to collect siblings from
export default function mergeSiblings($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }

  // Siblings must score at least 20% of the top score (floor of 10) to
  // be merged in.
  const siblingScoreThreshold = Math.max(10, topScore * 0.2);
  const wrappingDiv = $('<div></div>');

  $candidate.parent().children().each((index, child) => {
    const $child = $(child);
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
      return null;
    }

    const childScore = getScore($child);
    if (childScore) {
      // Compare underlying DOM nodes: `$child === $candidate` would compare
      // distinct cheerio wrapper objects by reference and never match, which
      // previously routed the candidate itself through the sibling-bonus
      // logic below.
      if ($child.get(0) === $candidate.get(0)) {
        wrappingDiv.append($child);
      } else {
        let contentBonus = 0;
        // extract to scoreLinkDensity() TODO
        const density = linkDensity($child);

        // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        }

        // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        }

        // If sibling node has the same class as
        // candidate, give it a bonus
        if ($child.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }

        const newScore = getScore($child) + contentBonus;

        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($child);
        } else if (child.tagName === 'p') {
          // Short paragraphs can still be kept: longer ones with few links,
          // or link-free ones that end like a real sentence.
          const childContent = $child.text();
          const childContentLength = textLength(childContent);

          if (childContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($child);
          } else if (childContentLength <= 80 && density === 0 &&
            hasSentenceEnd(childContent)) {
            return wrappingDiv.append($child);
          }
        }
      }
    }

    return null;
  });

  return wrappingDiv;
}

@ -1,5 +1,5 @@
// return 1 for every comma in text
export default function scoreCommas(text) {
return (text.match(/,/g) || []).length
return (text.match(/,/g) || []).length;
}

@ -1,20 +1,18 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import { scoreCommas } from './index'
import { scoreCommas } from './index';
describe('Scoring utils', () => {
describe('scoreCommas(text)', () => {
it(`returns 0 if text has no commas`, () => {
assert.equal(scoreCommas("Foo bar"), 0)
})
it(`returns a point for every comma in the text`, () => {
assert.equal(scoreCommas('Foo, bar'), 1)
assert.equal(scoreCommas('Foo, bar, baz'), 2)
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3)
})
})
})
it('returns 0 if text has no commas', () => {
assert.equal(scoreCommas('Foo bar'), 0);
});
it('returns a point for every comma in the text', () => {
assert.equal(scoreCommas('Foo, bar'), 1);
assert.equal(scoreCommas('Foo, bar, baz'), 2);
assert.equal(scoreCommas('Foo, bar, baz, bat'), 3);
});
});
});

@ -1,119 +1,69 @@
import { HNEWS_CONTENT_SELECTORS } from './constants'
import { convertNodeTo } from 'utils/dom';
import { HNEWS_CONTENT_SELECTORS } from './constants';
import {
scoreNode,
setScore,
getOrInitScore,
addScore,
} from './index'
} from './index';
import { convertNodeTo } from 'utils/dom'
// score content. Parents get the full value of their children's
// content score, grandparents half
export default function scoreContent($, weightNodes=true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80)
})
})
function convertSpans($node, $) {
if ($node.get(0)) {
const { tagName } = $node.get(0);
scorePs($, weightNodes)
if (tagName === 'span') {
// convert spans to divs
convertNodeTo($node, $, 'div');
}
}
}
return $
function addScoreTo($node, $, score) {
if ($node) {
convertSpans($node, $);
addScore($node, $, score);
}
}
function scorePs($, weightNodes) {
$('p, pre').toArray().map((node) => {
// The raw score for this paragraph, before we add any parent/child
// scores.
let $node = $(node)
$node = setScore($node, $, getOrInitScore($node, $, weightNodes))
let $node = $(node);
$node = setScore($node, $, getOrInitScore($node, $, weightNodes));
return $node
return $node;
}).forEach(($node) => {
// The parent scoring has to be done in a separate loop
// because otherwise scoring the parent overwrites
// the score added to the child
// Add the individual content score to the parent node
const rawScore = scoreNode($node)
const rawScore = scoreNode($node);
const $parent = $node.parent()
addScoreTo($parent, $, rawScore, weightNodes)
const $parent = $node.parent();
addScoreTo($parent, $, rawScore, weightNodes);
if ($parent) {
// Add half of the individual content score to the
// grandparent
addScoreTo($parent.parent(), $, rawScore/2, weightNodes)
addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
}
})
});
}
function convertSpans($node, $) {
if ($node.get(0)) {
const { tagName } = $node.get(0)
// score content. Parents get the full value of their children's
// content score, grandparents half
export default function scoreContent($, weightNodes = true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80);
});
});
if (tagName === 'span') {
// convert spans to divs
convertNodeTo($node, $, 'div')
}
}
}
scorePs($, weightNodes);
function addScoreTo($node, $, score, weightNodes) {
if ($node) {
convertSpans($node, $)
addScore($node, $, score)
}
return $;
}
// def _score_content(self, doc, weight_nodes=True):
// for selector in constants.HNEWS_CONTENT_SELECTORS:
// # Not self.resource.extract_by_selector because our doc is a copy
// # of the resource doc.
// nodes = extract_by_selector(doc, selector,
// AttribMap(doc))
// for node in nodes:
// self._add_score(node, 80)
//
// paras = doc.xpath('.//p | .//pre')
//
// # If we don't have any paragraphs at all, we can't score based on
// # paragraphs, so return without modifying anything else.
// if len(paras) == 0:
// return doc
//
// for para in paras:
// # Don't score invalid tags
// if not isinstance(para.tag, basestring):
// continue
//
// # The raw score for this paragraph, before we add any parent/child
// # scores.
// raw_score = self._score_node(para)
// self._set_score(para, self._get_score(para, weight_nodes))
//
// parent = para.getparent()
// if parent is not None:
// if parent.tag == 'span':
// parent.tag = 'div'
//
// # Add the individual content score to the parent node
// self._add_score(parent, raw_score, weight_nodes=weight_nodes)
//
// grandparent = parent.getparent()
// if grandparent is not None:
// if grandparent.tag == 'span':
// grandparent.tag = 'div'
//
// # Add half of the individual content score to the
// # grandparent
// gp_score = raw_score / 2.0
// self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
//
// return doc

@ -1,47 +1,45 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
import { clean } from 'test-helpers'
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import {
scoreContent,
getScore,
} from './index'
} from './index';
// TODO: Walk through these and sanity check my scores
// Commented out scores were what I expected, but I was also
// probably missing something when calculating
describe('scoreContent($, weightNodes)', () => {
it("loves hNews content", () => {
const $ = cheerio.load(HTML.hNews.before)
const result = scoreContent($).html()
it('loves hNews content', () => {
const $ = cheerio.load(HTML.hNews.before);
scoreContent($).html();
assert.equal(getScore($('div').first()), 140)
})
assert.equal(getScore($('div').first()), 140);
});
it("is so-so about non-hNews content", () => {
const $ = cheerio.load(HTML.nonHNews.before)
const result = scoreContent($).html()
it('is so-so about non-hNews content', () => {
const $ = cheerio.load(HTML.nonHNews.before);
scoreContent($).html();
assert.equal(getScore($('div').first()), 65)
})
assert.equal(getScore($('div').first()), 65);
});
it("scores this Wired article the same", () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
const $ = cheerio.load(html)
const result = scoreContent($).html()
it('scores this Wired article the same', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html);
scoreContent($).html();
assert.equal(getScore($('article').first()), 65.5)
})
assert.equal(getScore($('article').first()), 65.5);
});
it("scores this Vulture article", () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8')
let $ = cheerio.load(html)
$ = scoreContent($)
it('scores this Vulture article', () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
let $ = cheerio.load(html);
$ = scoreContent($);
assert.equal($('p[score]').length, 62)
})
})
assert.equal($('p[score]').length, 62);
});
});

@ -1,11 +1,10 @@
const idkRe = new RegExp('^(p|pre)$', 'i')
const idkRe = new RegExp('^(p|pre)$', 'i');
export default function scoreLength(textLength, tagName='p') {
let score
const chunks = textLength / 50
export default function scoreLength(textLength, tagName = 'p') {
const chunks = textLength / 50;
if (chunks > 0) {
let lengthBonus
let lengthBonus;
// No idea why p or pre are being tamped down here
// but just following the source for now
@ -13,14 +12,14 @@ export default function scoreLength(textLength, tagName='p') {
// since this is only being called from the context
// of scoreParagraph
if (idkRe.test(tagName)) {
lengthBonus = chunks - 2
lengthBonus = chunks - 2;
} else {
lengthBonus = chunks - 1.25
lengthBonus = chunks - 1.25;
}
return Math.min(Math.max(lengthBonus, 0), 3)
} else {
return 0
return Math.min(Math.max(lengthBonus, 0), 3);
}
return 0;
}

@ -1,22 +1,21 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import { scoreLength } from './index'
import { scoreLength } from './index';
describe('Scoring utils', () => {
describe('scoreLength(textLength, tagName)', () => {
it(`returns 0 if length < 50 chars`, () => {
assert.equal(scoreLength(30), 0)
})
it('returns 0 if length < 50 chars', () => {
assert.equal(scoreLength(30), 0);
});
it(`returns varying scores but maxes out at 3`, () => {
assert.equal(scoreLength(150), 1)
assert.equal(scoreLength(199), 1.98)
assert.equal(scoreLength(200), 2)
assert.equal(scoreLength(250), 3)
assert.equal(scoreLength(500), 3)
assert.equal(scoreLength(1500), 3)
})
})
})
it('returns varying scores but maxes out at 3', () => {
assert.equal(scoreLength(150), 1);
assert.equal(scoreLength(199), 1.98);
assert.equal(scoreLength(200), 2);
assert.equal(scoreLength(250), 3);
assert.equal(scoreLength(500), 3);
assert.equal(scoreLength(1500), 3);
});
});
});

@ -1,29 +1,29 @@
import { scoreParagraph } from './index'
import { scoreParagraph } from './index';
import {
PARAGRAPH_SCORE_TAGS,
CHILD_CONTENT_TAGS,
BAD_TAGS,
} from './constants'
} from './constants';
// Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag.
export default function scoreNode($node) {
const { tagName } = $node.get(0)
const { tagName } = $node.get(0);
// TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page,
// Could save doing that regex test on every node AP
if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
return scoreParagraph($node)
return scoreParagraph($node);
} else if (tagName === 'div') {
return 5
return 5;
} else if (CHILD_CONTENT_TAGS.test(tagName)) {
return 3
return 3;
} else if (BAD_TAGS.test(tagName)) {
return -3
return -3;
} else if (tagName === 'th') {
return -5
return -5;
}
return 0
return 0;
}

@ -1,95 +1,94 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import {
scoreNode,
scoreParagraph,
} from './index'
} from './index';
describe('scoreNode(node)', () => {
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => {
const html = '<p><em>Foo</em> bar</p>'
const $ = cheerio.load(html)
let node = $('p').first()
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = scoreNode(node)
const pScore = scoreParagraph(node)
const score = scoreNode(node);
const pScore = scoreParagraph(node);
assert.equal(score, pScore)
assert.equal(score, 0)
})
assert.equal(score, pScore);
assert.equal(score, 0);
});
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => {
const $ = cheerio.load(HTML.score1)
let node = $('p').first()
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score1);
const node = $('p').first();
const score = scoreNode(node)
const pScore = scoreParagraph(node)
const score = scoreNode(node);
const pScore = scoreParagraph(node);
assert.equal(score, pScore)
assert.equal(score, 1)
assert.equal(score, pScore);
assert.equal(score, 1);
});
})
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score3);
const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => {
const $ = cheerio.load(HTML.score3)
let node = $('p').first()
const score = scoreNode(node);
const pScore = scoreParagraph(node);
const score = scoreNode(node)
const pScore = scoreParagraph(node)
assert.equal(score, pScore);
assert.equal(score, 3);
});
assert.equal(score, pScore)
assert.equal(score, 3)
})
it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
const $ = cheerio.load(HTML.score19);
const node = $('p').first();
it(`scores P, LI, SPAN, and PRE using scoreParagraph`, () => {
const $ = cheerio.load(HTML.score19)
let node = $('p').first()
const score = scoreNode(node);
const pScore = scoreParagraph(node);
const score = scoreNode(node)
const pScore = scoreParagraph(node)
assert.equal(score, pScore);
assert.equal(score, 19);
});
assert.equal(score, pScore)
assert.equal(score, 19)
})
it('scores divs with 5', () => {
const $ = cheerio.load(HTML.divScore5);
const node = $('div').first();
it(`scores divs with 5`, () => {
const $ = cheerio.load(HTML.divScore5)
let node = $('div').first()
const score = scoreNode(node);
const score = scoreNode(node)
assert.equal(score, 5);
});
assert.equal(score, 5)
})
it('scores the blockquote family with 3', () => {
const $ = cheerio.load(HTML.blockquoteScore3);
const node = $('blockquote').first();
it(`scores the blockquote family with 3`, () => {
const $ = cheerio.load(HTML.blockquoteScore3)
let node = $('blockquote').first()
const score = scoreNode(node);
const score = scoreNode(node)
assert.equal(score, 3);
});
assert.equal(score, 3)
})
it('scores a form with negative 3', () => {
const $ = cheerio.load(HTML.formScoreNeg3);
const node = $('form').first();
it(`scores a form with negative 3`, () => {
const $ = cheerio.load(HTML.formScoreNeg3)
let node = $('form').first()
const score = scoreNode(node);
const score = scoreNode(node)
assert.equal(score, -3);
});
assert.equal(score, -3)
})
it('scores a TH element with negative 5', () => {
const $ = cheerio.load(HTML.thScoreNeg5);
const node = $('th').first();
it(`scores a TH element with negative 5`, () => {
const $ = cheerio.load(HTML.thScoreNeg5)
let node = $('th').first()
const score = scoreNode(node);
const score = scoreNode(node)
assert.equal(score, -5)
})
})
assert.equal(score, -5);
});
});

@ -1,35 +1,35 @@
import {
scoreCommas,
scoreLength,
} from './index'
} from './index';
// Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better.
export default function scoreParagraph(node) {
let score = 1
const text = node.text().trim()
const textLength = text.length
let score = 1;
const text = node.text().trim();
const textLength = text.length;
// If this paragraph is less than 25 characters, don't count it.
if (textLength < 25) {
return 0
return 0;
}
// Add points for any commas within this paragraph
score = score + scoreCommas(text)
score += scoreCommas(text);
// For every 50 characters in this paragraph, add another point. Up
// to 3 points.
score = score + scoreLength(textLength)
score += scoreLength(textLength);
// Articles can end with short paragraphs when people are being clever
// but they can also end with short paragraphs setting up lists of junk
// that we strip. This negative tweaks junk setup paragraphs just below
// the cutoff threshold.
if (text.slice(-1) === ':') {
score = score - 1
score -= 1;
}
return score
return score;
}

@ -1,48 +1,48 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import {
scoreParagraph,
} from './index'
} from './index';
describe('Scoring utils', () => {
describe('scoreParagraph(node)', () => {
it(`returns 0 if text is less than 25 chars`, () => {
const html = '<p><em>Foo</em> bar</p>'
const $ = cheerio.load(html)
let node = $('p').first()
it('returns 0 if text is less than 25 chars', () => {
const html = '<p><em>Foo</em> bar</p>';
const $ = cheerio.load(html);
const node = $('p').first();
const score = scoreParagraph(node)
const score = scoreParagraph(node);
assert.equal(score, 0)
})
assert.equal(score, 0);
});
it(`returns 1 if text is > 25 chars and has 0 commas`, () => {
const $ = cheerio.load(HTML.score1)
let node = $('p').first()
it('returns 1 if text is > 25 chars and has 0 commas', () => {
const $ = cheerio.load(HTML.score1);
const node = $('p').first();
const score = scoreParagraph(node)
const score = scoreParagraph(node);
assert.equal(score, 1)
})
assert.equal(score, 1);
});
it(`returns 3 if text is > 25 chars and has 2 commas`, () => {
const $ = cheerio.load(HTML.score3)
let node = $('p').first()
it('returns 3 if text is > 25 chars and has 2 commas', () => {
const $ = cheerio.load(HTML.score3);
const node = $('p').first();
const score = scoreParagraph(node)
const score = scoreParagraph(node);
assert.equal(score, 3)
})
assert.equal(score, 3);
});
it(`returns 19 if text has 15 commas, ~600 chars`, () => {
const $ = cheerio.load(HTML.score19)
let node = $('p').first()
it('returns 19 if text has 15 commas, ~600 chars', () => {
const $ = cheerio.load(HTML.score19);
const node = $('p').first();
const score = scoreParagraph(node)
const score = scoreParagraph(node);
assert.equal(score, 19)
})
})
})
assert.equal(score, 19);
});
});
});

@ -1,7 +1,6 @@
export default function setScore($node, $, score) {
$node.attr('score', score)
return $node
$node.attr('score', score);
return $node;
}

@ -1,23 +1,22 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import {
setScore,
getScore
} from './index'
getScore,
} from './index';
describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>')
let $node = $('p').first()
const $ = cheerio.load('<p>Foo</p>');
let $node = $('p').first();
const newScore = 25
$node = setScore($node, $, newScore)
const newScore = 25;
$node = setScore($node, $, newScore);
const score = getScore($node)
assert(score, newScore)
})
})
})
const score = getScore($node);
assert(score, newScore);
});
});
});

@ -3,23 +3,23 @@
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const DATE_PUBLISHED_META_TAGS = [
'article:published_time',
'displaydate',
'dc.date',
'dc.date.issued',
'rbpubdate',
'publish_date',
'pub_date',
'pagedate',
'pubdate',
'revision_date',
'doc_date',
'date_created',
'content_create_date',
'lastmodified',
'created',
'date'
]
'article:published_time',
'displaydate',
'dc.date',
'dc.date.issued',
'rbpubdate',
'publish_date',
'pub_date',
'pagedate',
'pubdate',
'revision_date',
'doc_date',
'date_created',
'content_create_date',
'lastmodified',
'created',
'date',
];
// An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
@ -42,20 +42,20 @@ export const DATE_PUBLISHED_SELECTORS = [
'#story .datetime',
'.dateline',
'.pubdate',
]
];
// An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse
const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
export const DATE_PUBLISHED_URL_RES = [
// /2012/01/27/ but not /2012/01/293
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
// 20120127 or 20120127T but not 2012012733 or 8201201733
// /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
// 2012-01-27
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
// /2012/jan/27/
new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i')
]
new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'),
];

@ -1,37 +1,36 @@
import { cleanDatePublished } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { extractFromUrl } from 'utils/text';
import {
DATE_PUBLISHED_META_TAGS,
DATE_PUBLISHED_SELECTORS,
DATE_PUBLISHED_URL_RES,
} from './constants'
import { cleanDatePublished } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom'
import { extractFromUrl } from 'utils/text'
} from './constants';
const GenericDatePublishedExtractor = {
extract({ $, url, metaCache }) {
let datePublished
let datePublished;
// First, check to see if we have a matching meta tag
// that we can make use of.
// Don't try cleaning tags from this string
datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false)
if(datePublished) return cleanDatePublished(datePublished)
datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false);
if (datePublished) return cleanDatePublished(datePublished);
// Second, look through our selectors looking for potential
// date_published's.
datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS)
if(datePublished) return cleanDatePublished(datePublished)
datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
if (datePublished) return cleanDatePublished(datePublished);
// Lastly, look to see if a dately string exists in the URL
datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES)
if(datePublished) return cleanDatePublished(datePublished)
datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
if (datePublished) return cleanDatePublished(datePublished);
return null
}
}
return null;
},
};
export default GenericDatePublishedExtractor
export default GenericDatePublishedExtractor;

@ -1,97 +1,95 @@
import assert from 'assert'
import cheerio from 'cheerio'
import moment from 'moment'
import assert from 'assert';
import cheerio from 'cheerio';
import moment from 'moment';
import HTML from './fixtures/html'
import GenericDatePublishedExtractor from './extractor'
import HTML from './fixtures/html';
import GenericDatePublishedExtractor from './extractor';
describe('GenericDatePublishedExtractor', () => {
describe('extract($, metaCache)', () => {
it('extracts datePublished from meta tags', () => {
const $ = cheerio.load(HTML.datePublishedMeta.test)
const metaCache = ["displaydate", "something-else"]
const $ = cheerio.load(HTML.datePublishedMeta.test);
const metaCache = ['displaydate', 'something-else'];
const result =
GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache }
)
);
assert.equal(
assert.equal(
result,
HTML.datePublishedMeta.result.toISOString()
)
})
);
});
it('extracts datePublished from selectors', () => {
const $ = cheerio.load(HTML.datePublishedSelectors.test)
const metaCache = []
const $ = cheerio.load(HTML.datePublishedSelectors.test);
const metaCache = [];
const result =
GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache }
)
);
assert.equal(
assert.equal(
result,
HTML.datePublishedMeta.result.toISOString()
)
})
);
});
it('extracts from url formatted /2012/08/01/etc', () => {
const $ = cheerio.load('<div></div>')
const metaCache = []
const url = 'https://example.com/2012/08/01/this-is-good'
const $ = cheerio.load('<div></div>');
const metaCache = [];
const url = 'https://example.com/2012/08/01/this-is-good';
const result =
GenericDatePublishedExtractor.extract(
{ $, url, metaCache }
)
);
assert.equal(
assert.equal(
result,
new Date('2012/08/01').toISOString()
)
})
);
});
it('extracts from url formatted /2020-01-01', () => {
const $ = cheerio.load('<div></div>')
const metaCache = []
const url = 'https://example.com/2020-01-01/this-is-good'
const $ = cheerio.load('<div></div>');
const metaCache = [];
const url = 'https://example.com/2020-01-01/this-is-good';
const result =
GenericDatePublishedExtractor.extract(
{ $, url, metaCache }
)
);
assert.equal(
assert.equal(
result,
moment(new Date('2020-01-01')).toISOString()
)
})
);
});
it('extracts from url formatted /2020/jan/01', () => {
const $ = cheerio.load('<div></div>')
const metaCache = []
const url = 'https://example.com/2020/jan/01/this-is-good'
const $ = cheerio.load('<div></div>');
const metaCache = [];
const url = 'https://example.com/2020/jan/01/this-is-good';
const result =
GenericDatePublishedExtractor.extract(
{ $, url, metaCache }
)
);
assert.equal(
assert.equal(
result,
new Date('2020/jan/01').toISOString()
)
})
);
});
it('returns null if no date can be found', () => {
const $ = cheerio.load('<div></div>')
const metaCache = []
const $ = cheerio.load('<div></div>');
const metaCache = [];
const result =
GenericDatePublishedExtractor.extract(
{ $, url: '', metaCache }
)
assert.equal(result, null)
})
})
})
);
assert.equal(result, null);
});
});
});

@ -7,7 +7,7 @@ const HTML = {
</head>
</html>
`,
result: new Date('1/1/2020 8:30 (EST)')
result: new Date('1/1/2020 8:30 (EST)'),
},
datePublishedSelectors: {
test: `
@ -19,8 +19,8 @@ const HTML = {
</head>
</div>
`,
result: new Date('1/1/2020 8:30 am (EST)')
result: new Date('1/1/2020 8:30 am (EST)'),
},
}
};
export default HTML
export default HTML;

@ -1,27 +1,28 @@
import {
DEK_META_TAGS,
DEK_SELECTORS,
DEK_URL_RES,
} from './constants'
// import {
// DEK_META_TAGS,
// DEK_SELECTORS,
// DEK_URL_RES,
// } from './constants';
import { cleanDek } from 'cleaners'
// import { cleanDek } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom'
// import {
// extractFromMeta,
// extractFromSelectors,
// } from 'utils/dom';
// Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
const GenericDekExtractor = {
extract({ $, content, metaCache }) {
return null
}
}
// extract({ $, content, metaCache }) {
extract() {
return null;
},
};
export default GenericDekExtractor
export default GenericDekExtractor;
// def extract_dek(self):
// # First, check to see if we have a matching meta tag that we can make

@ -1,20 +1,18 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
// import HTML from './fixtures/html'
import GenericDekExtractor from './extractor'
import GenericDekExtractor from './extractor';
describe('GenericDekExtractor', () => {
describe('extract({ $, metaCache })', () => {
it('returns null if no dek can be found', () => {
const $ = cheerio.load('<div></div>')
const metaCache = []
const $ = cheerio.load('<div></div>');
const metaCache = [];
const result =
GenericDekExtractor.extract({ $, metaCache })
assert.equal(result, null)
})
GenericDekExtractor.extract({ $, metaCache });
})
})
assert.equal(result, null);
});
});
});

@ -1,50 +1,50 @@
import cheerio from 'cheerio'
import cheerio from 'cheerio';
import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
import GenericNextPageUrlExtractor from './next-page-url/extractor'
import GenericContentExtractor from './content/extractor';
import GenericTitleExtractor from './title/extractor';
import GenericAuthorExtractor from './author/extractor';
import GenericDatePublishedExtractor from './date-published/extractor';
import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor';
const GenericExtractor = {
// This extractor is the default for all domains
domain: '*',
title: GenericTitleExtractor.extract,
datePublished : GenericDatePublishedExtractor.extract,
datePublished: GenericDatePublishedExtractor.extract,
author: GenericAuthorExtractor.extract,
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract,
extract: function(options) {
let { html } = options
extract(options) {
const { html } = options;
if (html) {
const $ = cheerio.load(html)
options.$ = $
const $ = cheerio.load(html);
options.$ = $;
}
const title = this.title(options)
const datePublished = this.datePublished(options)
const author = this.author(options)
const content = this.content({ ...options, title })
const leadImageUrl = this.leadImageUrl(options)
const dek = this.dek(options)
const nextPageUrl = this.nextPageUrl(options)
const title = this.title(options);
const datePublished = this.datePublished(options);
const author = this.author(options);
const content = this.content({ ...options, title });
const leadImageUrl = this.leadImageUrl(options);
const dek = this.dek(options);
const nextPageUrl = this.nextPageUrl(options);
return {
title,
author,
datePublished: datePublished ? datePublished : null,
datePublished: datePublished || null,
dek,
leadImageUrl,
content,
nextPageUrl,
}
}
}
};
},
};
export default GenericExtractor
export default GenericExtractor;

@ -1,14 +1,12 @@
import assert from 'assert'
import fs from 'fs'
import assert from 'assert';
import fs from 'fs';
import { clean } from 'test-helpers'
import GenericExtractor from './index'
import GenericExtractor from './index';
describe('GenericExtractor', () => {
describe('extract(opts)', () => {
it("extracts this old LA Times article", () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
it('extracts this old LA Times article', () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');
const {
title,
@ -16,23 +14,23 @@ describe('GenericExtractor', () => {
datePublished,
dek,
} = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] }
)
{ url: 'http://latimes.com', html, metaCache: [] }
);
assert.equal(author, null)
assert.equal(author, null);
assert.equal(
title,
'California appears poised to be first to ban power-guzzling big-screen TVs'
)
);
assert.equal(
datePublished,
'2009-10-14T04:00:00.000Z'
)
assert.equal(dek, null)
})
);
assert.equal(dek, null);
});
it("extracts html and returns the article title", () => {
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
it('extracts html and returns the article title', () => {
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8');
const {
author,
@ -40,18 +38,17 @@ describe('GenericExtractor', () => {
datePublished,
dek,
} = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] }
)
{ url: 'http://wired.com', html, metaCache: [] }
);
assert.equal(author, 'Eric Adams')
assert.equal(author, 'Eric Adams');
assert.equal(
title,
'Airplane Tires Dont Explode on Landing Because They Are Pumped!'
)
assert.equal(datePublished, null)
assert.equal(dek, null)
})
})
})
);
assert.equal(datePublished, null);
assert.equal(dek, null);
});
});
});

@ -2,52 +2,52 @@
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [
'og:image',
'twitter:image',
'image_src',
]
'og:image',
'twitter:image',
'image_src',
];
export const LEAD_IMAGE_URL_SELECTORS = [
'link[rel=image_src]',
]
];
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload',
'wp-content',
'large',
'photo',
'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
'upload',
'wp-content',
'large',
'photo',
'wp-image',
];
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
];
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
export const GIF_RE = /\.gif(\?.*)?$/i
export const JPG_RE = /\.jpe?g(\?.*)?$/i
export const GIF_RE = /\.gif(\?.*)?$/i;
export const JPG_RE = /\.jpe?g(\?.*)?$/i;

@ -1,14 +1,12 @@
import 'babel-polyfill'
import 'babel-polyfill';
import { extractFromMeta } from 'utils/dom';
import { cleanImage } from 'cleaners';
import {
LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS,
} from './constants'
import {
extractFromMeta,
extractFromSelectors
} from 'utils/dom'
} from './constants';
import {
scoreImageUrl,
@ -17,9 +15,7 @@ import {
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
import { cleanImage } from 'cleaners'
} from './score-image';
// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
@ -31,86 +27,87 @@ import { cleanImage } from 'cleaners'
// * weird aspect ratio
const GenericLeadImageUrlExtractor = {
extract({ $, content, metaCache }) {
let imageUrl, cleanUrl
let cleanUrl;
// Check to see if we have a matching meta tag that we can make use of.
// Moving this higher because common practice is now to use large
// images on things like Open Graph or Twitter cards.
// images usually have for things like Open Graph.
imageUrl =
const imageUrl =
extractFromMeta(
$,
LEAD_IMAGE_URL_META_TAGS,
metaCache,
false
)
);
if (imageUrl) {
cleanUrl = cleanImage(imageUrl)
cleanUrl = cleanImage(imageUrl);
if (cleanUrl) return cleanUrl
if (cleanUrl) return cleanUrl;
}
// Next, try to find the "best" image via the content.
// We'd rather not have to fetch each image and check dimensions,
// so try to do some analysis and determine them instead.
const imgs = $('img', content).toArray()
let imgScores = {}
const imgs = $('img', content).toArray();
const imgScores = {};
imgs.forEach((img, index) => {
const $img = $(img)
const src = $img.attr('src')
const $img = $(img);
const src = $img.attr('src');
if (!src) return
if (!src) return;
let score = scoreImageUrl(src)
score = score + scoreAttr($img)
score = score + scoreByParents($img)
score = score + scoreBySibling($img)
score = score + scoreByDimensions($img)
score = score + scoreByPosition(imgs, index)
let score = scoreImageUrl(src);
score += scoreAttr($img);
score += scoreByParents($img);
score += scoreBySibling($img);
score += scoreByDimensions($img);
score += scoreByPosition(imgs, index);
imgScores[src] = score
})
imgScores[src] = score;
});
const [topUrl, topScore] =
Reflect.ownKeys(imgScores).reduce((acc, key) =>
imgScores[key] > acc[1] ? [key, imgScores[key]] : acc
, [null, 0])
, [null, 0]);
if (topScore > 0) {
cleanUrl = cleanImage(topUrl)
cleanUrl = cleanImage(topUrl);
if (cleanUrl) return cleanUrl
if (cleanUrl) return cleanUrl;
}
// If nothing else worked, check to see if there are any really
// probable nodes in the doc, like <link rel="image_src" />.
for (const selector of LEAD_IMAGE_URL_SELECTORS) {
const $node = $(selector).first()
const src = $node.attr('src')
const $node = $(selector).first();
const src = $node.attr('src');
if (src) {
cleanUrl = cleanImage(src)
if (cleanUrl) return cleanUrl
cleanUrl = cleanImage(src);
if (cleanUrl) return cleanUrl;
}
const href = $node.attr('href')
const href = $node.attr('href');
if (href) {
cleanUrl = cleanImage(href)
if (cleanUrl) return cleanUrl
cleanUrl = cleanImage(href);
if (cleanUrl) return cleanUrl;
}
const value = $node.attr('value')
const value = $node.attr('value');
if (value) {
cleanUrl = cleanImage(value)
if (cleanUrl) return cleanUrl
cleanUrl = cleanImage(value);
if (cleanUrl) return cleanUrl;
}
}
return null;
},
}
};
export default GenericLeadImageUrlExtractor
export default GenericLeadImageUrlExtractor;
// def extract(self):
// """
@ -182,7 +179,7 @@ export default GenericLeadImageUrlExtractor
// if sibling is not None:
// if sibling.tag == 'figcaption':
// img_score += 25
//
//
// sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower()
// if 'caption' in sib_sig:
@ -215,7 +212,7 @@ export default GenericLeadImageUrlExtractor
//
// if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height
//
//
// if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100

@ -1,62 +1,62 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import HTML from './fixtures/html'
import HTML from './fixtures/html';
import GenericLeadImageUrlExtractor from './extractor'
import GenericLeadImageUrlExtractor from './extractor';
describe('GenericLeadImageUrlExtractor', () => {
describe('extract({ $, content, metaCache })', () => {
it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test)
const content = $('*').first()
const metaCache = ['og:image']
const $ = cheerio.load(HTML.og.test);
const content = $('*').first();
const metaCache = ['og:image'];
const result =
GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache }
)
);
assert.equal(result, HTML.og.result)
})
assert.equal(result, HTML.og.result);
});
it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test)
const content = $('*').first()
const metaCache = ['twitter:image']
const $ = cheerio.load(HTML.twitter.test);
const content = $('*').first();
const metaCache = ['twitter:image'];
const result =
GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache }
)
);
assert.equal(result, HTML.twitter.result)
})
assert.equal(result, HTML.twitter.result);
});
it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test)
const content = $('*').first()
const metaCache = []
const $ = cheerio.load(HTML.scoring.test);
const content = $('*').first();
const metaCache = [];
const result =
GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache }
)
);
assert.equal(result, HTML.scoring.result)
})
assert.equal(result, HTML.scoring.result);
});
it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test)
const content = $('*').first()
const metaCache = []
const $ = cheerio.load(HTML.selectors.test);
const content = $('*').first();
const metaCache = [];
const result =
GenericLeadImageUrlExtractor.extract(
{ $, content, metaCache }
)
);
assert.equal(result, HTML.selectors.result)
})
})
})
assert.equal(result, HTML.selectors.result);
});
});
});

@ -7,7 +7,7 @@ const HTML = {
</head>
</html>
`,
result: `http://example.com/lead.jpg`
result: 'http://example.com/lead.jpg',
},
twitter: {
test: `
@ -17,7 +17,7 @@ const HTML = {
</head>
</html>
`,
result: `http://example.com/lead.jpg`
result: 'http://example.com/lead.jpg',
},
scoring: {
test: `
@ -27,7 +27,7 @@ const HTML = {
<img src="http://example.com/upload/whateverpic.png" />
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
result: 'http://example.com/upload/goodpic.jpg',
},
selectors: {
test: `
@ -35,8 +35,8 @@ const HTML = {
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
result: 'http://example.com/upload/goodpic.jpg',
},
}
};
export default HTML
export default HTML;

@ -3,123 +3,123 @@ import {
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
GIF_RE,
JPG_RE,
} from './constants'
} from './constants';
import { PHOTO_HINTS_RE } from '../content/scoring/constants'
import { PHOTO_HINTS_RE } from '../content/scoring/constants';
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`;
}
// Scores image urls based on a variety of heuristics.
export function scoreImageUrl(url) {
url = url.trim()
let score = 0
url = url.trim();
let score = 0;
if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score + 20
score += 20;
}
if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
score = score - 20
score -= 20;
}
// TODO: We might want to consider removing this as
// gifs are much more common/popular than they once were
if (GIF_RE.test(url)) {
score = score - 10
score -= 10;
}
if (JPG_RE.test(url)) {
score = score + 10
score += 10;
}
// PNGs are neutral.
return score
return score;
}
// Alt attribute usually means non-presentational image.
export function scoreAttr($img) {
if ($img.attr('alt')) {
return 5
} else {
return 0
return 5;
}
return 0;
}
// Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
export function scoreByParents($img) {
let score = 0
const $figParent = $img.parents('figure').first()
let score = 0;
const $figParent = $img.parents('figure').first();
if ($figParent.length === 1) {
score = score + 25
score += 25;
}
const $parent = $img.parent()
let $gParent
const $parent = $img.parent();
let $gParent;
if ($parent.length === 1) {
$gParent = $parent.parent()
$gParent = $parent.parent();
}
[$parent, $gParent].forEach($node => {
[$parent, $gParent].forEach(($node) => {
if (PHOTO_HINTS_RE.test(getSig($node))) {
score = score + 15
score += 15;
}
})
});
return score
return score;
}
// Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
export function scoreBySibling($img) {
let score = 0
const $sibling = $img.next()
const sibling = $sibling.get(0)
let score = 0;
const $sibling = $img.next();
const sibling = $sibling.get(0);
if (sibling && sibling.tagName === 'figcaption') {
score = score + 25
score += 25;
}
if (PHOTO_HINTS_RE.test(getSig($sibling))) {
score = score + 15
score += 15;
}
return score
return score;
}
export function scoreByDimensions($img) {
let score = 0
let score = 0;
const width = parseFloat($img.attr('width'))
const height = parseFloat($img.attr('height'))
const src = $img.attr('src')
const width = parseFloat($img.attr('width'));
const height = parseFloat($img.attr('height'));
const src = $img.attr('src');
// Penalty for skinny images
if (width && width <= 50) {
score = score - 50
score -= 50;
}
// Penalty for short images
if (height && height <= 50) {
score = score - 50
score -= 50;
}
if (width && height && !src.includes('sprite')) {
const area = width * height
const area = width * height;
if (area < 5000) { // Smaller than 50 x 100
score = score - 100
score -= 100;
} else {
score = score + Math.round(area/1000)
score += Math.round(area / 1000);
}
}
return score
return score;
}
export function scoreByPosition($imgs, index) {
return $imgs.length/2 - index
}
function getSig($node) {
return `${$node.attr('class') || ''} ${$node.attr('id') || ''}`
return ($imgs.length / 2) - index;
}

@ -1,5 +1,5 @@
import assert from 'assert'
import cheerio from 'cheerio'
import assert from 'assert';
import cheerio from 'cheerio';
import {
scoreImageUrl,
@ -8,61 +8,61 @@ import {
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
} from './score-image';
describe('scoreImageUrlUrl(url)', () => {
it('gets 20 points for a positive lead img hint', () => {
const url = 'http://example.com/upload/img.png'
const url = 'http://example.com/upload/img.png';
assert.equal(scoreImageUrl(url), 20)
})
assert.equal(scoreImageUrl(url), 20);
});
it('loses 20 points for a negative lead img hint', () => {
const url = 'http://example.com/sprite/foo/bar.png'
const url = 'http://example.com/sprite/foo/bar.png';
assert.equal(scoreImageUrl(url), -20)
})
assert.equal(scoreImageUrl(url), -20);
});
it('loses 10 points for a gif', () => {
const url = 'http://example.com/foo/bar.gif'
const url = 'http://example.com/foo/bar.gif';
assert.equal(scoreImageUrl(url), -10)
assert.equal(scoreImageUrl(url), -10);
const url2 = 'http://example.com/foogif/bar'
const url2 = 'http://example.com/foogif/bar';
assert.equal(scoreImageUrl(url2), 0)
})
assert.equal(scoreImageUrl(url2), 0);
});
it('gains 10 points for a jpg', () => {
const url = 'http://example.com/foo/bar.jpg'
assert.equal(scoreImageUrl(url), 10)
const url = 'http://example.com/foo/bar.jpg';
assert.equal(scoreImageUrl(url), 10);
const url2 = 'http://example.com/foo/bar.jpeg'
assert.equal(scoreImageUrl(url2), 10)
const url2 = 'http://example.com/foo/bar.jpeg';
assert.equal(scoreImageUrl(url2), 10);
const url3 = 'http://example.com/foojpg/bar'
assert.equal(scoreImageUrl(url3), 0)
const url3 = 'http://example.com/foojpg/bar';
assert.equal(scoreImageUrl(url3), 0);
const url4 = 'http://example.com/foo.jpg?bar=baz'
assert.equal(scoreImageUrl(url4), 10)
})
})
const url4 = 'http://example.com/foo.jpg?bar=baz';
assert.equal(scoreImageUrl(url4), 10);
});
});
describe('scoreAttr($img)', () => {
it('gets 5 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img alt="Wow" /></div>')
const $img = $('img').first()
const $ = cheerio.load('<div><img alt="Wow" /></div>');
const $img = $('img').first();
assert.equal(scoreAttr($img), 5)
})
assert.equal(scoreAttr($img), 5);
});
it('gets 0 points if the img node has an alt attribute', () => {
const $ = cheerio.load('<div><img /></div>')
const $img = $('img').first()
const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first();
assert.equal(scoreAttr($img), 0)
})
})
assert.equal(scoreAttr($img), 0);
});
});
describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => {
@ -74,18 +74,18 @@ describe('scoreByParents($img)', () => {
</div>
</figure>
</div>`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByParents($img), 25)
})
assert.equal(scoreByParents($img), 25);
});
it('gets 0 points if the img has no figure parent', () => {
const $ = cheerio.load('<div><img /></div>')
const $img = $('img').first()
const $ = cheerio.load('<div><img /></div>');
const $img = $('img').first();
assert.equal(scoreByParents($img), 0)
})
assert.equal(scoreByParents($img), 0);
});
it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load(
@ -96,12 +96,12 @@ describe('scoreByParents($img)', () => {
</div>
</div>
</div>`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByParents($img), 15)
})
})
assert.equal(scoreByParents($img), 15);
});
});
describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => {
@ -112,11 +112,11 @@ describe('scoreBySibling($img)', () => {
<figcaption>Wow</figcaption>
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreBySibling($img), 25)
})
assert.equal(scoreBySibling($img), 25);
});
it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load(
@ -128,12 +128,12 @@ describe('scoreBySibling($img)', () => {
</div>
</div>
</div>`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreBySibling($img), 15)
})
})
assert.equal(scoreBySibling($img), 15);
});
});
describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => {
@ -143,11 +143,11 @@ describe('scoreByDimensions($img)', () => {
<img width="10" />
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50)
})
assert.equal(scoreByDimensions($img), -50);
});
it('penalizes short images', () => {
const $ = cheerio.load(
@ -156,11 +156,11 @@ describe('scoreByDimensions($img)', () => {
<img height="10" />
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -50)
})
assert.equal(scoreByDimensions($img), -50);
});
it('ignores sprites', () => {
const $ = cheerio.load(
@ -169,11 +169,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), 0)
})
assert.equal(scoreByDimensions($img), 0);
});
it('penalizes images with small areas', () => {
const $ = cheerio.load(
@ -182,11 +182,11 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="60" height="60" />
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), -100)
})
assert.equal(scoreByDimensions($img), -100);
});
it('prefers the largest images', () => {
const $ = cheerio.load(
@ -195,13 +195,12 @@ describe('scoreByDimensions($img)', () => {
<img src="/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
);
const $img = $('img').first();
assert.equal(scoreByDimensions($img), 1000)
})
})
assert.equal(scoreByDimensions($img), 1000);
});
});
describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => {
@ -216,10 +215,10 @@ describe('scoreByPosition($imgs, index)', () => {
<img width="10" />
</div>
`
)
const $imgs = $('img')
);
const $imgs = $('img');
assert.equal(scoreByPosition($imgs, 0), 3)
})
})
assert.equal(scoreByPosition($imgs, 0), 3);
});
});

@ -1,25 +1,22 @@
import 'babel-polyfill'
import URL from 'url'
import 'babel-polyfill';
import URL from 'url';
import {
pageNumFromUrl,
articleBaseUrl,
removeAnchor,
} from 'utils/text'
import scoreLinks from './scoring/score-links'
} from 'utils/text';
import scoreLinks from './scoring/score-links';
// Looks for and returns next page url
// for multi-page articles
const GenericNextPageUrlExtractor = {
extract({ $, url, parsedUrl, previousUrls=[] }) {
parsedUrl = parsedUrl || URL.parse(url)
extract({ $, url, parsedUrl, previousUrls = [] }) {
parsedUrl = parsedUrl || URL.parse(url);
const currentPageNum = pageNumFromUrl(url)
const articleUrl = removeAnchor(url)
const baseUrl = articleBaseUrl(url, parsedUrl)
const { host } = parsedUrl
const articleUrl = removeAnchor(url);
const baseUrl = articleBaseUrl(url, parsedUrl);
const links = $('a[href]').toArray()
const links = $('a[href]').toArray();
const scoredLinks = scoreLinks({
links,
@ -27,28 +24,28 @@ const GenericNextPageUrlExtractor = {
baseUrl,
parsedUrl,
$,
previousUrls
})
previousUrls,
});
// If no links were scored, return null
if (!scoredLinks) return null
if (!scoredLinks) return null;
// now that we've scored all possible pages,
// find the biggest one.
const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
const scoredLink = scoredLinks[link]
return scoredLink.score > acc.score ? scoredLink : acc
}, { score: -100 })
const scoredLink = scoredLinks[link];
return scoredLink.score > acc.score ? scoredLink : acc;
}, { score: -100 });
// If the score is less than 50, we're not confident enough to use it,
// so we fail.
if (topPage.score >= 50) {
return topPage.href
} else {
return null
return topPage.href;
}
}
}
return null;
},
};
export default GenericNextPageUrlExtractor
export default GenericNextPageUrlExtractor;

@ -1,34 +1,34 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import assert from 'assert';
import fs from 'fs';
import cheerio from 'cheerio';
import GenericNextPageUrlExtractor from './extractor'
import GenericNextPageUrlExtractor from './extractor';
describe('GenericNextPageUrlExtractor', () => {
it('returns most likely next page url', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
const $ = cheerio.load(html)
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'
const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html);
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2';
const nextPage = GenericNextPageUrlExtractor.extract({
$,
url
})
url,
});
assert.equal(nextPage, next)
})
assert.equal(nextPage, next);
});
it('returns null if there is no likely next page', () => {
const html = `<div><p>HI</p></div>`
const $ = cheerio.load(html)
const url = 'http://example.com/foo/bar'
const html = '<div><p>HI</p></div>';
const $ = cheerio.load(html);
const url = 'http://example.com/foo/bar';
const nextPage = GenericNextPageUrlExtractor.extract({
$,
url
})
url,
});
assert.equal(nextPage, null)
})
})
assert.equal(nextPage, null);
});
});

@ -1,38 +1,38 @@
export const DIGIT_RE = /\d/
export const DIGIT_RE = /\d/;
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated',
];
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i');
// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');
// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');
// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');

@ -1,27 +1,32 @@
import 'babel-polyfill'
import URL from 'url'
import difflib from 'difflib'
import 'babel-polyfill';
import URL from 'url';
import { range } from 'utils'
import { isWordpress } from 'utils/dom'
import { isWordpress } from 'utils/dom';
import {
removeAnchor,
pageNumFromUrl,
} from 'utils/text'
import {
DIGIT_RE,
NEXT_LINK_TEXT_RE,
PREV_LINK_TEXT_RE,
EXTRANEOUS_LINK_HINTS_RE,
CAP_LINK_TEXT_RE,
PAGE_RE,
} from './constants'
} from 'utils/text';
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
} from 'utils/dom/constants'
import { IS_DIGIT_RE } from 'utils/text/constants'
scoreSimilarity,
scoreLinkText,
scorePageInLink,
scoreExtraneousLinks,
scoreByParents,
scorePrevLink,
shouldScore,
scoreBaseUrl,
scoreCapLinks,
scoreNextLinkText,
} from './utils';
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i');
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`;
}
export default function scoreLinks({
links,
@ -29,11 +34,11 @@ export default function scoreLinks({
baseUrl,
parsedUrl,
$,
previousUrls=[]
previousUrls = [],
}) {
parsedUrl = parsedUrl || URL.parse(articleUrl)
const baseRegex = makeBaseRegex(baseUrl)
const isWp = isWordpress($)
parsedUrl = parsedUrl || URL.parse(articleUrl);
const baseRegex = makeBaseRegex(baseUrl);
const isWp = isWordpress($);
// Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or
@ -46,12 +51,12 @@ export default function scoreLinks({
// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
let href = removeAnchor(link.attribs.href)
const $link = $(link)
const linkText = $link.text()
const href = removeAnchor(link.attribs.href);
const $link = $(link);
const linkText = $link.text();
if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
return possiblePages
return possiblePages;
}
// ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
@ -60,242 +65,29 @@ export default function scoreLinks({
score: 0,
linkText,
href,
}
};
} else {
possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`
}
const possiblePage = possiblePages[href]
const linkData = makeSig($link, linkText)
const pageNum = pageNumFromUrl(href)
let score = scoreBaseUrl(href, baseRegex)
score = score + scoreNextLinkText(linkData)
score = score + scoreCapLinks(linkData)
score = score + scorePrevLink(linkData)
score = score + scoreByParents($link)
score = score + scoreExtraneousLinks(href)
score = score + scorePageInLink(pageNum, isWp)
score = score + scoreLinkText(linkText, pageNum)
score = score + scoreSimilarity(score, articleUrl, href)
possiblePage.score = score
return possiblePages
}, {})
return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages
}
export function makeBaseRegex(baseUrl) {
return new RegExp(`^${baseUrl}`, 'i')
}
export function scoreSimilarity(score, articleUrl, href) {
// Do this last and only if we have a real candidate, because it's
// potentially expensive computationally. Compare the link to this
// URL using difflib to get the % similarity of these URLs. On a
// sliding scale, subtract points from this link based on
// similarity.
if (score > 0) {
const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio()
// Subtract .1 from diff_percent when calculating modifier,
// which means that if it's less than 10% different, we give a
// bonus instead. Ex:
// 3% different = +17.5 points
// 10% different = 0 points
// 20% different = -25 points
const diffPercent = 1.0 - similarity
const diffModifier = -(250 * (diffPercent - 0.2))
return score + diffModifier
}
return 0
}
export function scoreLinkText(linkText, pageNum) {
// If the link text can be parsed as a number, give it a minor
// bonus, with a slight bias towards lower numbered pages. This is
// so that pages that might not have 'next' in their text can still
// get scored, and sorted properly by score.
let score = 0
if (IS_DIGIT_RE.test(linkText.trim())) {
const linkTextAsNum = parseInt(linkText)
// If it's the first page, we already got it on the first call.
// Give it a negative score. Otherwise, up to page 10, give a
// small bonus.
if (linkTextAsNum < 2) {
score = -30
} else {
score = Math.max(0, 10 - linkTextAsNum)
}
// If it appears that the current page number is greater than
// this links page number, it's a very bad sign. Give it a big
// penalty.
if (pageNum && pageNum >= linkTextAsNum) {
score = score - 50
}
}
return score
}
export function scorePageInLink(pageNum, isWp) {
// page in the link = bonus. Intentionally ignore wordpress because
// their ?p=123 link style gets caught by this even though it means
// separate documents entirely.
if (pageNum && !isWp) {
return 50
}
return 0
}
export function scoreExtraneousLinks(href) {
// If the URL itself contains extraneous values, give a penalty.
if (EXTRANEOUS_LINK_HINTS_RE.test(href)) {
return -25
}
return 0
}
export function scoreByParents($link) {
// If a parent node contains paging-like classname or id, give a
// bonus. Additionally, if a parent_node contains bad content
// (like 'sponsor'), give a penalty.
let $parent = $link.parent()
let positiveMatch = false
let negativeMatch = false
let score = 0
Array.from(range(0, 4)).forEach((_) => {
if ($parent.length === 0) {
return
possiblePages[href].linkText = `${possiblePages[href].linkText}|${linkText}`;
}
const parentData = makeSig($parent, ' ')
// If we have 'page' or 'paging' in our data, that's a good
// sign. Add a bonus.
if (!positiveMatch && PAGE_RE.test(parentData)) {
positiveMatch = true
score = score + 25
}
const possiblePage = possiblePages[href];
const linkData = makeSig($link, linkText);
const pageNum = pageNumFromUrl(href);
// If we have 'comment' or something in our data, and
// we don't have something like 'content' as well, that's
// a bad sign. Give a penalty.
if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
&& EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
if (!POSITIVE_SCORE_RE.test(parentData)) {
negativeMatch = true
score = score - 25
}
}
$parent = $parent.parent()
})
return score
}
let score = scoreBaseUrl(href, baseRegex);
score += scoreNextLinkText(linkData);
score += scoreCapLinks(linkData);
score += scorePrevLink(linkData);
score += scoreByParents($link);
score += scoreExtraneousLinks(href);
score += scorePageInLink(pageNum, isWp);
score += scoreLinkText(linkText, pageNum);
score += scoreSimilarity(score, articleUrl, href);
export function scorePrevLink(linkData) {
// If the link has something like "previous", its definitely
// an old link, skip it.
if (PREV_LINK_TEXT_RE.test(linkData)) {
return -200
}
possiblePage.score = score;
return 0
}
export function scoreCapLinks(linkData) {
// Cap links are links like "last", etc.
if (CAP_LINK_TEXT_RE.test(linkData)) {
// If we found a link like "last", but we've already seen that
// this link is also "next", it's fine. If it's not been
// previously marked as "next", then it's probably bad.
// Penalize.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return -65
}
}
return 0
}
export function scoreNextLinkText(linkData) {
// Things like "next", ">>", etc.
if (NEXT_LINK_TEXT_RE.test(linkData)) {
return 50
}
return 0
}
return possiblePages;
}, {});
export function scoreBaseUrl(href, baseRegex) {
// If the baseUrl isn't part of this URL, penalize this
// link. It could still be the link, but the odds are lower.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (!baseRegex.test(href)) {
return -25
}
return 0
}
export function shouldScore(
href,
articleUrl,
baseUrl,
parsedUrl,
linkText,
previousUrls
) {
// skip if we've already fetched this url
if(previousUrls.find((url) => href === url) !== undefined) {
return false
}
// If we've already parsed this URL, or the URL matches the base
// URL, or is empty, skip it.
if (!href || href === articleUrl || href === baseUrl) {
return false
}
const { hostname } = parsedUrl
const { hostname: linkHost } = URL.parse(href)
// Domain mismatch.
if (linkHost !== hostname) {
return false
}
// If href doesn't contain a digit after removing the base URL,
// it's certainly not the next page.
const fragment = href.replace(baseUrl, '')
if (!DIGIT_RE.test(fragment)) {
return false
}
// This link has extraneous content (like "comment") in its link
// text, so we skip it.
if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
return false
}
// Next page link text is never long, skip if it is too long.
if (linkText.length > 25) {
return false
}
return true
}
function makeSig($link, linkText) {
return `${linkText || $link.text()} ${$link.attr('class') || ''} ${$link.attr('id') || ''}`
return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages;
}

@ -1,239 +1,42 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import URL from 'url'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';
import scoreLinks from './score-links'
import {
makeBaseRegex,
scoreBaseUrl,
scoreNextLinkText,
scoreCapLinks,
scorePrevLink,
scoreByParents,
scoreExtraneousLinks,
scorePageInLink,
scoreLinkText,
scoreSimilarity,
shouldScore,
} from './score-links'
import scoreLinks from './score-links';
describe('scoreLinks(links)', () => {
it('returns an object of scored links', () => {
const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
const $ = cheerio.load(html)
const links = $('a[href]').toArray()
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
const $ = cheerio.load(html);
const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({
links,
articleUrl: url,
baseUrl: 'http://arstechnica.com',
$,
})
});
assert.equal(typeof scoredPages, 'object')
})
assert.equal(typeof scoredPages, 'object');
});
it('returns null if no possible pages', () => {
const html = `<div><p>Hello wow</p></div>`
const html = '<div><p>Hello wow</p></div>';
const $ = cheerio.load(html)
const links = $('a[href]').toArray()
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
const $ = cheerio.load(html);
const links = $('a[href]').toArray();
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const scoredPages = scoreLinks({
links,
articleUrl: url,
baseUrl: 'http://arstechnica.com',
$,
})
});
assert.equal(scoredPages, null)
})
})
assert.equal(scoredPages, null);
});
});
describe('scoreBaseUrl(href, baseRegex)', () => {
it('returns -25 if url does not contain the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://foo.com/foo/bar'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), -25)
})
it('returns 0 if url contains the base url', () => {
const baseUrl = 'http://example.com/foo/bar'
const badUrl = 'http://example.com/foo/bar/bat'
const baseRegex = makeBaseRegex(baseUrl)
assert.equal(scoreBaseUrl(badUrl, baseRegex), 0)
})
})
describe('scoreNextLinkText(linkData)', () => {
it('returns 50 if contains common next link text', () => {
const linkData = "foo bar Next page"
assert.equal(scoreNextLinkText(linkData), 50)
})
it('returns 0 if does not contain common next link text', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreNextLinkText(linkData), 0)
})
})
describe('scoreCapLinks(linkData)', () => {
it('returns -65 if cap link with next link text', () => {
const linkData = "foo next Last page"
assert.equal(scoreCapLinks(linkData), -65)
})
it('returns 0 if does not match a cap link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scorePrevLink(linkData)', () => {
it('returns -200 if link matches previous text', () => {
const linkData = "foo next previous page"
assert.equal(scorePrevLink(linkData), -200)
})
it('returns 0 if does not match a prev link', () => {
const linkData = "foo bar WOW GREAT"
assert.equal(scoreCapLinks(linkData), 0)
})
})
describe('scoreByParents($link)', () => {
it('returns 25 if parent sig looks like a page', () => {
const html = `
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), 25)
})
it('returns -25 if parent sig looks like a comment', () => {
const html = `
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`
const $ = cheerio.load(html)
const $link = $('a').first()
assert.equal(scoreByParents($link), -25)
})
})
describe('scoreExtraneousLinks(href)', () => {
it('returns -25 if link matches extraneous text', () => {
const url = "http://example.com/email-link"
assert.equal(scoreExtraneousLinks(url), -25)
})
it('returns 0 if does not match extraneous text', () => {
const url = "http://example.com/asdf"
assert.equal(scoreExtraneousLinks(url), 0)
})
})
describe('scorePageInLink(pageNum, isWp)', () => {
it('returns 50 if link contains a page num', () => {
assert.equal(scorePageInLink(1, false), 50)
})
it('returns 0 if link contains no page num', () => {
assert.equal(scorePageInLink(null, false), 0)
})
it('returns 0 if page is wordpress', () => {
assert.equal(scorePageInLink(10, true), 0)
})
})
describe('scoreLinkText(linkText)', () => {
it('returns 8 if link contains the num 2', () => {
assert.equal(scoreLinkText('2', 0), 8)
})
it('returns 5 if link contains the num 5', () => {
assert.equal(scoreLinkText('5', 0), 5)
})
it('returns -30 if link contains the number 1', () => {
assert.equal(scoreLinkText('1', 0), -30)
})
it('penalizes -50 if pageNum is >= link text as num', () => {
assert.equal(scoreLinkText('4', 5), -44)
})
})
describe('scoreSimilarity(score, articleUrl, href)', () => {
it('returns a similarity bonus based on current score', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 25
assert.equal(
Math.round(scoreSimilarity(score, articleUrl, href)),
66
)
})
it('returns 0 is current score <= 0', () => {
const articleUrl = 'http://example.com/foo/bar'
const href = 'http://example.com/foo/bar/2'
const score = 0
assert.equal(scoreSimilarity(score, articleUrl, href), 0)
})
})
describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
it('returns false if href has already been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar/2' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
false
)
})
it('returns true if href has not been fetched', () => {
const previousUrls = [ 'http://example.com/foo/bar' ]
const href = 'http://example.com/foo/bar/2'
const parsedUrl = URL.parse(href)
assert.equal(
shouldScore(href, '', '', parsedUrl, '', previousUrls),
true
)
})
})

@ -0,0 +1,10 @@
// Barrel module: re-exports every next-page scoring heuristic from its
// own file so callers can import them all from this directory's index.
export { default as scoreSimilarity } from './score-similarity';
export { default as scoreLinkText } from './score-link-text';
export { default as scorePageInLink } from './score-page-in-link';
export { default as scoreExtraneousLinks } from './score-extraneous-links';
export { default as scoreByParents } from './score-by-parents';
export { default as scorePrevLink } from './score-prev-link';
export { default as shouldScore } from './should-score';
export { default as scoreBaseUrl } from './score-base-url';
export { default as scoreNextLinkText } from './score-next-link-text';
export { default as scoreCapLinks } from './score-cap-links';

@ -0,0 +1,11 @@
/**
 * Score a candidate link by whether it lives under the article's base
 * URL.
 *
 * A link that doesn't share the base could still be the next page, but
 * the odds are lower, so it takes a modest penalty. Example:
 * http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
 *
 * @param {string} href candidate link URL
 * @param {RegExp} baseRegex anchored, case-insensitive base-URL matcher
 * @returns {number} 0 when href matches the base, -25 otherwise
 */
export default function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}

@ -0,0 +1,23 @@
import assert from 'assert';
import scoreBaseUrl from './score-base-url';
import { makeBaseRegex } from '../score-links';

// Specs for scoreBaseUrl: off-base links are penalized -25; links that
// share the article's base URL are left untouched (0).
describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    const badUrl = 'http://foo.com/foo/bar';
    const baseRegex = makeBaseRegex(baseUrl);
    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25);
  });

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    // Renamed from `badUrl`: this URL *does* contain the base and is
    // expected to pass unpenalized.
    const goodUrl = 'http://example.com/foo/bar/bat';
    const baseRegex = makeBaseRegex(baseUrl);
    assert.equal(scoreBaseUrl(goodUrl, baseRegex), 0);
  });
});

@ -0,0 +1,52 @@
import { range } from 'utils';
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
PAGE_RE,
} from 'utils/dom/constants';
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
// Build the signature string scored against the paging regexes: the
// element's class and id attributes joined by a space.
function makeSig($link) {
  return `${$link.attr('class') || ''} ${$link.attr('id') || ''}`;
}

/**
 * Score a candidate next-page link by inspecting up to four of its
 * ancestor elements.
 *
 * A parent whose class/id looks paging-related adds a +25 bonus; a
 * parent that matches the negative/extraneous patterns without also
 * looking content-like adds a -25 penalty. Each adjustment is applied
 * at most once, regardless of how many parents match.
 *
 * @param {Object} $link cheerio-wrapped anchor element
 * @returns {number} cumulative parent-based score adjustment
 */
export default function scoreByParents($link) {
  let $parent = $link.parent();
  let positiveMatch = false;
  let negativeMatch = false;
  let score = 0;

  // Walk at most four levels up the DOM; once $parent is empty every
  // remaining iteration is a no-op.
  Array.from(range(0, 4)).forEach(() => {
    if ($parent.length === 0) {
      return;
    }

    // Fix: the old makeSig took a (linkText) second argument; the
    // vestigial ' ' previously passed here was silently ignored and has
    // been dropped.
    const parentData = makeSig($parent);

    // 'page'/'paging'/'pagination' in the class/id is a good sign.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }

    // 'comment' or similar in the data — without anything content-like
    // alongside it — is a bad sign.
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
      && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true;
        score -= 25;
      }
    }

    $parent = $parent.parent();
  });

  return score;
}

@ -0,0 +1,35 @@
import assert from 'assert';
import cheerio from 'cheerio';
import scoreByParents from './score-by-parents';
// Specs for scoreByParents: a paging-like parent class earns +25, a
// comment-like parent earns -25.
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
      <div>
        <div class="next-page">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();
    assert.equal(scoreByParents($link), 25);
  });
  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
      <div>
        <div class="comment">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();
    assert.equal(scoreByParents($link), -25);
  });
});

@ -0,0 +1,19 @@
import {
NEXT_LINK_TEXT_RE,
CAP_LINK_TEXT_RE,
} from '../constants';
/**
 * Score "cap" links — link text like "last", "end", "first".
 *
 * NOTE(review): this penalizes a cap link only when it ALSO matches the
 * next-link pattern, while the original inline comment suggested the
 * opposite ("if it's also 'next', it's fine"). The unit test pins the
 * current behavior (-65 for "next ... Last page"), so it is preserved
 * here verbatim — confirm which was intended.
 *
 * @param {string} linkData combined link text/class/id signature
 * @returns {number} -65 when both patterns match, otherwise 0
 */
export default function scoreCapLinks(linkData) {
  const looksLikeCap = CAP_LINK_TEXT_RE.test(linkData);
  const alsoLooksLikeNext = NEXT_LINK_TEXT_RE.test(linkData);
  return looksLikeCap && alsoLooksLikeNext ? -65 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreCapLinks from './score-cap-links';
// Specs for scoreCapLinks: a "last"-style cap link that also matches
// the next-link pattern is penalized -65; otherwise no adjustment.
describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    const linkData = 'foo next Last page';
    assert.equal(scoreCapLinks(linkData), -65);
  });
  it('returns 0 if does not match a cap link', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scoreCapLinks(linkData), 0);
  });
});

@ -0,0 +1,10 @@
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
/**
 * Penalize URLs that contain extraneous hints (print, comment, email,
 * share, ...) — such links are rarely pagination.
 *
 * @param {string} href candidate link URL
 * @returns {number} -25 when an extraneous hint is present, else 0
 */
export default function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreExtraneousLinks from './score-extraneous-links';
// Specs for scoreExtraneousLinks: URLs containing extraneous hints
// (here "email") take a -25 penalty; clean URLs are untouched.
describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = 'http://example.com/email-link';
    assert.equal(scoreExtraneousLinks(url), -25);
  });
  it('returns 0 if does not match extraneous text', () => {
    const url = 'http://example.com/asdf';
    assert.equal(scoreExtraneousLinks(url), 0);
  });
});

@ -0,0 +1,30 @@
import { IS_DIGIT_RE } from 'utils/text/constants';
/**
 * Score a link by its text when that text parses as a page number.
 *
 * Numeric link text gets a small bonus biased toward lower page
 * numbers, so next-page links without the word "next" can still be
 * ranked. Page 1 is penalized (we already have it), and a link whose
 * number is not greater than the current page number is penalized
 * heavily.
 *
 * @param {string} linkText the anchor's text content
 * @param {number|null} pageNum page number parsed from the href, if any
 * @returns {number} score adjustment for this link text
 */
export default function scoreLinkText(linkText, pageNum) {
  // Non-numeric link text contributes nothing here.
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkTextAsNum = parseInt(linkText, 10);

  // Page 1 was fetched on the first call — give it a negative score.
  // Pages 2..10 earn a small bonus that shrinks as the number grows.
  let score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);

  // Linking at or behind the current page is a very bad sign.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }

  return score;
}

@ -0,0 +1,22 @@
import assert from 'assert';
import scoreLinkText from './score-link-text';
// Specs for scoreLinkText: numeric text earns max(0, 10 - n) for n >= 2,
// -30 for page 1, and an extra -50 when pageNum >= the linked number.
describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8);
  });
  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5);
  });
  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30);
  });
  it('penalizes -50 if pageNum is >= link text as num', () => {
    assert.equal(scoreLinkText('4', 5), -44);
  });
});

@ -0,0 +1,10 @@
import { NEXT_LINK_TEXT_RE } from '../constants';
/**
 * Reward link signatures containing next-page hints — "next", ">>",
 * "»", etc. — the strongest positive pagination signal.
 *
 * @param {string} linkData combined link text/class/id signature
 * @returns {number} 50 on a match, otherwise 0
 */
export default function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scoreNextLinkText from './score-next-link-text';
// Specs for scoreNextLinkText: "Next"-style text earns +50; anything
// else earns nothing.
describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = 'foo bar Next page';
    assert.equal(scoreNextLinkText(linkData), 50);
  });
  it('returns 0 if does not contain common next link text', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scoreNextLinkText(linkData), 0);
  });
});

@ -0,0 +1,10 @@
/**
 * Reward links whose URL carries a page number.
 *
 * WordPress sites are deliberately excluded: their `?p=123` link style
 * trips this check even though each p-value is a separate document,
 * not a page of the same article.
 *
 * @param {number|null} pageNum page number parsed from the href, if any
 * @param {boolean} isWp whether the document looks like WordPress
 * @returns {number} 50 for a non-WordPress page number, otherwise 0
 */
export default function scorePageInLink(pageNum, isWp) {
  if (isWp || !pageNum) {
    return 0;
  }
  return 50;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePageInLink from './score-page-in-link';
// Specs for scorePageInLink: +50 for a page number in the link, 0 when
// absent, and 0 on WordPress (its ?p=123 URLs are separate documents).
describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50);
  });
  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0);
  });
  it('returns 0 if page is wordpress', () => {
    assert.equal(scorePageInLink(10, true), 0);
  });
});

@ -0,0 +1,11 @@
import { PREV_LINK_TEXT_RE } from '../constants';
/**
 * Rule out links that look like they point backwards — "prev",
 * "earlier", "older", "<", "«" — with a heavy penalty.
 *
 * @param {string} linkData combined link text/class/id signature
 * @returns {number} -200 when previous-page text matches, otherwise 0
 */
export default function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0;
}

@ -0,0 +1,18 @@
import assert from 'assert';
import scorePrevLink from './score-prev-link';
// Specs for scorePrevLink: previous-page text earns a disqualifying
// -200; unrelated text is untouched.
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = 'foo next previous page';
    assert.equal(scorePrevLink(linkData), -200);
  });
  it('returns 0 if does not match a prev link', () => {
    const linkData = 'foo bar WOW GREAT';
    assert.equal(scorePrevLink(linkData), 0);
  });
});

@ -0,0 +1,23 @@
import difflib from 'difflib';
/**
 * Adjust a candidate's score by its URL similarity to the article URL.
 *
 * Run last and only for links that already scored above zero, because
 * the difflib SequenceMatcher comparison is comparatively expensive.
 * Links whose URL differs by less than the threshold gain points; more
 * different links lose them (25 points per 10% beyond the threshold).
 *
 * NOTE(review): the original comment said "Subtract .1 from
 * diff_percent" with examples matching a 0.1 threshold, but the code
 * uses 0.2 — the unit test pins the 0.2 behavior, preserved here;
 * confirm which was intended.
 *
 * @param {number} score the link's score so far
 * @param {string} articleUrl URL of the current article page
 * @param {string} href candidate next-page URL
 * @returns {number} adjusted score, or 0 when score was not positive
 */
export default function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
  const diffPercent = 1.0 - similarity;
  const diffModifier = -(250 * (diffPercent - 0.2));
  return score + diffModifier;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save