chore: refactored and linted
parent
9906bd36a4
commit
7e2a34945f
@ -0,0 +1 @@
|
||||
**/fixtures/*
|
@ -0,0 +1,39 @@
|
||||
// Use this file as a starting point for your project's .eslintrc.
|
||||
// Copy this file, and add rule overrides as needed.
|
||||
{
|
||||
"parser": "babel-eslint",
|
||||
"extends": "airbnb",
|
||||
"plugins": [
|
||||
"babel"
|
||||
],
|
||||
"globals": {
|
||||
/* mocha */
|
||||
"describe",
|
||||
"it"
|
||||
},
|
||||
"rules": {
|
||||
"no-param-reassign": 0,
|
||||
/* TODO fix this; this should work w/import/resolver below, but doesn't */
|
||||
"import/no-extraneous-dependencies": 0,
|
||||
"import/no-unresolved": 0,
|
||||
"no-control-regex": 0,
|
||||
"import/prefer-default-export": 0,
|
||||
"generator-star-spacing": 0,
|
||||
"babel/generator-star-spacing": 0,
|
||||
"func-names": 0,
|
||||
"no-useless-escape": 0,
|
||||
"no-confusing-arrow": 0,
|
||||
},
|
||||
"settings": {
|
||||
"import/resolver": {
|
||||
"babel-module": {
|
||||
"extensions": [".js"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"parserOptions":{
|
||||
"ecmaFeatures": {
|
||||
"experimentalObjectRestSpread": true
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
#!/usr/local/bin/fish
|
||||
|
||||
set file $argv[1]
|
||||
set function $argv[2]
|
||||
|
||||
touch src/extractors/generic/next-page-url/scoring/utils/index.js
|
||||
touch src/extractors/generic/next-page-url/scoring/utils/$file.js
|
||||
touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js
|
||||
|
||||
echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js
|
||||
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
|
||||
echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
|
||||
echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js
|
||||
echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js
|
||||
|
||||
echo "Now make it a default export"
|
||||
echo "Move it to its file"
|
||||
echo "Move its tests to its test file"
|
||||
echo "import in score-links"
|
||||
echo "Test it."
|
||||
|
@ -1,7 +1,7 @@
|
||||
import { CLEAN_AUTHOR_RE } from './constants'
|
||||
import { CLEAN_AUTHOR_RE } from './constants';
|
||||
|
||||
// Take an author string (like 'By David Smith ') and clean it to
|
||||
// just the name(s): 'David Smith'.
|
||||
export default function cleanAuthor(author) {
|
||||
return author.replace(CLEAN_AUTHOR_RE, '$2').trim()
|
||||
return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
|
||||
}
|
||||
|
@ -1,21 +1,21 @@
|
||||
import assert from 'assert'
|
||||
import assert from 'assert';
|
||||
|
||||
import cleanAuthor from './author'
|
||||
import cleanAuthor from './author';
|
||||
|
||||
describe('cleanAuthor(author)', () => {
|
||||
it('removes the By from an author string', () => {
|
||||
const author = cleanAuthor('By Bob Dylan')
|
||||
const author = cleanAuthor('By Bob Dylan');
|
||||
|
||||
assert.equal(author, 'Bob Dylan')
|
||||
})
|
||||
assert.equal(author, 'Bob Dylan');
|
||||
});
|
||||
|
||||
it('trims trailing whitespace and line breaks', () => {
|
||||
const text = `
|
||||
written by
|
||||
Bob Dylan
|
||||
`
|
||||
const author = cleanAuthor(text)
|
||||
`;
|
||||
const author = cleanAuthor(text);
|
||||
|
||||
assert.equal(author, 'Bob Dylan')
|
||||
})
|
||||
})
|
||||
assert.equal(author, 'Bob Dylan');
|
||||
});
|
||||
});
|
||||
|
@ -1,32 +1,32 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
import fs from 'fs';
|
||||
|
||||
import extractCleanNode from './content'
|
||||
import extractBestNode from 'extractors/generic/content/extract-best-node'
|
||||
import extractBestNode from 'extractors/generic/content/extract-best-node';
|
||||
import extractCleanNode from './content';
|
||||
|
||||
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
|
||||
it("cleans cruft out of a DOM node", () => {
|
||||
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
|
||||
let $ = cheerio.load(html)
|
||||
it('cleans cruft out of a DOM node', () => {
|
||||
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const opts = {
|
||||
stripUnlikelyCandidates: true,
|
||||
weightNodes: true,
|
||||
cleanConditionally: true,
|
||||
}
|
||||
stripUnlikelyCandidates: true,
|
||||
weightNodes: true,
|
||||
cleanConditionally: true,
|
||||
};
|
||||
|
||||
const bestNode = extractBestNode($, opts)
|
||||
let result = $.html(bestNode)
|
||||
// console.log(result)
|
||||
// console.log(result.length)
|
||||
const cleanNode = extractCleanNode(bestNode, { $, opts })
|
||||
result = $.html(cleanNode)
|
||||
// console.log(result.length)
|
||||
// console.log(result)
|
||||
// console.log(bestNode.html())
|
||||
const bestNode = extractBestNode($, opts);
|
||||
// let result = $.html(bestNode);
|
||||
// // console.log(result)
|
||||
// // console.log(result.length)
|
||||
const cleanNode = extractCleanNode(bestNode, { $, opts });
|
||||
// result = $.html(cleanNode);
|
||||
// // console.log(result.length)
|
||||
// // console.log(result)
|
||||
// // console.log(bestNode.html())
|
||||
|
||||
assert.equal($(bestNode).text().length, 2687)
|
||||
})
|
||||
})
|
||||
assert.equal($(cleanNode).text().length, 2687);
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,67 +1,62 @@
|
||||
import assert from 'assert'
|
||||
import assert from 'assert';
|
||||
|
||||
import {
|
||||
default as cleanDatePublished,
|
||||
cleanDateString,
|
||||
} from './date-published'
|
||||
} from './date-published';
|
||||
|
||||
describe('cleanDatePublished(dateString)', () => {
|
||||
it('returns a date object', () => {
|
||||
const datePublished = cleanDatePublished('published: 1/1/2020')
|
||||
const datePublished = cleanDatePublished('published: 1/1/2020');
|
||||
|
||||
assert.equal(
|
||||
datePublished,
|
||||
new Date('1/1/2020').toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('returns null if date is invalid', () => {
|
||||
const datePublished = cleanDatePublished('blargh')
|
||||
const datePublished = cleanDatePublished('blargh');
|
||||
|
||||
assert.equal(datePublished, null)
|
||||
})
|
||||
|
||||
})
|
||||
assert.equal(datePublished, null);
|
||||
});
|
||||
});
|
||||
|
||||
describe('cleanDateString(dateString)', () => {
|
||||
it('removes "published" text from an datePublished string', () => {
|
||||
const datePublished = cleanDateString('published: 1/1/2020')
|
||||
const datePublished = cleanDateString('published: 1/1/2020');
|
||||
|
||||
assert.equal(datePublished, '1/1/2020')
|
||||
})
|
||||
assert.equal(datePublished, '1/1/2020');
|
||||
});
|
||||
|
||||
it('trims whitespace', () => {
|
||||
const datePublished = cleanDateString(' 1/1/2020 ')
|
||||
const datePublished = cleanDateString(' 1/1/2020 ');
|
||||
|
||||
assert.equal(datePublished, '1/1/2020')
|
||||
})
|
||||
assert.equal(datePublished, '1/1/2020');
|
||||
});
|
||||
|
||||
it('puts a space b/w a time and am/pm', () => {
|
||||
// The JS date parser is forgiving, but
|
||||
// it needs am/pm separated from a time
|
||||
const date1 = cleanDateString('1/1/2020 8:30am')
|
||||
assert.equal(date1, '1/1/2020 8:30 am')
|
||||
const date1 = cleanDateString('1/1/2020 8:30am');
|
||||
assert.equal(date1, '1/1/2020 8:30 am');
|
||||
|
||||
const date2 = cleanDateString('8:30PM 1/1/2020')
|
||||
assert.equal(date2, '8:30 PM 1/1/2020')
|
||||
})
|
||||
const date2 = cleanDateString('8:30PM 1/1/2020');
|
||||
assert.equal(date2, '8:30 PM 1/1/2020');
|
||||
});
|
||||
|
||||
it('cleans the dots from a.m. or p.m.', () => {
|
||||
// The JS date parser is forgiving, but
|
||||
// it needs a.m./p.m. without dots
|
||||
const date1 = cleanDateString('1/1/2020 8:30 a.m.')
|
||||
assert.equal(date1, '1/1/2020 8:30 am')
|
||||
})
|
||||
const date1 = cleanDateString('1/1/2020 8:30 a.m.');
|
||||
assert.equal(date1, '1/1/2020 8:30 am');
|
||||
});
|
||||
|
||||
it('can handle some tough timestamps', () => {
|
||||
// The JS date parser is forgiving, but
|
||||
// it needs am/pm separated from a time
|
||||
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.')
|
||||
assert.equal(date1, '15 Apr 2016 10:59')
|
||||
|
||||
const date2 = cleanDateString('8:30PM 1/1/2020')
|
||||
assert.equal(date2, '8:30 PM 1/1/2020')
|
||||
})
|
||||
|
||||
})
|
||||
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.');
|
||||
assert.equal(date1, '15 Apr 2016 10:59');
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,17 +1,18 @@
|
||||
import { TEXT_LINK_RE } from './constants'
|
||||
import { stripTags } from 'utils/dom'
|
||||
import { stripTags } from 'utils/dom';
|
||||
|
||||
import { TEXT_LINK_RE } from './constants';
|
||||
|
||||
// Take a dek HTML fragment, and return the cleaned version of it.
|
||||
// Return null if the dek wasn't good enough.
|
||||
export default function cleanDek(dek, { $ }) {
|
||||
// Sanity check that we didn't get too short or long of a dek.
|
||||
if (dek.length > 1000 || dek.length < 5) return null
|
||||
if (dek.length > 1000 || dek.length < 5) return null;
|
||||
|
||||
const dekText = stripTags(dek, $)
|
||||
const dekText = stripTags(dek, $);
|
||||
|
||||
// Plain text links shouldn't exist in the dek. If we have some, it's
|
||||
// not a good dek - bail.
|
||||
if (TEXT_LINK_RE.test(dekText)) return null
|
||||
if (TEXT_LINK_RE.test(dekText)) return null;
|
||||
|
||||
return dekText.trim()
|
||||
return dekText.trim();
|
||||
}
|
||||
|
@ -1,52 +1,50 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import {
|
||||
default as cleanDek,
|
||||
cleanDekString,
|
||||
} from './dek'
|
||||
import cleanDek from './dek';
|
||||
|
||||
describe('cleanDek(dekString, { $ })', () => {
|
||||
it('returns null if the dek is < 5 chars', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
assert.equal(cleanDek('Hi', { $ }), null)
|
||||
})
|
||||
const $ = cheerio.load('<div></div>');
|
||||
assert.equal(cleanDek('Hi', { $ }), null);
|
||||
});
|
||||
|
||||
it('returns null if the dek is > 1000 chars', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const longDek =
|
||||
// generate a string that is 1,280 chars
|
||||
[0,1,2,3,4,5,6].reduce((acc, i) =>
|
||||
acc += acc, '0123456789'
|
||||
)
|
||||
assert.equal(cleanDek(longDek, { $ }), null)
|
||||
})
|
||||
[0, 1, 2, 3, 4, 5, 6].reduce((acc) => {
|
||||
acc += acc;
|
||||
return acc;
|
||||
}, '0123456789');
|
||||
assert.equal(cleanDek(longDek, { $ }), null);
|
||||
});
|
||||
|
||||
it('strip html tags from the dek', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const dek = 'This is a <em>very</em> important dek.'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const dek = 'This is a <em>very</em> important dek.';
|
||||
|
||||
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.')
|
||||
})
|
||||
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.');
|
||||
});
|
||||
|
||||
it('returns null if dek contains plain text link', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const dek = 'This has this link http://example.com/foo/bar'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const dek = 'This has this link http://example.com/foo/bar';
|
||||
|
||||
assert.equal(cleanDek(dek, { $ }), null)
|
||||
})
|
||||
assert.equal(cleanDek(dek, { $ }), null);
|
||||
});
|
||||
|
||||
it('returns a normal dek as is', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const dek = 'This is the dek'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const dek = 'This is the dek';
|
||||
|
||||
assert.equal(cleanDek(dek, { $ }), dek)
|
||||
})
|
||||
assert.equal(cleanDek(dek, { $ }), dek);
|
||||
});
|
||||
|
||||
it('cleans extra whitespace', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const dek = ' This is the dek '
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const dek = ' This is the dek ';
|
||||
|
||||
assert.equal(cleanDek(dek, { $ }), 'This is the dek')
|
||||
})
|
||||
})
|
||||
assert.equal(cleanDek(dek, { $ }), 'This is the dek');
|
||||
});
|
||||
});
|
||||
|
@ -1,10 +1,10 @@
|
||||
import validUrl from 'valid-url'
|
||||
import validUrl from 'valid-url';
|
||||
|
||||
export default function clean(leadImageUrl) {
|
||||
leadImageUrl = leadImageUrl.trim()
|
||||
leadImageUrl = leadImageUrl.trim();
|
||||
if (validUrl.isWebUri(leadImageUrl)) {
|
||||
return leadImageUrl
|
||||
} else {
|
||||
return null
|
||||
return leadImageUrl;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
@ -1,20 +1,20 @@
|
||||
import assert from 'assert'
|
||||
import assert from 'assert';
|
||||
|
||||
import clean from './lead-image-url'
|
||||
import clean from './lead-image-url';
|
||||
|
||||
describe('clean(leadImageUrl)', () => {
|
||||
it('returns the url if valid', () => {
|
||||
const url = 'https://example.com'
|
||||
assert.equal(clean(url), url)
|
||||
})
|
||||
const url = 'https://example.com';
|
||||
assert.equal(clean(url), url);
|
||||
});
|
||||
|
||||
it('returns null if the url is not valid', () => {
|
||||
const url = 'this is not a valid url'
|
||||
assert.equal(clean(url), null)
|
||||
})
|
||||
const url = 'this is not a valid url';
|
||||
assert.equal(clean(url), null);
|
||||
});
|
||||
|
||||
it('trims whitespace', () => {
|
||||
const url = ' https://example.com/foo/bar.jpg'
|
||||
assert.equal(clean(url), url.trim())
|
||||
})
|
||||
})
|
||||
const url = ' https://example.com/foo/bar.jpg';
|
||||
assert.equal(clean(url), url.trim());
|
||||
});
|
||||
});
|
||||
|
@ -1,32 +1,31 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
|
||||
import { resolveSplitTitle } from './index'
|
||||
import { resolveSplitTitle } from './index';
|
||||
|
||||
describe('resolveSplitTitle(text)', () => {
|
||||
it('does nothing if title not splittable', () => {
|
||||
const title = "This Is a Normal Title"
|
||||
const title = 'This Is a Normal Title';
|
||||
|
||||
assert.equal(resolveSplitTitle(title), title)
|
||||
})
|
||||
assert.equal(resolveSplitTitle(title), title);
|
||||
});
|
||||
|
||||
it('extracts titles from breadcrumb-like titles', () => {
|
||||
const title = "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com"
|
||||
const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com';
|
||||
|
||||
assert.equal(resolveSplitTitle(title), "The Best Gadgets on Earth ")
|
||||
})
|
||||
assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth ');
|
||||
});
|
||||
|
||||
it('cleans domains from titles at the front', () => {
|
||||
const title = "NYTimes - The Best Gadgets on Earth"
|
||||
const url = "https://www.nytimes.com/bits/blog/etc/"
|
||||
const title = 'NYTimes - The Best Gadgets on Earth';
|
||||
const url = 'https://www.nytimes.com/bits/blog/etc/';
|
||||
|
||||
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
|
||||
})
|
||||
assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
|
||||
});
|
||||
|
||||
it('cleans domains from titles at the back', () => {
|
||||
const title = "The Best Gadgets on Earth | NYTimes"
|
||||
const url = "https://www.nytimes.com/bits/blog/etc/"
|
||||
const title = 'The Best Gadgets on Earth | NYTimes';
|
||||
const url = 'https://www.nytimes.com/bits/blog/etc/';
|
||||
|
||||
assert.equal(resolveSplitTitle(title, url), "The Best Gadgets on Earth")
|
||||
})
|
||||
})
|
||||
assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth');
|
||||
});
|
||||
});
|
||||
|
@ -1,25 +1,26 @@
|
||||
import { TITLE_SPLITTERS_RE } from './constants'
|
||||
import { resolveSplitTitle } from './index'
|
||||
import { stripTags } from 'utils/dom'
|
||||
import { stripTags } from 'utils/dom';
|
||||
|
||||
import { TITLE_SPLITTERS_RE } from './constants';
|
||||
import { resolveSplitTitle } from './index';
|
||||
|
||||
export default function cleanTitle(title, { url, $ }) {
|
||||
// If title has |, :, or - in it, see if
|
||||
// we can clean it up.
|
||||
if (TITLE_SPLITTERS_RE.test(title)) {
|
||||
title = resolveSplitTitle(title, url)
|
||||
title = resolveSplitTitle(title, url);
|
||||
}
|
||||
|
||||
// Final sanity check that we didn't get a crazy title.
|
||||
// if (title.length > 150 || title.length < 15) {
|
||||
if (title.length > 150) {
|
||||
// If we did, return h1 from the document if it exists
|
||||
const h1 = $('h1')
|
||||
const h1 = $('h1');
|
||||
if (h1.length === 1) {
|
||||
title = h1.text()
|
||||
title = h1.text();
|
||||
}
|
||||
}
|
||||
|
||||
// strip any html tags in the title text
|
||||
return stripTags(title, $).trim()
|
||||
return stripTags(title, $).trim();
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,11 @@
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
import BloggerExtractor from './custom/blogspot.com'
|
||||
import WikipediaExtractor from './custom/wikipedia.org'
|
||||
import NYMagExtractor from './custom/nymag.com';
|
||||
import BloggerExtractor from './custom/blogspot.com';
|
||||
import WikipediaExtractor from './custom/wikipedia.org';
|
||||
|
||||
const Extractors = {
|
||||
'nymag.com': NYMagExtractor,
|
||||
'blogspot.com': BloggerExtractor,
|
||||
'wikipedia.org': WikipediaExtractor,
|
||||
}
|
||||
};
|
||||
|
||||
export default Extractors
|
||||
export default Extractors;
|
||||
|
@ -1 +1 @@
|
||||
export const ATTR_RE = /\[([\w-]+)\]/
|
||||
export const ATTR_RE = /\[([\w-]+)\]/;
|
||||
|
@ -1,49 +1,48 @@
|
||||
import { cleanAuthor } from 'cleaners';
|
||||
import {
|
||||
extractFromMeta,
|
||||
extractFromSelectors,
|
||||
} from 'utils/dom';
|
||||
|
||||
import {
|
||||
AUTHOR_META_TAGS,
|
||||
AUTHOR_MAX_LENGTH,
|
||||
AUTHOR_SELECTORS,
|
||||
BYLINE_SELECTORS_RE,
|
||||
} from './constants'
|
||||
|
||||
import { cleanAuthor } from 'cleaners'
|
||||
|
||||
import {
|
||||
extractFromMeta,
|
||||
extractFromSelectors
|
||||
} from 'utils/dom'
|
||||
} from './constants';
|
||||
|
||||
const GenericAuthorExtractor = {
|
||||
extract({ $, metaCache }) {
|
||||
let author
|
||||
let author;
|
||||
|
||||
// First, check to see if we have a matching
|
||||
// meta tag that we can make use of.
|
||||
author = extractFromMeta($, AUTHOR_META_TAGS, metaCache)
|
||||
author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
|
||||
if (author && author.length < AUTHOR_MAX_LENGTH) {
|
||||
return cleanAuthor(author)
|
||||
return cleanAuthor(author);
|
||||
}
|
||||
|
||||
// Second, look through our selectors looking for potential authors.
|
||||
author = extractFromSelectors($, AUTHOR_SELECTORS, 2)
|
||||
author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
|
||||
if (author && author.length < AUTHOR_MAX_LENGTH) {
|
||||
return cleanAuthor(author)
|
||||
return cleanAuthor(author);
|
||||
}
|
||||
|
||||
// Last, use our looser regular-expression based selectors for
|
||||
// potential authors.
|
||||
for (const [selector, regex] of BYLINE_SELECTORS_RE) {
|
||||
const node = $(selector)
|
||||
const node = $(selector);
|
||||
if (node.length === 1) {
|
||||
const text = node.text()
|
||||
const text = node.text();
|
||||
if (regex.test(text)) {
|
||||
return cleanAuthor(text)
|
||||
return cleanAuthor(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null
|
||||
}
|
||||
}
|
||||
return null;
|
||||
},
|
||||
};
|
||||
|
||||
export default GenericAuthorExtractor
|
||||
export default GenericAuthorExtractor;
|
||||
|
||||
|
@ -1,46 +1,46 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import GenericAuthorExtractor from './extractor'
|
||||
import HTML from './fixtures/html';
|
||||
import GenericAuthorExtractor from './extractor';
|
||||
|
||||
describe('GenericAuthorExtractor', () => {
|
||||
describe('extract($, cachedMeta)', () => {
|
||||
it('extracts author from meta tags', () => {
|
||||
const $ = cheerio.load(HTML.authorMeta.test)
|
||||
const $ = cheerio.load(HTML.authorMeta.test);
|
||||
const result = GenericAuthorExtractor.extract(
|
||||
{ $, metaCache: ["dc.author", "something-else"] }
|
||||
)
|
||||
{ $, metaCache: ['dc.author', 'something-else'] }
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.authorMeta.result)
|
||||
})
|
||||
assert.equal(result, HTML.authorMeta.result);
|
||||
});
|
||||
|
||||
it('extracts author from author selectors', () => {
|
||||
const $ = cheerio.load(HTML.authorSelectors.test)
|
||||
const $ = cheerio.load(HTML.authorSelectors.test);
|
||||
const result = GenericAuthorExtractor.extract(
|
||||
{ $, metaCache: ["dc.author", "something-else"] }
|
||||
)
|
||||
{ $, metaCache: ['dc.author', 'something-else'] }
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.authorSelectors.result)
|
||||
})
|
||||
assert.equal(result, HTML.authorSelectors.result);
|
||||
});
|
||||
|
||||
it('extracts author with regex selectors', () => {
|
||||
const $ = cheerio.load(HTML.authorRegSelectors.test)
|
||||
const $ = cheerio.load(HTML.authorRegSelectors.test);
|
||||
const result = GenericAuthorExtractor.extract(
|
||||
{ $, metaCache: ["dc.author", "something-else"] }
|
||||
)
|
||||
{ $, metaCache: ['dc.author', 'something-else'] }
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.authorRegSelectors.result)
|
||||
})
|
||||
assert.equal(result, HTML.authorRegSelectors.result);
|
||||
});
|
||||
|
||||
it('returns null if no author found', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const result = GenericAuthorExtractor.extract(
|
||||
{ $, metaCache: ["dc.author", "something-else"] }
|
||||
)
|
||||
{ $, metaCache: ['dc.author', 'something-else'] }
|
||||
);
|
||||
|
||||
assert.equal(result, null)
|
||||
})
|
||||
})
|
||||
})
|
||||
assert.equal(result, null);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,24 +1,26 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
import fs from 'fs';
|
||||
|
||||
// import HTML from './fixtures/html'
|
||||
|
||||
import extractBestNode from './extract-best-node'
|
||||
import extractBestNode from './extract-best-node';
|
||||
|
||||
describe('extractBestNode($, flags)', () => {
|
||||
it("scores the dom nodes and returns the best option", () => {
|
||||
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8')
|
||||
it('scores the dom nodes and returns the best option', () => {
|
||||
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8');
|
||||
const opts = {
|
||||
stripUnlikelyCandidates: true,
|
||||
weightNodes: true,
|
||||
}
|
||||
stripUnlikelyCandidates: true,
|
||||
weightNodes: true,
|
||||
};
|
||||
|
||||
let $ = cheerio.load(html)
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const bestNode = extractBestNode($, opts)
|
||||
const bestNode = extractBestNode($, opts);
|
||||
|
||||
assert(typeof bestNode, 'object');
|
||||
// console.log(bestNode.html())
|
||||
|
||||
// assert.equal($(bestNode).text().length, 3652)
|
||||
})
|
||||
})
|
||||
});
|
||||
});
|
||||
|
@ -1,15 +1,15 @@
|
||||
import {
|
||||
getOrInitScore,
|
||||
setScore,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
export default function addScore($node, $, amount) {
|
||||
try {
|
||||
const score = getOrInitScore($node, $) + amount
|
||||
setScore($node, $, score)
|
||||
} catch(e) {
|
||||
console.debug(e)
|
||||
} finally {
|
||||
return $node
|
||||
const score = getOrInitScore($node, $) + amount;
|
||||
setScore($node, $, score);
|
||||
} catch (e) {
|
||||
// Ignoring; error occurs in scoreNode
|
||||
}
|
||||
|
||||
return $node;
|
||||
}
|
||||
|
@ -1,28 +1,27 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import {
|
||||
addScore,
|
||||
getScore,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
describe('Scoring utils', () => {
|
||||
describe('addScore(node, $, amount)', () => {
|
||||
it(`adds the specified amount to a node's score`, () => {
|
||||
const $ = cheerio.load('<p score="25">Foo</p>')
|
||||
let $node = $('p').first()
|
||||
it('adds the specified amount to a node\'s score', () => {
|
||||
const $ = cheerio.load('<p score="25">Foo</p>');
|
||||
let $node = $('p').first();
|
||||
|
||||
$node = addScore($node, $, 25)
|
||||
assert.equal(getScore($node), 50)
|
||||
})
|
||||
$node = addScore($node, $, 25);
|
||||
assert.equal(getScore($node), 50);
|
||||
});
|
||||
|
||||
it(`adds score if score not yet set (assumes score is 0)`, () => {
|
||||
const $ = cheerio.load('<p>Foo</p>')
|
||||
let $node = $('p').first()
|
||||
it('adds score if score not yet set (assumes score is 0)', () => {
|
||||
const $ = cheerio.load('<p>Foo</p>');
|
||||
let $node = $('p').first();
|
||||
|
||||
$node = addScore($node, $, 25)
|
||||
assert.equal(getScore($node), 25)
|
||||
})
|
||||
|
||||
})
|
||||
})
|
||||
$node = addScore($node, $, 25);
|
||||
assert.equal(getScore($node), 25);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -1,11 +1,11 @@
|
||||
import { addScore } from './index'
|
||||
import { addScore } from './index';
|
||||
|
||||
// Adds 1/4 of a child's score to its parent
|
||||
export default function addToParent(node, $, score) {
|
||||
const parent = node.parent()
|
||||
const parent = node.parent();
|
||||
if (parent) {
|
||||
addScore(parent, $, score * .25)
|
||||
addScore(parent, $, score * 0.25);
|
||||
}
|
||||
|
||||
return node
|
||||
return node;
|
||||
}
|
||||
|
@ -1,24 +1,23 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import {
|
||||
addToParent,
|
||||
getScore,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
describe('Scoring utils', () => {
|
||||
describe('addToParent(node, $, amount)', () => {
|
||||
it(`adds 1/4 of a node's score it its parent`, () => {
|
||||
const html = '<div score="25"><p score="40">Foo</p></div>'
|
||||
const $ = cheerio.load(html)
|
||||
let $node = $('p').first()
|
||||
it('adds 1/4 of a node\'s score it its parent', () => {
|
||||
const html = '<div score="25"><p score="40">Foo</p></div>';
|
||||
const $ = cheerio.load(html);
|
||||
let $node = $('p').first();
|
||||
|
||||
$node = addToParent($node, $, 40)
|
||||
$node = addToParent($node, $, 40);
|
||||
|
||||
assert.equal(getScore($node.parent()), 35)
|
||||
assert.equal(getScore($node), 40)
|
||||
})
|
||||
})
|
||||
|
||||
})
|
||||
assert.equal(getScore($node.parent()), 35);
|
||||
assert.equal(getScore($node), 40);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,115 +1,35 @@
|
||||
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants'
|
||||
import { getScore } from './index'
|
||||
import {
|
||||
textLength,
|
||||
linkDensity
|
||||
} from 'utils/dom'
|
||||
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
|
||||
import { getScore } from './index';
|
||||
import mergeSiblings from './merge-siblings';
|
||||
|
||||
// After we've calculated scores, loop through all of the possible
|
||||
// candidate nodes we found and find the one with the highest score.
|
||||
export default function findTopCandidate($) {
|
||||
let $candidate, topScore = 0
|
||||
let $candidate;
|
||||
let topScore = 0;
|
||||
|
||||
$('*[score]').each((index, node) => {
|
||||
const $node = $(node)
|
||||
const $node = $(node);
|
||||
// Ignore tags like BR, HR, etc
|
||||
if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
|
||||
return
|
||||
return;
|
||||
}
|
||||
|
||||
const score = getScore($node)
|
||||
const score = getScore($node);
|
||||
|
||||
if (score > topScore) {
|
||||
topScore = score
|
||||
$candidate = $node
|
||||
topScore = score;
|
||||
$candidate = $node;
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
// If we don't have a candidate, return the body
|
||||
// or whatever the first element is
|
||||
if (!$candidate) {
|
||||
return $('body') || $('*').first()
|
||||
return $('body') || $('*').first();
|
||||
}
|
||||
|
||||
$candidate = mergeSiblings($candidate, topScore, $)
|
||||
$candidate = mergeSiblings($candidate, topScore, $);
|
||||
|
||||
return $candidate
|
||||
}
|
||||
|
||||
// Now that we have a top_candidate, look through the siblings of
|
||||
// it to see if any of them are decently scored. If they are, they
|
||||
// may be split parts of the content (Like two divs, a preamble and
|
||||
// a body.) Example:
|
||||
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
|
||||
export function mergeSiblings($candidate, topScore, $) {
|
||||
if (!$candidate.parent().length) {
|
||||
return $candidate
|
||||
}
|
||||
|
||||
const siblingScoreThreshold = Math.max(10, topScore * 0.2)
|
||||
let wrappingDiv = $('<div></div>')
|
||||
|
||||
$candidate.parent().children().each((index, child) => {
|
||||
const $child = $(child)
|
||||
// Ignore tags like BR, HR, etc
|
||||
if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
|
||||
return
|
||||
}
|
||||
|
||||
const childScore = getScore($child)
|
||||
if (childScore) {
|
||||
if ($child === $candidate) {
|
||||
wrappingDiv.append($child)
|
||||
} else {
|
||||
let contentBonus = 0
|
||||
// extract to scoreLinkDensity() TODO
|
||||
const density = linkDensity($child)
|
||||
|
||||
// If sibling has a very low link density,
|
||||
// give it a small bonus
|
||||
if (density < .05) {
|
||||
contentBonus = contentBonus + 20
|
||||
}
|
||||
|
||||
// If sibling has a high link density,
|
||||
// give it a penalty
|
||||
if (density >= 0.5) {
|
||||
contentBonus = contentBonus - 20
|
||||
}
|
||||
|
||||
// If sibling node has the same class as
|
||||
// candidate, give it a bonus
|
||||
if ($child.attr('class') === $candidate.attr('class')) {
|
||||
contentBonus = contentBonus + topScore * .2
|
||||
}
|
||||
|
||||
const newScore = getScore($child) + contentBonus
|
||||
|
||||
if (newScore >= siblingScoreThreshold) {
|
||||
return wrappingDiv.append($child)
|
||||
} else if (child.tagName === 'p') {
|
||||
const childContentLength = textLength($child.text())
|
||||
|
||||
if (childContentLength > 80 && density < .25) {
|
||||
return wrappingDiv.append($child)
|
||||
} else if (childContentLength <= 80 && density === 0 &&
|
||||
hasSentenceEnd(childContent)) {
|
||||
|
||||
return wrappingDiv.append($child)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
return wrappingDiv
|
||||
}
|
||||
|
||||
// TODO Extract into util - AP
|
||||
// Given a string, return True if it appears to have an ending sentence
|
||||
// within it, false otherwise.
|
||||
const SENTENCE_END_RE = new RegExp('\.( |$)')
|
||||
function hasSentenceEnd(text) {
|
||||
return SENTENCE_END_RE.test(text)
|
||||
return $candidate;
|
||||
}
|
||||
|
@ -1,58 +1,58 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
import fs from 'fs';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
|
||||
import {
|
||||
getScore,
|
||||
findTopCandidate,
|
||||
scoreContent
|
||||
} from './index'
|
||||
scoreContent,
|
||||
} from './index';
|
||||
|
||||
// Specs for findTopCandidate: each case loads a fixture, runs the function,
// and checks the score (or tag) of the node that was picked.
describe('findTopCandidate($)', () => {
  it('finds the top candidate from simple case', () => {
    const $ = cheerio.load(HTML.findDom1);

    const $topCandidate = findTopCandidate($);

    assert.equal(getScore($topCandidate), 100);
  });

  it('finds the top candidate from a nested case', () => {
    const $ = cheerio.load(HTML.findDom2);

    const $topCandidate = findTopCandidate($);

    // this is wrapped in a div so checking
    // the score of the first child
    assert.equal(getScore($topCandidate.children().first()), 50);
  });

  it('ignores tags like BR', () => {
    const $ = cheerio.load(HTML.findDom3);

    const $topCandidate = findTopCandidate($);

    assert.equal(getScore($topCandidate), 50);
  });

  it('returns BODY if no candidates found', () => {
    const $ = cheerio.load(HTML.topBody);

    const $topCandidate = findTopCandidate($);

    assert.equal($topCandidate.get(0).tagName, 'body');
  });

  it('appends a sibling with a good enough score', () => {
    // NOTE(review): the path is relative to the process working directory,
    // not to this file - confirm the test runner's cwd.
    const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8');

    let $ = cheerio.load(html);
    $ = scoreContent($);

    const $topCandidate = findTopCandidate($);
    assert.equal($($topCandidate).text().length, 3652);
  });
});
|
||||
|
||||
|
@ -1,61 +1,61 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
import {
|
||||
getOrInitScore,
|
||||
getScore,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
// Specs for getOrInitScore: an existing score attribute is returned as-is,
// otherwise a score is computed for the node.
describe('getOrInitScore(node, $)', () => {
  describe('when score set', () => {
    it('returns score if node\'s score already set', () => {
      const html = '<p score="40">Foo</p>';
      const $ = cheerio.load(html);
      const node = $('p').first();

      const score = getOrInitScore(node, $);

      assert.equal(score, 40);
    });
  });

  describe('when no score set', () => {
    it('returns 0 if no class/id and text < 25 chars', () => {
      const html = '<p>Foo</p>';
      const $ = cheerio.load(html);
      const node = $('p').first();

      const score = getOrInitScore(node, $);

      assert.equal(score, 0);
    });

    it('returns score if no class/id and has commas/length', () => {
      const $ = cheerio.load(HTML.score19);
      const node = $('p').first();

      const score = getOrInitScore(node, $);

      assert.equal(score, 19);
    });

    it('returns greater score if weighted class/id is set', () => {
      const $ = cheerio.load(HTML.score44);
      const node = $('p').first();

      const score = getOrInitScore(node, $);

      assert.equal(score, 44);
    });

    it('gives 1/4 of its score to its parent', () => {
      const $ = cheerio.load(HTML.score44Parent);
      const node = $('p').first();

      // Called for its effect on the parent; the return value is not needed.
      getOrInitScore(node, $);

      assert.equal(getScore(node.parent()), 16);
    });
  });
});
|
||||
|
@ -1,25 +1,22 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import { getScore } from './index'
|
||||
import { getScore } from './index';
|
||||
|
||||
// Specs for getScore: reads the `score` attribute off a node.
describe('Scoring utils', () => {
  describe('getScore($node)', () => {
    it('returns null if the node has no score set', () => {
      const $ = cheerio.load('<p>Foo</p>');
      const $node = $('p').first();
      assert.equal(getScore($node), null);
    });

    it('returns 25 if the node has a score attr of 25', () => {
      const $ = cheerio.load('<p score="25">Foo</p>');
      const $node = $('p').first();
      // The attribute is a string in the markup; the score must come back numeric.
      assert.equal(typeof getScore($node), 'number');
      assert.equal(getScore($node), 25);
    });
  });
});
|
||||
|
||||
|
@ -1,59 +1,58 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/get-weight'
|
||||
import HTML from './fixtures/get-weight';
|
||||
import {
|
||||
getWeight
|
||||
} from './index'
|
||||
getWeight,
|
||||
} from './index';
|
||||
|
||||
// Specs for getWeight: the weight a node earns from its id / class names.
describe('Generic Extractor Utils', () => {
  describe('getWeight(node)', () => {
    it('returns a score of 25 if node has positive id', () => {
      const $ = cheerio.load(HTML.positiveId);
      assert.equal(getWeight($('div')), 25);
    });

    it('returns a score of -25 if node has negative id', () => {
      const $ = cheerio.load(HTML.negativeId);
      assert.equal(getWeight($('div')), -25);
    });

    it('returns a score of 25 if node has positive class', () => {
      const $ = cheerio.load(HTML.positiveClass);
      assert.equal(getWeight($('div')), 25);
    });

    it('returns a score of -25 if node has negative class', () => {
      const $ = cheerio.load(HTML.negativeClass);
      assert.equal(getWeight($('div')), -25);
    });

    it('returns a score of 25 if node has both positive id and class', () => {
      const $ = cheerio.load(HTML.positiveIdAndClass);
      assert.equal(getWeight($('div')), 25);
    });

    it('returns a score of 25 if node has pos id and neg class', () => {
      // is this really wanted? id="entry" class="adbox"
      // should get positive score?
      const $ = cheerio.load(HTML.positiveIdNegClass);
      assert.equal(getWeight($('div')), 25);
    });

    it('returns a score of 10 if node has pos img class', () => {
      const $ = cheerio.load(HTML.positivePhotoClass);
      assert.equal(getWeight($('div')), 10);
    });

    it('returns a score of 35 if node has pos id pos img class', () => {
      const $ = cheerio.load(HTML.positiveIdAndPhoto);
      assert.equal(getWeight($('div')), 35);
    });

    it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
      const $ = cheerio.load(HTML.entryContentAsset);
      assert.equal(getWeight($('div')), 50);
    });
  });
});
|
||||
|
@ -1,13 +1,13 @@
|
||||
// Scoring
// Barrel file: re-exports each scoring helper's default export under a name.
export { default as getWeight } from './get-weight';
export { default as getScore } from './get-score';
export { default as scoreCommas } from './score-commas';
export { default as scoreLength } from './score-length';
export { default as scoreParagraph } from './score-paragraph';
export { default as setScore } from './set-score';
export { default as addScore } from './add-score';
export { default as addToParent } from './add-to-parent';
export { default as getOrInitScore } from './get-or-init-score';
export { default as scoreNode } from './score-node';
export { default as scoreContent } from './score-content';
export { default as findTopCandidate } from './find-top-candidate';
|
||||
|
@ -0,0 +1,79 @@
|
||||
import {
|
||||
textLength,
|
||||
linkDensity,
|
||||
} from 'utils/dom';
|
||||
import { hasSentenceEnd } from 'utils/text';
|
||||
|
||||
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
|
||||
import { getScore } from './index';
|
||||
|
||||
/**
 * Now that we have a top candidate, look through its siblings to see if any
 * of them are decently scored. If they are, they may be split parts of the
 * content (like two divs, a preamble and a body). Example:
 * http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
 *
 * @param {Object} $candidate - wrapped top-candidate node (exposes parent(),
 *   attr() and get(); presumably a cheerio selection - confirm with callers).
 * @param {number} topScore - score of the top candidate; drives both the
 *   sibling threshold and the same-class bonus.
 * @param {Function} $ - selector/factory used to wrap nodes and build the
 *   wrapping element.
 * @returns {Object} `$candidate` itself when it has no parent, otherwise a new
 *   wrapping `<div>` holding the candidate and every sibling that qualified.
 */
export default function mergeSiblings($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }

  const siblingScoreThreshold = Math.max(10, topScore * 0.2);
  const wrappingDiv = $('<div></div>');
  // Raw node behind the candidate. `$(child)` builds a fresh wrapper on every
  // iteration, so comparing wrappers with === can never match; compare the
  // underlying nodes instead.
  const candidateNode = $candidate.get(0);

  $candidate.parent().children().each((index, child) => {
    const $child = $(child);
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
      return null;
    }

    // Children without a (non-zero) score are never merged.
    const childScore = getScore($child);
    if (!childScore) {
      return null;
    }

    // The candidate itself is always kept.
    if (child === candidateNode) {
      wrappingDiv.append($child);
      return null;
    }

    let contentBonus = 0;
    // extract to scoreLinkDensity() TODO
    const density = linkDensity($child);

    // If sibling has a very low link density,
    // give it a small bonus
    if (density < 0.05) {
      contentBonus += 20;
    }

    // If sibling has a high link density,
    // give it a penalty
    if (density >= 0.5) {
      contentBonus -= 20;
    }

    // If sibling node has the same class as
    // candidate, give it a bonus
    if ($child.attr('class') === $candidate.attr('class')) {
      contentBonus += topScore * 0.2;
    }

    const newScore = childScore + contentBonus;

    if (newScore >= siblingScoreThreshold) {
      return wrappingDiv.append($child);
    }

    // Paragraphs get a second chance based on their text alone.
    if (child.tagName === 'p') {
      const childContent = $child.text();
      const childContentLength = textLength(childContent);

      // Long paragraph with few links.
      if (childContentLength > 80 && density < 0.25) {
        return wrappingDiv.append($child);
      }

      // Short, link-free paragraph that still reads like a sentence.
      if (childContentLength <= 80 && density === 0 &&
          hasSentenceEnd(childContent)) {
        return wrappingDiv.append($child);
      }
    }

    return null;
  });

  return wrappingDiv;
}
|
@ -1,5 +1,5 @@
|
||||
/**
 * Return 1 for every comma in text.
 *
 * @param {string} text - text to scan.
 * @returns {number} number of ',' characters in `text` (0 when there are none).
 */
export default function scoreCommas(text) {
  // `match` yields null when nothing matches, hence the empty-array fallback.
  return (text.match(/,/g) || []).length;
}
|
||||
|
||||
|
@ -1,20 +1,18 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
|
||||
import { scoreCommas } from './index'
|
||||
import { scoreCommas } from './index';
|
||||
|
||||
// Specs for scoreCommas: one point per comma in the text.
describe('Scoring utils', () => {
  describe('scoreCommas(text)', () => {
    it('returns 0 if text has no commas', () => {
      assert.equal(scoreCommas('Foo bar'), 0);
    });

    it('returns a point for every comma in the text', () => {
      assert.equal(scoreCommas('Foo, bar'), 1);
      assert.equal(scoreCommas('Foo, bar, baz'), 2);
      assert.equal(scoreCommas('Foo, bar, baz, bat'), 3);
    });
  });
});
|
||||
|
||||
|
@ -1,47 +1,45 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
import fs from 'fs';
|
||||
|
||||
import { clean } from 'test-helpers'
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
|
||||
import {
|
||||
scoreContent,
|
||||
getScore,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
// TODO: Walk through these and sanity check my scores
|
||||
// Commented out scores were what I expected, but I was also
|
||||
// probably missing something when calculating
|
||||
// Specs for scoreContent: runs scoring over a whole document, then inspects
// the score left on selected nodes.
describe('scoreContent($, weightNodes)', () => {
  it('loves hNews content', () => {
    const $ = cheerio.load(HTML.hNews.before);
    scoreContent($).html();

    assert.equal(getScore($('div').first()), 140);
  });

  it('is so-so about non-hNews content', () => {
    const $ = cheerio.load(HTML.nonHNews.before);
    scoreContent($).html();

    assert.equal(getScore($('div').first()), 65);
  });

  it('scores this Wired article the same', () => {
    const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
    const $ = cheerio.load(html);
    scoreContent($).html();

    assert.equal(getScore($('article').first()), 65.5);
  });

  it('scores this Vulture article', () => {
    const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
    let $ = cheerio.load(html);
    $ = scoreContent($);

    assert.equal($('p[score]').length, 62);
  });
});
|
||||
|
@ -1,22 +1,21 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
|
||||
import { scoreLength } from './index'
|
||||
import { scoreLength } from './index';
|
||||
|
||||
// Specs for scoreLength: points awarded for text length, capped at 3.
describe('Scoring utils', () => {
  describe('scoreLength(textLength, tagName)', () => {
    it('returns 0 if length < 50 chars', () => {
      assert.equal(scoreLength(30), 0);
    });

    it('returns varying scores but maxes out at 3', () => {
      assert.equal(scoreLength(150), 1);
      assert.equal(scoreLength(199), 1.98);
      assert.equal(scoreLength(200), 2);
      assert.equal(scoreLength(250), 3);
      assert.equal(scoreLength(500), 3);
      assert.equal(scoreLength(1500), 3);
    });
  });
});
|
||||
|
||||
|
@ -1,95 +1,94 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
|
||||
import {
|
||||
scoreNode,
|
||||
scoreParagraph,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
|
||||
// Specs for scoreNode: paragraph-like tags must score exactly what
// scoreParagraph gives them; other tags get the fixed score asserted below.
describe('scoreNode(node)', () => {
  it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
    const html = '<p><em>Foo</em> bar</p>';
    const $ = cheerio.load(html);
    const node = $('p').first();

    const score = scoreNode(node);
    const pScore = scoreParagraph(node);

    assert.equal(score, pScore);
    assert.equal(score, 0);
  });

  it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
    const $ = cheerio.load(HTML.score1);
    const node = $('p').first();

    const score = scoreNode(node);
    const pScore = scoreParagraph(node);

    assert.equal(score, pScore);
    assert.equal(score, 1);
  });

  it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
    const $ = cheerio.load(HTML.score3);
    const node = $('p').first();

    const score = scoreNode(node);
    const pScore = scoreParagraph(node);

    assert.equal(score, pScore);
    assert.equal(score, 3);
  });

  it('scores P, LI, SPAN, and PRE using scoreParagraph', () => {
    const $ = cheerio.load(HTML.score19);
    const node = $('p').first();

    const score = scoreNode(node);
    const pScore = scoreParagraph(node);

    assert.equal(score, pScore);
    assert.equal(score, 19);
  });

  it('scores divs with 5', () => {
    const $ = cheerio.load(HTML.divScore5);
    const node = $('div').first();

    const score = scoreNode(node);

    assert.equal(score, 5);
  });

  it('scores the blockquote family with 3', () => {
    const $ = cheerio.load(HTML.blockquoteScore3);
    const node = $('blockquote').first();

    const score = scoreNode(node);

    assert.equal(score, 3);
  });

  it('scores a form with negative 3', () => {
    const $ = cheerio.load(HTML.formScoreNeg3);
    const node = $('form').first();

    const score = scoreNode(node);

    assert.equal(score, -3);
  });

  it('scores a TH element with negative 5', () => {
    const $ = cheerio.load(HTML.thScoreNeg5);
    const node = $('th').first();

    const score = scoreNode(node);

    assert.equal(score, -5);
  });
});
|
||||
|
||||
|
@ -1,35 +1,35 @@
|
||||
import {
|
||||
scoreCommas,
|
||||
scoreLength,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
/**
 * Score a paragraph using various methods. Things like number of
 * commas, etc. Higher is better.
 *
 * @param {Object} node - wrapped node exposing `text()` (presumably a cheerio
 *   selection - confirm with callers).
 * @returns {number} 0 when the trimmed text is shorter than 25 characters;
 *   otherwise 1 + comma points + length points, minus 1 if the text ends
 *   with a colon.
 */
export default function scoreParagraph(node) {
  let score = 1;
  const text = node.text().trim();
  const textLength = text.length;

  // If this paragraph is less than 25 characters, don't count it.
  if (textLength < 25) {
    return 0;
  }

  // Add points for any commas within this paragraph
  score += scoreCommas(text);

  // For every 50 characters in this paragraph, add another point. Up
  // to 3 points.
  score += scoreLength(textLength);

  // Articles can end with short paragraphs when people are being clever
  // but they can also end with short paragraphs setting up lists of junk
  // that we strip. This negative tweaks junk setup paragraphs just below
  // the cutoff threshold.
  if (text.slice(-1) === ':') {
    score -= 1;
  }

  return score;
}
|
||||
|
||||
|
@ -1,48 +1,48 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
import {
|
||||
scoreParagraph,
|
||||
} from './index'
|
||||
} from './index';
|
||||
|
||||
// Specs for scoreParagraph: score grows with commas and text length.
describe('Scoring utils', () => {
  describe('scoreParagraph(node)', () => {
    it('returns 0 if text is less than 25 chars', () => {
      const html = '<p><em>Foo</em> bar</p>';
      const $ = cheerio.load(html);
      const node = $('p').first();

      const score = scoreParagraph(node);

      assert.equal(score, 0);
    });

    it('returns 1 if text is > 25 chars and has 0 commas', () => {
      const $ = cheerio.load(HTML.score1);
      const node = $('p').first();

      const score = scoreParagraph(node);

      assert.equal(score, 1);
    });

    it('returns 3 if text is > 25 chars and has 2 commas', () => {
      const $ = cheerio.load(HTML.score3);
      const node = $('p').first();

      const score = scoreParagraph(node);

      assert.equal(score, 3);
    });

    it('returns 19 if text has 15 commas, ~600 chars', () => {
      const $ = cheerio.load(HTML.score19);
      const node = $('p').first();

      const score = scoreParagraph(node);

      assert.equal(score, 19);
    });
  });
});
|
||||
|
@ -1,23 +1,22 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import {
|
||||
setScore,
|
||||
getScore
|
||||
} from './index'
|
||||
getScore,
|
||||
} from './index';
|
||||
|
||||
// Specs for setScore: the amount written must be readable back via getScore.
describe('Scoring utils', () => {
  describe('setScore(node, $, amount)', () => {
    it("sets the specified amount as the node's score", () => {
      const $ = cheerio.load('<p>Foo</p>');
      let $node = $('p').first();

      const newScore = 25;
      $node = setScore($node, $, newScore);

      const score = getScore($node);
      // assert(value, message) only checks that `value` is truthy, so the
      // expected score has to be compared explicitly.
      assert.equal(score, newScore);
    });
  });
});
|
||||
|
@ -1,37 +1,36 @@
|
||||
import { cleanDatePublished } from 'cleaners';
|
||||
import {
|
||||
extractFromMeta,
|
||||
extractFromSelectors,
|
||||
} from 'utils/dom';
|
||||
import { extractFromUrl } from 'utils/text';
|
||||
|
||||
import {
|
||||
DATE_PUBLISHED_META_TAGS,
|
||||
DATE_PUBLISHED_SELECTORS,
|
||||
DATE_PUBLISHED_URL_RES,
|
||||
} from './constants'
|
||||
|
||||
import { cleanDatePublished } from 'cleaners'
|
||||
|
||||
import {
|
||||
extractFromMeta,
|
||||
extractFromSelectors,
|
||||
} from 'utils/dom'
|
||||
import { extractFromUrl } from 'utils/text'
|
||||
} from './constants';
|
||||
|
||||
// Generic extractor for an article's publication date. Sources are tried in
// order (meta tags, then selectors, then the URL) and the first hit wins.
const GenericDatePublishedExtractor = {
  /**
   * @param {Object} options
   * @param {Function} options.$ - document handle passed to the DOM helpers.
   * @param {string} options.url - page URL, searched last for a date string.
   * @param {Array} options.metaCache - forwarded to extractFromMeta.
   * @returns {*} whatever cleanDatePublished returns for the first value
   *   found, or null when no source produced one.
   */
  extract({ $, url, metaCache }) {
    let datePublished;
    // First, check to see if we have a matching meta tag
    // that we can make use of.
    // Don't try cleaning tags from this string
    datePublished = extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false);
    if (datePublished) return cleanDatePublished(datePublished);

    // Second, look through our selectors looking for potential
    // date_published's.
    datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);

    // Lastly, look to see if a dately string exists in the URL
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);

    return null;
  },
};

export default GenericDatePublishedExtractor;
|
||||
|
@ -1,97 +1,95 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import moment from 'moment'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
import moment from 'moment';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import GenericDatePublishedExtractor from './extractor'
|
||||
import HTML from './fixtures/html';
|
||||
import GenericDatePublishedExtractor from './extractor';
|
||||
|
||||
describe('GenericDatePublishedExtractor', () => {
|
||||
describe('extract($, metaCache)', () => {
|
||||
it('extracts datePublished from meta tags', () => {
|
||||
const $ = cheerio.load(HTML.datePublishedMeta.test)
|
||||
const metaCache = ["displaydate", "something-else"]
|
||||
const $ = cheerio.load(HTML.datePublishedMeta.test);
|
||||
const metaCache = ['displaydate', 'something-else'];
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url: '', metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
assert.equal(
|
||||
result,
|
||||
HTML.datePublishedMeta.result.toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('extracts datePublished from selectors', () => {
|
||||
const $ = cheerio.load(HTML.datePublishedSelectors.test)
|
||||
const metaCache = []
|
||||
const $ = cheerio.load(HTML.datePublishedSelectors.test);
|
||||
const metaCache = [];
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url: '', metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
assert.equal(
|
||||
result,
|
||||
HTML.datePublishedMeta.result.toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('extracts from url formatted /2012/08/01/etc', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const metaCache = []
|
||||
const url = 'https://example.com/2012/08/01/this-is-good'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const metaCache = [];
|
||||
const url = 'https://example.com/2012/08/01/this-is-good';
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
assert.equal(
|
||||
result,
|
||||
new Date('2012/08/01').toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('extracts from url formatted /2020-01-01', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const metaCache = []
|
||||
const url = 'https://example.com/2020-01-01/this-is-good'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const metaCache = [];
|
||||
const url = 'https://example.com/2020-01-01/this-is-good';
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
assert.equal(
|
||||
result,
|
||||
moment(new Date('2020-01-01')).toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('extracts from url formatted /2020/jan/01', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const metaCache = []
|
||||
const url = 'https://example.com/2020/jan/01/this-is-good'
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const metaCache = [];
|
||||
const url = 'https://example.com/2020/jan/01/this-is-good';
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
assert.equal(
|
||||
result,
|
||||
new Date('2020/jan/01').toISOString()
|
||||
)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('returns null if no date can be found', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const metaCache = []
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const metaCache = [];
|
||||
const result =
|
||||
GenericDatePublishedExtractor.extract(
|
||||
{ $, url: '', metaCache }
|
||||
)
|
||||
|
||||
assert.equal(result, null)
|
||||
})
|
||||
|
||||
})
|
||||
})
|
||||
);
|
||||
|
||||
assert.equal(result, null);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -1,20 +1,18 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
// import HTML from './fixtures/html'
|
||||
import GenericDekExtractor from './extractor'
|
||||
import GenericDekExtractor from './extractor';
|
||||
|
||||
describe('GenericDekExtractor', () => {
|
||||
describe('extract({ $, metaCache })', () => {
|
||||
|
||||
it('returns null if no dek can be found', () => {
|
||||
const $ = cheerio.load('<div></div>')
|
||||
const metaCache = []
|
||||
const $ = cheerio.load('<div></div>');
|
||||
const metaCache = [];
|
||||
const result =
|
||||
GenericDekExtractor.extract({ $, metaCache })
|
||||
|
||||
assert.equal(result, null)
|
||||
})
|
||||
GenericDekExtractor.extract({ $, metaCache });
|
||||
|
||||
})
|
||||
})
|
||||
assert.equal(result, null);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -1,50 +1,50 @@
|
||||
import cheerio from 'cheerio'
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import GenericContentExtractor from './content/extractor'
|
||||
import GenericTitleExtractor from './title/extractor'
|
||||
import GenericAuthorExtractor from './author/extractor'
|
||||
import GenericDatePublishedExtractor from './date-published/extractor'
|
||||
import GenericDekExtractor from './dek/extractor'
|
||||
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
|
||||
import GenericNextPageUrlExtractor from './next-page-url/extractor'
|
||||
import GenericContentExtractor from './content/extractor';
|
||||
import GenericTitleExtractor from './title/extractor';
|
||||
import GenericAuthorExtractor from './author/extractor';
|
||||
import GenericDatePublishedExtractor from './date-published/extractor';
|
||||
import GenericDekExtractor from './dek/extractor';
|
||||
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
|
||||
import GenericNextPageUrlExtractor from './next-page-url/extractor';
|
||||
|
||||
const GenericExtractor = {
|
||||
// This extractor is the default for all domains
|
||||
domain: '*',
|
||||
title: GenericTitleExtractor.extract,
|
||||
datePublished : GenericDatePublishedExtractor.extract,
|
||||
datePublished: GenericDatePublishedExtractor.extract,
|
||||
author: GenericAuthorExtractor.extract,
|
||||
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
|
||||
leadImageUrl: GenericLeadImageUrlExtractor.extract,
|
||||
dek: GenericDekExtractor.extract,
|
||||
nextPageUrl: GenericNextPageUrlExtractor.extract,
|
||||
|
||||
extract: function(options) {
|
||||
let { html } = options
|
||||
extract(options) {
|
||||
const { html } = options;
|
||||
|
||||
if (html) {
|
||||
const $ = cheerio.load(html)
|
||||
options.$ = $
|
||||
const $ = cheerio.load(html);
|
||||
options.$ = $;
|
||||
}
|
||||
|
||||
const title = this.title(options)
|
||||
const datePublished = this.datePublished(options)
|
||||
const author = this.author(options)
|
||||
const content = this.content({ ...options, title })
|
||||
const leadImageUrl = this.leadImageUrl(options)
|
||||
const dek = this.dek(options)
|
||||
const nextPageUrl = this.nextPageUrl(options)
|
||||
const title = this.title(options);
|
||||
const datePublished = this.datePublished(options);
|
||||
const author = this.author(options);
|
||||
const content = this.content({ ...options, title });
|
||||
const leadImageUrl = this.leadImageUrl(options);
|
||||
const dek = this.dek(options);
|
||||
const nextPageUrl = this.nextPageUrl(options);
|
||||
|
||||
return {
|
||||
title,
|
||||
author,
|
||||
datePublished: datePublished ? datePublished : null,
|
||||
datePublished: datePublished || null,
|
||||
dek,
|
||||
leadImageUrl,
|
||||
content,
|
||||
nextPageUrl,
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
},
|
||||
};
|
||||
|
||||
export default GenericExtractor
|
||||
export default GenericExtractor;
|
||||
|
@ -1,62 +1,62 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import HTML from './fixtures/html';
|
||||
|
||||
import GenericLeadImageUrlExtractor from './extractor'
|
||||
import GenericLeadImageUrlExtractor from './extractor';
|
||||
|
||||
describe('GenericLeadImageUrlExtractor', () => {
|
||||
describe('extract({ $, content, metaCache })', () => {
|
||||
it('returns og:image first', () => {
|
||||
const $ = cheerio.load(HTML.og.test)
|
||||
const content = $('*').first()
|
||||
const metaCache = ['og:image']
|
||||
const $ = cheerio.load(HTML.og.test);
|
||||
const content = $('*').first();
|
||||
const metaCache = ['og:image'];
|
||||
|
||||
const result =
|
||||
GenericLeadImageUrlExtractor.extract(
|
||||
{ $, content, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.og.result)
|
||||
})
|
||||
assert.equal(result, HTML.og.result);
|
||||
});
|
||||
|
||||
it('returns twitter:image', () => {
|
||||
const $ = cheerio.load(HTML.twitter.test)
|
||||
const content = $('*').first()
|
||||
const metaCache = ['twitter:image']
|
||||
const $ = cheerio.load(HTML.twitter.test);
|
||||
const content = $('*').first();
|
||||
const metaCache = ['twitter:image'];
|
||||
|
||||
const result =
|
||||
GenericLeadImageUrlExtractor.extract(
|
||||
{ $, content, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.twitter.result)
|
||||
})
|
||||
assert.equal(result, HTML.twitter.result);
|
||||
});
|
||||
|
||||
it('finds images based on scoring', () => {
|
||||
const $ = cheerio.load(HTML.scoring.test)
|
||||
const content = $('*').first()
|
||||
const metaCache = []
|
||||
const $ = cheerio.load(HTML.scoring.test);
|
||||
const content = $('*').first();
|
||||
const metaCache = [];
|
||||
|
||||
const result =
|
||||
GenericLeadImageUrlExtractor.extract(
|
||||
{ $, content, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.scoring.result)
|
||||
})
|
||||
assert.equal(result, HTML.scoring.result);
|
||||
});
|
||||
|
||||
it('returns image based on selectors', () => {
|
||||
const $ = cheerio.load(HTML.selectors.test)
|
||||
const content = $('*').first()
|
||||
const metaCache = []
|
||||
const $ = cheerio.load(HTML.selectors.test);
|
||||
const content = $('*').first();
|
||||
const metaCache = [];
|
||||
|
||||
const result =
|
||||
GenericLeadImageUrlExtractor.extract(
|
||||
{ $, content, metaCache }
|
||||
)
|
||||
);
|
||||
|
||||
assert.equal(result, HTML.selectors.result)
|
||||
})
|
||||
})
|
||||
})
|
||||
assert.equal(result, HTML.selectors.result);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -1,34 +1,34 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert';
|
||||
import fs from 'fs';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import GenericNextPageUrlExtractor from './extractor'
|
||||
import GenericNextPageUrlExtractor from './extractor';
|
||||
|
||||
describe('GenericNextPageUrlExtractor', () => {
|
||||
it('returns most likely next page url', () => {
|
||||
const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
|
||||
const $ = cheerio.load(html)
|
||||
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
|
||||
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'
|
||||
const html = fs.readFileSync('./fixtures/ars.html', 'utf8');
|
||||
const $ = cheerio.load(html);
|
||||
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
|
||||
const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2';
|
||||
|
||||
const nextPage = GenericNextPageUrlExtractor.extract({
|
||||
$,
|
||||
url
|
||||
})
|
||||
url,
|
||||
});
|
||||
|
||||
assert.equal(nextPage, next)
|
||||
})
|
||||
assert.equal(nextPage, next);
|
||||
});
|
||||
|
||||
it('returns null if there is no likely next page', () => {
|
||||
const html = `<div><p>HI</p></div>`
|
||||
const $ = cheerio.load(html)
|
||||
const url = 'http://example.com/foo/bar'
|
||||
const html = '<div><p>HI</p></div>';
|
||||
const $ = cheerio.load(html);
|
||||
const url = 'http://example.com/foo/bar';
|
||||
|
||||
const nextPage = GenericNextPageUrlExtractor.extract({
|
||||
$,
|
||||
url
|
||||
})
|
||||
url,
|
||||
});
|
||||
|
||||
assert.equal(nextPage, null)
|
||||
})
|
||||
})
|
||||
assert.equal(nextPage, null);
|
||||
});
|
||||
});
|
||||
|
@ -1,38 +1,38 @@
|
||||
export const DIGIT_RE = /\d/
|
||||
export const DIGIT_RE = /\d/;
|
||||
|
||||
// A list of words that, if found in link text or URLs, likely mean that
|
||||
// this link is not a next page link.
|
||||
export const EXTRANEOUS_LINK_HINTS = [
|
||||
'print',
|
||||
'archive',
|
||||
'comment',
|
||||
'discuss',
|
||||
'e-mail',
|
||||
'email',
|
||||
'share',
|
||||
'reply',
|
||||
'all',
|
||||
'login',
|
||||
'sign',
|
||||
'single',
|
||||
'adx',
|
||||
'entry-unrelated'
|
||||
]
|
||||
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
|
||||
'print',
|
||||
'archive',
|
||||
'comment',
|
||||
'discuss',
|
||||
'e-mail',
|
||||
'email',
|
||||
'share',
|
||||
'reply',
|
||||
'all',
|
||||
'login',
|
||||
'sign',
|
||||
'single',
|
||||
'adx',
|
||||
'entry-unrelated',
|
||||
];
|
||||
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');
|
||||
|
||||
// Match any link text/classname/id that looks like it could mean the next
|
||||
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
|
||||
// mean last page.
|
||||
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
|
||||
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i');
|
||||
|
||||
// Match any link text/classname/id that looks like it is an end link: things
|
||||
// like "first", "last", "end", etc.
|
||||
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
|
||||
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');
|
||||
|
||||
// Match any link text/classname/id that looks like it means the previous
|
||||
// page.
|
||||
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
|
||||
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');
|
||||
|
||||
// Match any phrase that looks like it could be page, or paging, or pagination
|
||||
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
|
||||
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');
|
||||
|
||||
|
@ -0,0 +1,10 @@
|
||||
export { default as scoreSimilarity } from './score-similarity';
|
||||
export { default as scoreLinkText } from './score-link-text';
|
||||
export { default as scorePageInLink } from './score-page-in-link';
|
||||
export { default as scoreExtraneousLinks } from './score-extraneous-links';
|
||||
export { default as scoreByParents } from './score-by-parents';
|
||||
export { default as scorePrevLink } from './score-prev-link';
|
||||
export { default as shouldScore } from './should-score';
|
||||
export { default as scoreBaseUrl } from './score-base-url';
|
||||
export { default as scoreNextLinkText } from './score-next-link-text';
|
||||
export { default as scoreCapLinks } from './score-cap-links';
|
@ -0,0 +1,11 @@
|
||||
/**
 * Scores a candidate link by whether it matches the article's base URL.
 *
 * @param {string} href - URL of the candidate link.
 * @param {RegExp} baseRegex - Pattern the candidate URL is tested against.
 * @returns {number} -25 when `href` does not match `baseRegex`, otherwise 0.
 */
export default function scoreBaseUrl(href, baseRegex) {
  // If the baseUrl isn't part of this URL, penalize this
  // link. It could still be the link, but the odds are lower.
  // Example:
  // http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
  return baseRegex.test(href) ? 0 : -25;
}
|
@ -0,0 +1,23 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scoreBaseUrl from './score-base-url';
|
||||
import { makeBaseRegex } from '../score-links';
|
||||
|
||||
// Checks scoreBaseUrl against a regex built from a known base URL.
describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    const badUrl = 'http://foo.com/foo/bar';
    const baseRegex = makeBaseRegex(baseUrl);

    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25);
  });

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar';
    // This URL extends the base URL, so no penalty is expected.
    const goodUrl = 'http://example.com/foo/bar/bat';
    const baseRegex = makeBaseRegex(baseUrl);

    assert.equal(scoreBaseUrl(goodUrl, baseRegex), 0);
  });
});
|
||||
|
@ -0,0 +1,52 @@
|
||||
import { range } from 'utils';
|
||||
import {
|
||||
NEGATIVE_SCORE_RE,
|
||||
POSITIVE_SCORE_RE,
|
||||
PAGE_RE,
|
||||
} from 'utils/dom/constants';
|
||||
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
|
||||
|
||||
/**
 * Builds a signature string for a node from its `class` and `id`
 * attributes, separated by a single space. A missing attribute
 * contributes an empty string.
 *
 * @param {Object} $link - Node wrapper exposing `attr(name)`.
 * @returns {string} `"<class> <id>"`.
 */
function makeSig($link) {
  const className = $link.attr('class') || '';
  const id = $link.attr('id') || '';

  return `${className} ${id}`;
}
|
||||
|
||||
/**
 * Scores a link by the class/id signatures of its ancestors.
 *
 * Walks up from the link's parent, one level per element produced by
 * `range(0, 4)`. A paging-like signature adds 25 (at most once); a
 * signature that looks negative/extraneous and not positive subtracts
 * 25 (at most once).
 *
 * @param {Object} $link - Wrapped link node exposing `parent()`.
 * @returns {number} The accumulated bonus/penalty.
 */
export default function scoreByParents($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  let $parent = $link.parent();
  let positiveMatch = false;
  let negativeMatch = false;
  let score = 0;

  Array.from(range(0, 4)).forEach(() => {
    // Ran out of ancestors; the remaining iterations are no-ops.
    if ($parent.length === 0) {
      return;
    }

    const parentData = makeSig($parent);

    // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true;
      score += 25;
    }

    // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty.
    if (
      !negativeMatch &&
      NEGATIVE_SCORE_RE.test(parentData) &&
      EXTRANEOUS_LINK_HINTS_RE.test(parentData) &&
      !POSITIVE_SCORE_RE.test(parentData)
    ) {
      negativeMatch = true;
      score -= 25;
    }

    $parent = $parent.parent();
  });

  return score;
}
|
||||
|
@ -0,0 +1,35 @@
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import scoreByParents from './score-by-parents';
|
||||
|
||||
// Checks the ancestor-signature bonus and penalty on small HTML fixtures.
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
      <div>
        <div class="next-page">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();

    assert.equal(scoreByParents($link), 25);
  });

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
      <div>
        <div class="comment">
          <a href="blah">Next page</a>
        </div>
      </div>
    `;
    const $ = cheerio.load(html);
    const $link = $('a').first();

    assert.equal(scoreByParents($link), -25);
  });
});
|
||||
|
@ -0,0 +1,19 @@
|
||||
import {
|
||||
NEXT_LINK_TEXT_RE,
|
||||
CAP_LINK_TEXT_RE,
|
||||
} from '../constants';
|
||||
|
||||
/**
 * Scores link data that looks like a "cap" link ("first", "last", "end").
 *
 * @param {string} linkData - Text tested against the cap and next patterns.
 * @returns {number} -65 when `linkData` matches both CAP_LINK_TEXT_RE and
 *   NEXT_LINK_TEXT_RE, otherwise 0.
 */
export default function scoreCapLinks(linkData) {
  // Cap links are links like "last", etc.
  //
  // NOTE(review): the original intent reads as "a cap link that is also
  // a 'next' link is fine; one that is NOT marked 'next' is probably bad,
  // so penalize it". The condition below penalizes when the link DOES
  // match the next pattern, which looks inverted. Behavior is kept as-is
  // because the existing test pins -65 for cap+next text; confirm the
  // intended polarity before changing it.
  if (CAP_LINK_TEXT_RE.test(linkData) && NEXT_LINK_TEXT_RE.test(linkData)) {
    return -65;
  }

  return 0;
}
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scoreCapLinks from './score-cap-links';
|
||||
|
||||
// Pins the current scoring of cap-link text ("last", etc.).
describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    const linkData = 'foo next Last page';

    assert.equal(scoreCapLinks(linkData), -65);
  });

  it('returns 0 if does not match a cap link', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scoreCapLinks(linkData), 0);
  });
});
|
||||
|
@ -0,0 +1,10 @@
|
||||
import { EXTRANEOUS_LINK_HINTS_RE } from '../constants';
|
||||
|
||||
/**
 * Penalizes a URL that contains one of the extraneous link hints.
 *
 * @param {string} href - URL of the candidate link.
 * @returns {number} -25 when `href` matches EXTRANEOUS_LINK_HINTS_RE,
 *   otherwise 0.
 */
export default function scoreExtraneousLinks(href) {
  // If the URL itself contains extraneous values, give a penalty.
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0;
}
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scoreExtraneousLinks from './score-extraneous-links';
|
||||
|
||||
// Checks the penalty for URLs containing an extraneous hint word.
describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = 'http://example.com/email-link';

    assert.equal(scoreExtraneousLinks(url), -25);
  });

  it('returns 0 if does not match extraneous text', () => {
    const url = 'http://example.com/asdf';

    assert.equal(scoreExtraneousLinks(url), 0);
  });
});
|
||||
|
@ -0,0 +1,30 @@
|
||||
import { IS_DIGIT_RE } from 'utils/text/constants';
|
||||
|
||||
/**
 * Scores link text that parses as a page number.
 *
 * If the link text can be parsed as a number, give it a minor
 * bonus, with a slight bias towards lower numbered pages. This is
 * so that pages that might not have 'next' in their text can still
 * get scored, and sorted properly by score.
 *
 * @param {string} linkText - Visible text of the candidate link.
 * @param {number} pageNum - Current page number; a falsy value skips
 *   the "already past this page" penalty.
 * @returns {number} 0 when the trimmed text does not match IS_DIGIT_RE;
 *   otherwise -30 for numbers below 2 or `max(0, 10 - n)`, minus 50
 *   when `pageNum >= n`.
 */
export default function scoreLinkText(linkText, pageNum) {
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkTextAsNum = parseInt(linkText, 10);

  // If it's the first page, we already got it on the first call.
  // Give it a negative score. Otherwise, up to page 10, give a
  // small bonus.
  let score = linkTextAsNum < 2 ? -30 : Math.max(0, 10 - linkTextAsNum);

  // If it appears that the current page number is greater than
  // this link's page number, it's a very bad sign. Give it a big
  // penalty.
  if (pageNum && pageNum >= linkTextAsNum) {
    score -= 50;
  }

  return score;
}
|
@ -0,0 +1,22 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scoreLinkText from './score-link-text';
|
||||
|
||||
// Checks the numeric link-text bonus and the current-page penalty.
describe('scoreLinkText(linkText, pageNum)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8);
  });

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5);
  });

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30);
  });

  it('penalizes -50 if pageNum is >= link text as num', () => {
    // 10 - 4 = 6 bonus, minus the 50 penalty.
    assert.equal(scoreLinkText('4', 5), -44);
  });
});
|
||||
|
@ -0,0 +1,10 @@
|
||||
import { NEXT_LINK_TEXT_RE } from '../constants';
|
||||
|
||||
/**
 * Rewards link data that reads like a "next page" link.
 *
 * @param {string} linkData - Text tested against NEXT_LINK_TEXT_RE.
 * @returns {number} 50 when the pattern matches, otherwise 0.
 */
export default function scoreNextLinkText(linkData) {
  // Things like "next", ">>", etc.
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0;
}
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scoreNextLinkText from './score-next-link-text';
|
||||
|
||||
// Checks the bonus for link data containing "next"-style text.
describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = 'foo bar Next page';

    assert.equal(scoreNextLinkText(linkData), 50);
  });

  it('returns 0 if does not contain common next link text', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scoreNextLinkText(linkData), 0);
  });
});
|
||||
|
@ -0,0 +1,10 @@
|
||||
/**
 * Rewards a link that carries a page number.
 *
 * @param {?number} pageNum - Page number found in the link, if any.
 * @param {boolean} isWp - Whether the page is a WordPress page.
 * @returns {number} 50 when `pageNum` is truthy and `isWp` is falsy,
 *   otherwise 0.
 */
export default function scorePageInLink(pageNum, isWp) {
  // page in the link = bonus. Intentionally ignore wordpress because
  // their ?p=123 link style gets caught by this even though it means
  // separate documents entirely.
  return pageNum && !isWp ? 50 : 0;
}
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scorePageInLink from './score-page-in-link';
|
||||
|
||||
// Checks the page-number bonus and its WordPress exception.
describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50);
  });

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0);
  });

  it('returns 0 if page is wordpress', () => {
    assert.equal(scorePageInLink(10, true), 0);
  });
});
|
||||
|
@ -0,0 +1,11 @@
|
||||
import { PREV_LINK_TEXT_RE } from '../constants';
|
||||
|
||||
/**
 * Heavily penalizes link data that reads like a "previous page" link.
 *
 * @param {string} linkData - Text tested against PREV_LINK_TEXT_RE.
 * @returns {number} -200 when the pattern matches, otherwise 0.
 */
export default function scorePrevLink(linkData) {
  // If the link has something like "previous", it's definitely
  // an old link, skip it.
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0;
}
|
@ -0,0 +1,18 @@
|
||||
import assert from 'assert';
|
||||
|
||||
import scorePrevLink from './score-prev-link';
|
||||
|
||||
// Checks the penalty for link data containing "previous"-style text.
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = 'foo next previous page';

    assert.equal(scorePrevLink(linkData), -200);
  });

  it('returns 0 if does not match a prev link', () => {
    const linkData = 'foo bar WOW GREAT';

    assert.equal(scorePrevLink(linkData), 0);
  });
});
|
||||
|
@ -0,0 +1,23 @@
|
||||
import difflib from 'difflib';
|
||||
|
||||
/**
 * Adjusts a positive candidate score by how similar the link URL is to
 * the article URL.
 *
 * @param {number} score - Score accumulated so far for this link.
 * @param {string} articleUrl - URL of the current article.
 * @param {string} href - URL of the candidate link.
 * @returns {number} `score` plus the similarity modifier when `score`
 *   is greater than 0; otherwise 0.
 */
export default function scoreSimilarity(score, articleUrl, href) {
  // Do this last and only if we have a real candidate, because it's
  // potentially expensive computationally. Compare the link to this
  // URL using difflib to get the % similarity of these URLs. On a
  // sliding scale, subtract points from this link based on
  // similarity.
  if (score <= 0) {
    return 0;
  }

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();

  // Subtract .2 from diffPercent when calculating the modifier,
  // which means that if it's less than 20% different, we give a
  // bonus instead. Ex:
  //  3% different = +42.5 points
  // 20% different = 0 points
  // 30% different = -25 points
  // NOTE(review): an earlier version of this comment described a .1
  // offset (10% different = 0 points); confirm which threshold is
  // intended. The arithmetic below is unchanged.
  const diffPercent = 1.0 - similarity;
  const diffModifier = -(250 * (diffPercent - 0.2));

  return score + diffModifier;
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue