feat: ported scoring methods with unit tests

pull/1/head
Adam Pash 8 years ago
parent 97087bd626
commit d4a19e6a27

@ -11,12 +11,12 @@ x `_paragraphize`
## Scoring
- `_get_score`
- `_set_score`
- `_add_score`
x `_get_score`
x `_set_score`
x `_add_score`
- `_score_content`
- `_score_node`
- `_score_paragraph`
x `_score_node`
x `_score_paragraph`
## Top Candidate

@ -956,3 +956,9 @@ const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
// Case-insensitive matcher built from the whitelist alternation string.
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')
// Combines the whitelist and blacklist alternations into one pattern.
// NOTE(review): inside a RegExp source `!` matches a literal exclamation
// mark; it does not negate the group that follows — confirm this pattern
// behaves as intended. (candidates_blacklist is defined outside this view.)
export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i')
// Matches exactly the tag names p, li, span and pre (case-insensitive).
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i')
// Matches exactly the tag names td, blockquote, ol, ul and dl (case-insensitive).
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i')
// Matches exactly the tag names address and form (case-insensitive).
export const BAD_TAGS = new RegExp('^(address|form)$', 'i')

@ -0,0 +1,10 @@
import {
getScore,
setScore,
} from './index'
// Increments a node's score by `amount` and hands the same node back.
// The current score is read with getScore and the sum is written back
// with setScore.
export default function addScore(node, $, amount) {
  const current = getScore(node, $)
  setScore(node, $, current + amount)
  return node
}

@ -0,0 +1,28 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
addScore,
getScore,
} from './index'
describe('Scoring utils', () => {
  describe('addScore(node, $, amount)', () => {
    // The starting score comes from the markup's score attribute.
    it(`adds the specified amount to a node's score`, () => {
      const $ = cheerio.load('<p score="25">Foo</p>')
      const scored = addScore($('p').first(), $, 25)

      assert.equal(getScore(scored, $), 50)
    })

    // No score attribute in the markup: the amount becomes the score.
    it(`adds score if score not yet set (assumes score is 0)`, () => {
      const $ = cheerio.load('<p>Foo</p>')
      const scored = addScore($('p').first(), $, 25)

      assert.equal(getScore(scored, $), 25)
    })
  })
})

@ -0,0 +1,11 @@
import { addScore } from './index'
// Passes a quarter of `score` up to the node's parent via addScore.
// Returns the node it was given; the node's own score is not modified here.
// NOTE(review): if node.parent() always yields a wrapper object (even an
// empty one), the guard below never skips — confirm against the DOM library.
export default function addToParent(node, $, score) {
  const parent = node.parent()
  if (!parent) {
    return node
  }

  addScore(parent, $, score * .25)
  return node
}

@ -0,0 +1,24 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
addToParent,
getScore,
} from './index'
describe('Scoring utils', () => {
  describe('addToParent(node, $, amount)', () => {
    // Parent starts at 25 and receives 40 * .25 = 10; the child keeps its 40.
    it(`adds 1/4 of a node's score to its parent`, () => {
      const html = '<div score="25"><p score="40">Foo</p></div>'
      const $ = cheerio.load(html)
      let node = $('p').first()

      node = addToParent(node, $, 40)

      assert.equal(getScore(node.parent(), $), 35)
      assert.equal(getScore(node, $), 40)
    })
  })
})

@ -0,0 +1,33 @@
// Long paragraph text (15 commas, roughly 600 characters) shared by the
// high-scoring fixtures so the literal is written only once.
const LONG_PARAGRAPH_TEXT = 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.'

// HTML snippets used as fixtures by the scoring tests. Each key names the
// score the tests expect for the snippet's main element.
const HTML = {
  score1: `
<p>Lorem ipsum dolor sit amet</p>
`,
  score3: `
<p>Lorem ipsum, dolor sit, amet</p>
`,
  score19: `
<p>${LONG_PARAGRAPH_TEXT}</p>
`,
  divScore5: `
<div>Lorem ipsum, dolor sit, amet</div>
`,
  blockquoteScore3: `
<blockquote>Lorem ipsum, dolor sit, amet</blockquote>
`,
  formScoreNeg3: `
<form><label>Lorem ipsum, dolor sit, amet</label></form>
`,
  thScoreNeg5: `
<th>Lorem ipsum, dolor sit, amet</th>
`,
  score44: `
<p class="entry">${LONG_PARAGRAPH_TEXT}</p>
`,
  score44Parent: `
<div>
<p class="entry">${LONG_PARAGRAPH_TEXT}</p>
</div>
`,
}

export default HTML

@ -0,0 +1,29 @@
import {
getScore,
scoreNode,
getWeight,
addToParent,
} from './index'
// Returns the node's existing score when getScore finds a truthy one.
// Otherwise computes a fresh score with scoreNode, adds getWeight(node)
// to it unless `weightNodes` is false, passes the result to addToParent,
// and returns it.
// NOTE(review): the freshly computed score is returned but is not written
// onto the node in this function.
export default function getOrInitScore(node, $, weightNodes=true) {
  const existing = getScore(node, $)
  if (existing) {
    return existing
  }

  const base = scoreNode(node)
  const initial = weightNodes ? base + getWeight(node) : base
  addToParent(node, $, initial)
  return initial
}

@ -0,0 +1,61 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import {
getOrInitScore,
getScore,
} from './index'
describe('getOrInitScore(node, $)', () => {
  // Loads the markup and returns both the document and its first <p>.
  const firstParagraph = (html) => {
    const $ = cheerio.load(html)
    return { $, node: $('p').first() }
  }

  describe('when score set', () => {
    it(`returns score if node's score already set`, () => {
      const { $, node } = firstParagraph('<p score="40">Foo</p>')

      assert.equal(getOrInitScore(node, $), 40)
    })
  })

  describe('when no score set', () => {
    it(`returns 0 if no class/id and text < 25 chars`, () => {
      const { $, node } = firstParagraph('<p>Foo</p>')

      assert.equal(getOrInitScore(node, $), 0)
    })

    it(`returns score if no class/id and has commas/length`, () => {
      const { $, node } = firstParagraph(HTML.score19)

      assert.equal(getOrInitScore(node, $), 19)
    })

    it(`returns greater score if weighted class/id is set`, () => {
      const { $, node } = firstParagraph(HTML.score44)

      assert.equal(getOrInitScore(node, $), 44)
    })

    it(`gives 1/4 of its score to its parent`, () => {
      const { $, node } = firstParagraph(HTML.score44Parent)
      const score = getOrInitScore(node, $)

      assert.equal(getScore(node.parent(), $), score/4)
    })
  })
})

@ -0,0 +1,6 @@
// Reads a node's score from its `score` attribute.
// Returns the score as a number (fractional part preserved), or null when
// the attribute is missing, not numeric, or zero.
export default function getScore(node, $) {
  // parseFloat rather than parseInt: stored scores can be fractional (a
  // parent receives score * .25, and length bonuses are fractions), and
  // parseInt would silently truncate them on every read.
  return parseFloat($(node).attr('score')) || null
}

@ -0,0 +1,25 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { getScore } from './index'
describe('Scoring utils', () => {
  describe('getScore(node, $)', () => {
    it("returns null if the node has no score set", () => {
      const $ = cheerio.load('<p>Foo</p>')
      const result = getScore($('p').first(), $)

      assert.equal(result, null)
    })

    // Checks both the type and the value of the parsed attribute.
    it("returns 25 if the node has a score attr of 25", () => {
      const $ = cheerio.load('<p score="25">Foo</p>')
      const paragraph = $('p').first()

      assert.equal(typeof getScore(paragraph, $), 'number')
      assert.equal(getScore(paragraph, $), 25)
    })
  })
})

@ -1,2 +1,11 @@
// Scoring
// Barrel file: re-exports each scoring helper's default export under a
// named binding so callers (and the helpers themselves) can import them
// all from './index'.
export { default as getWeight } from './get-weight'
// Reading and writing the score stored on a node.
export { default as getScore } from './get-score'
// Text-based scoring pieces used for paragraphs.
export { default as scoreCommas } from './score-commas'
export { default as scoreLength } from './score-length'
export { default as scoreParagraph } from './score-paragraph'
export { default as setScore } from './set-score'
export { default as addScore } from './add-score'
export { default as addToParent } from './add-to-parent'
export { default as getOrInitScore } from './get-or-init-score'
// Tag-based scoring of a single node.
export { default as scoreNode } from './score-node'

@ -0,0 +1,5 @@
// Counts the commas in `text`; the count is the score (0 when there are none).
export default function scoreCommas(text) {
  const commas = text.match(/,/g)
  return commas === null ? 0 : commas.length
}

@ -0,0 +1,20 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { scoreCommas } from './index'
describe('Scoring utils', () => {
  describe('scoreCommas(text)', () => {
    it(`returns 0 if text has no commas`, () => {
      assert.equal(scoreCommas("Foo bar"), 0)
    })

    // Each input has one more comma than the previous one.
    it(`returns a point for every comma in the text`, () => {
      const cases = [
        ['Foo, bar', 1],
        ['Foo, bar, baz', 2],
        ['Foo, bar, baz, bat', 3],
      ]

      cases.forEach(([text, expected]) => {
        assert.equal(scoreCommas(text), expected)
      })
    })
  })
})

@ -0,0 +1,26 @@
// Tag names (case-insensitive) that get the larger deduction below.
const idkRe = new RegExp('^(p|pre)$', 'i')

// Converts a text length into a length bonus between 0 and 3.
// The length is measured in 50-character chunks; p/pre tags have 2 chunks
// deducted and every other tag 1.25, and the result is clamped to [0, 3].
// With the default tag ('p') that means lengths up to 100 score 0 and
// lengths of 250 or more score 3.
// Returns 0 when textLength is not a positive number.
export default function scoreLength(textLength, tagName='p') {
  const chunks = textLength / 50

  // Written as a negated "> 0" so that NaN also falls through to 0.
  if (!(chunks > 0)) {
    return 0
  }

  // No idea why p or pre are being tamped down here
  // but just following the source for now
  // Not even sure why tagName is included here,
  // since this is only being called from the context
  // of scoreParagraph
  const deduction = idkRe.test(tagName) ? 2 : 1.25
  const lengthBonus = chunks - deduction

  return Math.min(Math.max(lengthBonus, 0), 3)
}

@ -0,0 +1,22 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { scoreLength } from './index'
describe('Scoring utils', () => {
  describe('scoreLength(textLength, tagName)', () => {
    it(`returns 0 if length < 50 chars`, () => {
      assert.equal(scoreLength(30), 0)
    })

    // Pairs of [text length, expected bonus] for the default tag.
    it(`returns varying scores but maxes out at 3`, () => {
      const expectations = [
        [150, 1],
        [199, 1.98],
        [200, 2],
        [250, 3],
        [500, 3],
        [1500, 3],
      ]

      expectations.forEach(([length, bonus]) => {
        assert.equal(scoreLength(length), bonus)
      })
    })
  })
})

@ -0,0 +1,26 @@
import { scoreParagraph } from './index'
import {
PARAGRAPH_SCORE_TAGS,
CHILD_CONTENT_TAGS,
BAD_TAGS,
} from '../constants'
// Scores a single node from its tag name. Tags matching
// PARAGRAPH_SCORE_TAGS are delegated to scoreParagraph; every other tag
// gets a fixed value: div 5, CHILD_CONTENT_TAGS 3, BAD_TAGS -3, th -5,
// anything else 0. The checks run in that order.
export default function scoreNode(node) {
  const element = node.get(0)
  const tag = element.tagName

  if (PARAGRAPH_SCORE_TAGS.test(tag)) {
    return scoreParagraph(node)
  }
  if (tag === 'div') {
    return 5
  }
  if (CHILD_CONTENT_TAGS.test(tag)) {
    return 3
  }
  if (BAD_TAGS.test(tag)) {
    return -3
  }
  if (tag === 'th') {
    return -5
  }

  return 0
}

@ -0,0 +1,95 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import {
scoreNode,
scoreParagraph,
} from './index'
describe('scoreNode(node)', () => {
  // The four paragraph cases each carry a distinct title so a failure
  // report identifies which fixture broke.
  it(`scores P, LI, SPAN, and PRE using scoreParagraph (short text scores 0)`, () => {
    const html = '<p><em>Foo</em> bar</p>'
    const $ = cheerio.load(html)
    const node = $('p').first()

    const score = scoreNode(node)
    const pScore = scoreParagraph(node)

    assert.equal(score, pScore)
    assert.equal(score, 0)
  })

  it(`scores P, LI, SPAN, and PRE using scoreParagraph (no commas scores 1)`, () => {
    const $ = cheerio.load(HTML.score1)
    const node = $('p').first()

    const score = scoreNode(node)
    const pScore = scoreParagraph(node)

    assert.equal(score, pScore)
    assert.equal(score, 1)
  })

  it(`scores P, LI, SPAN, and PRE using scoreParagraph (two commas scores 3)`, () => {
    const $ = cheerio.load(HTML.score3)
    const node = $('p').first()

    const score = scoreNode(node)
    const pScore = scoreParagraph(node)

    assert.equal(score, pScore)
    assert.equal(score, 3)
  })

  it(`scores P, LI, SPAN, and PRE using scoreParagraph (long text scores 19)`, () => {
    const $ = cheerio.load(HTML.score19)
    const node = $('p').first()

    const score = scoreNode(node)
    const pScore = scoreParagraph(node)

    assert.equal(score, pScore)
    assert.equal(score, 19)
  })

  it(`scores divs with 5`, () => {
    const $ = cheerio.load(HTML.divScore5)
    const node = $('div').first()

    assert.equal(scoreNode(node), 5)
  })

  it(`scores the blockquote family with 3`, () => {
    const $ = cheerio.load(HTML.blockquoteScore3)
    const node = $('blockquote').first()

    assert.equal(scoreNode(node), 3)
  })

  it(`scores a form with negative 3`, () => {
    const $ = cheerio.load(HTML.formScoreNeg3)
    const node = $('form').first()

    assert.equal(scoreNode(node), -3)
  })

  it(`scores a TH element with negative 5`, () => {
    const $ = cheerio.load(HTML.thScoreNeg5)
    const node = $('th').first()

    assert.equal(scoreNode(node), -5)
  })
})

@ -0,0 +1,35 @@
import {
scoreCommas,
scoreLength,
} from './index'
// Scores a paragraph-like node from its text. Higher is better.
// Text shorter than 25 characters scores 0 outright. Otherwise the score
// is a base of 1, plus scoreCommas(text), plus scoreLength(text length),
// minus 1 when the text ends with a colon.
export default function scoreParagraph(node) {
  const text = node.text()
  const textLength = text.length

  // Paragraphs under 25 characters are not counted at all.
  if (textLength < 25) {
    return 0
  }

  const basePoints = 1
  const commaPoints = scoreCommas(text)
  const lengthPoints = scoreLength(textLength)

  // Articles can end with short paragraphs when people are being clever
  // but they can also end with short paragraphs setting up lists of junk
  // that we strip. This penalty nudges junk setup paragraphs just below
  // the cutoff threshold.
  const colonPenalty = text.slice(-1) === ':' ? 1 : 0

  return basePoints + commaPoints + lengthPoints - colonPenalty
}

@ -0,0 +1,48 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import {
scoreParagraph,
} from './index'
describe('Scoring utils', () => {
  describe('scoreParagraph(node)', () => {
    // Loads the markup and scores its first <p>.
    const scoreFirstParagraph = (html) => {
      const $ = cheerio.load(html)
      return scoreParagraph($('p').first())
    }

    it(`returns 0 if text is less than 25 chars`, () => {
      assert.equal(scoreFirstParagraph('<p><em>Foo</em> bar</p>'), 0)
    })

    it(`returns 1 if text is > 25 chars and has 0 commas`, () => {
      assert.equal(scoreFirstParagraph(HTML.score1), 1)
    })

    it(`returns 3 if text is > 25 chars and has 2 commas`, () => {
      assert.equal(scoreFirstParagraph(HTML.score3), 3)
    })

    it(`returns 19 if text has 15 commas, ~600 chars`, () => {
      assert.equal(scoreFirstParagraph(HTML.score19), 19)
    })
  })
})

@ -0,0 +1,7 @@
// Writes `score` into the node's `score` attribute and returns the node
// that was passed in.
export default function setScore(node, $, score) {
  const wrapped = $(node)
  wrapped.attr('score', score)
  return node
}

@ -0,0 +1,23 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
setScore,
getScore
} from './index'
describe('Scoring utils', () => {
  describe('setScore(node, $, amount)', () => {
    it("sets the specified amount as the node's score", () => {
      const $ = cheerio.load('<p>Foo</p>')
      let node = $('p').first()
      const newScore = 25

      node = setScore(node, $, newScore)
      const score = getScore(node, $)

      // assert.equal compares the two values; a bare assert(score, newScore)
      // would only check that score is truthy and use newScore as the
      // failure message.
      assert.equal(score, newScore)
    })
  })
})

@ -1,5 +1,7 @@
// Decides whether a node holds enough text to be returned as an article.
// param: node (a cheerio node)
// return: boolean — true when the node's trimmed text is at least 100
// characters long
export default function nodeIsSufficient(node) {
  const trimmedText = node.text().trim()
  const minimumLength = 100
  return trimmedText.length >= minimumLength
}

Loading…
Cancel
Save