feat: find top candidate function

pull/1/head
Adam Pash 8 years ago
parent e2600231ac
commit 9da7a6f2a9

@ -1,5 +1,4 @@
Next: Work on score-content, making sure it's working as intended (seems to be)
Get better sense of when cheerio returns a raw node and when a cheerio object
- `extract` (this kicks it all off)
x `node_is_sufficient`
@ -21,9 +20,14 @@ x `_score_paragraph`
## Top Candidate
- `_find_top_candidate`
x `_find_top_candidate`
- `extract_clean_node`
- `_clean_conditionally`
Make sure weightNodes flag is being passed properly
Get a better sense of when cheerio returns a raw node and when it returns a cheerio object
Remove $ from function calls to getScore
Remove $ whenever possible
Test if .is method is faster than regex methods
Separate constants into activity-specific folders (dom, scoring)

@ -629,6 +629,8 @@ export const NON_TOP_CANDIDATE_TAGS = [
'meta',
]
// Case-insensitive pattern built from NON_TOP_CANDIDATE_TAGS. It is anchored
// with ^ and $, so it matches only when the tested string is exactly one of
// the listed tag names (not merely a string that contains one).
export const NON_TOP_CANDIDATE_TAGS_RE =
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i')
// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.

@ -2,4 +2,5 @@
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'
export { default as paragraphize } from './paragraphize'
export { textLength, linkDensity } from './link-density'
export { convertToParagraphs, convertNodeTo } from './convert-to-paragraphs'

@ -1,7 +1,7 @@
// Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
export default function linkDensity(node) {
export function linkDensity(node) {
const totalTextLength = textLength(node.text())
const linkText = node.find('a').text()
@ -16,7 +16,7 @@ export default function linkDensity(node) {
}
}
function textLength(text) {
export function textLength(text) {
return text.trim()
.replace(/\s+/g, ' ')
.length

@ -0,0 +1,113 @@
import { NON_TOP_CANDIDATE_TAGS_RE } from '../constants'
import { getScore } from './index'
import {
linkDensity,
textLength
} from '../dom/index'
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
//
// $ is the cheerio-style selector function for the document being examined.
//
// Returns the result of $(...) applied to whatever mergeSiblings hands back
// for the best-scoring element. When no element qualifies, returns $('body')
// if that selection is non-empty, otherwise the first element of $('*').
export default function findTopCandidate($) {
  let candidate
  // Starting at 0 with a strict `>` means only positive scores can win.
  let topScore = 0

  $('*[score]').each((index, node) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
      return
    }

    const score = getScore($(node))
    if (score > topScore) {
      topScore = score
      candidate = node
    }
  })

  // If we don't have a candidate, return the body
  // or whatever the first element is
  if (!candidate) {
    // A selection object is truthy even when it matched nothing, so the
    // fallback has to be chosen by looking at its length.
    const body = $('body')
    return body.length > 0 ? body : $('*').first()
  }

  return $(mergeSiblings(candidate, topScore, $))
}
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (Like two divs, a preamble and
// a body.) Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//
// candidate - the raw node chosen as top candidate
// topScore  - the candidate's score
// $         - the cheerio-style selector function for the document
//
// Returns `candidate` untouched when it has no parent. Otherwise returns a
// new wrapping <div> selection into which the candidate and every accepted
// sibling have been appended.
export function mergeSiblings(candidate, topScore, $) {
  const $candidate = $(candidate)
  const $parent = $candidate.parent()
  if (!$parent.length) {
    return candidate
  }

  const siblingScoreThreshold = Math.max(10, topScore * 0.2)
  const candidateClass = $candidate.attr('class')
  const wrappingDiv = $('<div></div>')

  $parent.children().each((index, child) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
      return
    }

    const $child = $(child)
    const childScore = getScore($child)
    // Children without a (non-zero) score are never merged.
    if (!childScore) {
      return
    }

    if (child === candidate) {
      wrappingDiv.append(child)
      return
    }

    let contentBonus = 0
    // extract to scoreLinkDensity() TODO
    const density = linkDensity($child)

    // If sibling has a very low link density,
    // give it a small bonus
    if (density < 0.05) {
      contentBonus += 20
    }

    // If sibling has a high link density,
    // give it a penalty
    if (density >= 0.5) {
      contentBonus -= 20
    }

    // If sibling node has the same class as
    // candidate, give it a bonus
    // NOTE(review): two missing class attributes also compare equal here;
    // confirm whether class-less siblings are meant to receive this bonus.
    if ($child.attr('class') === candidateClass) {
      contentBonus += topScore * 0.2
    }

    const newScore = childScore + contentBonus
    if (newScore >= siblingScoreThreshold) {
      wrappingDiv.append(child)
      return
    }

    // Paragraphs that missed the threshold can still be kept on the
    // strength of their text alone.
    if (child.tagName === 'p') {
      // `child` is a raw node, so its text has to be read via $(child).
      const childContent = $child.text()
      const childContentLength = textLength(childContent)

      if (childContentLength > 80 && density < 0.25) {
        wrappingDiv.append(child)
      } else if (childContentLength <= 80 && density === 0 &&
                 hasSentenceEnd(childContent)) {
        wrappingDiv.append(child)
      }
    }
  })

  return wrappingDiv
}
// TODO Extract into util - AP
// Given a string, return true if it appears to have an ending sentence
// within it, false otherwise. A sentence end is a literal period that is
// followed by a space or by the end of the string.
//
// A regex literal is used on purpose: in a string passed to `new RegExp`,
// '\.' collapses to '.', which would match any character instead of a period.
const SENTENCE_END_RE = /\.( |$)/
function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text)
}

@ -0,0 +1,60 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import HTML from './fixtures/html'
import {
getScore,
findTopCandidate,
scoreContent
} from './index'
// Specs for findTopCandidate: each case loads an HTML fixture, runs
// findTopCandidate on it, and checks the score, tag, or text of the result.
describe('findTopCandidate($)', () => {
  it('finds the top candidate from simple case', () => {
    const result = findTopCandidate(cheerio.load(HTML.findDom1))

    assert.equal(getScore(result), 100)
  })

  it('finds the top candidate from a nested case', () => {
    const result = findTopCandidate(cheerio.load(HTML.findDom2))

    // this is wrapped in a div so checking
    // the score of the first child
    const firstChild = result.children().first()
    assert.equal(getScore(firstChild), 50)
  })

  it('ignores tags like BR', () => {
    const result = findTopCandidate(cheerio.load(HTML.findDom3))

    assert.equal(getScore(result), 50)
  })

  it('returns BODY if no candidates found', () => {
    const result = findTopCandidate(cheerio.load(HTML.topBody))

    assert.equal(result.get(0).tagName, 'body')
  })

  it('appends a sibling with a good enough score', () => {
    // Strip HTML comments from the fixture before parsing it.
    const rawHtml = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
    const html = rawHtml.replace(/<!--[\s\S]*?-->/g, '')

    const $scored = scoreContent(cheerio.load(html))
    const result = findTopCandidate($scored)

    assert.equal($scored(result).text().length, 3652)
  })
})

@ -54,6 +54,34 @@ const HTML = {
</div>
`,
},
// findTopCandidate
findDom1: `
<div score="100">
<p score="1">Lorem ipsum etc</p>
</div>
`,
findDom2: `
<div score="10">
<article score="50">
<p score="1">Lorem ipsum etc</p>
</article>
</div>
`,
findDom3: `
<article score="50">
<p score="1">Lorem ipsum br</p>
<br score="1000" />
</article>
`,
topBody: `
<body>
<article>
<p>Lorem ipsum etc</p>
<br />
</article>
<body>
`,
}
export default HTML

@ -2,5 +2,5 @@
// the node's score attribute
// returns null if no score set
// Because of `|| null`, a score attribute that parses to 0 or NaN also
// yields null, not just a missing attribute.
// NOTE(review): two return statements are present and only the first can
// execute. They look like the before/after lines of a change shown together
// (the first wraps `node` with `$`, the second calls `.attr` on `node`
// directly) — confirm which one is intended.
export default function getScore(node, $) {
return parseFloat($(node).attr('score')) || null
return parseFloat(node.attr('score')) || null
}

@ -10,3 +10,4 @@ export { default as addToParent } from './add-to-parent'
export { default as getOrInitScore } from './get-or-init-score'
export { default as scoreNode } from './score-node'
export { default as scoreContent } from './score-content'
export { default as findTopCandidate } from './find-top-candidate'

Loading…
Cancel
Save