debugging: cheerio isn't always consistent in setting scores

pull/1/head
Adam Pash 8 years ago
parent 47ac7e9803
commit 74694ba8e2

File diff suppressed because one or more lines are too long

@ -10,7 +10,7 @@ describe('GenericContentExtractor', function() {
this.timeout(1000000)
describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8')
// Array.from(range(1, 100)).map((i) => {
// console.log(i)
@ -19,7 +19,7 @@ describe('GenericContentExtractor', function() {
// ))
// })
const result = clean(GenericContentExtractor.extract(
{ $: null, html, url: 'http://example.com' }
{ $: null, html, url: 'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html' }
))
// console.log(result)
})

@ -3,13 +3,13 @@ import {
setScore,
} from './index'
export default function addScore(node, $, amount) {
export default function addScore($node, $, amount) {
try {
const score = getOrInitScore(node, $) + amount
setScore(node, $, score)
const score = getOrInitScore($node, $) + amount
setScore($node, $, score)
} catch(e) {
console.debug(e)
} finally {
return node
return $node
}
}

@ -21,11 +21,27 @@ export default function scoreContent($, weightNodes=true) {
})
})
$('p, pre').each((index, node) => {
// TODO: Why is this not scoring every <p>
// the first time through?
// Somehow it succeeds if scorePs is run twice.
// See the Vulture example in score-content.test.js.
// It appears to have something to do with adding
// scores to parent nodes (comment that out and all
// children are scored).
scorePs($, weightNodes)
scorePs($, weightNodes)
// scorePs($, weightNodes)
return $
}
function scorePs($, weightNodes) {
$('p, pre').not('[score]').each((index, node) => {
// The raw score for this paragraph, before we add any parent/child
// scores.
let $node = $(node)
const rawScore = scoreNode($node)
$node = setScore($node, $, getOrInitScore($node, $, weightNodes))
// Add the individual content score to the parent node
@ -37,8 +53,6 @@ export default function scoreContent($, weightNodes=true) {
addScoreTo($parent.parent(), $, rawScore/2, weightNodes)
}
})
return $
}
function convertSpans($node, $) {

@ -7,7 +7,7 @@ import HTML from './fixtures/html'
import {
scoreContent,
getScore
getScore,
} from './index'
// TODO: Walk through these and sanity check my scores
@ -36,4 +36,20 @@ describe('scoreContent($, weightNodes)', () => {
assert.equal(getScore($('article').first()), 65.5)
})
// This is a strange case. On the first pass, scoreContent
// doesn't score every paragraph node for some reason.
it("scores this Vulture article the same", () => {
const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8')
let $ = cheerio.load(html)
$ = scoreContent($)
// console.log("NUMBER OF SCORED Ps", $('p[score]').length)
// fs.writeFile('./vult.html', $.html())
// fs.writeFile('./vulttop.html', $.html(top))
// $('p').each((index, node) => {
// console.log(node.attribs.score)
// })
// assert.equal(getScore($('[score]').first()), 65.5)
})
})

@ -7,7 +7,7 @@ import {
// commas, etc. Higher is better.
export default function scoreParagraph(node) {
let score = 1
const text = node.text()
const text = node.text().trim()
const textLength = text.length
// If this paragraph is less than 25 characters, don't count it.

@ -1,7 +1,12 @@
import { REMOVE_EMPTY_SELECTORS } from './constants'
export default function removeEmpty(article, $) {
$(REMOVE_EMPTY_SELECTORS, article).remove()
export default function removeEmpty($article, $) {
// $(REMOVE_EMPTY_SELECTORS, $article).remove()
$article.find('p').each((index, p) => {
const $p = $(p)
if ($p.text().trim() === '') $p.remove()
})
return $
}

@ -14,6 +14,14 @@ describe('removeEmpty($)', () => {
assertClean(result.html(), HTML.removeEmptyP.after)
})
it("removes P tags with only space", () => {
const html = `<div><p> </p></div>`
let $ = cheerio.load(html)
let result = removeEmpty($('*').first(), $)
assertClean(result.html(), `<div></div>`)
})
it("does not remove empty DIV tags", () => {
let $ = cheerio.load(HTML.removeEmptyP.before)

Loading…
Cancel
Save