diff --git a/src/extractors/generic/content/utils/scoring/score-content.js b/src/extractors/generic/content/utils/scoring/score-content.js index e10e720d..cbffc9bf 100644 --- a/src/extractors/generic/content/utils/scoring/score-content.js +++ b/src/extractors/generic/content/utils/scoring/score-content.js @@ -21,30 +21,27 @@ export default function scoreContent($, weightNodes=true) { }) }) - // TODO Why is this not scoring every p - // the first time through? - // Somehow it succeeds if I run it twice. - // See Vulture example in score-content.test.js - // It appears to have something to do with adding - // scores to parent nodes (comment that out and all - // children are scored). scorePs($, weightNodes) - scorePs($, weightNodes) - // scorePs($, weightNodes) return $ } function scorePs($, weightNodes) { - $('p, pre').not('[score]').each((index, node) => { + $('p, pre').toArray().map((node) => { // The raw score for this paragraph, before we add any parent/child // scores. let $node = $(node) - const rawScore = scoreNode($node) - $node = setScore($node, $, getOrInitScore($node, $, weightNodes)) + return $node + }).forEach(($node) => { + // The parent scoring has to be done in a separate loop + // because otherwise scoring the parent overwrites + // the score added to the child + // Add the individual content score to the parent node + const rawScore = scoreNode($node) + const $parent = $node.parent() addScoreTo($parent, $, rawScore, weightNodes) if ($parent) { @@ -52,6 +49,7 @@ function scorePs($, weightNodes) { // grandparent addScoreTo($parent.parent(), $, rawScore/2, weightNodes) } + }) } diff --git a/src/extractors/generic/content/utils/scoring/score-content.test.js b/src/extractors/generic/content/utils/scoring/score-content.test.js index d6b7a7de..05b547ec 100644 --- a/src/extractors/generic/content/utils/scoring/score-content.test.js +++ b/src/extractors/generic/content/utils/scoring/score-content.test.js @@ -36,20 +36,12 @@ describe('scoreContent($, weightNodes)', () => { assert.equal(getScore($('article').first()), 65.5) }) - // This is a strange case. On the first pass, scoreContent - // doesn't score every paragraph node for some reason. - it("scores this Vulture article the same", () => { + it("scores this Vulture article", () => { const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8') let $ = cheerio.load(html) $ = scoreContent($) - // console.log("NUMBER OF SCORED Ps", $('p[score]').length) - // fs.writeFile('./vult.html', $.html()) - // fs.writeFile('./vulttop.html', $.html(top)) - // $('p').each((index, node) => { - // console.log(node.attribs.score) - // }) - - // assert.equal(getScore($('[score]').first()), 65.5) + + assert.equal($('p[score]').length, 62) }) })