feat: find top candidate function

8 years ago · 9da7a6f2a9
parent e2600231ac
commit 9da7a6f2a9
9 changed files with 214 additions and 5 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,5 +1,4 @@
 Next: Work on score-content, making sure it's working as intended (seems to be)
-Get better sense of when cheerio returns a raw node and when a cheerio object

 - `extract` (this kicks it all off)
 x `node_is_sufficient`
@ -21,9 +20,14 @@ x `_score_paragraph`

 ## Top Candidate

- `_find_top_candidate`
+x `_find_top_candidate`
 - `extract_clean_node`
 - `_clean_conditionally`


 Make sure weightNodes flag is being passed properly
+Get better sense of when cheerio returns a raw node and when a cheerio object
+  Remove $ from function calls to getScore
+  Remove $ whenever possible
+Test if .is method is faster than regex methods
+Separate constants into activity-specific folders (dom, scoring)
--- a/src/extractor/generic/utils/constants.js
+++ b/src/extractor/generic/utils/constants.js
@ -629,6 +629,8 @@ export const NON_TOP_CANDIDATE_TAGS = [
    'meta',
 ]

+// Case-insensitive matcher that accepts a string only when the whole
+// string is one of the tag names listed in NON_TOP_CANDIDATE_TAGS.
+const NON_TOP_CANDIDATE_TAGS_PATTERN = `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`
+export const NON_TOP_CANDIDATE_TAGS_RE =
+  new RegExp(NON_TOP_CANDIDATE_TAGS_PATTERN, 'i')

 // A list of selectors that specify, very clearly, either hNews or other
 // very content-specific style content, like Blogger templates.
--- a/src/extractor/generic/utils/dom/index.js
+++ b/src/extractor/generic/utils/dom/index.js
@ -2,4 +2,5 @@
 export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
 export { default as brsToPs } from './brs-to-ps'
 export { default as paragraphize } from './paragraphize'
+export { textLength, linkDensity } from './link-density'
 export { convertToParagraphs, convertNodeTo } from './convert-to-paragraphs'
--- a/src/extractor/generic/utils/dom/link-density.js
+++ b/src/extractor/generic/utils/dom/link-density.js
@ -1,7 +1,7 @@
 // Determines what percentage of the text
 // in a node is link text
 // Takes a node, returns a float
-export default function linkDensity(node) {
+export function linkDensity(node) {
  const totalTextLength = textLength(node.text())

  const linkText = node.find('a').text()
@ -16,7 +16,7 @@ export default function linkDensity(node) {
  }
 }

-function textLength(text) {
+export function textLength(text) {
  return text.trim()
             .replace(/\s+/g, ' ')
             .length
--- a/src/extractor/generic/utils/scoring/find-top-candidate.js
+++ b/src/extractor/generic/utils/scoring/find-top-candidate.js
@ -0,0 +1,113 @@
+import { NON_TOP_CANDIDATE_TAGS_RE } from '../constants'
+import { getScore } from './index'
+import {
+  linkDensity,
+  textLength
+} from '../dom/index'
+
+// After we've calculated scores, loop through all of the possible
+// candidate nodes we found and find the one with the highest score.
+//
+// $ - cheerio instance whose nodes may carry a `score` attribute
+//
+// Returns a cheerio object: the best-scoring node (after its siblings
+// have been considered by mergeSiblings), or the body / first element
+// when no node scored above zero.
+export default function findTopCandidate($) {
+  let candidate
+  let topScore = 0
+
+  $('*[score]').each((index, node) => {
+    // Ignore tags like BR, HR, etc
+    if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
+      return
+    }
+
+    const score = getScore($(node))
+
+    if (score > topScore) {
+      topScore = score
+      candidate = node
+    }
+  })
+
+  // If we don't have a candidate, return the body
+  // or whatever the first element is
+  if (!candidate) {
+    // A selection object is truthy even when it matched nothing, so
+    // check its length to decide whether the fallback is needed.
+    const body = $('body')
+    return body.length ? body : $('*').first()
+  }
+
+  candidate = mergeSiblings(candidate, topScore, $)
+
+  return $(candidate)
+}
+
+// Now that we have a top_candidate, look through the siblings of
+// it to see if any of them are decently scored. If they are, they
+// may be split parts of the content (Like two divs, a preamble and
+// a body.) Example:
+// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
+//
+// candidate - raw node picked as the top candidate
+// topScore  - score of that candidate
+// $         - cheerio instance the candidate belongs to
+//
+// Returns the candidate unchanged when it has no parent. Otherwise
+// returns a new cheerio-wrapped <div> to which the candidate and every
+// qualifying sibling have been appended.
+export function mergeSiblings(candidate, topScore, $) {
+  const $candidate = $(candidate)
+  const $parent = $candidate.parent()
+
+  if (!$parent.length) {
+    return candidate
+  }
+
+  const siblingScoreThreshold = Math.max(10, topScore * 0.2)
+  const candidateClass = $candidate.attr('class')
+  const wrappingDiv = $('<div></div>')
+
+  $parent.children().each((index, child) => {
+    // Ignore tags like BR, HR, etc
+    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
+      return
+    }
+
+    const $child = $(child)
+    const childScore = getScore($child)
+
+    // Siblings without a score are never merged
+    if (!childScore) {
+      return
+    }
+
+    if (child === candidate) {
+      wrappingDiv.append(child)
+      return
+    }
+
+    let contentBonus = 0
+    // extract to scoreLinkDensity() TODO
+    const density = linkDensity($child)
+
+    // If sibling has a very low link density,
+    // give it a small bonus
+    if (density < 0.05) {
+      contentBonus = contentBonus + 20
+    }
+
+    // If sibling has a high link density,
+    // give it a penalty
+    if (density >= 0.5) {
+      contentBonus = contentBonus - 20
+    }
+
+    // If sibling node has the same class as
+    // candidate, give it a bonus
+    // NOTE(review): two nodes that both lack a class attribute also
+    // compare equal here - confirm that is intended.
+    if ($child.attr('class') === candidateClass) {
+      contentBonus = contentBonus + topScore * 0.2
+    }
+
+    const newScore = childScore + contentBonus
+
+    if (newScore >= siblingScoreThreshold) {
+      wrappingDiv.append(child)
+      return
+    }
+
+    // Paragraphs that miss the threshold can still qualify on their
+    // text: long with few links, or short, link-free and containing
+    // a sentence end.
+    if (child.tagName === 'p') {
+      const childContent = $child.text()
+      const childContentLength = textLength(childContent)
+
+      if (childContentLength > 80 && density < 0.25) {
+        wrappingDiv.append(child)
+      } else if (childContentLength <= 80 && density === 0 &&
+                 hasSentenceEnd(childContent)) {
+        wrappingDiv.append(child)
+      }
+    }
+  })
+
+  return wrappingDiv
+}
+
+// TODO Extract into util - AP
+// Given a string, return True if it appears to have an ending sentence
+// within it, false otherwise.
+//
+// A sentence end is a literal period followed by a space or by the end
+// of the string. A regex literal is used because in a quoted string
+// '\.' collapses to '.', which would match any character.
+const SENTENCE_END_RE = /\.( |$)/
+function hasSentenceEnd(text) {
+  return SENTENCE_END_RE.test(text)
+}
--- a/src/extractor/generic/utils/scoring/find-top-candidate.test.js
+++ b/src/extractor/generic/utils/scoring/find-top-candidate.test.js
@ -0,0 +1,60 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+import fs from 'fs'
+
+import HTML from './fixtures/html'
+
+import {
+  getScore,
+  findTopCandidate,
+  scoreContent
+} from './index'
+
+// Specs for findTopCandidate, driven by the inline HTML fixtures
+// (findDom1..3, topBody) plus one full-page fixture read from disk.
+describe('findTopCandidate($)', () => {
+  // findDom1: a div scored 100 wrapping a paragraph scored 1
+  it("finds the top candidate from simple case", () => {
+    const $ = cheerio.load(HTML.findDom1)
+
+    const topCandidate = findTopCandidate($)
+
+    assert.equal(getScore(topCandidate), 100)
+  })
+
+  // findDom2: the best-scoring node (article, 50) sits inside a
+  // lower-scoring div (10)
+  it("finds the top candidate from a nested case", () => {
+    const $ = cheerio.load(HTML.findDom2)
+
+    const topCandidate = findTopCandidate($)
+
+    // this is wrapped in a div so checking
+    // the score of the first child
+    assert.equal(getScore(topCandidate.children().first()), 50)
+  })
+
+  // findDom3: a <br> carries the highest score (1000) but must be
+  // skipped in favor of the article scored 50
+  it("ignores tags like BR", () => {
+    const $ = cheerio.load(HTML.findDom3)
+
+    const topCandidate = findTopCandidate($)
+
+    assert.equal(getScore(topCandidate), 50)
+  })
+
+  // topBody: no node has a score attribute, so the fallback applies
+  it("returns BODY if no candidates found", () => {
+    const $ = cheerio.load(HTML.topBody)
+
+    const topCandidate = findTopCandidate($)
+
+    assert.equal(topCandidate.get(0).tagName, 'body')
+  })
+
+  it("appends a sibling with a good enough score", () => {
+    // NOTE(review): this relative path is resolved against the
+    // process working directory, not this file's directory - confirm
+    // where the test runner is launched from.
+    // HTML comments are stripped from the markup before loading.
+    const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
+                 .replace(/<!--[\s\S]*?-->/g, '')
+
+    let $ = cheerio.load(html)
+    // presumably assigns the score attributes findTopCandidate reads
+    $ = scoreContent($)
+
+    const topCandidate = findTopCandidate($)
+
+    // Expected text length of the merged result for this fixture
+    assert.equal($(topCandidate).text().length, 3652)
+  })
+})
+
--- a/src/extractor/generic/utils/scoring/fixtures/html.js
+++ b/src/extractor/generic/utils/scoring/fixtures/html.js
@ -54,6 +54,34 @@ const HTML = {
      </div>
    `,
  },
+
+  // findTopCandidate
+  findDom1: `
+    <div score="100">
+      <p score="1">Lorem ipsum etc</p>
+    </div>
+  `,
+  findDom2: `
+    <div score="10">
+      <article score="50">
+        <p score="1">Lorem ipsum etc</p>
+      </article>
+    </div>
+  `,
+  findDom3: `
+    <article score="50">
+      <p score="1">Lorem ipsum br</p>
+      <br score="1000" />
+    </article>
+  `,
+  // Markup without any score attributes; used to exercise the
+  // "no candidate found" fallback.
+  topBody: `
+    <body>
+      <article>
+        <p>Lorem ipsum etc</p>
+        <br />
+      </article>
+    </body>
+  `,
 }

 export default HTML
--- a/src/extractor/generic/utils/scoring/get-score.js
+++ b/src/extractor/generic/utils/scoring/get-score.js
@ -2,5 +2,5 @@
 // the node's score attribute
 // returns null if no score set
 export default function getScore(node, $) {
-  return parseFloat($(node).attr('score')) || null
+  return parseFloat(node.attr('score')) || null
 }
--- a/src/extractor/generic/utils/scoring/index.js
+++ b/src/extractor/generic/utils/scoring/index.js
@ -10,3 +10,4 @@ export { default as addToParent } from './add-to-parent'
 export { default as getOrInitScore } from './get-or-init-score'
 export { default as scoreNode } from './score-node'
 export { default as scoreContent } from './score-content'
+export { default as findTopCandidate } from './find-top-candidate'