Stripping unlikely candidates from DOM

8 years ago · 777e11c25c
parent 89a2cfbb82
commit 777e11c25c
8 changed files with 144 additions and 1 deletions
--- a/src/extractor/generic/constants.js
+++ b/src/extractor/generic/constants.js
@ -942,3 +942,16 @@ export const BLOCK_LEVEL_TAGS = [
    'ul',
    'video',
 ]
+
+
+// The removal is implemented as a blacklist and whitelist, this test finds
+// blacklisted elements that aren't whitelisted. We do this all in one
+// expression-both because it's only one pass, and because this skips the
+// serialization for whitelisted nodes.
+const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|')
+export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i')
+
+const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
+export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')
+
+export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i')
--- a/src/extractor/generic/utils/fixtures/html.js
+++ b/src/extractor/generic/utils/fixtures/html.js
@ -1,4 +1,5 @@
 const HTML = {
+  // getWeight fixtures
  positiveId: `
    <div id="entry">
      <p>Ooo good one</p>
@ -44,6 +45,55 @@ const HTML = {
      <p>Ooo good one</p>
    </div>
  `,
+
+  // stripUnlikelyCandidates
+  noMatches: `
+    <div id="foo">
+      <p>Ooo good one</p>
+    </div>
+  `,
+  whitelistMatch: {
+    before: `
+      <div class="header">Stuff</div>
+      <div class="article">
+        <p>Ooo good one</p>
+      </div>
+    `,
+    after: `
+      <div class="article">
+        <p>Ooo good one</p>
+      </div>
+    `,
+  },
+  whiteAndBlack: {
+    before: `
+      <div class="article adbox">
+        <p>Ooo good one</p>
+      </div>
+    `,
+    after: `
+      <div class="article adbox">
+        <p>Ooo good one</p>
+      </div>
+    `,
+  },
+  whiteInsideBlack: {
+    before: `
+      <div>
+        <div class="adbox">
+          <div class="article">
+            <p>Ooo good one</p>
+          </div>
+        </div>
+        <div>Something unrelated</div>
+      </div>
+    `,
+    after: `
+      <div>
+        <div>Something unrelated</div>
+      </div>
+    `,
+  },
 }

 export default HTML
--- a/src/extractor/generic/utils/get-weight.js
+++ b/src/extractor/generic/utils/get-weight.js
--- a/src/extractor/generic/utils/get-weight.test.js
+++ b/src/extractor/generic/utils/get-weight.test.js
--- a/src/extractor/generic/utils/index.js
+++ b/src/extractor/generic/utils/index.js
@ -1 +1,2 @@
-export { default as getWeight } from './getWeight'
+export { default as getWeight } from './get-weight'
+export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
--- a/src/extractor/generic/utils/strip-unlikely-candidates.js
+++ b/src/extractor/generic/utils/strip-unlikely-candidates.js
@ -0,0 +1,37 @@
+import {
+  CANDIDATES_WHITELIST,
+  CANDIDATES_BLACKLIST,
+  UNLIKELY_RE,
+} from '../constants'
+
+// ## NOTES:
+// This is a working first pass. If/when we start optimizing,
+// this function is a good candidate for it.
+
+export default function stripUnlikelyCandidates($) {
+  //  Loop through the provided document and remove any non-link nodes
+  //  that are unlikely candidates for article content.
+  //
+  //  Links are ignored because there are very often links to content
+  //  that are identified as non-body-content, but may be inside
+  //  article-like content.
+  //
+  //  :param $: a cheerio object to strip nodes from
+  //  :return $: the cleaned cheerio object
+  $('*').not('a').each(function(index, element) {
+    const classes = $(element).attr('class')
+    const id = $(element).attr('id')
+    if (!id && !classes) {
+      return
+    } else {
+      const classAndId = `${classes || ''} ${id || ''}`
+      if (CANDIDATES_WHITELIST.test(classAndId)) {
+        return
+      } else if (CANDIDATES_BLACKLIST.test(classAndId)) {
+        return $(element).remove()
+      }
+    }
+  })
+
+  return $
+}
--- a/src/extractor/generic/utils/strip-unlikely-candidates.test.js
+++ b/src/extractor/generic/utils/strip-unlikely-candidates.test.js
@ -0,0 +1,39 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import { clean } from './test-helpers'
+import HTML from './fixtures/html'
+import {
+  stripUnlikelyCandidates
+} from './index'
+
+describe('Generic Extractor Utils', () => {
+  describe('stripUnlikelyCandidates(node)', () => {
+    it("returns original doc if no matches found", () => {
+      const $ = cheerio.load(HTML.noMatches)
+      const stripped = stripUnlikelyCandidates($)
+      assert.equal(stripped.html(), HTML.noMatches)
+    })
+
+    it("strips unlikely matches from the doc", () => {
+      const $ = cheerio.load(HTML.whitelistMatch.before)
+      const stripped = clean(stripUnlikelyCandidates($).html())
+      assert.equal(stripped, clean(HTML.whitelistMatch.after))
+    })
+
+    it("keeps likely matches even when they also match the blacklist", () => {
+      const $ = cheerio.load(HTML.whiteAndBlack.before)
+      const stripped = clean(stripUnlikelyCandidates($).html())
+      assert.equal(stripped, clean(HTML.whiteAndBlack.after))
+    })
+
+    it("removed likely matches when inside blacklist node", () => {
+      const $ = cheerio.load(HTML.whiteInsideBlack.before)
+      const stripped = clean(stripUnlikelyCandidates($).html())
+      assert.equal(stripped, clean(HTML.whiteInsideBlack.after))
+    })
+
+
+  })
+})
+
--- a/src/extractor/generic/utils/test-helpers.js
+++ b/src/extractor/generic/utils/test-helpers.js
@ -0,0 +1,3 @@
+export function clean(string) {
+  return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, '')
+}