Lots of progress on score-content

8 years ago · bd7ed77f23
parent cc734c7e7d
commit bd7ed77f23
10 changed files with 220 additions and 126 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,4 +1,4 @@
-Next: Continue working on paragraphize; move p tags outside other p tags (do this when not converting br)
+Next: Work on score-content, making sure it's working as intended (seems to be)

 - `extract` (this kicks it all off)
 x `node_is_sufficient`
@ -23,6 +23,3 @@ x `_score_paragraph`
 - `_find_top_candidate`
 - `extract_clean_node`
 - `_clean_conditionally`
-
-
-Add helper methods to clean up tests
--- a/src/extractor/generic/utils/constants.js
+++ b/src/extractor/generic/utils/constants.js
@ -634,43 +634,51 @@ export const NON_TOP_CANDIDATE_TAGS = [
 // very content-specific style content, like Blogger templates.
 // More examples here: http://microformats.org/wiki/blog-post-formats
 export const HNEWS_CONTENT_SELECTORS = [
-    {
-        //selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "entry-content")]'),
-        must_exist: {
-            classes: ['hentry', 'entry-content'],
-        }
-    },
-    {
-        //selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry-content")]'),
-        must_exist: {
-            classes: ['entry', 'entry-content'],
-        }
-    },
-    {
-        //selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry_content")]'),
-        must_exist: {
-            classes: ['entry', 'entry_content'],
-        }
-    },
-    {
-        //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post-body")]'),
-        must_exist: {
-            classes: ['post', 'post-body'],
-        }
-    },
-    {
-        //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post_body")]'),
-        must_exist: {
-            classes: ['post', 'post_body'],
-        }
-    },
-    {
-        //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "postbody")]'),
-        must_exist: {
-            classes: ['post', 'postbody'],
-        }
-    },
+  // Each entry is a [parentSelector, childSelector] pair of CSS class
+  // selectors; scoreContent looks for the child inside the parent.
+  // Every selector needs its leading dot: a bare 'entry' would match
+  // <entry> elements instead of elements carrying the "entry" class.
+  ['.hentry', '.entry-content'],
+  ['.entry', '.entry-content'],
+  ['.entry', '.entry_content'],
+  ['.post', '.postbody'],
+  ['.post', '.post_body'],
+  ['.post', '.post-body'],
 ]
+// export const HNEWS_CONTENT_SELECTORS = [
+//     {
+//         //selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "entry-content")]'),
+//         must_exist: {
+//             classes: ['hentry', 'entry-content'],
+//         }
+//     },
+//     {
+//         //selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry-content")]'),
+//         must_exist: {
+//             classes: ['entry', 'entry-content'],
+//         }
+//     },
+//     {
+//         //selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry_content")]'),
+//         must_exist: {
+//             classes: ['entry', 'entry_content'],
+//         }
+//     },
+//     {
+//         //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post-body")]'),
+//         must_exist: {
+//             classes: ['post', 'post-body'],
+//         }
+//     },
+//     {
+//         //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post_body")]'),
+//         must_exist: {
+//             classes: ['post', 'post_body'],
+//         }
+//     },
+//     {
+//         //selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "postbody")]'),
+//         must_exist: {
+//             classes: ['post', 'postbody'],
+//         }
+//     },
+// ]

 export const PHOTO_HINTS = [
    'figure',
--- a/src/extractor/generic/utils/dom/convert-to-paragraphs.js
+++ b/src/extractor/generic/utils/dom/convert-to-paragraphs.js
@ -12,7 +12,7 @@ import { DIV_TO_P_BLOCK_TAGS } from '../constants'
 //   :return cheerio object with new p elements
 //   (By-reference mutation, though. Returned just for convenience.)

-export default function convertToParagraphs($) {
+export function convertToParagraphs($) {
  $ = brsToPs($)
  $ = convertDivs($)
  $ = convertSpans($)
@ -25,7 +25,7 @@ function convertDivs($) {
    const convertable = $(div).children()
      .not(DIV_TO_P_BLOCK_TAGS).length == 0
    if (convertable) {
-      convertNodeToP(div, $)
+      convertNodeTo(div, $)
    }
  })

@ -36,83 +36,14 @@ function convertSpans($) {
  $('span').each((index, span) => {
    const convertable = $(span).parents('p, div').length == 0
    if (convertable) {
-      convertNodeToP(span, $)
+      convertNodeTo(span, $)
    }
  })

  return $
 }

-export function convertNodeToP(node, $) {
-  $(node).replaceWith(`<p>${$(node).contents()}</p>`)
+export function convertNodeTo(node, $, tag='p') {
+  $(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
  return $
 }
-
-    // def _convert_to_paragraphs(self, doc):
-    //
-    //     # Convert every doubled-<br /> to a paragraph tag.
-    //     self._brs_to_paragraphs(doc)
-    //
-    //     # Convert every shallow <div /> to a paragraph tag. Ignore divs that
-    //     # contain other block level elements.
-    //     inner_block_tags = './/' + ' or .//'.join(constants.DIV_TO_P_BLOCK_TAGS)
-    //     shallow_divs = doc.xpath('.//div[not(%s)]' % inner_block_tags)
-    //
-    //     for div in shallow_divs:
-    //         div.tag = 'p'
-    //
-    //     # Convert every span tag who has no ancestor p or div tag within their
-    //     # family tree to a P as well.
-    //     p_like_spans = doc.xpath('.//span[not(ancestor::p or ancestor::div)]')
-    //     for span in p_like_spans:
-    //         span.tag = 'p'
-    //
-    //     # If, after all of this, we have no P tags at all, we are probably
-    //     # dealing with some very ugly content that is separated by single BR
-    //     # tags. Convert them individually to P tags.
-    //     if int(doc.xpath('count(//p)')) == 0:
-    //         self._brs_to_paragraphs(doc, min_consecutive=1)
-    //
-    //     # Remove font and center tags, which are ugly and annoying
-    //     for fonttag in doc.xpath('.//font | .//center'):
-    //         fonttag.drop_tag()
-    //
-    //
-    //     ### DO WE EVEN NEED THIS?? -Chris ###
-    //
-    //     # # Due to the way the paras are inserted, the first paragraph does not
-    //     # # get captured. Since this first para can contain all sorts of random
-    //     # # junk (links, drop caps, images) it's not easy to regex our way to
-    //     # # victory so we do it via dom. - Karl G
-    //     # try:
-    //     #     first = node.xpath('.//p[@class = "rdb_br"][position() = 1]')[0]
-    //     # except IndexError:
-    //     #     pass
-    //     # else:
-    //     #     parent  = first.getparent()
-    //     #     breaker = None
-    //     #     if parent is None:
-    //     #         parent = node
-    //     #     para = E.P({'class':'rdb_br firstp'})
-    //     #     has_predecessors = False
-    //     #     for sibling in first.itersiblings(preceding = True):
-    //     #         has_predecessors = True
-    //     #         if sibling.tag in ['p', 'div']:
-    //     #             breaker = sibling
-    //     #             break
-    //     #         para.insert(0,sibling)
-    //     #
-    //     #     if (not has_predecessors and parent.text is not None and
-    //     #         parent.text.strip() != ""):
-    //     #         para.text = parent.text
-    //     #         parent.text = ''
-    //     #     else:
-    //     #         para.text = (para.text or '') + (parent.tail or '')
-    //     #
-    //     #     parent.tail = ''
-    //     #     if breaker is None:
-    //     #         parent.insert(0,para)
-    //     #     else:
-    //     #         parent.insert(parent.index(breaker)+1,para)
-    //
-    //     return doc
--- a/src/extractor/generic/utils/dom/convert-to-paragraphs.test.js
+++ b/src/extractor/generic/utils/dom/convert-to-paragraphs.test.js
@ -11,7 +11,7 @@ import {
  convertToParagraphs
 } from './index'

-import { convertNodeToP } from './convert-to-paragraphs'
+import { convertNodeTo } from './convert-to-paragraphs'

 describe('Generic Extractor Utils', () => {
  describe('convertToParagraphs($)', () => {
@ -22,14 +22,14 @@ describe('Generic Extractor Utils', () => {

  })

-  describe('convertNodeToP(node, $)', () => {
+  describe('convertNodeTo(node, $)', () => {
    it('takes a node with any tag and turns it into a P tag', () => {
-      const $ = cheerio.load(HTML.convertNodeToP.before)
+      const $ = cheerio.load(HTML.convertNodeTo.before)
      const node = $('div').first()

-      const result = convertNodeToP(node, $).html()
+      const result = convertNodeTo(node, $).html()

-      assertClean(result, HTML.convertNodeToP.after)
+      assertClean(result, HTML.convertNodeTo.after)
    })

  })
--- a/src/extractor/generic/utils/dom/index.js
+++ b/src/extractor/generic/utils/dom/index.js
@ -2,8 +2,4 @@
 export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
 export { default as brsToPs } from './brs-to-ps'
 export { default as paragraphize } from './paragraphize'
-export { default as convertToParagraphs } from './convert-to-paragraphs'
-
-
-// Scoring
-// export { default as getWeight } from './get-weight'
+export { convertToParagraphs, convertNodeTo } from './convert-to-paragraphs'
--- a/src/extractor/generic/utils/fixtures/html.js
+++ b/src/extractor/generic/utils/fixtures/html.js
@ -219,8 +219,8 @@ const HTML = {
    `,
  },

-  // convertNodeToP
-  convertNodeToP: {
+  // convertNodeTo
+  convertNodeTo: {
    before: '<div>Should become a p</div>',
    after: '<p>Should become a p</p>',
  }
--- a/src/extractor/generic/utils/scoring/fixtures/html.js
+++ b/src/extractor/generic/utils/scoring/fixtures/html.js
@ -28,6 +28,32 @@ const HTML = {
      <p class="entry">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
    </div>
  `,
+  hNews: {
+    before: `
+      <div class="hentry">
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+      </div>
+    `,
+    after: `
+      <div class="hentry" score="99">
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+      </div>
+    `,
+  },
+  nonHNews: {
+    before: `
+      <div class="">
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+      </div>
+    `,
+    after: `
+      <div class="" score="38">
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+        <p class="entry-content">Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
+      </div>
+    `,
+  },
 }

 export default HTML
--- a/src/extractor/generic/utils/scoring/index.js
+++ b/src/extractor/generic/utils/scoring/index.js
@ -9,3 +9,4 @@ export { default as addScore } from './add-score'
 export { default as addToParent } from './add-to-parent'
 export { default as getOrInitScore } from './get-or-init-score'
 export { default as scoreNode } from './score-node'
+export { default as scoreContent } from './score-content'
--- a/src/extractor/generic/utils/scoring/score-content.js
+++ b/src/extractor/generic/utils/scoring/score-content.js
@ -0,0 +1,106 @@
+import { HNEWS_CONTENT_SELECTORS } from '../constants'
+
+import {
+  scoreNode,
+  setScore,
+  getScore,
+  getOrInitScore,
+  addScore,
+} from './index'
+import { convertNodeTo } from '../dom'
+
+// Score every paragraph-like node (<p>, <pre>) in the document and pass
+// the result upwards: the parent receives the paragraph's full raw score,
+// the grandparent half of it.
+//
+//   :param $: cheerio instance for the document being scored
+//   :param weightNodes: forwarded to getOrInitScore when seeding each
+//     paragraph's own score (defaults to true)
+//   :return the cheerio instance that was passed in
+export default function scoreContent($, weightNodes=true) {
+
+  // First, look for special hNews based selectors and give them a big
+  // boost, if they exist. forEach rather than map: the loop runs only
+  // for its side effects and there is no result to collect.
+  HNEWS_CONTENT_SELECTORS.forEach(([parentSelector, childSelector]) => {
+    $(parentSelector).find(childSelector).each((index, node) => {
+      // NOTE(review): parent(selector) only matches when the hNews
+      // container is the child's direct parent, and the boost is added
+      // once per matching child -- confirm both are intended.
+      addScore($(node).parent(parentSelector), $, 80)
+    })
+  })
+
+  $('p, pre').each((index, node) => {
+    // The raw score for this paragraph, before we add any parent/child
+    // scores.
+    const rawScore = scoreNode($(node))
+    const scored = setScore(node, $, getOrInitScore($(node), $, weightNodes))
+
+    // Add the individual content score to the parent node
+    const parent = $(scored).parent()
+    addScoreTo(parent, $, rawScore, weightNodes)
+
+    // A cheerio selection is truthy even when it matched nothing, so the
+    // length is what tells us whether there is a parent to climb from.
+    if (parent.length > 0) {
+      // Add half of the individual content score to the
+      // grandparent
+      addScoreTo(parent.parent(), $, rawScore/2, weightNodes)
+    }
+  })
+
+  return $
+}
+
+// If `node` wraps a <span>, retag that element as a <div> in place.
+//
+// The element is renamed rather than swapped out through convertNodeTo:
+// replaceWith() detaches the span, so a caller still holding `node`
+// (addScoreTo scores it right afterwards) would be writing to an orphan,
+// and the bare replacement element would lose the span's attributes.
+// Renaming also mirrors the reference implementation's
+// `parent.tag = 'div'`.
+//
+//   :param node: cheerio selection; only its first element is inspected
+//   :param $: cheerio instance (unused; kept so call sites stay the same)
+function convertSpans(node, $) {
+  const element = node.get(0)
+
+  if (element && element.tagName === 'span') {
+    // `name` is the field the parsed element stores its tag in;
+    // `tagName` reads through to it.
+    element.name = 'div'
+  }
+}
+
+// Add `score` to `node`, first passing it through convertSpans so that a
+// <span> is turned into a <div> before it is scored. Does nothing for a
+// missing or empty selection.
+//
+//   :param node: cheerio selection for the ancestor to credit
+//   :param $: cheerio instance
+//   :param score: amount handed to addScore
+//   :param weightNodes: accepted so callers can keep passing it, but not
+//     used by this function
+function addScoreTo(node, $, score, weightNodes) {
+  // A cheerio selection is truthy even when it matched nothing, so the
+  // length has to be checked to know there is an element to score.
+  if (node && node.length > 0) {
+    convertSpans(node, $)
+    addScore(node, $, score)
+  }
+}
+
+
+    // def _score_content(self, doc, weight_nodes=True):
+    //     for selector in constants.HNEWS_CONTENT_SELECTORS:
+    //         # Not self.resource.extract_by_selector because our doc is a copy
+    //         # of the resource doc.
+    //         nodes = extract_by_selector(doc, selector,
+    //                                         AttribMap(doc))
+    //         for node in nodes:
+    //             self._add_score(node, 80)
+    //
+    //     paras = doc.xpath('.//p | .//pre')
+    //
+    //     # If we don't have any paragraphs at all, we can't score based on
+    //     # paragraphs, so return without modifying anything else.
+    //     if len(paras) == 0:
+    //         return doc
+    //
+    //     for para in paras:
+    //         # Don't score invalid tags
+    //         if not isinstance(para.tag, basestring):
+    //             continue
+    //
+    //         # The raw score for this paragraph, before we add any parent/child
+    //         # scores.
+    //         raw_score = self._score_node(para)
+    //         self._set_score(para, self._get_score(para, weight_nodes))
+    //
+    //         parent = para.getparent()
+    //         if parent is not None:
+    //             if parent.tag == 'span':
+    //                 parent.tag = 'div'
+    //
+    //             # Add the individual content score to the parent node
+    //             self._add_score(parent, raw_score, weight_nodes=weight_nodes)
+    //
+    //             grandparent = parent.getparent()
+    //             if grandparent is not None:
+    //                 if grandparent.tag == 'span':
+    //                     grandparent.tag = 'div'
+    //
+    //                 # Add half of the individual content score to the
+    //                 # grandparent
+    //                 gp_score = raw_score / 2.0
+    //                 self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
+    //
+    //     return doc
--- a/src/extractor/generic/utils/scoring/score-content.test.js
+++ b/src/extractor/generic/utils/scoring/score-content.test.js
@ -0,0 +1,29 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import {
+  clean
+} from '../dom/test-helpers'
+import HTML from './fixtures/html'
+
+import {
+  scoreContent,
+  getScore
+} from './index'
+
+// scoreContent should boost hNews containers heavily and leave ordinary
+// containers with a much lower score. Each case scores the fixture in
+// place and then reads the score back off the first <div>.
+describe('scoreContent($, weightNodes)', () => {
+  it("loves hNews content", () => {
+    const $ = cheerio.load(HTML.hNews.before)
+    scoreContent($)
+
+    // NOTE(review): 99 is presumably the 80-point hNews boost plus the
+    // paragraph's own contribution -- confirm against scoreNode.
+    assert.equal(getScore($('div').first(), $), 99)
+  })
+
+  it("is so-so about non-hNews content", () => {
+    const $ = cheerio.load(HTML.nonHNews.before)
+    scoreContent($)
+
+    assert.equal(getScore($('div').first(), $), 38)
+  })
+
+})