feat: convertToParagraphs function working

8 years ago · 67e212ffac
parent c237245e89
commit 67e212ffac
6 changed files with 188 additions and 2 deletions
--- a/TODO.md
+++ b/TODO.md
@ -5,7 +5,7 @@ x `node_is_sufficient`
 - `_extract_best_node`
 x `get_weight`
 x `_strip_unlikely_candidates`
- `_convert_to_paragraphs`
+x `_convert_to_paragraphs`
 x `_brs_to_paragraphs`
 x `_paragraphize`

--- a/src/extractor/generic/constants.js
+++ b/src/extractor/generic/constants.js
@ -610,7 +610,7 @@ export const DIV_TO_P_BLOCK_TAGS = [
    'p',
    'pre',
    'table',
-]
+].join(',')

 // A list of tags that should be ignored when trying to find the top candidate
 // for a document.
--- a/src/extractor/generic/utils/convert-to-paragraphs.js
+++ b/src/extractor/generic/utils/convert-to-paragraphs.js
@ -0,0 +1,118 @@
+import { brsToPs } from './index'
+import { DIV_TO_P_BLOCK_TAGS } from '../constants'
+// Loop through the provided doc, and convert any p-like elements to
+// actual paragraph tags.
+//
+//   Things fitting these criteria:
+//   * Multiple consecutive <br /> tags.
+//   * <div /> tags without block level elements inside of them
+//   * <span /> tags that are not descendants of <p /> or <div /> tags.
+//
+//   :param $: A cheerio object to search
+//   :return cheerio object with new p elements
+//   (By-reference mutation, though. Returned just for convenience.)
+
+export default function convertToParagraphs($) {
+  $ = brsToPs($)
+  $ = convertDivs($)
+  $ = convertSpans($)
+
+  return $
+}
+
+function convertDivs($) {
+  $('div').each((index, div) => {
+    const convertable = $(div).children()
+      .not(DIV_TO_P_BLOCK_TAGS).length == 0
+    if (convertable) {
+      convertNodeToP(div, $)
+    }
+  })
+
+  return $
+}
+
+function convertSpans($) {
+  $('span').each((index, span) => {
+    const convertable = $(span).parents('p, div').length == 0
+    if (convertable) {
+      convertNodeToP(span, $)
+    }
+  })
+
+  return $
+}
+
+export function convertNodeToP(node, $) {
+  $(node).replaceWith(`<p>${$(node).contents()}</p>`)
+  return $
+}
+
+    // def _convert_to_paragraphs(self, doc):
+    //
+    //     # Convert every doubled-<br /> to a paragraph tag.
+    //     self._brs_to_paragraphs(doc)
+    //
+    //     # Convert every shallow <div /> to a paragraph tag. Ignore divs that
+    //     # contain other block level elements.
+    //     inner_block_tags = './/' + ' or .//'.join(constants.DIV_TO_P_BLOCK_TAGS)
+    //     shallow_divs = doc.xpath('.//div[not(%s)]' % inner_block_tags)
+    //
+    //     for div in shallow_divs:
+    //         div.tag = 'p'
+    //
+    //     # Convert every span tag who has no ancestor p or div tag within their
+    //     # family tree to a P as well.
+    //     p_like_spans = doc.xpath('.//span[not(ancestor::p or ancestor::div)]')
+    //     for span in p_like_spans:
+    //         span.tag = 'p'
+    //
+    //     # If, after all of this, we have no P tags at all, we are probably
+    //     # dealing with some very ugly content that is separated by single BR
+    //     # tags. Convert them individually to P tags.
+    //     if int(doc.xpath('count(//p)')) == 0:
+    //         self._brs_to_paragraphs(doc, min_consecutive=1)
+    //
+    //     # Remove font and center tags, which are ugly and annoying
+    //     for fonttag in doc.xpath('.//font | .//center'):
+    //         fonttag.drop_tag()
+    //
+    //
+    //     ### DO WE EVEN NEED THIS?? -Chris ###
+    //
+    //     # # Due to the way the paras are inserted, the first paragraph does not
+    //     # # get captured. Since this first para can contain all sorts of random
+    //     # # junk (links, drop caps, images) it's not easy to regex our way to
+    //     # # victory so we do it via dom. - Karl G
+    //     # try:
+    //     #     first = node.xpath('.//p[@class = "rdb_br"][position() = 1]')[0]
+    //     # except IndexError:
+    //     #     pass
+    //     # else:
+    //     #     parent  = first.getparent()
+    //     #     breaker = None
+    //     #     if parent is None:
+    //     #         parent = node
+    //     #     para = E.P({'class':'rdb_br firstp'})
+    //     #     has_predecessors = False
+    //     #     for sibling in first.itersiblings(preceding = True):
+    //     #         has_predecessors = True
+    //     #         if sibling.tag in ['p', 'div']:
+    //     #             breaker = sibling
+    //     #             break
+    //     #         para.insert(0,sibling)
+    //     #
+    //     #     if (not has_predecessors and parent.text is not None and
+    //     #         parent.text.strip() != ""):
+    //     #         para.text = parent.text
+    //     #         parent.text = ''
+    //     #     else:
+    //     #         para.text = (para.text or '') + (parent.tail or '')
+    //     #
+    //     #     parent.tail = ''
+    //     #     if breaker is None:
+    //     #         parent.insert(0,para)
+    //     #     else:
+    //     #         parent.insert(parent.index(breaker)+1,para)
+    //
+    //     return doc
--- a/src/extractor/generic/utils/convert-to-paragraphs.test.js
+++ b/src/extractor/generic/utils/convert-to-paragraphs.test.js
@ -0,0 +1,37 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import { clean } from './test-helpers'
+import HTML from './fixtures/html'
+import {
+  convertToParagraphs
+} from './index'
+import { convertNodeToP } from './convert-to-paragraphs'
+
+describe('Generic Extractor Utils', () => {
+  describe('convertToParagraphs($)', () => {
+
+    it("performs all conversions", () => {
+      const $ = cheerio.load(HTML.convertToParagraphs.before)
+      // Note: Result is not valid html
+      // Cheerio's parser will fix this elsewhere
+      const result = convertToParagraphs($).html()
+      assert.equal(clean(result), clean(HTML.convertToParagraphs.after))
+    })
+
+  })
+
+  describe('convertNodeToP(node, $)', () => {
+    it('takes a node with any tag and turns it into a P tag', () => {
+      const $ = cheerio.load(HTML.convertNodeToP.before)
+      const node = $('div').first()
+      const result = convertNodeToP(node, $).html()
+      assert.equal(clean(result), clean(HTML.convertNodeToP.after))
+
+    })
+
+  })
+
+})
+
+
--- a/src/extractor/generic/utils/fixtures/html.js
+++ b/src/extractor/generic/utils/fixtures/html.js
@ -194,6 +194,36 @@ const HTML = {
      </p>
    `,
  },
+
+  // convertToParagraphs
+  convertToParagraphs: {
+    before: `
+      <p>
+        Here is some text
+        <span>This should remain in a p</span>
+        <br />
+        <br />
+        This should be wrapped in a p
+        <div>This should become a p</div>
+      </p>
+      <span>This should become a p</span>
+    `,
+    after: `
+      <p>
+        Here is some text
+        <span>This should remain in a p</span>
+      <p>
+        This should be wrapped in a p
+      </p><p>This should become a p</p>
+      </p> <p>This should become a p</p>
+    `,
+  },
+
+  // convertNodeToP
+  convertNodeToP: {
+    before: '<div>Should become a p</div>',
+    after: '<p>Should become a p</p>',
+  }
 }

 export default HTML
--- a/src/extractor/generic/utils/index.js
+++ b/src/extractor/generic/utils/index.js
@ -2,3 +2,4 @@ export { default as getWeight } from './get-weight'
 export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
 export { default as brsToPs } from './brs-to-ps'
 export { default as paragraphize } from './paragraphize'
+export { default as convertToParagraphs } from './convert-to-paragraphs'