Converting multiple line breaks to p

8 years ago · c237245e89
parent 95d02dadd1
commit c237245e89
8 changed files with 124 additions and 72 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,4 +1,4 @@
-
+Next: Continue working on paragraphize; move p tags outside other p tags (do this when not converting br)

 - `extract` (this kicks it all off)
 x `node_is_sufficient`
@ -6,8 +6,8 @@ x `node_is_sufficient`
 x `get_weight`
 x `_strip_unlikely_candidates`
 - `_convert_to_paragraphs`
- `_brs_to_paragraphs`
- `_paragraphize`
+x `_brs_to_paragraphs`
+x `_paragraphize`

 ## Scoring

--- a/src/extractor/generic/constants.js
+++ b/src/extractor/generic/constants.js
@ -942,6 +942,7 @@ export const BLOCK_LEVEL_TAGS = [
    'ul',
    'video',
 ]
+export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i')


 // The removal is implemented as a blacklist and whitelist, this test finds
--- a/src/extractor/generic/utils/brs-to-ps.js
+++ b/src/extractor/generic/utils/brs-to-ps.js
@ -1,3 +1,5 @@
+import paragraphize from './paragraphize'
+
 // ## NOTES:
 // Another good candidate for refactoring/optimizing.
 // Very imperative code, I don't love it. - AP
@ -7,11 +9,8 @@
 //  <p /> tags instead.
 //
 //  :param $: A cheerio object
-//  :param min_consecutive: Integer, the minimum number of consecutive
-//       <br /> tags that must exist for them to be converted to <p />
-//       tags. Must be at least 1.
-//
-export default function brsToPs($, minConsecutive=2) {
+
+export default function brsToPs($) {
  let collapsing = false
  $('br').each((index, element) => {
    let nextElement = $(element).next().get(0)
@ -21,67 +20,10 @@ export default function brsToPs($, minConsecutive=2) {
      $(element).remove()
    } else if (collapsing) {
      collapsing = false
-      $(element).replaceWith('<p />')
+      // $(element).replaceWith('<p />')
+      paragraphize(element, $, true)
    }
  })

  return $
 }
-    // def _brs_to_paragraphs(self, doc, min_consecutive=2):
-    //     print "_brs_to_paragraphs: convert consecutive brs to p tags"
-    //     brs = doc.xpath('.//br')
-    //
-    //     # Loop through all of our break tags, looking for consecutive
-    //     # <br />s with no content in between them. If found, replace them
-    //     # with a single P tag.
-    //     for br in brs:
-    //         # Generate a list of all the breaks in a row, with no text in
-    //         # between them.
-    //         joined_brs = []
-    //         cur_br = br
-    //         while True:
-    //             joined_brs.append(cur_br)
-    //
-    //             if cur_br.tail:
-    //                 break
-    //
-    //             next = cur_br.getnext()
-    //             next_is_br = next is not None and next.tag.lower() == 'br'
-    //
-    //             if next_is_br:
-    //                 cur_br = next
-    //             else:
-    //                 break
-    //
-    //         if len(joined_brs) < min_consecutive:
-    //             continue
-    //
-    //         last_br = joined_brs[-1]
-    //
-    //         # Now loop through following siblings, until we hit a block
-    //         # tag or the end, and append them to this P if they are not a
-    //         # block tag that is not a BR.
-    //         self._paragraphize(last_br)
-    //
-    //         # Drop every break that we no longer need because of the P.
-    //         # The first BR has been turned into a P tag.
-    //         for joined_br in joined_brs:
-    //             if joined_br is not last_br:
-    //                 joined_br.drop_tag()
-    //
-    //     # If we had any new p tags that are already inside a P tag, resolve
-    //     # those by paragraphizing them, which will append their block level
-    //     # contents.
-    //     for fix_count in xrange(1000):
-    //         # Find the first p that contains another p, and paragraphize it.
-    //         # We do this in a loop because we're modifying the dom as we go.
-    //         try:
-    //             parent_p = doc.xpath('//p[./p][1]')[0]
-    //             self._paragraphize(parent_p)
-    //         except IndexError:
-    //             break
-    //     else:
-    //         # We exhausted our loop, which means we've looped too many times
-    //         # such that it's unreasonable. Log a warning.
-    //         logger.warning("Bailing on p parent fix due to crazy "
-    //                         "looping for url %s" % self.resource.url)
--- a/src/extractor/generic/utils/brs-to-ps.test.js
+++ b/src/extractor/generic/utils/brs-to-ps.test.js
@ -34,8 +34,11 @@ describe('Generic Extractor Utils', () => {

    it("converts BR tags in a P tag into a P containing inline children", () => {
      const $ = cheerio.load(HTML.brsInP.before)
+
+      // Note: the result is malformed HTML (nested p tags);
+      // it will be handled elsewhere
      const result = brsToPs($).html()
-      // assert.equal(clean(result), clean(HTML.brsInP.after))
+      assert.equal(clean(result), clean(HTML.brsInP.after))
    })

  })
--- a/src/extractor/generic/utils/fixtures/html.js
+++ b/src/extractor/generic/utils/fixtures/html.js
@ -120,8 +120,7 @@ const HTML = {
    `,
    after: `
      <div class="article adbox">
-        <p></p>
-        <p>Ooo good one</p>
+        <p> </p><p>Ooo good one</p>
      </div>
    `,
  },
@ -138,8 +137,7 @@ const HTML = {
    `,
    after: `
      <div class="article adbox">
-        <p></p>
-        <p>Ooo good one</p>
+        <p> </p><p>Ooo good one</p>
      </div>
    `,
  },
@ -155,9 +153,44 @@ const HTML = {
    after: `
      <p>
        Here is some text
+      <p>
+        Here is more text
+     </p></p>
+    `,
+  },
+  paragraphize: {
+    before: `
+      <p>
+        Here is some text
+        <br />
+        Here is more text
+        <span>And also this</span>
      </p>
+    `,
+    after: `
+      <p>
+        Here is some text
+      <p>
+        Here is more text
+        <span>And also this</span>
+      </p></p>
+    `,
+  },
+  paragraphizeBlock: {
+    before: `
+      <p>
+        Here is some text
+        <br />
+        Here is more text
+        <div>And also this</div>
+      </p>
+    `,
+    after: `
+      <p>
+        Here is some text
      <p>
        Here is more text
+      </p><div>And also this</div>
      </p>
    `,
  },
--- a/src/extractor/generic/utils/index.js
+++ b/src/extractor/generic/utils/index.js
@ -1,3 +1,4 @@
 export { default as getWeight } from './get-weight'
 export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
 export { default as brsToPs } from './brs-to-ps'
+export { default as paragraphize } from './paragraphize'
--- a/src/extractor/generic/utils/paragraphize.js
+++ b/src/extractor/generic/utils/paragraphize.js
@ -0,0 +1,40 @@
+import { BLOCK_LEVEL_TAGS_RE } from '../constants'
+
+// Given a <br /> node, replace it with a new <p> and move the nodes
+// that follow the <br /> into that <p>, stopping at the first
+// block-level element (or when the parent runs out of children).
+//
+// The new <p> takes the place of the <br />, so it can end up nested
+// inside another <p>; that invalid nesting is not repaired here.
+//
+// Only the <br /> case is implemented. When `br` is false nothing is
+// changed; that case may not be needed if cheerio's loader/htmlparser2
+// can be leveraged to format invalid html (e.g., nested p tags).
+//
+// :param node: The node to paragraphize
+// :param $: The cheerio object to handle dom manipulation
+// :param br: Whether or not the passed node is a br
+// :returns: The cheerio object that was passed in
+
+export default function paragraphize(node, $, br=false) {
+  if (!br) {
+    return $
+  }
+
+  const p = $('<p></p>')
+  let sibling = node.nextSibling
+
+  // Nodes without a tagName (e.g. text) are always moved; elements are
+  // moved until one matches a block-level tag name.
+  while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
+    // Grab the next sibling first: appendTo moves `sibling` into the
+    // new <p>, which changes its sibling links.
+    const nextSibling = sibling.nextSibling
+    $(sibling).appendTo(p)
+    sibling = nextSibling
+  }
+
+  // replaceWith already takes the <br /> out of the document, so no
+  // separate remove() call is needed.
+  $(node).replaceWith(p)
+  return $
+}
--- a/src/extractor/generic/utils/paragraphize.test.js
+++ b/src/extractor/generic/utils/paragraphize.test.js
@ -0,0 +1,32 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import { clean } from './test-helpers'
+import HTML from './fixtures/html'
+import {
+  paragraphize
+} from './index'
+
+describe('Generic Extractor Utils', () => {
+  describe('paragraphize(node)', () => {
+
+    // Each case loads a fixture, hands its first <br /> to paragraphize
+    // with br=true, and compares the serialized document to the fixture's
+    // expected markup.
+
+    it("converts a BR into P and moves inline contents to P tag after current parent", () => {
+      const $ = cheerio.load(HTML.paragraphize.before)
+      const node = $('br').get(0)
+      // note: the result here is not valid html (the new p is nested
+      // inside the original p); will handle elsewhere
+      const result = paragraphize(node, $, true).html()
+      assert.equal(clean(result), clean(HTML.paragraphize.after))
+    })
+
+    it("converts a BR into P and stops when block element hit", () => {
+      const $ = cheerio.load(HTML.paragraphizeBlock.before)
+      const node = $('br').get(0)
+      // note: the result here is not valid html (the new p is nested
+      // inside the original p); will handle elsewhere
+      const result = paragraphize(node, $, true).html()
+      assert.equal(clean(result), clean(HTML.paragraphizeBlock.after))
+    })
+
+  })
+})
+
+