Converting multiple line breaks to p

pull/1/head
Adam Pash 8 years ago
parent 95d02dadd1
commit c237245e89

@ -1,4 +1,4 @@
Next: Continue working on paragraphize; move p tags outside other p tags (do this when not converting br)
- `extract` (this kicks it all off)
x `node_is_sufficient`
@ -6,8 +6,8 @@ x `node_is_sufficient`
x `get_weight`
x `_strip_unlikely_candidates`
- `_convert_to_paragraphs`
- `_brs_to_paragraphs`
- `_paragraphize`
x `_brs_to_paragraphs`
x `_paragraphize`
## Scoring

@ -942,6 +942,7 @@ export const BLOCK_LEVEL_TAGS = [
'ul',
'video',
]
// Case-insensitive matcher for a bare tag name: it accepts the string only
// when the whole of it is exactly one of the entries in BLOCK_LEVEL_TAGS.
export const BLOCK_LEVEL_TAGS_RE = new RegExp(
  `^(${BLOCK_LEVEL_TAGS.join('|')})$`,
  'i'
)
// The removal is implemented as a blacklist and whitelist, this test finds

@ -1,3 +1,5 @@
import paragraphize from './paragraphize'
// ## NOTES:
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
@ -7,11 +9,8 @@
// <p /> tags instead.
//
// :param $: A cheerio object
// :param min_consecutive: Integer, the minimum number of consecutive
// <br /> tags that must exist for them to be converted to <p />
// tags. Must be at least 1.
//
export default function brsToPs($, minConsecutive=2) {
export default function brsToPs($) {
let collapsing = false
$('br').each((index, element) => {
let nextElement = $(element).next().get(0)
@ -21,67 +20,10 @@ export default function brsToPs($, minConsecutive=2) {
$(element).remove()
} else if (collapsing) {
collapsing = false
$(element).replaceWith('<p />')
// $(element).replaceWith('<p />')
paragraphize(element, $, true)
}
})
return $
}
// def _brs_to_paragraphs(self, doc, min_consecutive=2):
// print "_brs_to_paragraphs: convert consecutive brs to p tags"
// brs = doc.xpath('.//br')
//
// # Loop through all of our break tags, looking for consecutive
// # <br />s with no content in between them. If found, replace them
// # with a single P tag.
// for br in brs:
// # Generate a list of all the breaks in a row, with no text in
// # between them.
// joined_brs = []
// cur_br = br
// while True:
// joined_brs.append(cur_br)
//
// if cur_br.tail:
// break
//
// next = cur_br.getnext()
// next_is_br = next is not None and next.tag.lower() == 'br'
//
// if next_is_br:
// cur_br = next
// else:
// break
//
// if len(joined_brs) < min_consecutive:
// continue
//
// last_br = joined_brs[-1]
//
// # Now loop through following siblings, until we hit a block
// # tag or the end, and append them to this P if they are not a
// # block tag that is not a BR.
// self._paragraphize(last_br)
//
// # Drop every break that we no longer need because of the P.
// # The first BR has been turned into a P tag.
// for joined_br in joined_brs:
// if joined_br is not last_br:
// joined_br.drop_tag()
//
// # If we had any new p tags that are already inside a P tag, resolve
// # those by paragraphizing them, which will append their block level
// # contents.
// for fix_count in xrange(1000):
// # Find the first p that contains another p, and paragraphize it.
// # We do this in a loop because we're modifying the dom as we go.
// try:
// parent_p = doc.xpath('//p[./p][1]')[0]
// self._paragraphize(parent_p)
// except IndexError:
// break
// else:
// # We exhausted our loop, which means we've looped too many times
// # such that it's unreasonable. Log a warning.
// logger.warning("Bailing on p parent fix due to crazy "
// "looping for url %s" % self.resource.url)

@ -34,8 +34,11 @@ describe('Generic Extractor Utils', () => {
it("converts BR tags in a P tag into a P containing inline children", () => {
const $ = cheerio.load(HTML.brsInP.before)
// Note: result is malformed HTML
// Will be handled elsewhere
const result = brsToPs($).html()
// assert.equal(clean(result), clean(HTML.brsInP.after))
assert.equal(clean(result), clean(HTML.brsInP.after))
})
})

@ -120,8 +120,7 @@ const HTML = {
`,
after: `
<div class="article adbox">
<p></p>
<p>Ooo good one</p>
<p> </p><p>Ooo good one</p>
</div>
`,
},
@ -138,8 +137,7 @@ const HTML = {
`,
after: `
<div class="article adbox">
<p></p>
<p>Ooo good one</p>
<p> </p><p>Ooo good one</p>
</div>
`,
},
@ -155,9 +153,44 @@ const HTML = {
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},

@ -1,3 +1,4 @@
export { default as getWeight } from './get-weight'
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'
export { default as paragraphize } from './paragraphize'

@ -0,0 +1,40 @@
import { BLOCK_LEVEL_TAGS_RE } from '../constants'
// Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (i.e., it does
// not contain any other block-level tags).
//
// Only the <br /> case is implemented so far: the run of siblings that
// follows the br (text nodes and non-block elements) is moved into a
// new <p>, stopping at the first block-level element or at the end of
// the parent, and the br itself is replaced by that <p>.
//
// :param node: The node to paragraphize
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
// :returns: The same $ that was passed in
export default function paragraphize(node, $, br = false) {
  if (!br) {
    // Not currently implemented. May not need to; can leverage
    // cheerio's loader/htmlparser2 to format invalid html
    // (e.g., nested p tags)
    return $
  }

  const p = $('<p></p>')
  let sibling = node.nextSibling

  // Keep collecting until we run out of siblings or hit a block-level
  // element. Nodes with no tagName (e.g. text) are always collected.
  while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
    // Grab the successor before moving `sibling`: once it has been
    // appended to the new <p> its nextSibling no longer refers to the
    // original sibling list.
    const following = sibling.nextSibling
    $(sibling).appendTo(p)
    sibling = following
  }

  // replaceWith swaps the br out of the tree, so no separate remove()
  // call is needed afterwards.
  $(node).replaceWith(p)
  return $
}

@ -0,0 +1,32 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { clean } from './test-helpers'
import HTML from './fixtures/html'
import {
paragraphize
} from './index'
// Specs for paragraphize() in br mode. Each spec loads a `before`
// fixture, runs paragraphize on the first <br /> in it, and compares the
// serialized document against the matching `after` fixture once both
// have been passed through clean().
describe('Generic Extractor Utils', () => {
  describe('paragraphize(node)', () => {
    it("converts a BR into P and moves inline contents to P tag after current parent", () => {
      const $ = cheerio.load(HTML.paragraphize.before)
      const node = $('br').get(0)

      // note: result here is not valid html; will handle elsewhere
      const result = paragraphize(node, $, true).html()

      assert.equal(clean(result), clean(HTML.paragraphize.after))
    })

    it("converts a BR into P and stops when block element hit", () => {
      const $ = cheerio.load(HTML.paragraphizeBlock.before)
      const node = $('br').get(0)

      // note: result here is not valid html; will handle elsewhere
      const result = paragraphize(node, $, true).html()

      assert.equal(clean(result), clean(HTML.paragraphizeBlock.after))
    })
  })
})
Loading…
Cancel
Save