Converting multiple line breaks to p
parent
95d02dadd1
commit
c237245e89
@ -1,3 +1,4 @@
|
||||
export { default as getWeight } from './get-weight'
|
||||
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
|
||||
export { default as brsToPs } from './brs-to-ps'
|
||||
export { default as paragraphize } from './paragraphize'
|
||||
|
@ -0,0 +1,40 @@
|
||||
import { BLOCK_LEVEL_TAGS_RE } from '../constants'
|
||||
|
||||
// Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children: each sibling after the br, up to
// (but not including) the first block level element, is moved into
// a newly created <p>, and that <p> then replaces the br.
//
// :param node: The node to paragraphize
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
// :returns: The same $ object that was passed in
export default function paragraphize(node, $, br = false) {
  if (!br) {
    // Not currently implemented. May not need to; can leverage
    // cheerio's loader/htmlparser2 to format invalid html
    // (e.g., nested p tags)
    return $
  }

  let sibling = node.nextSibling
  const p = $('<p></p>')

  // while the next node is text or not a block level element
  // append it to the new p node. A sibling without a tagName
  // (e.g. text) is never tested against the block level regex.
  while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
    // Read the successor before moving the current sibling, since
    // appendTo relocates it and its nextSibling may then change.
    const following = sibling.nextSibling
    $(sibling).appendTo(p)
    sibling = following
  }

  // Swap the br for the populated p, then drop the br itself.
  $(node).replaceWith(p)
  $(node).remove()
  return $
}
|
@ -0,0 +1,32 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import { clean } from './test-helpers'
|
||||
import HTML from './fixtures/html'
|
||||
import {
|
||||
paragraphize
|
||||
} from './index'
|
||||
|
||||
// Specs for paragraphize(node, $, br): each case loads a "before"
// fixture, runs paragraphize on the first <br>, and compares the
// resulting markup with the matching "after" fixture.
describe('Generic Extractor Utils', () => {
  describe('paragraphize(node)', () => {

    it('converts a BR into P and moves inline contents to P tag after current parent', () => {
      const $ = cheerio.load(HTML.paragraphize.before)
      const node = $('br').get(0)

      // note: result here is not valid html; will handle elsewhere
      const result = paragraphize(node, $, true).html()

      assert.strictEqual(clean(result), clean(HTML.paragraphize.after))
    })

    it('converts a BR into P and stops when block element hit', () => {
      const $ = cheerio.load(HTML.paragraphizeBlock.before)
      const node = $('br').get(0)

      // note: result here is not valid html; will handle elsewhere
      const result = paragraphize(node, $, true).html()

      assert.strictEqual(clean(result), clean(HTML.paragraphizeBlock.after))
    })

  })
})
|
||||
|
||||
|
Loading…
Reference in New Issue