feat: convertToParagraphs function working

pull/1/head
Adam Pash 8 years ago
parent c237245e89
commit 67e212ffac

@ -5,7 +5,7 @@ x `node_is_sufficient`
- `_extract_best_node`
x `get_weight`
x `_strip_unlikely_candidates`
- `_convert_to_paragraphs`
x `_convert_to_paragraphs`
x `_brs_to_paragraphs`
x `_paragraphize`

@ -610,7 +610,7 @@ export const DIV_TO_P_BLOCK_TAGS = [
'p',
'pre',
'table',
]
].join(',')
// A list of tags that should be ignored when trying to find the top candidate
// for a document.

@ -0,0 +1,118 @@
import { brsToPs } from './index'
import { DIV_TO_P_BLOCK_TAGS } from '../constants'
// Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags that are not descendants of <p /> or <div /> tags.
//
// :param $: A cheerio object to search
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
export default function convertToParagraphs($) {
  // Run the passes as a pipeline, innermost call first:
  //   1. brsToPs      - runs of <br /> tags become paragraphs
  //   2. convertDivs  - qualifying <div /> tags become paragraphs
  //   3. convertSpans - <span /> tags outside any <p />/<div /> become paragraphs
  // Each pass receives whatever the previous pass returned.
  //
  // NOTE: the Python reference kept in comments in this file additionally
  // retries with single <br /> tags when no <p /> exists and drops
  // <font>/<center> tags; neither of those steps is performed here.
  return convertSpans(convertDivs(brsToPs($)))
}
// Convert every <div /> that has no block-level element (any tag matched by
// the DIV_TO_P_BLOCK_TAGS selector) anywhere inside of it into a <p />.
//
// :param $: A cheerio object to search
// :return the same cheerio object that was passed in
function convertDivs($) {
  $('div').each((index, div) => {
    // Search all descendants, mirroring the `.//tag` test of the reference
    // Python implementation. A div that wraps block content has to stay a
    // div; only divs holding plain text / inline markup are paragraph-like.
    const hasBlockContent = $(div).find(DIV_TO_P_BLOCK_TAGS).length > 0

    if (!hasBlockContent) {
      convertNodeToP(div, $)
    }
  })

  return $
}
// Turn every <span /> that has neither a <p /> nor a <div /> among its
// ancestors into a <p />.
//
// :param $: A cheerio object to search
// :return the same cheerio object that was passed in
function convertSpans($) {
  const promoteIfOrphaned = (position, element) => {
    const blockAncestors = $(element).parents('p, div')

    // Already inside paragraph-like content: leave the span alone.
    if (blockAncestors.length !== 0) {
      return
    }

    convertNodeToP(element, $)
  }

  $('span').each(promoteIfOrphaned)

  return $
}
// Replace `node` with a <p /> element wrapping the node's serialized contents.
//
// The replacement markup is a bare `<p>`: attributes of the original node
// are not copied onto the new paragraph.
//
// :param node: The node to replace
// :param $: The cheerio object the node belongs to
// :return the cheerio object that was passed in
export function convertNodeToP(node, $) {
  const $node = $(node)
  // String conversion of the contents yields the markup placed inside the <p>.
  const innerMarkup = String($node.contents())

  $node.replaceWith(`<p>${innerMarkup}</p>`)

  return $
}
// def _convert_to_paragraphs(self, doc):
//
// # Convert every doubled-<br /> to a paragraph tag.
// self._brs_to_paragraphs(doc)
//
// # Convert every shallow <div /> to a paragraph tag. Ignore divs that
// # contain other block level elements.
// inner_block_tags = './/' + ' or .//'.join(constants.DIV_TO_P_BLOCK_TAGS)
// shallow_divs = doc.xpath('.//div[not(%s)]' % inner_block_tags)
//
// for div in shallow_divs:
// div.tag = 'p'
//
// # Convert every span tag who has no ancestor p or div tag within their
// # family tree to a P as well.
// p_like_spans = doc.xpath('.//span[not(ancestor::p or ancestor::div)]')
// for span in p_like_spans:
// span.tag = 'p'
//
// # If, after all of this, we have no P tags at all, we are probably
// # dealing with some very ugly content that is separated by single BR
// # tags. Convert them individually to P tags.
// if int(doc.xpath('count(//p)')) == 0:
// self._brs_to_paragraphs(doc, min_consecutive=1)
//
// # Remove font and center tags, which are ugly and annoying
// for fonttag in doc.xpath('.//font | .//center'):
// fonttag.drop_tag()
//
//
// ### DO WE EVEN NEED THIS?? -Chris ###
//
// # # Due to the way the paras are inserted, the first paragraph does not
// # # get captured. Since this first para can contain all sorts of random
// # # junk (links, drop caps, images) it's not easy to regex our way to
// # # victory so we do it via dom. - Karl G
// # try:
// # first = node.xpath('.//p[@class = "rdb_br"][position() = 1]')[0]
// # except IndexError:
// # pass
// # else:
// # parent = first.getparent()
// # breaker = None
// # if parent is None:
// # parent = node
// # para = E.P({'class':'rdb_br firstp'})
// # has_predecessors = False
// # for sibling in first.itersiblings(preceding = True):
// # has_predecessors = True
// # if sibling.tag in ['p', 'div']:
// # breaker = sibling
// # break
// # para.insert(0,sibling)
// #
// # if (not has_predecessors and parent.text is not None and
// # parent.text.strip() != ""):
// # para.text = parent.text
// # parent.text = ''
// # else:
// # para.text = (para.text or '') + (parent.tail or '')
// #
// # parent.tail = ''
// # if breaker is None:
// # parent.insert(0,para)
// # else:
// # parent.insert(parent.index(breaker)+1,para)
//
// return doc

@ -0,0 +1,37 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { clean } from './test-helpers'
import HTML from './fixtures/html'
import {
convertToParagraphs
} from './index'
import { convertNodeToP } from './convert-to-paragraphs'
describe('Generic Extractor Utils', () => {
  // Runs the full conversion over the fixture document and compares the
  // serialized result with the expected fixture markup.
  describe('convertToParagraphs($)', () => {
    it("performs all conversions", () => {
      const { before, after } = HTML.convertToParagraphs
      const $ = cheerio.load(before)

      // Note: Result is not valid html
      // Cheerio's parser will fix this elsewhere
      const actual = clean(convertToParagraphs($).html())
      const expected = clean(after)

      assert.equal(actual, expected)
    })
  })

  // Checks the single-node helper on the first <div /> of the fixture.
  describe('convertNodeToP(node, $)', () => {
    it('takes a node with any tag and turns it into a P tag', () => {
      const { before, after } = HTML.convertNodeToP
      const $ = cheerio.load(before)
      const firstDiv = $('div').first()

      const actual = clean(convertNodeToP(firstDiv, $).html())
      const expected = clean(after)

      assert.equal(actual, expected)
    })
  })
})

@ -194,6 +194,36 @@ const HTML = {
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// convertNodeToP
convertNodeToP: {
before: '<div>Should become a p</div>',
after: '<p>Should become a p</p>',
}
}
export default HTML

@ -2,3 +2,4 @@ export { default as getWeight } from './get-weight'
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'
export { default as paragraphize } from './paragraphize'
export { default as convertToParagraphs } from './convert-to-paragraphs'

Loading…
Cancel
Save