Better names and positions for variables

pull/21/head
Mišo Belica 11 years ago
parent 930b6ced12
commit 1a5970b238

@ -134,23 +134,6 @@ def build_error_document(html, fragment=True):
return output
def transform_misused_divs_into_paragraphs(document):
"""
Turn all <div> elements that don't have children block level
elements into <p> elements.
Since we can't change the tree as we iterate over it, we must do this
before we process our document.
"""
for element in document.iter(tag="div"):
child_tags = tuple(n.tag for n in element.getchildren())
if "div" not in child_tags:
logger.debug("Changing leaf <div> into <p>")
element.tag = "p"
return document
def check_siblings(candidate_node, candidate_list):
"""Look through siblings for content that might also be related.
@ -406,20 +389,20 @@ class Article(object):
return tounicode(self._readable())
@cached_property
def doc(self):
"""The doc is the parsed xml tree of the given html."""
def dom(self):
"""Parsed lxml tree (Document Object Model) of the given html."""
try:
document = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(document)
return transform_misused_divs_into_paragraphs(document)
return leaf_div_elements_into_paragraphs(document)
except ValueError:
return None
@cached_property
def candidates(self):
"""Generate the list of candidates from the doc."""
doc = self.doc
doc = self.dom
if doc is not None and len(doc):
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
@ -471,14 +454,31 @@ class Article(object):
def _handle_no_candidates(self):
"""If we fail to find a good candidate we need to find something else."""
# since we've not found a good candidate we're should help this
if self.doc is not None and len(self.doc):
if self.dom is not None and len(self.dom):
# cleanup by removing the should_drop we spotted.
drop_nodes_with_parents(self._should_drop)
doc = prep_article(self.doc)
doc = prep_article(self.dom)
doc = build_base_document(doc, self.fragment)
else:
logger.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc
def leaf_div_elements_into_paragraphs(document):
"""
Turn all <div> elements that don't have children block level
elements into <p> elements.
Since we can't change the tree as we iterate over it, we must do this
before we process our document.
"""
for element in document.iter(tag="div"):
child_tags = tuple(n.tag for n in element.getchildren())
if "div" not in child_tags:
logger.debug("Changing leaf <div> into <p>")
element.tag = "p"
return document

@ -16,7 +16,7 @@ from breadability.readable import get_class_weight
from breadability.readable import get_link_density
from breadability.readable import is_bad_link
from breadability.readable import score_candidates
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.readable import leaf_div_elements_into_paragraphs
from breadability.scoring import ScoredNode
from .utils import load_snippet, load_article
@ -122,7 +122,7 @@ class TestCleaning(unittest.TestCase):
test_doc = document_fromstring(test_html)
self.assertEqual(
tounicode(
transform_misused_divs_into_paragraphs(test_doc)),
leaf_div_elements_into_paragraphs(test_doc)),
to_unicode("<html><body><p>simple</p></body></html>")
)
@ -131,7 +131,7 @@ class TestCleaning(unittest.TestCase):
test_doc2 = document_fromstring(test_html2)
self.assertEqual(
tounicode(
transform_misused_divs_into_paragraphs(test_doc2)),
leaf_div_elements_into_paragraphs(test_doc2)),
to_unicode('<html><body><p>simple<a href="">link</a></p></body></html>')
)
@ -141,7 +141,7 @@ class TestCleaning(unittest.TestCase):
"<html><body><div>text<div>child</div>aftertext</div></body></html>")
self.assertEqual(
tounicode(transform_misused_divs_into_paragraphs(dom)),
tounicode(leaf_div_elements_into_paragraphs(dom)),
to_unicode("<html><body><div>text<p>child</p>aftertext</div></body></html>")
)

Loading…
Cancel
Save