Better names and positions for variables

11 years ago · 1a5970b238
parent 930b6ced12
commit 1a5970b238
2 changed files with 27 additions and 27 deletions
--- a/breadability/readable.py
+++ b/breadability/readable.py
@ -134,23 +134,6 @@ def build_error_document(html, fragment=True):
    return output


-def transform_misused_divs_into_paragraphs(document):
-    """
-    Turn all <div> elements that don't have children block level
-    elements into <p> elements.
-
-    Since we can't change the tree as we iterate over it, we must do this
-    before we process our document.
-    """
-    for element in document.iter(tag="div"):
-        child_tags = tuple(n.tag for n in element.getchildren())
-        if "div" not in child_tags:
-            logger.debug("Changing leaf <div> into <p>")
-            element.tag = "p"
-
-    return document
-
-
 def check_siblings(candidate_node, candidate_list):
    """Look through siblings for content that might also be related.

@ -406,20 +389,20 @@ class Article(object):
        return tounicode(self._readable())

    @cached_property
-    def doc(self):
-        """The doc is the parsed xml tree of the given html."""
+    def dom(self):
+        """Parsed lxml tree (Document Object Model) of the given html."""
        try:
            document = self.orig.html
            # cleaning doesn't return, just wipes in place
            html_cleaner(document)
-            return transform_misused_divs_into_paragraphs(document)
+            return leaf_div_elements_into_paragraphs(document)
        except ValueError:
            return None

    @cached_property
    def candidates(self):
        """Generate the list of candidates from the doc."""
-        doc = self.doc
+        doc = self.dom
        if doc is not None and len(doc):
            candidates, should_drop = find_candidates(doc)
            self._should_drop = should_drop
@ -471,14 +454,31 @@ class Article(object):
    def _handle_no_candidates(self):
        """If we fail to find a good candidate we need to find something else."""
        # since we've not found a good candidate we're should help this
-        if self.doc is not None and len(self.doc):
+        if self.dom is not None and len(self.dom):
            # cleanup by removing the should_drop we spotted.
            drop_nodes_with_parents(self._should_drop)

-            doc = prep_article(self.doc)
+            doc = prep_article(self.dom)
            doc = build_base_document(doc, self.fragment)
        else:
            logger.warning('No document to use.')
            doc = build_error_document(self.fragment)

        return doc
+
+
+def leaf_div_elements_into_paragraphs(document):
+    """
+    Turn all <div> elements that don't have a direct <div> child
+    into <p> elements.
+
+    Only the tag name is rewritten, in place, while walking the tree; this
+    is done before the rest of the document processing.
+    """
+    for element in document.iter(tag="div"):
+        child_tags = tuple(n.tag for n in element.getchildren())
+        if "div" not in child_tags:
+            logger.debug("Changing leaf <div> into <p>")
+            element.tag = "p"
+
+    return document
--- a/tests/test_readable.py
+++ b/tests/test_readable.py
@ -16,7 +16,7 @@ from breadability.readable import get_class_weight
 from breadability.readable import get_link_density
 from breadability.readable import is_bad_link
 from breadability.readable import score_candidates
-from breadability.readable import transform_misused_divs_into_paragraphs
+from breadability.readable import leaf_div_elements_into_paragraphs
 from breadability.scoring import ScoredNode
 from .utils import load_snippet, load_article

@ -122,7 +122,7 @@ class TestCleaning(unittest.TestCase):
        test_doc = document_fromstring(test_html)
        self.assertEqual(
            tounicode(
-                transform_misused_divs_into_paragraphs(test_doc)),
+                leaf_div_elements_into_paragraphs(test_doc)),
            to_unicode("<html><body><p>simple</p></body></html>")
        )

@ -131,7 +131,7 @@ class TestCleaning(unittest.TestCase):
        test_doc2 = document_fromstring(test_html2)
        self.assertEqual(
            tounicode(
-                transform_misused_divs_into_paragraphs(test_doc2)),
+                leaf_div_elements_into_paragraphs(test_doc2)),
                to_unicode('<html><body><p>simple<a href="">link</a></p></body></html>')
        )

@ -141,7 +141,7 @@ class TestCleaning(unittest.TestCase):
            "<html><body><div>text<div>child</div>aftertext</div></body></html>")

        self.assertEqual(
-            tounicode(transform_misused_divs_into_paragraphs(dom)),
+            tounicode(leaf_div_elements_into_paragraphs(dom)),
            to_unicode("<html><body><div>text<p>child</p>aftertext</div></body></html>")
        )