Fixed transformation of leaf <div> into <p>

11 years ago · 930b6ced12
parent 314c999730
commit 930b6ced12
2 changed files with 24 additions and 25 deletions
--- a/breadability/readable.py
+++ b/breadability/readable.py
@@ -134,31 +134,21 @@ def build_error_document(html, fragment=True):
    return output


-def transform_misused_divs_into_paragraphs(doc):
-    """Turn all divs that don't have children block level elements into p's
+def transform_misused_divs_into_paragraphs(document):
+    """
+    Turn all <div> elements that don't have children block level
+    elements into <p> elements.

    Since we can't change the tree as we iterate over it, we must do this
    before we process our document.
-
-    The idea is that we process all divs and if the div does not contain
-    another list of divs, then we replace it with a p tag instead appending
-    it's contents/children to it.
    """
-    for elem in doc.iter(tag='div'):
-        child_tags = tuple(n.tag for n in elem.getchildren())
-        if 'div' not in child_tags:
-            # if there is no div inside of this div...then it's a leaf
-            # node in a sense.
-            # We need to create a <p> and put all it's contents in there
-            # We'll just stringify it, then regex replace the first/last
-            # div bits to turn them into <p> vs <div>.
-            logger.debug('Turning leaf <div> into <p>')
-            orig = tounicode(elem).strip()
-            started = re.sub(r'^<\s*div', '<p', orig)
-            ended = re.sub(r'div>$', 'p>', started)
-            elem.getparent().replace(elem, fromstring(ended))
-
-    return doc
+    for element in document.iter(tag="div"):
+        child_tags = tuple(n.tag for n in element.getchildren())
+        if "div" not in child_tags:
+            logger.debug("Changing leaf <div> into <p>")
+            element.tag = "p"
+
+    return document


 def check_siblings(candidate_node, candidate_list):
@@ -419,11 +409,10 @@ class Article(object):
    def doc(self):
        """The doc is the parsed xml tree of the given html."""
        try:
-            doc = self.orig.html
+            document = self.orig.html
            # cleaning doesn't return, just wipes in place
-            html_cleaner(doc)
-            doc = transform_misused_divs_into_paragraphs(doc)
-            return doc
+            html_cleaner(document)
+            return transform_misused_divs_into_paragraphs(document)
        except ValueError:
            return None

--- a/tests/test_readable.py
+++ b/tests/test_readable.py
@@ -135,6 +135,16 @@ class TestCleaning(unittest.TestCase):
                to_unicode('<html><body><p>simple<a href="">link</a></p></body></html>')
        )

+    def test_dont_transform_div_with_div(self):
+        """Verify that only child <div> element is replaced by <p>."""
+        dom = document_fromstring(
+            "<html><body><div>text<div>child</div>aftertext</div></body></html>")
+
+        self.assertEqual(
+            tounicode(transform_misused_divs_into_paragraphs(dom)),
+            to_unicode("<html><body><div>text<p>child</p>aftertext</div></body></html>")
+        )
+
    def test_bad_links(self):
        """Some links should just not belong."""
        bad_links = [