Don't remove h1/h2 elements from readable article

11 years ago · 5c20673d45
parent c9e087d077
commit 5c20673d45
2 changed files with 2 additions and 8 deletions
--- a/readability/readable.py
+++ b/readability/readable.py
@@ -165,15 +165,9 @@ def clean_document(node):
        return

    logger.debug("Cleaning document.")
-    clean_list = ["object", "h1"]
+    clean_list = ["object"]
    to_drop = []

-    # If there is only one h2, they are probably using it as a header and
-    # not a subheader, so remove it since we already have a header.
-    if len(node.findall(".//h2")) == 1:
-        logger.debug("Adding H2 to list of nodes to clean.")
-        clean_list.append("h2")
-
    for n in node.iter():
        logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib)
        # clean out any in-line style properties
--- a/tests/test_scoring.py
+++ b/tests/test_scoring.py
@@ -100,7 +100,7 @@ class TestLinkDensity(unittest.TestCase):
    def test_several_links(self):
        """This doc has a 3 links with the majority of content."""
        doc = Article(load_snippet('document_absolute_url.html'))
-        self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/24)
+        self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/37)


 class TestClassWeight(unittest.TestCase):