Don't remove h1/h2 elements from readable article

pull/21/head
Mišo Belica 11 years ago
parent c9e087d077
commit 5c20673d45

@@ -165,15 +165,9 @@ def clean_document(node):
return
logger.debug("Cleaning document.")
- clean_list = ["object", "h1"]
+ clean_list = ["object"]
to_drop = []
- # If there is only one h2, they are probably using it as a header and
- # not a subheader, so remove it since we already have a header.
- if len(node.findall(".//h2")) == 1:
- logger.debug("Adding H2 to list of nodes to clean.")
- clean_list.append("h2")
for n in node.iter():
logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib)
# clean out any in-line style properties

@@ -100,7 +100,7 @@ class TestLinkDensity(unittest.TestCase):
def test_several_links(self):
"""This doc has a 3 links with the majority of content."""
doc = Article(load_snippet('document_absolute_url.html'))
- self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/24)
+ self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/37)
class TestClassWeight(unittest.TestCase):

Loading…
Cancel
Save