From 3ae64f165eae7500106324c9521a5fd6ad52fa0e Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Fri, 15 Jun 2012 20:15:37 -0400 Subject: [PATCH] Update and merge --- src/breadability/__init__.py | 1 - src/breadability/readable.py | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/breadability/__init__.py b/src/breadability/__init__.py index 5754926..856ce1d 100644 --- a/src/breadability/__init__.py +++ b/src/breadability/__init__.py @@ -1,2 +1 @@ VERSION = '0.1.2' -import client diff --git a/src/breadability/readable.py b/src/breadability/readable.py index 648cfa9..80e7208 100644 --- a/src/breadability/readable.py +++ b/src/breadability/readable.py @@ -1,5 +1,4 @@ import re -from lxml.etree import Element from lxml.etree import tounicode from lxml.etree import tostring from lxml.html.clean import Cleaner @@ -8,6 +7,7 @@ from lxml.html import fromstring from operator import attrgetter from pprint import PrettyPrinter +from breadability.document import build_doc from breadability.document import OriginalDocument from breadability.logconfig import LOG from breadability.logconfig import LNODE @@ -202,13 +202,17 @@ def prep_article(doc): LOG.debug('Cleaning document') clean_list = ['object', 'h1'] + # To start out, take our node and reload it so that our iterator is + # reset and we can process it completely. + re_node = build_doc(tounicode(node)) + # If there is only one h2, they are probably using it as a header and # not a subheader, so remove it since we already have a header. - if len(node.findall('.//h2')) == 1: + if len(re_node.findall('.//h2')) == 1: LOG.debug('Adding H2 to list of nodes to clean.') clean_list.append('h2') - for n in node.iter(tag=Element): + for n in re_node.iter(): LNODE.log(n, 2, "Cleaning iter node") # clean out any incline style properties if 'style' in n.attrib: @@ -262,7 +266,7 @@ def prep_article(doc): if n.getparent() is not None: n.drop_tree() - return node + return re_node def clean_conditionally(node): """Remove the clean_el if it looks like bad content based on rules."""