Update and merge

This commit is contained in:
Richard Harding 2012-06-15 20:15:37 -04:00
parent edca1c74ba
commit 3ae64f165e
2 changed files with 8 additions and 5 deletions

View File

@ -1,2 +1 @@
VERSION = '0.1.2'
import client

View File

@ -1,5 +1,4 @@
import re
from lxml.etree import Element
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
@ -8,6 +7,7 @@ from lxml.html import fromstring
from operator import attrgetter
from pprint import PrettyPrinter
from breadability.document import build_doc
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
@ -202,13 +202,17 @@ def prep_article(doc):
LOG.debug('Cleaning document')
clean_list = ['object', 'h1']
# To start out, take our node and reload it so that our iterator is
# reset and we can process it completely.
re_node = build_doc(tounicode(node))
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(node.findall('.//h2')) == 1:
if len(re_node.findall('.//h2')) == 1:
LOG.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter(tag=Element):
for n in re_node.iter():
LNODE.log(n, 2, "Cleaning iter node")
# clean out any inline style properties
if 'style' in n.attrib:
@ -262,7 +266,7 @@ def prep_article(doc):
if n.getparent() is not None:
n.drop_tree()
return node
return re_node
def clean_conditionally(node):
"""Remove the clean_el if it looks like bad content based on rules."""