mirror of
https://github.com/bookieio/breadability
synced 2024-11-04 12:00:19 +00:00
Update and merge
This commit is contained in:
parent
edca1c74ba
commit
3ae64f165e
@ -1,2 +1 @@
|
||||
VERSION = '0.1.2'
|
||||
import client
|
||||
|
@ -1,5 +1,4 @@
|
||||
import re
|
||||
from lxml.etree import Element
|
||||
from lxml.etree import tounicode
|
||||
from lxml.etree import tostring
|
||||
from lxml.html.clean import Cleaner
|
||||
@ -8,6 +7,7 @@ from lxml.html import fromstring
|
||||
from operator import attrgetter
|
||||
from pprint import PrettyPrinter
|
||||
|
||||
from breadability.document import build_doc
|
||||
from breadability.document import OriginalDocument
|
||||
from breadability.logconfig import LOG
|
||||
from breadability.logconfig import LNODE
|
||||
@ -202,13 +202,17 @@ def prep_article(doc):
|
||||
LOG.debug('Cleaning document')
|
||||
clean_list = ['object', 'h1']
|
||||
|
||||
# To start out, take our node and reload it so that our iterator is
|
||||
# reset and we can process it completely.
|
||||
re_node = build_doc(tounicode(node))
|
||||
|
||||
# If there is only one h2, they are probably using it as a header and
|
||||
# not a subheader, so remove it since we already have a header.
|
||||
if len(node.findall('.//h2')) == 1:
|
||||
if len(re_node.findall('.//h2')) == 1:
|
||||
LOG.debug('Adding H2 to list of nodes to clean.')
|
||||
clean_list.append('h2')
|
||||
|
||||
for n in node.iter(tag=Element):
|
||||
for n in re_node.iter():
|
||||
LNODE.log(n, 2, "Cleaning iter node")
|
||||
# clean out any inline style properties
|
||||
if 'style' in n.attrib:
|
||||
@ -262,7 +266,7 @@ def prep_article(doc):
|
||||
if n.getparent() is not None:
|
||||
n.drop_tree()
|
||||
|
||||
return node
|
||||
return re_node
|
||||
|
||||
def clean_conditionally(node):
|
||||
"""Remove the clean_el if it looks like bad content based on rules."""
|
||||
|
Loading…
Reference in New Issue
Block a user