fix-bug: missing-pick-sentences

pull/33/head
郭江伟 6 years ago
parent d91236681e
commit a383c56c34

1
.gitignore vendored

@ -16,3 +16,4 @@ lib/
local/
man/
share/
.idea/

@ -21,7 +21,6 @@ from .scoring import (
)
from .utils import cached_property, shrink_text
html_cleaner = Cleaner(
scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
@ -30,7 +29,6 @@ html_cleaner = Cleaner(
annoying_tags=False, remove_tags=None, kill_tags=("noscript", "iframe"),
remove_unknown_tags=False, safe_attrs_only=False)
SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
ANNOTATION_TAGS = (
"a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
@ -193,6 +191,7 @@ def clean_document(node):
logger.debug("Dropping <%s>, it's insignificant", n.tag)
to_drop.append(n)
""" modified by guojw
# drop block element without content and children
if n.tag in ("div", "p"):
text_content = shrink_text(n.text_content())
@ -204,6 +203,7 @@ def clean_document(node):
# finally try out the conditional cleaning of the target node
if clean_conditionally(n):
to_drop.append(n)
"""
drop_nodes_with_parents(to_drop)

@ -13,7 +13,6 @@ from lxml.etree import tostring
from ._compat import to_bytes
from .utils import normalize_whitespace
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = re.compile(
@ -140,7 +139,9 @@ def is_unlikely_node(node):
def score_candidates(nodes):
"""Given a list of potential nodes, find some initial scores to start"""
MIN_HIT_LENTH = 25
# guojw
# MIN_HIT_LENTH = 25
MIN_HIT_LENTH = 1
candidates = {}
for node in nodes:

Loading…
Cancel
Save