Trim many repeated spaces to make clean() faster
When Readability encounters long runs of repeated whitespace, the cleanup regexes in clean() take forever to run, so trim each such run to at most 255 characters. Additionally, test the extraction performance using a timeout from "timeout_decorator".
This commit is contained in:
parent
8235f0794c
commit
747c46abce
@ -54,6 +54,9 @@ def to_int(x):
|
||||
|
||||
|
||||
def clean(text):
    """Normalize the whitespace in *text* and strip both ends.

    The substitutions run in this order:

    1. Every run of 255 or more whitespace characters is replaced by
       exactly 255 spaces.
    2. Each newline, together with any whitespace around it, becomes a
       single newline.
    3. Each tab, and each run of two or more spaces/tabs, becomes a
       single space.

    Note that step 1 substitutes plain spaces, so newlines inside a run of
    255+ whitespace characters do not survive to step 2.

    :param text: the string to clean.
    :returns: the cleaned string with leading/trailing whitespace removed.
    """
    # Many spaces make the following regexes run forever, so cap the
    # length of any whitespace run before applying them.
    text = re.sub(r'\s{255,}', ' ' * 255, text)
    # Raw strings keep "\s" from being parsed as an (invalid) string
    # escape; the compiled patterns are the same as before.
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()
|
||||
|
4
setup.py
4
setup.py
@ -28,6 +28,10 @@ setup(
|
||||
lxml_requirement,
|
||||
"cssselect"
|
||||
],
|
||||
tests_require=[
|
||||
# Test timeouts
|
||||
"timeout_decorator",
|
||||
],
|
||||
classifiers=[
|
||||
"Environment :: Web Environment",
|
||||
"Intended Audience :: Developers",
|
||||
|
@ -2,6 +2,7 @@ import os
|
||||
import unittest
|
||||
|
||||
from readability import Document
|
||||
import timeout_decorator
|
||||
|
||||
|
||||
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
|
||||
@ -92,3 +93,14 @@ class TestArticleOnly(unittest.TestCase):
|
||||
assert('punctuation' in s)
|
||||
assert(not 'comment' in s)
|
||||
assert(not 'aside' in s)
|
||||
|
||||
# A huge run of spaces used to make some regexes run forever
@timeout_decorator.timeout(seconds=3, use_signals=False)
def test_many_repeated_spaces(self):
    """Summarizing a paragraph padded with a million spaces must finish
    within the decorator's 3-second timeout and still contain the text."""
    padding = ' ' * 1000000
    html = '<html><body><p>foo' + padding + '</p></body></html>'

    summary = Document(html).summary()

    assert 'foo' in summary
|
||||
|
Loading…
Reference in New Issue
Block a user