Trim many repeated spaces to make clean() faster

When Readability encounters long runs of repeated whitespace, the cleanup
regexes in clean() take forever to run, so trim any run of whitespace
to at most 255 characters before they execute.

Additionally, test extraction performance with "timeout_decorator".
Linas Valiukas 2018-09-26 08:26:08 +03:00
parent 8235f0794c
commit 747c46abce
3 changed files with 19 additions and 0 deletions


@@ -54,6 +54,9 @@ def to_int(x):
def clean(text):
    # Many spaces make the following regexes run forever
    text = re.sub(r'\s{255,}', ' ' * 255, text)
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()
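Why the pre-trim helps: on text containing a long whitespace run with no newline, r'\s*\n\s*' greedily consumes the rest of the run from every starting position and then backtracks character by character looking for a \n, so the total work grows quadratically with the run length. A minimal standalone sketch of the effect (the run length and the timing harness are illustrative only, not part of the commit):

import re
import time

# A long run of spaces with no newline: the worst case for r'\s*\n\s*'.
text = 'foo' + ' ' * 20000 + 'bar'

start = time.time()
re.sub(r'\s*\n\s*', '\n', text)
print('untrimmed: %.3fs' % (time.time() - start))

# The committed fix: collapse any run of 255+ whitespace characters first,
# which caps the backtracking at 255 characters per starting position.
trimmed = re.sub(r'\s{255,}', ' ' * 255, text)
start = time.time()
re.sub(r'\s*\n\s*', '\n', trimmed)
print('trimmed:   %.3fs' % (time.time() - start))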


@@ -28,6 +28,10 @@ setup(
        lxml_requirement,
        "cssselect"
    ],
    tests_require=[
        # Test timeouts
        "timeout_decorator",
    ],
    classifiers=[
        "Environment :: Web Environment",
        "Intended Audience :: Developers",


@@ -2,6 +2,7 @@ import os
import unittest
from readability import Document
import timeout_decorator
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
@@ -92,3 +93,14 @@ class TestArticleOnly(unittest.TestCase):
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)
    # Many spaces make some regexes run forever
    @timeout_decorator.timeout(seconds=3, use_signals=False)
    def test_many_repeated_spaces(self):
        long_space = ' ' * 1000000
        sample = '<html><body><p>foo' + long_space + '</p></body></html>'
        doc = Document(sample)
        s = doc.summary()
        assert 'foo' in s
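A note on the use_signals=False choice, based on timeout_decorator's documented behavior rather than this commit: instead of arming SIGALRM, the decorator runs the test body in a separate process via multiprocessing, so the timeout also fires on platforms and threads where Unix signals are unavailable; the trade-off is that the decorated function and its arguments must be picklable.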