Trim long runs of repeated whitespace to make clean() faster

When Readability encounters a long run of repeated whitespace, the cleanup regexes in clean() take forever to run, so cap each whitespace run at 255 characters. Additionally, add a test that uses "timeout_decorator" to check the extraction performance.
2018-09-26 08:26:08 +03:00 · 2018-09-26 08:26:08 +03:00 · 747c46abce
commit 747c46abce
parent 8235f0794c
3 changed files with 19 additions and 0 deletions
--- a/readability/readability.py
+++ b/readability/readability.py
@ -54,6 +54,9 @@ def to_int(x):


 def clean(text):
+    # Cap every whitespace run at 255 spaces (newlines/tabs in such a run become spaces); longer runs make the regexes below run practically forever
+    text = re.sub(r'\s{255,}', ' ' * 255, text)
+
    text = re.sub('\s*\n\s*', '\n', text)
    text = re.sub('\t|[ \t]{2,}', ' ', text)
    return text.strip()
--- a/setup.py
+++ b/setup.py
@ -28,6 +28,10 @@ setup(
        lxml_requirement,
        "cssselect"
        ],
+    tests_require=[
+        # Provides the timeout decorator used by the performance tests
+        "timeout_decorator",
+    ],
    classifiers=[
        "Environment :: Web Environment",
        "Intended Audience :: Developers",
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@ -2,6 +2,7 @@ import os
 import unittest

 from readability import Document
+import timeout_decorator


 SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
@ -92,3 +93,14 @@ class TestArticleOnly(unittest.TestCase):
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)
+
+    # Regression test: a run of 1,000,000 spaces must not make summary() exceed the 3-second timeout
+    @timeout_decorator.timeout(seconds=3, use_signals=False)
+    def test_many_repeated_spaces(self):
+        long_space = ' ' * 1000000
+        sample = '<html><body><p>foo' + long_space + '</p></body></html>'
+
+        doc = Document(sample)
+        s = doc.summary()
+
+        assert 'foo' in s