Added string utils for handling whitespace

11 years ago · e198b94ffb
parent 3449a33d87
commit e198b94ffb
1 changed files with 32 additions and 0 deletions
--- a/readability/utils.py
+++ b/readability/utils.py
@@ -1,5 +1,37 @@
 # -*- coding: utf8 -*-

+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import re
+
+
+def is_blank(text):
+    """
+    Tells whether ``text`` is empty (or otherwise falsy) or made up
+    solely of whitespace characters. Anything else gives ``False``.
+    """
+    return text.isspace() if text else True
+
+
+MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
+def normalize_whitespace(text):
+    """
+    Collapses every run of whitespace into a single character: LF (Unix
+    new line) if the run holds a CR or LF, a single space otherwise.
+    """
+    collapsed = MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)
+    return collapsed
+
+
+def _replace_whitespace(match):
+    """Picks the replacement for one matched run of whitespace."""
+    chunk = match.group()
+    has_line_break = "\n" in chunk or "\r" in chunk
+
+    # A run containing CR or LF collapses to LF, anything else to a space.
+    return "\n" if has_line_break else " "
+

 def cached_property(getter):
    """