pull/23/head
Richard Harding 11 years ago
parent 17270db5f0
commit 347f3ea0b5

@ -49,6 +49,13 @@ clean_all: clean_venv
fi fi
bin/flake8: venv
bin/pip install flake8
lint: bin/flake8
flake8 breadability
# ########### # ###########
# Deploy # Deploy
# ########### # ###########

@ -19,9 +19,13 @@ string_types = (bytes, unicode,)
try: try:
# Assert to hush pyflakes about the unused import. This is a _compat
# module and we expect this to aid in other code importing urllib.
import urllib2 as urllib import urllib2 as urllib
assert urllib
except ImportError: except ImportError:
import urllib.request as urllib import urllib.request as urllib
assert urllib
def unicode_compatible(cls): def unicode_compatible(cls):

@ -30,6 +30,9 @@ logger = logging.getLogger("breadability")
TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*")) TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
UTF8_PARSER = HTMLParser(encoding="utf8")
def determine_encoding(page): def determine_encoding(page):
encoding = "utf8" encoding = "utf8"
text = TAG_MARK_PATTERN.sub(to_bytes(" "), page) text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
@ -54,7 +57,12 @@ def determine_encoding(page):
return encoding return encoding
BREAK_TAGS_PATTERN = re.compile(to_unicode(r"(?:<\s*[bh]r[^>]*>\s*)+"), re.IGNORECASE) BREAK_TAGS_PATTERN = re.compile(
to_unicode(r"(?:<\s*[bh]r[^>]*>\s*)+"),
re.IGNORECASE
)
def convert_breaks_to_paragraphs(html): def convert_breaks_to_paragraphs(html):
""" """
Converts <hr> tag and multiple <br> tags into paragraph. Converts <hr> tag and multiple <br> tags into paragraph.
@ -75,7 +83,6 @@ def _replace_break_tags(match):
return tags return tags
UTF8_PARSER = HTMLParser(encoding="utf8")
def build_document(html_content, base_href=None): def build_document(html_content, base_href=None):
"""Requires that the `html_content` not be None""" """Requires that the `html_content` not be None"""
assert html_content is not None assert html_content is not None

Loading…
Cancel
Save