breadability/breadability/document.py

# -*- coding: utf8 -*-

"""Generate a clean nice starting html document to process for an article."""

from __future__ import absolute_import

import logging
import re

import chardet
from lxml.etree import ParserError, XMLSyntaxError, tounicode
from lxml.html import HTMLParser, document_fromstring

from ._compat import to_bytes, to_unicode, unicode, unicode_compatible
from .utils import cached_property, ignored

logger = logging.getLogger("breadability")

TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
UTF8_PARSER = HTMLParser(encoding="utf8")
CHARSET_META_TAG_PATTERN = re.compile(
    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""",
    re.IGNORECASE
)

def decode_html(html):
    """
    Converts a bytes stream containing an HTML page into unicode.

    Tries to guess the character encoding from the meta tag or by using
    the "chardet" library.
    """
    if isinstance(html, unicode):
        return html

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
        # treat an unknown declared encoding as if it wasn't found at all
        with ignored(LookupError):
            return html.decode(declared_encoding, "ignore")

    # try to enforce UTF-8 first
    with ignored(UnicodeDecodeError):
        return html.decode("utf8")

    # check how much of the tag-stripped text survives a lossy UTF-8 round-trip
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
    diff = text.decode("utf8", "ignore").encode("utf8")
    sizes = len(diff), len(text)

    # at least 99% of the text decodes as UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return html.decode("utf8", "ignore")

    # fall back to detecting the encoding with chardet
    encoding = "utf8"
    encoding_detector = chardet.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]

    return html.decode(encoding, "ignore")
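
# Illustrative calls (a sketch only; the byte strings below are invented
# examples, not taken from the library's tests):
#
#     decode_html(b'<meta charset="windows-1251"><p>\xcf\xf0\xe8\xe2\xe5\xf2</p>')
#     # -> unicode text decoded with the encoding declared in the meta tag
#
#     decode_html(b"<p>plain ascii</p>")
#     # -> decoded via the UTF-8 branch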

BREAK_TAGS_PATTERN = re.compile(
    to_unicode(r"(?:<\s*[bh]r[^>]*>\s*)+"),
    re.IGNORECASE
)

def convert_breaks_to_paragraphs(html):
    """
    Converts an <hr> tag or a run of multiple <br> tags into a paragraph break.
    """
    logger.debug("Converting multiple <br> & <hr> tags into <p>.")

    return BREAK_TAGS_PATTERN.sub(_replace_break_tags, html)

def _replace_break_tags(match):
    tags = match.group()

    if to_unicode("<hr") in tags:
        return to_unicode("</p><p>")
    elif tags.count(to_unicode("<br")) > 1:
        return to_unicode("</p><p>")
    else:
        return tags
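
# Illustrative transformation (a sketch, not part of the library's test suite):
#
#     convert_breaks_to_paragraphs(u"one<br />\n<br />two")
#     # -> u"one</p><p>two"   (a run of two <br> tags becomes a paragraph break)
#
#     convert_breaks_to_paragraphs(u"one<br />two")
#     # -> returned unchanged, a single <br> is kept as-is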

def build_document(html_content, base_href=None):
    """Requires that the `html_content` not be None"""
    assert html_content is not None

    if isinstance(html_content, unicode):
        html_content = html_content.encode("utf8", "xmlcharrefreplace")

    try:
        document = document_fromstring(html_content, parser=UTF8_PARSER)
    except (ParserError, XMLSyntaxError):
        raise ValueError("Failed to parse document contents.")

    if base_href:
        document.make_links_absolute(base_href, resolve_base_href=True)
    else:
        document.resolve_base_href()

    return document
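
# Minimal usage sketch (the URL and markup here are hypothetical):
#
#     dom = build_document(b"<html><body><a href='/about'>About</a></body></html>",
#                          base_href="http://example.com/")
#     # dom is an lxml element tree; relative links such as "/about" are rewritten
#     # to absolute ones ("http://example.com/about") because base_href was given.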

@unicode_compatible
class OriginalDocument(object):
    """The original document to process."""

    def __init__(self, html, url=None):
        self._html = html
        self._url = url

    @property
    def url(self):
        """Source URL of HTML document."""
        return self._url

    def __unicode__(self):
        """Renders the document as a string."""
        return tounicode(self.dom)

    @cached_property
    def dom(self):
        """Parsed HTML document from the input."""
        html = self._html
        if not isinstance(html, unicode):
            html = decode_html(html)

        html = convert_breaks_to_paragraphs(html)
        document = build_document(html, self._url)

        return document

    @cached_property
    def links(self):
        """Links within the document."""
        return self.dom.findall(".//a")

    @cached_property
    def title(self):
        """Title of the parsed document."""
        title_element = self.dom.find(".//title")
        if title_element is None or title_element.text is None:
            return ""
        else:
            return title_element.text.strip()
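
# Minimal usage sketch, assuming the package is installed and importable
# (the HTML snippet is made up for illustration):
#
#     from breadability.document import OriginalDocument
#
#     doc = OriginalDocument(b"<html><head><title>Hello</title></head>"
#                            b"<body><p>Hi<br /><br />there</p></body></html>")
#     doc.title    # -> "Hello"
#     doc.links    # -> list of <a> elements (empty here)
#     unicode(doc) # serialized, cleaned DOM (str() on Python 3, via @unicode_compatible)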