# -*- coding: utf8 -*- """Generate a clean nice starting html document to process for an article.""" from __future__ import absolute_import import logging import re import chardet from lxml.etree import ParserError, XMLSyntaxError, tounicode from lxml.html import HTMLParser, document_fromstring from ._compat import to_bytes, to_unicode, unicode, unicode_compatible from .utils import cached_property, ignored logger = logging.getLogger("breadability") TAG_MARK_PATTERN = re.compile(to_bytes(r"]*>\s*")) UTF8_PARSER = HTMLParser(encoding="utf8") CHARSET_META_TAG_PATTERN = re.compile( br"""]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE ) def decode_html(html): """ Converts bytes stream containing an HTML page into Unicode. Tries to guess character encoding from meta tag of by "chardet" library. """ if isinstance(html, unicode): return html match = CHARSET_META_TAG_PATTERN.search(html) if match: declared_encoding = match.group(1).decode("ASCII") # proceed unknown encoding as if it wasn't found at all with ignored(LookupError): return html.decode(declared_encoding, "ignore") # try to enforce UTF-8 firstly with ignored(UnicodeDecodeError): return html.decode("utf8") text = TAG_MARK_PATTERN.sub(to_bytes(" "), html) diff = text.decode("utf8", "ignore").encode("utf8") sizes = len(diff), len(text) # 99% of text is UTF-8 if abs(len(text) - len(diff)) < max(sizes) * 0.01: return html.decode("utf8", "ignore") # try detect encoding encoding = "utf8" encoding_detector = chardet.detect(text) if encoding_detector["encoding"]: encoding = encoding_detector["encoding"] return html.decode(encoding, "ignore") BREAK_TAGS_PATTERN = re.compile( to_unicode(r"(?:<\s*[bh]r[^>]*>\s*)+"), re.IGNORECASE ) def convert_breaks_to_paragraphs(html): """ Converts
tag and multiple
tags into paragraph. """ logger.debug("Converting multiple
&
tags into

.") return BREAK_TAGS_PATTERN.sub(_replace_break_tags, html) def _replace_break_tags(match): tags = match.group() if to_unicode("

") elif tags.count(to_unicode(" 1: return to_unicode("

") else: return tags def build_document(html_content, base_href=None): """Requires that the `html_content` not be None""" assert html_content is not None if isinstance(html_content, unicode): html_content = html_content.encode("utf8", "xmlcharrefreplace") try: document = document_fromstring(html_content, parser=UTF8_PARSER) except (ParserError, XMLSyntaxError): raise ValueError("Failed to parse document contents.") if base_href: document.make_links_absolute(base_href, resolve_base_href=True) else: document.resolve_base_href() return document @unicode_compatible class OriginalDocument(object): """The original document to process.""" def __init__(self, html, url=None): self._html = html self._url = url @property def url(self): """Source URL of HTML document.""" return self._url def __unicode__(self): """Renders the document as a string.""" return tounicode(self.dom) @cached_property def dom(self): """Parsed HTML document from the input.""" html = self._html if not isinstance(html, unicode): html = decode_html(html) html = convert_breaks_to_paragraphs(html) document = build_document(html, self._url) return document @cached_property def links(self): """Links within the document.""" return self.dom.findall(".//a") @cached_property def title(self): """Title attribute of the parsed document.""" title_element = self.dom.find(".//title") if title_element is None or title_element.text is None: return "" else: return title_element.text.strip()