diff --git a/readability/htmls.py b/readability/htmls.py
index 55d7516..2d30abb 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,96 +1,112 @@
-from cleaners import normalize_spaces, clean_attributes
-from encodings import get_encoding
-from lxml.html import tostring
-import logging
-import lxml.html
-import re
-
-logging.getLogger().setLevel(logging.DEBUG)
-
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
-
-def build_doc(page):
- enc = get_encoding(page)
- page_enc = page.decode(enc, 'replace').encode('utf-8')
- doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
- return doc
-
-def js_re(src, pattern, flags, repl):
- return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
-
-
-def normalize_entities(cur_title):
- entities = {
- u'\u2014':'-',
- u'\u2013':'-',
- u'&mdash;': '-',
- u'&ndash;': '-',
- u'\u00A0': ' ',
- u'\u00AB': '"',
- u'\u00BB': '"',
- u'&quot;': '"',
- }
- for c, r in entities.iteritems():
- if c in cur_title:
- cur_title = cur_title.replace(c, r)
-
- return cur_title
-
-def norm_title(title):
- return normalize_entities(normalize_spaces(title))
-
-def get_title(doc):
- title = doc.find('.//title').text
- if not title:
- return '[no-title]'
-
- return norm_title(title)
-
-def shortify_title(doc):
- title = doc.find('.//title').text
- if not title:
- return '[no-title]'
-
- title = orig = norm_title(title)
-
- for delimiter in [' | ', ' - ', ' :: ', ' / ']:
- if delimiter in title:
- parts = orig.split(delimiter)
- if len(parts[0].split()) >= 4:
- title = parts[0]
- break
- elif len(parts[-1].split()) >= 4:
- title = parts[-1]
- break
- else:
- if ': ' in title:
- parts = orig.split(': ')
- if len(parts[-1].split()) >= 4:
- title = parts[-1]
- else:
- title = orig.split(': ', 1)[1]
-
- if len(title.split()) <= 4:
- h1 = list(doc.iterfind('.//h1'))
- if len(h1) == 1:
- title = norm_title(h1[0].text)
- elif len(h1) == 0:
- h2 = list(doc.iterfind('.//h2'))
- if len(h1) == 1:
- title = norm_title(h2[1].text)
-
- if not 15 < len(title) < 150:
- return orig
-
- return title
-
-def get_body(doc):
- [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
- raw_html = unicode(tostring(doc.body or doc))
- cleaned = clean_attributes(raw_html)
- try:
- #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
- return cleaned
- except Exception: #FIXME find the equivalent lxml error
- logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
- return raw_html
+from cleaners import normalize_spaces, clean_attributes
+from encodings import get_encoding
+from lxml.html import tostring
+import logging
+import lxml.html
+import re
+
+logging.getLogger().setLevel(logging.DEBUG)
+
+utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+def build_doc(page):
+ enc = get_encoding(page)
+ page_enc = page.decode(enc, 'replace').encode('utf-8')
+ doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
+ return doc
+
+def js_re(src, pattern, flags, repl):
+ return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+
+
+def normalize_entities(cur_title):
+ entities = {
+ u'\u2014':'-',
+ u'\u2013':'-',
+ u'&mdash;': '-',
+ u'&ndash;': '-',
+ u'\u00A0': ' ',
+ u'\u00AB': '"',
+ u'\u00BB': '"',
+ u'&quot;': '"',
+ }
+ for c, r in entities.iteritems():
+ if c in cur_title:
+ cur_title = cur_title.replace(c, r)
+
+ return cur_title
+
+def norm_title(title):
+ return normalize_entities(normalize_spaces(title))
+
+def get_title(doc):
+ title = doc.find('.//title').text
+ if not title:
+ return '[no-title]'
+
+ return norm_title(title)
+
+def add_match(collection, text, orig):
+ text = norm_title(text)
+ if len(text.split()) >= 2 and len(text) >= 15:
+ if text.replace('"', '') in orig.replace('"', ''):
+ collection.add(text)
+
+def shorten_title(doc):
+ title = doc.find('.//title').text
+ if not title:
+ return ''
+
+ title = orig = norm_title(title)
+
+ candidates = set()
+
+ for item in ['.//h1', './/h2', './/h3']:
+ for e in list(doc.iterfind(item)):
+ if e.text:
+ add_match(candidates, e.text, orig)
+ if e.text_content():
+ add_match(candidates, e.text_content(), orig)
+
+ for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+ for e in doc.cssselect(item):
+ if e.text:
+ add_match(candidates, e.text, orig)
+ if e.text_content():
+ add_match(candidates, e.text_content(), orig)
+
+ if candidates:
+ title = sorted(candidates, key=len)[-1]
+ else:
+ for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+ if delimiter in title:
+ parts = orig.split(delimiter)
+ if len(parts[0].split()) >= 4:
+ title = parts[0]
+ break
+ elif len(parts[-1].split()) >= 4:
+ title = parts[-1]
+ break
+ else:
+ if ': ' in title:
+ parts = orig.split(': ')
+ if len(parts[-1].split()) >= 4:
+ title = parts[-1]
+ else:
+ title = orig.split(': ', 1)[1]
+
+ if not 15 < len(title) < 150:
+ return orig
+
+ return title
+
+def get_body(doc):
+ [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+ raw_html = unicode(tostring(doc.body or doc))
+ cleaned = clean_attributes(raw_html)
+ try:
+ #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+ return cleaned
+ except Exception: #FIXME find the equivalent lxml error
+ logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+ return raw_html
diff --git a/readability/readability.py b/readability/readability.py
index 11f8da0..0802c6b 100644
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
from collections import defaultdict
from cleaners import html_cleaner, clean_attributes
-from htmls import build_doc, get_body, get_title
+from htmls import build_doc, get_body, get_title, shorten_title
from lxml.etree import tostring, tounicode
import logging
import re
@@ -15,12 +15,12 @@ REGEXES = {
'positiveRe': re.compile('caption|article|body|content|entry|hentry|page|pagination|post|text',re.I),
'negativeRe': re.compile('adwrapper|ad_wrapper|share|bookmark|nav|combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
- 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
- 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
- 'trimRe': re.compile('^\s+|\s+$/'),
- 'normalizeRe': re.compile('\s{2,}/'),
- 'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
- 'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+ #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+ #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
+ #'trimRe': re.compile('^\s+|\s+$/'),
+ #'normalizeRe': re.compile('\s{2,}/'),
+ #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+ #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
}
def describe(node):
@@ -37,6 +37,15 @@ def log_candidates(candidates, print_format=""):
#def _text(node):
# return " ".join(node.findall(text=True))
+def to_int(x):
+ if not x: return None
+ x = x.strip()
+ if x.endswith('px'):
+ return int(x[:-2])
+ if x.endswith('em'):
+ return int(x[:-2]) * 12
+ return int(x)
+
class Unparseable(ValueError):
pass
@@ -72,6 +81,9 @@ class Document:
def title(self):
return get_title(self._html(True))
+ def short_title(self):
+ return shorten_title(self._html(True))
+
def summary(self):
try:
ruthless = True
@@ -263,9 +275,10 @@ class Document:
def sanitize(self, node, candidates):
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
- if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree()
+ if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+ header.drop_tree()
- for elem in self.tags(node, "form", "iframe"):
+ for elem in self.tags(node, "form", "iframe", "textarea"):
elem.drop_tree()
allowed = {}
# Conditionally clean