use a more lenient parser
parent
ad3d52ade4
commit
c0ca60ee26
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,145 @@
|
|||||||
|
import re
|
||||||
|
from url_helpers import absolute_url
|
||||||
|
from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit
|
||||||
|
from logging import error
|
||||||
|
|
||||||
|
# Public API of this module.
__all__ = ['Unparseable', 'parse', 'get_title', 'get_body', 'ascii']
|
||||||
|
|
||||||
|
def debug(s):
    """No-op debug hook; replace with a real logger to trace parsing."""
    pass
|
||||||
|
|
||||||
|
class Unparseable(ValueError):
    """Raised when every available parse method fails on the input."""
|
||||||
|
|
||||||
|
def parse(raw_content, base_href=None, notify=lambda x: None):
|
||||||
|
for parse_method in _parse_methods():
|
||||||
|
try:
|
||||||
|
return parse_method(raw_content, base_href)
|
||||||
|
except HTMLParseError, e:
|
||||||
|
notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
|
||||||
|
continue
|
||||||
|
raise Unparseable()
|
||||||
|
|
||||||
|
def get_title(soup):
    """Return the document title with whitespace normalized, or None.

    Bug fix: the old code used getattr(soup.title, 'string', ''), which
    only guards against a *missing* <title> tag.  A present-but-empty
    tag has .string == None, and unicode(None) is the truthy string
    'None', so such pages got the literal title "None".  Defaulting to
    None and converting only a real value avoids that.
    """
    title = getattr(soup.title, 'string', None)
    if not title:
        return None
    return normalize_spaces(unicode(title))
|
||||||
|
|
||||||
|
|
||||||
|
def get_body(soup):
    """Return the cleaned inner HTML of the document body.

    Removes script/link/style elements and strips nuisance attributes.
    If the cleaning regexes mangled the markup so badly it no longer
    parses, fall back to the uncleaned HTML (and log the breakage).
    """
    # extract() mutates the tree; a plain loop makes the side effect
    # explicit (the old code abused a list comprehension for this).
    for elem in soup.findAll(['script', 'link', 'style']):
        elem.extract()
    raw_html = unicode(soup.body or soup)
    cleaned = clean_attributes(raw_html)
    try:
        # sanity check: the cleaned markup must still be parseable
        BeautifulSoup(cleaned)
        return cleaned
    except HTMLParseError:
        error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html
|
||||||
|
|
||||||
|
def ascii(s):
    """Decode *s* as ASCII, silently dropping any non-ASCII bytes.

    NOTE: shadows the builtin ascii(); the name is part of this
    module's public API (see __all__), so it is kept.
    """
    return s.decode('ascii', 'ignore')
|
||||||
|
|
||||||
|
class Replacement(object):
|
||||||
|
def __init__(self, desc, regex, replacement):
|
||||||
|
self.desc = desc
|
||||||
|
self.regex = regex
|
||||||
|
self.replacement = replacement
|
||||||
|
|
||||||
|
def apply(self, content):
|
||||||
|
# # useful for debugging:
|
||||||
|
# try:
|
||||||
|
# print self. desc + ':' + str(self.regex.findall(content))
|
||||||
|
# except RuntimeError: pass
|
||||||
|
return self.regex.sub(self.replacement, content)
|
||||||
|
|
||||||
|
def beautiful_soup(content, base_href):
    """Parse *content*; when *base_href* is given, absolutize references."""
    parsed = BeautifulSoup(content)
    if not base_href:
        return parsed
    _fix_references(parsed, base_href)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _make_absolute_links(soup, base_href):
    """Rewrite every anchor's href to be absolute against *base_href*."""
    anchors = soup.findAll('a', attrs={'href': True})
    for anchor in anchors:
        anchor['href'] = absolute_url(anchor['href'], base_href)
|
||||||
|
|
||||||
|
def _make_absolute_images(soup, base_href):
    """Rewrite every image's src to be absolute against *base_href*."""
    images = soup.findAll('img', attrs={'src': True})
    for image in images:
        image['src'] = absolute_url(image['src'], base_href)
|
||||||
|
|
||||||
|
def _fix_references(soup, base_href):
    """Absolutize both link hrefs and image srcs against *base_href*."""
    for fixer in (_make_absolute_links, _make_absolute_images):
        fixer(soup, base_href)
|
||||||
|
|
||||||
|
# A bunch of regexes to hack around lousy HTML, applied in order by
# _remove_crufty_html before handing the markup to BeautifulSoup.
dodgy_regexes = (
    # drop inline <script> blocks entirely
    Replacement('javascript',
                regex=re.compile('<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
                replacement=''),

    # e.g. attr="value"" -> attr="value"
    Replacement('double double-quoted attributes',
                regex=re.compile('(="[^"]+")"+'),
                replacement='\\1'),

    # a tag that runs straight into the next opening tag gets closed
    Replacement('unclosed tags',
                regex=re.compile('(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
                replacement='\\1>\\2'),

    # e.g. width="100 height="50" -> width="100" height="50"
    Replacement('unclosed (numerical) attribute values',
                regex=re.compile('(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
                replacement='\\1"\\2'),
)
|
||||||
|
|
||||||
|
|
||||||
|
# helpers for parsing
|
||||||
|
def normalize_spaces(s):
    """Collapse every run of whitespace in *s* to a single space,
    trimming leading and trailing whitespace as a side effect."""
    return ' '.join(s.split())
|
||||||
|
|
||||||
|
def _remove_crufty_html(content):
    """Run every cruft-removal regex over *content*, in declaration order."""
    for fix in dodgy_regexes:
        content = fix.apply(content)
    return content
|
||||||
|
|
||||||
|
def _parse_methods():
    """Return the parse strategies, ordered strictest-first.

    The inner functions keep their names because parse() reports
    failures via each method's __name__.
    """
    def unicode_cleansed(content, base_href):
        # let UnicodeDammit sniff the encoding, then scrub the markup
        markup = UnicodeDammit(content, isHTML=True).markup
        cleaned = _remove_crufty_html(markup)
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    def ascii_cleansed(content, base_href):
        # last resort: throw away all non-ASCII bytes before scrubbing
        cleaned = _remove_crufty_html(ascii(content))
        debug("Cleaned content: %s" % (cleaned,))
        return beautiful_soup(cleaned, base_href)

    return (beautiful_soup, unicode_cleansed, ascii_cleansed)
|
||||||
|
|
||||||
|
# Strip out a set of nuisance HTML attributes that can mess up rendering
# in RSS feeds.  Attribute names below are regex fragments, not literals.
bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']

# the three shapes an attribute value can take
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'

# matches a whole tag containing exactly one nuisance attribute, capturing
# everything before (group 1) and after (group 2) that attribute
htmlstrip = re.compile(
    "<"                                                                   # open
    "([^>]+) "                                                            # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) +                                 # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +       # value
    "([^>]*)"                                                             # postfix
    ">"                                                                   # end
    , re.I)

def clean_attributes(html):
    """Remove every nuisance attribute from every tag in *html*.

    Each substitution removes one attribute per tag, so we loop until
    no tag contains any nuisance attribute at all.
    """
    cleaned = html
    while htmlstrip.search(cleaned):
        cleaned = htmlstrip.sub('<\\1\\2>', cleaned)
    return cleaned
|
||||||
|
|
@ -0,0 +1,52 @@
|
|||||||
|
import logging
|
||||||
|
from urlparse import urlparse
|
||||||
|
|
||||||
|
def host_for_url(url):
    """Return the network host of *url*, or None (with an error logged).

    >>> host_for_url('http://base/whatever/fdsh')
    'base'
    >>> host_for_url('invalid')
    """
    host = urlparse(url)[1]
    if host:
        return host
    logging.error("could not extract host from URL: %r" % (url,))
    return None
|
||||||
|
|
||||||
|
def absolute_url(url, base_href):
    """Resolve *url* against *base_href*, returning an absolute URL.

    A URL that already carries a scheme is returned unchanged (after
    whitespace stripping).

    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    # already absolute (has a scheme)?
    if urlparse(url)[0]:
        return url

    parts = urlparse(base_href)
    server = '://'.join(parts[:2])
    if url.startswith('/'):
        # host-relative: graft onto scheme://host
        return server + url
    # document-relative: replace the base path's last segment
    directory = parts[2]
    if '/' in directory:
        directory = directory.rsplit('/', 1)[0] + '/'
    else:
        directory = '/'
    return server + directory + url
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # run the doctests embedded in this module
    import doctest
    doctest.testmod()
|
Loading…
Reference in New Issue