python-readability/readability/url_helpers.py

52 lines
1.2 KiB
Python
Raw Normal View History

2010-04-23 10:51:56 +00:00
import logging
from urlparse import urlparse
def host_for_url(url):
    """
    Return the network location (host) component of ``url``.

    When ``url`` has no host component, an error is logged and ``None``
    is returned instead.

    >>> host_for_url('http://base/whatever/fdsh')
    'base'
    >>> host_for_url('invalid')
    """
    netloc = urlparse(url).netloc
    if netloc:
        return netloc
    # No host could be parsed: report it and signal failure with None.
    logging.error("could not extract host from URL: %r" % (url,))
    return None
def absolute_url(url, base_href):
    """
    Resolve ``url`` against ``base_href`` and return an absolute URL.

    Leading and trailing whitespace is stripped from ``url`` first. Then:

    * a URL that already has a scheme is returned unchanged;
    * a protocol-relative URL (starting with ``//``) inherits the scheme
      of ``base_href``;
    * a URL starting with a single ``/`` is appended to the scheme and
      host of ``base_href``;
    * any other URL is appended to the directory part of the path of
      ``base_href``.

    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'
    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'
    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'
    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'
    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    >>> absolute_url('//cdn/lib.js', 'https://base/whatever/fdskf')
    'https://cdn/lib.js'
    """
    url = url.strip()
    if urlparse(url)[0]:
        # Already absolute (has its own scheme): nothing to resolve.
        return url

    base_url_parts = urlparse(base_href)

    if url.startswith('//'):
        # Protocol-relative reference: it names its own host, so only the
        # scheme comes from the base. Without this branch the URL would be
        # glued onto the base host ('http://base//cdn/lib.js').
        base_scheme = base_url_parts[0]
        if base_scheme:
            return base_scheme + ':' + url
        return url

    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        # Host-relative reference: replaces the whole base path.
        return base_server + url

    # Path-relative reference: keep the base path up to and including its
    # last '/', or fall back to the root when the base has no path.
    path = base_url_parts[2]
    if '/' in path:
        path = path.rsplit('/', 1)[0] + '/'
    else:
        path = '/'
    return base_server + path + url
if __name__ == '__main__':
    # When run as a script, execute the doctests embedded in this
    # module's docstrings.
    from doctest import testmod
    testmod()