summaryrefslogtreecommitdiff
path: root/plugin/readability/url_helpers.py
blob: 8234c8dd5805f3a7cbef8100ff057d5622d8989b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import logging
from urlparse import urlparse

def host_for_url(url):
	"""
	Return the network-location component of `url` as parsed by urlparse.

	When no host can be extracted -- either because the URL has no
	network location, or because urlparse rejects it as malformed -- an
	error is logged and None is returned.

	>>> host_for_url('http://base/whatever/fdsh')
	'base'
	>>> host_for_url('invalid')
	>>> host_for_url('http://[broken')
	"""
	try:
		host = urlparse(url)[1]
	except ValueError:
		# urlparse raises ValueError for a malformed authority (for example
		# an unterminated IPv6 literal). Treat it like any other URL without
		# a usable host instead of letting the exception escape.
		host = None
	if not host:
		logging.error("could not extract host from URL: %r", url)
		return None
	return host

def absolute_url(url, base_href):
	"""
	Resolve `url` against `base_href` and return the resulting URL.

	Leading and trailing whitespace is stripped from `url` first. A URL that
	already carries a scheme is returned as-is. A protocol-relative
	reference ('//host/path') takes its scheme from `base_href`. A reference
	starting with a single '/' is attached to the base's scheme and host.
	Anything else is resolved against the directory part of the base path.
	Dot segments ('.', '..') are not normalized.

	>>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
	'http://base/whatever/ooo/foo'

	>>> absolute_url('foo/bar/', 'http://base')
	'http://base/foo/bar/'

	>>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
	'http://base/foo/bar'

	>>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
	'http://base/foo/bar'

	>>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
	'http://localhost/foo'

	>>> absolute_url('//cdn/lib.js', 'https://base/whatever/fdskf')
	'https://cdn/lib.js'
	"""
	url = url.strip()
	proto = urlparse(url)[0]
	if proto:
		return url

	base_url_parts = urlparse(base_href)
	if url.startswith('//'):
		# Protocol-relative reference: it already names its own host, so only
		# the scheme comes from the base. Without this branch it would be
		# appended to the base server as if it were a path.
		base_scheme = base_url_parts[0]
		return base_scheme + ':' + url if base_scheme else url

	base_server = '://'.join(base_url_parts[:2])
	if url.startswith('/'):
		return base_server + url
	else:
		path = base_url_parts[2]
		if '/' in path:
			# Keep the directory part of the base path, dropping the last segment.
			path = path.rsplit('/', 1)[0] + '/'
		else:
			# The base has no path at all (e.g. 'http://base'): resolve from root.
			path = '/'
		return base_server + path + url

if __name__ == '__main__':
	# When executed as a script, run the doctests embedded in the
	# docstrings of this module.
	from doctest import testmod
	testmod()