"""URL helpers for the readability plugin: host extraction and link resolution."""
import logging

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:  # Python 2 keeps urlparse in its own top-level module
    from urlparse import urlparse


def host_for_url(url):
    """Return the network location (host[:port]) of *url*.

    Logs an error and returns ``None`` when *url* has no network location.

    >>> host_for_url('http://base/whatever/fdsh')
    'base'
    >>> host_for_url('invalid')
    """
    host = urlparse(url)[1]
    if not host:
        # Pass the URL as a logging argument so formatting happens lazily.
        logging.error("could not extract host from URL: %r", url)
        return None
    return host


def absolute_url(url, base_href):
    """Resolve *url* against *base_href* and return an absolute URL.

    Surrounding whitespace in *url* is stripped first.  A *url* that
    already carries a scheme is returned unchanged.  *base_href* is
    expected to be an absolute URL (scheme and host present).

    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'

    >>> absolute_url('//cdn.example.com/lib.js', 'https://base/page')
    'https://cdn.example.com/lib.js'
    """
    url = url.strip()
    if urlparse(url)[0]:
        # Already has a scheme: nothing to resolve.
        return url

    base_url_parts = urlparse(base_href)

    if url.startswith('//'):
        # Protocol-relative link: it names its own host and only borrows
        # the scheme from the base, so it must not be appended to the
        # base server.
        return base_url_parts[0] + ':' + url

    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        # Host-relative link: replaces the whole base path.
        return base_server + url

    # Path-relative link: resolve against the directory part of the base
    # path (everything up to and including the last '/').
    path = base_url_parts[2]
    if '/' in path:
        path = path.rsplit('/', 1)[0] + '/'
    else:
        path = '/'
    return base_server + path + url


if __name__ == '__main__':
    import doctest
    doctest.testmod()