author     ryanss  2015-01-12 23:59:00 -0500
committer  ryanss  2015-01-12 23:59:00 -0500
commit     7151ac23fd2f2e04c69d1fb642081201c8a16ce6 (patch)
tree       244632b642feb987263b5afe9e0d5e0ef265dbd4 /plugin/readability/page_parser.py
parent     0e72d030addb30976089a98bbfaffdc0f10eacd7 (diff)
download   vim-hn-7151ac23fd2f2e04c69d1fb642081201c8a16ce6.tar.gz
Add readability parsing for reading linked pages
Diffstat (limited to 'plugin/readability/page_parser.py')
-rwxr-xr-x  plugin/readability/page_parser.py  145
1 file changed, 145 insertions, 0 deletions
diff --git a/plugin/readability/page_parser.py b/plugin/readability/page_parser.py
new file mode 100755
index 0000000..1c80ca7
--- /dev/null
+++ b/plugin/readability/page_parser.py
@@ -0,0 +1,145 @@
+import re
+from url_helpers import absolute_url
+from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit
+from logging import error
+
+__all__ = [
+ 'Unparseable',
+ 'parse',
+ 'get_title',
+ 'get_body',
+ 'ascii']
+
+def debug(s): pass
+
+class Unparseable(ValueError):
+ pass
+
+def parse(raw_content, base_href=None, notify=lambda x: None):
+ for parse_method in _parse_methods():
+ try:
+ return parse_method(raw_content, base_href)
+ except HTMLParseError, e:
+ notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
+ continue
+ raise Unparseable()
+
+def get_title(soup):
+ title = unicode(getattr(soup.title, 'string', ''))
+ if not title:
+ return None
+ return normalize_spaces(title)
+
+
+def get_body(soup):
+ [ elem.extract() for elem in soup.findAll(['script', 'link', 'style']) ]
+ raw_html = unicode(soup.body or soup)
+ cleaned = clean_attributes(raw_html)
+ try:
+ BeautifulSoup(cleaned)
+ return cleaned
+ except HTMLParseError:
+ error("cleansing broke html content: %s\n---------\n%s" % (raw_html,cleaned))
+ return raw_html
+
+def ascii(s):
+ return s.decode('ascii', 'ignore')
+
+class Replacement(object):
+ def __init__(self, desc, regex, replacement):
+ self.desc = desc
+ self.regex = regex
+ self.replacement = replacement
+
+ def apply(self, content):
+# # useful for debugging:
+# try:
+#        print self.desc + ':' + str(self.regex.findall(content))
+# except RuntimeError: pass
+ return self.regex.sub(self.replacement, content)
+
+def beautiful_soup(content, base_href):
+ soup = BeautifulSoup(content)
+ if base_href:
+ _fix_references(soup, base_href)
+ return soup
+
+
+def _make_absolute_links(soup, base_href):
+ for link in soup.findAll('a', attrs={'href':True}):
+ link['href'] = absolute_url(link['href'], base_href)
+
+def _make_absolute_images(soup, base_href):
+ for img in soup.findAll('img', attrs={'src':True}):
+ img['src'] = absolute_url(img['src'], base_href)
+
+def _fix_references(soup, base_href):
+ _make_absolute_links(soup, base_href)
+ _make_absolute_images(soup, base_href)
+
+# a bunch of regexes to hack around lousy html
+dodgy_regexes = (
+ Replacement('javascript',
+ regex=re.compile('<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
+ replacement=''),
+
+ Replacement('double double-quoted attributes',
+ regex=re.compile('(="[^"]+")"+'),
+ replacement='\\1'),
+
+ Replacement('unclosed tags',
+ regex = re.compile('(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
+ replacement='\\1>\\2'),
+
+ Replacement('unclosed (numerical) attribute values',
+ regex = re.compile('(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
+ replacement='\\1"\\2'),
+ )
+
+
+# helpers for parsing
+def normalize_spaces(s):
+ """replace any sequence of whitespace
+ characters with a single space"""
+ return ' '.join(s.split())
+
+def _remove_crufty_html(content):
+ for replacement in dodgy_regexes:
+ content = replacement.apply(content)
+ return content
+
+def _parse_methods():
+ def unicode_cleansed(content, base_href):
+ content = UnicodeDammit(content, isHTML=True).markup
+ cleaned = _remove_crufty_html(content)
+ debug("Cleaned content: %s" % (cleaned,))
+ return beautiful_soup(cleaned, base_href)
+
+ def ascii_cleansed(content, base_href):
+ content = ascii(content)
+ cleaned = _remove_crufty_html(content)
+ debug("Cleaned content: %s" % (cleaned,))
+ return beautiful_soup(cleaned, base_href)
+
+ return (
+ beautiful_soup,
+ unicode_cleansed,
+ ascii_cleansed)
+
+# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']
+single_quoted = "'[^']+'"
+double_quoted = '"[^"]+"'
+non_space = '[^ "\'>]+'
+htmlstrip = re.compile("<" # open
+ "([^>]+) " # prefix
+ "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
+ '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+ "([^>]*)" # postfix
+ ">" # end
+, re.I)
+def clean_attributes(html):
+ while htmlstrip.search(html):
+ html = htmlstrip.sub('<\\1\\2>', html)
+ return html
+
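For context, the module added by this commit is meant to be driven roughly as in the sketch below: fetch a page, hand the raw HTML to parse() (which falls back through progressively more aggressive cleanups before raising Unparseable), then pull the normalized title and cleaned body out of the resulting soup. Only parse, get_title, get_body, and Unparseable come from page_parser.py above; the urllib2 fetch and the example URL are illustrative assumptions, not part of the commit.

    # Usage sketch (Python 2, matching the module). The URL and the urllib2
    # fetch are assumptions for illustration only.
    import urllib2
    from page_parser import parse, get_title, get_body, Unparseable

    url = 'http://example.com/article'
    raw_content = urllib2.urlopen(url).read()

    try:
        # parse() tries plain BeautifulSoup first, then unicode- and
        # ascii-cleansed variants, and raises Unparseable if all fail.
        soup = parse(raw_content, base_href=url, notify=lambda msg: None)
    except Unparseable:
        print 'could not parse %s' % url
    else:
        print get_title(soup)       # normalized <title> text, or None
        print get_body(soup)[:200]  # cleaned HTML body, nuisance attributes stripped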