Diffstat (limited to 'plugin/readability/page_parser.py')
-rwxr-xr-x  plugin/readability/page_parser.py  145 ----------
1 file changed, 0 insertions(+), 145 deletions(-)
diff --git a/plugin/readability/page_parser.py b/plugin/readability/page_parser.py
deleted file mode 100755
index 1c80ca7..0000000
--- a/plugin/readability/page_parser.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import re
-from url_helpers import absolute_url
-from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit
-from logging import error
-
-__all__ = [
-    'Unparseable',
-    'parse',
-    'get_title',
-    'get_body',
-    'ascii']
-
-def debug(s): pass
-
-class Unparseable(ValueError):
-    pass
-
-def parse(raw_content, base_href=None, notify=lambda x: None):
-    for parse_method in _parse_methods():
-        try:
-            return parse_method(raw_content, base_href)
-        except HTMLParseError, e:
-            notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
-            continue
-    raise Unparseable()
-
-def get_title(soup):
-    title = unicode(getattr(soup.title, 'string', ''))
-    if not title:
-        return None
-    return normalize_spaces(title)
-
-
-def get_body(soup):
-    [ elem.extract() for elem in soup.findAll(['script', 'link', 'style']) ]
-    raw_html = unicode(soup.body or soup)
-    cleaned = clean_attributes(raw_html)
-    try:
-        BeautifulSoup(cleaned)
-        return cleaned
-    except HTMLParseError:
-        error("cleansing broke html content: %s\n---------\n%s" % (raw_html,cleaned))
-        return raw_html
-
-def ascii(s):
-    return s.decode('ascii', 'ignore')
-
-class Replacement(object):
-    def __init__(self, desc, regex, replacement):
-        self.desc = desc
-        self.regex = regex
-        self.replacement = replacement
-
-    def apply(self, content):
-#        # useful for debugging:
-#        try:
-#            print self.desc + ':' + str(self.regex.findall(content))
-#        except RuntimeError: pass
-        return self.regex.sub(self.replacement, content)
-
-def beautiful_soup(content, base_href):
-    soup = BeautifulSoup(content)
-    if base_href:
-        _fix_references(soup, base_href)
-    return soup
-
-
-def _make_absolute_links(soup, base_href):
-    for link in soup.findAll('a', attrs={'href':True}):
-        link['href'] = absolute_url(link['href'], base_href)
-
-def _make_absolute_images(soup, base_href):
-    for img in soup.findAll('img', attrs={'src':True}):
-        img['src'] = absolute_url(img['src'], base_href)
-
-def _fix_references(soup, base_href):
-    _make_absolute_links(soup, base_href)
-    _make_absolute_images(soup, base_href)
-
-# a bunch of regexes to hack around lousy html
-dodgy_regexes = (
-    Replacement('javascript',
-        regex=re.compile('<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
-        replacement=''),
-
-    Replacement('double double-quoted attributes',
-        regex=re.compile('(="[^"]+")"+'),
-        replacement='\\1'),
-
-    Replacement('unclosed tags',
-        regex = re.compile('(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
-        replacement='\\1>\\2'),
-
-    Replacement('unclosed (numerical) attribute values',
-        regex = re.compile('(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
-        replacement='\\1"\\2'),
-    )
-
-
-# helpers for parsing
-def normalize_spaces(s):
-    """replace any sequence of whitespace
-    characters with a single space"""
-    return ' '.join(s.split())
-
-def _remove_crufty_html(content):
-    for replacement in dodgy_regexes:
-        content = replacement.apply(content)
-    return content
-
-def _parse_methods():
-    def unicode_cleansed(content, base_href):
-        content = UnicodeDammit(content, isHTML=True).markup
-        cleaned = _remove_crufty_html(content)
-        debug("Cleaned content: %s" % (cleaned,))
-        return beautiful_soup(cleaned, base_href)
-
-    def ascii_cleansed(content, base_href):
-        content = ascii(content)
-        cleaned = _remove_crufty_html(content)
-        debug("Cleaned content: %s" % (cleaned,))
-        return beautiful_soup(cleaned, base_href)
-
-    return (
-        beautiful_soup,
-        unicode_cleansed,
-        ascii_cleansed)
-
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
-bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']
-single_quoted = "'[^']+'"
-double_quoted = '"[^"]+"'
-non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
-    "([^>]*)" # postfix
-    ">" # end
-, re.I)
-def clean_attributes(html):
-    while htmlstrip.search(html):
-        html = htmlstrip.sub('<\\1\\2>', html)
-    return html
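Since the patch removes the module wholesale, a minimal sketch of how its public entry points (parse, get_title, get_body, Unparseable) were presumably driven may help in reviewing the deletion. The caller below is hypothetical, not taken from this repository; it assumes the same Python 2 / BeautifulSoup 3 environment the deleted file targets, that page_parser is importable, and that example_html is an invented input. The expected outputs in the comments are worked out from the regexes and functions above, not from the project's tests.

    # -*- coding: utf-8 -*-
    # Hypothetical caller for the deleted module: parse some markup via
    # the fallback chain, then pull out a normalized title and a cleaned
    # body. Python 2 / BeautifulSoup 3, matching the module itself.
    from page_parser import parse, get_title, get_body, Unparseable, clean_attributes

    example_html = """<html><head><title>  A   Cluttered
    Page </title><script>alert(1)</script></head>
    <body><p width="300" style="color: red">hello</p></body></html>"""

    try:
        # parse() tries plain BeautifulSoup first, then a UnicodeDammit-
        # cleansed pass, then an ascii-cleansed pass, and raises
        # Unparseable only once every method has failed.
        soup = parse(example_html, base_href='http://example.com/a/',
                     notify=lambda msg: None)
    except Unparseable:
        raise SystemExit('all parse methods failed')

    print get_title(soup)  # expected: u'A Cluttered Page' (whitespace collapsed)
    print get_body(soup)   # expected: script stripped, width/style attrs removed

    # The attribute stripper on its own (pure regex, no parsing): width,
    # height, style, *color and background* attributes are dropped so the
    # markup renders sanely in RSS readers.
    print clean_attributes('<td width="120" style="color: red">hi</td>')
    # expected: '<td>hi</td>'

The notable design choice in the removed code is the fallback chain in _parse_methods(): the raw markup is tried as-is, then re-decoded via UnicodeDammit, and finally coerced to ASCII, so progressively lossier representations of the input are only attempted after a stricter parse has raised HTMLParseError.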
