import re
from logging import error

from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit

from url_helpers import absolute_url

__all__ = [
	'Unparseable',
	'parse',
	'get_title',
	'get_body',
	'ascii']

def debug(s): pass  # no-op hook; swap in a real logger to trace parsing

class Unparseable(ValueError):
	pass

def parse(raw_content, base_href=None, notify=lambda x: None):
	"""try each parse method in turn, returning the first soup
	that parses; raise Unparseable if they all fail"""
	for parse_method in _parse_methods():
		try:
			return parse_method(raw_content, base_href)
		except HTMLParseError, e:
			notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
			continue
	raise Unparseable()

def get_title(soup):
	# soup.title.string can be None, which unicode() would turn into u'None'
	title = unicode(getattr(soup.title, 'string', '') or '')
	if not title:
		return None
	return normalize_spaces(title)


def get_body(soup):
	for elem in soup.findAll(['script', 'link', 'style']):
		elem.extract()
	raw_html = unicode(soup.body or soup)
	cleaned = clean_attributes(raw_html)
	try:
		# re-parse to make sure attribute-stripping didn't mangle the markup
		BeautifulSoup(cleaned)
		return cleaned
	except HTMLParseError:
		error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
		return raw_html

def ascii(s):
	# drop (rather than choke on) any non-ascii bytes
	return s.decode('ascii', 'ignore')

class Replacement(object):
	def __init__(self, desc, regex, replacement):
		self.desc = desc
		self.regex = regex
		self.replacement = replacement

	def apply(self, content):
#		# useful for debugging:
#		try:
#			print self.desc + ': ' + str(self.regex.findall(content))
#		except RuntimeError: pass
		return self.regex.sub(self.replacement, content)

def beautiful_soup(content, base_href):
	soup = BeautifulSoup(content)
	if base_href:
		_fix_references(soup, base_href)
	return soup


def _make_absolute_links(soup, base_href):
	for link in soup.findAll('a', attrs={'href':True}):
		link['href'] = absolute_url(link['href'], base_href)

def _make_absolute_images(soup, base_href):
	for img in soup.findAll('img', attrs={'src':True}):
		img['src'] = absolute_url(img['src'], base_href)

def _fix_references(soup, base_href):
	_make_absolute_links(soup, base_href)
	_make_absolute_images(soup, base_href)

# a bunch of regexes to hack around lousy html
dodgy_regexes = (
	Replacement('javascript',
		regex=re.compile(r'<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
		replacement=''),

	Replacement('double double-quoted attributes',
		regex=re.compile(r'(="[^"]+")"+'),
		replacement=r'\1'),

	Replacement('unclosed tags',
		regex=re.compile(r'(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
		replacement=r'\1>\2'),

	Replacement('unclosed (numerical) attribute values',
		regex=re.compile(r'(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
		replacement=r'\1"\2'),
	)
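
# For instance, the 'double double-quoted attributes' rule rewrites
#   <img width="60"">   into   <img width="60">
# and the 'unclosed tags' rule rewrites
#   <b<p>   into   <b><p>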
	

# helpers for parsing
def normalize_spaces(s):
	"""replace any sequence of whitespace
	characters with a single space"""
	return ' '.join(s.split())

def _remove_crufty_html(content):
	for replacement in dodgy_regexes:
		content = replacement.apply(content)
	return content

def _parse_methods():
	def unicode_cleansed(content, base_href):
		content = UnicodeDammit(content, isHTML=True).markup
		cleaned = _remove_crufty_html(content)
		debug("Cleaned content: %s" % (cleaned,))
		return beautiful_soup(cleaned, base_href)

	def ascii_cleansed(content, base_href):
		content = ascii(content)
		cleaned = _remove_crufty_html(content)
		debug("Cleaned content: %s" % (cleaned,))
		return beautiful_soup(cleaned, base_href)

	return (
		beautiful_soup,
		unicode_cleansed,
		ascii_cleansed)

# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
	"([^>]+) " # prefix
	"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
	'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
	"([^>]*)"  # postfix
	">"        # end
, re.I)
def clean_attributes(html):
	while htmlstrip.search(html):
		html = htmlstrip.sub('<\\1\\2>', html)
	return html
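
# e.g. clean_attributes('<td width="40" style="color: red">x</td>')
# returns '<td>x</td>'

# A minimal usage sketch; the sample markup and base_href below are
# illustrative, not part of the original module.
if __name__ == '__main__':
	sample = ('<html><head><title> A  title </title></head>'
		'<body style="color:red"><a href="/x">link</a></body></html>')
	soup = parse(sample, base_href='http://example.com/')
	print get_title(soup)  # prints: A title
	print get_body(soup)   # cleaned body html with absolute links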