1
2
3
4 """
5 ::
6
7 # from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942
8 # Title: Cross-site scripting (XSS) defense
9 # Submitter: Josh Goldfoot (other recipes)
10 # Last Updated: 2006/08/05
11 # Version no: 1.0
12
13 """
14
15
16 from htmllib import HTMLParser
17 from cgi import escape
18 from urlparse import urlparse
19 from formatter import AbstractFormatter
20 from htmlentitydefs import entitydefs
21 from xml.sax.saxutils import quoteattr
22
23 __all__ = ['sanitize']
24
25
def xssescape(text):
    """Escape *text* so it is inert when placed in HTML output.

    ``<``, ``>``, ``&`` and ``"`` are replaced by their named entities
    (``escape(..., quote=True)``), and, for good measure, every ``:`` is
    replaced by the numeric entity ``&#58;`` so that escaped text can never
    be read back as a ``scheme:`` URL such as ``javascript:``.

    :param text: the string to neutralise
    :returns: the escaped string
    """

    # The colon must become its numeric entity; replacing ':' with ':'
    # would be a no-op and leave the colon untouched.
    return escape(text, quote=True).replace(':', '&#58;')
30
31
33
def __init__(
    self,
    permitted_tags=[
        'a',
        'b',
        'blockquote',
        'br/',
        'i',
        'li',
        'ol',
        'ul',
        'p',
        'cite',
        'code',
        'pre',
        'img/',
        ],
    allowed_attributes={
        'a': ['href', 'title'],
        'img': ['src', 'alt'],
        'blockquote': ['type'],
        },
    fmt=AbstractFormatter,
    strip_disallowed=False
    ):
    """
    Set up the cleaner's whitelist and its per-run state.

    @type permitted_tags: list of str
    @param permitted_tags: tag names allowed through; a name written with
        a trailing ``/`` (e.g. ``'br/'``) marks a tag that needs no
        closing tag
    @type allowed_attributes: dict
    @param allowed_attributes: maps a tag name to the list of attribute
        names that may be kept on that tag
    @param fmt: passed unchanged to ``HTMLParser.__init__``
    @type strip_disallowed: boolean
    @param strip_disallowed: if True, disallowed tags are dropped instead
        of being escaped
    """

    HTMLParser.__init__(self, fmt)

    # Sanitised output built up by the handlers, and the permitted tags
    # that have been emitted (most recent first).
    self.result = ''
    self.open_tags = []

    # Split the whitelist into ordinary tags and "no close needed" tags,
    # dropping the trailing '/' marker from the latter.
    regular = []
    self_closing = []
    for name in permitted_tags:
        if name[-1] == '/':
            self_closing.append(name[:-1])
        else:
            regular.append(name)
    self.permitted_tags = regular + self_closing
    self.requires_no_close = self_closing
    self.allowed_attributes = allowed_attributes

    # URL schemes consulted when a URL-valued attribute is checked.
    self.allowed_schemes = ['http', 'https', 'ftp', 'mailto']

    # Whether disallowed tags are stripped rather than escaped, and
    # whether the parser is currently inside such a stripped tag.
    self.strip_disallowed = strip_disallowed
    self.in_disallowed = False
74
76 if data and not self.in_disallowed:
77 self.result += xssescape(data)
78
80 if self.in_disallowed:
81 return
82 elif len(ref) < 7 and ref.isdigit():
83 self.result += '&#%s;' % ref
84 else:
85 self.result += xssescape('&#%s' % ref)
86
88 if self.in_disallowed:
89 return
90 elif ref in entitydefs:
91 self.result += '&%s;' % ref
92 else:
93 self.result += xssescape('&%s' % ref)
94
100
107 if tag not in self.permitted_tags:
108 if self.strip_disallowed:
109 self.in_disallowed = True
110 else:
111 self.result += xssescape('<%s>' % tag)
112 else:
113 bt = '<' + tag
114 if tag in self.allowed_attributes:
115 attrs = dict(attrs)
116 self.allowed_attributes_here = [x for x in
117 self.allowed_attributes[tag] if x in attrs
118 and len(attrs[x]) > 0]
119 for attribute in self.allowed_attributes_here:
120 if attribute in ['href', 'src', 'background']:
121 if self.url_is_acceptable(attrs[attribute]):
122 bt += ' %s="%s"' % (attribute,
123 attrs[attribute])
124 else:
125 bt += ' %s=%s' % (xssescape(attribute),
126 quoteattr(attrs[attribute]))
127 if bt == '<a' or bt == '<img':
128 return
129 if tag in self.requires_no_close:
130 bt += ' /'
131 bt += '>'
132 self.result += bt
133 self.open_tags.insert(0, tag)
134
136 bracketed = '</%s>' % tag
137 if tag not in self.permitted_tags:
138 if self.strip_disallowed:
139 self.in_disallowed = False
140 else:
141 self.result += xssescape(bracketed)
142 elif tag in self.open_tags:
143 self.result += bracketed
144 self.open_tags.remove(tag)
145
148
151
def url_is_acceptable(self, url):
    """
    Decide whether *url* may be kept as a URL-valued attribute.

    Accepted:
      - absolute URLs whose scheme is in ``self.allowed_schemes`` and
        whose host part contains a ``.``
      - URLs with an allowed scheme whose path contains ``@``
        (e.g. ``mailto:`` addresses)
      - scheme-less URLs whose path starts with ``/``

    Rejected: everything else, URLs that ``urlparse`` cannot parse, and
    URLs containing ``"``, ``<`` or ``>``.

    @type url: str
    @param url: the attribute value to check
    @rtype: boolean
    """

    # The start-tag code in this class writes an accepted URL between
    # double quotes without escaping it, so a raw quote or angle bracket
    # (never valid unencoded in a URL) could break out of the attribute
    # and inject new ones.
    if '"' in url or '<' in url or '>' in url:
        return False

    # urlparse raises ValueError on a malformed netloc (e.g. an unbalanced
    # '['); an unparseable URL is simply not acceptable.
    try:
        parsed = urlparse(url)
    except ValueError:
        return False

    scheme, netloc, path = parsed[0], parsed[1], parsed[2]
    scheme_ok = scheme in self.allowed_schemes
    return (scheme_ok and ('.' in netloc or '@' in path)) \
        or (scheme == '' and path.startswith('/'))
161
def strip(self, rawstring, escape=True):
    """
    Returns the argument stripped of potentially harmful
    HTML or Javascript code

    Each call starts from a clean state, so one instance can be used for
    several inputs without output, open tags or the ``escape`` choice of
    an earlier call leaking into a later one.

    @type rawstring: str
    @param rawstring: the markup to clean; any non-``str`` value is
        returned as ``str(rawstring)`` without being parsed
    @type escape: boolean
    @param escape: If True (default) it escapes the potentially harmful
        content, otherwise remove it
    @rtype: str
    @return: the cleaned markup, with closing tags appended for any
        permitted tags left open
    """

    if not isinstance(rawstring, str):
        return str(rawstring)

    # Normalise "<br/>" to "<br />" for tags that need no closing tag
    # before the text is handed to the parser.
    for tag in self.requires_no_close:
        rawstring = rawstring.replace("<%s/>" % tag, "<%s />" % tag)

    # Reset per-run state: without this, tags left open (or a disallowed
    # tag left unterminated) by a previous call would affect this one.
    self.result = ''
    self.open_tags = []
    self.in_disallowed = False

    # escape=False only applies to this call; restore the configured
    # policy afterwards instead of flipping it permanently.
    configured_strip = self.strip_disallowed
    if not escape:
        self.strip_disallowed = True
    try:
        self.feed(rawstring)
    finally:
        self.strip_disallowed = configured_strip

    # Close whatever is still open, in the order the tags are stored.
    self.result += ''.join(['</%s>' % endtag
                            for endtag in self.open_tags
                            if endtag not in self.requires_no_close])
    return self.result
184
198
199
def sanitize(text, permitted_tags=[
        'a',
        'b',
        'blockquote',
        'br/',
        'i',
        'li',
        'ol',
        'ul',
        'p',
        'cite',
        'code',
        'pre',
        'img/',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'table', 'tbody', 'thead', 'tfoot', 'tr', 'td', 'div',
        'strong', 'span',
        ],
        allowed_attributes={
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
            'td': ['colspan'],
        },
        escape=True):
    """
    Clean *text* of potentially harmful HTML with a fresh ``XssCleaner``.

    @param text: the markup to clean; a value that is not a string is
        returned as ``str(text)`` without being parsed
    @type permitted_tags: list of str
    @param permitted_tags: tags allowed through; a trailing ``/`` marks a
        tag that needs no closing tag
    @type allowed_attributes: dict
    @param allowed_attributes: maps a tag name to the attribute names
        that may be kept on it
    @type escape: boolean
    @param escape: If True (default) disallowed content is escaped,
        otherwise it is removed
    @rtype: str
    """
    if not isinstance(text, basestring):
        return str(text)
    if not isinstance(text, str):
        # A unicode string: XssCleaner.strip() only parses ``str`` and
        # hands anything else back as ``str(rawstring)`` unsanitised, so
        # encode to a byte string first to make sure it is cleaned.
        text = text.encode('utf-8')
    return XssCleaner(permitted_tags=permitted_tags,
                      allowed_attributes=allowed_attributes).strip(text, escape)
229