Package gluon :: Module sanitizer
[hide private]
[frames | no frames]

Source Code for Module gluon.sanitizer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  :: 
  6   
  7      # from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942 
  8      # Title: Cross-site scripting (XSS) defense 
  9      # Submitter: Josh Goldfoot (other recipes) 
 10      # Last Updated: 2006/08/05 
 11      # Version no: 1.0 
 12   
 13  """ 
 14   
 15   
 16  from htmllib import HTMLParser 
 17  from cgi import escape 
 18  from urlparse import urlparse 
 19  from formatter import AbstractFormatter 
 20  from htmlentitydefs import entitydefs 
 21  from xml.sax.saxutils import quoteattr 
 22   
 23  __all__ = ['sanitize'] 
 24   
 25   
def xssescape(text):
    """Return ``text`` made safe for inclusion in HTML output.

    The text is first passed through ``escape`` with ``quote=True`` and
    then, for good measure, every ':' is replaced by the character
    reference ``&#58;``.
    """
    escaped = escape(text, quote=True)
    return escaped.replace(':', '&#58;')
30 31
class XssCleaner(HTMLParser):
    """HTML parser that rebuilds its input keeping only whitelisted markup.

    The parser callbacks append to ``self.result``: permitted tags are
    re-emitted with only their allowed attributes, while everything else is
    either escaped with ``xssescape`` or dropped, depending on
    ``strip_disallowed``.
    """

    def __init__(
        self,
        permitted_tags=[
            'a',
            'b',
            'blockquote',
            'br/',
            'i',
            'li',
            'ol',
            'ul',
            'p',
            'cite',
            'code',
            'pre',
            'img/',
        ],
        allowed_attributes={'a': ['href', 'title'], 'img': ['src', 'alt'],
                            'blockquote': ['type']},
        fmt=AbstractFormatter,
        strip_disallowed=False
    ):
        """
        @type permitted_tags: list of str
        @param permitted_tags: names of the tags to keep; a trailing '/'
            marks a tag that needs no closing tag (e.g. 'br/')
        @type allowed_attributes: dict
        @param allowed_attributes: maps a tag name to the list of attribute
            names that may be kept on that tag
        @param fmt: passed through unchanged to HTMLParser.__init__
        @type strip_disallowed: boolean
        @param strip_disallowed: if True, disallowed tags and the content
            between them are dropped instead of being escaped
        """
        HTMLParser.__init__(self, fmt)
        self.result = ''
        self.open_tags = []
        # The default arguments are shared between calls; permitted_tags is
        # only read here, and allowed_attributes is stored by reference and
        # never mutated by this class.
        self.permitted_tags = [i for i in permitted_tags if i[-1] != '/']
        self.requires_no_close = [i[:-1] for i in permitted_tags
                                  if i[-1] == '/']
        self.permitted_tags += self.requires_no_close
        self.allowed_attributes = allowed_attributes

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp', 'mailto']

        # to strip or escape disallowed tags?
        self.strip_disallowed = strip_disallowed
        self.in_disallowed = False

    def handle_data(self, data):
        """Append escaped text, unless inside a stripped disallowed tag."""
        if data and not self.in_disallowed:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """Keep short all-digit character references, escape anything else."""
        if self.in_disallowed:
            return
        elif len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Keep entity references found in ``entitydefs``, escape the rest."""
        if self.in_disallowed:
            return
        elif ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Emit a non-empty comment in escaped form, so it shows as text."""
        if self.in_disallowed:
            return
        elif comment:
            self.result += xssescape('<!--%s-->' % comment)

    def handle_starttag(self, tag, method, attrs):
        """Re-emit a permitted start tag with its allowed attributes only.

        A disallowed tag is either escaped or, when ``strip_disallowed`` is
        set, switches the parser into "inside disallowed" mode.

        @param tag: tag name
        @param method: not used by this implementation
        @param attrs: sequence of (name, value) pairs
        """
        if tag not in self.permitted_tags:
            if self.strip_disallowed:
                self.in_disallowed = True
            else:
                self.result += xssescape('<%s>' % tag)
            return

        bt = '<' + tag
        if tag in self.allowed_attributes:
            attrs = dict(attrs)
            self.allowed_attributes_here = [
                x for x in self.allowed_attributes[tag]
                if x in attrs and len(attrs[x]) > 0]
            for attribute in self.allowed_attributes_here:
                if attribute in ['href', 'src', 'background']:
                    if self.url_is_acceptable(attrs[attribute]):
                        # quoteattr (not a bare "%s") so that a quote
                        # character inside the URL cannot terminate the
                        # attribute and smuggle in extra attributes.
                        bt += ' %s=%s' % (attribute,
                                          quoteattr(attrs[attribute]))
                else:
                    bt += ' %s=%s' % (xssescape(attribute),
                                      quoteattr(attrs[attribute]))
        # An <a> or <img> left without any attribute is dropped entirely.
        if bt == '<a' or bt == '<img':
            return
        if tag in self.requires_no_close:
            bt += ' /'
        bt += '>'
        self.result += bt
        self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs):
        """Close a permitted tag that is currently open.

        A disallowed end tag is escaped, or ends "inside disallowed" mode
        when ``strip_disallowed`` is set. The second argument is not used.
        """
        bracketed = '</%s>' % tag
        if tag not in self.permitted_tags:
            if self.strip_disallowed:
                self.in_disallowed = False
            else:
                self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route start tags without a dedicated handler to handle_starttag."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags without a dedicated handler to handle_endtag."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """
        Accepts relative, absolute, and mailto urls

        A url is accepted when its scheme is in ``allowed_schemes`` and
        either its network location contains a '.' or its path contains
        an '@', or when it has no scheme and its path starts with '/'.
        """
        parsed = urlparse(url)
        scheme, netloc, path = parsed[0], parsed[1], parsed[2]
        if scheme in self.allowed_schemes:
            if '.' in netloc or '@' in path:
                return True
        return scheme == '' and path.startswith('/')

    def strip(self, rawstring, escape=True):
        """
        Returns the argument stripped of potentially harmful
        HTML or Javascript code

        Note that passing ``escape=False`` sets ``strip_disallowed`` on the
        instance, and it stays set for later calls.

        @type escape: boolean
        @param escape: If True (default) it escapes the potentially harmful
                       content, otherwise remove it
        """
        if not isinstance(rawstring, basestring):
            return str(rawstring)
        if not isinstance(rawstring, str):
            # A unicode string used to be returned through str() without
            # any cleaning at all; encode it so it is parsed like any
            # other input.
            rawstring = rawstring.encode('utf8')
        for tag in self.requires_no_close:
            rawstring = rawstring.replace("<%s/>" % tag, "<%s />" % tag)
        if not escape:
            self.strip_disallowed = True
        # Start every call from clean state so that tags left open by a
        # previous call on this instance do not leak into this result.
        self.result = ''
        self.open_tags = []
        self.in_disallowed = False
        self.feed(rawstring)
        # Close whatever the input left open.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += '</%s>' % endtag
        return self.result

    def xtags(self):
        """
        Returns a printable string informing the user which tags are allowed
        """
        tg = ''
        for x in sorted(self.permitted_tags):
            tg += '<' + x
            if x in self.allowed_attributes:
                for y in self.allowed_attributes[x]:
                    tg += ' %s=""' % y
            tg += '> '
        return xssescape(tg.strip())
198 199
def sanitize(text, permitted_tags=[
    'a',
    'b',
    'blockquote',
    'br/',
    'i',
    'li',
    'ol',
    'ul',
    'p',
    'cite',
    'code',
    'pre',
    'img/',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'table', 'tbody', 'thead', 'tfoot', 'tr', 'td', 'div',
    'strong', 'span',
    ],
    allowed_attributes={
        'a': ['href', 'title'],
        'img': ['src', 'alt'],
        'blockquote': ['type'],
        'td': ['colspan'],
    },
        escape=True):
    """Clean ``text`` with an XssCleaner built from the given whitelists.

    @param text: the markup to clean; anything that is not a string is
        returned as ``str(text)`` without being parsed
    @param permitted_tags: tag names handed to XssCleaner; a trailing '/'
        marks a tag that needs no closing tag
    @param allowed_attributes: per-tag attribute whitelist handed to
        XssCleaner
    @type escape: boolean
    @param escape: forwarded to XssCleaner.strip; if True (default) the
        disallowed content is escaped, otherwise it is removed
    """
    if isinstance(text, basestring):
        cleaner = XssCleaner(permitted_tags=permitted_tags,
                             allowed_attributes=allowed_attributes)
        return cleaner.strip(text, escape)
    return str(text)
229