Peter Surda
f27ca0d3d6
HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178
101 lines
4.1 KiB
Python
101 lines
4.1 KiB
Python
from HTMLParser import HTMLParser
|
|
import inspect
|
|
from urllib import quote, quote_plus
|
|
from urlparse import urlparse
|
|
|
|
class SafeHTMLParser(HTMLParser):
|
|
# from html5lib.sanitiser
|
|
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
|
|
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
|
|
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
|
|
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
|
|
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
|
|
'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
|
|
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
|
|
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
|
|
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
|
|
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
|
|
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
|
|
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
|
|
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
|
|
replaces = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"], ["\n", "<br/>"], ["\t", " "], [" ", " "], [" ", " "], ["<br/> ", "<br/> "]]
|
|
src_schemes = [ "data" ]
|
|
|
|
@staticmethod
|
|
def multi_replace(text):
|
|
for a in SafeHTMLParser.replaces:
|
|
text = text.replace(a[0], a[1])
|
|
if len(text) > 1 and text[0] == " ":
|
|
text = " " + text[1:]
|
|
return text
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
HTMLParser.__init__(self, *args, **kwargs)
|
|
self.reset_safe()
|
|
|
|
def reset_safe(self):
|
|
self.elements = set()
|
|
self.raw = u""
|
|
self.sanitised = u""
|
|
self.has_html = False
|
|
self.allow_picture = False
|
|
self.allow_external_src = False
|
|
|
|
def add_if_acceptable(self, tag, attrs = None):
|
|
if not tag in SafeHTMLParser.acceptable_elements:
|
|
return
|
|
self.sanitised += "<"
|
|
if inspect.stack()[1][3] == "handle_endtag":
|
|
self.sanitised += "/"
|
|
self.sanitised += tag
|
|
if not attrs is None:
|
|
for attr, val in attrs:
|
|
if tag == "img" and attr == "src" and not self.allow_picture:
|
|
val = ""
|
|
elif attr == "src" and not self.allow_external_src:
|
|
url = urlparse(val)
|
|
if url.scheme not in SafeHTMLParser.src_schemes:
|
|
val == ""
|
|
self.sanitised += " " + quote_plus(attr)
|
|
if not (val is None):
|
|
self.sanitised += "=\"" + val + "\""
|
|
if inspect.stack()[1][3] == "handle_startendtag":
|
|
self.sanitised += "/"
|
|
self.sanitised += ">"
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
if tag in SafeHTMLParser.acceptable_elements:
|
|
self.has_html = True
|
|
self.add_if_acceptable(tag, attrs)
|
|
|
|
def handle_endtag(self, tag):
|
|
self.add_if_acceptable(tag)
|
|
|
|
def handle_startendtag(self, tag, attrs):
|
|
if tag in SafeHTMLParser.acceptable_elements:
|
|
self.has_html = True
|
|
self.add_if_acceptable(tag, attrs)
|
|
|
|
def handle_data(self, data):
|
|
self.sanitised += unicode(data, 'utf-8', 'replace')
|
|
|
|
def handle_charref(self, name):
|
|
self.sanitised += "&#" + name + ";"
|
|
|
|
def handle_entityref(self, name):
|
|
self.sanitised += "&" + name + ";"
|
|
|
|
def feed(self, data):
|
|
HTMLParser.feed(self, data)
|
|
tmp = SafeHTMLParser.multi_replace(data)
|
|
self.raw += unicode(tmp, 'utf-8', 'replace')
|
|
|
|
def is_html(self, text = None, allow_picture = False):
|
|
if text:
|
|
self.reset()
|
|
self.reset_safe()
|
|
self.allow_picture = allow_picture
|
|
self.feed(text)
|
|
self.close()
|
|
return self.has_html
|