HTML parser updates

HTML parser wasn't correctly handling img tags.
Now it also disables external schemes by default to prevent
deanonymisation (even though the renderer doesn't actually support
external schemes at the moment).

Addresses #178
This commit is contained in:
Peter Šurda 2016-02-25 17:13:39 +08:00
parent 2f9501fa1a
commit 2f7a386aaf
Signed by untrusted user: PeterSurda
GPG Key ID: 0C5F50C0B5F37D87
2 changed files with 21 additions and 13 deletions

View File

@ -89,9 +89,9 @@ class MessageView(QtGui.QTextBrowser):
def setContent(self, data): def setContent(self, data):
self.html = SafeHTMLParser() self.html = SafeHTMLParser()
self.html.allow_picture = True
self.html.reset() self.html.reset()
self.html.reset_safe() self.html.reset_safe()
self.html.allow_picture = True
self.html.feed(data) self.html.feed(data)
self.html.close() self.html.close()
self.showPlain() self.showPlain()

View File

@ -1,6 +1,7 @@
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
import inspect import inspect
from urllib import quote, quote_plus from urllib import quote, quote_plus
from urlparse import urlparse
class SafeHTMLParser(HTMLParser): class SafeHTMLParser(HTMLParser):
# from html5lib.sanitiser # from html5lib.sanitiser
@ -18,6 +19,7 @@ class SafeHTMLParser(HTMLParser):
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], [" ", "&nbsp; "], [" ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]] replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], [" ", "&nbsp; "], [" ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
src_schemes = [ "data" ]
@staticmethod @staticmethod
def multi_replace(text): def multi_replace(text):
@ -36,27 +38,33 @@ class SafeHTMLParser(HTMLParser):
self.raw = u"" self.raw = u""
self.sanitised = u"" self.sanitised = u""
self.has_html = False self.has_html = False
self.allow_picture = False
self.allow_external_src = False
def add_if_acceptable(self, tag, attrs = None): def add_if_acceptable(self, tag, attrs = None):
if not tag in self.acceptable_elements: if not tag in SafeHTMLParser.acceptable_elements:
return return
self.sanitised += "<" self.sanitised += "<"
if inspect.stack()[1][3] == "handle_endtag": if inspect.stack()[1][3] == "handle_endtag":
self.sanitised += "/" self.sanitised += "/"
self.sanitised += tag self.sanitised += tag
if not attrs is None: if not attrs is None:
for attr in attrs: for attr, val in attrs:
if tag == "img" and attr[0] == "src" and not self.allow_picture: if tag == "img" and attr == "src" and not self.allow_picture:
attr[1] = "" val = ""
self.sanitised += " " + quote_plus(attr[0]) elif attr == "src" and not self.allow_external_src:
if not (attr[1] is None): url = urlparse(val)
self.sanitised += "=\"" + attr[1] + "\"" if url.scheme not in SafeHTMLParser.src_schemes:
val == ""
self.sanitised += " " + quote_plus(attr)
if not (val is None):
self.sanitised += "=\"" + val + "\""
if inspect.stack()[1][3] == "handle_startendtag": if inspect.stack()[1][3] == "handle_startendtag":
self.sanitised += "/" self.sanitised += "/"
self.sanitised += ">" self.sanitised += ">"
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if tag in self.acceptable_elements: if tag in SafeHTMLParser.acceptable_elements:
self.has_html = True self.has_html = True
self.add_if_acceptable(tag, attrs) self.add_if_acceptable(tag, attrs)
@ -64,7 +72,7 @@ class SafeHTMLParser(HTMLParser):
self.add_if_acceptable(tag) self.add_if_acceptable(tag)
def handle_startendtag(self, tag, attrs): def handle_startendtag(self, tag, attrs):
if tag in self.acceptable_elements: if tag in SafeHTMLParser.acceptable_elements:
self.has_html = True self.has_html = True
self.add_if_acceptable(tag, attrs) self.add_if_acceptable(tag, attrs)
@ -86,7 +94,7 @@ class SafeHTMLParser(HTMLParser):
if text: if text:
self.reset() self.reset()
self.reset_safe() self.reset_safe()
self.allow_picture = allow_picture
self.feed(text) self.feed(text)
self.close() self.close()
self.allow_picture = allow_picture
return self.has_html return self.has_html