From f27ca0d3d678813456a95615e5fe62518d75baa3 Mon Sep 17 00:00:00 2001 From: Peter Surda Date: Thu, 25 Feb 2016 17:13:39 +0800 Subject: [PATCH] HTML parser updates HTML parser wasn't correctly handling img tags. Now it also disables external schemes by default to prevent deanonymisation (even though the renderer doesn't actually support external schemes at the moment). Addresses #178 --- src/bitmessageqt/messageview.py | 4 ++-- src/bitmessageqt/safehtmlparser.py | 30 +++++++++++++++++++----------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/bitmessageqt/messageview.py b/src/bitmessageqt/messageview.py index faa21cd2..a579dd13 100644 --- a/src/bitmessageqt/messageview.py +++ b/src/bitmessageqt/messageview.py @@ -89,9 +89,9 @@ class MessageView(QtGui.QTextBrowser): def setContent(self, data): self.html = SafeHTMLParser() - self.html.allow_picture = True self.html.reset() self.html.reset_safe() + self.html.allow_picture = True self.html.feed(data) self.html.close() - self.showPlain() \ No newline at end of file + self.showPlain() diff --git a/src/bitmessageqt/safehtmlparser.py b/src/bitmessageqt/safehtmlparser.py index c357662d..79ad0f73 100644 --- a/src/bitmessageqt/safehtmlparser.py +++ b/src/bitmessageqt/safehtmlparser.py @@ -1,6 +1,7 @@ from HTMLParser import HTMLParser import inspect from urllib import quote, quote_plus +from urlparse import urlparse class SafeHTMLParser(HTMLParser): # from html5lib.sanitiser @@ -18,6 +19,7 @@ class SafeHTMLParser(HTMLParser): 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] replaces = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"], ["\n", "
"], ["\t", "    "], [" ", "  "], [" ", "  "], ["
", "
 "]] + src_schemes = [ "data" ] @staticmethod def multi_replace(text): @@ -36,27 +38,33 @@ class SafeHTMLParser(HTMLParser): self.raw = u"" self.sanitised = u"" self.has_html = False + self.allow_picture = False + self.allow_external_src = False def add_if_acceptable(self, tag, attrs = None): - if not tag in self.acceptable_elements: + if not tag in SafeHTMLParser.acceptable_elements: return self.sanitised += "<" if inspect.stack()[1][3] == "handle_endtag": self.sanitised += "/" self.sanitised += tag if not attrs is None: - for attr in attrs: - if tag == "img" and attr[0] == "src" and not self.allow_picture: - attr[1] = "" - self.sanitised += " " + quote_plus(attr[0]) - if not (attr[1] is None): - self.sanitised += "=\"" + attr[1] + "\"" + for attr, val in attrs: + if tag == "img" and attr == "src" and not self.allow_picture: + val = "" + elif attr == "src" and not self.allow_external_src: + url = urlparse(val) + if url.scheme not in SafeHTMLParser.src_schemes: + val == "" + self.sanitised += " " + quote_plus(attr) + if not (val is None): + self.sanitised += "=\"" + val + "\"" if inspect.stack()[1][3] == "handle_startendtag": self.sanitised += "/" self.sanitised += ">" def handle_starttag(self, tag, attrs): - if tag in self.acceptable_elements: + if tag in SafeHTMLParser.acceptable_elements: self.has_html = True self.add_if_acceptable(tag, attrs) @@ -64,7 +72,7 @@ class SafeHTMLParser(HTMLParser): self.add_if_acceptable(tag) def handle_startendtag(self, tag, attrs): - if tag in self.acceptable_elements: + if tag in SafeHTMLParser.acceptable_elements: self.has_html = True self.add_if_acceptable(tag, attrs) @@ -86,7 +94,7 @@ class SafeHTMLParser(HTMLParser): if text: self.reset() self.reset_safe() + self.allow_picture = allow_picture self.feed(text) self.close() - self.allow_picture = allow_picture - return self.has_html \ No newline at end of file + return self.has_html