From e8bd427b9f1507f33e4797e3c1b07a81817605de Mon Sep 17 00:00:00 2001 From: Dmitri Bogomolov <4glitch@gmail.com> Date: Fri, 16 Nov 2018 16:58:12 +0200 Subject: [PATCH] flake8 for bitmessageqt.safehtmlparser (with docstrings from #1368) --- src/bitmessageqt/safehtmlparser.py | 102 ++++++++++++++++++----------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/src/bitmessageqt/safehtmlparser.py b/src/bitmessageqt/safehtmlparser.py index d1d7910c..edacd4bb 100644 --- a/src/bitmessageqt/safehtmlparser.py +++ b/src/bitmessageqt/safehtmlparser.py @@ -1,51 +1,73 @@ -from HTMLParser import HTMLParser +"""Subclass of HTMLParser.HTMLParser for MessageView widget""" + import inspect import re -from urllib import quote, quote_plus +from HTMLParser import HTMLParser + +from urllib import quote_plus from urlparse import urlparse + class SafeHTMLParser(HTMLParser): + """HTML parser with sanitisation""" # from html5lib.sanitiser - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', - 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', - 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', - 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', - 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', - 'figcaption', 'figure', 'footer', 'font', 'header', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', - 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', - 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', - 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', - 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', - 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', - 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] - replaces_pre = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"]] - replaces_post = [["\n", "<br/>"], ["\t", " "], [" ", " "], [" ", " "], ["<br/> ", "<br/> "]] - src_schemes = [ "data" ] - #uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))') - uriregex1 = re.compile(r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)') + acceptable_elements = ( + 'a', 'abbr', 'acronym', 'address', 'area', + 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', + 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', + 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', + 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', + 'figcaption', 'figure', 'footer', 'font', 'header', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', + 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', + 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', + 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', + 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', + 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', + 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video' + ) + replaces_pre = ( + ("&", "&"), ("\"", """), ("<", "<"), (">", ">")) + replaces_post = ( + ("\n", "<br/>"), ("\t", " "), + (" ", " "), (" ", " "), ("<br/> ", "<br/> ")) + src_schemes = ["data"] + # uriregex1 = re.compile( + # r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])' + # r'|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)' + # r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))' + # r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))') + uriregex1 = re.compile( + r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])' + r'(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)' + ) uriregex2 = re.compile(r'<a href="([^"]+)&') - emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b') + emailregex = re.compile( + r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b') @staticmethod def replace_pre(text): + """Perform substring replacement before regex replacements""" for a in SafeHTMLParser.replaces_pre: - text = text.replace(a[0], a[1]) + text = text.replace(*a) return text @staticmethod def replace_post(text): + """Perform substring replacement after regex replacements""" for a in SafeHTMLParser.replaces_post: - text = text.replace(a[0], a[1]) + text = text.replace(*a) if len(text) > 1 and text[0] == " ": text = " " + text[1:] return text def __init__(self, *args, **kwargs): HTMLParser.__init__(self, *args, **kwargs) + self.reset() self.reset_safe() - + def reset_safe(self): + """Reset runtime variables specific to this class""" self.elements = set() self.raw = u"" self.sanitised = u"" @@ -53,8 +75,9 @@ class SafeHTMLParser(HTMLParser): self.allow_picture = False self.allow_external_src = False - def add_if_acceptable(self, tag, attrs = None): - if tag not in SafeHTMLParser.acceptable_elements: + def add_if_acceptable(self, tag, attrs=None): + """Add tag if it passes sanitisation""" + if tag not in self.acceptable_elements: return self.sanitised += "<" if inspect.stack()[1][3] == "handle_endtag": @@ -66,7 +89,7 @@ class SafeHTMLParser(HTMLParser): val = "" elif attr == "src" and not self.allow_external_src: url = urlparse(val) - if url.scheme not in SafeHTMLParser.src_schemes: + if url.scheme not in self.src_schemes: val = "" self.sanitised += " " + quote_plus(attr) if not (val is None): @@ -74,26 +97,26 @@ class SafeHTMLParser(HTMLParser): if inspect.stack()[1][3] == "handle_startendtag": self.sanitised += "/" self.sanitised += ">" - + def handle_starttag(self, tag, attrs): - if tag in SafeHTMLParser.acceptable_elements: + if tag in self.acceptable_elements: self.has_html = True self.add_if_acceptable(tag, attrs) def handle_endtag(self, tag): self.add_if_acceptable(tag) - + def handle_startendtag(self, tag, attrs): - if tag in SafeHTMLParser.acceptable_elements: + if tag in self.acceptable_elements: self.has_html = True self.add_if_acceptable(tag, attrs) - + def handle_data(self, data): self.sanitised += data - + def handle_charref(self, name): self.sanitised += "&#" + name + ";" - + def handle_entityref(self, name): self.sanitised += "&" + name + ";" @@ -104,15 +127,14 @@ class SafeHTMLParser(HTMLParser): data = unicode(data, 'utf-8', errors='replace') HTMLParser.feed(self, data) tmp = SafeHTMLParser.replace_pre(data) - tmp = SafeHTMLParser.uriregex1.sub( - r'<a href="\1">\1</a>', - tmp) - tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp) - tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp) + tmp = self.uriregex1.sub(r'<a href="\1">\1</a>', tmp) + tmp = self.uriregex2.sub(r'<a href="\1&', tmp) + tmp = self.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp) tmp = SafeHTMLParser.replace_post(tmp) self.raw += tmp - def is_html(self, text = None, allow_picture = False): + def is_html(self, text=None, allow_picture=False): + """Detect if string contains HTML tags""" if text: self.reset() self.reset_safe()