flake8 for bitmessageqt.safehtmlparser (with docstrings from #1368)

2018-11-16 16:58:12 +02:00 · 2018-11-16 16:58:12 +02:00 · e8bd427b9f
commit e8bd427b9f
parent 97366ede73
1 changed files with 62 additions and 40 deletions
--- a/src/bitmessageqt/safehtmlparser.py
+++ b/src/bitmessageqt/safehtmlparser.py
@@ -1,51 +1,73 @@
-from HTMLParser import HTMLParser
+"""Subclass of HTMLParser.HTMLParser for MessageView widget"""
+
 import inspect
 import re
-from urllib import quote, quote_plus
+from HTMLParser import HTMLParser
+
+from urllib import quote_plus
 from urlparse import urlparse

+
 class SafeHTMLParser(HTMLParser):
+    """HTML parser with sanitisation"""
    # from html5lib.sanitiser
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
-                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
-                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
-                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
-                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
-                           'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
-                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
-                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
-                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
-                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
-                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
-                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
-                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-    replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
-    replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
-    src_schemes = [ "data" ]
-    #uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
-    uriregex1 = re.compile(r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
+    acceptable_elements = (
+        'a', 'abbr', 'acronym', 'address', 'area',
+        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+        'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
+        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
+        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'
+    )
+    replaces_pre = (
+        ("&", "&amp;"), ("\"", "&quot;"), ("<", "&lt;"), (">", "&gt;"))
+    replaces_post = (
+        ("\n", "<br/>"), ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
+        ("  ", "&nbsp; "), ("  ", "&nbsp; "), ("<br/> ", "<br/>&nbsp;"))
+    src_schemes = ["data"]
+    # uriregex1 = re.compile(
+    #     r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
+    #     r'|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
+    #     r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
+    #     r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
+    uriregex1 = re.compile(
+        r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
+        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
+    )
    uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
-    emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')
+    emailregex = re.compile(
+        r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')

    @staticmethod
    def replace_pre(text):
+        """Perform substring replacement before regex replacements"""
        for a in SafeHTMLParser.replaces_pre:
-            text = text.replace(a[0], a[1])
+            text = text.replace(*a)
        return text

    @staticmethod
    def replace_post(text):
+        """Perform substring replacement after regex replacements"""
        for a in SafeHTMLParser.replaces_post:
-            text = text.replace(a[0], a[1])
+            text = text.replace(*a)
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
        return text

    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self, *args, **kwargs)
+        self.reset()
        self.reset_safe()

    def reset_safe(self):
+        """Reset runtime variables specific to this class"""
        self.elements = set()
        self.raw = u""
        self.sanitised = u""
@@ -53,8 +75,9 @@ class SafeHTMLParser(HTMLParser):
        self.allow_picture = False
        self.allow_external_src = False

-    def add_if_acceptable(self, tag, attrs = None):
-        if tag not in SafeHTMLParser.acceptable_elements:
+    def add_if_acceptable(self, tag, attrs=None):
+        """Add tag if it passes sanitisation"""
+        if tag not in self.acceptable_elements:
            return
        self.sanitised += "<"
        if inspect.stack()[1][3] == "handle_endtag":
@@ -66,7 +89,7 @@ class SafeHTMLParser(HTMLParser):
                    val = ""
                elif attr == "src" and not self.allow_external_src:
                    url = urlparse(val)
-                    if url.scheme not in SafeHTMLParser.src_schemes:
+                    if url.scheme not in self.src_schemes:
                        val = ""
                self.sanitised += " " + quote_plus(attr)
                if not (val is None):
@@ -76,7 +99,7 @@ class SafeHTMLParser(HTMLParser):
        self.sanitised += ">"

    def handle_starttag(self, tag, attrs):
-        if tag in SafeHTMLParser.acceptable_elements:
+        if tag in self.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)

@@ -84,7 +107,7 @@ class SafeHTMLParser(HTMLParser):
        self.add_if_acceptable(tag)

    def handle_startendtag(self, tag, attrs):
-        if tag in SafeHTMLParser.acceptable_elements:
+        if tag in self.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)

@@ -104,15 +127,14 @@ class SafeHTMLParser(HTMLParser):
            data = unicode(data, 'utf-8', errors='replace')
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.replace_pre(data)
-        tmp = SafeHTMLParser.uriregex1.sub(
-            r'<a href="\1">\1</a>',
-            tmp)
-        tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
-        tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
+        tmp = self.uriregex1.sub(r'<a href="\1">\1</a>', tmp)
+        tmp = self.uriregex2.sub(r'<a href="\1&', tmp)
+        tmp = self.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
        tmp = SafeHTMLParser.replace_post(tmp)
        self.raw += tmp

-    def is_html(self, text = None, allow_picture = False):
+    def is_html(self, text=None, allow_picture=False):
+        """Detect if string contains HTML tags"""
        if text:
            self.reset()
            self.reset_safe()