From e8bd427b9f1507f33e4797e3c1b07a81817605de Mon Sep 17 00:00:00 2001
From: Dmitri Bogomolov <4glitch@gmail.com>
Date: Fri, 16 Nov 2018 16:58:12 +0200
Subject: [PATCH] flake8 for bitmessageqt.safehtmlparser (with docstrings from
#1368)
---
src/bitmessageqt/safehtmlparser.py | 102 ++++++++++++++++++-----------
1 file changed, 62 insertions(+), 40 deletions(-)
diff --git a/src/bitmessageqt/safehtmlparser.py b/src/bitmessageqt/safehtmlparser.py
index d1d7910c..edacd4bb 100644
--- a/src/bitmessageqt/safehtmlparser.py
+++ b/src/bitmessageqt/safehtmlparser.py
@@ -1,51 +1,73 @@
-from HTMLParser import HTMLParser
+"""Subclass of HTMLParser.HTMLParser for MessageView widget"""
+
import inspect
import re
-from urllib import quote, quote_plus
+from HTMLParser import HTMLParser
+
+from urllib import quote_plus
from urlparse import urlparse
+
class SafeHTMLParser(HTMLParser):
+ """HTML parser with sanitisation"""
# from html5lib.sanitiser
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
- 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
- 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
- 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
- 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
- 'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
- 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
- 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
- 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
- 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
- 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
- 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
- replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
- replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
- src_schemes = [ "data" ]
- #uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
- uriregex1 = re.compile(r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
+ acceptable_elements = (
+ 'a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
+ 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+ 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+ 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+ 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'
+ )
+ replaces_pre = (
+ ("&", "&amp;"), ("\"", "&quot;"), ("<", "&lt;"), (">", "&gt;"))
+ replaces_post = (
+ ("\n", "<br/>"), ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
+ ("  ", "&nbsp; "), ("  ", "&nbsp; "), ("<br/> ", "<br/>&nbsp;"))
+ src_schemes = ["data"]
+ # uriregex1 = re.compile(
+ # r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
+ # r'|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
+ # r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
+ # r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
+ uriregex1 = re.compile(
+ r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
+ r'(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
+ )
uriregex2 = re.compile(r' 1 and text[0] == " ":
text = " " + text[1:]
return text
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
+ self.reset()
self.reset_safe()
-
+
def reset_safe(self):
+ """Reset runtime variables specific to this class"""
self.elements = set()
self.raw = u""
self.sanitised = u""
@@ -53,8 +75,9 @@ class SafeHTMLParser(HTMLParser):
self.allow_picture = False
self.allow_external_src = False
- def add_if_acceptable(self, tag, attrs = None):
- if tag not in SafeHTMLParser.acceptable_elements:
+ def add_if_acceptable(self, tag, attrs=None):
+ """Add tag if it passes sanitisation"""
+ if tag not in self.acceptable_elements:
return
self.sanitised += "<"
if inspect.stack()[1][3] == "handle_endtag":
@@ -66,7 +89,7 @@ class SafeHTMLParser(HTMLParser):
val = ""
elif attr == "src" and not self.allow_external_src:
url = urlparse(val)
- if url.scheme not in SafeHTMLParser.src_schemes:
+ if url.scheme not in self.src_schemes:
val = ""
self.sanitised += " " + quote_plus(attr)
if not (val is None):
@@ -74,26 +97,26 @@ class SafeHTMLParser(HTMLParser):
if inspect.stack()[1][3] == "handle_startendtag":
self.sanitised += "/"
self.sanitised += ">"
-
+
def handle_starttag(self, tag, attrs):
- if tag in SafeHTMLParser.acceptable_elements:
+ if tag in self.acceptable_elements:
self.has_html = True
self.add_if_acceptable(tag, attrs)
def handle_endtag(self, tag):
self.add_if_acceptable(tag)
-
+
def handle_startendtag(self, tag, attrs):
- if tag in SafeHTMLParser.acceptable_elements:
+ if tag in self.acceptable_elements:
self.has_html = True
self.add_if_acceptable(tag, attrs)
-
+
def handle_data(self, data):
self.sanitised += data
-
+
def handle_charref(self, name):
self.sanitised += "&#" + name + ";"
-
+
def handle_entityref(self, name):
self.sanitised += "&" + name + ";"
@@ -104,15 +127,14 @@ class SafeHTMLParser(HTMLParser):
data = unicode(data, 'utf-8', errors='replace')
HTMLParser.feed(self, data)
tmp = SafeHTMLParser.replace_pre(data)
- tmp = SafeHTMLParser.uriregex1.sub(
- r'\1',
- tmp)
- tmp = SafeHTMLParser.uriregex2.sub(r'\1', tmp)
+ tmp = self.uriregex1.sub(r'\1', tmp)
+ tmp = self.uriregex2.sub(r'\1', tmp)
tmp = SafeHTMLParser.replace_post(tmp)
self.raw += tmp
- def is_html(self, text = None, allow_picture = False):
+ def is_html(self, text=None, allow_picture=False):
+ """Detect if string contains HTML tags"""
if text:
self.reset()
self.reset_safe()