flake8 for bitmessageqt.safehtmlparser (with docstrings from #1368)

This commit is contained in:
Dmitri Bogomolov 2018-11-16 16:58:12 +02:00
parent 97366ede73
commit e8bd427b9f
Signed by untrusted user: g1itch
GPG Key ID: 720A756F18DEED13
1 changed file with 62 additions and 40 deletions

View File

@@ -1,51 +1,73 @@
from HTMLParser import HTMLParser """Subclass of HTMLParser.HTMLParser for MessageView widget"""
import inspect import inspect
import re import re
from urllib import quote, quote_plus from HTMLParser import HTMLParser
from urllib import quote_plus
from urlparse import urlparse from urlparse import urlparse
class SafeHTMLParser(HTMLParser): class SafeHTMLParser(HTMLParser):
"""HTML parser with sanitisation"""
# from html5lib.sanitiser # from html5lib.sanitiser
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', acceptable_elements = (
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'a', 'abbr', 'acronym', 'address', 'area',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'figcaption', 'figure', 'footer', 'font', 'header', 'h1', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]] 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'
replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], [" ", "&nbsp; "], [" ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]] )
src_schemes = [ "data" ] replaces_pre = (
#uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))') ("&", "&amp;"), ("\"", "&quot;"), ("<", "&lt;"), (">", "&gt;"))
uriregex1 = re.compile(r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)') replaces_post = (
("\n", "<br/>"), ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
(" ", "&nbsp; "), (" ", "&nbsp; "), ("<br/> ", "<br/>&nbsp;"))
src_schemes = ["data"]
# uriregex1 = re.compile(
# r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
# r'|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
# r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
# r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
uriregex1 = re.compile(
r'((https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])'
r'(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
)
uriregex2 = re.compile(r'<a href="([^"]+)&amp;') uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b') emailregex = re.compile(
r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')
@staticmethod @staticmethod
def replace_pre(text): def replace_pre(text):
"""Perform substring replacement before regex replacements"""
for a in SafeHTMLParser.replaces_pre: for a in SafeHTMLParser.replaces_pre:
text = text.replace(a[0], a[1]) text = text.replace(*a)
return text return text
@staticmethod @staticmethod
def replace_post(text): def replace_post(text):
"""Perform substring replacement after regex replacements"""
for a in SafeHTMLParser.replaces_post: for a in SafeHTMLParser.replaces_post:
text = text.replace(a[0], a[1]) text = text.replace(*a)
if len(text) > 1 and text[0] == " ": if len(text) > 1 and text[0] == " ":
text = "&nbsp;" + text[1:] text = "&nbsp;" + text[1:]
return text return text
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs) HTMLParser.__init__(self, *args, **kwargs)
self.reset()
self.reset_safe() self.reset_safe()
def reset_safe(self): def reset_safe(self):
"""Reset runtime variables specific to this class"""
self.elements = set() self.elements = set()
self.raw = u"" self.raw = u""
self.sanitised = u"" self.sanitised = u""
@@ -53,8 +75,9 @@ class SafeHTMLParser(HTMLParser):
self.allow_picture = False self.allow_picture = False
self.allow_external_src = False self.allow_external_src = False
def add_if_acceptable(self, tag, attrs = None): def add_if_acceptable(self, tag, attrs=None):
if tag not in SafeHTMLParser.acceptable_elements: """Add tag if it passes sanitisation"""
if tag not in self.acceptable_elements:
return return
self.sanitised += "<" self.sanitised += "<"
if inspect.stack()[1][3] == "handle_endtag": if inspect.stack()[1][3] == "handle_endtag":
@@ -66,7 +89,7 @@ class SafeHTMLParser(HTMLParser):
val = "" val = ""
elif attr == "src" and not self.allow_external_src: elif attr == "src" and not self.allow_external_src:
url = urlparse(val) url = urlparse(val)
if url.scheme not in SafeHTMLParser.src_schemes: if url.scheme not in self.src_schemes:
val = "" val = ""
self.sanitised += " " + quote_plus(attr) self.sanitised += " " + quote_plus(attr)
if not (val is None): if not (val is None):
@@ -74,26 +97,26 @@ class SafeHTMLParser(HTMLParser):
if inspect.stack()[1][3] == "handle_startendtag": if inspect.stack()[1][3] == "handle_startendtag":
self.sanitised += "/" self.sanitised += "/"
self.sanitised += ">" self.sanitised += ">"
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if tag in SafeHTMLParser.acceptable_elements: if tag in self.acceptable_elements:
self.has_html = True self.has_html = True
self.add_if_acceptable(tag, attrs) self.add_if_acceptable(tag, attrs)
def handle_endtag(self, tag): def handle_endtag(self, tag):
self.add_if_acceptable(tag) self.add_if_acceptable(tag)
def handle_startendtag(self, tag, attrs): def handle_startendtag(self, tag, attrs):
if tag in SafeHTMLParser.acceptable_elements: if tag in self.acceptable_elements:
self.has_html = True self.has_html = True
self.add_if_acceptable(tag, attrs) self.add_if_acceptable(tag, attrs)
def handle_data(self, data): def handle_data(self, data):
self.sanitised += data self.sanitised += data
def handle_charref(self, name): def handle_charref(self, name):
self.sanitised += "&#" + name + ";" self.sanitised += "&#" + name + ";"
def handle_entityref(self, name): def handle_entityref(self, name):
self.sanitised += "&" + name + ";" self.sanitised += "&" + name + ";"
@@ -104,15 +127,14 @@ class SafeHTMLParser(HTMLParser):
data = unicode(data, 'utf-8', errors='replace') data = unicode(data, 'utf-8', errors='replace')
HTMLParser.feed(self, data) HTMLParser.feed(self, data)
tmp = SafeHTMLParser.replace_pre(data) tmp = SafeHTMLParser.replace_pre(data)
tmp = SafeHTMLParser.uriregex1.sub( tmp = self.uriregex1.sub(r'<a href="\1">\1</a>', tmp)
r'<a href="\1">\1</a>', tmp = self.uriregex2.sub(r'<a href="\1&', tmp)
tmp) tmp = self.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
tmp = SafeHTMLParser.replace_post(tmp) tmp = SafeHTMLParser.replace_post(tmp)
self.raw += tmp self.raw += tmp
def is_html(self, text = None, allow_picture = False): def is_html(self, text=None, allow_picture=False):
"""Detect if string contains HTML tags"""
if text: if text:
self.reset() self.reset()
self.reset_safe() self.reset_safe()