PyBitmessage/src/bitmessageqt/safehtmlparser.py

from HTMLParser import HTMLParser
import inspect
import re
from urllib import quote, quote_plus
from urlparse import urlparse

class SafeHTMLParser(HTMLParser):
    """Whitelist-based HTML filter and plain-text renderer for message bodies.

    Feeding text into the parser fills two buffers:

    * ``sanitised`` -- the input re-serialised with only the tags listed in
      ``acceptable_elements``; other tags are dropped while their text is
      kept.  Text and attribute values are HTML-escaped on output.
    * ``raw`` -- the input escaped for display as plain text, with URIs and
      e-mail addresses turned into links.

    ``has_html`` records whether at least one acceptable tag was seen.
    """

    # from html5lib.sanitiser
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
    # Applied in order; "&" must come first so later entities are not
    # escaped a second time.
    replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
    # The double-space rule is listed twice on purpose: one pass leaves a
    # double space behind for odd-length runs of spaces.
    replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
    # URL schemes a ``src`` attribute may keep while external sources are off.
    src_schemes = [ "data" ]
    uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
    emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')

    @staticmethod
    def replace_pre(text):
        """Return ``text`` with ``&``, ``"``, ``<`` and ``>`` HTML-escaped."""
        for old, new in SafeHTMLParser.replaces_pre:
            text = text.replace(old, new)
        return text

    @staticmethod
    def replace_post(text):
        """Return ``text`` with newlines, tabs and space runs made visible.

        Applies ``replaces_post`` in order, then turns a leading space into
        ``&nbsp;`` when the text is longer than one character.
        """
        for old, new in SafeHTMLParser.replaces_post:
            text = text.replace(old, new)
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
        return text

    def __init__(self, *args, **kwargs):
        """Initialise the base parser and the sanitiser state."""
        HTMLParser.__init__(self, *args, **kwargs)
        self.reset_safe()

    def reset_safe(self):
        """Clear the output buffers and restore the restrictive defaults."""
        self.elements = set()
        self.raw = u""
        self.sanitised = u""
        self.has_html = False
        self.allow_picture = False
        self.allow_external_src = False

    def add_if_acceptable(self, tag, attrs=None, closing=False,
                          self_closing=False):
        """Append ``tag`` to ``sanitised`` if it is whitelisted.

        :param tag: element name as reported by the parser.
        :param attrs: iterable of ``(name, value)`` pairs or ``None``;
            ``value`` is ``None`` for attributes written without a value.
        :param closing: emit an end tag (``</tag>``).
        :param self_closing: emit a self-closing tag (``<tag ... />``).

        The tag form is passed in explicitly rather than being inferred
        from the call stack; with both flags left at their defaults a plain
        start tag is written.
        """
        if tag not in SafeHTMLParser.acceptable_elements:
            return
        parts = ["</" if closing else "<", tag]
        for attr, val in attrs or ():
            if attr == "src":
                if tag == "img" and not self.allow_picture:
                    val = ""
                elif (val is not None and not self.allow_external_src and
                      urlparse(val).scheme not in SafeHTMLParser.src_schemes):
                    # A valueless ``src`` has nothing to fetch, so it is
                    # skipped instead of being passed to urlparse().
                    val = ""
            # On Python 2 urllib.quote raises KeyError for non-ASCII unicode
            # input, so quote the UTF-8 bytes instead.
            if isinstance(attr, type(u"")):
                attr = attr.encode("utf-8")
            parts.append(" " + quote_plus(attr))
            if val is not None:
                # The parser delivers attribute values already unescaped;
                # re-escape them so a quote or angle bracket in the value
                # cannot close the attribute and inject markup.
                parts.append("=\"" + SafeHTMLParser.replace_pre(val) + "\"")
        parts.append("/>" if self_closing else ">")
        self.sanitised += "".join(parts)

    def handle_starttag(self, tag, attrs):
        """Record and emit an opening tag when it is whitelisted."""
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)

    def handle_endtag(self, tag):
        """Emit a closing tag when it is whitelisted."""
        self.add_if_acceptable(tag, closing=True)

    def handle_startendtag(self, tag, attrs):
        """Record and emit a self-closing tag when it is whitelisted."""
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs, self_closing=True)

    def handle_data(self, data):
        """Append text content to ``sanitised``, HTML-escaped.

        Escaping keeps text that contains ``<`` or ``>`` (for instance the
        content of a dropped tag) from being emitted as live markup.
        """
        self.sanitised += SafeHTMLParser.replace_pre(data)

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged."""
        self.sanitised += "&#" + name + ";"

    def handle_entityref(self, name):
        """Re-emit a named entity reference unchanged."""
        self.sanitised += "&" + name + ";"

    def feed(self, data):
        """Parse ``data`` into ``sanitised`` and append its text form to ``raw``.

        :param data: UTF-8 encoded bytes, or already decoded text.  Invalid
            byte sequences are replaced with U+FFFD instead of raising.
        """
        text_type = type(u"")
        if not isinstance(data, text_type):
            # Decoding with 'replace' gives the same result as a strict
            # decode whenever the input is valid UTF-8.
            data = text_type(data, 'utf-8', 'replace')
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.replace_pre(data)
        tmp = SafeHTMLParser.uriregex1.sub(
            r'<a href="\1">\1</a>',
            tmp)
        # Undo the "&amp;" escaping inside the href that was just generated.
        tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
        tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
        tmp = SafeHTMLParser.replace_post(tmp)
        self.raw += tmp

    def is_html(self, text = None, allow_picture = False):
        """Return whether an acceptable HTML tag has been seen.

        :param text: when non-empty, the parser is reset and ``text`` is
            parsed first; otherwise the result of the last parse is
            returned.
        :param allow_picture: keep ``src`` values on ``img`` tags, still
            subject to the external-source scheme check.
        """
        if text:
            self.reset()
            self.reset_safe()
            self.allow_picture = allow_picture
            self.feed(text)
            self.close()
        return self.has_html
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from HTMLParser import HTMLParser`
			`import inspect`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`import re`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from urllib import quote, quote_plus`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`from urlparse import urlparse`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`class SafeHTMLParser(HTMLParser):`
			`# from html5lib.sanitiser`
			`acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',`
			`'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',`
			`'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',`
			`'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',`
			`'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',`
			`'figcaption', 'figure', 'footer', 'font', 'header', 'h1',`
			`'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',`
			`'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',`
			`'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',`
			`'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',`
			`'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',`
			`'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',`
			`'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']`
HTML parser fix - URLs followed with space were broken 2017-02-28 22:47:56 +01:00			`replaces_pre = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"]]`
			`replaces_post = [["\n", "<br/>"], ["\t", "    "], [" ", "  "], [" ", "  "], ["<br/> ", "<br/> "]]`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`src_schemes = [ "data" ]`
Add bitcoin URI handler for message viewer 2016-03-18 14:09:28 +01:00			uriregex1 = re.compile(r'(?i)\b((?:(https?\|ftp\|bitcoin):(?:/{1,3}\|[a-z0-9%])\|www\d{0,3}[.]\|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+\|\(([^\s()<>]+\|(\([^\s()<>]+\)))\))+(?:\(([^\s()<>]+\|(\([^\s()<>]+\)))\)\|[^\s`!()\[\]{};:\'".,<>?]))')
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`uriregex2 = re.compile(r'<a href="([^"]+)&')`
			`emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`@staticmethod`
HTML parser fix - URLs followed with space were broken 2017-02-28 22:47:56 +01:00			`def replace_pre(text):`
			`for a in SafeHTMLParser.replaces_pre:`
			`text = text.replace(a[0], a[1])`
			`return text`

			`@staticmethod`
			`def replace_post(text):`
			`for a in SafeHTMLParser.replaces_post:`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`text = text.replace(a[0], a[1])`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`if len(text) > 1 and text[0] == " ":`
			`text = " " + text[1:]`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`return text`

HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def __init__(self, args, *kwargs):`
			`HTMLParser.__init__(self, args, *kwargs)`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`self.reset_safe()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def reset_safe(self):`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.elements = set()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.raw = u""`
			`self.sanitised = u""`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.has_html = False`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`self.allow_picture = False`
			`self.allow_external_src = False`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def add_if_acceptable(self, tag, attrs = None):`
Code quality improvements 2017-06-24 12:13:35 +02:00			`if tag not in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`return`
			`self.sanitised += "<"`
			`if inspect.stack()[1][3] == "handle_endtag":`
			`self.sanitised += "/"`
			`self.sanitised += tag`
Code quality improvements 2017-06-24 12:13:35 +02:00			`if attrs is not None:`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`for attr, val in attrs:`
			`if tag == "img" and attr == "src" and not self.allow_picture:`
			`val = ""`
			`elif attr == "src" and not self.allow_external_src:`
			`url = urlparse(val)`
			`if url.scheme not in SafeHTMLParser.src_schemes:`
Code quality improvements 2017-06-24 12:13:35 +02:00			`val = ""`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`self.sanitised += " " + quote_plus(attr)`
			`if not (val is None):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`self.sanitised += "=\"" + val + "\""`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`if inspect.stack()[1][3] == "handle_startendtag":`
			`self.sanitised += "/"`
			`self.sanitised += ">"`

			`def handle_starttag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_endtag(self, tag):`
			`self.add_if_acceptable(tag)`

			`def handle_startendtag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_data(self, data):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`self.sanitised += data`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def handle_charref(self, name):`
			`self.sanitised += "&#" + name + ";"`

			`def handle_entityref(self, name):`
			`self.sanitised += "&" + name + ";"`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`def feed(self, data):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`try:`
			`data = unicode(data, 'utf-8')`
			`except UnicodeDecodeError:`
			`data = unicode(data, 'utf-8', errors='replace')`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`HTMLParser.feed(self, data)`
HTML parser fix - URLs followed with space were broken 2017-02-28 22:47:56 +01:00			`tmp = SafeHTMLParser.replace_pre(data)`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`tmp = SafeHTMLParser.uriregex1.sub(`
			`r'<a href="\1">\1</a>',`
			`tmp)`
			`tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)`
			`tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)`
HTML parser fix - URLs followed with space were broken 2017-02-28 22:47:56 +01:00			`tmp = SafeHTMLParser.replace_post(tmp)`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`self.raw += tmp`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def is_html(self, text = None, allow_picture = False):`
			`if text:`
			`self.reset()`
			`self.reset_safe()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`self.allow_picture = allow_picture`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.feed(text)`
			`self.close()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`return self.has_html`