PyBitmessage-2024-12-12/src/bitmessageqt/safehtmlparser.py

from HTMLParser import HTMLParser
import inspect
import re
from urllib import quote, quote_plus
from urlparse import urlparse
from debug import logger

class SafeHTMLParser(HTMLParser):
    """Whitelist-based HTML sanitiser that also builds a plain-text view.

    Feeding data fills three attributes:

    * ``sanitised`` -- the input with every tag that is not listed in
      ``acceptable_elements`` dropped and ``src`` attributes filtered,
    * ``raw`` -- the input escaped for display as plain text, with URIs and
      e-mail addresses wrapped in ``<a>`` links,
    * ``has_html`` -- True once at least one acceptable tag has been seen.
    """

    # from html5lib.sanitiser
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
    # Ordered (old, new) substitutions used to render text in raw mode;
    # "&" must stay first so later entities are not escaped twice.
    replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
    # URI schemes a "src" attribute may use while external sources are off.
    src_schemes = [ "data" ]
    uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
    emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')
    # Characters that must be escaped inside a double-quoted attribute value;
    # "&" first so the entities produced afterwards are left alone.
    _attr_escapes = (("&", "&amp;"), ("\"", "&quot;"), ("<", "&lt;"), (">", "&gt;"))

    @staticmethod
    def multi_replace(text):
        """Return ``text`` escaped for raw display using ``replaces``.

        A leading space of a text longer than one character is turned into
        ``&nbsp;`` so that it is not collapsed by the renderer.
        """
        for old, new in SafeHTMLParser.replaces:
            text = text.replace(old, new)
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
        return text

    @staticmethod
    def _escape_attr_value(val):
        """Escape ``val`` so it cannot terminate a double-quoted attribute."""
        for old, new in SafeHTMLParser._attr_escapes:
            val = val.replace(old, new)
        return val

    def __init__(self, *args, **kwargs):
        """Initialise the underlying HTMLParser and the sanitiser state."""
        HTMLParser.__init__(self, *args, **kwargs)
        self.reset_safe()

    def reset_safe(self):
        """Reset the sanitiser state (outputs, detection flag, policies)."""
        self.elements = set()
        self.raw = u""
        self.sanitised = u""
        self.has_html = False
        self.allow_picture = False
        self.allow_external_src = False

    def _filter_src(self, tag, val):
        """Return the value to emit for a ``src`` attribute of ``tag``.

        Images are blanked unless ``allow_picture`` is set; any other source
        (and allowed images) is blanked when its scheme is not listed in
        ``src_schemes`` and ``allow_external_src`` is off.
        """
        if tag == "img" and not self.allow_picture:
            return ""
        if self.allow_external_src or val is None:
            # Nothing to check: either everything is allowed or the
            # attribute carries no value at all.
            return val
        if urlparse(val).scheme not in SafeHTMLParser.src_schemes:
            return ""
        return val

    def _append_tag(self, tag, attrs=None, closing=False, self_closing=False):
        """Append ``tag`` to ``sanitised`` if it is an acceptable element.

        :param tag: lower-cased tag name as delivered by HTMLParser
        :param attrs: list of ``(name, value)`` pairs or None
        :param closing: emit an end tag (``</tag>``)
        :param self_closing: emit an empty-element tag (``<tag/>``)
        """
        if tag not in SafeHTMLParser.acceptable_elements:
            return
        pieces = ["</" if closing else "<", tag]
        for attr, val in attrs or ():
            if attr == "src":
                val = self._filter_src(tag, val)
            # quote_plus() cannot handle non-ASCII unicode on Python 2,
            # so always hand it UTF-8 encoded bytes.
            if not isinstance(attr, bytes):
                attr = attr.encode("utf-8")
            pieces.append(" " + quote_plus(attr))
            if val is not None:
                # HTMLParser hands over attribute values with entities
                # already decoded; re-escape them so a quote inside the
                # value cannot break out of the attribute.
                pieces.append(
                    "=\"" + SafeHTMLParser._escape_attr_value(val) + "\"")
        pieces.append("/>" if self_closing else ">")
        self.sanitised += "".join(pieces)

    def add_if_acceptable(self, tag, attrs = None):
        """Append ``tag`` to ``sanitised`` if it is an acceptable element.

        Kept for backward compatibility: whether an end tag or an
        empty-element tag is written is derived from the name of the calling
        function (``handle_endtag`` / ``handle_startendtag``).  The handlers
        of this class pass that information explicitly to ``_append_tag``
        instead, because inspecting the stack for every tag is slow.
        """
        if tag not in SafeHTMLParser.acceptable_elements:
            return
        caller = inspect.stack()[1][3]
        self._append_tag(
            tag, attrs,
            closing=(caller == "handle_endtag"),
            self_closing=(caller == "handle_startendtag"))

    def handle_starttag(self, tag, attrs):
        """Record and emit an opening tag if it is acceptable."""
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self._append_tag(tag, attrs)

    def handle_endtag(self, tag):
        """Emit a closing tag if it is acceptable."""
        self._append_tag(tag, closing=True)

    def handle_startendtag(self, tag, attrs):
        """Record and emit an empty-element tag if it is acceptable."""
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self._append_tag(tag, attrs, self_closing=True)

    def handle_data(self, data):
        """Copy text content to the sanitised output unchanged."""
        self.sanitised += data

    def handle_charref(self, name):
        """Re-emit a numeric character reference verbatim."""
        self.sanitised += "&#" + name + ";"

    def handle_entityref(self, name):
        """Re-emit a named entity reference verbatim."""
        self.sanitised += "&" + name + ";"

    def feed(self, data):
        """Parse ``data`` and extend both ``sanitised`` and ``raw``.

        Anything that is not already a unicode string is decoded as UTF-8;
        undecodable bytes are replaced instead of raising.
        """
        text_type = type(u"")
        if not isinstance(data, text_type):
            # "replace" yields the same result as a strict decode whenever
            # the input is valid, so a single call covers both cases.
            data = text_type(data, 'utf-8', 'replace')
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.multi_replace(data)
        tmp = SafeHTMLParser.uriregex1.sub(
            r'<a href="\1">\1</a>',
            tmp)
        # The URI was matched in already escaped text; undo the "&amp;"
        # that ended up inside the generated href.
        tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
        tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
        self.raw += tmp

    def is_html(self, text = None, allow_picture = False):
        """Return True if acceptable HTML tags were found.

        When ``text`` is given, the parser is reset and ``text`` is parsed
        with the requested picture policy first; otherwise the result of the
        previous parse is returned.
        """
        if text:
            self.reset()
            self.reset_safe()
            self.allow_picture = allow_picture
            self.feed(text)
            self.close()
        return self.has_html
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from HTMLParser import HTMLParser`
			`import inspect`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`import re`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from urllib import quote, quote_plus`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`from urlparse import urlparse`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on Windows, there was still a bug where overly long messages caused the hyperlink regexp parser to freeze, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00			`from debug import logger`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`class SafeHTMLParser(HTMLParser):`
			`# from html5lib.sanitiser`
			`acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',`
			`'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',`
			`'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',`
			`'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',`
			`'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',`
			`'figcaption', 'figure', 'footer', 'font', 'header', 'h1',`
			`'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',`
			`'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',`
			`'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',`
			`'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',`
			`'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',`
			`'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',`
			`'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`replaces = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"], ["\n", "<br/>"], ["\t", "    "], [" ", "  "], [" ", "  "], ["<br/> ", "<br/> "]]`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`src_schemes = [ "data" ]`
Add bitcoin URI handler for message viewer 2016-03-18 14:09:28 +01:00			uriregex1 = re.compile(r'(?i)\b((?:(https?\|ftp\|bitcoin):(?:/{1,3}\|[a-z0-9%])\|www\d{0,3}[.]\|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+\|\(([^\s()<>]+\|(\([^\s()<>]+\)))\))+(?:\(([^\s()<>]+\|(\([^\s()<>]+\)))\)\|[^\s`!()\[\]{};:\'".,<>?]))')
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`uriregex2 = re.compile(r'<a href="([^"]+)&')`
			`emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`@staticmethod`
			`def multi_replace(text):`
			`for a in SafeHTMLParser.replaces:`
			`text = text.replace(a[0], a[1])`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`if len(text) > 1 and text[0] == " ":`
			`text = " " + text[1:]`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`return text`

HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def __init__(self, args, *kwargs):`
			`HTMLParser.__init__(self, args, *kwargs)`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`self.reset_safe()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def reset_safe(self):`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.elements = set()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.raw = u""`
			`self.sanitised = u""`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.has_html = False`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`self.allow_picture = False`
			`self.allow_external_src = False`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def add_if_acceptable(self, tag, attrs = None):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`if not tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`return`
			`self.sanitised += "<"`
			`if inspect.stack()[1][3] == "handle_endtag":`
			`self.sanitised += "/"`
			`self.sanitised += tag`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`if not attrs is None:`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`for attr, val in attrs:`
			`if tag == "img" and attr == "src" and not self.allow_picture:`
			`val = ""`
			`elif attr == "src" and not self.allow_external_src:`
			`url = urlparse(val)`
			`if url.scheme not in SafeHTMLParser.src_schemes:`
			`val == ""`
			`self.sanitised += " " + quote_plus(attr)`
			`if not (val is None):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`self.sanitised += "=\"" + val + "\""`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`if inspect.stack()[1][3] == "handle_startendtag":`
			`self.sanitised += "/"`
			`self.sanitised += ">"`

			`def handle_starttag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_endtag(self, tag):`
			`self.add_if_acceptable(tag)`

			`def handle_startendtag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_data(self, data):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`self.sanitised += data`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def handle_charref(self, name):`
			`self.sanitised += "&#" + name + ";"`

			`def handle_entityref(self, name):`
			`self.sanitised += "&" + name + ";"`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`def feed(self, data):`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`try:`
			`data = unicode(data, 'utf-8')`
			`except UnicodeDecodeError:`
			`data = unicode(data, 'utf-8', errors='replace')`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`HTMLParser.feed(self, data)`
			`tmp = SafeHTMLParser.multi_replace(data)`
SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem 2017-02-22 09:05:05 +01:00			`tmp = SafeHTMLParser.uriregex1.sub(`
			`r'<a href="\1">\1</a>',`
			`tmp)`
			`tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)`
			`tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 00:47:07 +01:00			`self.raw += tmp`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def is_html(self, text = None, allow_picture = False):`
			`if text:`
			`self.reset()`
			`self.reset_safe()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`self.allow_picture = allow_picture`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.feed(text)`
			`self.close()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 10:13:39 +01:00			`return self.has_html`