PyBitmessage-2025-03-02/src/bitmessageqt/safehtmlparser.py

from HTMLParser import HTMLParser
import inspect
import multiprocessing
import re
import Queue
from urllib import quote, quote_plus
from urlparse import urlparse
from debug import logger
from shared import parserInputQueue, parserOutputQueue, parserProcess, parserLock

def regexpSubprocess(parserInputQueue, parserOutputQueue):
    for data in iter(parserInputQueue.get, None):
        if data is None:
            break;
        try:
            result = SafeHTMLParser.uriregex1.sub(
                r'<a href="\1">\1</a>',
                data)
            result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)
            result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)
            parserOutputQueue.put(result)
        except SystemExit:
            break;
        except:
            break;

class SafeHTMLParser(HTMLParser):
    # from html5lib.sanitiser
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
    replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
    src_schemes = [ "data" ]
    uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
    emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')

    @staticmethod
    def multi_replace(text):
        for a in SafeHTMLParser.replaces:
            text = text.replace(a[0], a[1])
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
        return text

    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self, *args, **kwargs)
        self.reset_safe()
        
    def reset_safe(self):
        self.elements = set()
        self.raw = u""
        self.sanitised = u""
        self.has_html = False
        self.allow_picture = False
        self.allow_external_src = False

    def add_if_acceptable(self, tag, attrs = None):
        if not tag in SafeHTMLParser.acceptable_elements:
            return
        self.sanitised += "<"
        if inspect.stack()[1][3] == "handle_endtag":
            self.sanitised += "/"
        self.sanitised += tag
        if not attrs is None:
            for attr, val in attrs:
                if tag == "img" and attr == "src" and not self.allow_picture:
                    val = ""
                elif attr == "src" and not self.allow_external_src:
                    url = urlparse(val)
                    if url.scheme not in SafeHTMLParser.src_schemes:
                        val == ""
                self.sanitised += " " + quote_plus(attr)
                if not (val is None):
                    self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\""
        if inspect.stack()[1][3] == "handle_startendtag":
            self.sanitised += "/"
        self.sanitised += ">"
    
    def handle_starttag(self, tag, attrs):
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)

    def handle_endtag(self, tag):
        self.add_if_acceptable(tag)
        
    def handle_startendtag(self, tag, attrs):
        if tag in SafeHTMLParser.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)
    
    def handle_data(self, data):
        self.sanitised += unicode(data, 'utf-8', 'replace')
        
    def handle_charref(self, name):
        self.sanitised += "&#" + name + ";"
    
    def handle_entityref(self, name):
        self.sanitised += "&" + name + ";"

    def feed(self, data):
        global parserLock, parserProcess, parserInputQueue, parserOutputQueue
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.multi_replace(data)
        tmp = unicode(tmp, 'utf-8', 'replace')
        
        parserLock.acquire()
        if parserProcess is None:
            parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
            parserProcess.start()
        parserLock.release()
        parserInputQueue.put(tmp)
        try:
            tmp = parserOutputQueue.get(True, 1)
        except Queue.Empty:
            logger.error("Regular expression parsing timed out, not displaying links")
            parserLock.acquire()
            parserProcess.terminate()
            parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
            parserProcess.start()
            parserLock.release()
        else:
            pass
        self.raw += tmp

    def is_html(self, text = None, allow_picture = False):
        if text:
            self.reset()
            self.reset_safe()
            self.allow_picture = allow_picture
            self.feed(text)
            self.close()
        return self.has_html
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from HTMLParser import HTMLParser`
			`import inspect`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`import multiprocessing`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 07:47:07 +08:00			`import re`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`import Queue`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from urllib import quote, quote_plus`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`from urlparse import urlparse`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on windows, there was still a bug where overly long messages caused freezeing of the hyperlink regexp parser, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00			`from debug import logger`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`from shared import parserInputQueue, parserOutputQueue, parserProcess, parserLock`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on windows, there was still a bug where overly long messages caused freezeing of the hyperlink regexp parser, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`def regexpSubprocess(parserInputQueue, parserOutputQueue):`
			`for data in iter(parserInputQueue.get, None):`
			`if data is None:`
			`break;`
			`try:`
			`result = SafeHTMLParser.uriregex1.sub(`
			`r'<a href="\1">\1</a>',`
			`data)`
			`result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)`
			`result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)`
			`parserOutputQueue.put(result)`
			`except SystemExit:`
			`break;`
			`except:`
			`break;`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`class SafeHTMLParser(HTMLParser):`
			`# from html5lib.sanitiser`
			`acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',`
			`'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',`
			`'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',`
			`'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',`
			`'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',`
			`'figcaption', 'figure', 'footer', 'font', 'header', 'h1',`
			`'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',`
			`'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',`
			`'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',`
			`'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',`
			`'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',`
			`'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',`
			`'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`replaces = [["&", "&"], ["\"", """], ["<", "<"], [">", ">"], ["\n", "<br/>"], ["\t", "    "], [" ", "  "], [" ", "  "], ["<br/> ", "<br/> "]]`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`src_schemes = [ "data" ]`
Add bitcoin URI handler for message viewer 2016-03-18 14:09:28 +01:00			uriregex1 = re.compile(r'(?i)\b((?:(https?\|ftp\|bitcoin):(?:/{1,3}\|[a-z0-9%])\|www\d{0,3}[.]\|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+\|\(([^\s()<>]+\|(\([^\s()<>]+\)))\))+(?:\(([^\s()<>]+\|(\([^\s()<>]+\)))\)\|[^\s`!()\[\]{};:\'".,<>?]))')
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 07:47:07 +08:00			`uriregex2 = re.compile(r'<a href="([^"]+)&')`
			`emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`@staticmethod`
			`def multi_replace(text):`
			`for a in SafeHTMLParser.replaces:`
			`text = text.replace(a[0], a[1])`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`if len(text) > 1 and text[0] == " ":`
			`text = " " + text[1:]`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`return text`

HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def __init__(self, args, *kwargs):`
			`HTMLParser.__init__(self, args, *kwargs)`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`self.reset_safe()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def reset_safe(self):`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.elements = set()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.raw = u""`
			`self.sanitised = u""`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.has_html = False`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`self.allow_picture = False`
			`self.allow_external_src = False`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def add_if_acceptable(self, tag, attrs = None):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`if not tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`return`
			`self.sanitised += "<"`
			`if inspect.stack()[1][3] == "handle_endtag":`
			`self.sanitised += "/"`
			`self.sanitised += tag`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`if not attrs is None:`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`for attr, val in attrs:`
			`if tag == "img" and attr == "src" and not self.allow_picture:`
			`val = ""`
			`elif attr == "src" and not self.allow_external_src:`
			`url = urlparse(val)`
			`if url.scheme not in SafeHTMLParser.src_schemes:`
			`val == ""`
			`self.sanitised += " " + quote_plus(attr)`
			`if not (val is None):`
HTML parser fix - the UTF-8-ifying of tags had missing brackets 2016-10-23 18:35:20 +02:00			`self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\""`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`if inspect.stack()[1][3] == "handle_startendtag":`
			`self.sanitised += "/"`
			`self.sanitised += ">"`

			`def handle_starttag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_endtag(self, tag):`
			`self.add_if_acceptable(tag)`

			`def handle_startendtag(self, tag, attrs):`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`if tag in SafeHTMLParser.acceptable_elements:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_data(self, data):`
			`self.sanitised += unicode(data, 'utf-8', 'replace')`

			`def handle_charref(self, name):`
			`self.sanitised += "&#" + name + ";"`

			`def handle_entityref(self, name):`
			`self.sanitised += "&" + name + ";"`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`def feed(self, data):`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`global parserLock, parserProcess, parserInputQueue, parserOutputQueue`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`HTMLParser.feed(self, data)`
			`tmp = SafeHTMLParser.multi_replace(data)`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on windows, there was still a bug where overly long messages caused freezeing of the hyperlink regexp parser, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00			`tmp = unicode(tmp, 'utf-8', 'replace')`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00
			`parserLock.acquire()`
			`if parserProcess is None:`
			`parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))`
			`parserProcess.start()`
			`parserLock.release()`
			`parserInputQueue.put(tmp)`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on windows, there was still a bug where overly long messages caused freezeing of the hyperlink regexp parser, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00			`try:`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`tmp = parserOutputQueue.get(True, 1)`
			`except Queue.Empty:`
Long message parsing fix - while 448ceaa74ca6929f626fa36962c4f3c0ca0f0659 fixed slow rendering on windows, there was still a bug where overly long messages caused freezeing of the hyperlink regexp parser, which appears to happen on all platforms. Maybe it's a freeze, maybe it just takes too long. This patch aborts the regexp parser after 1 second and simply displays the message without hyperlinks being clickable. This doesn't affect HTML mode because there the links are kept as they are - Fixes #900 2016-10-21 15:54:02 +02:00			`logger.error("Regular expression parsing timed out, not displaying links")`
Freezing message parser fix #2 - this has been tested on Windows as well, and has been cleaned up. There is now a permanent parser thread, and it restarts when the parsing takes more than 1 second - Fixes #900 2016-10-22 01:45:32 +02:00			`parserLock.acquire()`
			`parserProcess.terminate()`
			`parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))`
			`parserProcess.start()`
			`parserLock.release()`
			`else:`
			`pass`
Clickable email and http links in plain text Email addresses and URIs are now clickable when viewing a message in plain text mode. Clicking an email address moves to the Send tab, while clicking an URI has the same result as clicking an URI in html mode, it will ask for confirmation before opening it in external handler. 2016-02-29 07:47:07 +08:00			`self.raw += tmp`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def is_html(self, text = None, allow_picture = False):`
			`if text:`
			`self.reset()`
			`self.reset_safe()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`self.allow_picture = allow_picture`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.feed(text)`
			`self.close()`
HTML parser updates HTML parser wasn't correctly handling img tags. Now it also by defaults disabled external schemas to prevent deanonymisation (even though the renderer actually doesn't support external schemas at the moment) Addresses #178 2016-02-25 17:13:39 +08:00			`return self.has_html`