PyBitmessage-2024-12-01/src/bitmessageqt/safehtmlparser.py

from HTMLParser import HTMLParser
import inspect
from urllib import quote, quote_plus

class SafeHTMLParser(HTMLParser):
    """Whitelist-based HTML filter and HTML detector for message bodies.

    Feeding text to the parser fills two unicode buffers:

    * ``sanitised`` -- the input re-serialised with every element that is
      not listed in ``acceptable_elements`` dropped (its text content is
      kept).  Text and attribute values are HTML-escaped on the way out.
    * ``raw`` -- the input rendered as plain text, i.e. passed through
      ``multi_replace`` so that markup shows up literally.

    ``has_html`` becomes True as soon as one acceptable start tag is seen.

    NOTE: only element names are whitelisted.  Attributes of accepted
    elements are passed through (escaped, but otherwise unfiltered), with
    the single exception of ``<img src>``, which is blanked unless
    ``allow_picture`` is true.
    """

    # from html5lib.sanitiser
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
    # Ordered (old, new) pairs applied by multi_replace.  Order matters: "&"
    # must be escaped before the entities introduced by the later pairs, and
    # the double-space pair is listed twice so that a second pass catches
    # pairs of spaces left over by the first one.
    replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]

    @staticmethod
    def multi_replace(text):
        """Return *text* converted to HTML that displays it as plain text.

        Applies every pair of ``replaces`` in order, then turns a leading
        space into ``&nbsp;`` (only when the text is longer than one
        character) so that it is not collapsed by the renderer.
        """
        for old, new in SafeHTMLParser.replaces:
            text = text.replace(old, new)
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
        return text

    @staticmethod
    def _to_unicode(data):
        """Return *data* as text, decoding byte strings as UTF-8.

        Undecodable bytes are replaced rather than raising.  Text that is
        already unicode is returned unchanged (``unicode(x, 'utf-8')``
        would raise TypeError for it).
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', 'replace')
        return data

    @staticmethod
    def _escape(text, quote=False):
        """HTML-escape ``&``, ``<`` and ``>`` in *text*; also ``"`` if *quote*."""
        # "&" first, otherwise the entities produced below get re-escaped.
        text = text.replace(u"&", u"&amp;")
        text = text.replace(u"<", u"&lt;").replace(u">", u"&gt;")
        if quote:
            text = text.replace(u'"', u"&quot;")
        return text

    @staticmethod
    def _quote_name(name):
        """Percent-encode an attribute name so it cannot carry markup."""
        # quote_plus is fed bytes so that non-ASCII names never make it fail.
        if not isinstance(name, bytes):
            name = name.encode('utf-8')
        return quote_plus(name)

    def __init__(self, *args, **kwargs):
        # HTMLParser is an old-style class on Python 2, so no super() here.
        HTMLParser.__init__(self, *args, **kwargs)
        # Images stay blocked until a caller opts in; defined here so that
        # parsing never hits a missing attribute.
        self.allow_picture = False
        self.reset_safe()

    def reset_safe(self):
        """Clear the output buffers and the HTML-detected flag.

        Does not touch ``allow_picture`` and does not reset the underlying
        HTMLParser state (call ``reset()`` for that).
        """
        self.elements = set()
        self.raw = u""
        self.sanitised = u""
        self.has_html = False

    def add_if_acceptable(self, tag, attrs=None, closing=False,
                          self_closing=False):
        """Append *tag* to ``sanitised`` if it is a whitelisted element.

        :param tag: lower-cased element name as delivered by HTMLParser.
        :param attrs: list of ``(name, value)`` pairs or None; a value of
            None means an attribute without ``=value``.
        :param closing: emit an end tag (``</tag>``).
        :param self_closing: emit an empty-element tag (``<tag ... />``).

        The tag kind is passed explicitly by the handlers instead of being
        guessed from the call stack, which was slow and fragile.
        """
        if tag not in self.acceptable_elements:
            return
        parts = [u"</" if closing else u"<", tag]
        for name, value in attrs or ():
            # attrs holds immutable tuples, so blank the local copy instead
            # of assigning into the pair.
            if tag == "img" and name == "src" and not self.allow_picture:
                value = ""
            parts.append(u" " + self._quote_name(name))
            if value is not None:
                # HTMLParser hands over entity-decoded values; re-escape
                # them so a '"' cannot terminate the attribute early.
                parts.append(
                    u'="' + self._escape(self._to_unicode(value), quote=True)
                    + u'"')
        parts.append(u"/>" if self_closing else u">")
        self.sanitised += u"".join(parts)

    def handle_starttag(self, tag, attrs):
        """Record that HTML was seen and emit an acceptable start tag."""
        if tag in self.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs)

    def handle_endtag(self, tag):
        """Emit an acceptable end tag."""
        self.add_if_acceptable(tag, closing=True)

    def handle_startendtag(self, tag, attrs):
        """Record that HTML was seen and emit an acceptable ``<tag/>``."""
        if tag in self.acceptable_elements:
            self.has_html = True
        self.add_if_acceptable(tag, attrs, self_closing=True)

    def handle_data(self, data):
        """Append text content, escaped so that it can never form markup.

        This matters for the content of dropped ``<script>``/``<style>``
        elements, which the parser delivers here verbatim.
        """
        self.sanitised += self._escape(self._to_unicode(data))

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged."""
        self.sanitised += "&#" + name + ";"

    def handle_entityref(self, name):
        """Re-emit a named entity reference unchanged."""
        self.sanitised += "&" + name + ";"

    def feed(self, data):
        """Parse *data* into ``sanitised`` and append its plain form to ``raw``."""
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.multi_replace(data)
        self.raw += self._to_unicode(tmp)

    def is_html(self, text=None, allow_picture=False):
        """Return True if acceptable HTML elements were found.

        With a non-empty *text* the parser is reset and *text* is parsed
        first, with ``<img src>`` kept only if *allow_picture* is true.
        Without *text* the result of the previous parse is returned and
        *allow_picture* is ignored.
        """
        if text:
            self.reset()
            self.reset_safe()
            # Must be set before feeding: add_if_acceptable reads it while
            # the text is being parsed.
            self.allow_picture = allow_picture
            self.feed(text)
            self.close()
        return self.has_html
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`from HTMLParser import HTMLParser`
			`import inspect`
			`from urllib import quote, quote_plus`

			`class SafeHTMLParser(HTMLParser):`
			`# from html5lib.sanitiser`
			`acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',`
			`'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',`
			`'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',`
			`'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',`
			`'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',`
			`'figcaption', 'figure', 'footer', 'font', 'header', 'h1',`
			`'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',`
			`'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',`
			`'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',`
			`'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',`
			`'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',`
			`'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',`
			`'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`@staticmethod`
			`def multi_replace(text):`
			`for a in SafeHTMLParser.replaces:`
			`text = text.replace(a[0], a[1])`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`if len(text) > 1 and text[0] == " ":`
			`text = "&nbsp;" + text[1:]`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00			`return text`

HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def __init__(self, *args, **kwargs):`
			`HTMLParser.__init__(self, *args, **kwargs)`
Message body display handling of spaces After the changes in the message body renderer, spaces were not correctly handled. Fixes #168 2016-01-23 09:53:14 +01:00			`self.reset_safe()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def reset_safe(self):`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.elements = set()`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`self.raw = u""`
			`self.sanitised = u""`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`self.has_html = False`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00
			`def add_if_acceptable(self, tag, attrs = None):`
			`if not tag in self.acceptable_elements:`
			`return`
			`self.sanitised += "<"`
			`if inspect.stack()[1][3] == "handle_endtag":`
			`self.sanitised += "/"`
			`self.sanitised += tag`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`if not attrs is None:`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`for attr in attrs:`
			`if tag == "img" and attr[0] == "src" and not self.allow_picture:`
			`attr[1] = ""`
Message has safe link opening Links in message body (if in HTML mode) now open, but it asks for a confirmation in a dialog box. Fixes #27 2015-12-15 01:24:10 +01:00			`self.sanitised += " " + quote_plus(attr[0])`
Lazy rendering of message contents Message will render as user is scrolling down. This prevents interface freezes on long messages (such as inline img in text mode). Fixes Bitmessage##366 Also a minor fix in text mode rendering. 2015-12-15 03:51:06 +01:00			`if not (attr[1] is None):`
Message has safe link opening Links in message body (if in HTML mode) now open, but it asks for a confirmation in a dialog box. Fixes #27 2015-12-15 01:24:10 +01:00			`self.sanitised += "=\"" + attr[1] + "\""`
HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`if inspect.stack()[1][3] == "handle_startendtag":`
			`self.sanitised += "/"`
			`self.sanitised += ">"`

			`def handle_starttag(self, tag, attrs):`
			`if tag in self.acceptable_elements:`
			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_endtag(self, tag):`
			`self.add_if_acceptable(tag)`

			`def handle_startendtag(self, tag, attrs):`
			`if tag in self.acceptable_elements:`
			`self.has_html = True`
			`self.add_if_acceptable(tag, attrs)`

			`def handle_data(self, data):`
			`self.sanitised += unicode(data, 'utf-8', 'replace')`

			`def handle_charref(self, name):`
			`self.sanitised += "&#" + name + ";"`

			`def handle_entityref(self, name):`
			`self.sanitised += "&" + name + ";"`
Html parser fixes Raw mode improved, avoid HTML parser entirely and just replaces some strings. 2015-12-16 14:20:51 +01:00
			`def feed(self, data):`
			`HTMLParser.feed(self, data)`
			`tmp = SafeHTMLParser.multi_replace(data)`
			`self.raw += unicode(tmp, 'utf-8', 'replace')`

HTML detector and switcher HTML messages are detected and if present, the top of the message textedit displays a clickable area that switches HTML rendering on and off. Fixes #13 2015-12-14 19:43:39 +01:00			`def is_html(self, text = None, allow_picture = False):`
			`if text:`
			`self.reset()`
			`self.reset_safe()`
			`self.feed(text)`
			`self.close()`
			`self.allow_picture = allow_picture`
			`return self.has_html`