HTML parser fix

- URLs followed by a space were broken
2017-02-28 22:47:56 +01:00 · 2017-02-28 22:47:56 +01:00 · 171bc83ec0
commit 171bc83ec0
parent b7e75b9bc5
1 changed file with 12 additions and 4 deletions
--- a/src/bitmessageqt/safehtmlparser.py
+++ b/src/bitmessageqt/safehtmlparser.py
@@ -19,15 +19,22 @@ class SafeHTMLParser(HTMLParser):
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-    replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
+    replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
+    replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
    src_schemes = [ "data" ]
    uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
    emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')

    @staticmethod
-    def multi_replace(text):
-        for a in SafeHTMLParser.replaces:
+    def replace_pre(text):
+        for a in SafeHTMLParser.replaces_pre:
+            text = text.replace(a[0], a[1])
+        return text
+
+    @staticmethod
+    def replace_post(text):
+        for a in SafeHTMLParser.replaces_post:
            text = text.replace(a[0], a[1])
        if len(text) > 1 and text[0] == " ":
            text = "&nbsp;" + text[1:]
@@ -95,12 +102,13 @@ class SafeHTMLParser(HTMLParser):
        except UnicodeDecodeError:
            data = unicode(data, 'utf-8', errors='replace')
        HTMLParser.feed(self, data)
-        tmp = SafeHTMLParser.multi_replace(data)
+        tmp = SafeHTMLParser.replace_pre(data)
        tmp = SafeHTMLParser.uriregex1.sub(
            r'<a href="\1">\1</a>',
            tmp)
        tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
        tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
+        tmp = SafeHTMLParser.replace_post(tmp)
        self.raw += tmp

    def is_html(self, text = None, allow_picture = False):