SafeHTMLParser unicode / subprocess

- Don't run a subprocess in SafeHTMLParser: it doesn't work in frozen mode, and an attempt to fix it would take too much refactoring, and I'm not even sure it would work. - Instead, make the parser handle broken unicode correctly. - I think the previously reported freezes were caused by trying to interpret the data as unicode, which caused a crash. - The parser handles about 1 MB/s on my machine, so dropping the timeout is not a big problem.
2017-02-22 09:05:05 +01:00 · 2017-02-22 09:05:05 +01:00 · 266d8eba1f
commit 266d8eba1f
parent fd95f8f519
3 changed files with 12 additions and 57 deletions
--- a/src/bitmessageqt/safehtmlparser.py
+++ b/src/bitmessageqt/safehtmlparser.py
@ -1,28 +1,9 @@
 from HTMLParser import HTMLParser
 import inspect
-import multiprocessing
 import re
-import Queue
 from urllib import quote, quote_plus
 from urlparse import urlparse
 from debug import logger
-from queues import parserInputQueue, parserOutputQueue, parserProcess, parserLock
-
-def regexpSubprocess(parserInputQueue, parserOutputQueue):
-    for data in iter(parserInputQueue.get, None):
-        if data is None:
-            break;
-        try:
-            result = SafeHTMLParser.uriregex1.sub(
-                r'<a href="\1">\1</a>',
-                data)
-            result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)
-            result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)
-            parserOutputQueue.put(result)
-        except SystemExit:
-            break;
-        except:
-            break;

 class SafeHTMLParser(HTMLParser):
    # from html5lib.sanitiser
@ -82,7 +63,7 @@ class SafeHTMLParser(HTMLParser):
                        val == ""
                self.sanitised += " " + quote_plus(attr)
                if not (val is None):
-                    self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\""
+                    self.sanitised += "=\"" + val + "\""
        if inspect.stack()[1][3] == "handle_startendtag":
            self.sanitised += "/"
        self.sanitised += ">"
@ -101,7 +82,7 @@ class SafeHTMLParser(HTMLParser):
        self.add_if_acceptable(tag, attrs)
    
    def handle_data(self, data):
-        self.sanitised += unicode(data, 'utf-8', 'replace')
+        self.sanitised += data
        
    def handle_charref(self, name):
        self.sanitised += "&#" + name + ";"
@ -110,34 +91,17 @@ class SafeHTMLParser(HTMLParser):
        self.sanitised += "&" + name + ";"

    def feed(self, data):
-        global parserProcess
+        try:
+            data = unicode(data, 'utf-8')
+        except UnicodeDecodeError:
+            data = unicode(data, 'utf-8', errors='replace')
        HTMLParser.feed(self, data)
        tmp = SafeHTMLParser.multi_replace(data)
-        tmp = unicode(tmp, 'utf-8', 'replace')
-        
-        parserLock.acquire()
-        if parserProcess is None:
-            parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
-            parserProcess.start()
-        parserLock.release()
-        # flush queue
-        try:
-            while True:
-                tmp = parserOutputQueue.get(False)
-        except Queue.Empty:
-            logger.debug("Parser queue flushed")
-            pass
-        parserInputQueue.put(tmp)
-        try:
-            tmp = parserOutputQueue.get(True, 1)
-        except Queue.Empty:
-            logger.error("Regular expression parsing timed out, not displaying links")
-            parserLock.acquire()
-            parserProcess.terminate()
-            parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
-            parserProcess.start()
-            parserLock.release()
-
+        tmp = SafeHTMLParser.uriregex1.sub(
+            r'<a href="\1">\1</a>',
+            tmp)
+        tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
+        tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
        self.raw += tmp

    def is_html(self, text = None, allow_picture = False):
--- a/src/queues.py
+++ b/src/queues.py
@ -9,8 +9,3 @@ addressGeneratorQueue = Queue.Queue()
 objectProcessorQueue = ObjectProcessorQueue()
 apiAddressGeneratorReturnQueue = Queue.Queue(
    )  # The address generator thread uses this queue to get information back to the API thread.
-
-parserProcess = None
-parserLock = mpLock()
-parserInputQueue = mpQueue()
-parserOutputQueue = mpQueue()
--- a/src/shutdown.py
+++ b/src/shutdown.py
@ -12,16 +12,12 @@ from helper_threading import StoppableThread
 from knownnodes import saveKnownNodes
 from inventory import Inventory
 import protocol
-from queues import addressGeneratorQueue, objectProcessorQueue, parserInputQueue, UISignalQueue, workerQueue
+from queues import addressGeneratorQueue, objectProcessorQueue, UISignalQueue, workerQueue
 import shared
 import state

 def doCleanShutdown():
    state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit.
-    try:
-        parserInputQueue.put(None, False)
-    except Queue.Full:
-        pass
    protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data'))   
    objectProcessorQueue.put(('checkShutdownVariable', 'no data'))
    for thread in threading.enumerate():