From 266d8eba1f9e2d666332e2ab9cd76bda55b20c87 Mon Sep 17 00:00:00 2001 From: Peter Surda Date: Wed, 22 Feb 2017 09:05:05 +0100 Subject: [PATCH] SafeHTMLParser unicode / subprocess - don't do subprocess in SafeHTMLParser, it doesn't work in frozen mode and an attempt to fix it would take too much refactoring and I'm not even sure it would work - instead, make it handle broken unicode correctly - I think the previous reports of freezes were caused by trying to interpret data as unicode, causing a crash - it does about 1MB/s on my machine, so a timeout is not a big problem --- src/bitmessageqt/safehtmlparser.py | 58 ++++++------------------------ src/queues.py | 5 --- src/shutdown.py | 6 +--- 3 files changed, 12 insertions(+), 57 deletions(-) diff --git a/src/bitmessageqt/safehtmlparser.py b/src/bitmessageqt/safehtmlparser.py index 5fe5265c..1ff12c8e 100644 --- a/src/bitmessageqt/safehtmlparser.py +++ b/src/bitmessageqt/safehtmlparser.py @@ -1,28 +1,9 @@ from HTMLParser import HTMLParser import inspect -import multiprocessing import re -import Queue from urllib import quote, quote_plus from urlparse import urlparse from debug import logger -from queues import parserInputQueue, parserOutputQueue, parserProcess, parserLock - -def regexpSubprocess(parserInputQueue, parserOutputQueue): - for data in iter(parserInputQueue.get, None): - if data is None: - break; - try: - result = SafeHTMLParser.uriregex1.sub( - r'\1', - data) - result = SafeHTMLParser.uriregex2.sub(r'\1', result) - parserOutputQueue.put(result) - except SystemExit: - break; - except: - break; class SafeHTMLParser(HTMLParser): # from html5lib.sanitiser @@ -82,7 +63,7 @@ class SafeHTMLParser(HTMLParser): val == "" self.sanitised += " " + quote_plus(attr) if not (val is None): - self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\"" + self.sanitised += "=\"" + val + "\"" if inspect.stack()[1][3] == "handle_startendtag": self.sanitised += "/" self.sanitised += ">" @@ -101,7 +82,7 @@ class SafeHTMLParser(HTMLParser): self.add_if_acceptable(tag, attrs) def handle_data(self, data): - self.sanitised += unicode(data, 'utf-8', 'replace') + self.sanitised += data def handle_charref(self, name): self.sanitised += "&#" + name + ";" @@ -110,34 +91,17 @@ class SafeHTMLParser(HTMLParser): self.sanitised += "&" + name + ";" def feed(self, data): - global parserProcess + try: + data = unicode(data, 'utf-8') + except UnicodeDecodeError: + data = unicode(data, 'utf-8', errors='replace') HTMLParser.feed(self, data) tmp = SafeHTMLParser.multi_replace(data) - tmp = unicode(tmp, 'utf-8', 'replace') - - parserLock.acquire() - if parserProcess is None: - parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue)) - parserProcess.start() - parserLock.release() - # flush queue - try: - while True: - tmp = parserOutputQueue.get(False) - except Queue.Empty: - logger.debug("Parser queue flushed") - pass - parserInputQueue.put(tmp) - try: - tmp = parserOutputQueue.get(True, 1) - except Queue.Empty: - logger.error("Regular expression parsing timed out, not displaying links") - parserLock.acquire() - parserProcess.terminate() - parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue)) - parserProcess.start() - parserLock.release() - + tmp = SafeHTMLParser.uriregex1.sub( + r'\1', + tmp) + tmp = SafeHTMLParser.uriregex2.sub(r'\1', tmp) self.raw += tmp def is_html(self, text = None, allow_picture = False): diff --git a/src/queues.py b/src/queues.py index f50a0f1c..c7dab16f 100644 --- a/src/queues.py +++ b/src/queues.py @@ -9,8 +9,3 @@ addressGeneratorQueue = Queue.Queue() objectProcessorQueue = ObjectProcessorQueue() apiAddressGeneratorReturnQueue = Queue.Queue( ) # The address generator thread uses this queue to get information back to the API thread. - -parserProcess = None -parserLock = mpLock() -parserInputQueue = mpQueue() -parserOutputQueue = mpQueue() diff --git a/src/shutdown.py b/src/shutdown.py index 8a219237..bf1a4d12 100644 --- a/src/shutdown.py +++ b/src/shutdown.py @@ -12,16 +12,12 @@ from helper_threading import StoppableThread from knownnodes import saveKnownNodes from inventory import Inventory import protocol -from queues import addressGeneratorQueue, objectProcessorQueue, parserInputQueue, UISignalQueue, workerQueue +from queues import addressGeneratorQueue, objectProcessorQueue, UISignalQueue, workerQueue import shared import state def doCleanShutdown(): state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit. - try: - parserInputQueue.put(None, False) - except Queue.Full: - pass protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data')) objectProcessorQueue.put(('checkShutdownVariable', 'no data')) for thread in threading.enumerate():