SafeHTMLParser unicode / subprocess

- don't use a subprocess in SafeHTMLParser: it doesn't work in frozen mode,
  and an attempt to fix it would require too much refactoring — and even
  then it might not work
- instead, make it handle broken unicode correctly
- I think the previous reports of freezes were caused by attempting to
  interpret the data as unicode, which crashed the parser
- it processes about 1 MB/s on my machine, so the lack of a timeout is not a big problem
This commit is contained in:
Peter Šurda 2017-02-22 09:05:05 +01:00
parent fd95f8f519
commit 266d8eba1f
Signed by untrusted user: PeterSurda
GPG Key ID: 0C5F50C0B5F37D87
3 changed files with 12 additions and 57 deletions

View File

@ -1,28 +1,9 @@
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
import inspect import inspect
import multiprocessing
import re import re
import Queue
from urllib import quote, quote_plus from urllib import quote, quote_plus
from urlparse import urlparse from urlparse import urlparse
from debug import logger from debug import logger
from queues import parserInputQueue, parserOutputQueue, parserProcess, parserLock
def regexpSubprocess(parserInputQueue, parserOutputQueue):
for data in iter(parserInputQueue.get, None):
if data is None:
break;
try:
result = SafeHTMLParser.uriregex1.sub(
r'<a href="\1">\1</a>',
data)
result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)
result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)
parserOutputQueue.put(result)
except SystemExit:
break;
except:
break;
class SafeHTMLParser(HTMLParser): class SafeHTMLParser(HTMLParser):
# from html5lib.sanitiser # from html5lib.sanitiser
@ -82,7 +63,7 @@ class SafeHTMLParser(HTMLParser):
val == "" val == ""
self.sanitised += " " + quote_plus(attr) self.sanitised += " " + quote_plus(attr)
if not (val is None): if not (val is None):
self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\"" self.sanitised += "=\"" + val + "\""
if inspect.stack()[1][3] == "handle_startendtag": if inspect.stack()[1][3] == "handle_startendtag":
self.sanitised += "/" self.sanitised += "/"
self.sanitised += ">" self.sanitised += ">"
@ -101,7 +82,7 @@ class SafeHTMLParser(HTMLParser):
self.add_if_acceptable(tag, attrs) self.add_if_acceptable(tag, attrs)
def handle_data(self, data): def handle_data(self, data):
self.sanitised += unicode(data, 'utf-8', 'replace') self.sanitised += data
def handle_charref(self, name): def handle_charref(self, name):
self.sanitised += "&#" + name + ";" self.sanitised += "&#" + name + ";"
@ -110,34 +91,17 @@ class SafeHTMLParser(HTMLParser):
self.sanitised += "&" + name + ";" self.sanitised += "&" + name + ";"
def feed(self, data): def feed(self, data):
global parserProcess try:
data = unicode(data, 'utf-8')
except UnicodeDecodeError:
data = unicode(data, 'utf-8', errors='replace')
HTMLParser.feed(self, data) HTMLParser.feed(self, data)
tmp = SafeHTMLParser.multi_replace(data) tmp = SafeHTMLParser.multi_replace(data)
tmp = unicode(tmp, 'utf-8', 'replace') tmp = SafeHTMLParser.uriregex1.sub(
r'<a href="\1">\1</a>',
parserLock.acquire() tmp)
if parserProcess is None: tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue)) tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
parserProcess.start()
parserLock.release()
# flush queue
try:
while True:
tmp = parserOutputQueue.get(False)
except Queue.Empty:
logger.debug("Parser queue flushed")
pass
parserInputQueue.put(tmp)
try:
tmp = parserOutputQueue.get(True, 1)
except Queue.Empty:
logger.error("Regular expression parsing timed out, not displaying links")
parserLock.acquire()
parserProcess.terminate()
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
parserProcess.start()
parserLock.release()
self.raw += tmp self.raw += tmp
def is_html(self, text = None, allow_picture = False): def is_html(self, text = None, allow_picture = False):

View File

@ -9,8 +9,3 @@ addressGeneratorQueue = Queue.Queue()
objectProcessorQueue = ObjectProcessorQueue() objectProcessorQueue = ObjectProcessorQueue()
apiAddressGeneratorReturnQueue = Queue.Queue( apiAddressGeneratorReturnQueue = Queue.Queue(
) # The address generator thread uses this queue to get information back to the API thread. ) # The address generator thread uses this queue to get information back to the API thread.
parserProcess = None
parserLock = mpLock()
parserInputQueue = mpQueue()
parserOutputQueue = mpQueue()

View File

@ -12,16 +12,12 @@ from helper_threading import StoppableThread
from knownnodes import saveKnownNodes from knownnodes import saveKnownNodes
from inventory import Inventory from inventory import Inventory
import protocol import protocol
from queues import addressGeneratorQueue, objectProcessorQueue, parserInputQueue, UISignalQueue, workerQueue from queues import addressGeneratorQueue, objectProcessorQueue, UISignalQueue, workerQueue
import shared import shared
import state import state
def doCleanShutdown(): def doCleanShutdown():
state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit. state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit.
try:
parserInputQueue.put(None, False)
except Queue.Full:
pass
protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data')) protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data'))
objectProcessorQueue.put(('checkShutdownVariable', 'no data')) objectProcessorQueue.put(('checkShutdownVariable', 'no data'))
for thread in threading.enumerate(): for thread in threading.enumerate():