SafeHTMLParser unicode / subprocess

- don't run the link regular expressions in a subprocess in SafeHTMLParser;
  this doesn't work in frozen mode, and an attempt to fix it would take too
  much refactoring, and I'm not even sure it would work
- instead, make SafeHTMLParser handle broken unicode correctly
- I think the previous reports of freezes were caused by trying to
  interpret data as unicode, causing a crash
- the parser processes about 1 MB/s on my machine, so doing without the
  timeout is not a big problem
This commit is contained in:
Peter Šurda 2017-02-22 09:05:05 +01:00
parent fd95f8f519
commit 266d8eba1f
Signed by: PeterSurda
GPG Key ID: 0C5F50C0B5F37D87
3 changed files with 12 additions and 57 deletions

View File

@ -1,28 +1,9 @@
from HTMLParser import HTMLParser
import inspect
import multiprocessing
import re
import Queue
from urllib import quote, quote_plus
from urlparse import urlparse
from debug import logger
from queues import parserInputQueue, parserOutputQueue, parserProcess, parserLock
def regexpSubprocess(parserInputQueue, parserOutputQueue):
for data in iter(parserInputQueue.get, None):
if data is None:
break;
try:
result = SafeHTMLParser.uriregex1.sub(
r'<a href="\1">\1</a>',
data)
result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)
result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)
parserOutputQueue.put(result)
except SystemExit:
break;
except:
break;
class SafeHTMLParser(HTMLParser):
# from html5lib.sanitiser
@ -82,7 +63,7 @@ class SafeHTMLParser(HTMLParser):
val == ""
self.sanitised += " " + quote_plus(attr)
if not (val is None):
self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\""
self.sanitised += "=\"" + val + "\""
if inspect.stack()[1][3] == "handle_startendtag":
self.sanitised += "/"
self.sanitised += ">"
@ -101,7 +82,7 @@ class SafeHTMLParser(HTMLParser):
self.add_if_acceptable(tag, attrs)
def handle_data(self, data):
self.sanitised += unicode(data, 'utf-8', 'replace')
self.sanitised += data
def handle_charref(self, name):
self.sanitised += "&#" + name + ";"
@ -110,34 +91,17 @@ class SafeHTMLParser(HTMLParser):
self.sanitised += "&" + name + ";"
def feed(self, data):
global parserProcess
try:
data = unicode(data, 'utf-8')
except UnicodeDecodeError:
data = unicode(data, 'utf-8', errors='replace')
HTMLParser.feed(self, data)
tmp = SafeHTMLParser.multi_replace(data)
tmp = unicode(tmp, 'utf-8', 'replace')
parserLock.acquire()
if parserProcess is None:
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
parserProcess.start()
parserLock.release()
# flush queue
try:
while True:
tmp = parserOutputQueue.get(False)
except Queue.Empty:
logger.debug("Parser queue flushed")
pass
parserInputQueue.put(tmp)
try:
tmp = parserOutputQueue.get(True, 1)
except Queue.Empty:
logger.error("Regular expression parsing timed out, not displaying links")
parserLock.acquire()
parserProcess.terminate()
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
parserProcess.start()
parserLock.release()
tmp = SafeHTMLParser.uriregex1.sub(
r'<a href="\1">\1</a>',
tmp)
tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
self.raw += tmp
def is_html(self, text = None, allow_picture = False):

View File

@ -9,8 +9,3 @@ addressGeneratorQueue = Queue.Queue()
objectProcessorQueue = ObjectProcessorQueue()
apiAddressGeneratorReturnQueue = Queue.Queue(
) # The address generator thread uses this queue to get information back to the API thread.
parserProcess = None
parserLock = mpLock()
parserInputQueue = mpQueue()
parserOutputQueue = mpQueue()

View File

@ -12,16 +12,12 @@ from helper_threading import StoppableThread
from knownnodes import saveKnownNodes
from inventory import Inventory
import protocol
from queues import addressGeneratorQueue, objectProcessorQueue, parserInputQueue, UISignalQueue, workerQueue
from queues import addressGeneratorQueue, objectProcessorQueue, UISignalQueue, workerQueue
import shared
import state
def doCleanShutdown():
state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit.
try:
parserInputQueue.put(None, False)
except Queue.Full:
pass
protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data'))
objectProcessorQueue.put(('checkShutdownVariable', 'no data'))
for thread in threading.enumerate():