SafeHTMLParser unicode / subprocess
- Don't run the regular-expression pass in a subprocess in SafeHTMLParser: it doesn't work in frozen mode, an attempt to fix it would take too much refactoring, and I'm not even sure it would work.
- Instead, make SafeHTMLParser handle broken Unicode correctly.
- I think the previously reported freezes were caused by trying to interpret the data as Unicode, which caused a crash.
- It processes about 1 MB/s on my machine, so not having a timeout is not a big problem.
This commit is contained in:
parent
fd95f8f519
commit
266d8eba1f
|
@ -1,28 +1,9 @@
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
import inspect
|
import inspect
|
||||||
import multiprocessing
|
|
||||||
import re
|
import re
|
||||||
import Queue
|
|
||||||
from urllib import quote, quote_plus
|
from urllib import quote, quote_plus
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from debug import logger
|
from debug import logger
|
||||||
from queues import parserInputQueue, parserOutputQueue, parserProcess, parserLock
|
|
||||||
|
|
||||||
def regexpSubprocess(parserInputQueue, parserOutputQueue):
|
|
||||||
for data in iter(parserInputQueue.get, None):
|
|
||||||
if data is None:
|
|
||||||
break;
|
|
||||||
try:
|
|
||||||
result = SafeHTMLParser.uriregex1.sub(
|
|
||||||
r'<a href="\1">\1</a>',
|
|
||||||
data)
|
|
||||||
result = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', result)
|
|
||||||
result = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', result)
|
|
||||||
parserOutputQueue.put(result)
|
|
||||||
except SystemExit:
|
|
||||||
break;
|
|
||||||
except:
|
|
||||||
break;
|
|
||||||
|
|
||||||
class SafeHTMLParser(HTMLParser):
|
class SafeHTMLParser(HTMLParser):
|
||||||
# from html5lib.sanitiser
|
# from html5lib.sanitiser
|
||||||
|
@ -82,7 +63,7 @@ class SafeHTMLParser(HTMLParser):
|
||||||
val == ""
|
val == ""
|
||||||
self.sanitised += " " + quote_plus(attr)
|
self.sanitised += " " + quote_plus(attr)
|
||||||
if not (val is None):
|
if not (val is None):
|
||||||
self.sanitised += "=\"" + (val if isinstance(val, unicode) else unicode(val, 'utf-8', 'replace')) + "\""
|
self.sanitised += "=\"" + val + "\""
|
||||||
if inspect.stack()[1][3] == "handle_startendtag":
|
if inspect.stack()[1][3] == "handle_startendtag":
|
||||||
self.sanitised += "/"
|
self.sanitised += "/"
|
||||||
self.sanitised += ">"
|
self.sanitised += ">"
|
||||||
|
@ -101,7 +82,7 @@ class SafeHTMLParser(HTMLParser):
|
||||||
self.add_if_acceptable(tag, attrs)
|
self.add_if_acceptable(tag, attrs)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
self.sanitised += unicode(data, 'utf-8', 'replace')
|
self.sanitised += data
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
self.sanitised += "&#" + name + ";"
|
self.sanitised += "&#" + name + ";"
|
||||||
|
@ -110,34 +91,17 @@ class SafeHTMLParser(HTMLParser):
|
||||||
self.sanitised += "&" + name + ";"
|
self.sanitised += "&" + name + ";"
|
||||||
|
|
||||||
def feed(self, data):
|
def feed(self, data):
|
||||||
global parserProcess
|
try:
|
||||||
|
data = unicode(data, 'utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
data = unicode(data, 'utf-8', errors='replace')
|
||||||
HTMLParser.feed(self, data)
|
HTMLParser.feed(self, data)
|
||||||
tmp = SafeHTMLParser.multi_replace(data)
|
tmp = SafeHTMLParser.multi_replace(data)
|
||||||
tmp = unicode(tmp, 'utf-8', 'replace')
|
tmp = SafeHTMLParser.uriregex1.sub(
|
||||||
|
r'<a href="\1">\1</a>',
|
||||||
parserLock.acquire()
|
tmp)
|
||||||
if parserProcess is None:
|
tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
|
||||||
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
|
tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
|
||||||
parserProcess.start()
|
|
||||||
parserLock.release()
|
|
||||||
# flush queue
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
tmp = parserOutputQueue.get(False)
|
|
||||||
except Queue.Empty:
|
|
||||||
logger.debug("Parser queue flushed")
|
|
||||||
pass
|
|
||||||
parserInputQueue.put(tmp)
|
|
||||||
try:
|
|
||||||
tmp = parserOutputQueue.get(True, 1)
|
|
||||||
except Queue.Empty:
|
|
||||||
logger.error("Regular expression parsing timed out, not displaying links")
|
|
||||||
parserLock.acquire()
|
|
||||||
parserProcess.terminate()
|
|
||||||
parserProcess = multiprocessing.Process(target=regexpSubprocess, name="RegExParser", args=(parserInputQueue, parserOutputQueue))
|
|
||||||
parserProcess.start()
|
|
||||||
parserLock.release()
|
|
||||||
|
|
||||||
self.raw += tmp
|
self.raw += tmp
|
||||||
|
|
||||||
def is_html(self, text = None, allow_picture = False):
|
def is_html(self, text = None, allow_picture = False):
|
||||||
|
|
|
@ -9,8 +9,3 @@ addressGeneratorQueue = Queue.Queue()
|
||||||
objectProcessorQueue = ObjectProcessorQueue()
|
objectProcessorQueue = ObjectProcessorQueue()
|
||||||
apiAddressGeneratorReturnQueue = Queue.Queue(
|
apiAddressGeneratorReturnQueue = Queue.Queue(
|
||||||
) # The address generator thread uses this queue to get information back to the API thread.
|
) # The address generator thread uses this queue to get information back to the API thread.
|
||||||
|
|
||||||
parserProcess = None
|
|
||||||
parserLock = mpLock()
|
|
||||||
parserInputQueue = mpQueue()
|
|
||||||
parserOutputQueue = mpQueue()
|
|
||||||
|
|
|
@ -12,16 +12,12 @@ from helper_threading import StoppableThread
|
||||||
from knownnodes import saveKnownNodes
|
from knownnodes import saveKnownNodes
|
||||||
from inventory import Inventory
|
from inventory import Inventory
|
||||||
import protocol
|
import protocol
|
||||||
from queues import addressGeneratorQueue, objectProcessorQueue, parserInputQueue, UISignalQueue, workerQueue
|
from queues import addressGeneratorQueue, objectProcessorQueue, UISignalQueue, workerQueue
|
||||||
import shared
|
import shared
|
||||||
import state
|
import state
|
||||||
|
|
||||||
def doCleanShutdown():
|
def doCleanShutdown():
|
||||||
state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit.
|
state.shutdown = 1 #Used to tell proof of work worker threads and the objectProcessorThread to exit.
|
||||||
try:
|
|
||||||
parserInputQueue.put(None, False)
|
|
||||||
except Queue.Full:
|
|
||||||
pass
|
|
||||||
protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data'))
|
protocol.broadcastToSendDataQueues((0, 'shutdown', 'no data'))
|
||||||
objectProcessorQueue.put(('checkShutdownVariable', 'no data'))
|
objectProcessorQueue.put(('checkShutdownVariable', 'no data'))
|
||||||
for thread in threading.enumerate():
|
for thread in threading.enumerate():
|
||||||
|
|
Reference in New Issue
Block a user