HTML parser fix
- URLs followed by a space were broken
parent b7e75b9bc5
commit 171bc83ec0
@@ -19,15 +19,22 @@ class SafeHTMLParser(HTMLParser):
         'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
         'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
         'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-    replaces = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"], ["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
+    replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
+    replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"], ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
     src_schemes = [ "data" ]
     uriregex1 = re.compile(r'(?i)\b((?:(https?|ftp|bitcoin):(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
     uriregex2 = re.compile(r'<a href="([^"]+)&amp;')
     emailregex = re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b')
 
     @staticmethod
-    def multi_replace(text):
-        for a in SafeHTMLParser.replaces:
+    def replace_pre(text):
+        for a in SafeHTMLParser.replaces_pre:
+            text = text.replace(a[0], a[1])
+        return text
+
+    @staticmethod
+    def replace_post(text):
+        for a in SafeHTMLParser.replaces_post:
             text = text.replace(a[0], a[1])
         if len(text) > 1 and text[0] == " ":
             text = "&nbsp;" + text[1:]
@@ -95,12 +102,13 @@ class SafeHTMLParser(HTMLParser):
         except UnicodeDecodeError:
             data = unicode(data, 'utf-8', errors='replace')
         HTMLParser.feed(self, data)
-        tmp = SafeHTMLParser.multi_replace(data)
+        tmp = SafeHTMLParser.replace_pre(data)
         tmp = SafeHTMLParser.uriregex1.sub(
             r'<a href="\1">\1</a>',
             tmp)
         tmp = SafeHTMLParser.uriregex2.sub(r'<a href="\1&', tmp)
         tmp = SafeHTMLParser.emailregex.sub(r'<a href="mailto:\1">\1</a>', tmp)
+        tmp = SafeHTMLParser.replace_post(tmp)
         self.raw += tmp
 
     def is_html(self, text = None, allow_picture = False):
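The point of splitting multi_replace into replace_pre and replace_post is the ordering relative to the URL regex: previously, the whitespace-to-HTML conversions (for example two spaces becoming "&nbsp; ") ran before uriregex1, so the inserted entity text could be swallowed into the generated link target. The sketch below is a minimal illustration of the new two-stage order, not the project's actual code: the linkify function and the simplified urlregex pattern are stand-ins introduced here, while the replacement tables mirror replaces_pre and replaces_post from the diff above.

# Minimal sketch of the two-stage replacement order introduced by this commit.
# linkify and urlregex are illustrative stand-ins; only the tables mirror the diff.
import re

replaces_pre = [["&", "&amp;"], ["\"", "&quot;"], ["<", "&lt;"], [">", "&gt;"]]
replaces_post = [["\n", "<br/>"], ["\t", "&nbsp;&nbsp;&nbsp;&nbsp;"],
                 ["  ", "&nbsp; "], ["  ", "&nbsp; "], ["<br/> ", "<br/>&nbsp;"]]
urlregex = re.compile(r'(https?://[^\s<>"]+)')  # simplified stand-in for uriregex1

def linkify(text):
    # 1. Escape markup-significant characters only.
    for old, new in replaces_pre:
        text = text.replace(old, new)
    # 2. Turn plain URLs into anchors while whitespace is still real whitespace.
    text = urlregex.sub(r'<a href="\1">\1</a>', text)
    # 3. Only now convert newlines, tabs and double spaces to their HTML forms.
    for old, new in replaces_post:
        text = text.replace(old, new)
    return text

print(linkify("see http://example.com  and reply"))
# -> see <a href="http://example.com">http://example.com</a>&nbsp; and reply
# With the old single-pass multi_replace, the "&nbsp;" inserted for the double
# space was already present when the URL regex ran and could end up inside the
# generated href, which is the breakage described in the commit message.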