From 3464fd6c870095b7f75b894da90a6860f03e2523 Mon Sep 17 00:00:00 2001 From: Nils Dagsson Moskopp Date: Tue, 25 Oct 2011 06:08:42 +0200 Subject: [PATCH] - tear out regular expressions, use tornado's linkify capabilities --- logformat.py | 54 +++++++++++++++------------------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/logformat.py b/logformat.py index 11327b9..d5007e3 100755 --- a/logformat.py +++ b/logformat.py @@ -17,11 +17,13 @@ # MA 02110-1301, USA. from sys import argv -import re + import time import locale import os +from tornado import escape + try: from mod_python import apache except: @@ -33,13 +35,6 @@ def __init__(self, textlog, language, plain=False): Input a text/plain chatlog from zweipktfkt and get out HTML5 goodness. """ - # precompile regular expressions - hosts_re = re.compile(r'(^[0-9]{2}:[0-9]{2} [^<][^ ]*) \(.*@.*\) (has (joined|quit|left))') - chars_re = re.compile(u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]') - uri_patterns = [ r'''((?<=\()\b[A-Za-z][A-Za-z0-9\+\.\-]*:([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))''', r'''((?<=<)\b[A-Za-z][A-Za-z0-9\+\.\-]*:([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=>))''', r'''(?",">") - line = line.replace("'","'") - line = line.replace("\"",""") - # input is mixed utf-8 and latin-1 try: line = unicode(line,'utf-8','strict') except UnicodeDecodeError: line = unicode(line,'latin-1','strict') - if plain: - line = line.encode('utf-8') - else: - line = line.encode('ascii', 'xmlcharrefreplace') - line, count = chars_re.subn('',line) + + line = line.encode('utf-8') # remove erroneous spaces try: @@ -143,27 +124,22 @@ def __init__(self, textlog, language, plain=False): try: int(line[:2]) int(line[3:5]) - line = '' + line[:5] + '' + line[5:] - - if line[32:36] == "<": - line = '#' + line + '' + timestamp = line[:5] + + def linkify(text): + return escape.linkify(text).encode('utf-8') + + if line[6] == "<": + if line[7] == ' ': + line = line[:7] + line[8:] + line = '#' + '' + timestamp + '' + linkify(line[5:]) + '' else: - line = '#' + line + '' + line = '#' + linkify(line) + '' lastlineid = lineid except ValueError: pass - # markup links - uri_replacement = r'''\1''' - - for p in uri_res: - line, nsubs = p.subn(uri_replacement, line) - if nsubs > 0: break # only use first matching pattern - - # markup twitter names - line = twittername_re.sub(r'''\1''',line) - self.log += line + ("\n" if plain else "
\n") if (not plain) and line == textlog.split("\n")[-1:][0]: