使用正则表达式,对QQ群聊天记录进行解析,用于分析日期、成员等维度发言情况。
原始文本是
2014-03-28 15:04:25 №┽◎Eagle(369029696)
解析之后
yyyy = 2014
mm = 03
dd = 28
hh = 15
mi = 04
ss = 25
nick = №┽◎Eagle
qq = 369029696
代码如下
# -*- coding: utf-8 -*-
"""
Parse an exported QQ group chat log and count messages per member.

Original author: zhangbo2012
http://www.cnblogs.com/zhangbo2012/
"""
import re
import sys

# Header line that precedes each message, for example:
#   2014-03-28 15:04:25 №┽◎Eagle(369029696)
# Captured groups, in order: yyyy, mm, dd, hh, mi, ss, nick, qq.
# The nick group is greedy, so a nickname that itself contains parentheses
# still leaves the final "(...)" on the line as the qq group.
# The match is anchored to end-of-line (re.MULTILINE + "$") rather than to a
# literal "\n", so a header on the last line of a file that lacks a trailing
# newline is still counted.
_HEADER_RE = re.compile(
    r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) (.*)\((.*?)\)[ \t\r]*$',
    re.MULTILINE,
)


def _count_messages_by_user(filecontent):
    """Return ``{qq: {"qq", "nick", "worldcnt"}}`` for every header in the text."""
    resolving_result = {}
    for _yyyy, _mm, _dd, _hh, _mi, _ss, nick, qq in _HEADER_RE.findall(filecontent):
        entry = resolving_result.get(qq)
        if entry is None:
            resolving_result[qq] = {"qq": qq, "nick": nick, "worldcnt": 1}
        else:
            # A member may rename themselves; keep the most recent nickname.
            entry["nick"] = nick
            entry["worldcnt"] += 1
    return resolving_result


def resolving_by_user(filepath, encoding="utf-8-sig"):
    """Count chat messages per QQ number in a log file and print a summary.

    Every line of the form ``YYYY-MM-DD HH:MM:SS nick(qq)`` is treated as the
    header of one message. One output line is printed per QQ number: the
    ``repr`` of the number right-justified to 15 columns, followed by the
    message count right-justified to 10 columns.

    Args:
        filepath: Path of the chat log text file.
        encoding: Text encoding used to read the file. The default
            ``"utf-8-sig"`` reads UTF-8 with or without a leading BOM.
            Undecodable bytes are replaced instead of raising.

    Returns:
        A dict mapping each QQ number (str) to
        ``{"qq": <qq>, "nick": <latest nickname>, "worldcnt": <message count>}``.
        The key name ``"worldcnt"`` is kept exactly as the original script
        spelled it, in case other code reads it.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    with open(filepath, 'r', encoding=encoding, errors='replace') as rf:
        filecontent = rf.read()

    resolving_result = _count_messages_by_user(filecontent)

    for value in resolving_result.values():
        print(repr(value['qq']).rjust(15) + repr(value['worldcnt']).rjust(10))
    return resolving_result


if __name__ == '__main__':
    # An optional first command-line argument overrides the default log file.
    resolving_by_user(sys.argv[1] if len(sys.argv) > 1 else "2.txt")