# Complete log-analysis code (with a few small modules added)
# Author: Baozi
# -*- coding: utf-8 -*-
# Log analysis project
'''
1. Create a new Python file, test.py.
2. Copy one log entry from the log file for testing; `logline` stores that log string.
'''
import datetime
import random
import re
import threading
import time
from collections import defaultdict
from pathlib import Path
from queue import Queue

from user_agents import parse

# Sample log line, kept for manual testing:
# logline = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
# Earlier variant that split the request into separate groups
# NOTE(review): group names in this commented-out variant are reconstructed - confirm.
# pattern = r'''(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[\w/ +:]+)\] "(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)" (?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"'''

# Regular expression for one access-log line.  The group names datetime,
# request, status, length and useragent must match the keys of `ops` below so
# that each captured string is converted by the right function.
# NOTE(review): `remote` (the client address group) has no converter; the
# name is a reconstruction - confirm against any downstream consumer.
pattern = r'''(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[\w/ +:]+)\] "(?P<request>[^"]+)" (?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"'''

# Per-field converters applied to the raw strings captured by `pattern`.
# Fields without an entry are kept as plain strings.
ops = {
    'datetime': lambda timestr: datetime.datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z'),
    'status': int,
    'length': int,
    # "GET /path HTTP/1.1" -> {'method': ..., 'url': ..., 'protocol': ...}
    'request': lambda request: dict(zip(('method', 'url', 'protocol'), request.split())),
    # Kept as a lambda so `parse` is only looked up when a line is converted.
    'useragent': lambda useragent: parse(useragent),
}
regex = re.compile(pattern)

def extract(line):
    """Parse one log line into a dict of converted fields.

    The line is matched against the module-level ``regex``; every named group
    is passed through its converter from ``ops`` (groups without a converter
    are kept as strings).

    Returns the field dict, or ``None`` when the line does not match, so that
    callers can skip malformed lines with a simple truthiness check.
    """
    matcher = regex.match(line)
    if matcher is None:
        return None
    # groupdict() returns a dict of all named groups captured by the match.
    return {k: ops.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}

def openfile(path: str):
    """Yield the parsed record for every line of the file at *path*.

    Each line is passed to ``extract``; lines for which it returns a falsy
    value are skipped.
    """
    with open(path) as f:
        for line in f:
            d = extract(line)
            if d:
                yield d
            # TODO: handle lines that fail to parse
48
def load(*path: str):
    """Yield parsed records from every given file or directory.

    For each entry in *path*:
      * a path that does not exist is skipped;
      * a directory contributes the regular files directly inside it
        (sub-directories are not descended into);
      * a regular file is read directly.
    Records are produced by ``openfile``.
    """
    for file in path:
        p = Path(file)
        if not p.exists():
            continue
        if p.is_dir():
            for x in p.iterdir():
                if x.is_file():
                    yield from openfile(str(x))
        elif p.is_file():
            yield from openfile(str(p))


# ------------------------- Sliding window implementation -------------------------
def windows(src: Queue, handler, width: int, interval: int):
    """Consume records from *src* and run *handler* over a sliding time window.

    Parameters:
        src: queue of records; each truthy record is a mapping with a
            ``'datetime'`` entry that supports subtraction and comparison.
        handler: callable invoked with the list of buffered records; its
            return value is printed.
        width: window width in seconds.
        interval: minimum number of seconds (measured on the records' own
            timestamps) between two handler invocations.

    The loop never terminates on its own: it blocks on ``src.get()`` forever.
    """
    fmt = '%Y/%m/%d %H:%M:%S %z'
    # Initial values far in the past so the first record triggers a window.
    start = datetime.datetime.strptime('1971/01/01 00:00:00 +0800', fmt)
    current = datetime.datetime.strptime('1971/01/01 00:00:01 +0800', fmt)
    buffer = []  # records waiting to be processed in the current window
    # Overlap between consecutive windows.
    delta = datetime.timedelta(seconds=width - interval)

    while True:
        data = src.get()
        if data:
            buffer.append(data)
            current = data['datetime']

        if (current - start).total_seconds() >= interval:
            ret = handler(buffer)
            print(ret)
            start = current
            # Keep only the records that still belong to the next window.
            buffer = [x for x in buffer if x['datetime'] > current - delta]

# Handler functions
# Status-code analysis
def status_handler(iterable):
    """Return the percentage share of each status code in one window.

    *iterable* is a batch of records from one time window; each record is a
    mapping with a ``'status'`` entry.  The result maps each status value to
    its share of the batch in percent.  An empty batch gives an empty dict.
    """
    status = {}
    for item in iterable:
        key = item['status']
        status[key] = status.get(key, 0) + 1
    total = sum(status.values())
    return {k: v / total * 100 for k, v in status.items()}

# Browser analysis
# Running count per (browser family, version string); shared across calls.
ua_dict = defaultdict(int)


def browser_handler(iterable: list):
    """Count browsers seen in *iterable* and return the running totals.

    Each record is a mapping whose ``'useragent'`` entry exposes
    ``browser.family`` and ``browser.version_string``.  Counts are added to
    the module-level ``ua_dict``, so they accumulate over successive calls;
    that same dict object is returned.
    """
    for item in iterable:
        ua = item['useragent']
        key = (ua.browser.family, ua.browser.version_string)
        ua_dict[key] += 1
    return ua_dict

def handler(iterable):
    """Return the arithmetic mean of the ``'value'`` entries in *iterable*.

    An empty batch returns ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    vals = [x['value'] for x in iterable]
    if not vals:
        return 0.0
    return sum(vals) / len(vals)

def donothing_handler(iterable: list):
    """Print the batch and return it unchanged (useful for debugging)."""
    print(iterable)
    return iterable


# ------------------------- Data dispatcher implementation -------------------------
# Data dispatcher: a simple one-to-many fan-out.  Every record that passes
# through the dispatcher is copied to each of the n registered consumers.
def dispatcher(src):
    """Build a fan-out dispatcher over the record iterable *src*.

    Returns a pair ``(req, run)``:
      * ``req(handler, width, interval)`` registers a consumer: it creates a
        dedicated queue and a thread that runs ``windows`` on that queue.
      * ``run()`` starts all registered threads and then feeds every record
        of *src* into every registered queue.
    """
    queues = []
    threads = []

    def req(handler, width, interval):
        q = Queue()
        queues.append(q)
        t = threading.Thread(target=windows, args=(q, handler, width, interval))
        threads.append(t)

    def run():
        for t in threads:
            t.start()

        # Distribution happens here, after registration, so that every
        # registered queue receives the records.
        for x in src:  # each record goes to every consumer's own queue
            for q in queues:
                q.put(x)

    return req, run

req, run = dispatcher(load('test.log'))
# Register windows with req(handler, width, interval)
# req(donothing_handler, 1, 1)
# req(status_handler, 2, 2)
req(browser_handler, 2, 2)

# Start the consumer threads and begin dispatching
run()