日志分析
概述
分析的前提
半结构化数据
文本分析
提取数据(信息提取)
一、空格分隔
with open('xxx.log')as f:
for line in f:
for field in line.split():
print(field)
#注意这里拼接的一些技巧
logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\
=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\
.com/search/spider.html)"'''
fields = []
flag = False
tmp = ''
#注意拼接"GET /020/media.html?menu=3 HTTP/1.1"这种字符串需借助标记变量!
for field in logs.split():
if not flag and (field.startswith('[') or field.startswith('"')):
if field.endswith(']') or field.endswith('"'):#处理首尾均有[]的字符串
fields.append(field.strip('[]"'))
# 处理只有左中括号的字符串,但是该字符串应该与接下类的某一段含有右括号的字符拼接起来[19/Feb/2013:10:23:29
else:#
tmp += field[1:]
flag = True
continue
#处理[19/Feb/2013:10:23:29 +0800]中的+0800]
if flag:
if field.endswith(']') or field.endswith('"'):
tmp += " " + field[:-1]
fields.append(tmp)
tmp = ''
flag = False
else:
tmp +=" " + field
continue
fields.append(field)#直接加入不带有[]""的字符串
类型转换
import datetime
def convert_time(timestr):
return datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
#若上面的函数可简写成匿名函数形式
lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
请求信息的解析
def get_request(request:str):
return dict(zip(['method','url','protocol'],request.split()))
#上面的函数对应为如下匿名函数
lambda request:dict(zip(['method','url','protocol'],request.split()))
映射
1 import datetime
2 logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\
3 =3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\
4 .com/search/spider.html)"'''
5
6 def convert_time(timestr):
7 return datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
8
9 # lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
10
11 def get_request(request:str):
12 return dict(zip(['method','url','protocol'],request.split()))
13
14 # lambda request:dict(zip(['method','url','protocol'],request.split()))
15
16 names = ('remote','','','datetime','request','status','length','','useragent')
17 ops = (None,None,None,convert_time,get_request,int,int,None,None)
18
19 def extract(line):
20 fields = []
21 flag = False
22 tmp = ''
23
24 #"GET /020/media.html?menu=3 HTTP/1.1"
25 for field in logs.split():
26 if not flag and (field.startswith('[') or field.startswith('"')):
27 if field.endswith(']') or field.endswith('"'):#处理首尾均有[]的字符串
28 fields.append(field.strip('[]"'))
29 # 处理只有左中括号的字符串,但是该字符串应该与接下类的某一段含有右括号的字符拼接起来[19/Feb/2013:10:23:29
30 else:#
31 tmp += field[1:]
32 flag = True
33 continue
34 #处理[19/Feb/2013:10:23:29 +0800]中的+0800]
35 if flag:
36 if field.endswith(']') or field.endswith('"'):
37 tmp += " " + field[:-1]
38 fields.append(tmp)
39 tmp = ''
40 flag = False
41 else:
42 tmp +=" " + field
43 continue
44
45 fields.append(field)#直接加入不带有[]""的字符串
46
47 # print(fields)
48 info = {}
49 for i,field in enumerate(fields):
50 name = names[i]
51 op = ops[i]
52 if op:
53 info[name] = (op(field))
54 return info
55
56 print(extract(logs))
二、正则表达式提取
pattern = '''([\d.]{7,}) - - \[([/\w +:]+)\] "(\w+) (\S+) ([\w/\d.]+)" (\d+) (\d+) .+ "(.+)"'''
names = ('remote','datetime','request','method','url','ptorocol','status','length','useragent')
ops = (None,lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),None,None,None,int,int,None)
pattern = '''(?P[\d.]{7,}) - - \[(?P[/\w +:]+)\] \
"(?P\w+) (?P\S+) (?P[\w/\d.]+)"\
(?P\d+) (?P\d+) .+ "(?PM.+)"'''
ops = {
'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
'status':int,
'length':int
}
import datetime
import re
logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
pattern = '''(?P[\d.]{7,}) - - \[(?P[\w/ +:]+)\] "(?P\w+) (?P\S+) (?P[\w/\d.]+)" (?P\d+) (?P\d+) .+ "(?P.+)"'''
ops = {
'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
'status':int,
'length':int
}
regex = re.compile(pattern)
def extract(line):
matcher = regex.match(line)
#matcher.groupdict()函数返回一个包含所有match匹配的命名分组的字典
info = {k:ops.get(k,lambda x:x)(v) for k,v in matcher.groupdict().items()}
return info print(extract(logs))
异常处理
滑动窗口
数据载入
时间窗口分析
概念
当width>interval(数据求值时会有重叠)
当width=interval(数据求值时没有重叠)
当width
时序数据
数据分析基本程序结构
import random
import datetime
def source():
while True:
yield {'datetime':datetime.datetime.now(),'value':random.randint(1,10)}
#获取数据
src = source()
items = [next(src) for _ in range(3)]
# print(items)
#处理函数
def handler(iterable):
vals = [x['value'] for x in iterable]
return sum(vals)/len(vals)
print(handler(items))
#上述代码实模拟了一段时间内产生了数据,等了一段固定的时间取数据计算其平均值。
窗口函数实现
将上面的获取数据的程序扩展为windows函数,使用重叠的方案!
#代码实现:
import random
import datetime
import time
def source():
while True:
yield {'value':random.randint(1,100),'datetime':datetime.datetime.now()}
time.sleep(1)
def windows(src,handler,width:int,interval:int):
"""
:param src:数据源、生成器、用来拿数据
:param handler: 数据处理函数
:param width: 时间窗口宽度,秒
:param interval: 处理时间间隔,秒
:return:None
"""
start = datetime.datetime.strptime('19710101 00:00:00 +0800','%Y/%m/%d %H:%M:%S %z')
current = datetime.datetime.strptime('19710101 00:00:01 +0800','%Y/%m/%d %H:%M:%S %z')
buffer = [] #窗口中待计算的数据
delta = datetime.timedelta(seconds=width-interval)
for data in src:
if data:#存入临时缓存区
buffer.append(x)
current =data['datetime']
if (current - start).total_seconds() >= interval:
ret = handler(buffer)
print("{:.2f}".format(ret))
start = current
#更新buffer,current - delta表示需要重叠的数据
buffer = [x for x in buffer if x['datetime'] > current - delta]
#处理函数
def handler(iterable):
vals = [x['value'] for x in iterable]
return sum(vals) / len(vals)
windows(source(),handler,10,5)
分发
生产者消费模型
queue模块--队列
from queue import Queue
import random
q = Queue()
print(q.put(random.randint(1,100)))
print(q.put(random.randint(1,100)))
print(q.get())
print(q.get())
print(q.get(timeout=2))#阻塞两秒后抛出空值异常
分发器的实现
import threading
#定义线程
#target线程中运行的函数;args这个函数运行时需要的实参
t = threading.Thread(target=windows,args=(src,handler,width,interval))
#启动线程
t.start()
分发器代码实现
# Author: Baozi
#-*- codeing:utf-8 -*-
# Author: Baozi
#-*- codeing:utf-8 -*-
#日志分析项目
'''
1.新建一个python文件test.py
2.从日志文件中复制一条日志信息用于测试。logline存储这个日志字符串
'''
import threading
from queue import Queue
import datetime
import re
import random
import time
# logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
pattern = '''(?P[\d.]{7,}) - - \[(?P[\w/ +:]+)\] "(?P\w+) (?P\S+) (?P[\w/\d.]+)" (?P\d+) (?P\d+) .+ "(?P.+)"'''
ops = {
'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
'status':int,
'length':int
}
regex = re.compile(pattern)
def extract(line):
matcher = regex.match(line)
print(matcher.groupdict())
#matcher.groupdict()函数返回一个包含所有match匹配的命名分组的字典
info = {k:ops.get(k,lambda x:x)(v) for k,v in matcher.groupdict().items()}
return info
def load(path:str):
#单文件装载
with open(path)as f:
for line in f:
d = extract(line)
if d:
yield d
else:
#TODO 不合格的数据
continue
############################滑动窗口实现##################################################def windows(src:Queue,handler,width:int,interval:int):
"""
:param src:数据源、生成器、用来拿数据
:param handler: 数据处理函数
:param width: 时间窗口宽度,秒
:param interval: 处理时间间隔,秒
:return:
"""
start = datetime.datetime.strptime('1971/01/01 00:00:00 +0800','%Y/%m/%d %H:%M:%S %z')
current = datetime.datetime.strptime('1971/01/01 00:00:01 +0800','%Y/%m/%d %H:%M:%S %z')
buffer = [] #窗口中待计算的数据
delta = datetime.timedelta(seconds=width-interval)
while True:
data = src.get()
if data:
buffer.append(data)
current =data['datetime']
if (current - start).total_seconds() >= interval:
ret = handler(buffer)
print(ret)
start = current
#buffer的处理
buffer = [x for x in buffer if x['datetime'] > current - delta]
#处理函数
def handler(iterable):
vals = [x['value'] for x in iterable]
return sum(vals) / len(vals)
def donothing_handler(iterable:list):
print(iterable)
return iterable
######################分发器实现##########################################
#数据分发器:这里做一个简单的一对多副本发送,一个数据通过分发器,发送到n个消费者
def dispatcher(src):
queues = []
threads = []
def req(handler,width,interval):
q = Queue()
queues.append(q)
t = threading.Thread(target=windows,args=(q,handler,width,interval))
threads.append(t)
def run():
for t in threads:
t.start()
for x in src:#一条数据送到n个消费者各自的队列中
for q in queues:
q.put(x)
return req,run
req,run = dispatcher(load('test.log'))
#req注册窗口
req(donothing_handler,1,1)
#启动
run()
完成分析功能
状态码分析
def status_handler(iterable):
#一批时间窗口内的数据
status = {}
for item in iterable:
key = item['status']
if key not in status.keys():
status[key] = 0
status[key] = 1
total = sum(status.values())
return {k:v/total*100 for k,v in status.items()}
日志文件的加载
def openfile(path:str):
with open(path)as f:
for line in f:
d = extract(line)
if d:
yield d
else:
# TODO 不合格的数据
continue
def load(*path:str):
#装载日志文件
for file in path:
p = Path(file)
if not p.exists():
continue
if p.is_dir():
for x in p.iterdir():
if x.if_file():
yield from openfile(str(x))
elif p.is_file():
yield from openfile(str(p))
完整代码如下:
1 #日志分析项目
2 '''
3 1.新建一个python文件test.py
4 2.从日志文件中复制一条日志信息用于测试。logline存储这个日志字符串
5 '''
6 import threading
7 from queue import Queue
8 import datetime
9 import re
10 import random
11 import time
12 from pathlib import Path
13 # logline = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
14 pattern = '''(?P[\d.]{7,}) - - \[(?P[\w/ +:]+)\] "(?P\w+) (?P\S+) (?P[\w/\d.]+)" (?P\d+) (?P\d+) .+ "(?P.+)"'''
15
16 ops = {
17 'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
18 'status':int,
19 'length':int
20 }
21 regex = re.compile(pattern)
22
23 def extract(line):
24 matcher = regex.match(line)
25 print(matcher.groupdict())
26 #matcher.groupdict()函数返回一个包含所有match匹配的命名分组的字典
27 info = {k:ops.get(k,lambda x:x)(v) for k,v in matcher.groupdict().items()}
28 return info
29
30 def openfile(path:str):
31 with open(path)as f:
32 for line in f:
33 d = extract(line)
34 if d:
35 yield d
36 else:
37 # TODO 不合格的数据
38 continue
39
40 def load(*path:str):
41 #文件装载
42 for file in path:
43 p = Path(file)
44 if not p.exists():
45 continue
46 if p.is_dir():
47 for x in p.iterdir():
48 if x.if_file():
49 yield from openfile(str(x))
50 elif p.is_file():
51 yield from openfile(str(p))
52 ##################################滑动窗口实现##################################################
53 def windows(src:Queue,handler,width:int,interval:int):
54 start = datetime.datetime.strptime('1971/01/01 00:00:00 +0800','%Y/%m/%d %H:%M:%S %z')
55 current = datetime.datetime.strptime('1971/01/01 00:00:01 +0800','%Y/%m/%d %H:%M:%S %z')
56 buffer = [] #窗口中待计算的数据
57 delta = datetime.timedelta(seconds=width-interval)
58
59 while True:
60 data = src.get()
61 if data:
62 buffer.append(data)
63 current =data['datetime']
64
65 if (current - start).total_seconds() >= interval:
66 ret = handler(buffer)
67 print(ret)
68 start = current
69 #buffer的处理
70 buffer = [x for x in buffer if x['datetime'] > current - delta]
71
72 #处理函数
73 def status_handler(iterable):
74 #一批时间窗口内的数据
75 status = {}
76 for item in iterable:
77 key = item['status']
78 if key not in status.keys():
79 status[key] = 0
80 status[key] = 1
81 total = sum(status.values())
82 return {k:v/total*100 for k,v in status.items()}
83
84 def handler(iterable):
85 vals = [x['value'] for x in iterable]
86 return sum(vals) / len(vals)
87
88 def donothing_handler(iterable:list):
89 print(iterable)
90 return iterable
91 ##########################数据分发器实现####################################
92 #数据分发器:这里做一个简单的一对多副本发送,一个数据通过分发器,发送到n个消费者
93 def dispatcher(src):
94 queues = []
95 threads = []
96
97 def req(handler,width,interval):
98 q = Queue()
99 queues.append(q)
100
101 t = threading.Thread(target=windows,args=(q,handler,width,interval))
102 threads.append(t)
103
104 def run():
105 for t in threads:
106 t.start()
107
108 for x in src:#一条数据送到n个消费者各自的队列中
109 for q in queues:
110 q.put(x)
111
112 return req,run
113
114 req,run = dispatcher(load('test.log'))
115 #req注册窗口
116 req(donothing_handler,1,1)
117 # req(status_handler,2,2)
118
119 #启动
120 run()
浏览器分析
useragent
信息提取
from user_agents import parse
useragent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
uaobj = parse(useragent)
print(uaobj.browser)
print(uaobj.browser.family,uaobj.browser.version)
#输出如下:
Browser(family='Chrome', version=(67, 0, 3396), version_string='67.0.3396')
Chrome (67, 0, 3396)
1 #日志分析完整代码(新增几个小模块)
2 # Author: Baozi
3 #-*- codeing:utf-8 -*-
4 #日志分析项目
5 '''
6 1.新建一个python文件test.py
7 2.从日志文件中复制一条日志信息用于测试。logline存储这个日志字符串
8 '''
9 import threading
10 from queue import Queue
11 import datetime
12 import re
13 import random
14 import time
15 from pathlib import Path
16 from user_agents import parse
17 from collections import defaultdict
18
19 # logline = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
20 # pattern = '''(?P[\d.]{7,}) - - \[(?P[\w/ +:]+)\] "(?P\w+) (?P\S+) (?P[\w/\d.]+)" (?P\d+) (?P\d+) .+ "(?P.+)"'''
21 pattern = '''(?P[\d.]{7,}) - - \[(?P[\w/ +:]+)\] "(?P[^"]+)" (?P\d+) (?P\d+) .+ "(?P.+)"'''
22
23 ops = {
24 'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
25 'status':int,
26 'length':int,
27 'request':lambda request:dict(zip(('method','url','ptorocol'),request.split())),
28 'useragent':lambda useragent:parse(useragent)
29 }
30 regex = re.compile(pattern)
31
32 def extract(line):
33 matcher = regex.match(line)
34 print(matcher.groupdict())
35 #matcher.groupdict()函数返回一个包含所有match匹配的命名分组的字典
36 info = {k:ops.get(k,lambda x:x)(v) for k,v in matcher.groupdict().items()}
37 return info
38
39 def openfile(path:str):
40 with open(path)as f:
41 for line in f:
42 d = extract(line)
43 if d:
44 yield d
45 else:
46 # TODO 不合格的数据
47 continue
48
49 def load(*path:str):
50 #文件装载
51 for file in path:
52 p = Path(file)
53 if not p.exists():
54 continue
55 if p.is_dir():
56 for x in p.iterdir():
57 if x.if_file():
58 yield from openfile(str(x))
59 elif p.is_file():
60 yield from openfile(str(p))
61 ###################################滑动窗口实现##############################################
62 def windows(src:Queue,handler,width:int,interval:int):
63 start = datetime.datetime.strptime('1971/01/01 00:00:00 +0800','%Y/%m/%d %H:%M:%S %z')
64 current = datetime.datetime.strptime('1971/01/01 00:00:01 +0800','%Y/%m/%d %H:%M:%S %z')
65 buffer = [] #窗口中待计算的数据
66 delta = datetime.timedelta(seconds=width-interval)
67
68 while True:
69 data = src.get()
70 if data:
71 buffer.append(data)
72 current =data['datetime']
73
74 if (current - start).total_seconds() >= interval:
75 ret = handler(buffer)
76 print(ret)
77 start = current
78 #buffer的处理
79 buffer = [x for x in buffer if x['datetime'] > current - delta]
80
81 #处理函数
82 #状态码分析
83 def status_handler(iterable):
84 #一批时间窗口内的数据
85 status = {}
86 for item in iterable:
87 key = item['status']
88 if key not in status.keys():
89 status[key] = 0
90 status[key] = 1
91 total = sum(status.values())
92 return {k:v/total*100 for k,v in status.items()}
93
94 #浏览器分析
95 ua_dict = defaultdict(lambda :0)
96 def browser_handler(iterable:list):
97 for item in iterable:
98 ua = item['useragent']
99 key = (ua.browser.family,ua.browser.version_string)
100 ua_dict[key] =1
101 return ua_dict
102
103 def handler(iterable):
104 vals = [x['value'] for x in iterable]
105 return sum(vals) / len(vals)
106
107 def donothing_handler(iterable:list):
108 print(iterable)
109 return iterable
110 ###########################数据分发器实现#####################################
111 #数据分发器:这里做一个简单的一对多副本发送,一个数据通过分发器,发送到n个消费者
112 def dispatcher(src):
113 queues = []
114 threads = []
115
116 def req(handler,width,interval):
117 q = Queue()
118 queues.append(q)
119 t = threading.Thread(target=windows,args=(q,handler,width,interval))
120 threads.append(t)
121
122 def run():
123 for t in threads:
124 t.start()
125
126 for x in src:#一条数据送到n个消费者各自的队列中
127 for q in queues:
128 q.put(x)
129 return req,run
130
131 req,run = dispatcher(load('test.log'))
132 #req注册窗口
133 # req(donothing_handler,1,1)
134 # req(status_handler,2,2)
135 req(browser_handler,2,2)
136
137 #启动
138 run()