# ----------- Log analysis -----------
# Code:
import random
import datetime
import time
import threading
import re
import sys
from queue import Queue
from pathlib import Path
from user_agents import parse
# Parse one combined-format access-log line into named capture groups.
# Raw string avoids invalid-escape warnings for \d, \s, \[ etc.
PATTERN = r'''(?P<remote>[\d\.]{7,})\s-\s-\s\[(?P<datetime>[^\[\]]+)\]\s"(?P<method>.*)\s(?P<url>.*)\s(?P<protocol>.*)"\s(?P<status>\d{3})\s(?P<size>\d+)\s"[^"]+"\s"(?P<useragent>[^"]+)"'''
regex = re.compile(PATTERN)  # compile once; reused for every line

# Per-field converters applied to the raw matched strings; fields with
# no entry here are kept as plain strings.
ops = {
    # e.g. "06/Apr/2017:18:09:25 +0800" -> timezone-aware datetime
    'datetime': lambda datestr: datetime.datetime.strptime(datestr, '%d/%b/%Y:%H:%M:%S %z'),
    'useragent': lambda useragent: parse(useragent),  # BUGFIX: missing comma was a syntax error
    'status': int,
    'size': int,
}
# Turn one raw log line into a dict of typed fields.
def extract(line: str) -> dict:
    """Match *line* against the access-log pattern.

    Returns a dict mapping each capture-group name to its value, run
    through the matching converter in ``ops`` (identity when no
    converter is registered).  Returns None when the line does not
    match the pattern.
    """
    matched = regex.match(line)
    if matched is None:
        return
    fields = {}
    for name, raw in matched.groupdict().items():
        convert = ops.get(name, lambda value: value)
        fields[name] = convert(raw)
    return fields
# Read a log file lazily, yielding one parsed record per matching line.
def openfile(files: str):
    """Open *files* and yield the field dict for every line that
    matches the log pattern; lines the pattern rejects are skipped.

    :param files: path to a log file
    """
    with open(files) as file:
        for line in file:
            fields = extract(line)
            # extract() returns None for non-matching lines; the
            # original `else: continue` branch was redundant.
            if fields:
                yield fields
# Load log records from one or more files and/or directories.
def load(*args):
    """Yield parsed records from every path in *args*.

    Each argument may name a file or a directory (scanned one level
    deep, non-recursively); paths that do not exist are skipped.
    """
    for item in args:
        path = Path(item)
        if not path.exists():
            continue
        if path.is_dir():
            for entry in path.iterdir():
                if entry.is_file():
                    # openfile() expects a plain string path
                    yield from openfile(str(entry))
        elif path.is_file():
            yield from openfile(str(path))
# Temporary test data generator (kept for reference):
# def source():
#     while True:
#         yield {
#             'datetime': datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=8))),
#             'value': random.randint(1, 100)
#         }
#         time.sleep(1)
# Sliding time window over a stream of timestamped records.
def window(src, handler, width: int, interval: int):
    """Consume records from queue *src* and periodically apply
    *handler* over a sliding window.  Runs forever (thread target).

    :param src: source queue; each item is a dict with a 'datetime' key
    :param handler: function applied to the buffered records
    :param width: window width, in seconds
    :param interval: evaluation period, in seconds
    """
    # Seed times far in the past so the first interval fires promptly.
    start = datetime.datetime.strptime('20170101 000000 +0800', '%Y%m%d %H%M%S %z')
    current = datetime.datetime.strptime('20170101 010000 +0800', '%Y%m%d %H%M%S %z')
    buffer = []
    # Records older than width-interval before `current` fall outside
    # the next window and can be discarded.
    delta = datetime.timedelta(seconds=width - interval)
    while True:
        data = src.get()  # blocks until a record arrives
        if data:
            buffer.append(data)  # buffer records until the next evaluation
            current = data['datetime']
        if (current - start).total_seconds() >= interval:
            ret = handler(buffer)
            print('{}'.format(ret))
            start = current
            # BUGFIX: the original filtered on `x['datetime'] > current`,
            # which emptied the buffer every evaluation and never used
            # `delta`; keep records still inside the next window instead.
            buffer = [x for x in buffer if x['datetime'] > current - delta]
# Consumer: average of the 'value' field over the window.
def handler(iterable):
    """Return the mean of item['value'] over *iterable*, or 0 when the
    window is empty (avoids ZeroDivisionError if an interval elapses
    before any record arrives).
    """
    if not iterable:
        return 0
    return sum(item['value'] for item in iterable) / len(iterable)
# Consumer: proportion of each HTTP status code within the window.
def status_handler(iterable):
    """Return {status: fraction_of_window} for the records in *iterable*.

    An empty window yields an empty dict (no division occurs).
    """
    counts = {}
    for item in iterable:
        key = item['status']
        counts[key] = counts.get(key, 0) + 1
    total = len(iterable)
    # BUGFIX: the original iterated .items() but ignored the value and
    # re-indexed the dict; use the counted value directly.
    return {key: count / total for key, count in counts.items()}
# Consumer: occurrences of each (family, version) browser pair.
def browser_handler(iterable):
    """Return {(family, version_string): count} for the records in
    *iterable*; each record's 'useragent' is a parsed user-agent object.
    """
    tally = {}
    for record in iterable:
        agent = record['useragent']
        pair = (agent.family, agent.browser.version_string)
        if pair in tally:
            tally[pair] += 1
        else:
            tally[pair] = 1
    return tally
# Fan out one data source to any number of windowed consumers.
def dispatcher(src):
    """Return a ``(reg, run)`` pair for the iterable *src*.

    ``reg(handler, width, interval)`` registers a consumer: it gets its
    own queue and a window() worker thread.  ``run()`` starts every
    registered thread, then broadcasts each record from *src* into
    every queue.
    """
    threads = []  # one window() thread per registered handler
    queues = []   # one queue per handler (one-to-many fan-out)
    def reg(handler, width: int, interval: int):
        queueobj = Queue()
        queues.append(queueobj)
        # BUGFIX: window() loops forever on Queue.get(); daemon threads
        # let the process exit once src is exhausted instead of hanging.
        thread = threading.Thread(
            target=window,
            args=(queueobj, handler, width, interval),
            daemon=True,
        )
        threads.append(thread)
    def run():
        for thread in threads:
            thread.start()  # start each worker
        for item in src:
            for queueobj in queues:
                queueobj.put(item)  # broadcast the record to every consumer
    return reg, run
if __name__ == '__main__':
    # To take the path from the command line instead:
    #     reg, run = dispatcher(load(sys.argv[1]))
    reg, run = dispatcher(load('test.log'))
    reg(status_handler, 10, 5)  # register the status-code consumer
    run()                       # start workers and feed the source