日志分析步骤
正则表达式
采用正则表达式将日志部分进行分组匹配
信息提取
根据正则表达式将每条日志提取为字典格式;该步骤作为生产者,不停地产出每条日志信息
滑动窗口
滑动窗口决定每次处理多少数据:窗口的时间宽度是多少?处理周期(间隔)是多少?
数据分发
为避免生产者和消费者之间速率不匹配,采用消息队列:将生产者产生的数据存储在消息队列之中,让消费者通过消息队列消费数据。
文件加载
将要处理的文件进行加载,加载后用正则表达式匹配,然后产生数据,为生产者提供源材料
分析器
消费者从消息队列中获取数据,最后交由分析器对数据进行处理(消费)
各个参数分析
对于各个参数分析时,需要多线程配合,将各个参数分析的分析器绑定在不同的线程之上,便于同步消费数据
核心代码
#-*- coding: UTF-8 -*-
import re
import datetime
from pathlib import Path
from queue import Queue
import threading
from user_agents import parse
from functools import reduce
import time
import sys
# Default access log analysed when the file is run as a script.
p = Path(r'C:\Users\keke\Desktop\python\p10c07\logs\access.log')

# One access-log line split into named groups: remote, datetime, method, url,
# protocol, status, length and useragent.
# Written as adjacent raw literals so the regex escapes (\d, \w, \S, \[) are
# not treated as (invalid) Python string escapes; the resulting pattern text
# is unchanged.
pattern = (
    r'(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[/\w +:]+)\] '
    r'"(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)" '
    r'(?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"'
)
regex = re.compile(pattern)

# Converters applied to captured groups, keyed by group name; groups that are
# not listed here are kept as the matched strings.
ops = {
    'datetime': lambda timestr: datetime.datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S %z'),
    'status': int,
    'length': int,
}
def openfile(path):
    """Parse a log file and yield one dict per line that matches ``regex``.

    Each yielded dict maps the regex's named groups to their captured values,
    with the converters in ``ops`` applied to the groups they are registered
    for. Lines that do not match the pattern are skipped.

    :param path: path of the UTF-8 text file to read
    """
    # Opened read-only: nothing is written back, and 'r+' would refuse
    # log files the current user can read but not modify.
    with open(path, 'r', encoding='UTF8') as f:
        for line in f:
            fields = regex.match(line)
            if not fields:
                continue
            yield {k: ops.get(k, lambda x: x)(v) for k, v in fields.groupdict().items()}
def loadpath(*path):
    """Yield parsed log records from any number of files and/or directories.

    Paths that do not exist are ignored. For a directory, every regular file
    directly inside it is parsed (sub-directories are not descended into);
    for a regular file, that file is parsed.
    """
    for raw in path:
        target = Path(raw)
        if not target.exists():
            continue
        if target.is_dir():
            # Only the directory's immediate regular files are parsed.
            regular_files = (child for child in target.iterdir() if child.is_file())
            for child in regular_files:
                yield from openfile(child)
        elif target.is_file():
            yield from openfile(str(target))
def headler_remoteip(d):
    """Print the 'remote' field (remote host IP) of each record in ``d``.

    Records without a 'remote' key print ``None``.
    """
    addresses = (record.get('remote') for record in d)
    for address in addresses:
        print(address)
# Cumulative hit count per (browser family, version string) over all calls
# to headler_useragent.
allbrowsers = {}
# Cumulative number of records passed to headler_useragent.
numbrow = 0


def headler_useragent(iterable):
    """Tally browser family/version for a batch of records and print totals.

    Each record's 'useragent' string is parsed with ``user_agents.parse`` and
    counted in the module-level ``allbrowsers`` dict. After the batch, two
    lines are printed: the cumulative number of records received
    (``numbrow``) and the sum of all counts in ``allbrowsers``.

    :param iterable: sized collection of record dicts with a 'useragent' key
    """
    global numbrow
    numbrow += len(iterable)
    for record in iterable:
        ua = parse(record['useragent'])
        key = (ua.browser.family, ua.browser.version_string)
        allbrowsers[key] = allbrowsers.get(key, 0) + 1
    print(numbrow)
    # sum() returns 0 for an empty tally, where reduce() without an initial
    # value would raise TypeError.
    print(sum(allbrowsers.values()))
def window(src:Queue, headler, windth, interval, start=None):
    """Consume records from a queue and hand sliding time windows to a handler.

    Records are buffered as they arrive. Whenever a record's 'datetime' is
    more than ``interval`` seconds after the previous trigger time, the
    handler is called with the buffered records, the trigger time moves to
    that record's 'datetime', and only records newer than
    ``windth - interval`` seconds are kept as overlap for the next window.

    A ``None`` item marks the end of the stream: records that arrived since
    the last handler call are passed to the handler once more and the
    function returns. Other falsy items are ignored.

    :param src: queue of record dicts, each with an aware 'datetime' value
    :param headler: callable invoked with the list of buffered records
    :param windth: window width in seconds (how much data each call covers)
    :param interval: seconds of log time between handler calls
    :param start: initial trigger time; defaults to the timestamp this
        function has always used, 06/Apr/2017:18:00:25 +0800
    """
    if start is None:
        # Historical hard-coded start, kept as the default so existing
        # callers see the same first trigger.
        start = datetime.datetime.strptime('06/Apr/2017:18:00:25 +0800', '%d/%b/%Y:%H:%M:%S %z')
    buffer = []
    unhandled = False  # True while the buffer holds records no handler call has seen
    while True:
        data = src.get()
        if data is None:
            # End of stream: do not drop the final partial window.
            if unhandled:
                headler(buffer)
            return
        if not data:
            continue
        current = data.get('datetime')
        buffer.append(data)
        unhandled = True
        if (current - start).total_seconds() > interval:
            headler(buffer)
            unhandled = False
            start = current
            # Keep only the overlap between this window and the next one.
            buffer = [x for x in buffer if (current - x.get('datetime')).total_seconds() < windth - interval]


def dispatcher(src):
    """Fan records from one producer out to several windowed consumers.

    :param src: iterable producing record dicts (records must not be None,
        which is reserved as the end-of-stream marker)
    :return: ``(reg, run)`` - ``reg`` registers a consumer, ``run`` starts
        all consumers and feeds them
    """
    handlers = []
    queues = []

    def reg(headler, windth, interval):
        """Register a consumer with its own queue and worker thread.

        :param headler: consumer function, called with a list of records
        :param windth: window width in seconds
        :param interval: seconds of log time between handler calls
        """
        q = Queue()
        queues.append(q)
        t = threading.Thread(target=window, args=(q, headler, windth, interval))
        handlers.append(t)

    def run():
        """Start every consumer thread, feed them all records, then wait.

        Every record from ``src`` is put on every registered queue. When the
        source is exhausted (or raises), a ``None`` end marker is queued for
        each consumer and the threads are joined, so ``run`` returns only
        after all consumers have finished.
        """
        for t in handlers:
            t.start()
        try:
            for item in src:
                for q in queues:
                    q.put(item)
        finally:
            # Always release the consumers, even if the producer failed,
            # so no worker thread is left blocked on an empty queue.
            for q in queues:
                q.put(None)
            for t in handlers:
                t.join()

    return reg, run
if __name__ == "__main__":
    # Wire the pipeline: records parsed from the default log file are
    # dispatched to every registered analyzer.
    register, start_all = dispatcher(loadpath(str(p)))
    # Optional extra analyzer, currently disabled:
    # register(headler_remoteip, 10, 8)
    register(headler_useragent, 60, 60)
    start_all()