# -*- coding: utf-8 -*-
import time
import os
import json
def remove_logs(cachedir):
    """Delete a log file, or empty out a log directory tree.

    :param cachedir: path of a single log file OR a directory whose
                     contents should be removed.

    Bug fix: the original only walked directories, so when the caller
    (``runlog`` via ``__main__``) passed a single log file, ``os.walk``
    yielded nothing and the file was never deleted.
    """
    # Single-file case: just remove it (this is what __main__ actually passes).
    if os.path.isfile(cachedir):
        os.remove(cachedir)
        return
    # Directory case: bottom-up walk so each directory is empty before rmdir.
    # Note: the top-level directory itself is intentionally kept (original
    # behavior), only its contents are removed.
    for root, dirs, files in os.walk(cachedir, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            os.rmdir(os.path.join(root, name))
def runlog(filelog, filename=''):
    """Parse a tab-separated nginx access log and aggregate statistics.

    Each valid line has exactly 8 tab-separated fields (see the log_format
    notes at the bottom of this file):
        0 remote_addr, 1 time_local, 2 http_host, 3 status, 4 request,
        5 body_bytes_sent, 6 http_referer, 7 http_user_agent

    :param filelog:  path of the log file to analyse; it is deleted
                     (via remove_logs) when processing is done.
    :param filename: optional path of a JSON file to write the summary to;
                     when empty, nothing is saved.

    Bug fixes versus the original:
      * the final second's accumulated size was never flushed into
        ``sizelevel`` (the last bucket was silently dropped);
      * a bogus ``sizelevel[0] += 1`` was recorded before the very first
        line (the '' sentinel triggered the flush branch with size 0);
      * per-(domain, spider) counting used a linear ``list.index`` scan
        (O(n^2) over distinct keys) — replaced with a dict.
    """
    print('[running]', '初始化数据')
    # Known crawler user-agent keywords; anything not matched counts as
    # 'none', which therefore also includes real user traffic.
    spiderr = ['Googlebot', 'bingbot', 'Baiduspider', '360Spider', 'Sogou',
               'DotBot', 'Yahoo', 'Trident', 'YandexBot', 'SiteExplorer',
               'MJ12bot', 'AhrefsBot', 'ExtLinksBot', 'SEOkicks-Robot',
               'SiteExplorer', 'YisouSpider', 'python', 'SemrushBot']
    counts = {}     # (domain, spider) -> hit count
    timeline = ''   # time_local value of the second currently accumulating
    timesize = 0    # bytes sent during that second
    # sizelevel[m]: number of seconds whose bandwidth was m Mbit (capped at 10).
    sizelevel = [0 for _ in range(11)]
    sizeall = 0     # total bytes over the whole log
    print('[running]', '开始遍历数据', filelog)

    def _flush_second():
        # Record the finished second's bandwidth bucket.
        # bytes * 8 -> bits, / 1024 / 1024 -> Mbit; bucket capped at 10.
        mb = int(timesize * 8 / 1024 / 1024)
        if mb > 10:
            mb = 10
        sizelevel[mb] += 1

    with open(filelog, 'r') as files:
        for line in files:
            r = line.strip().split('\t')
            if len(r) != 8:
                continue
            # --- per-domain spider hit counting ---
            r[2] = r[2].lower()
            ua = r[7].lower()
            for spider in spiderr:
                if spider.lower() in ua:
                    sp = spider.lower()
                    break
            else:
                sp = 'none'
            key = (r[2], sp)
            counts[key] = counts.get(key, 0) + 1
            # --- per-second bandwidth bucketing ---
            size = int(r[5])
            if timeline != r[1]:
                # New second started: flush the previous one — but not the
                # '' sentinel before the first line (original bug).
                if timeline != '':
                    _flush_second()
                timesize = size
                timeline = r[1]
            else:
                timesize += size
            sizeall += size
    # Flush the log's final second (original bug: it was dropped).
    if timeline != '':
        _flush_second()
    print('[running]', '数据抓取完成')
    # Reshape the counter into the list-of-dicts layout consumers expect.
    webr = [{'domain': domain, 'spider': spider, 'value': value}
            for (domain, spider), value in counts.items()]
    data = {
        'web-spider': webr,
        'size-level': sizelevel,
        'size-count': sizeall,
    }
    print(data)
    if filename:
        print('[running]', '执行保存文件', filename)
        with open(filename, 'w', encoding='utf-8') as filejson:
            filejson.write(json.dumps(data))
    print('[running]', '执行删除文件', filelog)
    remove_logs(filelog)
if __name__ == '__main__':
    # Today's date stamp (e.g. '20180712') names both the rotated log file
    # produced by CheckLog.cut.sh and the JSON summary we write.
    today = time.strftime("%Y%m%d", time.localtime(int(time.time())))
    runlog('Access.%s.log' % today, 'log.%s.json' % today)
# 脚本说明:
# python3.6版本
# 数组spiderr 蜘蛛名列表,不在数组内统一为none,none包括真实用户请求
# size-level 指每秒使用带宽数据,单位M [1秒内使用0~1M带宽次数,...,1秒内使用10M~maxM带宽次数]
# size-count 指web日志请求总大小,单位byte(nginx $body_bytes_sent 为字节数) 换算MB size/1024/1024
# param filelog string 日志文件名
# param filename string 保存文件名
# 需配合CheckLog.cut.sh使用
# YESTERDAY=$(date -d "yesterday" +%Y%m%d)
# mv /home/wwwlogs/access.log /home/wwwlogs/Access.${YESTERDAY}.log
# kill -USR1 $(cat /usr/local/nginx/logs/nginx.pid)
# cd /home/wwwlogs/
# zip -q Access.${YESTERDAY}.log.zip Access.${YESTERDAY}.log
# #rm -f /home/wwwlogs/Access.${YESTERDAY}.log
# python3 CheckLog.py
# 设置变量YESTERDAY
# 重命名log文件
# 重启nginx进程,生成新log文件
# 切换到目录
# 打包压缩日志文件
# 删除当天日志文件 已注释,交给python完成
# 启动python执行日志处理脚本
# 需配合nginx.conf设置使用
# http 添加代码
# log_format main '$remote_addr [$time_local] $http_host $status "$request" $body_bytes_sent $http_referer $http_user_agent';
# service 添加代码
# access_log /home/wwwroot/xxxxx/access.log main;
# 特别说明
# 无需再单独设置定时执行CheckLog.py,直接在CheckLog.cut.sh内执行即可
# by 薛一
# 2018-07-12
# python分析web日志文件
# 最新推荐文章于 2020-12-23 13:13:58 发布