# -*- coding: utf-8 -*-
import re
import os
import json
import sys
import shutil
#2019-09-17T23:59:02+08:00 - info - 10.116.88.13 - - - GW - API-GATEWAY - - - - - 378df35127397fcf74b69c9f84585b3d - 10.200.64.13 - 149318 - 375172 - "POST /o2o-dds-iss/RLS/callRlsAndWantedByBno HTTP/1.1" - ISS-A - 10.117.177.183:8888 - 200 - 266 - 0.007 - 0.007 - 755 - leafnet - @DDS_API_GW
#2018-06-27T23:58:47+08:00 - info - 10.116.88.1 - - - GW - API-GATEWAY - - - - - 1530115125909be448ec230ac4a1f96d541236824080f - 10.200.3.7 - 22016187 - 40367406 - "POST /express/offlineCommonService/incrTaskVersionInfo/v3?t=1 HTTP/1.1" - ENV2_2 - 10.116.177.113:8931 - 200 - 36 - 0.013 - 0.012 - 398 - unite
# Count the number of calls per URL for the given day.
# filtTime: date prefix used to filter log lines
# logPath: directory of logs to analyse; the whole folder is walked
# Returns the aggregated data as JSON. The data set is large, so call this only once.
def getURLCounts(filtTime, logPath):
    pattern = re.compile(r'(POST\s|GET\s)(.+)\sHTTP')
    summary = {}
    urls = {}
    urlByMin = {}
    urlBySec = {}
    for root, dirs, files in os.walk(logPath):
        for file in files:
            ifp = os.path.join(root, file)
            with open(ifp, 'r', errors='ignore') as f:
                for line in f:
                    if line.startswith(filtTime):
                        sec = "".join(line[11:19].split(":"))
                        minute = "".join(line[11:16].split(":"))
                        match = pattern.search(line)
                        if match:
                            url = match.group().split(" ")[1].split("?")[0]
                            # Total count per URL
                            if url in urls:
                                urls[url] += 1
                            else:
                                urls[url] = 1
                            # Aggregate per minute
                            if url not in urlByMin:
                                urlByMin[url] = {}
                            if minute in urlByMin[url]:
                                urlByMin[url][minute] += 1
                            else:
                                urlByMin[url][minute] = 1
                            # Aggregate per second
                            if url not in urlBySec:
                                urlBySec[url] = {}
                            if sec in urlBySec[url]:
                                urlBySec[url][sec] += 1
                            else:
                                urlBySec[url][sec] = 1
    # Sort URLs by total call count, descending
    urls = sorted(urls.items(), key=lambda x: x[1], reverse=True)
    summary["urls"] = urls
    summary["urlByMin"] = urlByMin
    summary["urlBySec"] = urlBySec
    return json.dumps(summary)
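# Illustrative sketch of the JSON returned above (the keys are real, the URL and
# counts are made-up examples based on the sample log line at the top of the file):
#   {
#     "urls":     [["/express/offlineCommonService/incrTaskVersionInfo/v3", 1024], ...],
#     "urlByMin": {"/express/offlineCommonService/incrTaskVersionInfo/v3": {"2358": 17, ...}, ...},
#     "urlBySec": {"/express/offlineCommonService/incrTaskVersionInfo/v3": {"235847": 3, ...}, ...}
#   }
# Time keys are HHMM for urlByMin and HHMMSS for urlBySec, taken from the
# timestamp at the start of each log line.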
# Format per-minute data into a full-day list
def formatDataByMin(dataDict):
    dataList = []
    for t in range(1440):
        hour = str(t // 60).rjust(2, '0')
        minute = str(t % 60).rjust(2, '0')
        timeStr = hour + minute
        d = []
        d.append(timeStr)
        if timeStr in dataDict:
            d.append(dataDict[timeStr])
        else:
            d.append('0')
        dataList.append(d)
    return dataList
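# Example of the per-minute rows produced above (hypothetical counts): the list
# always holds 1440 entries, one per minute of the day, with '0' filled in for
# minutes that never appear in the input dict:
#   [['0000', 5], ['0001', '0'], ..., ['2359', 12]]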
# Format per-second data into a full-day list
def formatDataBySec(dataDict):
    dataList = []
    for t in range(86400):
        hour = str(t // 3600).rjust(2, '0')
        minute = str((t % 3600) // 60).rjust(2, '0')
        sec = str(t % 60).rjust(2, '0')
        timeStr = hour + minute + sec
        d = []
        d.append(timeStr)
        if timeStr in dataDict:
            d.append(dataDict[timeStr])
        else:
            d.append('0')
        dataList.append(d)
    return dataList
# Write the formatted data to a file
def saveDataFile(dataList, filePath):
content = ""
for i in dataList:
content += str(i[0])+","+str(i[1]) + "\n"
with open(filePath, "w+") as f:
f.write(content)
# Generate one statistics file per URL under filePath.
# urlByX: per-URL data aggregated by X, where X is minutes or seconds
# formatDataByX: name of the formatting function to call (per minute or per second)
def genDataByX(urlByX, formatDataByX, filePath):
for url in urlByX:
urlRecord = urlByX[url]
fileName = url.replace("/","_")
dataList = getattr(sys.modules[__name__], formatDataByX)(urlRecord)
saveDataFile(dataList, filePath + fileName+".txt")
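# Hypothetical usage, assuming urlByMin came from getURLCounts and the target
# directory already exists (ready() creates it):
#   genDataByX(urlByMin, "formatDataByMin", "./filterLog/dataByMin/")
# Each URL gets its own file, e.g.
# "_express_offlineCommonService_incrTaskVersionInfo_v3.txt",
# because "/" is replaced with "_" in the file name.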
# Find the peak call count within the given peak time window(s).
# readFileDir: directory of formatted per-URL records to scan
# areatime: list of peak time ranges, e.g. "0000~2359"
# filePath: directory where the results are written
def getMaxCountByTime(readFileDir, areatime, filePath):
    for root, dirs, files in os.walk(readFileDir):
        for f in files:
            for x in areatime:
                start, end = x.split("~")
                _max = 0
                _maxtime = 0
                with open(readFileDir + "/" + f, "r") as ff:
                    for line in ff:
                        _time, _count = line.split(",")
                        if start <= _time <= end:
                            if int(_count) > _max:
                                _max = int(_count)
                                _maxtime = _time
                with open(filePath + f, "a+") as wf:
                    wf.write(str(_maxtime) + "\t" + str(_max) + "\n")
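# Each output file gets one "<time>\t<count>" line per time window. For the
# per-minute data a line such as "2130\t57" would mean the busiest minute in the
# window was 21:30 with 57 calls (the values here are made up).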
# Accumulate the per-minute counts of all URLs into one series.
# filePath: directory of per-minute files to scan
def genURLSumByMin(filePath):
sumRecord = {}
for root, dirs, files in os.walk(filePath):
for file in files:
ifp = root + '/' + file
with open(ifp, "r") as f:
for line in f:
l = line.split(",")
t = l[0]
c = int(l[1])
if t in sumRecord:
sumRecord[t] = int(sumRecord[t]) + c
else:
sumRecord[t] = c
return formatDataByMin(sumRecord)
# Find the window of unit length (one hour) with the highest URL call volume.
# Depends on the data produced by genURLSumByMin.
def getURLMaxAreaInHour(filePath):
    datalist = {}
    with open(filePath, 'r') as f:
        lines = f.readlines()
    # Slide a 60-minute window over the minutes of the day
    for start in range(24 * 60 - 60):
        total = 0
        s = e = None
        for i in range(60):
            t, c = lines[start + i].split(",")
            if s is None:
                s = t
            else:
                e = t
            total += int(c)
        datalist[s + "~" + e] = total
    return sorted(datalist.items(), key=lambda x: x[1], reverse=True)
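# Illustrative result (made-up numbers): a list of (window, total) pairs sorted by
# total in descending order, where each window spans 60 consecutive minutes:
#   [('1000~1059', 48213), ('1001~1100', 48102), ...]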
# File that holds the intermediate result from step 1
resultFile = 'result.txt'
# Prepare the output directory layout
def ready(resultPath):
    # Remove results from a previous run; skip if the path does not exist yet
    if os.path.exists(resultPath):
        shutil.rmtree(resultPath)
    os.makedirs(resultPath)
    childPath = ['dataByMin', 'dataBySec', 'dataMaxBySec', 'dataMaxByMin']
    for p in childPath:
        if not os.path.exists(resultPath + '/' + p):
            os.makedirs(resultPath + '/' + p)
# Step 1: scan the production logs
def step1(filtTime, logPath, resultPath):
    # filtTime is the date to filter on, e.g. "2018-06-27T"
    # logPath is the directory holding the production logs, e.g. "./log"
    result = getURLCounts(filtTime, logPath)
    # Save the raw aggregation
    with open(resultPath + "/" + resultFile, "w+") as f:
        f.write(result)
# Step 2: organise the data; depends on the result of step 1
def step2(resultPath):
    with open(resultPath + "/" + resultFile, "r") as f:
        # Load the aggregated record
        record = json.load(f)
    ############### Format the data by time #####
    # Per minute
    urlByMin = record["urlByMin"]
    genDataByX(urlByMin, "formatDataByMin", resultPath + "/dataByMin/")
    # Per second
    urlBySec = record["urlBySec"]
    genDataByX(urlBySec, "formatDataBySec", resultPath + "/dataBySec/")
    ############### Peak-time statistics; depends on the formatted data #####
    # Peak time window, per second
    areatimeBySec = [
        "000000~235959",
    ]
    getMaxCountByTime(resultPath + "/dataBySec/", areatimeBySec, resultPath + "/dataMaxBySec/")
    # Peak time window, per minute
    areatimeByMin = [
        "0000~2359",
    ]
    getMaxCountByTime(resultPath + "/dataByMin/", areatimeByMin, resultPath + "/dataMaxByMin/")
    # Sum the per-minute calls of all URLs and save them to allurlbymin.txt
    saveDataFile(genURLSumByMin(resultPath + "/dataByMin"), resultPath + "/allurlbymin.txt")
    # Save the daily call count of every individual URL
    urls = record["urls"]
    saveDataFile(urls, resultPath + "/allurlsummary.txt")
# Step 3: generate data in a CSV layout suitable for Excel
def step3(resultPath):
    urlSummary = []
    with open(resultPath + "/allurlsummary.txt", "r") as f:
        for line in f:
            urlRecord = []
            urlName, count = line.split(",")
            urlRecord.append(urlName)
            urlRecord.append(count.strip())
            fileName = urlName.replace("/", "_")
            # Peak time window information, per minute
            with open(resultPath + "/dataMaxByMin/" + fileName + ".txt", "r") as f1:
                for l1 in f1:
                    t1, c1 = l1.split()
                    q1 = format(float(c1) / 60, '.2f')
                    urlRecord.append(t1.strip())
                    urlRecord.append(c1)
                    urlRecord.append(q1)
            # Peak time window information, per second
            with open(resultPath + "/dataMaxBySec/" + fileName + ".txt", "r") as f2:
                for l2 in f2:
                    t2, c2 = l2.split()
                    urlRecord.append(t2.strip())
                    urlRecord.append(c2)
            urlSummary.append(urlRecord)
    content = ""
    for s in urlSummary:
        content += ",".join(str(x) for x in s) + "\n"
    with open(resultPath + "/summary.csv", "w+") as fx:
        fx.write(content)
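# Columns of the resulting summary.csv, one row per URL:
#   url, total calls for the day,
#   peak minute (HHMM), calls in that minute, average calls per second in that minute,
#   peak second (HHMMSS), calls in that second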
# Step 4: get the busiest one-hour windows of the day
def step4(resultPath):
    return getURLMaxAreaInHour(resultPath + "/allurlbymin.txt")[0:10]
if __name__ == '__main__':
    # Path of the logs to scan
    logPath = './'
    #logPath = 'E:\仓管家\双十一压测\生产网关日志/access_20190918235901.log'
    # Path where the results are saved
    resultPath = './filterLog'
    #resultPath = 'E:\仓管家\双十一压测\生产网关日志/filterLog'
    # Date to analyse; note the format
    dateFilter = "2020-06-17T"  #"2019-03-22T"
    # Run the pipeline
    ready(resultPath)
    step1(dateFilter, logPath, resultPath)
    step2(resultPath)
    step3(resultPath)