# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime
# URI endings / beginnings that mark a request as "not a page view".
# Kept as tuples so str.endswith / str.startswith can test them in one call.
_ASSET_SUFFIXES = (
    '.gif', '.mp4', '.png', '.js', '.css', '.jpg', '.jpeg', '.rar', '.txt',
    '.ico', '.svg', '.gz', '.7z', 'eval($_POST[c]))', '.jsp',
)
_ASSET_PREFIXES = ('/wap', '/plus')


def _isStaticAsset(uri):
    """Return True when *uri* is a string matching an excluded suffix/prefix."""
    # Non-string values (e.g. NaN for a missing field) are never excluded.
    return isinstance(uri, str) and (
        uri.endswith(_ASSET_SUFFIXES) or uri.startswith(_ASSET_PREFIXES)
    )


def _readLogFile(path):
    """Read one space-separated log file, skipping malformed lines."""
    try:
        return pd.read_table(path, sep=' ', header='infer', on_bad_lines='skip')
    except TypeError:
        # Older pandas (before 1.3) does not know ``on_bad_lines``; fall back
        # to the legacy keyword that newer releases have removed.
        return pd.read_table(path, sep=' ', header='infer', error_bad_lines=False)


def getData(log_dir=r'D:\python\python3\sort\logdata'):
    """Load every log file in *log_dir* and keep only non-asset requests.

    Each regular file in the directory is parsed as a space-separated table
    whose first row is the header. Rows whose ``cs-uri-stem`` matches one of
    the excluded suffixes/prefixes are dropped.

    Args:
        log_dir: Directory containing the log files. Defaults to the
            location the script was originally written for.

    Returns:
        A DataFrame with the remaining rows of all files stacked vertically
        (original per-file row labels are kept). It carries an ``ifhtml``
        column which, despite its name, flags excluded rows and is therefore
        False on every returned row. An empty DataFrame is returned when the
        directory holds no files.
    """
    frames = []
    # Sorted so the row order does not depend on the OS listing order.
    for name in sorted(os.listdir(log_dir)):
        path = os.path.join(log_dir, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be parsed as log files.
            continue
        frame = _readLogFile(path)
        # Explicit bool dtype keeps the mask valid even for a header-only file.
        is_asset = np.array(
            [_isStaticAsset(uri) for uri in frame['cs-uri-stem']], dtype=bool
        )
        frame['ifhtml'] = is_asset
        frames.append(frame[~is_asset])
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, axis=0)
def countByDate(data, date):
    """Plot the number of requests per hour for one day.

    Args:
        data: DataFrame with string ``date`` and ``time`` columns.
        date: Value compared against the ``date`` column to select the day.

    Returns:
        None. A summary and the first rows of the selected day are printed,
        and a horizontal bar chart of hourly request counts is drawn.
    """
    # Work on an independent copy: adding a column to a filtered slice of
    # ``data`` triggers SettingWithCopyWarning and may not take effect.
    day_rows = data[data['date'] == date].copy()
    day_rows['datetime'] = day_rows['date'] + " " + day_rows['time']
    day_rows.index = pd.to_datetime(day_rows.datetime)
    # info() writes its report itself and returns None, so wrapping it in
    # print() would only add a stray "None" line.
    day_rows.info()
    print(day_rows.head())
    # Hourly buckets; lower-case 'h' is the hour alias (upper-case 'H' is
    # deprecated in recent pandas releases).
    day_rows.resample('1h', closed='left')['datetime'].count().plot(kind='barh')
def countByIp(data, date):
    """Print and plot the ten client IPs with the most requests on one day.

    Args:
        data: DataFrame with ``date`` and ``c-ip`` columns.
        date: Value compared against the ``date`` column to select the day.
    """
    on_day = data['date'] == date
    hits_per_ip = data[on_day].groupby('c-ip')['c-ip'].count()
    # Ascending sort, so the last ten entries are the busiest addresses.
    busiest = hits_per_ip.sort_values().iloc[-10:]
    print(busiest)
    busiest.plot(kind='barh')
# NOTE(review): this appears to sit at module level, so the logs would be
# loaded whenever the module is imported or run -- confirm that is intended.
data = getData()
# Visual separator in the console output.
print('0-'*50)
# Leftover exploratory snippets (preview of the data, requests-per-day chart):
#print(data.head())
#data.groupby(['date'])['date'].count().plot(kind='barh')
def getTopPerson(data):
    """Print and plot the twenty busiest (date, client IP) combinations.

    Args:
        data: DataFrame with ``date`` and ``c-ip`` columns.
    """
    hits = data.groupby(['date', 'c-ip'])['c-ip'].count()
    # Ascending sort, so the tail holds the highest request counts.
    heaviest = hits.sort_values().iloc[-20:]
    print(heaviest)
    heaviest.plot(kind='barh')
def countStatus(data):
    """Print the per-day status-code distribution and plot the valid codes.

    Args:
        data: DataFrame with ``date`` and ``sc-status`` columns.
    """
    # Distribution of every status code, grouped by day.
    per_day_status = data.groupby('date')['sc-status'].value_counts()
    print(per_day_status)
    # Keep only the rows whose status checkStatus accepts, then chart how
    # often each of those codes occurs.
    accepted = data['sc-status'].apply(checkStatus)
    valid_rows = data[accepted]
    code_counts = valid_rows.groupby(['sc-status'])['sc-status'].count()
    code_counts.plot(kind='barh')
def checkStatus(x):
    """Return True when *x* equals one of the accepted HTTP status codes."""
    accepted_codes = (200, 301, 206, 302, 304)
    return x in accepted_codes
def unStatus(data):
    """Draw a bar chart of 404 / 500 / 401 responses per day.

    Args:
        data: DataFrame with ``date`` and ``sc-status`` columns.
    """
    failure_codes = [404, 500, 401]
    is_failure = data['sc-status'].apply(lambda code: code in failure_codes)
    failures = data[is_failure]
    # One row per day, one column per status code.
    per_day = failures.groupby('date')['sc-status'].value_counts()
    table = per_day.to_frame().unstack()
    table.plot.bar()
def useStatus(data):
    """Plot valid requests per day and print their daily average.

    Args:
        data: DataFrame with ``date`` and ``sc-status`` columns.
    """
    accepted = data['sc-status'].apply(checkStatus)
    valid_rows = data[accepted]
    # Number of valid requests for each day.
    daily_hits = valid_rows.groupby('date')['date'].count()
    daily_hits.sort_index().plot(kind='barh')
    print('日均访问量:')
    print(daily_hits.mean())
def urlcate(data):
    """Return the distinct request URIs found in *data*.

    The result used to be built and then discarded, which made the function
    a no-op; it is now returned so callers can inspect it.

    Args:
        data: DataFrame with a ``cs-uri-stem`` column.

    Returns:
        A single-column DataFrame holding each distinct ``cs-uri-stem``
        value once, in order of first appearance.
    """
    # unique() keeps first-seen order, unlike set() whose order is arbitrary.
    distinct_uris = data['cs-uri-stem'].unique()
    return pd.DataFrame(distinct_uris)
def countUrl(data):
    """Plot the twenty most requested URIs as a horizontal bar chart.

    Args:
        data: DataFrame with a ``cs-uri-stem`` column.
    """
    hits_per_url = data.groupby(['cs-uri-stem'])["cs-uri-stem"].count()
    # Ascending sort puts the busiest URI last. The slice must run to the
    # end: the previous [-20:-1] silently dropped that top entry.
    hits_per_url.sort_values().iloc[-20:].plot(kind='barh')
def countMethd(data):
    """Plot the most frequent non-GET request methods per day.

    Only rows whose status checkStatus accepts are considered.

    Args:
        data: DataFrame with ``date``, ``sc-status`` and ``cs-method``
            columns.
    """
    valid_rows = data[data['sc-status'].apply(checkStatus)]
    non_get = valid_rows[valid_rows['cs-method'] != 'GET']
    per_day_method = non_get.groupby(['date', 'cs-method'])['cs-method'].count()
    # Ascending sort puts the largest group last. The slice must run to the
    # end: the previous [-10:-1] silently dropped that top entry.
    per_day_method.sort_values().iloc[-10:].plot(kind='barh')
if __name__ == '__main__':
    # Script entry point: load the logs and chart the non-GET request methods.
    # NOTE(review): getData() also appears to be called at module level
    # earlier in the file, so a script run may load the logs twice -- confirm.
    countMethd(getData())