关注微信号:小程在线
关注CSDN博客:程志伟的博客
Python 日志解析、状态码分析、网站流量分析、IP地址分析、地理信息分析
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#!pip install apache-log-parser
import apache_log_parser
%matplotlib inline
#------------------apache log日志--------------------
'''
本案例参考 Nikolay Koldunov(koldunovn@gmail.com)文章完成
我们使用apache-log-parser进行apache log分析。log解析前我们需要了解对应网站的Apache log的配置。 这里我们已经知道待分析网站的log格式为:
format = r'%V %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %T'
对应的各字段代表内容如下:(参考stackoverflow):
%V - 根据 UseCanonicalName 设置的服务器名字
%h - 远程主机(客户端 IP)
%l - identity of the user determined by identd (not usually used since not reliable)
%u - 由 HTTP authentication 决定的 user name
%t - 服务器收到这个请求的时间
%r - 来自客户端的请求行。 ("GET / HTTP/1.0")
%>s - 服务器端返回给客户端的状态码(200, 404 等等。)
%b - 响应给客户端的响应报文大小 (in bytes)
\"%{Referer}i\" - Referer is the page that linked to this URL.
\"%{User-Agent}i\" - the browser identification string
%T - 处理这个请求所耗费的时间(单位:秒)
'''
#网站服务器的格式
fformat = '%V %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %T'
#创建解析器
p = apache_log_parser.make_parser(fformat)
#网站日志,并传给解析器
sample_string = 'koldunov.net 85.26.235.202 - - [16/Mar/2013:00:19:43 +0400] "GET /?p=364 HTTP/1.0" 200 65237 "http://koldunov.net/?p=364" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11" 0'
data = p(sample_string)
data
Out[4]:
{'server_name2': 'koldunov.net',
'remote_host': '85.26.235.202',
'remote_logname': '-',
'remote_user': '-',
'time_received': '[16/Mar/2013:00:19:43 +0400]',
'time_received_datetimeobj': datetime.datetime(2013, 3, 16, 0, 19, 43),
'time_received_isoformat': '2013-03-16T00:19:43',
'time_received_tz_datetimeobj': datetime.datetime(2013, 3, 16, 0, 19, 43, tzinfo='0400'),
'time_received_tz_isoformat': '2013-03-16T00:19:43+04:00',
'time_received_utc_datetimeobj': datetime.datetime(2013, 3, 15, 20, 19, 43, tzinfo='0000'),
'time_received_utc_isoformat': '2013-03-15T20:19:43+00:00',
'request_first_line': 'GET /?p=364 HTTP/1.0',
'request_method': 'GET',
'request_url': '/?p=364',
'request_http_ver': '1.0',
'request_url_scheme': '',
'request_url_netloc': '',
'request_url_path': '/',
'request_url_query': 'p=364',
'request_url_fragment': '',
'request_url_username': None,
'request_url_password': None,
'request_url_hostname': None,
'request_url_port': None,
'request_url_query_dict': {'p': ['364']},
'request_url_query_list': [('p', '364')],
'request_url_query_simple_dict': {'p': '364'},
'status': '200',
'response_bytes_clf': '65237',
'request_header_referer': 'http://koldunov.net/?p=364',
'request_header_user_agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'request_header_user_agent__browser__family': 'Chrome',
'request_header_user_agent__browser__version_string': '23.0.1271',
'request_header_user_agent__os__family': 'Windows',
'request_header_user_agent__os__version_string': 'XP',
'request_header_user_agent__is_mobile': False,
'time_s': ''}
#读取日志文件
log = open('H:/0date/apache_access_log').readlines()
#解析每一行,并创建dict list
log_list = []
for line in log:
try:
data = p(line)
except:
sys.stderr.write("Unable to parse %s" % line)
data['time_received'] = data['time_received'][1:12]+' '+data['time_received'][13:21]+' '+data['time_received'][22: