26. Pandas处理分析网站原始访问日志

Pandas处理分析网站原始访问日志

目标:真实项目的实战,探索Pandas的数据处理与分析

实例:
数据来源:我自己的 WordPress 博客(http://www.crazyant.net/)的访问日志

实现步骤:
1、读取数据、清理、格式化
2、统计爬虫spider的访问比例,输出柱状图
3、统计 HTTP 状态码的访问占比,输出饼图
4、统计按小时、按天的PV/UV流量趋势,输出折线图

1、读取数据并清理格式化

import pandas as pd
import numpy as np

# Show full cell contents (e.g. long User-Agent strings) instead of truncating them.
# None means "no limit" (pandas >= 1.0); the legacy value -1 is deprecated and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line
# Read every file in the log directory and merge them into a single DataFrame.
import os

data_dir = "./datas/crazyant/blog_access_log"

# The files are space-separated with no header row, so columns get integer labels.
# Lines with an unexpected field count are reported and skipped:
# on_bad_lines="warn" is the replacement (pandas >= 1.3) for error_bad_lines=False,
# which was removed in pandas 2.0.
# sorted() keeps the concatenation order reproducible; os.listdir order is arbitrary.
df_list = [
    pd.read_csv(os.path.join(data_dir, fname), sep=" ", header=None, on_bad_lines="warn")
    for fname in sorted(os.listdir(data_dir))
]

df = pd.concat(df_list)
b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
# Preview the first rows of the raw frame; columns still carry integer labels.
df.head()
0123456789
0106.11.153.226--[02/Dec/2019:22:40:18+0800]GET /740.html?replytocom=1194 HTTP/1.020013446-YisouSpider
142.156.254.60--[02/Dec/2019:22:40:23+0800]POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.020155http://www.crazyant.net/740.html?replytocom=1194Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2106.11.159.254--[02/Dec/2019:22:40:27+0800]GET /576.html HTTP/1.020013461-YisouSpider
3106.11.157.254--[02/Dec/2019:22:40:28+0800]GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.020010485-YisouSpider
442.156.137.109--[02/Dec/2019:22:40:30+0800]POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.020155http://www.crazyant.net/576.htmlMozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Keep only the fields used in the analysis: 0 = client IP, 3 = timestamp,
# 6 = HTTP status, 9 = User-Agent. copy() detaches the result from the full frame.
df = df.loc[:, [0, 3, 6, 9]].copy()
df.head()
   0               3                      6    9
0  106.11.153.226  [02/Dec/2019:22:40:18  200  YisouSpider
1  42.156.254.60   [02/Dec/2019:22:40:23  201  Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2  106.11.159.254  [02/Dec/2019:22:40:27  200  YisouSpider
3  106.11.157.254  [02/Dec/2019:22:40:28  200  YisouSpider
4  42.156.137.109  [02/Dec/2019:22:40:30  201  Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Give the four retained columns readable names.
df.columns = [
    "ip",      # client address
    "stime",   # request timestamp, still a string with a leading "["
    "status",  # HTTP status code
    "client",  # User-Agent string
]
df.head()
ipstimestatusclient
0106.11.153.226[02/Dec/2019:22:40:18200YisouSpider
142.156.254.60[02/Dec/2019:22:40:23201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2106.11.159.254[02/Dec/2019:22:40:27200YisouSpider
3106.11.157.254[02/Dec/2019:22:40:28200YisouSpider
442.156.137.109[02/Dec/2019:22:40:30201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Inspect the inferred dtypes; stime is still a plain string (object) at this point.
df.dtypes
ip        object
stime     object
status    int64 
client    object
dtype: object

2、统计spider的比例

# Flag requests whose User-Agent contains "spider", ignoring case.
# regex=False matches the word literally; na=False maps a missing User-Agent to
# False instead of NaN so the column stays strictly boolean.
df["is_spider"] = df["client"].str.lower().str.contains("spider", regex=False, na=False)
df.head()
ipstimestatusclientis_spider
0106.11.153.226[02/Dec/2019:22:40:18200YisouSpiderTrue
142.156.254.60[02/Dec/2019:22:40:23201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
2106.11.159.254[02/Dec/2019:22:40:27200YisouSpiderTrue
3106.11.157.254[02/Dec/2019:22:40:28200YisouSpiderTrue
442.156.137.109[02/Dec/2019:22:40:30201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
# Number of requests per is_spider value (False / True).
df_spider = df["is_spider"].value_counts()
df_spider
False    46641
True     3637 
Name: is_spider, dtype: int64
# Bar chart of request counts, one bar per is_spider value.
# Index labels are stringified for the category axis; tolist() turns the counts
# into plain Python ints.
bar = Bar()
bar.add_xaxis(list(map(str, df_spider.index)))
bar.add_yaxis("是否Spider", df_spider.values.tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
bar.render_notebook()
<div id="c9bd2bcd2d004232b92db8db30eccba0" style="width:900px; height:500px;"></div>

3、访问状态码的数量对比

# Count requests per HTTP status code; the result is a Series indexed by status.
df_status = df.groupby("status").size()
df_status
status
200    41924
201    3432 
206    70   
301    2364 
302    23   
304    19   
400    20   
403    92   
404    1474 
405    12   
444    846  
500    1    
504    1    
dtype: int64
# Preview the (status, count) pairs — the data shape fed to the pie chart.
list(zip(df_status.index, df_status))
[(200, 41924),
 (201, 3432),
 (206, 70),
 (301, 2364),
 (302, 23),
 (304, 19),
 (400, 20),
 (403, 92),
 (404, 1474),
 (405, 12),
 (444, 846),
 (500, 1),
 (504, 1)]
# Pie chart with one slice per status code.
# Series.items() yields the same (status, count) pairs as zip(index, values);
# the label formatter renders each slice as "<name>: <value>".
pie = Pie()
pie.add("状态码比例", list(df_status.items()))
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()
<div id="4412254edcf447b68e65faab50374114" style="width:900px; height:500px;"></div>

4、实现按小时、按天粒度的流量统计

# Re-check the frame before parsing: stime is still the raw "[dd/Mon/yyyy:HH:MM:SS" text.
df.head()
ipstimestatusclientis_spider
0106.11.153.226[02/Dec/2019:22:40:18200YisouSpiderTrue
142.156.254.60[02/Dec/2019:22:40:23201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
2106.11.159.254[02/Dec/2019:22:40:27200YisouSpiderTrue
3106.11.157.254[02/Dec/2019:22:40:28200YisouSpiderTrue
442.156.137.109[02/Dec/2019:22:40:30201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
# Parse the log timestamps: drop the leading "[" and read text such as
# "02/Dec/2019:22:40:18". The "+0800]" offset sat in a column that was dropped
# earlier, so the resulting datetimes are timezone-naive.
df["stime"] = pd.to_datetime(
    df["stime"].str.slice(start=1),
    format="%d/%b/%Y:%H:%M:%S",
)
df.head()
ipstimestatusclientis_spider
0106.11.153.2262019-12-02 22:40:18200YisouSpiderTrue
142.156.254.602019-12-02 22:40:23201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
2106.11.159.2542019-12-02 22:40:27200YisouSpiderTrue
3106.11.157.2542019-12-02 22:40:28200YisouSpiderTrue
442.156.137.1092019-12-02 22:40:30201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
# Use the timestamp as the index and order rows chronologically, so the frame
# can be resampled by time.
df = df.set_index("stime").sort_index()
df.head()
ipstatusclientis_spider
stime
2019-12-02 22:40:18106.11.153.226200YisouSpiderTrue
2019-12-02 22:40:2342.156.254.60201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
2019-12-02 22:40:27106.11.159.254200YisouSpiderTrue
2019-12-02 22:40:28106.11.157.254200YisouSpiderTrue
2019-12-02 22:40:3042.156.137.109201Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36True
# Confirm the index is now a DatetimeIndex named "stime".
df.index
DatetimeIndex(['2019-12-02 22:40:18', '2019-12-02 22:40:23',
               '2019-12-02 22:40:27', '2019-12-02 22:40:28',
               '2019-12-02 22:40:30', '2019-12-02 22:40:46',
               '2019-12-02 22:41:52', '2019-12-02 22:41:52',
               '2019-12-02 22:41:55', '2019-12-02 22:42:16',
               ...
               '2019-12-07 21:30:16', '2019-12-07 21:30:17',
               '2019-12-07 21:30:19', '2019-12-07 21:30:20',
               '2019-12-07 21:30:21', '2019-12-07 21:30:22',
               '2019-12-07 21:30:23', '2019-12-07 21:30:56',
               '2019-12-07 21:30:58', '2019-12-07 21:31:02'],
              dtype='datetime64[ns]', name='stime', length=50278, freq=None)
# Traffic per time bucket: pv = number of requests, uv = number of distinct IPs.
# The string aliases select pandas' built-in size / nunique aggregations.

# Per hour ("H" is spelled "h" from pandas 2.2 on):
# df_pvuv = df.resample("H")["ip"].agg(pv="size", uv="nunique")

# Per 6 hours:
# df_pvuv = df.resample("6H")["ip"].agg(pv="size", uv="nunique")

# Per day:
df_pvuv = df.resample("D")["ip"].agg(pv="size", uv="nunique")

df_pvuv.head()
            pv     uv
stime
2019-12-02  288    70
2019-12-03  10285  1180
2019-12-04  13618  1197
2019-12-05  10485  1152
2019-12-06  9469   1261
# Line chart comparing the PV and UV series over the resampled time buckets.
line = Line()
line.add_xaxis(df_pvuv.index.tolist())
line.add_yaxis("PV", df_pvuv["pv"].tolist())
line.add_yaxis("UV", df_pvuv["uv"].tolist())
line.set_global_opts(
    title_opts=opts.TitleOpts(title="PVUV数据对比"),
    # Tooltip is triggered per axis position and uses a crosshair pointer.
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
)
line.render_notebook()
<div id="3375e16f7e3c45e8a7ca57f0310d594a" style="width:900px; height:500px;"></div>


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值