import pandas as pd
import numpy as np
from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line

# Show full cell contents when displaying DataFrames (long user-agent strings
# would otherwise be truncated). `None` means "no limit"; the former `-1`
# sentinel has been deprecated since pandas 1.0 and is rejected by newer
# releases.
pd.set_option('display.max_colwidth', None)
# Read every file in the log directory and merge them into one DataFrame.
import os

data_dir = "./datas/crazyant/blog_access_log"

# Each file is parsed as space-separated text without a header row, so the
# columns get integer labels. Lines whose field count does not match are
# skipped with a warning: `on_bad_lines="warn"` is the replacement for the
# removed `error_bad_lines=False` (with its default `warn_bad_lines=True`).
# File names are sorted so the row order of the merged frame is deterministic
# (os.listdir returns entries in arbitrary order).
df_list = [
    pd.read_csv(
        f"{data_dir}/{fname}",
        sep=" ",
        header=None,
        on_bad_lines="warn",
    )
    for fname in sorted(os.listdir(data_dir))
]

# NOTE: pd.concat raises ValueError if the directory contained no files.
df = pd.concat(df_list)
b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
# Preview the first five rows of the merged log data.
df.head(n=5)
0
1
2
3
4
5
6
7
8
9
0
106.11.153.226
-
-
[02/Dec/2019:22:40:18
+0800]
GET /740.html?replytocom=1194 HTTP/1.0
200
13446
-
YisouSpider
1
42.156.254.60
-
-
[02/Dec/2019:22:40:23
+0800]
POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0
201
55
http://www.crazyant.net/740.html?replytocom=1194
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2
106.11.159.254
-
-
[02/Dec/2019:22:40:27
+0800]
GET /576.html HTTP/1.0
200
13461
-
YisouSpider
3
106.11.157.254
-
-
[02/Dec/2019:22:40:28
+0800]
GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.0
200
10485
-
YisouSpider
4
42.156.137.109
-
-
[02/Dec/2019:22:40:30
+0800]
POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0
201
55
http://www.crazyant.net/576.html
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Keep only the columns used in the analysis. In the sample rows these hold
# the client IP (0), the request timestamp (3), the HTTP status code (6) and
# the user agent (9). Copy so later edits do not touch the original frame.
wanted_columns = [0, 3, 6, 9]
df = df.loc[:, wanted_columns].copy()

# Preview the first five rows of the trimmed frame.
df.head(n=5)
0
3
6
9
0
106.11.153.226
[02/Dec/2019:22:40:18
200
YisouSpider
1
42.156.254.60
[02/Dec/2019:22:40:23
201
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2
106.11.159.254
[02/Dec/2019:22:40:27
200
YisouSpider
3
106.11.157.254
[02/Dec/2019:22:40:28
200
YisouSpider
4
42.156.137.109
[02/Dec/2019:22:40:30
201
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Bar chart built from `df_spider`, which is defined outside this section.
# NOTE(review): presumably a Series of access counts indexed by an
# is-spider flag -- confirm against the cell that builds it.
spider_labels = [str(label) for label in df_spider.index]
spider_counts = df_spider.values.tolist()

bar = (
    Bar()
    .add_xaxis(spider_labels)
    .add_yaxis("是否Spider", spider_counts)
    .set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
)

# Render the chart inline in the notebook.
bar.render_notebook()
# Pandas处理分析网站原始访问日志

目标:真实项目的实战,探索Pandas的数据处理与分析

实例:
数据来源:我自己的wordpress博客 http://www.crazyant.net/ 的访问日志

实现步骤:
1、读取数据、清理、格式化
2、统计爬虫spider的访问比例,输出柱状图
3、统计http状态码的访问占比,输出饼图
4、统计按小时、按天的PV/UV流量趋势,输出折线图

## 1、读取数据并清理格式化

import pandas as pd
import numpy as np
pd.set_o