python 分析大数据日志_【大数据】大数据实践项目 - nginx 日志分析

最新推荐文章于 2023-07-03 10:13:39 发布

weixin_39575565

最新推荐文章于 2023-07-03 10:13:39 发布

阅读量162

点赞数

文章标签： python 分析大数据日志

Github 项目源码

1.nginx 日志收集

# 查看日志配置，不知道配置路径的话，可以执行 nginx -t

less /etc/nginx/nginx.conf

# 查看日志

cd /var/log/nginx;

# 合并打包日志

cat access.log > nginx.log;

gunzip -c access.log*gz > nginx.log;

gzip nginx.log;

sz nginx.log.gz;

2.hive 建表加载数据

# 上传解压日志

rz;

gunzip nginx.log.gz;

-- 根据日志格式，使用正则序列化解析，一个()是一个字段，注意转义

drop table if exists spider.nginx_log;

create table spider.nginx_log(

remote_addr STRING,

remote_user STRING,

time_local STRING,

request STRING,

status STRING,

body_bytes_sent STRING,

http_referer STRING,

http_user_agent STRING

)

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'

WITH SERDEPROPERTIES (

"input.regex" = '(.*?) - (.*?) \\[(.*?)\\] "(.*?)" (\\d+) (\\d+) "(.*?)" "(.*?)" .*',

"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s"

);

-- 加载数据

load data local inpath '/home/getway/tmp/way/nginx.log' into table spider.nginx_log;

3.数据清洗与分析

-- 数据清洗: 选择有用的字段，并通过 udf 解析 useragent，将数据写到新表 nginx_log_clean

set hive.exec.mode.local.auto=true;

set hive.exec.mode.local.auto.inputbytes.max=52428800;

set hive.exec.mode.local.auto.input.files.max=10;

add file /home/getway/tmp/way/ua_perse.py;

create table spider.nginx_log_clean as

select transform(remote_addr,

time_local,

request,

http_user_agent)

USING 'python3 ua_perse.py' AS (remote_addr,

time_local,

request,

http_user_agent,

device,

os,

browser)

from spider.nginx_log;

-- 查看数据示例

select * from spider.nginx_log_clean limit 10;

set hive.exec.mode.local.auto=true;

set hive.exec.mode.local.auto.inputbytes.max=52428800;

set hive.exec.mode.local.auto.input.files.max=10;

-- ip 统计

select remote_addr, count(1)

from spider.nginx_log_clean

group by remote_addr

order by 2 desc;

-- pv 最高的页面

select request, count(1)

from spider.nginx_log_clean

where request rlike 'comics'

group by request

order by 2 desc

limit 1;

-- 每天的访问数

select case when device='Spider' then 'Spider' else 'Normal' end, substring(time_local, 0, 11), count(1)

from spider.nginx_log_clean

group by case when device='Spider' then 'Spider' else 'Normal' end, substring(time_local, 0, 11)

order by 1, 2;

-- 每小时的访问数

select case when device='Spider' then 'Spider' else 'Normal' end, substring(time_local, 13, 2), count(1)

from spider.nginx_log_clean

group by case when device='Spider' then 'Spider' else 'Normal' end, substring(time_local, 13, 2)

order by 1,2;

-- 查看用户设备前 10 名

select device, count(1)

from spider.nginx_log_clean

where device not in ('Other', 'Spider') -- 过滤掉干扰数据

group by device

order by 2 desc

limit 10;

-- 爬虫访问情况

select browser, count(1)

from spider.nginx_log_clean

where device = 'Spider'

group by browser;

-- 通过 udf 获取 ip 所在省份，分析省份访问数

-- 数据量较少，使用本地模式比较快

set hive.exec.mode.local.auto=true;

set hive.exec.mode.local.auto.inputbytes.max=52428800;

set hive.exec.mode.local.auto.input.files.max=10;

add file /home/getway/baidu_api.py;

with cte as(

select remote_addr, count(1) cn

from spider.nginx_log_clean

group by remote_addr

)

select province, sum(cn)

from(

select transform(remote_addr, cn)

USING 'python3 baidu_api.py' AS (province, cn)

from cte

) as a

group by province;

4.数据可视化

将前面需要可视化的分析结果，直接复制粘贴到 excel，然后通过 pandas 读取，经 ironman 可视化展示。

基于 flask 和 echarts 的数据可视化工具 ironman

最终的效果图

24 小时访问趋势

每日访问情况

客户端设备占比

用户分布

爬虫词云

weixin_39575565

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫