5、访客分析
========================================================================================================================================================================================================================================================================
-- 独立访客
--需求:按照时间维度来统计独立访客及其产生的pv量
时间维度:时
drop table dw_user_dstc_ip_h;
create table dw_user_dstc_ip_h(
remote_addr string,
pvs bigint,
hour string);
select remote_addr,count(1) as pvs, concat(month,day,hour) as hour
from dw_weblog_detail
where datestr ='20181101'
group by concat(month,day,hour),remote_addr
order by hour,remote_addr limit 10;
+------------------+------+---------+--+
| remote_addr | pvs | hour |
+------------------+------+---------+--+
| 101.226.68.137 | 16 | 110106 |
| 124.42.13.230 | 44 | 110106 |
| 157.55.35.40 | 4 | 110106 |
| 163.177.71.12 | 16 | 110106 |
| 183.195.232.138 | 16 | 110106 |
| 194.237.142.21 | 2 | 110106 |
| 216.24.201.254 | 2 | 110106 |
| 221.130.41.168 | 2 | 110106 |
| 221.237.156.47 | 4 | 110106 |
| 222.66.59.174 | 4 | 110106 |
+------------------+------+---------+--+
insert into table dw_user_dstc_ip_h
select remote_addr,count(1) as pvs, concat(month,day,hour) as hour
from dw_weblog_detail
where datestr ='20181101'
group by concat(month,day,hour),remote_addr
order by hour,remote_addr;
select * from dw_user_dstc_ip_h limit 20;
+--------------------------------+------------------------+-------------------------+--+
| dw_user_dstc_ip_h.remote_addr | dw_user_dstc_ip_h.pvs | dw_user_dstc_ip_h.hour |
+--------------------------------+------------------------+-------------------------+--+
| 71.96.108.116 | 56 | 110106 |
| 124.42.13.230 | 44 | 110106 |
| 58.215.204.118 | 26 | 110106 |
| 183.195.232.138 | 16 | 110106 |
| 58.248.178.212 | 16 | 110106 |
| 101.226.68.137 | 16 | 110106 |
| 163.177.71.12 | 16 | 110106 |
| 66.249.66.84 | 4 | 110106 |
| 222.66.59.174 | 4 | 110106 |
| 157.55.35.40 | 4 | 110106 |
| 221.237.156.47 | 4 | 110106 |
| 60.208.6.156 | 2 | 110106 |
| 194.237.142.21 | 2 | 110106 |
| 50.116.27.194 | 2 | 110106 |
| 222.68.172.190 | 2 | 110106 |
| 221.130.41.168 | 2 | 110106 |
| 216.24.201.254 | 2 | 110106 |
| 61.135.216.105 | 2 | 110106 |
| 61.135.216.104 | 2 | 110106 |
| 222.247.54.20 | 86 | 110107 |
+--------------------------------+------------------------+-------------------------+--+
--在上述基础之上,可以继续分析,比如每小时独立访客总数
select count(1) as dstc_ip_cnts ,hour
from dw_user_dstc_ip_h
group by hour
limit 25;
+---------------+---------+--+
| dstc_ip_cnts | hour |
+---------------+---------+--+
| 38 | 110106 |
| 196 | 110107 |
| 258 | 110108 |
| 298 | 110109 |
| 214 | 110110 |
| 108 | 110111 |
| 104 | 110112 |
| 142 | 110113 |
| 124 | 110114 |
| 144 | 110115 |
| 186 | 110116 |
| 110 | 110117 |
| 84 | 110118 |
| 166 | 110119 |
| 70 | 110120 |
| 88 | 110121 |
| 104 | 110122 |
| 76 | 110123 |
| 78 | 110200 |
| 84 | 110201 |
| 108 | 110202 |
| 134 | 110203 |
| 110 | 110204 |
| 96 | 110205 |
| 60 | 110206 |
+---------------+---------+--+
select sum(pvs) as dstc_ip_cnts ,hour
from dw_user_dstc_ip_h
group by hour
limit 25; (chenyanlong:?)
+---------------+---------+--+
| dstc_ip_cnts | hour |
+---------------+---------+--+
| 444 | 110106 |
| 4040 | 110107 |
| 8208 | 110108 |
| 5496 | 110109 |
| 2272 | 110110 |
| 2284 | 110111 |
| 2484 | 110112 |
| 2124 | 110113 |
| 2056 | 110114 |
| 3036 | 110115 |
| 1900 | 110116 |
| 1528 | 110117 |
| 1048 | 110118 |
| 1560 | 110119 |
| 844 | 110120 |
| 852 | 110121 |
| 1404 | 110122 |
| 1528 | 110123 |
| 1248 | 110200 |
| 1296 | 110201 |
| 2184 | 110202 |
| 2208 | 110203 |
| 2276 | 110204 |
| 2160 | 110205 |
| 600 | 110206 |
+---------------+---------+--+
时间维度:日
select remote_addr,count(1) as counts,concat(month,day) as day
from dw_weblog_detail
Where datestr='20181101'
group by concat(month,day),remote_addr;
-----------------+---------+-------+--+
| remote_addr | counts | day |
+------------------+---------+-------+--+
| 218.240.60.248 | 10 | 1101 |
| 218.241.103.162 | 50 | 1101 |
| 218.30.118.102 | 6 | 1101 |
| 218.30.118.99 | 6 | 1101 |
| 218.56.173.244 | 2 | 1101 |
| 218.6.169.95 | 2 | 1101 |
| 218.66.124.44 | 2 | 1101 |
| 218.69.11.90 | 52 | 1101 |
| 218.72.20.38 | 54 | 1101 |
| 218.82.101.100 | 2 | 1101 |
| 218.88.6.121 | 2 | 1101 |
| 219.133.13.252 | 2 | 1101 |
| 219.133.15.170 | 18 | 1101 |
| 219.134.187.135 | 4 | 1101 |
| 219.134.64.81 | 4 | 1101 |
时间维度: 月
select remote_addr,count(1) as counts,month
from dw_weblog_detail
group by month,remote_addr;
| remote_addr | counts | month |
+------------------+---------+--------+--+
| 71.133.107.222 | 72 | 11 |
| 71.206.247.97 | 46 | 11 |
| 71.96.108.116 | 56 | 11 |
| 72.46.128.138 | 8 | 11 |
| 72.46.128.139 | 14 | 11 |
| 72.46.128.140 | 6 | 11 |
| 72.46.128.141 | 14 | 11 |
| 75.126.125.204 | 2 | 11 |
| 76.185.96.8 | 56 | 11 |
| 8.35.201.48 | 24 | 11 |
| 8.35.201.49 | 2 | 11 |
| 8.35.201.50 | 14 | 11 |
| 8.35.201.51 | 10 | 11 |
| 8.35.201.52 | 2 | 11 |
-- 每日新访客
-- 需求:将每天的新访客统计出来。
--历日去重访客累积表
drop table dw_user_dsct_history;
create table dw_user_dsct_history(
day string,
ip string
)
partitioned by(datestr string);
--每日新访客表
drop table dw_user_new_d;
create table dw_user_new_d (
day string,
ip string
)
partitioned by(datestr string);
--每日新用户插入新访客表
insert into table dw_user_new_d partition(datestr='20181101')
select tmp.day as day,tmp.today_addr as new_ip from
(
select today.day as day,today.remote_addr as today_addr,old.ip as old_addr
from
(select distinct remote_addr as remote_addr,"20181101" as day from dw_weblog_detail where datestr="20181101") today
left outer join
dw_user_dsct_history old
on today.remote_addr=old.ip
) tmp
where tmp.old_addr is null;
--每日新用户追加到累计表
insert into table dw_user_dsct_history partition(datestr='20181101')
select day,ip from dw_user_new_d where datestr='20181101';
验证:
select count(distinct remote_addr) from dw_weblog_detail;
+-------+--+
| _c0 |
+-------+--+
| 1027 |
+-------+--+
select count(1) from dw_user_dsct_history where datestr='20181101';
+-------+--+
| _c0 |
+-------+--+
| 1027 |
+-------+--+
select count(1) from dw_user_new_d where datestr='20181101';
+-------+--+
| _c0 |
+-------+--+
| 1027 |
+-------+--+