5、访客分析

5、访客分析
========================================================================================================================================================================================================================================================================
-- 独立访客
--需求:按照时间维度来统计独立访客及其产生的pv量

时间维度:时
drop table dw_user_dstc_ip_h;
create table dw_user_dstc_ip_h(
remote_addr string,
pvs      bigint,
hour     string);

select remote_addr,count(1) as pvs, concat(month,day,hour) as hour
from dw_weblog_detail
where datestr ='20181101'
group by  concat(month,day,hour),remote_addr
order by  hour,remote_addr limit 10;
+------------------+------+---------+--+
|   remote_addr    | pvs  |  hour   |
+------------------+------+---------+--+
| 101.226.68.137   | 16   | 110106  |
| 124.42.13.230    | 44   | 110106  |
| 157.55.35.40     | 4    | 110106  |
| 163.177.71.12    | 16   | 110106  |
| 183.195.232.138  | 16   | 110106  |
| 194.237.142.21   | 2    | 110106  |
| 216.24.201.254   | 2    | 110106  |
| 221.130.41.168   | 2    | 110106  |
| 221.237.156.47   | 4    | 110106  |
| 222.66.59.174    | 4    | 110106  |
+------------------+------+---------+--+

insert into table  dw_user_dstc_ip_h
select remote_addr,count(1) as pvs, concat(month,day,hour) as hour
from dw_weblog_detail
where datestr ='20181101'
group by  concat(month,day,hour),remote_addr
order by  hour,remote_addr; 

select * from  dw_user_dstc_ip_h limit 20;
+--------------------------------+------------------------+-------------------------+--+
| dw_user_dstc_ip_h.remote_addr  | dw_user_dstc_ip_h.pvs  | dw_user_dstc_ip_h.hour  |
+--------------------------------+------------------------+-------------------------+--+
| 71.96.108.116                  | 56                     | 110106                  |
| 124.42.13.230                  | 44                     | 110106                  |
| 58.215.204.118                 | 26                     | 110106                  |
| 183.195.232.138                | 16                     | 110106                  |
| 58.248.178.212                 | 16                     | 110106                  |
| 101.226.68.137                 | 16                     | 110106                  |
| 163.177.71.12                  | 16                     | 110106                  |
| 66.249.66.84                   | 4                      | 110106                  |
| 222.66.59.174                  | 4                      | 110106                  |
| 157.55.35.40                   | 4                      | 110106                  |
| 221.237.156.47                 | 4                      | 110106                  |
| 60.208.6.156                   | 2                      | 110106                  |
| 194.237.142.21                 | 2                      | 110106                  |
| 50.116.27.194                  | 2                      | 110106                  |
| 222.68.172.190                 | 2                      | 110106                  |
| 221.130.41.168                 | 2                      | 110106                  |
| 216.24.201.254                 | 2                      | 110106                  |
| 61.135.216.105                 | 2                      | 110106                  |
| 61.135.216.104                 | 2                      | 110106                  |
| 222.247.54.20                  | 86                     | 110107                  |
+--------------------------------+------------------------+-------------------------+--+

--在上述基础之上,可以继续分析,比如每小时独立访客总数
select count(1) as dstc_ip_cnts ,hour
from dw_user_dstc_ip_h
group by hour
limit 25;
+---------------+---------+--+
| dstc_ip_cnts  |  hour   |
+---------------+---------+--+
| 38            | 110106  |
| 196           | 110107  |
| 258           | 110108  |
| 298           | 110109  |
| 214           | 110110  |
| 108           | 110111  |
| 104           | 110112  |
| 142           | 110113  |
| 124           | 110114  |
| 144           | 110115  |
| 186           | 110116  |
| 110           | 110117  |
| 84            | 110118  |
| 166           | 110119  |
| 70            | 110120  |
| 88            | 110121  |
| 104           | 110122  |
| 76            | 110123  |
| 78            | 110200  |
| 84            | 110201  |
| 108           | 110202  |
| 134           | 110203  |
| 110           | 110204  |
| 96            | 110205  |
| 60            | 110206  |
+---------------+---------+--+

select sum(pvs) as dstc_ip_cnts ,hour
from dw_user_dstc_ip_h
group by hour
limit 25;  (chenyanlong:?)

+---------------+---------+--+
| dstc_ip_cnts  |  hour   |
+---------------+---------+--+
| 444           | 110106  |
| 4040          | 110107  |
| 8208          | 110108  |
| 5496          | 110109  |
| 2272          | 110110  |
| 2284          | 110111  |
| 2484          | 110112  |
| 2124          | 110113  |
| 2056          | 110114  |
| 3036          | 110115  |
| 1900          | 110116  |
| 1528          | 110117  |
| 1048          | 110118  |
| 1560          | 110119  |
| 844           | 110120  |
| 852           | 110121  |
| 1404          | 110122  |
| 1528          | 110123  |
| 1248          | 110200  |
| 1296          | 110201  |
| 2184          | 110202  |
| 2208          | 110203  |
| 2276          | 110204  |
| 2160          | 110205  |
| 600           | 110206  |
+---------------+---------+--+

时间维度:日
select remote_addr,count(1) as counts,concat(month,day) as day
from dw_weblog_detail
Where datestr='20181101'
group by concat(month,day),remote_addr;
-----------------+---------+-------+--+
|   remote_addr    | counts  |  day  |
+------------------+---------+-------+--+
| 218.240.60.248   | 10      | 1101  |
| 218.241.103.162  | 50      | 1101  |
| 218.30.118.102   | 6       | 1101  |
| 218.30.118.99    | 6       | 1101  |
| 218.56.173.244   | 2       | 1101  |
| 218.6.169.95     | 2       | 1101  |
| 218.66.124.44    | 2       | 1101  |
| 218.69.11.90     | 52      | 1101  |
| 218.72.20.38     | 54      | 1101  |
| 218.82.101.100   | 2       | 1101  |
| 218.88.6.121     | 2       | 1101  |
| 219.133.13.252   | 2       | 1101  |
| 219.133.15.170   | 18      | 1101  |
| 219.134.187.135  | 4       | 1101  |
| 219.134.64.81    | 4       | 1101  |

时间维度: 月
select remote_addr,count(1) as counts,month 
from dw_weblog_detail
group by month,remote_addr;
|   remote_addr    | counts  | month  |
+------------------+---------+--------+--+
| 71.133.107.222   | 72      | 11     |
| 71.206.247.97    | 46      | 11     |
| 71.96.108.116    | 56      | 11     |
| 72.46.128.138    | 8       | 11     |
| 72.46.128.139    | 14      | 11     |
| 72.46.128.140    | 6       | 11     |
| 72.46.128.141    | 14      | 11     |
| 75.126.125.204   | 2       | 11     |
| 76.185.96.8      | 56      | 11     |
| 8.35.201.48      | 24      | 11     |
| 8.35.201.49      | 2       | 11     |
| 8.35.201.50      | 14      | 11     |
| 8.35.201.51      | 10      | 11     |
| 8.35.201.52      | 2       | 11     |



-- 每日新访客
-- 需求:将每天的新访客统计出来。

--历日去重访客累积表
drop table dw_user_dsct_history;
create table dw_user_dsct_history(
day string,
ip string
) 
partitioned by(datestr string);

--每日新访客表
drop table dw_user_new_d;
create table dw_user_new_d (
day string,
ip string
) 
partitioned by(datestr string);

--每日新用户插入新访客表
insert into table dw_user_new_d partition(datestr='20181101')
select tmp.day as day,tmp.today_addr as new_ip from
(
select today.day as day,today.remote_addr as today_addr,old.ip as old_addr 
from 
(select distinct remote_addr as remote_addr,"20181101" as day from dw_weblog_detail where datestr="20181101") today
left outer join 
dw_user_dsct_history old
on today.remote_addr=old.ip
) tmp
where tmp.old_addr is null;
--每日新用户追加到累计表
insert into table dw_user_dsct_history partition(datestr='20181101')
select day,ip from dw_user_new_d where datestr='20181101';


验证:
select count(distinct remote_addr) from dw_weblog_detail;
+-------+--+
|  _c0  |
+-------+--+
| 1027  |
+-------+--+
select count(1) from dw_user_dsct_history where datestr='20181101';
+-------+--+
|  _c0  |
+-------+--+
| 1027  |
+-------+--+
select count(1) from dw_user_new_d where datestr='20181101';
+-------+--+
|  _c0  |
+-------+--+
| 1027  |
+-------+--+

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值