需求: 统计24小时 ,每个时段的 PV 和UV
分析过程:
1.
PV:统计每一条记录 (每个链接就是一个)
UV: 访问人数 (guid 唯一用户的ID) distenct 去重复
2.
字段过滤
id / url / guid / tracktime
3.
预期结果
日期 小时 PV UV
28 18 21212 61312
28 19 23111 41121
4.
select sql
5.
sqoop 导出
==========================================================================
一. 数据采集
创建数据库
create database track_log
创建源表
create table log_source(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as textfile;
加载数据
load data local inpath '/yyc/2015082818' into table log_source;
load data local inpath '/yyc/2015082819' into table log_source;
二.
数据清洗阶段 》》 创建分区表
create table log_qingxi(id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';
-》导入数据
insert into table log_qingxi
select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from log_source;
-》检验查询
select id,date,hour from log_qingxi limit 10;
-》创建分区表
create table log_part1(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
-》导入数据
insert into table log_part1 partition (date='20150828',hour='18')
select id,url,guid from log_qingxi where date='28' and hour='18';
insert into table log_part1 partition (date='20150828',hour='19')
select id,url,guid from log_qingxi where date='28' and hour='19';
select id,date,hour from log_part1 where date='20150828' and hour='18';
如果这里有一个月的数据,要导入30次或者31次
所以下面我们会讲怎么使用动态分区
动态分区
修改hive-site.xml 文件
支持动态分区设置下面参数
1.直接set
set hive.exec.dynamic.partition.mode=nonstrict;
2.或者 配置文件里配置
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>strict</value>
<description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.</description>
</property>
-》实现动态分区
create table log_part2(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
insert into table log_part2 partition (date,hour)
select * from log_qingxi;
或者
insert into table log_part2 partition (date,hour)
select id,url,guid,date,hour from log_qingxi;
数据清洗部分完成
==========================================================
三. 需求实现
PV 实现
0: jdbc:hive2://yun-01:10000> select date,hour,count(url) PV from log_part1 group by date,hour;
+-----------+-------+--------+--+
| date | hour | pv |
+-----------+-------+--------+--+
| 20150828 | 18 | 64972 |
| 20150828 | 19 | 61162 |
+-----------+-------+--------+--+
2 rows selected (6.008 seconds)
UV实现:
select date,hour,count(distinct guid) UV from log_part1 group by date,hour;
+-----------+-------+--------+--+
| date | hour | uv |
+-----------+-------+--------+--+
| 20150828 | 18 | 23938 |
| 20150828 | 19 | 22330 |
+-----------+-------+--------+--+
能不能在一条SQL 中实现
create table if not exists result asselect date,hour,count(url) PV,count(distinct guid) UV from log_part1 group by date,hour;
最终结果
select * from result;
+--------------+--------------+------------+------------+--+
| result.date | result.hour | result.pv | result.uv |
+--------------+--------------+------------+------------+--+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+--------------+--------------+------------+------------+--+
导出到mysql
mysql建表
create table if not exists save(
date varchar(30) not null,
hour varchar(30) not null,
pv varchar(30) not null,
uv varchar(30) not null,
primary key(date,hour)
);
执行导出结果
bin/sqoop export \
--connect jdbc:mysql://yun-01:3306/sqoop_test \
--username root \
--password 123456 \
--table save \
--export-dir /user/beifeng/cdhwarehouse/result \
--num-mappers 1 \
--input-fields-terminated-by '\001'
在MYSQL 下查询验证
mysql> select * from save;
+----------+------+-------+-------+
| date | hour | pv | uv |
+----------+------+-------+-------+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+----------+------+-------+-------+
2 rows in set (0.01 sec)