-- Create the working database and switch to it.
-- (Fixed: the original comment marker was an en-dash "–", not "--", which is a syntax error.)
create database if not exists db_yhd;
use db_yhd;
-- Raw click-stream tracking log, partitioned by day (date_str) and hour (hour_str)
-- of the source file. All fields are kept as STRING because the source is an
-- unparsed tab-delimited text log.
CREATE TABLE db_yhd.track_log (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,   -- unique visitor id, used for UV
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (date_str STRING, hour_str STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n';
-- Load each hourly raw log file into its (day, hour) partition.
LOAD DATA LOCAL INPATH '/opt/datas/2015082818' INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="18");
LOAD DATA LOCAL INPATH '/opt/datas/2015082819' INTO TABLE db_yhd.track_log PARTITION (date_str="20150828", hour_str="19");
-- Verify which partitions now exist.
SHOW PARTITIONS db_yhd.track_log;
-- Sanity checks: daily row count, then a 100-row sample.
SELECT COUNT(1) AS count FROM db_yhd.track_log WHERE date_str="20150828";
SELECT date_str, url FROM db_yhd.track_log WHERE date_str="20150828" LIMIT 100;
/**
 * Per-hour PV and UV for each day.
 * PV (page views): number of page-view records; url must be non-empty.
 * UV (unique visitors): number of distinct visitors, i.e. distinct guid.
 */
-- Per-hour PV for 2015-08-28: count rows whose url is non-empty.
-- (Fixed: added the missing statement terminator ";" — the statement
-- previously ran straight into the pasted result table below.)
select
    date_str, hour_str, count(url) as pv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) > 0
group by
    date_str, hour_str;
-- +-----------+-----------+--------+--+
-- | date_str  | hour_str  |   pv   |
-- +-----------+-----------+--------+--+
-- | 20150828  | 18        | 35880  |
-- | 20150828  | 19        | 33317  |
-- +-----------+-----------+--------+--+
-- Whole-day PV for 2015-08-28.
-- (Fixed: added the missing statement terminator ";".)
select
    date_str, count(url) as pv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) > 0
group by
    date_str;
-- +-----------+--------+--+
-- | date_str  |   pv   |
-- +-----------+--------+--+
-- | 20150828  | 69197  |
-- +-----------+--------+--+
-- Many rows have an empty url — "dirty" data: 56937 rows on 2015-08-28.
-- (Fixed: added the missing ";"; normalized the non-standard "==" to "=".)
select
    count(url) as c
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) = 0;
-- Per-hour UV: guid is the unique visitor id, so count distinct guid.
-- NOTE(review): the url filter here keeps only EMPTY urls (= 0), the opposite
-- of the PV queries (> 0). Confirm whether UV should be filtered by url at all.
-- (Fixed: added the missing ";"; normalized "==" to "=".)
select
    date_str, hour_str, count(distinct guid) as uv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) = 0
group by
    date_str, hour_str;
-- +-----------+-----------+--------+--+
-- | date_str  | hour_str  |   uv   |
-- +-----------+-----------+--------+--+
-- | 20150828  | 18        | 23938  |
-- | 20150828  | 19        | 22330  |
-- +-----------+-----------+--------+--+
-- Whole-day UV for 2015-08-28 (no url filter).
-- (Fixed: added the missing statement terminator ";".)
select
    date_str, count(distinct guid) as uv
from
    db_yhd.track_log
where
    date_str="20150828"
group by
    date_str;
-- +-----------+--------+--+
-- | date_str  |   uv   |
-- +-----------+--------+--+
-- | 20150828  | 39007  |
-- +-----------+--------+--+
/**
 * Whole-day UV.
 * The daily UV is smaller than the sum of the hourly UVs.
 * This is expected: a visitor active across an hour boundary (e.g. from 6:59
 * past 7:00) is counted once in each hour, but only once for the whole day.
 */
-- Combined per-hour result:
-- +-----------+-----------+--------+--------+
-- | date_str  | hour_str  |   pv   |   uv   |
-- +-----------+-----------+--------+--------+
-- | 20150828  | 18        | 35880  | 23938  |
-- | 20150828  | 19        | 33317  | 22330  |
-- +-----------+-----------+--------+--------+
-- Per-hour PV and UV in one query.
-- Fixed: the url filter must be length > 0 (non-empty). With the original
-- "== 0" the pv column would count only empty-url rows and could not produce
-- the 35880/33317 figures in the pasted result above. Also added the missing ";".
select
    date_str, hour_str,
    count(url) as pv,
    count(distinct guid) as uv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) > 0
group by
    date_str, hour_str;
-- Persist the per-hour PV/UV result as a comma-delimited table (CTAS),
-- so it can be exported to MySQL with Sqoop.
-- Fixed: url filter changed from "== 0" to "> 0" so pv counts real page
-- views (non-empty urls); added the missing ";".
drop table if exists db_yhd.tb_pvuv_result;
create table db_yhd.tb_pvuv_result
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
AS
SELECT
    date_str, hour_str,
    count(url) as pv,
    count(distinct guid) as uv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) > 0
group by
    date_str, hour_str;
-- Export the Hive result set to MySQL.
-- Target MySQL table (run this on the MySQL side, database "test").
-- (Fixed: added the missing statement terminator ";".)
create table tb_result_pv(
    id int not null AUTO_INCREMENT,
    date_str varchar(255) not null,
    hour_str varchar(255) not null,
    pv int not null,
    uv int not null,
    PRIMARY KEY(id)
);
--使用sqoop导出结果
bin/sqoop export \
--connect jdbc:mysql://bigdata-hpsk01.huadian.com/test \
--username root \
--password 123456 \
--table tb_result_pv \
--columns date_str,hour_str,pv,uv \
--export-dir /user/hive/warehouse/db_yhd.db/tb_pvuv_result \
--input-fields-terminated-by ','
-- Per-province PV and UV for each day (typo fixed: "PU" -> "PV").
-- Built by copying the hourly query and swapping hour_str for provinceId.
-- (Fixed: added the missing statement terminator ";".)
SELECT
    date_str, provinceId,
    count(url) as pv,
    count(distinct guid) as uv
from
    db_yhd.track_log
where
    date_str="20150828" and length(trim(url)) > 0 and length(trim(provinceId)) > 0
group by
    date_str, provinceId;