案例一
-》需求:统计24小时内的每个时段的pv和uv
-》pv统计总的浏览量
-》uv统计guid去重后的总量
-》获取时间字段,日期和小时 -》分区表
-》2015-08-28 18:14:59 -》28和18 substring方式获取
-》数据分析
-》hive :select sql
-》sqoop:导出mysql
日期 小时 pv uv
-》日期和小时:tracktime
-》pv:url
-》uv:guid
1.【数据收集】
登陆hive:
启动服务端:bin/hiveserver2 &
启动客户端:bin/beeline -u jdbc:hive2://node-1:10000/ -n ibeifeng -p 123456
创建源表:
create database track_log;
create table yhd_source(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/2015082818' into table yhd_source;
load data local inpath '/opt/datas/2015082819' into table yhd_source;
2. 【数据清洗】
时间是2015082812,需要截取日期28,时间12
创建清洗表
create table yhd_qingxi(
id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';
insert into table yhd_qingxi select id,url,guid,substring(