一、将日志未清洗的数据放在Linux系统下/opt/log。
二、上传到HDFS里面,命令:hadoop fs -mkdir log-data-new
三、缺一个fastjson的jar包,命令:/usr/local/hadoop-2.6.4/share/hadoop/mapreduce/lib,然后xftp从项目里导进来jar包。
四、运行jar包,命令:hadoop jar /usr/local/logminer/LogMiner-jar com.etc.DataTest(IDEA主类的路径) hdfs://192.168.88.202:9000/log-data-new hdfs://192.168.88.202:9000/log-data-clean
hadoop jar /usr/local/logdata/LogMiner-1.0-SNAPSHOT.jar com.etc.DataTest hdfs://192.168.88.202:9000/log-data-new hdfs://192.168.88.202:9000/log-data-clean
复制代码
五、建外部表,建分区,导入数据。
create external table t_logdata(cid_sn string,os_ver string,resolution string,commit_time string,sdk_ver string,device_id_type string,device_model string,promotion_channel string,app_ver_name string,app_ver_code string,pid string,net_type string,device_id string,app_device_id string,release_channel string,country string,time_zone string,os_name string,manufacture string,commit_id string,app_token string,app_id string,language string,User_id string)
partitioned by(day string)
row format delimited
fields terminated by ','
location '/log-data-clean-data/';
复制代码
load data local(本地) inpath '/opt/log/data/*' into table t_logdata partition(day='2017-08-15');
load data inpath'/log-data-clean/*' into table t_logdata partition(day='2017-08-15');
select * from t_logdata where day='2017-08-15'; //查询处理过的数据有没有上传
复制代码
六、新增用户分析(了解分组且排序的高阶函数row_number())
CREATE TABLE etl_active_day (
cid_sn string
,os_ver string
,resolution string
,commit_time string
,sdk_ver string
,device_id_type string
,device_model string
,promotion_channel string
,app_ver_name string
,app_ver_code string
,pid string
,net_type string
,device_id string
,app_device_id string
,release_channel string
,country string
,time_zone string
,os_name string
,manufacture string
,commit_id string
,app_token string
,app_id string
,language string
,user_id string
) partitioned BY (day string) row format delimited fields terminated BY ',';
复制代码
七、日活跃用户(了解查询所有表字段建立临时表筛选字段后插入分区)
INSERT INTO TABLE etl_active_day PARTITION (day = '2017-08-15')
SELECT
cid_sn
,os_ver
,resolution
,commit_time
,sdk_ver
,device_id_type
,device_model
,promotion_channel
,app_ver_name
,app_ver_code
,pid
,net_type
,device_id
,app_device_id
,release_channel
,country
,time_zone
,os_name
,manufacture
,commit_id
,app_token
,app_id
,language
,user_id
FROM (
SELECT *
,row_number() OVER (
PARTITION BY user_id ORDER BY commit_time
) AS rn
FROM t_logdata
WHERE day = '2017-08-15'
) tmp
WHERE rn = 1;
复制代码
八、建维度统计结果
DROP TABLE dim_user_active_day;
CREATE TABLE dim_active_day (
os_name string
,city string
,release_channel string
,app_ver_name string
,cnts INT
) partitioned BY (
day string
,dim string
);
复制代码
九、利用多重insert语法来统计各种维度组合的日活用户数,并插入到日活维度统计表的各分区中。
FROM etl_active_day
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0000'
)
SELECT 'all'
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2017-08-15'
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1000'
)
SELECT os_name
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY (os_name)
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0100'
)
SELECT 'all'
,country
,'all'
,'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY (country)
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0010'
)
SELECT 'all'
,'all'
,release_channel
,'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY (release_channel)
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0001'
)
SELECT 'all'
,'all'
,'all'
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY (app_ver_name)
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0011'
)
SELECT 'all'
,'all'
,release_channel
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY release_channel , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0101'
)
SELECT 'all'
,country
,'all'
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY country , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0110'
)
SELECT 'all'
,country
,release_channel
,'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY country , release_channel
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '0111'
)
SELECT 'all'
,country
,release_channel
, app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY country , release_channel , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1001'
)
SELECT os_name
,'all'
,'all'
, app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1010'
)
SELECT os_name
,'all'
,release_channel
, 'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , release_channel
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1011'
)
SELECT os_name
,'all'
,release_channel
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , release_channel , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1100'
)
SELECT os_name
,country
,'all'
, 'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1101'
)
SELECT os_name
,country
,'all'
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , app_ver_name
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1110'
)
SELECT os_name
,country
,release_channel
,'all'
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , release_channel
INSERT INTO TABLE dim_active_day PARTITION (
day = '2017-08-15'
,dim = '1111'
)
SELECT os_name
,country
,release_channel
,app_ver_name
,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , release_channel , app_ver_name;
复制代码
十、日新用户统计
/*
日新:当日第一次出现的用户--当日的新增用户
思路: a、应该建立一个历史用户表(只存user_id)
b、将当日的活跃用户去 比对 历史用户表, 就知道哪些人是今天新出现的用户 --> 当日新增用户
c、将当日新增用户追加到历史用户表
*/
-- 数据建模 *******************************
-- 1 历史用户表
create table etl_user_history(user_id string);
-- 2 当日新增用户表:存所有字段(每个人时间最早的一条),带有一个分区字段:day string;
create table etl_user_new_day like etl_active_day;
-- 统计实现 *********************************
-- 1 当日活跃-历史用户表 --> 新增用户表的当日分区
Insert into etl_user_new_day partition(day='2017-08-15')
SELECT cid_sn
,os_ver
,resolution
,commit_time
,sdk_ver
,device_id_type
,device_model
,promotion_channel
,app_ver_name
,app_ver_code
,pid
,net_type
,device_id
,app_device_id
,release_channel
,country
,time_zone
,os_name
,manufacture
,commit_id
,app_token
,app_id
,language
,a.user_id
From etl_active_day a left join etl_user_history b on a.user_id = b.user_id
where a.day='2017-08-15' and b.user_id is null;
-- 2 将当日新增用户的user_id追加到历史表
insert into table etl_user_history
select user_id from etl_user_new_day where day='2017-08-15';
/*
日新:维度统计报表
思路: a、从日新etl表中,按照维度组合,统计出各种维度组合下的新用户数量
维度:
os_name country release_channel app_ver_name
维度组合统计
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1
*/
-- 1 日新维度统计报表--数据建模
create table dim_user_new_day(os_name string,country string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);
-- 2 日新维度统计报表sql开发(利用多重插入语法)
From etl_user_new_day
insert into table dim_user_new_day partition(day='2017-08-15',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-08-15'
insert into table dim_user_new_day partition(day='2017-08-15',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-08-15'
group by app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-08-15'
group by release_channel
insert into table dim_user_new_day partition(day='2017-08-15',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-08-15'
group by release_channel,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='0100')
select 'all',country,'all','all',count(1)
where day='2017-08-15'
group by country
insert into table dim_user_new_day partition(day='2017-08-15',dim='0101')
select 'all',country,'all',app_ver_name,count(1)
where day='2017-08-15'
group by country,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='0110')
select 'all',country,release_channel,'all',count(1)
where day='2017-08-15'
group by country,release_channel
insert into table dim_user_new_day partition(day='2017-08-15',dim='0111')
select 'all',country,release_channel,app_ver_name,count(1)
where day='2017-08-15'
group by country,release_channel,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='1000')
select 'os_name','all','all','all',count(1)
where day='2017-08-15'
group by os_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='1001')
select 'os_name','all','all','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='1010')
select 'os_name','all','release_channel','all',count(1)
where day='2017-08-15'
group by os_name,release_channel
insert into table dim_user_new_day partition(day='2017-08-15',dim='1011')
select 'os_name','all','release_channel','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,release_channel,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='1100')
select 'os_name','country','all','all',count(1)
where day='2017-08-15'
group by os_name,country
insert into table dim_user_new_day partition(day='2017-08-15',dim='1101')
select 'os_name','country','all','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,country,app_ver_name
insert into table dim_user_new_day partition(day='2017-08-15',dim='1110')
select 'os_name','country','release_channel','all',count(1)
where day='2017-08-15'
group by os_name,country,release_channel
insert into table dim_user_new_day partition(day='2017-08-15',dim='1111')
select 'os_name','country','release_channel','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,country,release_channel,app_ver_name;
/*
留存用户分析
概念:昨日新增,今日还活跃
逻辑思路:昨天在新用户表中,今天在活跃用户表中 --> 今日的“次日留存用户”
-- 昨天的新用户表中,存在于今天的活跃用户表中的人 --> 今日的“次日留存用户”
*/
-- 数据建模
--建次日留存etl信息表:记录跟活跃用户表相同的字段
create table etl_user_keepalive_nextday like etl_active_day;
-- etl开发
insert into table etl_user_keepalive_nextday partition(day='2017-08-16')
select
actuser.cid_sn
,actuser.os_ver
,actuser.resolution
,actuser.commit_time
,actuser.sdk_ver
,actuser.device_id_type
,actuser.device_model
,actuser.promotion_channel
,actuser.app_ver_name
,actuser.app_ver_code
,actuser.pid
,actuser.net_type
,actuser.device_id
,actuser.app_device_id
,actuser.release_channel
,actuser.country
,actuser.time_zone
,actuser.os_name
,actuser.manufacture
,actuser.commit_id
,actuser.app_token
,actuser.app_id
,actuser.language
,actuser.user_id
from etl_user_new_day newuser join etl_active_day actuser
on newuser.user_id = actuser.user_id
where newuser.day='2017-08-15' and actuser.day='2017-08-16';
/* ***************用左半连接效率略高***************************************************** */
insert into table etl_user_keepalive_nextday partition(day='2017-08-16')
select
cid_sn
,os_ver
,resolution
,commit_time
,sdk_ver
,device_id_type
,device_model
,promotion_channel
,app_ver_name
,app_ver_code
,pid
,net_type
,device_id
,app_device_id
,release_channel
,country
,time_zone
,os_name
,manufacture
,commit_id
,app_token
,app_id
,language
,a.user_id
from etl_user_new_day a left semi join etl_active_day b
on a.user_id = b.user_id and a.day='2017-08-15' and b.day='2017-08-16';
where a.day='2017-08-15' and b.day='2017-08-16'; // 注意:left semi join中,右表的引用不能出现在where条件中
/*
维度统计
*/
-- 1.次日留存维度统计报表--数据建模
create table dim_user_keepalive_nextday(os_name string,country string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);
-- 2.利用多重插入语法
思路: a、从次日留存etl表中,按照维度组合,统计出各种维度组合下的留存用户数量
维度:
os_name country release_channel app_ver_name
维度组合统计
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1
*/
From etl_user_keepalive_nextday
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-08-16'
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-08-16'
group by app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-08-16'
group by release_channel
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-08-16'
group by release_channel,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0100')
select 'all',country,'all','all',count(1)
where day='2017-08-16'
group by country
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0101')
select 'all',country,'all',app_ver_name,count(1)
where day='2017-08-16'
group by country,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0110')
select 'all',country,release_channel,'all',count(1)
where day='2017-08-16'
group by country,release_channel
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0111')
select 'all',country,release_channel,app_ver_name,count(1)
where day='2017-08-16'
group by country,release_channel,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1000')
select 'os_name','all','all','all',count(1)
where day='2017-08-16'
group by os_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1001')
select 'os_name','all','all','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1010')
select 'os_name','all','release_channel','all',count(1)
where day='2017-08-16'
group by os_name,release_channel
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1011')
select 'os_name','all','release_channel','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,release_channel,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1100')
select 'os_name','country','all','all',count(1)
where day='2017-08-16'
group by os_name,country
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1101')
select 'os_name','country','all','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,country,app_ver_name
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1110')
select 'os_name','country','release_channel','all',count(1)
where day='2017-08-16'
group by os_name,country,release_channel
insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1111')
select 'os_name','country','release_channel','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,country,release_channel,app_ver_name;
复制代码