Linux运行jar包(hive离线计算项目)

一、将日志未清洗的数据放在Linux系统下/opt/log。

二、上传到HDFS里面,命令:hadoop fs -mkdir log-data-new

三、缺一个fastjson的jar包,命令:/usr/local/hadoop-2.6.4/share/hadoop/mapreduce/lib,然后xftp从项目里导进来jar包。

四、运行jar包,命令:hadoop jar /usr/local/logminer/LogMiner-jar com.etc.DataTest(IDEA主类的路径) hdfs://192.168.88.202:9000/log-data-new hdfs://192.168.88.202:9000/log-data-clean

hadoop jar /usr/local/logdata/LogMiner-1.0-SNAPSHOT.jar com.etc.DataTest hdfs://192.168.88.202:9000/log-data-new hdfs://192.168.88.202:9000/log-data-clean
复制代码

五、建外部表,建分区,导入数据。

create external table t_logdata(cid_sn string,os_ver string,resolution string,commit_time string,sdk_ver string,device_id_type string,device_model string,promotion_channel string,app_ver_name string,app_ver_code string,pid string,net_type string,device_id string,app_device_id string,release_channel string,country string,time_zone string,os_name string,manufacture string,commit_id string,app_token string,app_id string,language string,User_id string)
partitioned by(day string)
row format delimited
fields terminated by ','
location '/log-data-clean-data/';
复制代码
load data local(本地) inpath '/opt/log/data/*' into table t_logdata partition(day='2017-08-15');
load data inpath'/log-data-clean/*' into table t_logdata partition(day='2017-08-15');
 select * from t_logdata where day='2017-08-15';  //查询处理过的数据有没有上传
复制代码

六、新增用户分析(了解分组且排序的高阶函数row_number())

CREATE TABLE etl_active_day (
 cid_sn string
 ,os_ver string
 ,resolution string
 ,commit_time string
 ,sdk_ver string
 ,device_id_type string
 ,device_model string
 ,promotion_channel string
 ,app_ver_name string
 ,app_ver_code string
 ,pid string
 ,net_type string
 ,device_id string
 ,app_device_id string
 ,release_channel string
 ,country string
 ,time_zone string
 ,os_name string
 ,manufacture string
 ,commit_id string
 ,app_token string
 ,app_id string
 ,language string
 ,user_id string
    ) partitioned BY (day string) row format delimited fields terminated BY ',';
复制代码

七、日活跃用户(了解查询所有表字段建立临时表筛选字段后插入分区)

INSERT INTO TABLE etl_active_day PARTITION (day = '2017-08-15')
SELECT
 cid_sn
 ,os_ver
 ,resolution
 ,commit_time
 ,sdk_ver
 ,device_id_type
 ,device_model
 ,promotion_channel
 ,app_ver_name
 ,app_ver_code
 ,pid
 ,net_type
 ,device_id
 ,app_device_id
 ,release_channel
 ,country
 ,time_zone
 ,os_name
 ,manufacture
 ,commit_id
 ,app_token
 ,app_id
 ,language
 ,user_id
FROM (
    SELECT *
        ,row_number() OVER (
            PARTITION BY user_id ORDER BY commit_time
            ) AS rn
    FROM t_logdata
    WHERE day = '2017-08-15'
    ) tmp
WHERE rn = 1;
复制代码

八、建维度统计结果

DROP TABLE dim_user_active_day;
CREATE TABLE dim_active_day (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts INT
    ) partitioned BY (
    day string
    ,dim string
    );
复制代码

九、利用多重insert语法来统计各种维度组合的日活用户数,并插入到日活维度统计表的各分区中。

FROM etl_active_day

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0000'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-08-15'

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1000'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY (os_name)

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0100'
    )
SELECT 'all'
    ,country
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY (country)

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0010'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY (release_channel)

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0001'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY (app_ver_name)

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0011'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,app_ver_name 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY release_channel , app_ver_name

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0101'
    )
SELECT 'all'
    ,country 
    ,'all'
    ,app_ver_name 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY country , app_ver_name

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0110'
    )
SELECT 'all'
    ,country 
    ,release_channel 
    ,'all' 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY country , release_channel

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '0111'
    )
SELECT 'all'
    ,country 
    ,release_channel 
    , app_ver_name 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY country , release_channel , app_ver_name 

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1001'
    )
SELECT os_name 
    ,'all'
    ,'all'
    , app_ver_name 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , app_ver_name

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1010'
    )
SELECT os_name 
    ,'all'
    ,release_channel 
    , 'all' 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , release_channel

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1011'
    )
SELECT os_name 
    ,'all'
    ,release_channel 
    ,app_ver_name
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , release_channel , app_ver_name

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1100'
    )
SELECT os_name 
    ,country 
    ,'all'
    , 'all' 
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1101'
    )
SELECT os_name 
    ,country 
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , app_ver_name

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
    ,dim = '1110'
    )
SELECT os_name 
    ,country 
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , release_channel

INSERT INTO TABLE dim_active_day PARTITION (
    day = '2017-08-15'
   ,dim = '1111' 
   )
SELECT os_name 
    ,country 
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2017-08-15'
GROUP BY os_name , country , release_channel , app_ver_name;
复制代码

十、日新用户统计

/*
日新:当日第一次出现的用户--当日的新增用户

思路: a、应该建立一个历史用户表(只存user_id)

       b、将当日的活跃用户去 比对  历史用户表, 就知道哪些人是今天新出现的用户 --> 当日新增用户
	   
	   c、将当日新增用户追加到历史用户表

*/

-- 数据建模  *******************************

-- 1 历史用户表
create table etl_user_history(user_id string);


-- 2 当日新增用户表:存所有字段(每个人时间最早的一条),带有一个分区字段:day string;
create table etl_user_new_day like etl_active_day;


-- 统计实现 *********************************

-- 1 当日活跃-历史用户表 --> 新增用户表的当日分区
Insert  into etl_user_new_day partition(day='2017-08-15')
SELECT cid_sn
 ,os_ver
 ,resolution
 ,commit_time
 ,sdk_ver
 ,device_id_type
 ,device_model
 ,promotion_channel
 ,app_ver_name
 ,app_ver_code
 ,pid
 ,net_type
 ,device_id
 ,app_device_id
 ,release_channel
 ,country
 ,time_zone
 ,os_name
 ,manufacture
 ,commit_id
 ,app_token
 ,app_id
 ,language
 ,a.user_id
From  etl_active_day a left join  etl_user_history b on a.user_id = b.user_id
where a.day='2017-08-15' and b.user_id is null;

-- 2 将当日新增用户的user_id追加到历史表
insert into table etl_user_history
select user_id from etl_user_new_day where day='2017-08-15';

/*
日新:维度统计报表

思路: a、从日新etl表中,按照维度组合,统计出各种维度组合下的新用户数量

维度:
os_name    country    release_channel   app_ver_name


维度组合统计 
0 0 0 0 
0 0 0 1 
0 0 1 0 
0 0 1 1 
0 1 0 0 
0 1 0 1 
0 1 1 0 
0 1 1 1 
1 0 0 0 
1 0 0 1 
1 0 1 0 
1 0 1 1 
1 1 0 0 
1 1 0 1 
1 1 1 0 
1 1 1 1

*/

-- 1 日新维度统计报表--数据建模
create table dim_user_new_day(os_name string,country string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);


-- 2 日新维度统计报表sql开发(利用多重插入语法)

From etl_user_new_day

insert into table dim_user_new_day partition(day='2017-08-15',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-08-15'

insert into table dim_user_new_day partition(day='2017-08-15',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-08-15'
group by app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-08-15'
group by release_channel

insert into table dim_user_new_day partition(day='2017-08-15',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-08-15'
group by release_channel,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='0100')
select 'all',country,'all','all',count(1)
where day='2017-08-15'
group by country

insert into table dim_user_new_day partition(day='2017-08-15',dim='0101')
select 'all',country,'all',app_ver_name,count(1)
where day='2017-08-15'
group by country,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='0110')
select 'all',country,release_channel,'all',count(1)
where day='2017-08-15'
group by country,release_channel

insert into table dim_user_new_day partition(day='2017-08-15',dim='0111')
select 'all',country,release_channel,app_ver_name,count(1)
where day='2017-08-15'
group by country,release_channel,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='1000')
select 'os_name','all','all','all',count(1)
where day='2017-08-15'
group by os_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='1001')
select 'os_name','all','all','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='1010')
select 'os_name','all','release_channel','all',count(1)
where day='2017-08-15'
group by os_name,release_channel

insert into table dim_user_new_day partition(day='2017-08-15',dim='1011')
select 'os_name','all','release_channel','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,release_channel,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='1100')
select 'os_name','country','all','all',count(1)
where day='2017-08-15'
group by os_name,country

insert into table dim_user_new_day partition(day='2017-08-15',dim='1101')
select 'os_name','country','all','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,country,app_ver_name

insert into table dim_user_new_day partition(day='2017-08-15',dim='1110')
select 'os_name','country','release_channel','all',count(1)
where day='2017-08-15'
group by os_name,country,release_channel

insert into table dim_user_new_day partition(day='2017-08-15',dim='1111')
select 'os_name','country','release_channel','app_ver_name',count(1)
where day='2017-08-15'
group by os_name,country,release_channel,app_ver_name;

/* 
留存用户分析
概念:昨日新增,今日还活跃

逻辑思路:昨天在新用户表中,今天在活跃用户表中 --> 今日的“次日留存用户”
-- 昨天的新用户表中,存在于今天的活跃用户表中的人  --> 今日的“次日留存用户”


*/


-- 数据建模 
--建次日留存etl信息表:记录跟活跃用户表相同的字段

create table etl_user_keepalive_nextday like etl_active_day;

-- etl开发
insert into table etl_user_keepalive_nextday partition(day='2017-08-16')
select
     actuser.cid_sn
     ,actuser.os_ver
     ,actuser.resolution
     ,actuser.commit_time
     ,actuser.sdk_ver
     ,actuser.device_id_type
     ,actuser.device_model
     ,actuser.promotion_channel
     ,actuser.app_ver_name
     ,actuser.app_ver_code
     ,actuser.pid
     ,actuser.net_type
     ,actuser.device_id
     ,actuser.app_device_id
     ,actuser.release_channel
     ,actuser.country
     ,actuser.time_zone
     ,actuser.os_name
     ,actuser.manufacture
     ,actuser.commit_id
     ,actuser.app_token
     ,actuser.app_id
     ,actuser.language
     ,actuser.user_id

from etl_user_new_day newuser join etl_active_day actuser
on newuser.user_id = actuser.user_id
where newuser.day='2017-08-15' and actuser.day='2017-08-16';


/*  ***************用左半连接效率略高***************************************************** */
insert into table etl_user_keepalive_nextday partition(day='2017-08-16')
select 
 cid_sn
,os_ver
,resolution
,commit_time
,sdk_ver
,device_id_type
,device_model
,promotion_channel
,app_ver_name
,app_ver_code
,pid
,net_type
,device_id
,app_device_id
,release_channel
,country
,time_zone
,os_name
,manufacture
,commit_id
,app_token
,app_id
,language
,a.user_id
from etl_user_new_day a left semi join etl_active_day b
on a.user_id = b.user_id and a.day='2017-08-15' and b.day='2017-08-16';
where a.day='2017-08-15' and b.day='2017-08-16'; // 注意:left semi join中,右表的引用不能出现在where条件中

/*
维度统计
*/

-- 1.次日留存维度统计报表--数据建模

create table dim_user_keepalive_nextday(os_name string,country string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

-- 2.利用多重插入语法

思路: a、从次日留存etl表中,按照维度组合,统计出各种维度组合下的留存用户数量

维度:
os_name    country    release_channel   app_ver_name

维度组合统计 
0 0 0 0 
0 0 0 1 
0 0 1 0 
0 0 1 1 
0 1 0 0 
0 1 0 1 
0 1 1 0 
0 1 1 1 
1 0 0 0 
1 0 0 1 
1 0 1 0 
1 0 1 1 
1 1 0 0 
1 1 0 1 
1 1 1 0 
1 1 1 1

*/

From etl_user_keepalive_nextday

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0000')
select 'all','all','all','all',count(1)
where day='2017-08-16'

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2017-08-16'
group by app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2017-08-16'
group by release_channel

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2017-08-16'
group by release_channel,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0100')
select 'all',country,'all','all',count(1)
where day='2017-08-16'
group by country

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0101')
select 'all',country,'all',app_ver_name,count(1)
where day='2017-08-16'
group by country,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0110')
select 'all',country,release_channel,'all',count(1)
where day='2017-08-16'
group by country,release_channel

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='0111')
select 'all',country,release_channel,app_ver_name,count(1)
where day='2017-08-16'
group by country,release_channel,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1000')
select 'os_name','all','all','all',count(1)
where day='2017-08-16'
group by os_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1001')
select 'os_name','all','all','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1010')
select 'os_name','all','release_channel','all',count(1)
where day='2017-08-16'
group by os_name,release_channel

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1011')
select 'os_name','all','release_channel','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,release_channel,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1100')
select 'os_name','country','all','all',count(1)
where day='2017-08-16'
group by os_name,country

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1101')
select 'os_name','country','all','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,country,app_ver_name

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1110')
select 'os_name','country','release_channel','all',count(1)
where day='2017-08-16'
group by os_name,country,release_channel

insert into table dim_user_keepalive_nextday partition(day='2017-08-16',dim='1111')
select 'os_name','country','release_channel','app_ver_name',count(1)
where day='2017-08-16'
group by os_name,country,release_channel,app_ver_name;

复制代码
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值