9. Hive技术之HQL练习☆

Hive技术之HQL练习

第一题

1. 数据准备
userIdvisitDatevisitCount
u012017/1/215
u022017/1/236
u032017/1/228
u042017/1/203
u012017/1/236
u012017/2/218
u022017/1/236
u012017/2/224
u01	2017/1/21	5
u02	2017/1/23	6
u03	2017/1/22	8
u04	2017/1/20	3
u01	2017/1/23	6
u01	2017/2/21	8
u02	2017/1/23	6
u01	2017/2/22	4

建表语句及导入

-- 建表语句
create table action(userId string,visitDate string,visitCount int) 
row format delimited fields terminated by "\t";

-- 导入数据
load data local inpath '/opt/module/data/action.txt' into table action;
2. 需求分析

要求使用SQL统计出每个用户的累积访问次数,如下表所示:

用户id月份小计累积
u012017-011111
u012017-021223
u022017-011212
u032017-0188
u042017-0133
3. HQL语句
-- 先处理一下日期的格式
select 
  userId,
  date_format(regexp_replace(visitDate,"/","-"),"yyyy-MM") delDate,
  visitCount
from
  action;
-- 结果
userid  deldate visitcount
u01     2017-01 5
u02     2017-01 6
u03     2017-01 8
u04     2017-01 3
u01     2017-01 6
u01     2017-02 8
u02     2017-01 6
u01     2017-02 4

-- 计算小计
-- 使用group by把userid和deldate相同的进行求和,即为求和
select 
  userId,
  delDate,
  sum(t1.visitCount) small_sum
from 
(select userId,
  date_format(regexp_replace(visitDate,"/","-"),"yyyy-MM") delDate,
  visitCount
from action)t1
group by userId,delDate;

-- 结果
userid  deldate small_sum
u01     2017-01 11
u01     2017-02 12
u02     2017-01 12
u03     2017-01 8
u04     2017-01 3

-- 计算累计
select 
  userId,
  delDate,
  small_sum,
  sum(small_sum) over(partition by userId order by delDate) total_sum
from (
select 
  userId,
  delDate,
  sum(t1.visitCount) small_sum
from (
select 
  userId,
  date_format(regexp_replace(visitDate,"/","-"),"yyyy-MM") delDate,
  visitCount
from action)t1
group by 
  userId,delDate)t2;

第二题 京东

题目描述:

  • 有50W个京东店铺,每个顾客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为visit,访客的用户id为user_id,被访问的店铺名称为shop,
1. 数据准备

测试数据visit.txt

u1	a
u2	b
u1	b
u1	a
u3	c
u4	b
u1	a
u2	c
u5	b
u4	b
u6	c
u2	c
u1	b
u2	a
u2	a
u3	a
u5	a
u5	a
u5	a

建表语句及导入数据

-- 键表
create table visit(user_id string,shop string) 
row format delimited fields terminated by '\t';

--导入数据
load data local inpath '/opt/module/data/visit.txt' into table visit;
2. HQL语句
  1. 每个店铺的UV(访客数)
-- 把一个用户多次访问同一个店铺的记录,而我们需要的是每个店铺的UV(访客数),所以要去重
select
  shop,
  user_id
from
  visit
group by shop,user_id;

shop    user_id
a       u1
a       u2
a       u3
a       u5
b       u1
b       u2
b       u4
b       u5
c       u2
c       u3
c       u6

-- 使用子查询,
select 
  shop,
  count(user_id) uv
from (
select
  shop,
  user_id
from
  visit
group by 
  shop,user_id)t1
group by shop;t2

shop uv
a       4
b       4
c       3
  1. 每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
-- 计算每个人访问每个店铺的总次数
select
  shop,
  user_id,
  count(user_id) cui
from 
  visit
group by 
  shop,user_id;

shop    user_id cui
a       u1      3
a       u2      2
a       u3      1
a       u5      3
b       u1      2
b       u2      1
b       u4      2
b       u5      1
c       u2      2
c       u3      1
c       u6      1

-- 针对同一店铺,对访问次数进行逆序排序,并添加rank值
select
  shop,
  user_id,
  cui,
  row_number() over(partition by shop order by cui desc) rank
from (
select
  shop,
  user_id,
  count(user_id) cui
from 
  visit
group by 
  shop,user_id)t1;

shop    user_id cui     rank
a       u5      3       1
a       u1      3       2
a       u2      2       3
a       u3      1       4
b       u4      2       1
b       u1      2       2
b       u5      1       3
b       u2      1       4
c       u2      2       1
c       u6      1       2
c       u3      1       3

select
  shop,
  user_id,
  cui
from(
select
  shop,
  user_id,
  cui,
  row_number() over(partition by shop order by cui desc) rank
from (
select
  shop,
  user_id,
  count(user_id) cui
from 
  visit
group by 
  shop,user_id)t1
)t2
where rank<4;

shop    user_id cui
a       u5      3
a       u1      3
a       u2      2
b       u4      2
b       u1      2
b       u5      1
c       u2      2
c       u6      1
c       u3      1

第三题 蚂蚁金服 第一问

1. 数据准备

表user_low_carbon记录了用户每天的蚂蚁森林低碳生活领取的记录流水。

user_iddata_dtlow_carbon
用户日期减少碳排放(g)

user_low_carbon.txt

蚂蚁森林植物换购表plant_carbon,用于记录申领环保植物所需要减少的碳排放量

plant_idplant_namelow_carbon
植物编号植物名换购植物所需要的碳

plant_carbon.txt

-- 创建表
create table user_low_carbon(user_id String,data_dt String,low_carbon int) 
row format delimited fields terminated by '\t';

create table plant_carbon(plant_id string,plant_name String,low_carbon int) 
row format delimited fields terminated by '\t';

-- 加载数据
load data local inpath "/opt/module/data/user_low_carbon.txt" into table user_low_carbon;

load data local inpath "/opt/module/data/plant_carbon.txt" into table plant_carbon;

-- 测试数据是否导入成功
select * from user_low_carbon;
select * from plant_carbon;
2. 需求分析

蚂蚁森林植物申领统计

  • 问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,剩余的能量全部用来领取“p002-沙柳” 。
    • 统计在10月1日累计申领“p002-沙柳” 排名前10(只要十个)的用户信息;
    • 以及他比后一名多领了几颗沙柳。
  • 得到的统计结果如下表样式:
user_idplant_countless_count(比后一名多领了几颗沙柳)
u_1011000100
u_088900400
u_103500
3. HQL语句
-- 设置本地模式,提升测试速度
set hive.exec.mode.local.auto=true;

-- 第一步:统计每个用户截止到2017/10/1日期总低碳量
-- 因为要的是前十个人的,所以可以在这里进行数据过滤,提高整个过程的速度
select
  user_id,
  sum(low_carbon) sum_carbon
from
  user_low_carbon
where
  date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd")<='2017-10-1'
group by
  user_id
order by
  sum_carbon desc
limit 11;t1
-- 结果:
user_id sum_carbon
u_007   1470
u_013   1430
u_008   1240
u_005   1100
u_010   1080
u_014   1060
u_011   960
u_009   930
u_006   830
u_002   659
u_004   640


plant_id plant_name low_carbon
p001	梭梭树	17
p002	沙柳	19
p003	樟子树	146
p004	胡杨	215

-- 第二步:查询兑换胡杨需要的能量p004_carbon
select
  low_carbon p004_carbon
from
  plant_carbon
where
  plant_id = 'p004';t2
-- 结果:
p004_carbon
215

-- 第三步:查询兑换沙柳需要的能量p002_carbon
select
  low_carbon p002_carbon
from
  plant_carbon
where
  plant_id = 'p002';t3
-- 结果:
p002_carbon
19

-- 第四步:求得前十一个用户兑换沙柳的总数p002_sum,并按降序排序
select
  user_id,
  floor((sum_carbon-p004_carbon)/p002_carbon) p002_sum
from
  (select
  user_id,
  sum(low_carbon) sum_carbon
from
  user_low_carbon
where
  date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd")<='2017-10-1'
group by
  user_id
order by
  sum_carbon desc
limit 11)t1,
  (select
  low_carbon p004_carbon
from
  plant_carbon
where
  plant_id = 'p004')t2,
  (select
  low_carbon p002_carbon
from
  plant_carbon
where
  plant_id = 'p002')t3;t4

-- 结果:
user_id p002_sum
u_007   66
u_013   63
u_008   53
u_005   46
u_010   45
u_014   44
u_011   39
u_009   37
u_006   32
u_002   23
u_004   22

-- 第五步:获取每个用户,下一位兑换的沙柳数leadOneSum
select
  user_id,
  p002_sum,
  lead(p002_sum,1) over() leadOneSum
from
  (select
  user_id,
  floor((sum_carbon-p004_carbon)/p002_carbon) p002_sum
from
  (select
  user_id,
  sum(low_carbon) sum_carbon
from
  user_low_carbon
where
  date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd")<='2017-10-1'
group by
  user_id
order by
  sum_carbon desc
limit 11)t1,
  (select
  low_carbon p004_carbon
from
  plant_carbon
where
  plant_id = 'p004')t2,
  (select
  low_carbon p002_carbon
from
  plant_carbon
where
  plant_id = 'p002')t3)t4
limit 10;t5
-- 结果:
user_id p002_sum        leadonesum
u_004   22      23
u_002   23      32
u_006   32      37
u_009   37      39
u_011   39      44
u_014   44      45
u_010   45      46
u_005   46      53
u_008   53      63
u_013   63      66

-- 第六步:得到比后一名多领了几颗沙柳less_count
select
  user_id,
  p002_sum,
  (leadonesum-p002_sum) less_count
from
  (select
  user_id,
  p002_sum,
  lead(p002_sum,1) over() leadOneSum
from
  (select
  user_id,
  floor((sum_carbon-p004_carbon)/p002_carbon) p002_sum
from
  (select
  user_id,
  sum(low_carbon) sum_carbon
from
  user_low_carbon
where
  date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd")<='2017-10-1'
group by
  user_id
order by
  sum_carbon desc
limit 11)t1,
  (select
  low_carbon p004_carbon
from
  plant_carbon
where
  plant_id = 'p004')t2,
  (select
  low_carbon p002_carbon
from
  plant_carbon
where
  plant_id = 'p002')t3)t4
limit 10)t5;
-- 结果:
user_id p002_sum        less_count
u_013   63      3
u_008   53      10
u_005   46      7
u_010   45      1
u_014   44      1
u_011   39      5
u_009   37      2
u_006   32      5
u_002   23      9
u_004   22      1

第四题 蚂蚁金服 第二问

1. 需求分析

蚂蚁森林低碳用户排名分析

  • 问题:查询user_low_carbon表中每日流水记录,条件为:

    • 用户在2017年,连续三天(或以上)的天数里,每天减少碳排放(low_carbon)都超过100g的用户低碳流水。需要查询返回满足以上条件的user_low_carbon表中的记录流水。

    • 例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天的碳排放量之和都大于等于100g

      seq(key)user_iddata_dtlow_carbon
      xxxxx10u_0022017/1/2150
      xxxxx11u_0022017/1/270
      xxxxx12u_0022017/1/330
      xxxxx13u_0022017/1/380
      xxxxx14u_0022017/1/4150
      xxxxx14u_0022017/1/5101
  • 备注:统计方法不限于sql、procedure、python,java等

2. HQL语句
2.1 解法一
-- 设置本地模式,提升测试速度
set hive.exec.mode.local.auto=true;

-- 数据
user_id data_dt low_carbon
-- ...
u_002	2017/1/6	68

-- 第一步:过滤出2017年且单日低碳量超过100g
-- 按user_id和data_dt排序去重
select
   user_id,
   date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
   user_low_carbon
where
   substring(data_dt,1,4) = '2017'
group by
   user_id,data_dt
having
   sum(low_carbon)>=100;t1

-- 结果:
user_id data_dt
u_001   2017-01-02
u_001   2017-01-06
u_002   2017-01-02
u_002   2017-01-03
u_002   2017-01-04
u_002   2017-01-05
u_003   2017-01-02
u_003   2017-01-03
u_003   2017-01-05
u_003   2017-01-07

-- 第二步:将前两行数据以及后两行数据的日期放置当前行
select
    user_id,
    data_dt,
    lag(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lag2,
    lag(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lag1,
    lead(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lead1,
    lead(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lead2
from
    (select
   user_id,
   date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
   user_low_carbon
where
   substring(data_dt,1,4) = '2017'
group by
   user_id,data_dt
having
   sum(low_carbon)>=100)t1;t2

-- 结果:
user_id data_dt 		lag2    		lag1    		lead1   		lead2
u_001   2017-01-02      9999-99-99      9999-99-99      2017-01-06      9999-99-99
u_001   2017-01-06      9999-99-99      2017-01-02      9999-99-99      9999-99-99
u_002   2017-01-02      9999-99-99      9999-99-99      2017-01-03      2017-01-04
u_002   2017-01-03      9999-99-99      2017-01-02      2017-01-04      2017-01-05
u_002   2017-01-04      2017-01-02      2017-01-03      2017-01-05      9999-99-99
u_002   2017-01-05      2017-01-03      2017-01-04      9999-99-99      9999-99-99
u_003   2017-01-02      9999-99-99      9999-99-99      2017-01-03      2017-01-05

-- 第三步:计算当前日期跟前后两行时间的差值
select
    user_id,
    data_dt,
    datediff(data_dt,lag2) lag2diff,
    datediff(data_dt,lag1) lag1diff,
    datediff(data_dt,lead1) lead1diff,
    datediff(data_dt,lead2) lead2diff
from
    (select
    user_id,
    data_dt,
    lag(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lag2,
    lag(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lag1,
    lead(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lead1,
    lead(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lead2
from
    (select
   user_id,
   date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
   user_low_carbon
where
   substring(data_dt,1,4) = '2017'
group by
   user_id,data_dt
having
   sum(low_carbon)>=100)t1)t2;t3
   
-- 结果:
user_id data_dt 		lag2diff        lag1diff        lead1diff       lead2diff
u_001   2017-01-02      -2918443        -2918443        -4      -2918443
u_001   2017-01-06      -2918439        4       -2918439        -2918439
u_002   2017-01-02      -2918443        -2918443        -1      -2
u_002   2017-01-03      -2918442        1       -1      -2
u_002   2017-01-04      2       1       -1      -2918441
u_002   2017-01-05      2       1       -2918440        -2918440
u_003   2017-01-02      -2918443        -2918443        -1      -3

-- 第四步:过滤出连续3天超过100g的用户
select
    user_id,
    data_dt
from
    (select
    user_id,
    data_dt,
    datediff(data_dt,lag2) lag2diff,
    datediff(data_dt,lag1) lag1diff,
    datediff(data_dt,lead1) lead1diff,
    datediff(data_dt,lead2) lead2diff

from
    (select
    user_id,
    data_dt,
    lag(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lag2,
    lag(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lag1,
    lead(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lead1,
    lead(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lead2
from
    (select
   user_id,
   date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
   user_low_carbon
where
   substring(data_dt,1,4) = '2017'
group by
   user_id,data_dt
having
   sum(low_carbon)>=100)t1)t2)t3
where
    (lag2diff = 2 and lag1diff = 1) or
    (lag1diff = 1 and lead1diff = -1) or
    (lead1diff = -1 and lead2diff == -2);t4

-- 结果:
-- 少的一侧
user_id data_dt
u_002   2017-01-02
u_002   2017-01-03
u_002   2017-01-04
u_002   2017-01-05
u_005   2017-01-02


-- 多的一侧
select
    user_id,
    date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt
from
    user_low_carbon;t5




-- 第五步:使用join on多表查询出low_carbon

select
    t5.user_id user_id,
    t5.data_dt data_dt,
    t5.low_carbon low_carbon
from
(
select
    user_id,
    data_dt
from
    (select
    user_id,
    data_dt,
    datediff(data_dt,lag2) lag2diff,
    datediff(data_dt,lag1) lag1diff,
    datediff(data_dt,lead1) lead1diff,
    datediff(data_dt,lead2) lead2diff

from
    (select
    user_id,
    data_dt,
    lag(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lag2,
    lag(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lag1,
    lead(data_dt,1,"9999-99-99") over(partition by user_id order by data_dt) lead1,
    lead(data_dt,2,"9999-99-99") over(partition by user_id order by data_dt) lead2
from
    (select
   user_id,
   date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
   user_low_carbon
where
   substring(data_dt,1,4) = '2017'
group by
   user_id,data_dt
having
   sum(low_carbon)>=100)t1)t2)t3
where
    (lag2diff = 2 and lag1diff = 1) or
    (lag1diff = 1 and lead1diff = -1) or
    (lead1diff = -1 and lead2diff == -2)
)t4
join
(
select
    user_id,
    date_format(regexp_replace(data_dt,"/","-"),"yyyy-MM-dd") data_dt,
    low_carbon
from
    user_low_carbon
)t5
on  
    t4.user_id = t5.user_id 
    and 
    t4.data_dt = t5.data_dt;t6

2.2 解法二
-- 第一步:过滤出2017年且单日低碳量超过100g
select
    user_id,
    date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
    user_low_carbon
where
    substring(data_dt,1,4)='2017'
group by
    user_id,data_dt
having
    sum(low_carbon)>=100;t1

-- 结果:
user_id data_dt
u_001   2017-01-02
u_001   2017-01-06
u_002   2017-01-02

-- 第二步:按照日期进行排序,并给每一条数据一个标记

select
    user_id,
    data_dt,
    rank() over(partition by user_id order by data_dt) rk
from
    (select
    user_id,
    date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
    user_low_carbon
where
    substring(data_dt,1,4)='2017'
group by
    user_id,data_dt
having
    sum(low_carbon)>=100)t1;t2

-- 结果:
user_id data_dt rk
u_001   2017-01-02      1
u_001   2017-01-06      2
u_002   2017-01-02      1
u_002   2017-01-03      2
u_002   2017-01-04      3
u_002   2017-01-05      4
u_003   2017-01-02      1

-- 第三步:将日期减去当前的rank值
select
    user_id,
    data_dt,
    date_sub(data_dt,rk) date_sub_rk
from
    (select
    user_id,
    data_dt,
    rank() over(partition by user_id order by data_dt) rk
from
    (select
    user_id,
    date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
    user_low_carbon
where
    substring(data_dt,1,4)='2017'
group by
    user_id,data_dt
having
    sum(low_carbon)>=100)t1)t2;t3

-- 结果:
user_id data_dt date_sub_rk
u_001   2017-01-02      2017-01-01
u_001   2017-01-06      2017-01-04
u_002   2017-01-02      2017-01-01
u_002   2017-01-03      2017-01-01
u_002   2017-01-04      2017-01-01
u_002   2017-01-05      2017-01-01
u_003   2017-01-02      2017-01-01

-- 第四天:过滤出连续3天超过100g的用户

select
    user_id
from
    (select
    user_id,
    data_dt,
    date_sub(data_dt,rk) date_sub_rk
from
    (select
    user_id,
    data_dt,
    rank() over(partition by user_id order by data_dt) rk
from
    (select
    user_id,
    date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
    user_low_carbon
where
    substring(data_dt,1,4)='2017'
group by
    user_id,data_dt
having
    sum(low_carbon)>=100)t1)t2)t3
group by 
    user_id,date_sub_rk
having
    count(*)>=3;t4
-- 符合条件的用户结果:
user_id
u_002
u_005
u_008
u_009
u_010
u_011
u_013
u_014

-- 第五步:根据user_id多表查询出题目的结果
select
   user_low_carbon.user_id user_id,
   user_low_carbon.data_dt data_dt,
   user_low_carbon.low_carbon low_carbon
from
   (select
    user_id
from
    (select
    user_id,
    data_dt,
    date_sub(data_dt,rk) date_sub_rk
from
    (select
    user_id,
    data_dt,
    rank() over(partition by user_id order by data_dt) rk
from
    (select
    user_id,
    date_format(regexp_replace(data_dt,'/','-'),'yyyy-MM-dd') data_dt
from
    user_low_carbon
where
    substring(data_dt,1,4)='2017'
group by
    user_id,data_dt
having
    sum(low_carbon)>=100)t1)t2)t3
group by 
    user_id,date_sub_rk
having
    count(*)>=3)t4
join
   user_low_carbon
on
   t4.user_id = user_low_carbon.user_id;

-- 结果:
user_id data_dt low_carbon
u_002   2017/1/1        10
u_002   2017/1/2        150
u_002   2017/1/2        70
u_002   2017/1/3        30
u_002   2017/1/3        80
u_002   2017/1/4        150
u_002   2017/1/5        101
u_002   2017/1/6        68
u_005   2017/1/1        80

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值