第一题
数据
1001 01 90
1001 02 90
1001 03 90
1002 01 85
1002 02 85
1002 03 70
1003 01 70
1003 02 70
1003 03 85
建表语句
create table score
(
uid string
,subject_id string
,score int
)
row format delimited fields terminated by '\t'
;
load data local inpath '/home/hive/score.txt' into table score;
select * from score;
需求
找出所有科目成绩都大于某一学科平均成绩的学生
sql
select uid
from (
select uid
,if(score>avg_score,0,1) flag
from (
select uid
,score
,avg(score) over(partition by subject_id) avg_score
from score
) t1
) t2
group by uid
having sum(flag) = 0
;
第二题
数据
u01 2017/01/21 5
u02 2017/01/23 6
u03 2017/01/22 8
u04 2017/01/20 3
u01 2017/01/23 6
u01 2017/02/21 8
u02 2017/01/23 6
u01 2017/02/22 4
建表语句
create table action
(
userId string
,visitDate string
,visitCount int
)
row format delimited fields terminated by '\t';
load data local inpath '/home/hive_test_data/action.txt' into table action;
select * from action;
需求
统计出每个用户的累积访问次数,如下表所示:
用户id | 月份 | 小计 | 累积 |
u01 | 2017-01 | 11 | 11 |
u01 | 2017-02 | 12 | 23 |
u02 | 2017-01 | 12 | 12 |
u03 | 2017-01 | 8 | 8 |
u04 | 2017-01 | 3 | 3 |
sql
with t1 as (
select userId
,date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') visitDate
,visitCount
from action
)
,t2 as (
select userId
,visitDate
,sum(visitCount) visitCount
from t1
group by userId
,visitDate
)
select userId
,visitDate
,visitCount
,sum(visitCount) over (partition by userId order by visitDate rows between unbounded preceding and current row) visitCount_total
from t2
;
第三题
数据
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
建表语句
create table visit
(
user_id string
,shop string
)
row format delimited fields terminated by '\t';
load data local inpath '/home/hive_test_data/visit.txt' into table visit;
select * from visit;
需求
有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
1)每个店铺的UV(访客数)
2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
sql
select shop
,count(distinct user_id) uv
from visit
group by shop
;
with temp as (
select shop
,user_id
,count(1) visit_count
from visit
group by user_id,shop
)
,rn as (
select shop
,user_id
,visit_count
,rank() over(partition by shop order by visit_count desc) as rn
from temp
)
select shop
,user_id
,visit_count
from rn
where rn <= 3
;
第四题
数据
2017-01-01 10029028 1000003251 33.57
2017-01-10 11039016 1100003000 25.50
2017-01-25 11039015 1101003001 95.50
2017-02-01 11029028 1000003251 30.57
2017-02-10 11029016 1100003000 15.50
2017-03-01 10029030 1000003251 33.57
2017-03-10 11039017 1100003000 25.50
2017-03-25 11039080 1101003001 95.50
2017-04-10 21039017 1100003000 75.15
2017-04-25 21039080 1101003001 45.59
2017-05-07 20029028 1000003251 32.57
2017-05-10 41039016 1100003000 20.50
2017-06-22 31039015 1101003001 15.50
2017-07-08 60029028 1000003251 39.57
2017-07-11 51039016 1100003000 28.50
2017-08-01 70029028 1000003251 83.57
2017-09-10 81039016 1100003000 65.50
2017-10-25 91039015 1101003001 25.50
2017-11-10 10039016 1100003000 65.50
2017-11-25 10039015 1201003001 25.50
2017-12-01 11020028 1000003251 33.57
2017-12-10 12039016 1100003000 25.50
2017-12-25 14039015 1101003001 95.50
2017-12-25 10039089 1201003001 25.50
建表语句
create table order_tab
(
dt string
,order_id string
,user_id string
,amount decimal(10,2)
)
row format delimited fields terminated by '\t';
load data local inpath '/home/hive_test_data/order_tab.txt' into table order_tab;
select * from order_tab;
需求
1)给出 2017年每个月的订单数、用户数、总成交金额。
2)给出2017年11月的新客数(指在11月才有第一笔订单)
sql
select date_format(dt,'yyyy-MM') dt
,count(order_id) orders
,count(distinct user_id) users
,sum(amount) amount
from order_tab
where year(dt) = '2017'
group by date_format(dt,'yyyy-MM')
;
select count(user_id) new_users
from order_tab
group by user_id
having date_format(min(dt),'yyyy-MM') = '2017-11'
;
第五题
数据
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
建表语句
create table user_age
(
dt string
,user_id string
,age int
)
row format delimited fields terminated by ',';
load data local inpath '/home/hive_test_data/user_age.txt' into table user_age;
select * from user_age;
需求
求所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)
sql
with temp as (
select dt
,user_id
,age
from user_age
group by dt
,user_id
,age
)
,temp2 as (
select distinct user_id
,age
from temp
)
,res1 as (
select '所有用户' as user_type
,count(1) users
,cast(sum(age)/count(user_id) as decimal(10,2)) avg_age
from temp2
)
,lead_dt as (
select dt
,user_id
,age
,lead(dt,1,'9999-12-30') over(partition by user_id order by dt) dt_1
from temp
)
,continue_users as (
select user_id
,age
from lead_dt
where datediff(dt_1,dt) = 1
)
,continue_users2 as (
select distinct user_id
,age
from continue_users
)
,res2 as (
select '活跃用户' as user_type
,count(1) users
,cast(sum(age)/count(1) as decimal(10,2)) avg_age
from continue_users2
)
select *from res1
union all
select *from res2
;
第六题
数据
1000003251 33 2017-01-01 10029028
1100003000 25 2017-01-10 11039016
1101003001 95 2017-01-25 11039015
1000003251 30 2017-02-01 11029028
1100003000 15 2017-02-10 11029016
1000003251 33 2017-03-01 10029030
1100003000 25 2017-03-10 11039017
1101003001 95 2017-03-25 11039080
1100003000 75 2017-04-10 21039017
1101003001 45 2017-04-25 21039080
1000003251 32 2017-05-07 20029028
1100003000 20 2017-05-10 41039016
1101003001 15 2017-06-22 31039015
1000003251 39 2017-07-08 60029028
1100003000 28 2017-07-11 51039016
1000003251 83 2017-08-01 70029028
1100003000 65 2017-09-10 81039016
1101003001 25 2017-10-25 91039015
1010013251 50 2017-10-26 10129028
1100003000 65 2017-11-10 10039016
1201003001 25 2017-11-25 10039015
1000003251 33 2017-12-01 11020028
1100003000 25 2017-12-10 12039016
1101003001 95 2017-12-25 14039015
1201003001 25 2017-12-25 10039089
1010013251 30 2017-12-26 12129028
建表语句
create table ordertable
(
userid string
,money int
,paymenttime string
,orderid string
)
row format delimited fields terminated by '\t';
load data local inpath '/home/hive_test_data/ordertable.txt' into table ordertable;
select * from ordertable;
需求
表ordertable字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)
请用sql写出所有用户中在今年10月份第一次购买商品的金额
sql
with temp as (
select userid
,min(paymenttime) paymenttime
from ordertable
where date_format(paymenttime,'yyyy-MM') = '2017-10'
group by userid
)
select temp.userid
,temp.paymenttime
,ordertable.money
from temp
,ordertable
where temp.userid = ordertable.userid
and temp.paymenttime = ordertable.paymenttime
;
第七题
数据
2016-11-09 14:22:05 /api/user/login 110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 14:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.34
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:22:05 /api/user/login 110.23.5.35
2016-11-09 14:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
2016-11-09 14:59:40 /api/user/login 200.6.5.166
2016-11-09 14:59:40 /api/user/login 200.6.5.166
建表语句
create table ip
(
time string
,interface string
,ip string
)
row format delimited fields terminated by '\t';
load data local inpath '/home/hive_test_data/ip.txt' into table ip;
select * from ip;
需求
求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址
sql
select ip
,count(1) visit_count
from ip
where date_format(time,'yyyy-MM-dd HH') >= '2016-11-09 14'
and date_format(time,'yyyy-MM-dd HH') <= '2016-11-09 15'
and interface = 'api/user/login'
group by ip
order by visit_count desc
limit 10
;