Hive SQL 练习

题目1

表student中的数据格式如下:
name month degree
s1 201801 A
s1 201802 A
s1 201803 C
s1 201804 A
s1 201805 A
s1 201806 A
s2 201801 A
s2 201802 B
s2 201803 C
s2 201804 A
s2 201805 D
s2 201806 A
s3 201801 C
s3 201802 A
s3 201803 A
s3 201804 A
s3 201805 B
s3 201806 A

现要查询表中连续三个月以上degree均为A的记录

select
    a1.name,
    a1.month,
    a1.degree
from
(
    select
        name,
        month,
        degree,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS score1,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 1 PRECEDING AND 1 following) AS score2,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN  CURRENT ROW AND 2 following) AS score3
    from student
) as a1
where
    a1.score1 = 3 or
    a1.score2 = 3 or
    a1.score3 = 3;

题目2

文本数据如下:

name  scores
张三    语文:78,数学:90,英语:82,历史:77,政治:80,物理:88,化学:79,地理:92,生物:86
李四    语文:68,数学:84,英语:79,历史:65,政治:66,物理:78,化学:72,地理:83,生物:75
王五    语文:88,数学:69,英语:86,历史:82,政治:70,物理:68,化学:89,地理:95,生物:93
朱六    语文:73,数学:76,英语:73,历史:76,政治:75,物理:85,化学:86,地理:85,生物:90
钱二    语文:68,数学:91,英语:92,历史:97,政治:83,物理:88,化学:77,地理:72,生物:70
段七    语文:86,数学:80,英语:85,历史:87,政治:88,物理:98,化学:69,地理:87,生物:74

建表语句:

create external table score_test(
    name string,
    scores map<String, int>
) 
row format delimited fields terminated by '\t' 
collection items terminated by ','     
map keys terminated by ':'   
lines terminated by '\n'

查询每个学生最好的课程和成绩,最差的课程和成绩,以及各科的平均分

select
    a.name,
    max(if(a.key1 = 1, a.subject, '')) as bad_subject,
    max(if(a.key1 = 1, a.score, 0)) as bad_score,
    max(if(a.key2 = 1, a.subject, '')) as good_subject,
    max(if(a.key2 = 1, a.score, 0)) as good_score,
    avg(a.score) as avg_score
from
(
    select
        name,
        subject,
        score,
        row_number() over(partition by name order by score) as key1,
        row_number() over(partition by name order by score desc) as key2
    from
        score_test LATERAL VIEW explode(scores) adTable AS subject, score
) as a
group by a.name
having bad_score > 0 and good_score > 0;

题目3

业务中涉及到学生退费的统计问题,月退费人数,求上月存在,这月不存在的学生个数。
创建测试表并插入测试数据:

create table test_aaa
(
    day string comment '日期',
    stu_id int comment '学生id' 
);

insert into test_aaa values 
("2020-01-02", 1001),
("2020-01-02", 1002),
("2020-02-02", 1001),
("2020-02-02", 1002),
("2020-02-02", 1003),
("2020-02-02", 1004),
("2020-03-02", 1001),
("2020-03-02", 1002),
("2020-04-02", 1005),
("2020-05-02", 1006);

sql 实现:

select 
    day,
    stu_id,
    next_month,
    next_stu_list,
    -- 使用 judge_1 = 0 来求本月没有的个数
    if(array_contains(next_stu_list, stu_id), 1, 0) as judge_1
from 
(
    select 
        a.day,
        a.stu_id,
        b.month as next_month,
        b.stu_list as next_stu_list 
    from 
    (
        select 
            day,     --  上一月的某一天
            stu_id
        from test_aaa 
    ) a 
    left join 
    (
        select 
            substring(day, 1, 7) as month,  -- 本月
            collect_set(stu_id) as stu_list 
        from test_aaa 
        group by substring(day, 1, 7)
    ) b 
    on substring(date_add(last_day(a.day), 1), 1, 7) = b.month
) temp;

题目4

字段:
班级id , 学生id, 课程id, 学生成绩
class_id, stu_id, course_id, score
求每个班级每个课程学生成绩的中位数。

sql实现:

with tmp_tb as 
(
    select 
        class_id, stu_id, course_id, score, cnt, rn 
    from 
    (
        select 
            class_id, stu_id, course_id, score, 
            row_number() over (partition by class_id, course_id order by score asc) as rn 
        from test
    ) a 
    left join 
    (
        select 
            class_id, course_id, 
            count(*) as cnt
        from test 
        group by class_id, course_id 
    ) b 
    on a.class_id = b.class_id and a.course_id = b.course_id
) 
select 
    class_id, course_id, 
    score 
from tmp_tb where cnt%2 != 0 and rn = (cnt+1)/2;
union all 
select 
    class_id, course_id, 
    score 
from tmp_tb where cnt%2 = 0 and rn = ((cnt/2)+(n/2+1))/2;

题目5

现有城市网吧访问数据,字段:网吧id,访客id(身份证号),上线时间,下线时间
规则1、如果有两个用户在一家网吧的前后上下线时间在10分钟以内,则两人可能认识
规则2、如果这两个用户在三家以上网吧出现【规则1】的情况,则两人一定认识

需求:
该城市上网用户中两人一定认识的组合数
该题可以选用自己擅长的任何技术来解决,可以是JAVA、Python、C、C++编程语言,也可以是Hadoop,Spark大数据工具

sql实现:

select 
    friend_id, 
    count(distinct wid) as visit_cnt 
from 
(
    select 
        a.wid, 
        concat_ws('_', a.uid, b.uid) as friend_id 
    from 
    (
        select 
            wid , uid ,
            unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
            unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime 
        from test 
    ) a, 
    (
        select 
            wid , uid ,
            unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
            unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime 
        from test 
    ) b 
    where a.wid = b.wid 
    	-- 注意这里不是 != , 因为a_b 和 b_a意义是相同的
        and a.uid > b.uid 
        and abs(a.ontime-b.ontime) <= 10*60 
        and abs(a.offtime-b.offtime) <= 10*60
) temp
group by friend_id 
having visit_cnt > 3;

题目6

创建测试表,并插入测试数据, 用户名,关注用户

create table test_friend (
    uname string,
    fname string
);

insert into test_friend(uname, fname) values 
('a', 'b'),
('b', 'a'),
('a', 'c'),
('b', 'd'),
('c', 'd'),
('d', 'b');

求互相关注的用户对:

备注: 这里可以 hive的join 或者左半连接来完成,都是可以的

select
    a.user_id
    ,a.friend_id
from test1 a join test2 b 
on a.uer_id = b.friend_id and a.friend_id = b.user_id
where a.user_id > a.friend_id;

思路2:

with tmp_data as (
	select 1 as a, 2 as b 
	union all 
	select 2 as a, 1 as b 
	union all 
	select 1 as a, 3 as b 
	union all 
	select 3 as a, 1 as b 
	union all 
	select 1 as a, 5 as b 
	union all 
	select 1 as a, 6 as b 
)
select 
	a, b 
from 
(
	select   a
			,b
			,greatest(a, b)  as  max_val
			,least(a, b)  as  min_val 
			,count(*) over (partition by greatest(a, b), least(a, b)) as cnt 
	from tmp_data
) t1 
where cnt > 1

题目7

学生答题记录表 user_ques_record
字段 user_id,
question_id,
scroe,
record_time
学生每做一道题目就会在表里记录一条

求8月22号学生答的第一题以及得分,最后一题以及得分,输出结果如下
user_id first_question_id first_question_scroe last_question_id last_question_scroe

select 
    user_id, 
    sum(if(rn1 = 1,question_id,null)) as first_question_id,
    sum(if(rn1 = 1,scroe,null)) as first_question_scroe,
    sum(if(rn2 = 1,question_id,null)) as last_question_id,
    sum(if(rn2 = 1,scroe,null)) as last_question_scroe
from 
(
select 
    user_id, 
    question_id,
    scroe,
    record_time,
    row_number() over (partition by user_id order by record_time asc) as rn1,
    row_number() over (partition by user_id order by record_time desc) as rn2
from user_ques_record 
where date_format(record_time, 'yyyy-MM-dd') = '2020-08-22'
) t1 
where rn1 = 1 or rn2 = 1 
group by user_id;

题目8

直播间出勤峰值
user_id, home_id, in_time, end_time 记录用户进出直播间时间,求每个直播间的出勤峰值

sql实现

这个单独写了一篇博客,可以参考一下

题目9

求股票波峰波谷
股票id, time(min), price 求每个股票的所有波峰和波谷?

select 
	id, time, price, last_value, next_value,
	if(price > last_value and price > next_value, 1, 0) as bo_feng,
	if(price < last_value and price < next_value, 1, 0) as bo_gu
from 
(
	select id, time, price, 
		lag(price, 1, 0) over (partition by id order by time) as last_value, 
		lead(price, 1, 0) over(partition by id order by time) as next_value
	from tb
) t1;

哎,遇到了一个**, 没办法啊 ~~~~~这种我也不常用啊!!!!!!!!~~~~~~nngt

对于和历史数据做对比的情况,举个例子,
订单情况,需要和历史的每条做对比,这种情况应该怎么做?

订单表:
order_id(订单id), user_id(学生id), lesson_id(课程id), subject_id(学科id), order_time(订单时间)

求当前订单之前, 非当前学科,存在多少课程 和 学科

select 
	order_id, user_id, lesson_id, subject_id, order_time,
	if(array_contains(subject_set, subject_id), size-1, size) as sub_cnt   -- 不考虑当前学科, 存在多少学科
	if(array_contains(lesson_set, "None"), lesson_set_size-1, lesson_set_size) as lesson_cnt   -- 不考虑当前学科,存在多少课程
from 
(
	select 
		t1.order_id, t1.user_id, t1.lesson_id, t1.subject_id, t1.order_time,
		-- 从当前行往后一行到末尾
		collect_set(subject_id) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set,
		size(collect_set(subject_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set_size, 
		-- 一个学科下有多个课程
		collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set,
		size(collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id))) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set_size
	from 
	(
		select order_id, user_id, lesson_id, subject_id, order_time
		from tb 
	) t1
	left join
	(
		select subject_id, 
			collect_set(lesson_id) as les_set
		from tb
		group by subject_id
	) t2 
	on t1.subject_id = t2.subject_id
) tmp;


上面写的有一些问题,课程那块不对 (科目算的问题不大,主要是课程,课程这样来写)

注意: 这个问题涉及到一个如何在array中过滤数据操作,使用 udtf (expolde函数) 进行相关处理, 把数组先打开,再过滤
select 
	rn, order_id, user_id, lesson_id, subject_id, order_time,
	sum(if(les_obj_info.subject_id != subject_id, 1, 0)) as lesson_num
from 
(
	select 
		rn, order_id, user_id, lesson_id, subject_id, order_time, les_obj_info
	from 
	(
		select 
				order_id, user_id, lesson_id, subject_id, order_time,
				row_number() over (partition by user_id order by order_time desc) as rn, 
				-- 一个学科下有多个课程
				collect_set(les_obj) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set
			from 
		(
				select order_id, user_id, lesson_id, subject_id, order_time,
					named_struct("subject_id", subject_id, "lesson_id", lesson_id) as les_obj
				from tb 
		) t1
	) t2 
	lateral view explode(lesson_set) as les_obj_info
) t3
group by rn, order_id, user_id, lesson_id, subject_id, order_time;

题目11

给出如下用户登陆信息
u_id, event_date, event_name
用户id, 登陆日期,事件名称
求每天的新用户和当天新用户的次日留存率(保留两位小数)

select 
	event_date, 
	sum(if(rn=1,1,0)) as cnt,
	sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0)) as cnt2,
	round(sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0))/sum(if(rn=1,1,0)), 2) as result
from 
(
	select 
		u_id, 
		event_date,
		row_number() over (partition by u_id order by event_date) as rn,
		lead(event_date, 1) over (partition by u_id order by event_date) as next_date
	from
	(
		select 
			distinct u_id, event_date
		from tb
	) t1
) t2 
group by event_date;
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

雾岛与鲸

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值