Hive SQL 练习

雾岛与鲸

已于 2023-07-10 17:36:45 修改

阅读量1.3k

点赞数

分类专栏： hive 文章标签：大数据 hive

于 2020-09-04 16:08:19 首次发布

本文链接：https://blog.csdn.net/qq_36039236/article/details/108406198

版权

hive 专栏收录该内容

64 篇文章 15 订阅

订阅专栏

题目1

表student中的数据格式如下：
name month degree
s1 201801 A
s1 201802 A
s1 201803 C
s1 201804 A
s1 201805 A
s1 201806 A
s2 201801 A
s2 201802 B
s2 201803 C
s2 201804 A
s2 201805 D
s2 201806 A
s3 201801 C
s3 201802 A
s3 201803 A
s3 201804 A
s3 201805 B
s3 201806 A

现要查询表中连续三个月以上degree均为A的记录

select
    a1.name,
    a1.month,
    a1.degree
from
(
    select
        name,
        month,
        degree,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS score1,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 1 PRECEDING AND 1 following) AS score2,
        sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN  CURRENT ROW AND 2 following) AS score3
    from student
) as a1
where
    a1.score1 = 3 or
    a1.score2 = 3 or
    a1.score3 = 3;

题目2

文本数据如下：

name　　scores
张三    语文:78,数学:90,英语:82,历史:77,政治:80,物理:88,化学:79,地理:92,生物:86
李四    语文:68,数学:84,英语:79,历史:65,政治:66,物理:78,化学:72,地理:83,生物:75
王五    语文:88,数学:69,英语:86,历史:82,政治:70,物理:68,化学:89,地理:95,生物:93
朱六    语文:73,数学:76,英语:73,历史:76,政治:75,物理:85,化学:86,地理:85,生物:90
钱二    语文:68,数学:91,英语:92,历史:97,政治:83,物理:88,化学:77,地理:72,生物:70
段七    语文:86,数学:80,英语:85,历史:87,政治:88,物理:98,化学:69,地理:87,生物:74

建表语句：

create external table score_test(
    name string,
    scores map<String, int>
) 
row format delimited fields terminated by '\t' 
collection items terminated by ','     
map keys terminated by ':'   
lines terminated by '\n'

查询每个学生最好的课程和成绩，最差的课程和成绩，以及各科的平均分

select
    a.name,
    max(if(a.key1 = 1, a.subject, '')) as bad_subject,
    max(if(a.key1 = 1, a.score, 0)) as bad_score,
    max(if(a.key2 = 1, a.subject, '')) as good_subject,
    max(if(a.key2 = 1, a.score, 0)) as good_score,
    avg(a.score) as avg_score
from
(
    select
        name,
        subject,
        score,
        row_number() over(partition by name order by score) as key1,
        row_number() over(partition by name order by score desc) as key2
    from
        score_test LATERAL VIEW explode(scores) adTable AS subject, score
) as a
group by a.name
having bad_score > 0 and good_score > 0;

题目3

业务中涉及到学生退费的统计问题，月退费人数，求上月存在，这月不存在的学生个数。
创建测试表并插入测试数据:

create table test_aaa
(
    day string comment '日期',
    stu_id int comment '学生id' 
);

insert into test_aaa values 
("2020-01-02", 1001),
("2020-01-02", 1002),
("2020-02-02", 1001),
("2020-02-02", 1002),
("2020-02-02", 1003),
("2020-02-02", 1004),
("2020-03-02", 1001),
("2020-03-02", 1002),
("2020-04-02", 1005),
("2020-05-02", 1006);

sql 实现:

select 
    day,
    stu_id,
    next_month,
    next_stu_list,
    -- 使用 judge_1 = 0 来求本月没有的个数
    if(array_contains(next_stu_list, stu_id), 1, 0) as judge_1
from 
(
    select 
        a.day,
        a.stu_id,
        b.month as next_month,
        b.stu_list as next_stu_list 
    from 
    (
        select 
            day,     --  上一月的某一天
            stu_id
        from test_aaa 
    ) a 
    left join 
    (
        select 
            substring(day, 1, 7) as month,  -- 本月
            collect_set(stu_id) as stu_list 
        from test_aaa 
        group by substring(day, 1, 7)
    ) b 
    on substring(date_add(last_day(a.day), 1), 1, 7) = b.month
) temp;

题目4

字段:
班级id , 学生id, 课程id, 学生成绩
class_id, stu_id, course_id, score
求每个班级每个课程学生成绩的中位数。

sql实现:

with tmp_tb as 
(
    select 
        class_id, stu_id, course_id, score, cnt, rn 
    from 
    (
        select 
            class_id, stu_id, course_id, score, 
            row_number() over (partition by class_id, course_id order by score asc) as rn 
        from test
    ) a 
    left join 
    (
        select 
            class_id, course_id, 
            count(*) as cnt
        from test 
        group by class_id, course_id 
    ) b 
    on a.class_id = b.class_id and a.course_id = b.course_id
) 
select 
    class_id, course_id, 
    score 
from tmp_tb where cnt%2 != 0 and rn = (cnt+1)/2;
union all 
select 
    class_id, course_id, 
    score 
from tmp_tb where cnt%2 = 0 and rn = ((cnt/2)+(n/2+1))/2;

题目5

现有城市网吧访问数据，字段：网吧id，访客id（身份证号），上线时间，下线时间
规则1、如果有两个用户在一家网吧的前后上下线时间在10分钟以内，则两人可能认识
规则2、如果这两个用户在三家以上网吧出现【规则1】的情况，则两人一定认识

需求：
该城市上网用户中两人一定认识的组合数
该题可以选用自己擅长的任何技术来解决，可以是JAVA、Python、C、C++编程语言，也可以是Hadoop，Spark大数据工具

sql实现:

select 
    friend_id, 
    count(distinct wid) as visit_cnt 
from 
(
    select 
        a.wid, 
        concat_ws('_', a.uid, b.uid) as friend_id 
    from 
    (
        select 
            wid ， uid ，
            unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
            unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime 
        from test 
    ) a, 
    (
        select 
            wid ， uid ，
            unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
            unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime 
        from test 
    ) b 
    where a.wid = b.wid 
    	-- 注意这里不是 != , 因为a_b 和 b_a意义是相同的
        and a.uid > b.uid 
        and abs(a.ontime-b.ontime) <= 10*60 
        and abs(a.offtime-b.offtime) <= 10*60
) temp
group by friend_id 
having visit_cnt > 3;

题目6

创建测试表，并插入测试数据，用户名，关注用户

create table test_friend (
    uname string,
    fname string
);

insert into test_friend(uname, fname) values 
('a', 'b'),
('b', 'a'),
('a', 'c'),
('b', 'd'),
('c', 'd'),
('d', 'b');

求互相关注的用户对:

备注: 这里可以 hive的join 或者左半连接来完成，都是可以的

select
    a.user_id
    ,a.friend_id
from test1 a join test2 b 
on a.uer_id = b.friend_id and a.friend_id = b.user_id
where a.user_id > a.friend_id;

思路2:

with tmp_data as (
	select 1 as a, 2 as b 
	union all 
	select 2 as a, 1 as b 
	union all 
	select 1 as a, 3 as b 
	union all 
	select 3 as a, 1 as b 
	union all 
	select 1 as a, 5 as b 
	union all 
	select 1 as a, 6 as b 
)
select 
	a, b 
from 
(
	select   a
			,b
			,greatest(a, b)  as  max_val
			,least(a, b)  as  min_val 
			,count(*) over (partition by greatest(a, b), least(a, b)) as cnt 
	from tmp_data
) t1 
where cnt > 1

题目7

学生答题记录表 user_ques_record
字段 user_id,
question_id,
scroe,
record_time
学生每做一道题目就会在表里记录一条

求8月22号学生答的第一题以及得分，最后一题以及得分，输出结果如下
user_id first_question_id first_question_scroe last_question_id last_question_scroe

select 
    user_id, 
    sum(if(rn1 = 1,question_id,null)) as first_question_id,
    sum(if(rn1 = 1,scroe,null)) as first_question_scroe,
    sum(if(rn2 = 1,question_id,null)) as last_question_id,
    sum(if(rn2 = 1,scroe,null)) as last_question_scroe
from 
(
select 
    user_id, 
    question_id,
    scroe,
    record_time,
    row_number() over (partition by user_id order by record_time asc) as rn1,
    row_number() over (partition by user_id order by record_time desc) as rn2
from user_ques_record 
where date_format(record_time, 'yyyy-MM-dd') = '2020-08-22'
) t1 
where rn1 = 1 or rn2 = 1 
group by user_id;

题目8

直播间出勤峰值
user_id, home_id, in_time, end_time 记录用户进出直播间时间，求每个直播间的出勤峰值

sql实现

这个单独写了一篇博客，可以参考一下

题目9

求股票波峰波谷
股票id, time(min), price 求每个股票的所有波峰和波谷？

select 
	id, time, price, last_value, next_value,
	if(price > last_value and price > next_value, 1, 0) as bo_feng,
	if(price < last_value and price < next_value, 1, 0) as bo_gu
from 
(
	select id, time, price, 
		lag(price, 1, 0) over (partition by id order by time) as last_value, 
		lead(price, 1, 0) over(partition by id order by time) as next_value
	from tb
) t1;

哎，遇到了一个**, 没办法啊～～～～～这种我也不常用啊！！！！！！！！～～～～～～nngt

对于和历史数据做对比的情况，举个例子，
订单情况，需要和历史的每条做对比，这种情况应该怎么做？

订单表:
order_id(订单id), user_id(学生id), lesson_id(课程id), subject_id(学科id), order_time(订单时间)

求当前订单之前， 非当前学科，存在多少课程 和 学科

select 
	order_id, user_id, lesson_id, subject_id, order_time,
	if(array_contains(subject_set, subject_id), size-1, size) as sub_cnt   -- 不考虑当前学科， 存在多少学科
	if(array_contains(lesson_set, "None"), lesson_set_size-1, lesson_set_size) as lesson_cnt   -- 不考虑当前学科，存在多少课程
from 
(
	select 
		t1.order_id, t1.user_id, t1.lesson_id, t1.subject_id, t1.order_time,
		-- 从当前行往后一行到末尾
		collect_set(subject_id) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set,
		size(collect_set(subject_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set_size, 
		-- 一个学科下有多个课程
		collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set,
		size(collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id))) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set_size
	from 
	(
		select order_id, user_id, lesson_id, subject_id, order_time
		from tb 
	) t1
	left join
	(
		select subject_id, 
			collect_set(lesson_id) as les_set
		from tb
		group by subject_id
	) t2 
	on t1.subject_id = t2.subject_id
) tmp;


上面写的有一些问题，课程那块不对 (科目算的问题不大，主要是课程，课程这样来写)

注意: 这个问题涉及到一个如何在array中过滤数据操作，使用 udtf (expolde函数) 进行相关处理， 把数组先打开，再过滤
select 
	rn, order_id, user_id, lesson_id, subject_id, order_time,
	sum(if(les_obj_info.subject_id != subject_id, 1, 0)) as lesson_num
from 
(
	select 
		rn, order_id, user_id, lesson_id, subject_id, order_time, les_obj_info
	from 
	(
		select 
				order_id, user_id, lesson_id, subject_id, order_time,
				row_number() over (partition by user_id order by order_time desc) as rn, 
				-- 一个学科下有多个课程
				collect_set(les_obj) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set
			from 
		(
				select order_id, user_id, lesson_id, subject_id, order_time,
					named_struct("subject_id", subject_id, "lesson_id", lesson_id) as les_obj
				from tb 
		) t1
	) t2 
	lateral view explode(lesson_set) as les_obj_info
) t3
group by rn, order_id, user_id, lesson_id, subject_id, order_time;

题目11

给出如下用户登陆信息
u_id, event_date, event_name
用户id, 登陆日期，事件名称
求每天的新用户和当天新用户的次日留存率(保留两位小数)

select 
	event_date, 
	sum(if(rn=1,1,0)) as cnt,
	sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0)) as cnt2,
	round(sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0))/sum(if(rn=1,1,0)), 2) as result
from 
(
	select 
		u_id, 
		event_date,
		row_number() over (partition by u_id order by event_date) as rn,
		lead(event_date, 1) over (partition by u_id order by event_date) as next_date
	from
	(
		select 
			distinct u_id, event_date
		from tb
	) t1
) t2 
group by event_date;

雾岛与鲸

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
打赏
1
评论
Hive SQL 练习

目录题目1题目1表student中的数据格式如下：name month degrees1 201801 As1 201802 As1 201803 Cs1 201804 As1 201805 As1 201806 As2 201801 As2 201802 Bs2 201803 Cs2 201804 As2 201805 Ds2 201806 As3 201801 Cs3 201802 As3 201803 As3 201804 As3 201805 Bs3 201
复制链接

扫一扫