题目1
表student中的数据格式如下:
name month degree
s1 201801 A
s1 201802 A
s1 201803 C
s1 201804 A
s1 201805 A
s1 201806 A
s2 201801 A
s2 201802 B
s2 201803 C
s2 201804 A
s2 201805 D
s2 201806 A
s3 201801 C
s3 201802 A
s3 201803 A
s3 201804 A
s3 201805 B
s3 201806 A
现要查询表中连续三个月以上degree均为A的记录
select
a1.name,
a1.month,
a1.degree
from
(
select
name,
month,
degree,
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS score1,
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 1 PRECEDING AND 1 following) AS score2,
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN CURRENT ROW AND 2 following) AS score3
from student
) as a1
where
a1.score1 = 3 or
a1.score2 = 3 or
a1.score3 = 3;
题目2
文本数据如下:
name scores
张三 语文:78,数学:90,英语:82,历史:77,政治:80,物理:88,化学:79,地理:92,生物:86
李四 语文:68,数学:84,英语:79,历史:65,政治:66,物理:78,化学:72,地理:83,生物:75
王五 语文:88,数学:69,英语:86,历史:82,政治:70,物理:68,化学:89,地理:95,生物:93
朱六 语文:73,数学:76,英语:73,历史:76,政治:75,物理:85,化学:86,地理:85,生物:90
钱二 语文:68,数学:91,英语:92,历史:97,政治:83,物理:88,化学:77,地理:72,生物:70
段七 语文:86,数学:80,英语:85,历史:87,政治:88,物理:98,化学:69,地理:87,生物:74
建表语句:
create external table score_test(
name string,
scores map<String, int>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
查询每个学生最好的课程和成绩,最差的课程和成绩,以及各科的平均分
select
a.name,
max(if(a.key1 = 1, a.subject, '')) as bad_subject,
max(if(a.key1 = 1, a.score, 0)) as bad_score,
max(if(a.key2 = 1, a.subject, '')) as good_subject,
max(if(a.key2 = 1, a.score, 0)) as good_score,
avg(a.score) as avg_score
from
(
select
name,
subject,
score,
row_number() over(partition by name order by score) as key1,
row_number() over(partition by name order by score desc) as key2
from
score_test LATERAL VIEW explode(scores) adTable AS subject, score
) as a
group by a.name
having bad_score > 0 and good_score > 0;
题目3
业务中涉及到学生退费的统计问题,月退费人数,求上月存在,这月不存在的学生个数。
创建测试表并插入测试数据:
create table test_aaa
(
day string comment '日期',
stu_id int comment '学生id'
);
insert into test_aaa values
("2020-01-02", 1001),
("2020-01-02", 1002),
("2020-02-02", 1001),
("2020-02-02", 1002),
("2020-02-02", 1003),
("2020-02-02", 1004),
("2020-03-02", 1001),
("2020-03-02", 1002),
("2020-04-02", 1005),
("2020-05-02", 1006);
sql 实现:
select
day,
stu_id,
next_month,
next_stu_list,
-- 使用 judge_1 = 0 来求本月没有的个数
if(array_contains(next_stu_list, stu_id), 1, 0) as judge_1
from
(
select
a.day,
a.stu_id,
b.month as next_month,
b.stu_list as next_stu_list
from
(
select
day, -- 上一月的某一天
stu_id
from test_aaa
) a
left join
(
select
substring(day, 1, 7) as month, -- 本月
collect_set(stu_id) as stu_list
from test_aaa
group by substring(day, 1, 7)
) b
on substring(date_add(last_day(a.day), 1), 1, 7) = b.month
) temp;
题目4
字段:
班级id , 学生id, 课程id, 学生成绩
class_id, stu_id, course_id, score
求每个班级每个课程学生成绩的中位数。
sql实现:
with tmp_tb as
(
select
class_id, stu_id, course_id, score, cnt, rn
from
(
select
class_id, stu_id, course_id, score,
row_number() over (partition by class_id, course_id order by score asc) as rn
from test
) a
left join
(
select
class_id, course_id,
count(*) as cnt
from test
group by class_id, course_id
) b
on a.class_id = b.class_id and a.course_id = b.course_id
)
select
class_id, course_id,
score
from tmp_tb where cnt%2 != 0 and rn = (cnt+1)/2;
union all
select
class_id, course_id,
score
from tmp_tb where cnt%2 = 0 and rn = ((cnt/2)+(n/2+1))/2;
题目5
现有城市网吧访问数据,字段:网吧id,访客id(身份证号),上线时间,下线时间
规则1、如果有两个用户在一家网吧的前后上下线时间在10分钟以内,则两人可能认识
规则2、如果这两个用户在三家以上网吧出现【规则1】的情况,则两人一定认识
需求:
该城市上网用户中两人一定认识的组合数
该题可以选用自己擅长的任何技术来解决,可以是JAVA、Python、C、C++编程语言,也可以是Hadoop,Spark大数据工具
sql实现:
select
friend_id,
count(distinct wid) as visit_cnt
from
(
select
a.wid,
concat_ws('_', a.uid, b.uid) as friend_id
from
(
select
wid , uid ,
unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime
from test
) a,
(
select
wid , uid ,
unix_timestamp(ontime,'yyyy-MM-dd HH:mm:ss') as ontime,
unix_timestamp(offtime,'yyyy-MM-dd HH:mm:ss') as offtime
from test
) b
where a.wid = b.wid
-- 注意这里不是 != , 因为a_b 和 b_a意义是相同的
and a.uid > b.uid
and abs(a.ontime-b.ontime) <= 10*60
and abs(a.offtime-b.offtime) <= 10*60
) temp
group by friend_id
having visit_cnt > 3;
题目6
创建测试表,并插入测试数据, 用户名,关注用户
create table test_friend (
uname string,
fname string
);
insert into test_friend(uname, fname) values
('a', 'b'),
('b', 'a'),
('a', 'c'),
('b', 'd'),
('c', 'd'),
('d', 'b');
求互相关注的用户对:
备注: 这里可以 hive的join 或者左半连接来完成,都是可以的
select
a.user_id
,a.friend_id
from test1 a join test2 b
on a.uer_id = b.friend_id and a.friend_id = b.user_id
where a.user_id > a.friend_id;
思路2:
with tmp_data as (
select 1 as a, 2 as b
union all
select 2 as a, 1 as b
union all
select 1 as a, 3 as b
union all
select 3 as a, 1 as b
union all
select 1 as a, 5 as b
union all
select 1 as a, 6 as b
)
select
a, b
from
(
select a
,b
,greatest(a, b) as max_val
,least(a, b) as min_val
,count(*) over (partition by greatest(a, b), least(a, b)) as cnt
from tmp_data
) t1
where cnt > 1
题目7
学生答题记录表 user_ques_record
字段 user_id,
question_id,
scroe,
record_time
学生每做一道题目就会在表里记录一条
求8月22号学生答的第一题以及得分,最后一题以及得分,输出结果如下
user_id first_question_id first_question_scroe last_question_id last_question_scroe
select
user_id,
sum(if(rn1 = 1,question_id,null)) as first_question_id,
sum(if(rn1 = 1,scroe,null)) as first_question_scroe,
sum(if(rn2 = 1,question_id,null)) as last_question_id,
sum(if(rn2 = 1,scroe,null)) as last_question_scroe
from
(
select
user_id,
question_id,
scroe,
record_time,
row_number() over (partition by user_id order by record_time asc) as rn1,
row_number() over (partition by user_id order by record_time desc) as rn2
from user_ques_record
where date_format(record_time, 'yyyy-MM-dd') = '2020-08-22'
) t1
where rn1 = 1 or rn2 = 1
group by user_id;
题目8
直播间出勤峰值
user_id, home_id, in_time, end_time 记录用户进出直播间时间,求每个直播间的出勤峰值
sql实现
这个单独写了一篇博客,可以参考一下
题目9
求股票波峰波谷
股票id, time(min), price 求每个股票的所有波峰和波谷?
select
id, time, price, last_value, next_value,
if(price > last_value and price > next_value, 1, 0) as bo_feng,
if(price < last_value and price < next_value, 1, 0) as bo_gu
from
(
select id, time, price,
lag(price, 1, 0) over (partition by id order by time) as last_value,
lead(price, 1, 0) over(partition by id order by time) as next_value
from tb
) t1;
哎,遇到了一个**, 没办法啊 ~~~~~这种我也不常用啊!!!!!!!!~~~~~~nngt
对于和历史数据做对比的情况,举个例子,
订单情况,需要和历史的每条做对比,这种情况应该怎么做?
订单表:
order_id(订单id), user_id(学生id), lesson_id(课程id), subject_id(学科id), order_time(订单时间)
求当前订单之前, 非当前学科,存在多少课程 和 学科
select
order_id, user_id, lesson_id, subject_id, order_time,
if(array_contains(subject_set, subject_id), size-1, size) as sub_cnt -- 不考虑当前学科, 存在多少学科
if(array_contains(lesson_set, "None"), lesson_set_size-1, lesson_set_size) as lesson_cnt -- 不考虑当前学科,存在多少课程
from
(
select
t1.order_id, t1.user_id, t1.lesson_id, t1.subject_id, t1.order_time,
-- 从当前行往后一行到末尾
collect_set(subject_id) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set,
size(collect_set(subject_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as subject_set_size,
-- 一个学科下有多个课程
collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id)) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set,
size(collect_set(if(array_contains(les_set,lesson_id), "None", lesson_id))) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set_size
from
(
select order_id, user_id, lesson_id, subject_id, order_time
from tb
) t1
left join
(
select subject_id,
collect_set(lesson_id) as les_set
from tb
group by subject_id
) t2
on t1.subject_id = t2.subject_id
) tmp;
上面写的有一些问题,课程那块不对 (科目算的问题不大,主要是课程,课程这样来写)
注意: 这个问题涉及到一个如何在array中过滤数据操作,使用 udtf (expolde函数) 进行相关处理, 把数组先打开,再过滤
select
rn, order_id, user_id, lesson_id, subject_id, order_time,
sum(if(les_obj_info.subject_id != subject_id, 1, 0)) as lesson_num
from
(
select
rn, order_id, user_id, lesson_id, subject_id, order_time, les_obj_info
from
(
select
order_id, user_id, lesson_id, subject_id, order_time,
row_number() over (partition by user_id order by order_time desc) as rn,
-- 一个学科下有多个课程
collect_set(les_obj) over (partition by user_id order by order_time desc rows between 1 following and unbounded following) as lesson_set
from
(
select order_id, user_id, lesson_id, subject_id, order_time,
named_struct("subject_id", subject_id, "lesson_id", lesson_id) as les_obj
from tb
) t1
) t2
lateral view explode(lesson_set) as les_obj_info
) t3
group by rn, order_id, user_id, lesson_id, subject_id, order_time;
题目11
给出如下用户登陆信息
u_id, event_date, event_name
用户id, 登陆日期,事件名称
求每天的新用户和当天新用户的次日留存率(保留两位小数)
select
event_date,
sum(if(rn=1,1,0)) as cnt,
sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0)) as cnt2,
round(sum(if(rn=2 and date_add(event_date,1)=next_date, 1, 0))/sum(if(rn=1,1,0)), 2) as result
from
(
select
u_id,
event_date,
row_number() over (partition by u_id order by event_date) as rn,
lead(event_date, 1) over (partition by u_id order by event_date) as next_date
from
(
select
distinct u_id, event_date
from tb
) t1
) t2
group by event_date;