作业:返回每一门课程和对应的最高分的学生姓名
t_score_data.txt
zhangsan math:90,english:60
lisi chinese:80,math:66,english:77
wangwu chinese:66,math:55,english:80
返回每一门课程和对于的最高分的学生姓名
chinese lisi
math zhangsan
english wangwu
方案一:以map类型存储
1. 创建表
create TABLE t_score(
name STRING,
scores map<STRING,INT>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
每个字段之间由[ \t ]分割----------FIELDS TERMINATED BY '\t'
第二个字段是Array形式,元素与元素之间由[ , ]分割----------COLLECTION ITEMS TERMINATED BY ','
数据是K-V形式,每组K-V对内部由[ : ]分割,每组K-V对之间由[ : ]分割----------MAP KEYS TERMINATED BY ':'
将数据导入表中后,表数据如下:
zhangsan {"math":90,"english":60}
lisi {"chinese":80,"math":66,"english":77}
wangwu {"chinese":66,"math":55,"english":80}
2.sql
select name,subject,score from(
select name,subject,score,
rank() over(partition by subject order by score desc) as rank
from t_score lateral view explode(scores) subject_score as subject,score
)T1 where T1.rank = 1;
按科目分区排序且编号----------row_number() over(partition by subject order by score desc) as rank
拆分scores字段且生成把单行数据拆解成多行后的数据结果集——t_score lateral view explode(scores) my_score as subject,score 参考:https://blog.csdn.net/guodong2k/article/details/79459282
sql结果:
lisi chinese 80
wangwu english 80
zhangsan math 90
方案二:非map类型
1. 建表
create TABLE t_score2(
name STRING,
scores STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n';
将数据导入表中,表数据如下:
zhangsan math:90,english:60
lisi chinese:80,math:66,english:77
wangwu chinese:66,math:55,english:80
2. sql
select T2.name,T2.subject,T2.score from
(
select T1.name,T1.subject,T1.score,rank() over(partition by T1.subject order by T1.score desc) as rank from
(
select name,split(subject_score,':')[0] as subject,split(subject_score,':')[1] as score from t_score2 lateral view explode(split(scores,',')) t as subject_score
) T1
) T2
where T2.rank = 1;
sql结果:
lisi chinese 80
wangwu english 80
zhangsan math 90
方案三:使用嵌套python脚本处理
1. 建表
create TABLE t_score3(
line STRING
)
ROW FORMAT DELIMITED LINES TERMINATED BY '\n';
2. sql
mapper.py
# -*- coding: utf-8 -*-
import sys
for line in sys.stdin:
name_sub_sc = line.strip().split('\t')
name = name_sub_sc[0]
sub_scs = name_sub_sc[1].strip().split(',')
for sub_sc in sub_scs:
subject,score = sub_sc.split(':')
print '\t'.join([name,subject,score])
执行sql前要把python脚本加载到分布式缓存中
hive> add file /usr/local/src/apache-hive-1.2.2-bin/warehouse/mapper.py;
select T2.subject, T2.name, T2.score from
(
select name, subject, score, rank() over (partition by T1.subject order by T1.score desc) as rank from
(
select transform(line) using 'python mapper.py' as (name STRING, subject STRING, score INT) from t_score3
) T1
) T2
where T2.rank = 1;
sql结果:
lisi chinese 80
wangwu english 80
zhangsan math 90