Hive practice examples

course.txt

1,数据库
2,数学
3,信息系统
4,操作系统
5,数据结构
6,数据处理

sc.txt

95001,1,81
95001,2,85
95001,3,88
95001,4,70
95002,2,90
95002,3,80
95002,4,71
95002,5,60
95003,1,82
95003,3,90
95003,5,100
95004,1,80
95004,2,92
95004,4,91
95004,5,70
95005,1,70
95005,2,92
95005,3,99
95005,6,87
95006,1,72
95006,2,62
95006,3,100
95006,4,59
95006,5,60
95006,6,98
95007,3,68
95007,4,91
95007,5,94
95007,6,78
95008,1,98
95008,3,89
95008,6,91
95009,2,81
95009,4,89
95009,6,100
95010,2,98
95010,5,90
95010,6,80
95011,1,81
95011,2,91
95011,3,81
95011,4,86
95012,1,81
95012,3,78
95012,4,85
95012,6,98
95013,1,98
95013,2,58
95013,4,88
95013,5,93
95014,1,91
95014,2,100
95014,4,98
95015,1,91
95015,3,59
95015,4,100
95015,6,95
95016,1,92
95016,2,99
95016,4,82
95017,4,82
95017,5,100
95017,6,58
95018,1,95
95018,2,100
95018,3,67
95018,4,78

students.txt

95001,李勇,男,20,CS
95002,刘晨,女,19,IS
95003,王敏,女,22,MA
95004,张立,男,19,IS
95005,刘刚,男,18,MA
95006,孙庆,男,23,CS
95007,易思玲,女,19,MA
95008,李娜,女,18,CS
95009,梦圆圆,女,18,MA
95010,孔小涛,男,19,CS
95011,包小柏,男,18,MA
95012,孙花,女,20,CS
95013,冯伟,男,21,CS
95014,王小丽,女,19,CS
95015,王君,男,18,MA
95016,钱国,男,21,MA
95017,王风娟,女,18,IS
95018,王一,女,19,IS

stu-sc-course-ddl.txt

create table student(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',' stored as textfile;
create table course(Cno int,Cname string) row format delimited fields terminated by ',' stored as textfile;
create table sc(Sno int,Cno int,Grade int) row format delimited fields terminated by ',' stored as textfile;



load data local inpath '/home/hadoop/hivedata/students.txt' overwrite into table student;
load data local inpath '/home/hadoop/hivedata/sc.txt' overwrite into table sc;
load data local inpath '/home/hadoop/hivedata/course.txt' overwrite into table course;
Query the student IDs and names of all students
	select Sno, Sname from student;
Query the names of students who have enrolled in courses
	select distinct Sname from student s join sc on s.Sno = sc.Sno;
----Hive group by and aggregate functions

Query the total number of students
	select count(*) from student;
Compute the average grade of students taking course 1
	select avg(s.Grade) from sc s where s.Cno = 1;
Query the average grade for each course
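	One possible answer, grouping sc by course number (a sketch, not from the original):
	select Cno, avg(Grade) as avg_grade from sc group by Cno;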
	
Query the highest grade among students taking course 1
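	One possible answer (sketch):
	select max(Grade) from sc where Cno = 1;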
  
List each course number together with the number of students enrolled in it
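	One possible answer (sketch):
	select Cno, count(distinct Sno) as cnt from sc group by Cno;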

Query the IDs of students enrolled in more than 3 courses
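	One possible answer, using having on the grouped count (sketch):
	select Sno from sc group by Sno having count(Cno) > 3;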

----Hive Order By / Sort By / Distribute By
Query student information, with the result globally ordered by student ID
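	One possible answer; order by produces a single, globally sorted result (sketch):
	select * from student order by Sno;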

Query student information, with the result split by sex and ordered by age within each group
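	One possible answer; distribute by sends each sex to the same reducer and sort by orders within it (sketch):
	select * from student distribute by Sex sort by Sage;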

----Join queries; Hive joins support only equi-join conditions
Query each student together with the courses they have enrolled in
  hive> select student.*, sc.* from student join sc on (student.Sno = sc.Sno);
Query each student's grades per course.
  hive> select student.Sname, course.Cname, sc.Grade from student join sc on student.Sno = sc.Sno join course on sc.Cno = course.Cno;
Query all students who took course 2 and scored above 90.
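  One possible answer (sketch):
  hive> select student.Sname, sc.Grade from student join sc on student.Sno = sc.Sno where sc.Cno = 2 and sc.Grade > 90;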
  
----LEFT, RIGHT and FULL OUTER JOIN
Query the information of all students; if a student has grades in the sc table, also output the course numbers from sc
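  One possible answer using a left outer join (sketch):
  hive> select student.*, sc.Cno from student left outer join sc on student.Sno = sc.Sno;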
  
----LEFT SEMI JOIN: older Hive versions do not implement IN/EXISTS subqueries; such subqueries can be rewritten with LEFT SEMI JOIN.
Rewrite the following subquery
  SELECT a.key, a.value
  FROM a
  WHERE a.key in
   (SELECT b.key
    FROM B);
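  The standard LEFT SEMI JOIN rewrite would look roughly like this (a and B are the placeholder tables from the snippet above):
  SELECT a.key, a.value
  FROM a LEFT SEMI JOIN B ON (a.key = B.key);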

Query the students who study in the same department as 刘晨
  hive> select s1.Sname from student s1 left semi join student s2 on s1.Sdept=s2.Sdept and s2.Sname='刘晨';

 

show databases;
show tables;
desc test;

-------------
Bucketed table example:

set mapreduce.job.reduces=4;
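On older Hive versions, inserting into a bucketed table also typically requires enabling bucket enforcement so the number of reducers matches the bucket count; this setting is an assumption about the environment, not part of the original:

set hive.enforce.bucketing = true;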

drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';


insert overwrite table stu_buck
select * from student cluster by(Sno) sort by(Sage);  -- fails: cluster by and sort by cannot be used in the same query



-- distribute by + sort by on the bucketing column:
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);

-- cluster by(Sno) is shorthand for distribute by(Sno) sort by(Sno):
insert overwrite table stu_buck
select * from student cluster by(Sno);

-------------
Multi-insert:

from student
insert into table student_p partition(part='a')
select * where Sno<95011
insert into table student_p partition(part='b')
select * where Sno>=95011;
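The partitioned target table student_p is not created anywhere above; a minimal DDL that would match this multi-insert might look like the following (an assumed sketch):

create table student_p(Sno int,Sname string,Sex string,Sage int,Sdept string)
partitioned by (part string)
row format delimited fields terminated by ',';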


--------------
Export data to the local filesystem (despite the .txt suffix, Hive treats the path as a directory and writes the result files under it):

insert overwrite local directory '/home/hadoop/student.txt'
select * from student;

-------------
UDF example:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';
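parsejson used below is a custom UDF and has to be registered in the session before the insert runs; the jar path and class name here are placeholders (assumptions), not taken from the original:

add jar /home/hadoop/udf.jar;
create temporary function parsejson as 'com.example.ParseJson';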

insert overwrite table t_rating
select split(parsejson(line),',')[0] as movieid,split(parsejson(line),',')[1] as rate,split(parsejson(line),',')[2] as timestring,split(parsejson(line),',')[3] as uid from rat_json limit 10;


-------
Built-in JSON function
select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate from rat_json limit 10;


-----------
TRANSFORM example:
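Neither the target table u_data_new nor weekday_mapper.py is defined in this file; a setup that would let the query below run might look roughly like this (an assumed sketch, column types guessed from the SELECT):

create table u_data_new(movieid string, rate int, weekday int, uid string)
row format delimited fields terminated by '\t';

add file /home/hadoop/weekday_mapper.py;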

INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;

select distinct(weekday) from u_data_new limit 10;

Difference between insert into and insert overwrite: into appends rows to the existing data, while overwrite replaces the previous data.

 
