Hive practice examples

course.txt

1,数据库
2,数学
3,信息系统
4,操作系统
5,数据结构
6,数据处理

sc.txt

95001,1,81
95001,2,85
95001,3,88
95001,4,70
95002,2,90
95002,3,80
95002,4,71
95002,5,60
95003,1,82
95003,3,90
95003,5,100
95004,1,80
95004,2,92
95004,4,91
95004,5,70
95005,1,70
95005,2,92
95005,3,99
95005,6,87
95006,1,72
95006,2,62
95006,3,100
95006,4,59
95006,5,60
95006,6,98
95007,3,68
95007,4,91
95007,5,94
95007,6,78
95008,1,98
95008,3,89
95008,6,91
95009,2,81
95009,4,89
95009,6,100
95010,2,98
95010,5,90
95010,6,80
95011,1,81
95011,2,91
95011,3,81
95011,4,86
95012,1,81
95012,3,78
95012,4,85
95012,6,98
95013,1,98
95013,2,58
95013,4,88
95013,5,93
95014,1,91
95014,2,100
95014,4,98
95015,1,91
95015,3,59
95015,4,100
95015,6,95
95016,1,92
95016,2,99
95016,4,82
95017,4,82
95017,5,100
95017,6,58
95018,1,95
95018,2,100
95018,3,67
95018,4,78

students.txt

95001,李勇,男,20,CS
95002,刘晨,女,19,IS
95003,王敏,女,22,MA
95004,张立,男,19,IS
95005,刘刚,男,18,MA
95006,孙庆,男,23,CS
95007,易思玲,女,19,MA
95008,李娜,女,18,CS
95009,梦圆圆,女,18,MA
95010,孔小涛,男,19,CS
95011,包小柏,男,18,MA
95012,孙花,女,20,CS
95013,冯伟,男,21,CS
95014,王小丽,女,19,CS
95015,王君,男,18,MA
95016,钱国,男,21,MA
95017,王风娟,女,18,IS
95018,王一,女,19,IS

stu-sc-course-ddl.txt

create table student(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',' stored as textfile;
create table course(Cno int,Cname string) row format delimited fields terminated by ',' stored as textfile;
create table sc(Sno int,Cno int,Grade int) row format delimited fields terminated by ',' stored as textfile;



load data local inpath '/home/hadoop/hivedata/students.txt' overwrite into table student;
load data local inpath '/home/hadoop/hivedata/sc.txt' overwrite into table sc;
load data local inpath '/home/hadoop/hivedata/course.txt' overwrite into table course;
Query the student IDs and names of all students
	select Sno, Sname from student;
Query the names of students who have enrolled in courses
	select distinct Sname from student s join sc on s.Sno = sc.Sno;
----Hive group by and aggregate functions

Query the total number of students
	select count(*) from student;
Compute the average grade of students taking course 1
	select avg(s.Grade) from sc s where s.Cno = 1;
Query the average grade for each course
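	One possible answer, grouping sc by course number (a sketch, not from the original):
	select Cno, avg(Grade) as avg_grade from sc group by Cno;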
	
Query the highest grade among students taking course 1
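	One possible answer (sketch):
	select max(Grade) from sc where Cno = 1;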
  
List each course number together with the number of students enrolled in it
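	One possible answer (sketch):
	select Cno, count(distinct Sno) as cnt from sc group by Cno;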

Query the IDs of students enrolled in more than 3 courses
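	One possible answer, using having on the grouped count (sketch):
	select Sno from sc group by Sno having count(Cno) > 3;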

----Hive Order By / Sort By / Distribute By
Query student information, with the result globally ordered by student ID
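	One possible answer; order by produces a single, globally sorted result (sketch):
	select * from student order by Sno;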

Query student information, with the result split by sex and ordered by age within each group
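	One possible answer; distribute by sends each sex to the same reducer and sort by orders within it (sketch):
	select * from student distribute by Sex sort by Sage;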

----Join queries; Hive joins support only equi-join conditions
Query each student together with the courses they have enrolled in
  hive> select student.*, sc.* from student join sc on (student.Sno = sc.Sno);
Query each student's grades per course.
  hive> select student.Sname, course.Cname, sc.Grade from student join sc on student.Sno = sc.Sno join course on sc.Cno = course.Cno;
Query all students who took course 2 and scored above 90.
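  One possible answer (sketch):
  hive> select student.Sname, sc.Grade from student join sc on student.Sno = sc.Sno where sc.Cno = 2 and sc.Grade > 90;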
  
----LEFT, RIGHT and FULL OUTER JOIN
Query the information of all students; if a student has grades in the sc table, also output the course numbers from sc
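  One possible answer using a left outer join (sketch):
  hive> select student.*, sc.Cno from student left outer join sc on student.Sno = sc.Sno;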
  
----LEFT SEMI JOIN: older Hive versions do not implement IN/EXISTS subqueries; such subqueries can be rewritten with LEFT SEMI JOIN.
Rewrite the following subquery
  SELECT a.key, a.value
  FROM a
  WHERE a.key in
   (SELECT b.key
    FROM B);
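  The standard LEFT SEMI JOIN rewrite would look roughly like this (a and B are the placeholder tables from the snippet above):
  SELECT a.key, a.value
  FROM a LEFT SEMI JOIN B ON (a.key = B.key);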

Query the students who study in the same department as 刘晨
  hive> select s1.Sname from student s1 left semi join student s2 on s1.Sdept=s2.Sdept and s2.Sname='刘晨';

 

show databases;
show tables;
desc test;

-------------
Bucketed table example:

set mapreduce.job.reduces=4;
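On older Hive versions, inserting into a bucketed table also typically requires enabling bucket enforcement so the number of reducers matches the bucket count; this setting is an assumption about the environment, not part of the original:

set hive.enforce.bucketing = true;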

drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';


insert overwrite table stu_buck
select * from student cluster by(Sno) sort by(Sage);  -- fails: cluster by and sort by cannot be used in the same query



-- distribute by + sort by on the bucketing column:
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);

-- cluster by(Sno) is shorthand for distribute by(Sno) sort by(Sno):
insert overwrite table stu_buck
select * from student cluster by(Sno);

-------------
Multi-insert:

from student
insert into table student_p partition(part='a')
select * where Sno<95011
insert into table student_p partition(part='b')
select * where Sno>=95011;
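The partitioned target table student_p is not created anywhere above; a minimal DDL that would match this multi-insert might look like the following (an assumed sketch):

create table student_p(Sno int,Sname string,Sex string,Sage int,Sdept string)
partitioned by (part string)
row format delimited fields terminated by ',';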


--------------
Export data to the local filesystem (despite the .txt suffix, Hive treats the path as a directory and writes the result files under it):

insert overwrite local directory '/home/hadoop/student.txt'
select * from student;

-------------
UDF example:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';
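parsejson used below is a custom UDF and has to be registered in the session before the insert runs; the jar path and class name here are placeholders (assumptions), not taken from the original:

add jar /home/hadoop/udf.jar;
create temporary function parsejson as 'com.example.ParseJson';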

insert overwrite table t_rating
select split(parsejson(line),',')[0] as movieid,split(parsejson(line),',')[1] as rate,split(parsejson(line),',')[2] as timestring,split(parsejson(line),',')[3] as uid from rat_json limit 10;


-------
Built-in JSON function
select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate from rat_json limit 10;


-----------
TRANSFORM example:
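Neither the target table u_data_new nor weekday_mapper.py is defined in this file; a setup that would let the query below run might look roughly like this (an assumed sketch, column types guessed from the SELECT):

create table u_data_new(movieid string, rate int, weekday int, uid string)
row format delimited fields terminated by '\t';

add file /home/hadoop/weekday_mapper.py;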

INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;

select distinct(weekday) from u_data_new limit 10;

Difference between insert into and insert overwrite: into appends rows to the existing data, while overwrite replaces the previous data.

 
