hive学习3：hive常用HQL语句

最新推荐文章于 2022-04-14 23:53:01 发布

云游遍天下

最新推荐文章于 2022-04-14 23:53:01 发布

阅读量864

点赞数

分类专栏：大数据 SQL

本文链接：https://blog.csdn.net/u011410254/article/details/83213983

版权

SQL 同时被 2 个专栏收录

33 篇文章 0 订阅

订阅专栏

大数据

15 篇文章 0 订阅

订阅专栏

show databases;
show tables;
desc test;
#删除表
drop table t_buck;
#请表数据
truncate table t_buck;
-------------
分桶表示例：

#创建分桶表
drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';


#设置变量,设置分桶为true, 设置reduce数量是分桶的数量个数
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;


insert overwrite table student_buck
select * from student cluster by(Sno) sort by(Sage);  报错,cluster 和 sort 不能共存


#开会往创建的分通表插入数据(插入数据需要是已分桶, 且排序的)
#可以使用distribute by(sno) sort by(sno asc)   或是排序和分桶的字段相同的时候使用Cluster by(字段)
#注意使用cluster by  就等同于分桶+排序(sort)
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);

insert overwrite table stu_buck
select * from student cluster by(Sno);

------------------------
保存select查询结果的几种方式：
1、将查询结果保存到一张新的hive表中
create table t_tmp
as
select * from t_p;

2、将查询结果保存到一张已经存在的hive表中
insert into  table t_tmp
select * from t_p;

3、将查询结果保存到指定的文件目录（可以是本地，也可以是hdfs）
insert overwrite local directory '/home/hadoop/test'
select * from t_p;


insert overwrite directory '/aaa/test'
select * from t_p;

-----------------------------------
关于hive中的各种join

准备数据
1,a
2,b
3,c
4,d
7,y
8,u

2,bb
3,cc
7,yy
9,pp



建表：
create table a(id int,name string)
row format delimited fields terminated by ',';

create table b(id int,name string)
row format delimited fields terminated by ',';

导入数据：
load data local inpath '/home/hadoop/a.txt' into table a;
load data local inpath '/home/hadoop/b.txt' into table b;


实验：
** inner join
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
+-------+---------+-------+---------+--+





**left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+--+





**right join
select * from a right join b on a.id=b.id;





**
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+


**
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+--+


-------------
多重插入：

from student
insert into table student_p partition(part='a')
select * where Sno<95011;
insert into table student_p partition(part='a')
select * where Sno<95011;


--------------
导出数据到本地：

insert overwrite local directory '/home/hadoop/student.txt'
select * from student;

-------------
UDF案例：
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';

insert overwrite table t_rating
select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;


-------
内置jason函数
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json limit 10;


-----------
transform案例:

1、先加载rating.json文件到hive的一个原始表 rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

2、需要解析json数据成四个字段，插入一张新的表 t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json;

3、使用transform+python的方式去转换unixtime为weekday
先编辑一个python脚本文件
########python######代码
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
  line = line.strip()
  movieid, rating, unixtime,userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rating, str(weekday),userid])

保存文件
然后，将文件加入hive的classpath：
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create TABLE u_data_new as
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;

select distinct(weekday) from u_data_new limit 10;

云游遍天下

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
hive学习3：hive常用HQL语句

show databases;show tables;desc test;#删除表drop table t_buck;#请表数据truncate table t_buck;-------------分桶表示例：#创建分桶表drop table stu_buck;create table stu_buck(Sno int,Sname string,Sex string,Sa...
复制链接

扫一扫