Hive的HiveQL

最新推荐文章于 2022-10-27 17:19:08 发布

aicloudgo

最新推荐文章于 2022-10-27 17:19:08 发布

阅读量328

点赞数

分类专栏：大数据

本文链接：https://blog.csdn.net/evelen/article/details/108036595

版权

大数据专栏收录该内容

12 篇文章 1 订阅

订阅专栏

# 启动Hive客户端
cd /opt/apache_hadoop/apache-hive-1.2.1
bin/hive


# 创建数据库
create database [if not exists] dbname;

# 显示数据库列表
show databases;

# 显示数据库表列表
show tables;

# 显示Hive的自带函数
show functions;
查看某函数信息
desc function func_name;
例子：desc function sum;

# 删除数据库
drop database [if exists] dbname cascade;

# 选择数据库
use dbname;

# 创建数据库表
create table [if not exists] tbname(id int,name string)
row format delimited fields terminated by '\t';

# 读取数据写入表中

读取本地文件-->将本地文件数据复制到Hive表目录下
load data local inpath '/opt/data/student.txt' into table tbname;

读取HDFS目录下的文件-->将HDFS的文件移动到Hive表目录下
load data inpath '/data/student.txt' into table tbname;

# 查询表数据
select * from tbname;
select * from tbname where 条件表达式;

# 查询表结构
desc tbname;
desc formatted tbname;

# 删除表，注意：删除表是把表的元数据信息删除，同时HDFS上表目录和目录下的数据一起删除
drop table [if exists] tbname;

# 清空表数据，只删除表内数据，表还在,表的元数据信息不会删除。
truncate table tbname;

# 修改表名
alter table tbname rename to new_tbname;

# 表内添加列 
alter table tbname add columns(col col_type,...);
例子：alter table tbname add columns(addr string, phone int);

# 修改表内列名及类型
alter table tbname change col_name new_col_name new_col_type;

# 替换列，注意：列不能删除，但可以修改或替换，会删除原来的所有列，新增替换的列。
alter table tbname replace columns(col col_type,...);

Hive的Linux Shell参数使用讲解

[root@aibigdata apache-hive-1.2.1]# bin/hive -help
usage: hive
 -d,--define <key=value>          Variable subsitution to apply to hive
                                  commands. e.g. -d A=B or --define A=B
    --database <databasename>     Specify the database to use
 -e <quoted-query-string>         SQL from command line
 -f <filename>                    SQL from files
 -H,--help                        Print help information
    --hiveconf <property=value>   Use value for given property
    --hivevar <key=value>         Variable subsitution to apply to hive
                                  commands. e.g. --hivevar A=B
 -i <filename>                    Initialization SQL file
 -S,--silent                      Silent mode in interactive shell
 -v,--verbose                     Verbose mode (echo executed SQL to the
                                  console)



1）--database 指定进入hive之后使用的数据库
bin/hive --database dbname;

2）-e 在Linux命令行执行SQL语句
bin/hive -e 'show databases;use dbname;show tables;'

把查询结果重定向到本地文件（下载保存查询数据）
bin/hive -e 'show databases;' > /opt/testdata/hivefile.txt

3）-f 执行文件里的SQL语句
bin/hive -f /opt/testdata/file.hql    # 文件后缀也可以是 .sql

4）--hiveconf 设置hive的参数
bin/hive --hiveconf hive.cli.print.current.db=false

5）set 设置和查看hive的参数

# 查看当前的配置信息是否显示正在使用的数据库名称
set hive.cli.print.current.db;

# 设置显示正在使用的数据库名称
set hive.cli.print.current.db=true;

6）--hivevar 传递参数
bin/hive --hivevar v_name='xiaoming' -f /opt/testdata/file.hql

# file.hql内容：
use test1;select * from student where name='${hivevar:v_name}';

Hive表创建的三种方式

# 创建普通表
create table if not exists student(id int,name string)
row format delimited fields terminated by '\t';

# 向表内加载数据
load data local inpath '/opt/data/student.txt' into table student;
备注：可以重复加载数据到表内

# 使用overwrite加载数据，则会覆盖原来的数据
load data local inpath '/opt/data/student.txt' overwrite into table student;

# 子查询方式建表：复制一个表的结构及数据到另一个表
create table student2 as select id from student;

# like方式建表，仅仅是复制表的结构，并不复制表的数据
create table student3 like student;

Hive的托管表

# 托管表：使用托管表，当删除某一个表时会把HDFS的该表下的数据文件也删除掉，导致共用该数据目录的其他表无法访问数据。

例子：
create table student(id int,name string)
row format delimited fields terminated by '\t';

load data local inpath '/opt/testdata/student.txt' into table student;

创建托管表
create table student2(id int,name string)
row format delimited fields terminated by '\t'
location '/user/hive/warehouse/school.db/student';

说明：
表student2的数据是来自表student。两表共用一个文件数据。
在HDFS的 /user/hive/warehouse/school.db 目录下不会创建表 student2的目录。

但是在hive中能查询到student2表：
hive > show tables;
student
student2


drop table student2;

当删除表 student2时，
HDFS的目录 
/user/hive/warehouse/school.db/student
/user/hive/warehouse/school.db/student/student.txt 
也同时被删除。


再查询:
hive > show tables;
student

虽表student还在，但是数据已被清空，不可用。

Hive的外部表

# 创建外部表 external

create external table student3(id int,name string)
row format delimited fields terminated by '\t'
location '/user/hive/warehouse/school.db/student';


# 托管表与外部表的区别

1）托管表：在drop的时候，不仅仅会删除表的元数据信息，
还会把表目录下的数据一并删除，
/user/hive/warehouse/school.db下无法再看到student表目录

2）外部表：在drop的时候，只会删除表的元数据信息，不会删除表所关联的数据文件。

Hive的分区表

# 创建分区表，注意：指定的分区字段不能是表中的字段

create table emp_part(id int,name string)
partitioned by(day string)
row format delimited fields terminated by '\t';


# 加载数据（分区，分区在HDFS上为一个文件目录）
load data local inpath '/opt/testdata/emp.txt' into table emp_part
partition(day='20200801');

load data local inpath '/opt/testdata/emp.txt' into table emp_part
partition(day='20200802');

# 在HDFS上的文件路径如下：
/user/hive/warehouse/company.db/emp_part/day=20200801/emp.txt
/user/hive/warehouse/company.db/emp_part/day=20200802/emp.txt


select * from emp_part;
返回表的所有数据，包含day=20200801,day=20200802

# 表分区为了更好的查询数据，按分区条件查询
select * from emp_part where day='20200801';
select * from emp_part where day='20200802';

#删除分区的表，全部分区都被删除
drop table emp_part;


# 可以指定多条件分区：一个分区字段即为一个文件目录
create table emp_part(id int,name string)
partitioned by(day string, hour string)
row format delimited fields terminated by '\t';

# 加载数据
load data local inpath '/opt/testdata/emp.txt' into table emp_part
partition(day='20200801',hour='01');

load data local inpath '/opt/testdata/emp.txt' into table emp_part
partition(day='20200801',hour='02');

load data local inpath '/opt/testdata/emp.txt' into table emp_part
partition(day='20200802',hour='01');

# 查询
select * from emp_part where day='20200801' and hour='01';

Hive的数据导入方式

1）本地加载：本地路径下的数据会复制到HDFS对应的表的目录下
load data local inpath '/opt/data/student.txt' into table student;

2）HDFS加载：把HDFS上的文件移动(mv)到表目录下
load data inpath '/student.txt' into table student;

3）overwrite覆盖加载：先把表目录下的所有数据删除，然后再加载新的数据
load data local inpath '/opt/data/student.txt' overwrite into table student;

4）子查询：只能在创建一张新表的时候使用，as select 即将一个表的数据及结构复制到另外一个表中
create table student2 as select * from student;

5）先like, 再insert into

先克隆表的结构，并没有复制表的数据
create table student3 like student;

再写入数据
insert into table student3 select * from student;

或者使用HDFS上传文件到该表目录下
bin/hdfs dfs -put /opt/data/student.txt /user/hive/warehouse/company.db/student3


6）使用Sqoop工具：把关系型数据库（MySQL）的数据导入到HDFS或Hive里。

Hive的数据导出方式

1）insert overwrite
格式：insert overwrite [local] directory 'path' select sql;
例子：insert overwrite local directory '/opt/data/stu_export' select * from student;

备注：导出目录不存在则会自动创建。
此处导出的文件保存在本地，目录可以提前存在。
mapreduce计算输出的文件存在HDFS上，目录不能提前存在。

可重新指定导出数据的分隔符-->导出文件内容数据以逗号分隔
insert overwrite local directory '/opt/data/stu_export'
row format delimited fields terminated by ',' select * from student;


2）导出到HDFS：注意，导出到HDFS上的目录（包括父级目录）可以不存在。会自动创建。
insert overwrite directory '/export/stu_export' select * from student;


3）通过hdfs的shell命令 -get 下载文件。也可以重命名
bin/hdfs dfs -get /user/hive/warehouse/company.db/stu/student.txt /tmp/stu_rename.txt


4）通过hive -e将查询结果重定向到本地文件
bin/hive -e 'use company;select * from student;' > /tmp/stu_export.dat


5）使用Sqoop工具

Hive的hql与是否运行MapReduce

# 不会执行mapreduce进程计算的sql（查询语句）
select * from student;
select * from student where 表达式;
select * from student limit 2;


# 会启动mapreduce进程计算的sql (sql中涉及需要计算的才启动，仅仅条件查询不会启动)
select distinct deptno from emp;
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;

Hive常用HQL语句

# 过滤条件查询

1）where，在分组之前进行过滤（不会跑mapreduce）
select * from student where id> 20;

2）limit，限制查询多少条信息（不会跑mapreduce）
select * from student limit 5;

3）distinct 查询值不重复（会跑mapreduce）
select distinct deptno from emp;

4）between and 区间条件查询（不会跑mapreduce）
select * from emp where sal between 100 and 2000;

5）is null 或 is not null （不会跑mapreduce）
select * from emp where comm is not null;

6）having，在分组之后进行过滤（会跑mapreduce）
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal > 1000;

# 聚合函数（会跑mapreduce）

1）count 统计表的行数
select count(*) line from emp;

2）sum 求和
select sum(sal) sum_sal from emp;

3）avg 求平均值
select avg(sal) avg_sal from emp;

4）max 求最大薪资数
select max(sal) max_sal from emp;

5）min 求最小薪资数
select min(sal) min_sal from emp;

# JOIN 多表连接查询

1）内关联 inner join on (两表都符合条件则返回行)
select a.id,a.name,b.city from tb_a a join tb_b b on a.id=b.id;


2）左关联 left join on（以左表为主，右表的字段不符合条件则该字段返回null）
select a.id,a.name,b.city from tb_a a left join tb_b b on a.id=b.id;


3）右关联 right join on（以右表为主，左表的字段不符合条件则该字段返回null）
select a.id,a.name,b.city from tb_a a right join tb_b b on a.id=b.id;


4）全关联 full join on (左右表的字段不符合条件则该字段返回null)
select a.id,a.name,b.city from tb_a a full join tb_b b on a.id=b.id;

# 排序

1）order by 全局排序
select * from emp order by sal;

2）sort by 局部排序
select * from emp sort by sal;

3）distribute by 指定分区
insert overwrite local directory '/opt/datas/emp_dist' 
row format delimited fields terminated by '\t' 
select * from emp distribute by deptno sort by sal;


4）cluster by 同时指定分区和排序字段（注意：分区和排序字段必须是同一个）
insert overwrite local directory '/opt/datas/emp_cluster' 
row format delimited fields terminated by '\t' 
select * from emp cluster by sal;

aicloudgo

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Hive的HiveQL

# 启动Hive客户端cd /opt/apache_hadoop/apache-hive-1.2.1bin/hive# 创建数据库create database [if not exists] dbname;# 显示数据库列表show databases;# 显示数据库表列表show tables;# 显示Hive的自带函数show functions;查看某函数信息desc function func_name例子：desc function sum;# 删除数据.
复制链接

扫一扫