基本操作
DLL操作
数据库-
-----1. 创建数据库:默认保存在:HDFS默认位置:/user/hive/warehouse/数据库名.db
create database if not exists hive;
-----2. 查看数据库
show databases;
–如果数据库非常多,可以使用正则表达式,例如查看以“h”开头的数据库:
show databases like 'h*';
-----3. 查看数据库信息
describe database hive;
-----4. 删除数据库
drop database if exists hive;
#强制删除数据库(非空数据库)
drop database if exists hive cascade;
二. 表
创建表
创建表之前,最好使用use 数据库名;选择数据库,否则表会默认创建在default数据库中;
–(1)创建内部表
create table if not exists student(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
stored as textfile;
#指定列的分隔符,默认是^A,需要根据上传文件的分隔符确定。默认的行分隔符是'\n';
#指定保存的格式
#默认的保存的路径
–(2)创建外部表
create external table if not exists stu_external2(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
location '/shiny/hive.db/stu_external2';
#指定保存的路径
一级分区
create table if not exists stu_partition(
id int,
name string,
age int)
partitioned by(sex string)
row format delimited fields terminated by '\t';
#partitioned by指定按照那个字段分区,这个字段不能在定义表的时候定义
– 二级分区
create table if not exists stu_partition2(
id int,
name string,
age int)
partitioned by(classname string,sex string)
row format delimited fields terminated by '\t';
—插入女分区数据------
load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition partition(sex='female');
load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition2 partition(classname='1101',sex='female');
—插入男分区数据------
load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition partition(sex='male');
load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition2 partition(classname='1101',sex='male');
----查询表的所有数据------
select * from stu_partition;
----查询表分区-----------
show partitions stu_partition;
–(4)创建分桶表
create table if not exists stu_buck(
id int,
name string,
age int)
clustered by(id) sorted by(id desc) into 3 buckets
row format delimited fields terminated by '\t';
#指定分桶的字段,排序的字段,桶的数量
–插入数据
insert into table stu_buck select * from student distribute by (id) sort by (id desc);
------2. 修改表
–(1)重命名表
alter table student rename to stu_internal;
–(2)增加列
alter table stu_partition add columns (address string);
—查看表结构-----
desc stu_partition;
—查看表结构详细信息
desc formatted stu_partition;
–(3)改变列
alter table stu_partition change id number string;
–(4)替换/删除列
alter table stu_partition replace columns(id int,name string,age int);
–(5)添加分区------
alter table stu_partition add partition(sex=‘weizhi’);
–(6)删除分区------
alter table stu_partition drop partition(sex=‘weizhi’);
------3. 删除表
drop table if exists stu_external2;
–显示当前数据库中所有的表
show tables;
---------1. Load装载数据-------------------
–(1)加载本地数据(复制数据)
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ into table stu_internal;
–(2)加载HDFS数据(移动数据)
load data inpath ‘/data/male.txt’ into table stu_internal;
–(3)加载本地数据覆盖表中内容
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ overwrite into table stu_internal;
---------2. INSERT插入数据-----------------
–(1)单条插入(一般不使用)
insert into table stu_internal values(1116,‘bob’,23);
–(2)利用查询语句将结果导入新表(新表必须事先手动创建)
– 复制表(只是复制现有的表结构,不复制数据)
create table student like stu_internal;
– 将数据导入新表
insert overwrite/into table student select * from stu_internal where age>=23;
–(3)多重插入(新表事先创建)
# 新建表
create table stu_insert(
id int,
name string)
row format delimited fields terminated by '\t';
# 实现多重插入
from stu_internal
insert into table student select * where age<23
insert into table stu_insert select id,name;
–(4)CTAS(create table … as select …)(新表不用事先手动创建)如果select语句查询由于某种原因而失败,新表是不会创建的。
create table stu_ctas as select id,age from stu_internal where age<23;
-----3. INSERT导出数据(注意是overwrite,不能使用into)-----------------
–(1)单模式导出:导出到本地(^A(ctrl+A)为列分隔符,\n为行分隔符)
insert overwrite local directory '/home/shiny/Desktop/data/student' select * from student;
–(2)单模式导出:导出到HDFS(^A(ctrl+A)为列分隔符,\n为行分隔符)
insert overwrite directory '/student' select * from student;
----4. SELECT查询数据---------------------
–创建表
create table if not exists score(
id int,
name string,
course string,
score int)
row format delimited fields terminated by '\t';
#加载本地数据
load data local inpath '/home/shiny/Desktop/data/score.txt' into table score;
–(1)GROUP BY:查询每位学生总成绩
– 注意:在Group by子句中,Select查询的列,要么需要是Group by中的列,要么得是用聚合函数(比如sum、count等)加工过的列。不支持直接引用非Group by的列。
select id,name,sum(score) as count from score group by id,name;
–(2)ORDER BY:获取全级总成绩最高的学生信息(全局排序)默认是升序排序asc
select id,name,sum(score) as count from score group by id,name order by count desc limit 1;
–(3)SORT BY: 查询学生信息,按照id降序排序(局部排序)
–设置reduce的个数为2
set mapreduce.job.reduces=2;
create table stu_sort as select * from student sort by id desc;
–(4)先对age进行降序排序,age相同的情况下对id进行降序排序
–DISTRIBUTE BY + SORT BY:分桶和排序的组合操作,对id进行分桶,对age,id进行降序排序
-- 指定开启分桶
SET hive.enforce.bucketing = true;
-- 指定 reducetask 数量,也就是指定桶的数量
SET mapreduce.job.reduces=3;
insert overwrite local directory '/home/shiny/Desktop/data/distr'
select * from student distribute by (id) sort by (age desc,id desc);
–(5)对id进行分桶,对id进行升序排序
– CLUSTER BY:分桶和排序的组合操作,等于DISTRIBUTE BY + SORT BY(前提:分桶和SORT字段是同一个)。
insert overwrite local directory '/home/shiny/Desktop/data/cluster'
select * from student cluster by (id); -- 等价于distribute by id sort by id
join连接
#内连接:显示符合条件的连接
select * from studenta a join studentb b on a.id=b.id;
#左外连接,以左表位基准,匹配不上的null
select * from studenta a left join studentb b on a.id=b.id;
#右外连接,以右表为基准
select * from studenta a right join studentb b on a.id=b.id;
#全外连接,以两个表为标准,并去重
select * from student a full join studentb b on a.id=b.id;
#左半连接:只显示匹配成功后左表的数据
select * from studenta a left semi join studentb b on a.id=b.id;