hive的基本语法操作

最新推荐文章于 2022-01-24 10:28:43 发布

植与恋恋

最新推荐文章于 2022-01-24 10:28:43 发布

阅读量329

点赞数

分类专栏： hadoop 文章标签： java hadoop hive

本文链接：https://blog.csdn.net/Li_peipei/article/details/83997734

版权

hadoop 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

基本操作

DLL操作

数据库-
-----1. 创建数据库：默认保存在：HDFS默认位置：/user/hive/warehouse/数据库名.db

create database if not exists hive;

-----2. 查看数据库

show databases;

–如果数据库非常多，可以使用正则表达式，例如查看以“h”开头的数据库：

show databases like 'h*';

-----3. 查看数据库信息

describe database hive;

-----4. 删除数据库

drop database if exists hive;
#强制删除数据库（非空数据库）
drop database if exists hive cascade;

二. 表

创建表

创建表之前，最好使用use 数据库名；选择数据库，否则表会默认创建在default数据库中；

–（1）创建内部表

create table if not exists student(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
stored as textfile;
#指定列的分隔符，默认是^A，需要根据上传文件的分隔符确定。默认的行分隔符是'\n';
#指定保存的格式
#默认的保存的路径

–（2）创建外部表

create external table if not exists stu_external2(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
location '/shiny/hive.db/stu_external2';
#指定保存的路径

一级分区

create table if not exists stu_partition(
id int,
name string,
age int)
partitioned by(sex string)
row format delimited fields terminated by '\t';
#partitioned by指定按照那个字段分区，这个字段不能在定义表的时候定义

– 二级分区

create table if not exists stu_partition2(
id int,
name string,
age int)
partitioned by(classname string,sex string)
row format delimited fields terminated by '\t';

—插入女分区数据------

load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition partition(sex='female');
load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition2 partition(classname='1101',sex='female');

—插入男分区数据------

load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition partition(sex='male');
load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition2 partition(classname='1101',sex='male');

----查询表的所有数据------

select * from stu_partition;

----查询表分区-----------

show partitions stu_partition;

–（4）创建分桶表

create table if not exists stu_buck(
id int,
name string,
age int)
clustered by(id) sorted by(id desc) into 3 buckets
row format delimited fields terminated by '\t';
#指定分桶的字段，排序的字段，桶的数量

–插入数据

insert into table stu_buck select * from student distribute by (id) sort by (id desc);

------2. 修改表
–（1）重命名表

alter table student rename to stu_internal;

–（2）增加列

alter table stu_partition add columns (address string);

—查看表结构-----

desc stu_partition;

—查看表结构详细信息
desc formatted stu_partition;

–（3）改变列
alter table stu_partition change id number string;

–（4）替换/删除列
alter table stu_partition replace columns(id int,name string,age int);

–（5）添加分区------
alter table stu_partition add partition(sex=‘weizhi’);

–（6）删除分区------
alter table stu_partition drop partition(sex=‘weizhi’);

------3. 删除表
drop table if exists stu_external2;

–显示当前数据库中所有的表
show tables;

---------1. Load装载数据-------------------
–（1）加载本地数据（复制数据）
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ into table stu_internal;

–（2）加载HDFS数据（移动数据）
load data inpath ‘/data/male.txt’ into table stu_internal;

–（3）加载本地数据覆盖表中内容
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ overwrite into table stu_internal;

---------2. INSERT插入数据-----------------
–（1）单条插入(一般不使用)
insert into table stu_internal values(1116,‘bob’,23);

–（2）利用查询语句将结果导入新表（新表必须事先手动创建）
– 复制表（只是复制现有的表结构，不复制数据）
create table student like stu_internal;

– 将数据导入新表

insert overwrite/into table student select * from stu_internal where age>=23;

–（3）多重插入（新表事先创建）

# 新建表
create table stu_insert(
id int,
name string)
row format delimited fields terminated by '\t';

# 实现多重插入
from stu_internal
insert into table student select * where age<23
insert into table stu_insert select id,name;

–（4）CTAS（create table … as select …）（新表不用事先手动创建）如果select语句查询由于某种原因而失败，新表是不会创建的。

create table stu_ctas as select id,age from stu_internal where age<23;

-----3. INSERT导出数据（注意是overwrite，不能使用into）-----------------
–（1）单模式导出：导出到本地（^A(ctrl+A)为列分隔符，\n为行分隔符）

insert overwrite local directory '/home/shiny/Desktop/data/student' select * from student;

–（2）单模式导出：导出到HDFS（^A(ctrl+A)为列分隔符，\n为行分隔符）

insert overwrite directory '/student' select * from student;

----4. SELECT查询数据---------------------
–创建表

create table if not exists score(
id int,
name string,
course string,
score int)
row format delimited fields terminated by '\t';

#加载本地数据
load data local inpath '/home/shiny/Desktop/data/score.txt' into table score;

–（1）GROUP BY:查询每位学生总成绩
– 注意：在Group by子句中，Select查询的列，要么需要是Group by中的列，要么得是用聚合函数（比如sum、count等）加工过的列。不支持直接引用非Group by的列。

select id,name,sum(score) as count from score group by id,name;

–（2）ORDER BY:获取全级总成绩最高的学生信息（全局排序）默认是升序排序asc

select id,name,sum(score) as count from score group by id,name order by count desc limit 1;

–（3）SORT BY：查询学生信息，按照id降序排序（局部排序）
–设置reduce的个数为2

set mapreduce.job.reduces=2;
create table stu_sort as select * from student sort by id desc;

–（4）先对age进行降序排序，age相同的情况下对id进行降序排序
–DISTRIBUTE BY + SORT BY：分桶和排序的组合操作，对id进行分桶，对age，id进行降序排序

-- 指定开启分桶
SET hive.enforce.bucketing = true;
-- 指定 reducetask 数量，也就是指定桶的数量
SET mapreduce.job.reduces=3;
insert overwrite local directory '/home/shiny/Desktop/data/distr' 
select * from student distribute by (id) sort by (age desc,id desc);

–（5）对id进行分桶，对id进行升序排序
– CLUSTER BY：分桶和排序的组合操作，等于DISTRIBUTE BY + SORT BY（前提：分桶和SORT字段是同一个）。

insert overwrite local directory '/home/shiny/Desktop/data/cluster'
select * from student cluster by (id); -- 等价于distribute by id sort by id

join连接

#内连接：显示符合条件的连接
select * from studenta a join studentb b on a.id=b.id;
#左外连接,以左表位基准，匹配不上的null
select * from studenta a left join studentb b on a.id=b.id;
#右外连接，以右表为基准
select * from studenta a right join studentb b on a.id=b.id;
#全外连接,以两个表为标准，并去重
select * from student a full join studentb b on a.id=b.id;
#左半连接：只显示匹配成功后左表的数据
select * from studenta a left semi join studentb b on a.id=b.id;