hive的基本语法操作

基本操作

DLL操作

数据库-
-----1. 创建数据库:默认保存在:HDFS默认位置:/user/hive/warehouse/数据库名.db

create database if not exists hive;

-----2. 查看数据库

show databases;

–如果数据库非常多,可以使用正则表达式,例如查看以“h”开头的数据库:

show databases like 'h*';

-----3. 查看数据库信息

describe database hive;

-----4. 删除数据库

drop database if exists hive;
#强制删除数据库(非空数据库)
drop database if exists hive cascade;

二. 表

创建表

创建表之前,最好使用use 数据库名;选择数据库,否则表会默认创建在default数据库中;

–(1)创建内部表

create table if not exists student(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
stored as textfile;
#指定列的分隔符,默认是^A,需要根据上传文件的分隔符确定。默认的行分隔符是'\n';
#指定保存的格式
#默认的保存的路径

–(2)创建外部表

create external table if not exists stu_external2(
id int,
name string,
age int)
row format delimited fields terminated by '\t'
location '/shiny/hive.db/stu_external2';
#指定保存的路径

一级分区

create table if not exists stu_partition(
id int,
name string,
age int)
partitioned by(sex string)
row format delimited fields terminated by '\t';
#partitioned by指定按照那个字段分区,这个字段不能在定义表的时候定义

– 二级分区

create table if not exists stu_partition2(
id int,
name string,
age int)
partitioned by(classname string,sex string)
row format delimited fields terminated by '\t';

—插入女分区数据------

load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition partition(sex='female');
load data local inpath '/home/shiny/Desktop/data/female.txt' into table stu_partition2 partition(classname='1101',sex='female');

—插入男分区数据------

load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition partition(sex='male');
load data local inpath '/home/shiny/Desktop/data/male.txt' into table stu_partition2 partition(classname='1101',sex='male');

----查询表的所有数据------

select * from stu_partition;

----查询表分区-----------

show partitions stu_partition;

–(4)创建分桶表

create table if not exists stu_buck(
id int,
name string,
age int)
clustered by(id) sorted by(id desc) into 3 buckets
row format delimited fields terminated by '\t';
#指定分桶的字段,排序的字段,桶的数量

–插入数据

insert into table stu_buck select * from student distribute by (id) sort by (id desc);

------2. 修改表
–(1)重命名表

alter table student rename to stu_internal;

–(2)增加列

alter table stu_partition add columns (address string);

—查看表结构-----

desc stu_partition;

—查看表结构详细信息
desc formatted stu_partition;

–(3)改变列
alter table stu_partition change id number string;

–(4)替换/删除列
alter table stu_partition replace columns(id int,name string,age int);

–(5)添加分区------
alter table stu_partition add partition(sex=‘weizhi’);

–(6)删除分区------
alter table stu_partition drop partition(sex=‘weizhi’);

------3. 删除表
drop table if exists stu_external2;

–显示当前数据库中所有的表
show tables;

---------1. Load装载数据-------------------
–(1)加载本地数据(复制数据)
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ into table stu_internal;

–(2)加载HDFS数据(移动数据)
load data inpath ‘/data/male.txt’ into table stu_internal;

–(3)加载本地数据覆盖表中内容
load data local inpath ‘/home/shiny/Desktop/data/female.txt’ overwrite into table stu_internal;

---------2. INSERT插入数据-----------------
–(1)单条插入(一般不使用)
insert into table stu_internal values(1116,‘bob’,23);

–(2)利用查询语句将结果导入新表(新表必须事先手动创建)
– 复制表(只是复制现有的表结构,不复制数据)
create table student like stu_internal;

– 将数据导入新表

insert overwrite/into table student select * from stu_internal where age>=23;

–(3)多重插入(新表事先创建)

# 新建表
create table stu_insert(
id int,
name string)
row format delimited fields terminated by '\t';

# 实现多重插入
from stu_internal
insert into table student select * where age<23
insert into table stu_insert select id,name;

–(4)CTAS(create table … as select …)(新表不用事先手动创建)如果select语句查询由于某种原因而失败,新表是不会创建的。

create table stu_ctas as select id,age from stu_internal where age<23;

-----3. INSERT导出数据(注意是overwrite,不能使用into)-----------------
–(1)单模式导出:导出到本地(^A(ctrl+A)为列分隔符,\n为行分隔符)

insert overwrite local directory '/home/shiny/Desktop/data/student' select * from student;

–(2)单模式导出:导出到HDFS(^A(ctrl+A)为列分隔符,\n为行分隔符)

insert overwrite directory '/student' select * from student;

----4. SELECT查询数据---------------------
–创建表

create table if not exists score(
id int,
name string,
course string,
score int)
row format delimited fields terminated by '\t';

#加载本地数据
load data local inpath '/home/shiny/Desktop/data/score.txt' into table score;

–(1)GROUP BY:查询每位学生总成绩
– 注意:在Group by子句中,Select查询的列,要么需要是Group by中的列,要么得是用聚合函数(比如sum、count等)加工过的列。不支持直接引用非Group by的列。

select id,name,sum(score) as count from score group by id,name;

–(2)ORDER BY:获取全级总成绩最高的学生信息(全局排序)默认是升序排序asc

select id,name,sum(score) as count from score group by id,name order by count desc limit 1;

–(3)SORT BY: 查询学生信息,按照id降序排序(局部排序)
–设置reduce的个数为2

set mapreduce.job.reduces=2;
create table stu_sort as select * from student sort by id desc;

–(4)先对age进行降序排序,age相同的情况下对id进行降序排序
–DISTRIBUTE BY + SORT BY:分桶和排序的组合操作,对id进行分桶,对age,id进行降序排序

-- 指定开启分桶
SET hive.enforce.bucketing = true;
-- 指定 reducetask 数量,也就是指定桶的数量
SET mapreduce.job.reduces=3;
insert overwrite local directory '/home/shiny/Desktop/data/distr' 
select * from student distribute by (id) sort by (age desc,id desc);

–(5)对id进行分桶,对id进行升序排序
– CLUSTER BY:分桶和排序的组合操作,等于DISTRIBUTE BY + SORT BY(前提:分桶和SORT字段是同一个)。

insert overwrite local directory '/home/shiny/Desktop/data/cluster'
select * from student cluster by (id); -- 等价于distribute by id sort by id

join连接

#内连接:显示符合条件的连接
select * from studenta a join studentb b on a.id=b.id;
#左外连接,以左表位基准,匹配不上的null
select * from studenta a left join studentb b on a.id=b.id;
#右外连接,以右表为基准
select * from studenta a right join studentb b on a.id=b.id;
#全外连接,以两个表为标准,并去重
select * from student a full join studentb b on a.id=b.id;
#左半连接:只显示匹配成功后左表的数据
select * from studenta a left semi join studentb b on a.id=b.id;
展开阅读全文

没有更多推荐了,返回首页