hive操作

最新推荐文章于 2024-04-07 14:03:01 发布

power_kai

最新推荐文章于 2024-04-07 14:03:01 发布

阅读量388

点赞数

分类专栏： hive

本文链接：https://blog.csdn.net/popping_kai/article/details/68941033

版权

hive 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1、在bin目录下面有的hiveserver2命令，可以开启hive服务端，默认前台启动
2、netstat -nltp查看端口是否占用
3、启动服务后我们使用bin/beeline 操作hive数据库
--链接命令
bin/beeline
!connect jdbc:hive2://mini1:10000
--直接bin路径连接 ./beeline -u jdbc:hive2://mini1:10000 -n root
4、创建表（括号内的ip、duration是普通字段，partitioned by中的country是分区字段）
-- Partitioned table: ip and duration are regular columns, country is the partition column.
-- Each input line is split into fields on ','.
create table t_partition (
    ip       string,
    duration int
)
partitioned by (country string)
row format delimited
    fields terminated by ',';

-- Create a multi-level partitioned table: ip and duration are regular columns,
-- country (first level) and city (second level) are the partition columns.
-- A separate table name is used because t_partition was already created above.
create table t_partition_multi(ip string,duration int)
partitioned by(country string,city string)
row format delimited
fields terminated by ',';

-- Table whose data is stored as plain text files; fields are split on ','.
create table t_2 (
    id   int,
    name string
)
row format delimited
    fields terminated by ','
stored as textfile;

-- Table stored in the SequenceFile format (used in these notes as the
-- compressed-storage example); fields are split on ','.
create table t_3 (
    id   int,
    name string
)
row format delimited
    fields terminated by ','
stored as sequencefile;

压缩存储数据可以通过查询另一张表存储到这张表中，并且是通过mapreduce程序运算插入
-- Replace the contents of t_3 with every row of t_2 (both tables have id, name).
insert overwrite table t_3
select
    id,
    name
from t_2;

5、查询表字段
desc t_partition;

6、加载本地文件到hive数据库，每次加载的时候都需要指定分区
load data local inpath '/root/data' into table t_partition partition(country="China");

7、添加一个分区(添加默认是放在所有分区最后面)
alter table t_partition add partition(country="Japan");

8、查看分区
show partitions t_partition;

9、删除一个分区
alter table t_partition drop partition(country="Japan");

10、改变表名
alter table t_partition rename to t_patition_new;

11、查看所有表
show tables;

12、增加一列(添加默认是放在所有字段最后面)
alter table t_patition_new add columns(city string);

13、替换列(默认是替换所有列)
alter table t_patition_new replace columns(name string);

14、修改列名
alter table t_patition_new change column name name_new int;

15、显示库
show databases;

16、进入库
-- Replace the placeholder with the target database name; the statement must end with ';'.
use 库名;

17、查看方法
show functions;

18、查看表存储信息
-- Unformatted metadata output
desc extended t_patition_new;
-- Formatted metadata output
desc formatted t_patition_new;

19、使用shell命令
!ls;
!clear;

20、hadoop命令操作
-- List the HDFS root directory
dfs -ls /;
-- Print a file's contents; -cat expects a file path, not a directory (replace the placeholder)
dfs -cat /path/to/file;

21、删除表
DROP TABLE IF EXISTS employee;

21、根据某张表创建相同字段的表
create table t_1 like t_patition_new;

22、从一张表中查询数据插入到另一张表中
-- t_1 was created LIKE t_patition_new, so its partition column is country and,
-- after the earlier replace/change column statements, its only regular column is name_new.
insert overwrite table t_1 partition(country="Japan")
select name_new
from t_patition_new;

23、把数据导到一个目录中(默认没有加上local就会把数据导入到hdfs上面,hdfs如果没有指定local没有目录会默认自己创建一个相同路径的目录，默认的指定目录的时候会清空里面所有内容然后把数据保存进去--慎用)
-- INSERT OVERWRITE DIRECTORY clears the target directory before writing, so export
-- into a dedicated directory instead of a home directory such as /root.
-- Without LOCAL the path is on HDFS:
insert overwrite directory "/root/export/t_1" select * from t_1;
-- With LOCAL the path is on the local filesystem:
insert overwrite local directory "/root/export/t_1" select * from t_1;

24、查询数据指定分区并插入到另一张表中，需要设置非严格模式
-- The property name is hive.exec.dynamic.partition.mode ("dynamic" appears only once).
set hive.exec.dynamic.partition.mode=nonstrict;

25、设置map或者reduce使用的个数
-- Comments are kept on their own lines: a trailing "--" comment after a SET value
-- may be taken as part of the value by older Hive clients.
-- Requested number of map tasks
set mapred.map.tasks=4;
-- Requested number of reduce tasks
set mapred.reduce.tasks=4;

26、开启分桶机制
-- Without a value, SET prints the current setting
set hive.enforce.bucketing;
-- Turn bucketing enforcement on
set hive.enforce.bucketing=true;

27、清理表数据
truncate table 表;

首先先创建好两张数据能够关联的字段数据表
28、显式内连接操作
select a.*,b.* from a inner join b on a.id = b.id;

29、左外连接
select * from a left join b on a.id = b.id;

30、右外连接
select * from a right join b on a.id = b.id;

31、全表匹配
select * from a full outer join b on a.id = b.id;

32、左半连接：只返回a表中能和b表关联上的数据，并且只显示a表的字段
--运行机制，先拿a表中的id去和b表中的id匹配，如果id匹配则显示a表对应的数据
select * from a left semi join b on a.id = b.id;

-- Equivalent to the left semi join, written as a correlated EXISTS subquery;
-- the original note warns that this form is very slow in Hive.
select * from a where exists (select 1 from b where b.id = a.id);

33、删除表
DROP TABLE IF EXISTS 表;

34、hive的集合数据类型(复杂数据类型)
集合类型主要包括：array，map，struct等，hive支持集合类型，这一特性是关系型数据库所不支持的，利用好集合可以提升SQL的查询速率
--测试array集合
1、创建一个库
create database collection;

2、操作该数据库
use collection;

3、创建一个array集合表,'-'指定字段里面切分的数据
-- Table with an array column: fields are split on ',',
-- and the elements inside the hobby array are split on '-'.
create table t_array (
    id    int,
    name  string,
    hobby array<string>
)
row format delimited
    fields terminated by ','
    collection items terminated by '-';

4、加载数据
load data local inpath '/root/collect/array.txt' into table t_array;

5、通过下标索引查询数组
select id,name,hobby[0] from t_array;

--测试map集合
1、创建一个map集合表
-- Table with a map column: fields are split on ',', map entries on '-',
-- and each entry's key and value on ':'.
create table t_map (
    id    int,
    name  string,
    hobby map<string,string>
)
row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':';

2、加载数据
load data local inpath '/root/collect/map.txt' into table t_map;

3、通过key查询数据
-- Look up a single map entry by its key.
select
    id,
    name,
    hobby["唱歌"]
from t_map;

--测试struct集合
1、创建表
-- Table with a struct column: fields are split on ',',
-- and the members of the address struct (country, city) on '-'.
create table t_struct (
    id      int,
    name    string,
    address struct<country:string,city:string>
)
row format delimited
    fields terminated by ','
    collection items terminated by '-';

2、加载数据
-- Load the local file into t_struct (the table created in the previous step).
load data local inpath '/root/collect/struct.txt' into table t_struct;

3、通过.的方式查询到国家和城市
select id,name,address.country,address.city from t_struct;

35、查看mapreduce的2种查询模式
set hive.mapred.mode;
/hive/conf/hive-default.xml
--nonstrict 表示sql非严格模式

36、设置sql严格模式
set hive.mapred.mode=strict;

37、设置sql严格模式在sql查询的时候需要指定条件
例如：
--需要指定分区才能查询到(严格模式)
select * from t_patition_new where country='China';

38、使用order by命令(严格模式)
-- In strict mode ORDER BY must be combined with LIMIT; ORDER BY also needs a sort column.
select * from t_3 order by id limit 2;

39、限制笛卡尔积的查询(严格模式)
select * from t_1 a join t_2 b on (a.id = b.id);

40、测试内置函数
1、创建一个dual表
create table dual(id string);

2、concat()函数
--聚合所有给定数据
select concat('a','b','c') from dual;

3、自定义函数(UDF:user-defined function,UDAF:接收多个输入数据行，并产生一个输出数据行。(count,max))
--1、eclipse用java代码实现并打包jar
--2、上传到linux服务器上
--3、进入hive命令窗口
--4、添加函数jar
-- The argument is the jar's path on the Linux server.
add jar /root/dual/bigdata-example-hive-0.0.1-SNAPSHOT.jar;
--5、指定函数名和方法
create temporary function tolowercase as 'com.kai.bigdata.hive.udf.Lower';

41、hive中多个字符分割处理--RegexSerDe
1、创建t_test表
create table t_test(id string,name string)
row format delimited
fields terminated by '||';

2、加载数据
load data local inpath '/root/data/t_test.txt' into table t_test;

--无法读取到文本中的||后面的数据（默认的LazySimpleSerDe只支持单字符分隔符，'||'实际只按第一个'|'切分；多字符分隔符需要改用RegexSerDe或MultiDelimitSerDe）
--补充:hive读取数据的机制
1、首先用inputformat的一个具体的实现类读取文件数据，返回一条条记录(可以是行，或者是逻辑中的"行")
2、然后利用SerDe<默认:org.apache.hadoop.hive.serde2.LazySimpleSerDe>的一个具体的实现类，对上面返回的一条条记录进行字段切割

42、设置本地模式(可以节省一大笔时间)
set hive.exec.mode.local.auto=true;