hive的简单入门

最新推荐文章于 2023-02-06 11:50:49 发布

Next__One

最新推荐文章于 2023-02-06 11:50:49 发布

阅读量539

点赞数 1

分类专栏： hive 文章标签： hive 数据仓库

本文链接：https://blog.csdn.net/Next__One/article/details/77530108

版权

hive 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

hive常用命令cli
1.清屏
CTRL+L || !clear;
2.查看数据库中的表
show tables;
3.查看内置函数
show functions;
4.查看表结构
desc tablename;
5.查看hdfs 上的文件
dfs -ls 目录
6.执行操作系统命令
!命令;
7.执行hql语句
select *** from ***
8.执行外部sql脚本
source sql文件

创建表
create table student(sid int,sname string,grade array<float>);
数据格式：{1,tom,[80,70,90]}
create table student01(sid int ,sname string,grade map<string,int>);
数据格式：{1,tom,<"English",90>}
create table student02(sid int ,sname string,grade array<map<string,int>>);
数据格式：{1,tom,[<"English",90>,<"Chinese",70>]}
create table student03(sid int ,struct<name:string,age:int,sex:string>);
数据格式：{1,{'tom',20,'male'}}

1.内部表
1.1创建一张指定存储位置的表 absolute path
create table t1
(id int,name string,age int)
location '/mytable/hive/t1';
1.2根据数据创建一张表csv 1,cwj,21,5000,cwj123
create table t2
(tid int,tname string,sex char(2),age int,salary double,pwd char(20))
row format delimited fields terminated by ',';
1.3使用查询语句的返回值创建一张表会装化成mr作业执行默认没有分隔符
create table t3
as
select * from emp_info;
1.4使用查询语句并制定分隔符
create table t4
row format delimited fields terminated by ','
as
select * from emp_info;
1.5在已创建好的表中添加一个新列
alter table t1 add columns(english int);

2.1创建指定数据的没有分区表
create table sample1
(id int, nickname varchar(20),sex char(2),age int,salary double, pwd char(20))
row format delimited
fields terminated by ','
lines terminated by '\n';
载入指定数据 1,cwj,F,21,5000,cwj123
LOAD DATA LOCAL INPATH './ut.txt' OVERWRITE INTO TABLE sample1;

2.2创建有分区的表（根据sex进行分区的表）要分区的列不能再建表语句中出现
create table p2
(id int, nickname varchar(20),age int,salary double, pwd char(20))
partitioned by (sex char(2))
row format delimited fields terminated by ',';
2.2.1向分区表中插入数据(分) 会装换成mr作业
不能使用*号，要和插入分区表的列匹配
insert into table p2 partition(sex='M') select * from sample1 where sex='M';
分两次将同字段的元素分别插入分区表中
insert into table p2 partition(sex='M') select id,nickname,age,salary,pwd from s1 where sex='M';
这里两个性别要一致
insert into table p2 partition(sex='F') select id,nickname,age,salary,pwd from sample1 where sex='F';

2.3对比没有分区表的执行计划
2.3.1没有分区的执行计划
explain select * from sample1 where sex='M';
STAGE PLANS:
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
TableScan
alias: sample1
Statistics: Num rows: 6 Data size: 352 Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: (sex = 'M ') (type: boolean)
Statistics: Num rows: 3 Data size: 176 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: id (type: int), nickname (type: varchar(20)), 'M ' (type: char(2)), age (type: int), salary (type: double), pwd (type: char(20))
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
Statistics: Num rows: 3 Data size: 176 Basic stats: COMPLETE Column stats: NONE
ListSink
2.3.2有分区表的执行计划
explain select * from p2 where sex='M';
STAGE PLANS:
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
TableScan
alias: p2
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
Filter Operator
predicate: (sex = 'M ') (type: boolean)
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
Select Operator
expressions: id (type: int), nickname (type: varchar(20)), age (type: int), salary (type: double), pwd (type: char(20)), 'M ' (type: char(2))
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
ListSink

2.4外部表
数据已经存在在hdfs中
即使表被删除数据依然不会被删除
2.4.1创建外部表
create external table et1
(id int, nickname varchar(20),sex char(2),age int,salary double, pwd char(20))
row format delimited fields terminated by ','
location '/user/root/tables';
表中的数据会根据hdfs上的数据变化，具体指文件夹中的数据增加，表中数据就会增加，反之亦然
2.5桶表（Bucket table）
--桶表是对数据进行hash取值，然后放到不同文件中进行存储
--避免系统热块，做到负载均衡
--hash值相同会被放入同一个桶中
2.5.1创建通表
create table bt1
(id int, nickname varchar(20),sex char(2),age int,salary double, pwd char(20))
clustered by(nickname) into 5 buckets;
--根据nickname的hash值，划分为5个桶
2.6视图会转换为mr作业
--视图是一张虚表，是逻辑概念
--表是实际存在的表,基表
2.6.1创建一张部门表
create table dpart
(id int,name string)
row format delimited
fields terminated by ',';
载入指定数据 1,sales
LOAD DATA LOCAL INPATH './dpart.txt' OVERWRITE INTO TABLE dpart;

2.6.2创建一张员工表，并制定数据
create table emp
(id int, nickname varchar(20),sex char(2),age int,salary double, pwd char(20),pid int)
row format delimited
fields terminated by ',';
LOAD DATA LOCAL INPATH './ut.txt' OVERWRITE INTO TABLE emp;
2.6.3创建视图查询员工的名字，性别。月薪，年薪以及所在部门
create view empinfo
as
select e.nickname,e.sex,e.salary,e.salary*12 annlsal,d.name
from emp e,dpart d
where e.pid = d.id;
2.6.4使用视图(和使用表类似) 转换为mr作业和mysql中的物化视图不同只是逻辑存在
select * from empinfo;

3数据导入
使用时需要注意：数据的分割格式，默认为制表符，如果是其他格式的数据会出错,数据顺序也要一直
3.1使用load语句
解释：1.不加local表示从hdfs中导入数据 2.overwrite表示是否要覆盖原来存在的数据，默认不覆盖，3。
还可以给导入的数据进行分区
load data [local] inpath 'filepath' [overwrite]
into table talbename [partition (partcol1=val1,partcol2=val2 ..)]
3.1.1导入本地数据（单个文件）
load data local inpath './dpart.txt' overwrite
into table dpart;
3.1.2导入多个文件（文件夹）当前目录下的所有数据
load data local inpath './' overwrite
into table dpart;
3.1.3导入hdfs中的数据(不加local表示从hdfs中导入数据)
3.1.3.1创建一个分区表
create table p3
(id int, name varchar(20),age int,salary double, pwd char(20),pid int)
partitioned by (sex char(2))
row format delimited fields terminated by ',';

load data local inpath './pf.txt' overwrite
into table p3 partition(sex='F');
load data local inpath './pm.txt' overwrite
into table p3 partition(sex='M');

3.2使用sqoop进行数据的数据导入导出
关系型数据库到hdfs中
使用sqoop要导入两个环境变量
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
3.2.1使用sqoop导入mysql中的数据到HDFS中
sqoop import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table student --columns 'sId,sName,sSex,sBirthday,sKey,instituteId,majorId,sPhone,sAddress' -m 2 --target-dir '/sqoop/student'

3.2.2使用sqoop导入mysql中的数据到HIVE中(不指定表名则用同名表)
导入mysql中的student表
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table student -m 2 --columns 'sId,sName,sSex,sBirthday,sKey,instituteId,majorId,sPhone,sAddress'
导入mysql中的course表
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table course -m 1
导入mysql中的score表
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table score -m 3

3.2.3使用sqoop导入mysql中的数据到HIVE中，并指定表名
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table student -m 2 --columns 'sId,sName,sSex,sBirthday,sKey,instituteId,majorId,sPhone,sAddress' --hive-table student1

3.2.4使用sqoop导入mysql中的数据到HIVE中，并使用where条件
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 --table student -m 2 --columns 'sId,sName,sSex,sBirthday,sKey,instituteId,majorId,sPhone,sAddress' --hive-table student2 --where 'instituteId="R"'

3.2.5使用sqoop导入mysql中的数据到HIVE中，并使用查询语句
sqoop import --hive-import --connect jdbc:mysql://172.19.11.189:3306/scoredb --username root --password wmx123456 -m 2 --query 'SELECT * from student where instituteId="A" ADN $CONDITIONS' --target-dir '/sqoop/student3' --hive-table student3

3.3使用sqoop将HIVE中的数据导入mysql中
注意url地址是否写错 export-dir是hdfs上源文件的地址
sqoop export --connect jdbc:mysql://172.19.11.189:3306/test --username root --password wmx123456 -m 2 --table emp --export-dir /user/root/tables
通过Sqoop将Hive表数据导入到Mysql通常有两种情况。
第一种是将hive上某张表的全部数据导入到mysql对应的表中。
第二种是将hive上某张表中的部分数据导入到mysql对应的表中。
两种方式的区别在于第二种情况需要指定要导入数据的列名称。两种情况的导入方式分别如下：
1.全部导入
sqoop export --connect jdbc:mysql://127.0.0.1:3306/dbname --username mysql(mysql用户名) --password 123456(密码) --table student(mysql上的表) --hcatalog-database sopdm(hive上的schema) --hcatalog-table student(hive上的表)
sqoop export --connect jdbc:mysql://172.19.11.189:3306/test --username root --password wmx123456 --table emp --hcatalog-table emp

2.部分导入
sqoop export --connect jdbc:mysql://127.0.0.1:3306/dbname --username mysql(mysql用户名) --password 123456(密码) --table student(mysql上的表) --columns "id,name,age" --hcatalog-database sopdm(hive上的schema) --hcatalog-table student(hive上的表)

sqoop export --connect jdbc:mysql://172.19.11.189:3306/test --username root --password wmx123456 --table emp --export-dir /user/hive/warehouse/emp/ut.txt -input-fields-terminated-by ","

4Hive数据查询
4.1简单查询
查询所有课程
select * from course;
select courseId,courseName from course where credit > 3
4.1.1如果一个表达式中有空值null，则整个表达式为null
使用nvl（comm,0）如果comm的值为空null则会被转化为0进行计算
将null值查询出来使用where comm is null;不为空是where comm is not null；
使用distinct去掉重复记录
select distinct instituteId from course;
4.1.2简单查询的fetch task功能
配置方式
1.- set hive.fetch.task.conversion=more;
2.- hive --hiveconf hive.fetch.task.conversion=more
3.- 修改hive-sitme.xl的配置文件

4.2过滤和排序
4.2.1过滤where
select * from student where instituteId="A";
select * from student where instituteId="A" and ssex="女";
模糊查询
名字中带有_的员工，虚掩实用两个//转义_
select * from student where sname='%//_%';
4.2.2排序order by默认升序
需要使用mapreduce作业来执行
select * from score where courseId='R004' order by score;
表示降序排列
select * from score where courseId='R004' order by score desc;
order by 后面的跟表达式，别名
null默认排在最后，表示最小值

4.3Hive的函数
4.3.1内置函数
4.3.1.1数学函数
select round(49.657,2),round(49.657,1),round(49.657,0),round(49.657,-1),round(49.657,-2);
select ceil(45.9);
select floor(45.9);
4.3.1.2字符函数
lower
upper

select lower("Hello"),upper("Hello");

length:字符数，注意和字节数不一样
select length("hello world"),length("你好");

concat:连接字符串
select concat("hello "," world");
substr:求子串
select substr("hello world",1,3);
trim

lpad:走填充
rpad：右填充

select lpad("adsb",10," "),rpad("sdfasd",10,"*");

4.3.1.3收集函数和转换函数
size(map(<k,v>,<k,v>));
select size(map(1,'tom',2,'jack'));
转化函数，转化数据类型
select cast(1 as double);
select cast('2015-10-2' as date);

4.3.1.4日期函数
to_date
select to_date("2015-04-23 11:23:11");

year
month
day

select year("2015-4-12"),month("2015-4-12"),day("2015-4-12");

weekofyear:一年中的第几个星期
select weekofyear("2015-4-12");

datediff:两个日期相减，返回相差的天数
select datediff("2014-2-1","2015-2-1");
在原来的时间基础上加减天数，得到一个新天数
date_add
date_sub
select date_add("2017-8-23",10),date_sub("2017-4-1",8);

4.3.1.5条件函数
coalesce:从左到右返回第一个不为null的值
eg: select bonus,sal,coalesce(bonus,sal) from emp;

case... when...:条件表达式
CASE a WHERE b THEN c [WHEN d THEN e]* [ELSE f] END
eg:要放在一行上执行
select ename.job,sal,
case job when 'PRESIDENT' then sal+1000
when "MANAGER" then sal+800
else sal+400
end
from emp;

4.3.1.6聚合函数
count
sum
min
max
avg

explode:把键值对转化为kv对
select explode(map(1,"Tom",2,'mary',3,'cwj'));

5。hive表连接
5.1等值连接

select st.sname,st.majorId,sc.sid,sc.cid,sc.score
from student st, score sc
where st.sid=sc.sid;

5.2不等值连接
5.3外链接
select d.pid,d.pname,count(e.id)
from emp e join dept d
on (e.pid=d.pid)
group by d.pid,d.pname;
右外连接
select d.pid,d.pname,count(e.id)
from emp e right outer join dept d
on (e.pid=d.pid)
group by d.pid,d.pname;

create table dept(pid int,pname string)
row format delimited
fields terminated by ',';
载入指定数据 1,sales
LOAD DATA LOCAL INPATH './dpart.txt' OVERWRITE INTO TABLE dept;

6 hive的
6.1JDBC连接
6.1.1步骤
获取连接--》创建运行环境--》执行HQL--》处理结果--》释放资源
获取驱动类
private static String driverString = "org.apache.hive.jdbc.HiveDriver";
连接的url地址
private static String urlString = "jdbc:hive2://localhost:10000/default";

6.2 thrift server

Next__One

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hive的简单入门

hive常用命令cli1.清屏CTRL+L || !clear;2.查看数据库中的表show tables;3.查看内置函数show functions;4.查看表结构desc tablename;5.查看hdfs 上的文件dfs -ls 目录6.执行操作系统命令!命令;7.执行hql语句select *** from ***8.执行外部
复制链接

扫一扫

专栏目录