Hive (DML)

INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] 
  SELECT ... FROM ...  
 
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select * from emp;
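
Dropping LOCAL writes to an HDFS directory instead of the local filesystem; a minimal sketch (the /tmp path is illustrative):

INSERT OVERWRITE DIRECTORY '/tmp/hivetmp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select * from emp;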
  
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ...
  
from emp
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select empno, ename  
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select ename;   
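
The FROM-first form scans emp only once while writing both directories, which is cheaper than issuing two separate INSERT ... SELECT statements.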

  

Each task produces one output file, so there will be as many files as there are tasks.

What if the number of tasks is too large? You can end up with a flood of small files. This connects to Hive parameter tuning: adjust the relevant values to match your file sizes.

How many bytes of NameNode metadata does a 128 MB file cost? How many does a 1 KB file cost?

This is why HDFS is said to suit large files but not small ones: the key constraint is the NameNode memory consumed by metadata. A 1 KB file costs roughly the same metadata as a 128 MB file, so the more small files there are, the more NN memory is burned, and the NN may not hold up.

A common production practice: alert when the block count reaches around 32 million; at that point, delete unneeded files or merge small ones.
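
One common mitigation, shown here as a sketch whose sizes you should tune for your own cluster, is to let Hive merge small output files when a job finishes:

set hive.merge.mapfiles=true;                 -- merge outputs of map-only jobs
set hive.merge.mapredfiles=true;              -- merge outputs of full MapReduce jobs
set hive.merge.size.per.task=256000000;       -- target size of a merged file, in bytes
set hive.merge.smallfiles.avgsize=16000000;   -- trigger a merge pass when the average output file is smaller than this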
 


Exporting data from a Hive table, method 2:
Sqoop (covered later)

hive -e "select * from xxx"    used by production schedulers for statistics jobs and the like; no need to start an interactive Hive client.
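
A sketch of typical usage (paths and the script contents are illustrative):

# run a single statement non-interactively
hive -e "select count(1) from emp"
# -S (silent) suppresses log output so only the result lands in the file
hive -S -e "select count(1) from emp" > /home/hadoop/tmp/result.txt
# run the statements stored in a file
hive -f /home/hadoop/scripts/stats.sql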
SELECT


WHERE
Basic filtering: =, >=, <=, limit, (not) in, is (not) null
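
A quick illustration combining several of these against emp:

select empno, ename, salary
from emp
where salary >= 1000 and salary <= 3000
and deptno in (10, 20)
and comm is not null
limit 5;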


Aggregate functions: sum, count, max, min, ...
What is an aggregate function? Many rows go in, one row comes out.

Grouping: GROUP BY involves a shuffle, which makes it the root cause of data skew. (Key point!)

Average salary per department:
select deptno, avg(salary) from emp group by deptno;


select ename, deptno, avg(salary) from emp group by deptno;
Error: Expression not in GROUP BY key 'ename'


Rule: every column in the SELECT list must either appear in the GROUP BY clause or be wrapped in an aggregate function.


Maximum salary per department and job:
select deptno, job, max(salary) from emp group by deptno, job;




Departments whose average salary is above 2000 (aggregate conditions go in HAVING):
select deptno, avg(salary) avg_sal from emp group by deptno having avg_sal>2000;
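
Note the division of labor: WHERE filters individual rows before grouping, while HAVING filters groups after aggregation, which is why it can reference avg_sal.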


CASE WHEN ... THEN ... END




select ename, salary, 
case
when salary > 1 and salary <= 1000 then 'LOWER'
when salary > 1000 and salary <= 2000 then 'MIDDLE'
when salary > 2000 and salary <= 4000 then 'HIGH'
ELSE 'HIGHEST'
end
from emp;

Does this query launch MapReduce? Surprisingly, no...
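
This is Hive's fetch-task optimization: simple selects, filters, and limits can be answered by reading the files directly, with no MapReduce job. It is governed by this setting (the available values depend on your Hive version):

set hive.fetch.task.conversion=more;   -- none / minimal / more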


union all: commonly used when handling data skew


select count(1) from emp where empno=7369
union all
select count(1) from emp where empno=8888;


Partitioned tables

A partition corresponds to a directory on HDFS.

        Benefit: fast queries, since only the relevant directories are scanned.



Static partitioning
CREATE TABLE ruoze_order_partition (
order_number string,
event_time string
)
PARTITIONED BY (event_month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_partition
PARTITION (event_month='2014-05')
;
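
You can confirm the partition in the metastore and query it directly, which prunes the scan to just that directory:

SHOW PARTITIONS ruoze_order_partition;
select * from ruoze_order_partition where event_month='2014-05';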


Fixing garbled characters (mojibake) in the metastore:
Changing the MySQL server setting does not fix tables that already exist; you also need to convert those tables' character set.


alter database ruozedata_basic02 character set latin1;
use ruozedata_basic02;
alter table PARTITIONS convert to character set latin1;
alter table PARTITION_KEYS convert to character set latin1;


Pseudo-columns

A partition column such as event_month is stored in the directory name rather than in the data files, but it can be queried like an ordinary column.
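
A quick illustration against the table above; event_month is read from the partition path, not from the data file:

select order_number, event_time, event_month from ruoze_order_partition;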




Refreshing metadata
    If you put the data onto HDFS by hand and create the partition directory yourself, the metastore knows nothing about that relationship, so you must refresh it:
    MSCK REPAIR TABLE xxx;
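
A sketch of that manual scenario, assuming the table sits directly under the default warehouse path (adjust for your database's .db directory):

hdfs dfs -mkdir -p /user/hive/warehouse/ruoze_order_partition/event_month=2014-06
hdfs dfs -put /home/hadoop/data/order_created.txt /user/hive/warehouse/ruoze_order_partition/event_month=2014-06/

Then, in Hive:

MSCK REPAIR TABLE ruoze_order_partition;
SHOW PARTITIONS ruoze_order_partition;   -- event_month=2014-06 now appears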


CREATE TABLE ruoze_order_partition_basic (
order_number string,
event_time string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_partition_basic
;


insert overwrite table ruoze_order_partition PARTITION(event_month='2014-07')
select * from ruoze_order_partition_basic;


CREATE TABLE ruoze_order_mulit_partition (
order_number string,
event_time string
)
PARTITIONED BY (event_month string, step string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_mulit_partition
PARTITION (event_month='2014-08', step='1')
;






Dynamic partitioning


Task: group the emp table's data by department and load each group into its corresponding partition.


ruoze_emp_partition





CREATE TABLE ruoze_emp_partition (
empno int,
ename string,
job string,
mgr int,
hiredate string,
salary double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";




insert into table ruoze_emp_partition partition(deptno=10)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=10;


insert into table ruoze_emp_partition partition(deptno=20)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=20;


insert into table ruoze_emp_partition partition(deptno=30)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=30;
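
After the three inserts, each department shows up as its own partition:

SHOW PARTITIONS ruoze_emp_partition;
-- deptno=10
-- deptno=20
-- deptno=30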


CREATE TABLE ruoze_emp_dynamic_partition (
empno int,
ename string,
job string,
mgr int,
hiredate string,
salary double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"; 
  
  
  
insert into table ruoze_emp_dynamic_partition partition(deptno)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm, deptno from emp;
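
Before running a fully dynamic insert like the one above, enable dynamic partitioning in nonstrict mode, or Hive will reject it; note that the partition column (deptno) must come last in the SELECT list:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;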
  
!connect jdbc:hive2://localhost:10000 hadoop    (run inside the beeline shell)


beeline -u jdbc:hive2://localhost:10000/default -n hadoop   


HiveServer2 access: beeline / JDBC
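
The connection commands above assume HiveServer2 is already running; a sketch of starting it:

# start HiveServer2 in the background (it listens on port 10000 by default)
$HIVE_HOME/bin/hiveserver2 &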




Homework:
1) export/import  
2) ALTER TABLE table_name ADD PARTITION   
3) hive -e/-f的使用  