Hive (DML)

INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] 
  SELECT ... FROM ...  
 
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select * from emp;
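
Dropping LOCAL writes to an HDFS directory instead of the local filesystem; a minimal sketch (the /tmp path is illustrative):

INSERT OVERWRITE DIRECTORY '/tmp/hivetmp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select * from emp;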
  
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ...
  
from emp
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select empno, ename  
INSERT OVERWRITE  LOCAL DIRECTORY '/home/hadoop/tmp/hivetmp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
select ename;   
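
The FROM-first form scans emp only once while writing both directories, which is cheaper than issuing two separate INSERT ... SELECT statements.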

  

Each task produces one output file, so there will be as many files as there are tasks.

What if the number of tasks is too large? You can end up with a flood of small files. This connects to Hive parameter tuning: adjust the relevant values to match your file sizes.

How many bytes of NameNode metadata does a 128 MB file cost? How many does a 1 KB file cost?

This is why HDFS is said to suit large files but not small ones: the key constraint is the NameNode memory consumed by metadata. A 1 KB file costs roughly the same metadata as a 128 MB file, so the more small files there are, the more NN memory is burned, and the NN may not hold up.

A common production practice: alert when the block count reaches around 32 million; at that point, delete unneeded files or merge small ones.
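
One common mitigation, shown here as a sketch whose sizes you should tune for your own cluster, is to let Hive merge small output files when a job finishes:

set hive.merge.mapfiles=true;                 -- merge outputs of map-only jobs
set hive.merge.mapredfiles=true;              -- merge outputs of full MapReduce jobs
set hive.merge.size.per.task=256000000;       -- target size of a merged file, in bytes
set hive.merge.smallfiles.avgsize=16000000;   -- trigger a merge pass when the average output file is smaller than this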
 


Exporting data from a Hive table, method 2:
Sqoop (covered later)

hive -e "select * from xxx"    used by production schedulers for statistics jobs and the like; no need to start an interactive Hive client.
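
A sketch of typical usage (paths and the script contents are illustrative):

# run a single statement non-interactively
hive -e "select count(1) from emp"
# -S (silent) suppresses log output so only the result lands in the file
hive -S -e "select count(1) from emp" > /home/hadoop/tmp/result.txt
# run the statements stored in a file
hive -f /home/hadoop/scripts/stats.sql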
SELECT


WHERE
Basic filtering: =, >=, <=, limit, (not) in, is (not) null
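
A quick illustration combining several of these against emp:

select empno, ename, salary
from emp
where salary >= 1000 and salary <= 3000
and deptno in (10, 20)
and comm is not null
limit 5;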


Aggregate functions: sum, count, max, min, ...
What is an aggregate function? Many rows go in, one row comes out.

Grouping: GROUP BY involves a shuffle, which makes it the root cause of data skew. (Key point!)

Average salary per department:
select deptno, avg(salary) from emp group by deptno;


select ename, deptno, avg(salary) from emp group by deptno;
Error: Expression not in GROUP BY key 'ename'


Rule: every column in the SELECT list must either appear in the GROUP BY clause or be wrapped in an aggregate function.


Maximum salary per department and job:
select deptno, job, max(salary) from emp group by deptno, job;




Departments whose average salary is above 2000 (aggregate conditions go in HAVING):
select deptno, avg(salary) avg_sal from emp group by deptno having avg_sal>2000;
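
Note the division of labor: WHERE filters individual rows before grouping, while HAVING filters groups after aggregation, which is why it can reference avg_sal.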


CASE WHEN ... THEN ... END




select ename, salary, 
case
when salary > 1 and salary <= 1000 then 'LOWER'
when salary > 1000 and salary <= 2000 then 'MIDDLE'
when salary > 2000 and salary <= 4000 then 'HIGH'
ELSE 'HIGHEST'
end
from emp;

Does this query launch MapReduce? Surprisingly, no...
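
This is Hive's fetch-task optimization: simple selects, filters, and limits can be answered by reading the files directly, with no MapReduce job. It is governed by this setting (the available values depend on your Hive version):

set hive.fetch.task.conversion=more;   -- none / minimal / more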


union all: commonly used when handling data skew


select count(1) from emp where empno=7369
union all
select count(1) from emp where empno=8888;


Partitioned tables

A partition corresponds to a directory on HDFS.

        Benefit: fast queries, since only the relevant directories are scanned.



Static partitioning
CREATE TABLE ruoze_order_partition (
order_number string,
event_time string
)
PARTITIONED BY (event_month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_partition
PARTITION (event_month='2014-05')
;
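
You can confirm the partition in the metastore and query it directly, which prunes the scan to just that directory:

SHOW PARTITIONS ruoze_order_partition;
select * from ruoze_order_partition where event_month='2014-05';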


Fixing garbled characters (mojibake) in the metastore:
Changing the MySQL server setting does not fix tables that already exist; you also need to convert those tables' character set.


alter database ruozedata_basic02 character set latin1;
use ruozedata_basic02;
alter table PARTITIONS convert to character set latin1;
alter table PARTITION_KEYS convert to character set latin1;


Pseudo-columns

A partition column such as event_month is stored in the directory name rather than in the data files, but it can be queried like an ordinary column.
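
A quick illustration against the table above; event_month is read from the partition path, not from the data file:

select order_number, event_time, event_month from ruoze_order_partition;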




Refreshing metadata
    If you put the data onto HDFS by hand and create the partition directory yourself, the metastore knows nothing about that relationship, so you must refresh it:
    MSCK REPAIR TABLE xxx;
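
A sketch of that manual scenario, assuming the table sits directly under the default warehouse path (adjust for your database's .db directory):

hdfs dfs -mkdir -p /user/hive/warehouse/ruoze_order_partition/event_month=2014-06
hdfs dfs -put /home/hadoop/data/order_created.txt /user/hive/warehouse/ruoze_order_partition/event_month=2014-06/

Then, in Hive:

MSCK REPAIR TABLE ruoze_order_partition;
SHOW PARTITIONS ruoze_order_partition;   -- event_month=2014-06 now appears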


CREATE TABLE ruoze_order_partition_basic (
order_number string,
event_time string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_partition_basic
;


insert overwrite table ruoze_order_partition PARTITION(event_month='2014-07')
select * from ruoze_order_partition_basic;


CREATE TABLE ruoze_order_mulit_partition (
order_number string,
event_time string
)
PARTITIONED BY (event_month string, step string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";


LOAD DATA LOCAL INPATH "/home/hadoop/data/order_created.txt" 
OVERWRITE INTO TABLE ruoze_order_mulit_partition
PARTITION (event_month='2014-08', step='1')
;






Dynamic partitioning


Task: group the emp table's data by department and load each group into its corresponding partition.


ruoze_emp_partition





CREATE TABLE ruoze_emp_partition (
empno int,
ename string,
job string,
mgr int,
hiredate string,
salary double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";




insert into table ruoze_emp_partition partition(deptno=10)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=10;


insert into table ruoze_emp_partition partition(deptno=20)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=20;


insert into table ruoze_emp_partition partition(deptno=30)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm from emp where deptno=30;
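
After the three inserts, each department shows up as its own partition:

SHOW PARTITIONS ruoze_emp_partition;
-- deptno=10
-- deptno=20
-- deptno=30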


CREATE TABLE ruoze_emp_dynamic_partition (
empno int,
ename string,
job string,
mgr int,
hiredate string,
salary double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"; 
  
  
  
insert into table ruoze_emp_dynamic_partition partition(deptno)
select empno,ename ,job ,mgr ,hiredate ,salary ,comm, deptno from emp;
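
Before running a fully dynamic insert like the one above, enable dynamic partitioning in nonstrict mode, or Hive will reject it; note that the partition column (deptno) must come last in the SELECT list:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;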
  
!connect jdbc:hive2://localhost:10000 hadoop    (run inside the beeline shell)


beeline -u jdbc:hive2://localhost:10000/default -n hadoop   


HiveServer2 access: beeline / JDBC
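
The connection commands above assume HiveServer2 is already running; a sketch of starting it:

# start HiveServer2 in the background (it listens on port 10000 by default)
$HIVE_HOME/bin/hiveserver2 &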




Homework:
1) export/import  
2) ALTER TABLE table_name ADD PARTITION   
3) hive -e/-f的使用  