表
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
- 拷贝表结构,不拷贝表数据
CREATE TABLE ruozedata_emp2 LIKE ruozedata_emp;
- 查看表的表结构
show create table table_name;
- Create Table As Select (CTAS) 以select的结果创建一张表
create table ruozedata_emp3 as select empno,ename,deptno from ruozedata_emp;
- 改表名
ALTER TABLE table_name RENAME TO new_table_name;
ALTER TABLE ruozedata_emp3 rename to ruozedata_emp3_bak;
改了表名之后,hdfs上的路径也随之改变
- 删除
1)Drop Table 删表跑路,DROP TABLE removes metadata and data for this table
2)Truncate Table 删表的数据 removes all rows from a table or partition(s).
- 加载
LOAD DATA [LOCAL] INPATH '' [OVERWRITE] INTO TABLE XXX;
带local:从本地系统,Linux
不带local:从Hadoop文件系统,hdfs
带overwrite:数据覆盖
不带overwrite:追加
- 数据导出 Writing data into the filesystem from queries
-- Export query results to the local filesystem (LOCAL => Linux path, not HDFS).
-- The target directory is overwritten; output fields are tab-separated.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/empout'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT
    empno,
    ename
FROM ruozedata_emp;
内外部表
- 内部表和外部表
1)MANAGED_TABLE 内部表
删除内部表时,数据和元数据都要删除
2)EXTERNAL_TABLE 外部表
删除外部表时,只删除元数据
——外部表更安全,被删后数据还在hdfs上,故工作中一般使用外部表
- 创建外部表时一般要配置路径
-- External table: dropping it removes only the metadata; the data files under
-- LOCATION remain on HDFS, which is why external tables are preferred in production.
-- NOTE(review): LOCATION spells "externel" (sic); kept as-is because the
-- `hadoop fs -put` command below loads data into that same (misspelled) path.
CREATE EXTERNAL TABLE emp_external (
    empno    int,
    ename    string,
    job      string,
    mgr      int,
    hiredate string,
    sal      double,
    comm     double,
    deptno   int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/d7_externel/emp/';
加载数据到emp_external
hadoop fs -put emp.txt /d7_externel/emp/
面试题
order by vs sort by vs distribute by vs cluster by
- order by:全局排序,只能有一个reduce,在reduce里面进行排序
——数据量特别大时不适用,故生产上慎用
在严格模式下(hive.mapred.mode=strict),order by后必须要跟一个limit;非严格模式下则不需要limit
- sort by:局部排序,每个reduce内有序
通过set mapred.reduce.tasks=x;设置reduce个数
-- sort by: per-reducer (partial) ordering; each reducer's output file is
-- sorted by empno descending, but there is no global order across reducers.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/sortby'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT *
FROM ruozedata_emp
SORT BY empno DESC;
- distribute by:分发规则,按照一定的规则把数据分散到某个reducer
-- distribute by: routes each row to a reducer keyed on length(ename);
-- sort by then orders rows within each reducer by empno (ascending default).
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/distributeby'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT *
FROM ruozedata_emp
DISTRIBUTE BY length(ename)
SORT BY empno;
- cluster by=distribute by xxx sort by xxx 直接使用的场景不多
-- cluster by: shorthand for DISTRIBUTE BY col SORT BY col (ascending only).
-- Fix: the statement was missing its terminating semicolon, unlike every
-- other statement in these notes.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/clusterby'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT * FROM ruozedata_emp CLUSTER BY ename;
内置函数
show functions; 查看全部函数
desc function length; 不带例子的函数说明
desc function extended length; 带例子的函数说明
hive完成wc统计
现在有一个表,数据如下:
hello hello hello
world world
welcome
用hive来完成wc统计该如何做?
- 创建表
create table wc(sentence string);
- 加载数据
load data local inpath '/home/hadoop/data/wc.data' into table wc;
- 行转列
-- Word count: explode(split(...)) turns each sentence row into one row per
-- word, then group + count gives the frequency of each word.
-- Fix: the sample data above ("hello hello hello") is space-separated, so the
-- split delimiter must be ' '; splitting on '\t' would leave each whole line
-- as a single "word". NOTE(review): confirm the delimiter against the actual
-- contents of /home/hadoop/data/wc.data.
select word, count(1) as cnt
from
(
    select explode(split(sentence, ' ')) as word from wc
) t
group by word
order by cnt desc;
涉及到知识点:split explode