创建表
hive (default)> show tables; ---查看所有表
OK
tab_name
data_emp
hello
Time taken: 1.061 seconds, Fetched: 2 row(s)
hive (default)> create table data_emp2 like data_emp; ---复制表结构
OK
Time taken: 0.918 seconds
hive (default)> select * from data_emp2; ---查看表数据
OK
data_emp2.empno data_emp2.ename data_emp2.job data_emp2.mgr data_emp2.hiredate data_emp2.sal data_emp2.comm data_emp2.deptno
Time taken: 0.439 seconds
hive (default)> show create table data_emp; ---查看表结构信息
OK
createtab_stmt
CREATE TABLE `data_emp`(
`empno` int,
`ename` string,
`job` string,
`mgr` int,
`hiredate` string,
`sal` double,
`comm` double,
`deptno` int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://hadoop001:9000/user/hive/warehouse/data_emp'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='true',
'numFiles'='1',
'numRows'='0',
'rawDataSize'='0',
'totalSize'='969',
'transient_lastDdlTime'='1563450637')
Time taken: 0.266 seconds, Fetched: 24 row(s)
hive (default)>
hive (default)> create table data_emp3 as select empno,ename,deptno from data_emp; ---数据和表结构都复制了
hive (default)> select * from data_emp3;
OK
data_emp3.empno data_emp3.ename data_emp3.deptno
7369 SMITH 20
7499 ALLEN 30
7521 WARD 30
7566 JONES 20
7654 MARTIN 30
7698 BLAKE 30
7782 CLARK 10
7788 SCOTT 20
7839 KING 10
7844 TURNER 30
7876 ADAMS 20
7900 JAMES 30
7902 FORD 20
7934 MILLER 10
Time taken: 0.117 seconds, Fetched: 14 row(s)
离线作业:天颗粒度,今天统计的是昨天的数据,统计结果写到一个临时表里面。
(CTAS) create table as select
hive (default)> ALTER TABLE data_emp3 rename to data_emp3_bak; --修改表名
OK
Time taken: 145.364 seconds
hive (default)> show tables;
OK
tab_name
data_emp
data_emp2
data_emp3_bak
hello
Time taken: 9.143 seconds, Fetched: 4 row(s)
[hadoop@hadoop001 ~]$ hdfs dfs -ls /user/hive/warehouse/ ---文件夹名字被修改了
19/07/19 03:54:27 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 4 items
drwxr-xr-x - hadoop supergroup 0 2019-07-18 19:50 /user/hive/warehouse/data_emp
drwxr-xr-x - hadoop supergroup 0 2019-07-18 23:18 /user/hive/warehouse/data_emp2
drwxr-xr-x - hadoop supergroup 0 2019-07-19 03:07 /user/hive/warehouse/data_emp3_bak
drwxr-xr-x - hadoop supergroup 0 2019-07-15 15:40 /user/hive/warehouse/hello
hive (default)> DROP TABLE data_emp3_bak; 删除表
OK
Time taken: 70.864 seconds
Truncate Table 删除表的数据(只清空数据,保留表结构)
内部表和外部表
MANAGED_TABLE
DROP : data + metadata 删除时候 数据和元数据都被删除
EXTERNAL_TABLE
DROP: metadata X HDFS √ 元数据被删除,HDFS 数据不会被删除
hive (default)> create table emp_managed as select empno,ename,deptno from data_emp; ---添加表
[hadoop@hadoop001 ~]$ hdfs dfs -ls /user/hive/warehouse/ ---文件系统可以查到添加表
19/07/19 04:10:46 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 4 items
drwxr-xr-x - hadoop supergroup 0 2019-07-18 19:50 /user/hive/warehouse/data_emp
drwxr-xr-x - hadoop supergroup 0 2019-07-18 23:18 /user/hive/warehouse/data_emp2
drwxr-xr-x - hadoop supergroup 0 2019-07-19 04:08 /user/hive/warehouse/emp_managed
drwxr-xr-x - hadoop supergroup 0 2019-07-15 15:40 /user/hive/warehouse/hello
mysql> select * from tbls \G; ---在 MySQL 元数据库中可以查到该表的元数据
*************************** 4. row ***************************
TBL_ID: 22
CREATE_TIME: 1563480489
DB_ID: 1
LAST_ACCESS_TIME: 0
OWNER: hadoop
RETENTION: 0
SD_ID: 22
TBL_NAME: emp_managed
TBL_TYPE: MANAGED_TABLE
VIEW_EXPANDED_TEXT: NULL
VIEW_ORIGINAL_TEXT: NULL
4 rows in set (0.00 sec)
Time taken: 21.276 seconds
hive (default)> CREATE EXTERNAL TABLE emp_external( ---创建一张外部表
> empno int,
> ename string,
> job string,
> mgr int,
> hiredate string,
> sal double,
> comm double,
> deptno int
> )ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
> LOCATION '/d7_externel/emp/' ;
OK
Time taken: 0.506 seconds
[hadoop@hadoop001 data]$ hadoop fs -put emp.txt /d7_externel/emp/ ---将表数据拷贝到此目录下
19/07/19 04:30:27 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[hadoop@hadoop001 data]$ hdfs dfs -ls /d7_externel/emp/ ---浏览此目录
19/07/19 04:31:00 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
-rw-r--r-- 1 hadoop supergroup 969 2019-07-19 04:30 /d7_externel/emp/emp.txt
数据已经导入并进行查看
hive (default)> select * from emp_external;
OK
emp_external.empno emp_external.ename emp_external.job emp_external.mgr emp_external.hiredate emp_external.sal emp_external.comm emp_external.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 00:00:00 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.0 0.0 30
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 10
创建表-dept
hive (default)> create table data_dept(
> deptno int,
> dname string,
> loc string
> )ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 1.349 seconds
准备测试数据
[hadoop@hadoop001 data]$ cat dept.txt
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
导入数据
hive (default)> load data local inpath '/home/hadoop/data/dept.txt' overwrite into table data_dept;
Loading data to table default.data_dept
Table default.data_dept stats: [numFiles=1, numRows=0, totalSize=98, rawDataSize=0]
OK
Time taken: 0.344 seconds
hive (default)> select * from data_dept;
OK
data_dept.deptno data_dept.dname data_dept.loc
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
Time taken: 0.085 seconds, Fetched: 4 row(s)
注意事项:
LOAD DATA [LOCAL] INPATH '' [OVERWRITE] INTO TABLE XXX;
LOCAL:从本地系统 linux
不带LOCAL: 从Hadoop文件系统 HDFS
OVERWRITE 数据覆盖
不带OVERWRITE 追加
准备数据,把数据使用 into table 导入
创建目录
[hadoop@hadoop001 data]$ hadoop fs -mkdir -p /wc/dept/
19/07/19 06:01:01 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
把数据上传到HDFS
[hadoop@hadoop001 data]$ hadoop fs -put dept.txt /wc/dept/
19/07/19 06:02:18 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
查看数据文件
[hadoop@hadoop001 data]$ hadoop fs -ls -R /wc/dept/
19/07/19 06:03:00 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
-rw-r--r-- 1 hadoop supergroup 98 2019-07-19 06:02 /wc/dept/dept.txt
导入数据
hive (default)> load data inpath 'hdfs://hadoop001:9000/wc/dept/dept.txt' into table data_dept; ---9000为namenode端口
验证是否导入成功
hive (default)> select * from data_dept;
OK
data_dept.deptno data_dept.dname data_dept.loc
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
Time taken: 3.506 seconds, Fetched: 8 row(s)
利用create 创建emp
hive (default)> create table data_emp4 like data_emp;
OK
Time taken: 0.827 seconds
hive (default)> insert into table data_emp4 select * from data_emp;
hive 数据导出到本地的方法
hive (default)> insert overwrite local DIRECTORY '/home/hadoop/tmp/empout'
> ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
> select empno,ename FROM data_emp;
hive SQL分析
hive (default)> select * from data_emp limit 5;
OK
data_emp.empno data_emp.ename data_emp.job data_emp.mgr data_emp.hiredate data_emp.sal data_emp.comm data_emp.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
Time taken: 0.197 seconds, Fetched: 5 row(s)
hive (default)> select * from data_emp where sal between 800 and 1500;
OK
data_emp.empno data_emp.ename data_emp.job data_emp.mgr data_emp.hiredate data_emp.sal data_emp.comm data_emp.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.0 0.0 30
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 NULL 30
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 10
Time taken: 0.682 seconds, Fetched: 7 row(s)
hive (default)> select * from data_emp where comm is null;
OK
data_emp.empno data_emp.ename data_emp.job data_emp.mgr data_emp.hiredate data_emp.sal data_emp.comm data_emp.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 00:00:00 5000.0 NULL 10
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 10
Time taken: 0.125 seconds, Fetched: 10 row(s)
聚合函数使用
hive (default)> select max(sal),min(sal),sum(sal),avg(sal) from data_emp;
hive (default)> select deptno,avg(sal) from data_emp group by deptno; ---每个部门的平均工资
deptno _c1
10 2916.6666666666665
20 2175.0
30 1566.6666666666667
hive (default)> select deptno,avg(sal) avg_sal from data_emp group by deptno having avg_sal>2000;
deptno avg_sal
10 2916.6666666666665
20 2175.0
case 使用
hive (default)> select ename,sal,
> case
> when sal>1 and sal<1000 then "lower"
> when sal>1000 and sal<=2000 then "just so so"
> when sal>2000 and sal<=4000 then "ok"
> else "high"
> end
> from data_emp;
hive (default)> select * from data_emp order by empno desc; ---order by 会触发 MapReduce 计算
order by 在严格模式下和非严格模式下的工作
hive (default)> set hive.mapred.mode; ---非严格模式
hive.mapred.mode=nonstrict
hive (default)> set hive.mapred.mode=strict; ---严格模式
hive (default)> select * from data_emp order by empno desc;
FAILED: SemanticException 1:32 In strict mode, if ORDER BY is specified, LIMIT must also be specified. Error encountered near token 'empno' (严格模式下 order by必须加LIMIT)
hive (default)> select * from data_emp sort by empno desc; ---局部排序,在严格模式下可以不加limit的
hive (default)> set mapred.reduce.tasks=3; ---任务改为3
hive (default)> select * from data_emp sort by empno desc;
Query ID = hadoop_20190719072828_aca819d6-99c2-4b4c-93b6-54029c4c5075
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Defaulting to jobconf value of: 3
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1562568082175_0014, Tracking URL = http://hadoop001:8088/proxy/application_1562568082175_0014/
Kill Command = /home/hadoop/app/hadoop/bin/hadoop job -kill job_1562568082175_0014
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 3 一个map 共三个reducers使用
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.0 0.0 30
7839 KING PRESIDENT NULL 1981-11-17 00:00:00 5000.0 NULL 10
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 NULL 20
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 NULL 10
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.0 NULL 30
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 10
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 NULL 30
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.0 300.0 30
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 NULL 20
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
把结果输出到本地,验证是否是三个reducers
hive (default)> INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/sortby'
> ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
> SELECT * FROM data_emp sort by empno desc;
[hadoop@hadoop001 empout]$ cd /home/hadoop/tmp/sortby
[hadoop@hadoop001 sortby]$ ll
total 12
-rw-r--r-- 1 hadoop hadoop 399 Jul 19 08:45 000000_0
-rw-r--r-- 1 hadoop hadoop 401 Jul 19 08:45 000001_0
-rw-r--r-- 1 hadoop hadoop 131 Jul 19 08:45 000002_0
[hadoop@hadoop001 sortby]$ cat 000000_0
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.0 0.0 30
7839 KING PRESIDENT \N 1981-11-17 00:00:00 5000.0 \N 10
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 \N 20
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 \N 10
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.0 \N 30
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
[hadoop@hadoop001 sortby]$ cat 000001_0
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 \N 10
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 \N 30
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 \N 20
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 \N 20
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.0 300.0 30
[hadoop@hadoop001 sortby]$ cat 000002_0
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 \N 20
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 \N 20
distribute by
按照一定的规则把数据分散到某个reducer
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/distributeby'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT * FROM data_emp
distribute by length(ename) ---分发规则 按照长度进行归类,并排序
sort by empno ;
cluster by 使用
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/tmp/clusterby'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
select * from data_emp cluster by ename;
order by vs sort by vs distribute by vs cluster by
order by 全局排序,只使用 1 个 reduce,数据量大时性能差,不适合生产
sort by 局部排序 每个reduce内是有序
hive (default)> show functions; 查看所有函数
hive (default)> desc function length; 查看函数使用
OK
tab_name
length(str | binary) - Returns the length of str or number of bytes in binary data
Time taken: 0.018 seconds, Fetched: 1 row(s)
hive (default)>
hive (default)> desc function extended length; 查看函数使用的例子
OK
tab_name
length(str | binary) - Returns the length of str or number of bytes in binary data
Example:
> SELECT length('Facebook') FROM src LIMIT 1;
8
Time taken: 0.031 seconds, Fetched: 4 row(s)
current_date 时间:天
current_timestamp 时间:秒
hive (default)> select unix_timestamp("2019-08-08 20:00:00") from data_emp; --转换为秒
OK
_c0
1565265600
1565265600
hive (default)> select unix_timestamp("20190808 200000","yyyyMMdd HHmmss") from data_emp;
OK
_c0
1565265600
hive (default)> select to_date("2018-08-08 20:08:08") from data_emp;
OK
_c0
2018-08-08
hive (default)> select year("2018-08-08 20:08:08") from data_emp;
OK
_c0
2018
hive (default)> select hour("2018-08-08 20:08:08") from data_emp;
OK
_c0
20
hive (default)> select minute("2018-08-08 20:08:08") from data_emp;
OK
_c0
8
hive (default)> select second("2018-08-08 20:08:08") from data_emp;
OK
_c0
8
hive (default)> select date_add("2018-08-08",10) from data_emp;
OK
_c0
2018-08-18
hive (default)> select date_sub("2018-08-08",10) from data_emp;
OK
_c0
2018-07-29
hive (default)> select cast("5" as int) from data_emp; ---字符串转成INT类型
OK
_c0
5
hive (default)> select cast("2019-08-08" as date) from data_emp; ---字符串转成date类型
OK
_c0
2019-08-08
hive (default)> select cast(current_timestamp as date) from data_emp;
OK
_c0
2019-07-19
hive (default)> select round(3.1415926,4) from data_emp; ---指定精度
OK
_c0
3.1416
hive (default)> select ceiling(5.4) from data_emp; ---向上取值
OK
_c0
6
hive (default)> desc function extended ceiling; ---查看函数使用
OK
tab_name
ceiling(x) - Find the smallest integer not smaller than x
Synonyms: ceil
Example:
> SELECT ceiling(-0.1) FROM src LIMIT 1;
0
> SELECT ceiling(5) FROM src LIMIT 1;
5
Time taken: 0.022 seconds, Fetched: 7 row(s)
hive (default)> select floor(5.4) from data_emp;
OK
_c0
5
hive (default)> select least(4,5,6) from data_emp; ---最小值
OK
_c0
4
hive (default)> select greatest(4,5,6) from data_emp; ---最大值
OK
_c0
6
hive (default)> select substr("abcdefg",2,3) from data_emp; ---截取字符串 从第2位开始截取
OK
_c0
bcd
hive (default)> select concat('abc','def') from data_emp; ---字符串拼接
OK
_c0
abcdef
hive (default)> select abs(-5.4) from data_emp; ---绝对值
OK
_c0
5.4
hive (default)> select concat_ws(".","192","168","1","1") from data_emp limit 1;
OK
_c0
192.168.1.1
Time taken: 0.425 seconds, Fetched: 1 row(s)
hive计算wordcount
[hadoop@hadoop001 data]$ cat wc.data ---数据准备
hello hello hello
world world
welcome
hive (d7_hive)> load data local inpath '/home/hadoop/data/wc.data' into table wc; ---本地数据导入
hive (d7_hive)> select word,count(1) as count ---sql语句
> from
> (
> select explode(split(sentence,'\t')) as word from wc
> ) t
> group by word
> order by count desc;
word count ---计算结果
hello 3
world 2
welcome 1