1. Generate test data
Use ASCII 0x06 as the field delimiter; in bash it can be entered with Ctrl+V Ctrl+F.
seq 1 9999999 | awk '{print $1"^F"$1"aaaaaaaaaaaaaaaa"}' > a.txt
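The same file can also be produced from Java; a minimal sketch (the row count and the temp-file name are arbitrary, not from the transcript):

```java
import java.io.BufferedWriter;
import java.nio.file.Files;
import java.nio.file.Path;

public class GenTestData {
    // ASCII 0x06 (ACK), the byte entered as Ctrl+V Ctrl+F in bash
    static final char SEP = '\u0006';

    // build one row: "<id>^F<id>aaaaaaaaaaaaaaaa"
    static String row(int id) {
        return id + String.valueOf(SEP) + id + "aaaaaaaaaaaaaaaa";
    }

    public static void main(String[] args) throws Exception {
        Path out = Files.createTempFile("hive-test", ".txt");
        try (BufferedWriter w = Files.newBufferedWriter(out)) {
            for (int i = 1; i <= 9999; i++) { // far fewer rows than the 9,999,999 above
                w.write(row(i));
                w.newLine();
            }
        }
        System.out.println("wrote " + Files.size(out) + " bytes to " + out);
    }
}
```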
2. Create the table
hive (dcenter)> create table t(id int,msg string) row format delimited fields terminated by '\006' stored as textfile;
OK
Time taken: 0.191 seconds
--Inspect the table definition
hive (dcenter)> desc formatted t;
OK
# col_name data_type comment
id int None
msg string None
# Detailed Table Information
Database: dcenter
Owner: hc
CreateTime: Thu Dec 26 11:00:04 CST 2013
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://hc1:9000/hive/warehouse/dcenter.db/t
Table Type: MANAGED_TABLE
Table Parameters:
transient_lastDdlTime 1388026804
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim \u0006
serialization.format \u0006
Time taken: 0.183 seconds
3. Load data into the table
--Load from the local filesystem
hive (dcenter)> load data local inpath '/tmp/a.txt' overwrite into table t;
Copying data from file:/tmp/a.txt
Copying file: file:/tmp/a.txt
Loading data to table dcenter.t
Deleted hdfs://hc1:9000/hive/warehouse/dcenter.db/t
OK
Time taken: 28.181 seconds
--Load from HDFS
hive (dcenter)> load data inpath '/tmp/a.*' overwrite into table t;
Loading data to table dcenter.t
Deleted hdfs://hc1:9000/hive/warehouse/dcenter.db/t
OK
Time taken: 0.768 seconds
4. Inspect the table files
--A load from HDFS actually moves the file rather than copying it
hive (dcenter)> dfs -ls /hive/warehouse/dcenter.db/t;
Found 1 items
-rw-r--r-- 2 hc supergroup 317777760 2013-12-26 11:28 /hive/warehouse/dcenter.db/t/a.txt
5. Create an external table
--An external table's files must reside in HDFS; Hive cannot read files on the client node
hive (dcenter)> create external table t1(id int,msg string) row format delimited fields terminated by '\006' stored as textfile location '/tmp/b.txt';
OK
Time taken: 0.213 seconds
hive (dcenter)> desc formatted t1;
OK
# col_name data_type comment
id int None
msg string None
# Detailed Table Information
Database: dcenter
Owner: hc
CreateTime: Thu Dec 26 11:37:17 CST 2013
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://hc1:9000/tmp/b.txt
Table Type: EXTERNAL_TABLE
Table Parameters:
EXTERNAL TRUE
transient_lastDdlTime 1388029037
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim \u0006
serialization.format \u0006
Time taken: 0.18 seconds
6. Create a partitioned table
--A partitioned table physically stores each partition in its own directory; partitions can be created dynamically when data is loaded
hive (dcenter)> create table t2 ( id int,msg string) partitioned by( indate string) row format delimited fields terminated by '\006' stored as textfile ;
OK
Time taken: 1.524 seconds
hive (dcenter)> load data local inpath '/tmp/b.txt' overwrite into table t2 partition(indate="20131001");
Copying data from file:/tmp/b.txt
Copying file: file:/tmp/b.txt
Loading data to table dcenter.t2 partition (indate=20131001)
OK
Time taken: 28.818 seconds
hive (dcenter)> load data local inpath '/tmp/b.txt' overwrite into table t2 partition(indate="20131002");
Copying data from file:/tmp/b.txt
Copying file: file:/tmp/b.txt
Loading data to table dcenter.t2 partition (indate=20131002)
OK
Time taken: 28.348 seconds
Partitions are queried as follows:
hive (dcenter)> select count(1) from t2 where indate>='20131001' and indate<'20131002';
Total MapReduce jobs = 1
Launching Job 1 out of 1
.....
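Since each partition lives in its own directory (e.g. .../t2/indate=20131001), a predicate on indate lets Hive skip whole directories instead of scanning them. A toy sketch of that pruning with illustrative paths (this is not Hive's planner code):

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PartitionPrune {
    // keep only partition directories whose indate value falls in [lo, hi)
    static List<String> prune(List<String> dirs, String lo, String hi) {
        return dirs.stream()
                .filter(d -> {
                    String v = d.replaceAll(".*indate=([^/]*)/?", "$1");
                    return v.compareTo(lo) >= 0 && v.compareTo(hi) < 0;
                })
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<String> dirs = Arrays.asList(
                "/hive/warehouse/dcenter.db/t2/indate=20131001",
                "/hive/warehouse/dcenter.db/t2/indate=20131002");
        // only the 20131001 directory survives the predicate from the query above
        System.out.println(prune(dirs, "20131001", "20131002"));
    }
}
```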
7. Export table data
hive (dcenter)> insert overwrite local directory '/tmp/t' select * from t;
Total MapReduce jobs = 1
Launching Job 1 out of 1
...
--Spot-check the exported data
hive (dcenter)> !wc -l /tmp/t/000000_0;
9999 /tmp/t/000000_0
[hc@hc8 t]$ hexdump -C /tmp/t/000000_0 | head -2
00000000 31 01 31 61 61 61 61 61 61 61 61 61 61 61 61 61 |1.1aaaaaaaaaaaaa|
00000010 61 61 61 0a 32 01 32 61 61 61 61 61 61 61 61 61 |aaa.2.2aaaaaaaaa|
[hc@hc8 t]$ hexdump -C /tmp/a.txt | head -2
00000000 31 06 31 61 61 61 61 61 61 61 61 61 61 61 61 61 |1.1aaaaaaaaaaaaa|
00000010 61 61 61 0a 32 06 32 61 61 61 61 61 61 61 61 61 |aaa.2.2aaaaaaaaa|
Note that although the imported file used 0x06 as its delimiter, the exported file uses 0x01, which is Hive's default column delimiter; this default can be changed.
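Exported rows can therefore be parsed by splitting on \u0001; a small sketch (constant and method names are mine):

```java
public class ParseExport {
    // Hive's default column delimiter is 0x01 (Ctrl-A)
    static final String DEFAULT_SEP = "\u0001";

    static String[] columns(String line) {
        // limit -1 keeps trailing empty columns instead of dropping them
        return line.split(DEFAULT_SEP, -1);
    }

    public static void main(String[] args) {
        String line = "1\u00011aaaaaaaaaaaaaaaa"; // one row of /tmp/t/000000_0
        String[] cols = columns(line);
        System.out.println("id=" + cols[0] + " msg=" + cols[1]);
    }
}
```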
8. Create a view
--Views reduce the complexity of the SQL we write, e.g. the following query can be rewritten
hive (dcenter)> from( select * from t where id<100 order by id asc)
> a select id,msg where id>98;
Total MapReduce jobs = 1
Launching Job 1 out of 1
...
hive (dcenter)> create view v_t as select * from t where id<100 order by id asc;
OK
Time taken: 0.284 seconds
hive (dcenter)> select * from v_t where id>98;
OK
99 99aaaaaaaaaaaaaaaa
Time taken: 33.617 seconds
9. Using explain
hive (dcenter)> explain select count(1) from t a join t1 b on (a.id=b.id);
OK
ABSTRACT SYNTAX TREE:
(TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME t) a) (TOK_TABREF (TOK_TABNAME t1) b) (= (. (TOK_TABLE_OR_COL a) id) (. (TOK_TABLE_OR_COL b) id)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
STAGE DEPENDENCIES: //dependencies between the stages
Stage-1 is a root stage
Stage-2 depends on stages: Stage-1
Stage-0 is a root stage
STAGE PLANS:
Stage: Stage-1
Map Reduce --this stage scans both t and t1; the map phase tags each row with the table it came from
Alias -> Map Operator Tree:
a
TableScan
alias: a
Reduce Output Operator
key expressions:
expr: id
type: int
sort order: +
Map-reduce partition columns:
expr: id
type: int
tag: 0
b
TableScan
alias: b
Reduce Output Operator
key expressions:
expr: id
type: int
sort order: +
Map-reduce partition columns:
expr: id
type: int
tag: 1
Reduce Operator Tree: --the reduce phase reads rows with the same key from the two inputs and joins them
Join Operator
condition map:
Inner Join 0 to 1
condition expressions:
0
1
handleSkewJoin: false
Select Operator
Group By Operator
aggregations:
expr: count(1)
bucketGroup: false
mode: hash
outputColumnNames: _col0
File Output Operator
compressed: false
GlobalTableId: 0
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
Stage: Stage-2 --this stage counts the records produced by Stage-1
Map Reduce
Alias -> Map Operator Tree:
hdfs://hc1:9000/tmp/hive-hc/hive_2013-12-26_15-54-38_047_2645242837849245691/-mr-10002
Reduce Output Operator
sort order:
tag: -1
value expressions:
expr: _col0
type: bigint
Reduce Operator Tree:
Group By Operator
aggregations:
expr: count(VALUE._col0)
bucketGroup: false
mode: mergepartial
outputColumnNames: _col0
Select Operator
expressions:
expr: _col0
type: bigint
outputColumnNames: _col0
File Output Operator
compressed: false
GlobalTableId: 0
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Stage: Stage-0 --fetch the result
Fetch Operator
limit: -1
Time taken: 0.262 seconds
[Figure: the MapReduce operations of Stage-1]
10. Alter the table schema
--Data to load: b.txt has one more column than a.txt
[root@hc8 tmp]# cat a.txt
11aaaaaaaaaaaaaaaa
22aaaaaaaaaaaaaaaa
[root@hc8 tmp]# cat b.txt
11aaaaaaaaaaaaaaaa6
22aaaaaaaaaaaaaaaa7
--Create a partitioned table
hive (dcenter)> create table t ( id int,msg string) partitioned by( indate string) row format delimited fields terminated by '\006' stored as textfile ;
OK
Time taken: 0.453 seconds
--Load one partition
hive (dcenter)> load data local inpath '/tmp/a.txt' overwrite into table t partition(indate="1001");
Copying data from file:/tmp/a.txt
Copying file: file:/tmp/a.txt
Loading data to table dcenter.t partition (indate=1001)
OK
Time taken: 1.111 seconds
--Query the data
hive (dcenter)> select * from t;
OK
1 1aaaaaaaaaaaaaaaa 1001
2 2aaaaaaaaaaaaaaaa 1001
Time taken: 0.408 seconds
--Alter the table schema
hive (dcenter)> alter table t add columns (id2 int);
OK
Time taken: 0.371 seconds
--Load a second partition
hive (dcenter)> load data local inpath '/tmp/b.txt' overwrite into table t partition(indate="1002");
Copying data from file:/tmp/b.txt
Copying file: file:/tmp/b.txt
Loading data to table dcenter.t partition (indate=1002)
OK
Time taken: 0.918 seconds
--Query the data again
hive (dcenter)> select * from t;
OK
1 1aaaaaaaaaaaaaaaa NULL 1001
2 2aaaaaaaaaaaaaaaa NULL 1001
1 1aaaaaaaaaaaaaaaa 6 1002
2 2aaaaaaaaaaaaaaaa 7 1002
Time taken: 0.436 seconds
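The NULLs in the indate=1001 rows appear because those rows physically contain only two fields, while the schema now has three; any column past the end of a row is read as null. A toy model of that behavior (not Hive's actual LazySimpleSerDe code):

```java
public class ShortRowLookup {
    // return the i-th \u0006-delimited field of a row, or null if the row is too short
    static String field(String row, int i) {
        String[] f = row.split("\u0006", -1);
        return i < f.length ? f[i] : null;
    }

    public static void main(String[] args) {
        String oldRow = "1\u00061aaaaaaaaaaaaaaaa";        // a.txt row: two fields
        String newRow = "1\u00061aaaaaaaaaaaaaaaa\u00066"; // b.txt row: three fields
        System.out.println(field(oldRow, 2)); // null -> displayed by Hive as NULL
        System.out.println(field(newRow, 2)); // 6
    }
}
```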
11. Using user-defined functions
1) Write a function class extending UDF; Hive calls its evaluate method when the function is invoked
package com.zy.hive.function;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDF;
public class range extends UDF{
//build the IP lookup library
private static List<IpRange> ipLib = new ArrayList<IpRange>();
static{
for(int i=0;i<5695104;i++){
ipLib.add(new IpRange(i,i+5,"USA"+i));
}
}
//the method executed when the function is called
public String evaluate(int ip){
IpRange ir;
for(int i=0;i<ipLib.size();i++){
ir = ipLib.get(i);
if(ip>=ir.getStartip() && ip <= ir.getEndip()){
return ir.getCountry();
}
}
return null;
}
public static void main(String[] args) {
range a = new range();
for(int i=0;i<100;i++)
System.out.println(a.evaluate(2));
}
}
//IP range class
class IpRange{
private int startip;
private int endip;
private String country;
public IpRange(int startip, int endip, String country) {
this.startip = startip;
this.endip = endip;
this.country = country;
}
public int getStartip() {
return startip;
}
public void setStartip(int startip) {
this.startip = startip;
}
public int getEndip() {
return endip;
}
public void setEndip(int endip) {
this.endip = endip;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
}
2) Export the class as a jar file
3) Copy the jar to a directory on the Hive client node
4) Load the jar file
hive (dcenter)> add jar /tmp/lihm_udf1.jar;
Added /tmp/lihm_udf1.jar to class path
Added resource: /tmp/lihm_udf1.jar
5) Create a temporary function
hive (dcenter)> create temporary function findip as 'com.zy.hive.function.range';
OK
Time taken: 5.669 seconds
6) Call the function
hive (dcenter)> select id,findip(id) from t;
1 USA0
2 USA0
1 USA0
2 USA0
Time taken: 28.983 seconds
7) Drop the function
hive (dcenter)> drop temporary function findip;
OK
Time taken: 0.013 seconds
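The evaluate method in step 1 walks all 5.6 million ranges for every input row, so each call is O(n). If the ranges are kept sorted by start IP and non-overlapping (the generated test ranges overlap, but a real IP library typically would not), a binary search reduces each lookup to O(log n). A sketch with illustrative names:

```java
public class RangeLookup {
    private final int[] starts, ends;
    private final String[] labels;

    // arrays must be parallel, sorted by start IP, and non-overlapping
    RangeLookup(int[] starts, int[] ends, String[] labels) {
        this.starts = starts;
        this.ends = ends;
        this.labels = labels;
    }

    // binary search for the range containing ip; null if none does
    String find(int ip) {
        int lo = 0, hi = starts.length - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            if (ip < starts[mid]) hi = mid - 1;
            else if (ip > ends[mid]) lo = mid + 1;
            else return labels[mid];
        }
        return null;
    }

    public static void main(String[] args) {
        RangeLookup lib = new RangeLookup(
                new int[]{0, 10, 20},
                new int[]{5, 15, 25},
                new String[]{"USA0", "USA10", "USA20"});
        System.out.println(lib.find(12)); // USA10
        System.out.println(lib.find(7));  // null: falls in a gap between ranges
    }
}
```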
12. Operating on a Hive instance through the Hive API
package com.zy.hive.function;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
public class HiveServerTest {
public static void main(String[] args) throws Exception {
String driverName = "org.apache.hadoop.hive.jdbc.HiveDriver";
Class.forName(driverName);
Connection conn = DriverManager.getConnection("jdbc:hive://192.168.1.1:10000/dcenter","","");
Statement stmt = conn.createStatement();
stmt.executeQuery("use dcenter");
//query record from a table
ResultSet rst = stmt.executeQuery("select id,msg from t");
while(rst.next()){
System.out.println("id:"+rst.getInt(1)+" msg:"+rst.getString(2));
}
//show tables
rst = stmt.executeQuery("show tables");
while(rst.next()){
System.out.println(rst.getString(1)+"\t");
}
conn.close();
}
}
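One note on the connection string above: it targets the legacy HiveServer1 driver. HiveServer2 instead uses org.apache.hive.jdbc.HiveDriver with jdbc:hive2:// URLs, which is outside this transcript. Factoring the URL out makes the switch a one-line change (helper names are mine):

```java
public class HiveJdbcUrl {
    // legacy HiveServer1 URL scheme, as used in the example above
    static String hiveUrl(String host, int port, String db) {
        return "jdbc:hive://" + host + ":" + port + "/" + db;
    }

    // HiveServer2 equivalent
    static String hive2Url(String host, int port, String db) {
        return "jdbc:hive2://" + host + ":" + port + "/" + db;
    }

    public static void main(String[] args) {
        System.out.println(hiveUrl("192.168.1.1", 10000, "dcenter"));
        System.out.println(hive2Url("192.168.1.1", 10000, "dcenter"));
    }
}
```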