Hive Custom Functions: UDF and UDTF
Hive ships with a set of built-in functions, but their number is limited, and sometimes you need to define your own. Custom functions fall into the following three categories (illustrated with built-ins right after this list):
1. UDF (User-Defined Function): one row in, one row out, e.g. lower/upper/reverse
2. UDAF (User-Defined Aggregation Function): aggregation, many rows in, one row out, e.g. count/max/min
3. UDTF (User-Defined Table-Generating Function): one row in, many rows out, e.g. lateral view explode()
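For orientation, here is what each category looks like using Hive's built-in counterparts (the demo table pokes is hypothetical):

select lower('ABC');             -- UDF: one in, one out -> abc
select count(*) from pokes;      -- UDAF: many rows in, one aggregated row out
select explode(array(1, 2, 3));  -- UDTF: one in, many out (three rows)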
1. Custom UDF
1.1 Dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.5</version>
    </dependency>
</dependencies>
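Note that hive-exec and hadoop-common are already present on the Hive server's classpath, so if you build a fat jar they are commonly declared with provided scope to keep the artifact small.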
1.2 Code
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class Uppercase extends UDF {
    // Hive finds evaluate() by reflection; null input yields null output.
    public Text evaluate(final Text s) {
        if (null == s) {
            return null;
        }
        return new Text(s.toString().toUpperCase());
    }
}
1.3 Using the Function
1.3.1 Temporary Function
cd /export/server/hive-2.1.0/lib
mv user-defined-function-1.0-SNAPSHOT.jar my_uppercase.jar
add jar /export/server/hive-2.1.0/lib/my_uppercase.jar;
create temporary function my_upercase as 'com.dk.udf.Uppercase';
select my_upercase("abcDe");
1.3.2 Permanent Function
hadoop fs -mkdir /hive_func
hadoop fs -put /export/server/hive-2.1.0/lib/my_uppercase.jar /hive_func
create function my_upercase2 as 'com.dk.udf.Uppercase'
using jar 'hdfs://node1:8020/hive_func/my_uppercase.jar';
select my_upercase2("abcDe");
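Unlike a temporary function, a permanent function is registered in the metastore under the current database and survives across sessions. You can inspect the registration with Hive's built-in command:

describe function extended my_upercase2;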
2. Custom UDTF
2.1 Single Column: One In, Many Out
2.1.1 Code
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

public class SplitString extends GenericUDTF {
    // Reused buffer for the single output column of each forwarded row.
    private final transient Object[] forwardList = new Object[1];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Declare one output column named column_1 of type string.
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("column_1");
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Two arguments are required: the string to split and the separator.
        if (args == null || args.length < 2) {
            forwardList[0] = null;
            super.forward(forwardList);
            return;
        }
        String argsStr = args[0].toString();
        String splitStr = args[1].toString();
        // Emit one row per split token.
        String[] fields = argsStr.split(splitStr);
        for (String field : fields) {
            forwardList[0] = field;
            super.forward(forwardList);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
2.1.2 Using the Function
add jar /export/server/hive-2.1.0/lib/my_split_string.jar;
create temporary function split_string_udtf as 'com.dk.udtf.SplitString';
select split_string_udtf("索隆,路飞,山治,乔巴", ",");
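In practice a UDTF is usually paired with lateral view, which joins the generated rows back to each source row. A minimal sketch, assuming a hypothetical table crew with a comma-separated string column names:

select c.id, t.member
from crew c
lateral view split_string_udtf(c.names, ",") t as member;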
2.2 Multiple Columns: One In, Many Out
2.2.1 Code
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SplitMapList extends GenericUDTF {
    // Reused buffer for the two output columns of each forwarded row.
    private final transient Object[] fieldlist = new Object[2];

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Declare two output columns: column_1 (string) and column_2 (bigint).
        List<String> fieldList = new ArrayList<>();
        fieldList.add("column_1");
        fieldList.add("column_2");
        List<ObjectInspector> inspectors = new ArrayList<>();
        inspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        inspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldList, inspectors);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Three arguments are required: the input string, the entry
        // separator, and the key/value separator.
        if (args == null || args.length < 3) {
            fieldlist[0] = null;
            fieldlist[1] = null;
            super.forward(fieldlist);
            return;
        }
        String arg0 = args[0].toString();
        String arg1 = args[1].toString();
        String arg2 = args[2].toString();
        // Split "k:v,k:v,..." into entries, then each entry into key and value.
        String[] items = arg0.split(arg1);
        for (String item : items) {
            String[] beans = item.split(arg2);
            fieldlist[0] = beans[0];
            fieldlist[1] = Long.parseLong(beans[1]);
            super.forward(fieldlist);
        }
    }

    @Override
    public void close() throws HiveException {
    }

    // Local smoke test of the splitting logic, outside of Hive.
    public static void main(String[] args) {
        String[] strings = new String[3];
        strings[0] = "路飞:12000000000,索隆:8000000000,乔巴:3000000";
        strings[1] = ",";
        strings[2] = ":";
        String[] items = strings[0].split(strings[1]);
        for (String item : items) {
            String[] beans = item.split(strings[2]);
            System.out.println(Arrays.toString(beans));
        }
    }
}
2.2.2 Using the Function
drop function my_split_map;
create function my_split_map as 'com.dk.udtf.SplitMapList'
using jar 'hdfs://node1:8020/hive_func/my_split_map.jar';
select my_split_map("路飞:12000000000,索隆:8000000000,乔巴:3000000", ",", ":");
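The multi-column case works the same way with lateral view; each output column simply gets its own alias. A sketch against a hypothetical table bounties with a "name:amount" payload string column:

select b.id, t.name, t.bounty
from bounties b
lateral view my_split_map(b.payload, ",", ":") t as name, bounty;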
2.3 Dropping Functions
drop temporary function if exists encryptPhoneNumber;
drop function if exists my_lower2;