说明
hive udf用于扩展hive功能,常用于自定义算法,如数据加密、解密等。本文将从代码实现和配置两方面讲解udf相关信息。
分享
环境
udf种类
| 简称 | 特点 | 描述 | 参考类 |
| --- | --- | --- | --- |
| UDF(user defined function) | one to one | 输入一个,输出一个,如:upper、substr | GenericUDFConcat |
| UDAF(user defined aggregate function) | many to one | 输入多个,输出一个,如:sum、min | GenericUDAFVariance |
| UDTF(user defined table-generating function) | one to many | 输入一个,输出多个,如:lateral view、explode | GenericUDTFExplode |
udf实现
pom
<!-- Build settings for the Hive UDF module: UTF-8 sources, Java 1.8, hive-exec 3.1.2 API. -->
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <java.version>1.8</java.version>
    <maven.compiler.source>${java.version}</maven.compiler.source>
    <maven.compiler.target>${java.version}</maven.compiler.target>
</properties>
<dependencies>
    <!-- Provides the UDF / GenericUDF / GenericUDTF base classes used by the examples. -->
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.2</version>
    </dependency>
</dependencies>
<build>
    <resources>
        <resource>
            <directory>src/main/resources</directory>
        </resource>
    </resources>
    <sourceDirectory>src/main/java</sourceDirectory>
    <plugins>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.3</version>
            <configuration>
                <source>${java.version}</source>
                <target>${java.version}</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
udf代码
创建udf有两种方式
继承UDF类,实现简单,hive3中已标记为废弃,不推荐使用。 继承GenericUDF类,相对复杂,控制更精细。
UDF类(不推荐)
继承UDF类,编写evaluate方法,该方法需要手动添加,不是实现UDF类的抽象方法。下面的示例验证两个字符串是否相同。
代码
import org. apache. hadoop. hive. ql. exec. UDF;
/**
 * Simple Hive UDF that reports whether two string arguments are equal.
 *
 * <p>Extends the deprecated {@code UDF} base class. {@code evaluate} is not an
 * abstract method of that class; it is added by hand.
 */
@SuppressWarnings("deprecation")
public class UdfDemo extends UDF {

    /**
     * Compares two strings for equality.
     *
     * @param data  left-hand value; may be {@code null}
     * @param value right-hand value; may be {@code null}
     * @return {@code true} only when {@code data} is non-null and equals {@code value};
     *         {@code false} otherwise, including when both arguments are {@code null}
     */
    public boolean evaluate(String data, String value) {
        // An explicit null guard replaces catching the NullPointerException that
        // data.equals(value) would raise; the results are the same for every input.
        return data != null && data.equals(value);
    }
}
测试
import org. junit. jupiter. api. Test ;
/** Smoke test that prints what {@code UdfDemo.evaluate} returns for a few inputs. */
public class UdfTest {

    /** Prints the comparison result for equal, different and null arguments. */
    @Test
    public void testUDF() {
        final UdfDemo demo = new UdfDemo();

        // Equal strings.
        System.out.println(demo.evaluate("qwe", "qwe"));
        // Different strings.
        System.out.println(demo.evaluate("w", "L"));
        // Both arguments null.
        System.out.println(demo.evaluate(null, null));
    }
}
GenericUDF类
| 方法名 | 说明 | 运行 |
| --- | --- | --- |
| initialize | 校验输入参数类型,指定输出结果类型 | 单节点运行周期内,最开始执行一次 |
| evaluate | 处理输入内容生成输出结果 | 单节点运行周期内执行多次,执行次数与数据行数相等 |
| getDisplayString | 返回用于展示该函数的字符串(如EXPLAIN执行计划中的函数描述) | 在需要展示该函数(如生成EXPLAIN输出)时调用 |
| 类型 | 方式 | 说明 |
| --- | --- | --- |
| 普通java类型 | String 对应 PrimitiveObjectInspectorFactory.javaStringObjectInspector | 通过PrimitiveObjectInspectorFactory工厂类指定 |
| 集合map、list等 | List<String> 对应 ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector) | ObjectInspectorFactory 嵌套PrimitiveObjectInspectorFactory工厂类指定 |
代码
创建udf类,功能验证字符串参数二是否存在于参数一列表内。
import java. util. List ;
import org. apache. hadoop. hive. ql. exec. UDFArgumentException ;
import org. apache. hadoop. hive. ql. exec. UDFArgumentLengthException ;
import org. apache. hadoop. hive. ql. metadata. HiveException ;
import org. apache. hadoop. hive. ql. udf. generic. GenericUDF ;
import org. apache. hadoop. hive. serde2. objectinspector. ListObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. ObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. primitive. PrimitiveObjectInspectorFactory ;
import org. apache. hadoop. hive. serde2. objectinspector. primitive. StringObjectInspector ;
public class GenericUDFDemo extends GenericUDF {
ListObjectInspector listOI;
StringObjectInspector elementOI;
@Override
public ObjectInspector initialize ( ObjectInspector [ ] arguments) throws UDFArgumentException {
if ( arguments. length != 2 ) {
throw new UDFArgumentLengthException ( " UdfDemoThreeJin Input Error, only takes 2 arguments: List<T>, T" ) ;
}
ObjectInspector a = arguments[ 0 ] ;
ObjectInspector b = arguments[ 1 ] ;
if ( ! ( a instanceof ListObjectInspector ) || ! ( b instanceof StringObjectInspector ) ) {
throw new UDFArgumentException ( "UdfDemoThreeJin Input Error, first argument must be a list / array, second argument must be a string" ) ;
}
this . listOI = ( ListObjectInspector ) a;
this . elementOI = ( StringObjectInspector ) b;
if ( ! ( listOI. getListElementObjectInspector ( ) instanceof StringObjectInspector ) ) {
throw new UDFArgumentException ( "UdfDemoThreeJin Input Error,first argument must be a list of strings" ) ;
}
return PrimitiveObjectInspectorFactory . javaBooleanObjectInspector;
}
@Override
public Object evaluate ( DeferredObject [ ] arguments) throws HiveException {
@SuppressWarnings ( "unchecked" )
List < String > list = ( List < String > ) this . listOI. getList ( arguments[ 0 ] . get ( ) ) ;
String arg = elementOI. getPrimitiveJavaObject ( arguments[ 1 ] . get ( ) ) ;
if ( list == null || arg == null ) {
return null ;
}
for ( String s: list) {
if ( arg. equals ( s) ) return new Boolean ( true ) ;
}
return new Boolean ( false ) ;
}
@Override
public String getDisplayString ( String [ ] arg0) {
return "arrayContainsExample()" ;
}
}
测试
import java. io. IOException ;
import java. util. ArrayList ;
import java. util. List ;
import org. apache. hadoop. hive. ql. metadata. HiveException ;
import org. apache. hadoop. hive. ql. udf. generic. GenericUDF. DeferredJavaObject ;
import org. apache. hadoop. hive. ql. udf. generic. GenericUDF. DeferredObject ;
import org. apache. hadoop. hive. serde2. objectinspector. ObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. ObjectInspectorFactory ;
import org. apache. hadoop. hive. serde2. objectinspector. primitive. JavaBooleanObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. primitive. PrimitiveObjectInspectorFactory ;
import org. junit. Assert ;
import org. junit. jupiter. api. Test ;
import com. dinglicom. cn. udf. GenericUDFDemo ;
/** Local test that drives {@code GenericUDFDemo} with plain Java objects. */
public class GenericUDFTest {

    /** Checks a hit, a miss and null arguments, printing each raw result. */
    @Test
    public void testComplexUDFReturnsCorrectValues() throws HiveException, IOException {
        GenericUDFDemo udf = new GenericUDFDemo();

        // Describe the arguments as (list of Java strings, Java string).
        ObjectInspector stringInspector = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        ObjectInspector listInspector = ObjectInspectorFactory.getStandardListObjectInspector(stringInspector);
        ObjectInspector[] argumentInspectors = new ObjectInspector[] {listInspector, stringInspector};
        JavaBooleanObjectInspector booleanInspector =
                (JavaBooleanObjectInspector) udf.initialize(argumentInspectors);

        List<String> letters = new ArrayList<String>();
        letters.add("a");
        letters.add("b");
        letters.add("c");

        // Value present in the list.
        Object found = invoke(udf, letters, "a");
        Assert.assertEquals(true, booleanInspector.get(found));
        System.out.println("result:" + found);

        // Value absent from the list.
        Object missing = invoke(udf, letters, "d");
        Assert.assertEquals(false, booleanInspector.get(missing));
        System.out.println("result:" + missing);

        // Null arguments yield a null result.
        Object nothing = invoke(udf, null, null);
        Assert.assertNull(nothing);
        System.out.println("result:" + nothing);

        udf.close();
    }

    /** Wraps both values as deferred Java objects and evaluates the UDF on them. */
    private static Object invoke(GenericUDFDemo udf, Object first, Object second) throws HiveException {
        DeferredObject[] deferred = new DeferredObject[] {
            new DeferredJavaObject(first), new DeferredJavaObject(second)
        };
        return udf.evaluate(deferred);
    }
}
UDTF代码
UDTF 只能通过继承GenericUDTF实现,代码如下,功能为拆分字符串:第一个参数为待拆分内容,第二个参数为分隔符。 由于涉及流,未找到合适的本地测试方法
package com. dinglicom. cn. udtf ;
import java. util. ArrayList ;
import java. util. List ;
import org. apache. hadoop. hive. ql. exec. UDFArgumentException ;
import org. apache. hadoop. hive. ql. metadata. HiveException ;
import org. apache. hadoop. hive. ql. udf. generic. GenericUDTF ;
import org. apache. hadoop. hive. serde2. objectinspector. ObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. ObjectInspectorFactory ;
import org. apache. hadoop. hive. serde2. objectinspector. StructObjectInspector ;
import org. apache. hadoop. hive. serde2. objectinspector. primitive. PrimitiveObjectInspectorFactory ;
/**
 * Hive UDTF that splits its first argument by its second argument and emits one
 * single-column row per fragment.
 */
public class GenericUDTFDemo extends GenericUDTF {

    /** Single-column output row; cleared and refilled before every forward() call. */
    private final List<String> outList = new ArrayList<>();

    /**
     * Validates the argument count and declares the output schema: one string
     * column named {@code Word}.
     *
     * @param argOIs struct inspector describing the call's arguments
     * @return struct inspector for the emitted rows
     * @throws UDFArgumentException if the function is not called with exactly two arguments
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // process() reads args[0] and args[1]; reject other arities here rather than
        // failing later with an ArrayIndexOutOfBoundsException.
        if (argOIs.getAllStructFieldRefs().size() != 2) {
            throw new UDFArgumentException(
                    "GenericUDTFDemo takes exactly 2 arguments: the text to split and the separator");
        }
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldNames.add("Word");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits {@code args[0]} by {@code args[1]} and forwards each fragment as a row.
     * Emits no rows when either argument is null.
     *
     * @param args the text to split and the separator
     * @throws HiveException if forwarding a row fails
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // A NULL input produces no output rows instead of a NullPointerException.
        if (args == null || args.length < 2 || args[0] == null || args[1] == null) {
            return;
        }
        String arg = args[0].toString();
        String splitKey = args[1].toString();
        // String.split interprets the separator as a regular expression.
        String[] fields = arg.split(splitKey);
        for (String field : fields) {
            outList.clear();
            outList.add(field);
            forward(outList);
        }
    }

    /** No resources to release. */
    @Override
    public void close() throws HiveException {
    }
}
UDAF
UDAF 大都继承AbstractGenericUDAFResolver类,该类在hive3中已标记为废弃,未找到其他途径,暂不做说明。
hive 创建udf
udf分临时和永久两种:临时udf在会话关闭时结束生命周期,再次使用需重新注册;
永久udf需要指定数据库,如果不use数据库直接创建,则该函数默认添加到default库下。
临时udf
# Open the Hive client
hive
# Add the jar
hive> add jar /home/data/hive-udf-0.0.1-SNAPSHOT.jar;
# Create a temporary UDF
hive> create temporary function myEqual as 'io.myudf.UdfDemo';
# Show the UDF's information
hive> desc function myEqual;
# Test the UDF; it is removed automatically on exit
hive> select myEqual('hello','hello');
永久udf
上传jar包到hadoop集群:hdfs dfs -put hive-udf-0.0.1-SNAPSHOT.jar /user/
登录hive客户端完成操作
# Open the Hive client
hive
# Add the jar from the cluster; the path matches the /user/ directory it was uploaded to
hive> add jar hdfs://node1:9000/user/hive-udf-0.0.1-SNAPSHOT.jar;
# Switch to the demo database; a permanent UDF is registered per database
hive> use demo;
# Create the permanent UDF; USING JAR records the jar location so that other
# sessions can load the class without running "add jar" first
hive> create function myEqual as 'io.myudf.UdfDemo' using jar 'hdfs://node1:9000/user/hive-udf-0.0.1-SNAPSHOT.jar';
# Show the UDF's information
hive> desc function myEqual;
# Test the UDF
hive> select myEqual('hello','hello');
# Drop the UDF
hive> drop function IF EXISTS myEqual;
UDF操作语句
搜索month相关udf:show functions like '*month*';
添加jar包:add jar hdfs://hadoop102:8020/user/hive/hive-udf-0.0.1-SNAPSHOT.jar;(端口和地址在hadoop core-site.xml中)
删除jar包:delete jar hdfs://hadoop102:8020/user/hive/hive-udf-0.0.1-SNAPSHOT.jar;
创建永久udf:create function udf名 as '包名.类名';
创建临时udf:create temporary function udf名 as '包名.类名';
查看udf用法:desc function udf名;
查看udf函数详细说明并举例:desc function extended udf名;
删除永久UDF:DROP FUNCTION [IF EXISTS] udf名;
删除临时UDF:DROP temporary FUNCTION [IF EXISTS] udf名;
异常
新注册的udf在部分客户端提示不存在时,解决方法:执行 reload function; 刷新UDF信息。
总结
永久udf必须指定数据库,不use 数据库创建永久udf,默认添加到default库下。 因为一些原因,博客停更了一段时间,后续将开启不定时更新模式,学海无涯,加油。