自定义UDF
在Hive中,如果Hive原生提供的函数不能够处理数据,那么Hive允许用户自定义函数,在Hive3.X中,需要定义类继承GenericUDF类
自定义udf,添加pom依赖
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>3.1.2</version>
<exclusions>
<exclusion>
<groupId>org.glassfish</groupId>
<artifactId>javax.el</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
<version>3.1.2</version>
</dependency>
</dependencies>
编写代码
package com.test;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
// 传入两个字符串,获取第二个字符串在第一个字符串中第一次出现的下标位置
public class SelfUDF extends GenericUDF {
// 初始化 - evaluate方法的返回值类型
public ObjectInspector initialize(ObjectInspector[] ois) throws UDFArgumentException {
// 确定参数个数
if(ois.length != 2)
throw new UDFArgumentException("参数个数必须为2!!!");
// 返回结果,这个结果决定了函数的返回值类型
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
}
// 函数要执行的逻辑需要覆盖在这个方法中
public Object evaluate(DeferredObject[] dob) throws HiveException {
// 获取第一个字符串
String str = dob[0].get().toString();
// 获取第二个字符串
String sub = dob[1].get().toString();
// 获取下标位置
return str.indexOf(sub);
}
@Override
public String getDisplayString(String[] s) {
return null;
}
}
上传到服务器上,在hive中添加jar包
hive> add jar /HIVE-1.0-SNAPSHOT.jar;
创建临时函数
create temporary function stringindexof as 'com.test.SelfUDF';
select stringindexof('abcdef','de');
自定义UDTF
UDTF解决输入一行,输出多行的需求
输入 "hello,world,zhangsan,shanghai" 和分隔符 ","，输出四行：hello、world、zhangsan、shanghai（每个单词一行）
编写代码
// Package must match the class name used in
// CREATE TEMPORARY FUNCTION ... AS 'com.test.MyUDTF';
// without it, Hive fails with ClassNotFoundException.
package com.test;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * UDTF that turns one input row into multiple output rows.
 *
 * Example:
 *   input : "hello,world,zhangsan,shanghai", ","
 *   output: hello / world / zhangsan / shanghai (one row per token)
 */
public class MyUDTF extends GenericUDTF {

    // Reusable single-column row buffer. forward() consumes the value
    // immediately, so one array reused per call avoids per-row allocations.
    private final Object[] forwardObj = new Object[1];

    /**
     * Declares the output schema: a single string column named "word".
     *
     * @param argOIs inspectors for the call arguments (unused beyond arity)
     * @return struct inspector describing the one-column output rows
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs)
            throws UDFArgumentException {
        List<String> fieldNames = new ArrayList<>();
        fieldNames.add("word");
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits the first argument by the second and forwards one output row
     * per token. NULL inputs produce no rows instead of an NPE.
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Follow SQL NULL semantics: emit nothing when either input is NULL.
        if (args[0] == null || args[1] == null) {
            return;
        }
        String data = args[0].toString();
        String separator = args[1].toString();
        // NOTE: String.split treats the separator as a regular expression;
        // quote it upstream if it may contain metacharacters such as "." or "|".
        for (String word : data.split(separator)) {
            forwardObj[0] = word;
            forward(forwardObj);
        }
    }

    /** No resources to release. */
    @Override
    public void close() throws HiveException {
    }
}
上传到服务器上,在hive中添加jar包
hive> add jar /HIVE-1.0-SNAPSHOT.jar;
创建临时函数
create temporary function myudtf as 'com.test.MyUDTF';
select myudtf('aa,bb,cc,dd',',');