1. Create the project
In IDEA: New -> Project, select Maven and a JDK -> Next, then set the project Name.
2. Add the dependencies
Writing a UDF requires the org.apache.hive:hive-exec dependency. Since I need to process JSON data here, I also added fastjson (the fastjson 2.0.x artifact depends on fastjson2, which provides the com.alibaba.fastjson2 API imported in the code below).
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<!-- The production environment already has this jar, so it only needs to be available at compile time and must not be packed into the output jar; comment this out when testing -->
<scope>provided</scope>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>2.0.6</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
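<!-- Strip signature files inherited from dependencies; stale signatures in an uber-jar trigger "Invalid signature file digest" SecurityExceptions when the jar is loaded -->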
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
3. Write the function
Create a class that extends GenericUDF and implement its three methods: initialize(), evaluate(), and getDisplayString(). The actual logic goes in evaluate(); initialize() runs before evaluate() and is where the arguments can be validated. When you are done, test it with a main() method (or, more formally, a unit test; see the sketch after the class below). My code is as follows:
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
public class AddJsonObject extends GenericUDF {
/**
* Initialize: validate the arguments.
*
* First argument: the json_str that fields will be added to
* Second argument: the json_str whose fields are added into the first
* Third argument: fields to exclude (not supported yet)
*/
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors)
throws UDFArgumentException {
if (objectInspectors.length != 2){
throw new UDFArgumentException("The number of parameters is not 2");
}
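// (The argument types could also be validated here via the ObjectInspectors,
// e.g. checking that both arguments are primitive strings.)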
return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
}
/**
* Process the data:
* add the fields of argument 2 into argument 1.
* By default, the rowkey and timestamp fields of argument 2 are not added.
*/
@Override
public String evaluate(DeferredObject[] deferredObjects) throws HiveException {
if (deferredObjects[0].get() == null || deferredObjects[0].get().toString().length() == 0) {
if (deferredObjects[1].get() != null && deferredObjects[1].get().toString().length() != 0) {
return deferredObjects[1].get().toString();
} else {
// Bad data, i.e. the join failed; return null
return null;
}
} else if (deferredObjects[0].get() != null && deferredObjects[0].get().toString().length() != 0) {
if (deferredObjects[1].get() == null || deferredObjects[1].get().toString().length() == 0) {
return deferredObjects[0].get().toString();
} else {
// Parse argument 1 into a json object
String return_json_str = deferredObjects[0].get().toString();
JSONObject jsonObject = JSON.parseObject(return_json_str);
// Parse argument 2 into a map (fastjson2's JSONObject is a Map<String, Object>)
String add_ob_str = deferredObjects[1].get().toString();
JSONObject addObject = JSON.parseObject(add_ob_str);
for (Map.Entry<String, Object> entry : addObject.entrySet()) {
// Skip the excluded fields
if ("rowkey".equals(entry.getKey()) || "timestamp".equals(entry.getKey())) {
continue;
}
// Add the field to the target json (null-safe, unlike calling toString() on the value directly)
Object value = entry.getValue();
jsonObject.put(entry.getKey(), value == null ? null : value.toString());
}
return JSON.toJSONString(jsonObject);
}
}
}
// Unreachable in practice: the two branches above are exhaustive, but the compiler requires a return
return null;
}
@Override
public String getDisplayString(String[] strings) {
// Displayed in EXPLAIN output, so return something meaningful instead of null
return "add_json_object(" + String.join(", ", strings) + ")";
}
public static void main(String[] args) throws HiveException {
AddJsonObject addJsonObject = new AddJsonObject();
DeferredObject[] deferredObjects = new DeferredObject[2];
// deferredObjects[0] = new DeferredJavaObject("{\"a\":\"b\"}");
// deferredObjects[1] = new DeferredJavaObject("{\"timestamp\":\"axjffe\"}");
deferredObjects[0] = new DeferredJavaObject("");
deferredObjects[1] = new DeferredJavaObject("");
Object return_obj = addJsonObject.evaluate(deferredObjects);
System.out.println("返回字符串为: " + return_obj.toString());
}
}
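For a more formal check than the main() method above, a minimal JUnit 4 test sketch could look like the following (this assumes a test-scoped junit:junit dependency in the POM; the test class and assertions are illustrative, not part of the original project):

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.junit.Assert;
import org.junit.Test;

public class AddJsonObjectTest {
    @Test
    public void mergesSecondJsonIntoFirstAndSkipsExcludedKeys() throws Exception {
        AddJsonObject udf = new AddJsonObject();
        DeferredObject[] args = new DeferredObject[]{
            new DeferredJavaObject("{\"a\":\"b\"}"),
            new DeferredJavaObject("{\"c\":\"d\",\"timestamp\":\"x\"}")
        };
        // "c" is merged in, while the excluded "timestamp" key is skipped
        String out = udf.evaluate(args);
        Assert.assertTrue(out.contains("\"c\":\"d\""));
        Assert.assertFalse(out.contains("timestamp"));
    }
}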
4. Package
Build the project, then go to the project's target directory, find the shaded jar, and upload it to the server where Hive runs. A command-line build is shown below.
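If you build from the command line instead of IDEA's Maven panel, the package phase (which the shade execution above is bound to) produces the shaded jar under target/:

mvn clean package

Because hive-exec is scoped as provided, only fastjson ends up bundled in the output jar.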
5. Test
On the server, open the spark-sql CLI and run the following commands:
-- Add the jar; this is a local path on the server, not an HDFS path
add jar /home/zhangsan/hiveudf-1.0-SNAPSHOT.jar;
-- Create a temporary function
create temporary function add_json_object as "com.myudf.AddJsonObject";
select
add_json_object(column1,column2)
from table_test;
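Given the logic above, a quick sanity check with string literals should behave as follows (expected output shown as a comment; the rowkey and timestamp keys of the second argument are dropped):

select add_json_object('{"a":"b"}', '{"c":"d","rowkey":"r1"}');
-- expected: {"a":"b","c":"d"}

To register the function permanently instead of per session, Hive also supports create function ... as 'com.myudf.AddJsonObject' using jar '<hdfs path>', with the jar uploaded to HDFS first.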
6. Reference
IDEA maven开发hive的udf详细过程(附图片详解), Lens5935's blog on CSDN