1. GenericUDF
1.1 编写Apache Hive用户自定义函数(UDF)有两个不同的接口,一个非常简单,另一个…就相对复杂点。
简单API: org.apache.hadoop.hive.ql.exec.UDF
复杂API: org.apache.hadoop.hive.ql.udf.generic.GenericUDF
1.2 在Hive 3.x中,简单API的 UDF 类已被标注为 @Deprecated,所以我们使用复杂API:GenericUDF
1.3 复杂API
// 只调用一次,在任何evaluate()调用之前,你可以接收到一个可以表示函数输入参数类型的object inspectors数组
// 这是你用来验证该函数是否接收正确的参数类型和参数个数的地方
abstract ObjectInspector initialize(ObjectInspector[] arguments);
// 这个类似于简单API的evaluate方法,它可以读取输入数据和返回结果
abstract Object evaluate(GenericUDF.DeferredObject[] arguments);
// 该方法无关紧要,我们可以返回任何东西,但应当是描述该方法的字符串
abstract String getDisplayString(String[] children);
2. 示例
我将通过建立一个UDF函数 BaseFieldUDF 来加深对该API的了解。该函数接收两个参数,类型都是String:
第一个是形如“服务器时间|JSON”的日志行,第二个是以逗号分隔的待提取字段名列表。
package cn.linann.udf;
import org.json.JSONException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.json.JSONObject;
/**
 * Hive generic UDF that flattens the common fields of a raw log line into one
 * tab-separated string.
 *
 * <p>The first argument is a log line of the form {@code <serverTime>|<json>}; the second
 * is a comma-separated list of keys to read from the JSON object stored under {@code "cm"}.
 * The result contains, in order: one column per requested key, the value stored under
 * {@code "et"}, and the server time. Every column (including the last) is followed by a
 * tab, and absent values are emitted as empty columns so the column count is fixed.
 *
 * <p>{@code null} is returned when either argument is {@code null}, when the line has no
 * {@code '|'} separator or a blank payload, or when the payload is not a JSON object
 * containing a {@code "cm"} object.
 */
public class BaseFieldUDF extends GenericUDF {

    /** Separator written after every output column. */
    private static final String FIELD_SEPARATOR = "\t";

    // Populated by initialize(); kept transient because they are per-query runtime state.
    private transient StringObjectInspector lineInspector;
    private transient StringObjectInspector keysInspector;

    /**
     * Validates that exactly two string arguments were supplied and remembers their
     * inspectors for use in {@link #evaluate}.
     *
     * @param arguments inspectors describing the call's arguments
     * @return the Java {@code String} inspector describing this function's result
     * @throws UDFArgumentLengthException if the argument count is not 2
     * @throws UDFArgumentException if either argument is not a string
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 2) {
            throw new UDFArgumentLengthException("BaseFieldUDF only takes 2 arguments: String, String");
        }
        ObjectInspector first = arguments[0];
        ObjectInspector second = arguments[1];
        if (!(first instanceof StringObjectInspector) || !(second instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument and second argument must be a string");
        }
        this.lineInspector = (StringObjectInspector) first;
        this.keysInspector = (StringObjectInspector) second;
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    /**
     * Builds the tab-separated row for one input line.
     *
     * @param arguments {@code [0]} the raw log line, {@code [1]} the comma-separated key list
     * @return the flattened row, or {@code null} if the input is null or malformed
     * @throws HiveException if a deferred argument cannot be resolved
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        String line = lineInspector.getPrimitiveJavaObject(arguments[0].get());
        String keyList = keysInspector.getPrimitiveJavaObject(arguments[1].get());
        if (line == null || keyList == null) {
            // SQL NULL in, SQL NULL out (previously a NullPointerException).
            return null;
        }

        // Split on the FIRST '|' only, so a '|' inside the JSON payload does not
        // cause an otherwise valid line to be rejected.
        int separatorIndex = line.indexOf('|');
        if (separatorIndex < 0) {
            return null;
        }
        String serverTime = line.substring(0, separatorIndex);
        String payload = line.substring(separatorIndex + 1);
        if (StringUtils.isBlank(payload)) {
            return null;
        }

        JSONObject root;
        JSONObject common;
        try {
            root = new JSONObject(payload);
            common = root.getJSONObject("cm");
        } catch (JSONException ignored) {
            // An unparseable payload is reported as NULL, like any other malformed line,
            // rather than dumping a stack trace per row and returning a partial result.
            return null;
        }

        StringBuilder row = new StringBuilder();
        for (String key : keyList.split(",")) {
            // optString never throws: a missing key becomes an empty column.
            row.append(common.optString(key.trim(), "")).append(FIELD_SEPARATOR);
        }
        // "et" may be absent or a non-string value; optString keeps the column count
        // stable instead of aborting before the server time is written.
        row.append(root.optString("et", "")).append(FIELD_SEPARATOR);
        row.append(serverTime).append(FIELD_SEPARATOR);
        return row.toString();
    }

    /**
     * Returns the textual form of a call to this function, including its argument
     * expressions.
     *
     * @param children display strings of the argument expressions
     * @return a string such as {@code BaseFieldUDF(a, b)}
     */
    @Override
    public String getDisplayString(String[] children) {
        return "BaseFieldUDF(" + String.join(", ", children) + ")";
    }

    /**
     * Manual smoke test: evaluates the function on one sample line and prints the number
     * of tab-separated columns in the result.
     */
    public static void main(String[] args) throws HiveException {
        BaseFieldUDF example = new BaseFieldUDF();
        ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        String sampleLine = "1541217850324|{\"cm\":{\"ln\":\"-90.4\",\"sv\":\"V2.2.7\",\"os\":\"8.2.4\",\"g\":\"DP127CN4@gmail.com\",\"mid\":\"447\",\"nw\":\"WIFI\",\"l\":\"en\",\"vc\":\"0\",\"hw\":\"640*1136\",\"ar\":\"MX\",\"uid\":\"447\",\"t\":\"1604451309064\",\"la\":\"14.7\",\"md\":\"sumsung-17\",\"vn\":\"1.2.4\",\"ba\":\"Sumsung\",\"sr\":\"H\"},\"ap\":\"app\"}";
        String sampleKeys = "mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t";

        JavaStringObjectInspector resultInspector =
                (JavaStringObjectInspector) example.initialize(new ObjectInspector[]{stringOI, stringOI});
        Object result = example.evaluate(new DeferredObject[]{
                new DeferredJavaObject(sampleLine),
                new DeferredJavaObject(sampleKeys)});
        System.out.println(resultInspector.getPrimitiveJavaObject(result).split("\t").length);
    }
}