最近感受了hive的udf函数的强大威力了,不仅可以使用很多已经有的udf函数,还可以自己定义符合业务场景的udf函数,下面就说一下如何写udf/udaf/udtf函数,算是一个入门介绍吧。
First, you need to create a new class that extends UDF, with one or more methods named evaluate.
package com.example.hive.udf; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; public final class Lower extends UDF { public Text evaluate(final Text s) { if (s == null) { return null; } return new Text(s.toString().toLowerCase()); } }
After compiling your code to a jar, you need to add this to the hive classpath.
add jar my_jar.jar;
Once hive is started up with your jars in the classpath, the final step is to register your function
create temporary function my_lower as 'com.example.hive.udf.Lower';
上面主要描述了实现一个udf的过程,首先自然是实现一个UDF函数,然后编译为jar并加入到hive的classpath中,最后创建一个临时变量名字让hive中调用。
下面这个表格可以更加清晰的看出udf/udaf/udtf之间的区别
Show几个例子:
1) UDF (参考:http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udf/)
package org.apache.hadoop.hive.contrib.udf.example;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
* UDFExampleAdd.
*
*/
public class UDFExampleAdd extends UDF {
public Integer evaluate(Integer... a) {
int total = 0;
for (Integer element : a) {
if (element != null) {
total += element;
}
}
return total;
}
public Double evaluate(Double... a) {
double total = 0;
for (Double element : a) {
if (element != null) {
total += element;
}
}
return total;
}
}
2)UDAF( http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udaf/)
package org.apache.hadoop.hive.contrib.udaf.example;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
/**
* This is a simple UDAF that calculates average.
*
* It should be very easy to follow and can be used as an example for writing
* new UDAFs.
*
* Note that Hive internally uses a different mechanism (called GenericUDAF) to
* implement built-in aggregation functions, which are harder to program but
* more efficient.
*
*/
public final class UDAFExampleAvg extends UDAF {
/**
* The internal state of an aggregation for average.
*
* Note that this is only needed if the internal state cannot be represented
* by a primitive.
*
* The internal state can also contains fields with types like
* ArrayList<String> and HashMap<String,Double> if needed.
*/
public static class UDAFAvgState {
private long mCount;
private double mSum;
}
/**
* The actual class for doing the aggregation. Hive will automatically look
* for all internal classes of the UDAF that implements UDAFEvaluator.
*/
public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {
UDAFAvgState state;
public UDAFExampleAvgEvaluator() {
super();
state = new UDAFAvgState();
init();
}
/**
* Reset the state of the aggregation.
*/
public void init() {
state.mSum = 0;
state.mCount = 0;
}
/**
* Iterate through one row of original data.
*
* The number and type of arguments need to the same as we call this UDAF
* from Hive command line.
*
* This function should always return true.
*/
public boolean iterate(Double o) {
if (o != null) {
state.mSum += o;
state.mCount++;
}
return true;
}
/**
* Terminate a partial aggregation and return the state. If the state is a
* primitive, just return primitive Java classes like Integer or String.
*/
public UDAFAvgState terminatePartial() {
// This is SQL standard - average of zero items should be null.
return state.mCount == 0 ? null : state;
}
/**
* Merge with a partial aggregation.
*
* This function should always have a single argument which has the same
* type as the return value of terminatePartial().
*/
public boolean merge(UDAFAvgState o) {
if (o != null) {
state.mSum += o.mSum;
state.mCount += o.mCount;
}
return true;
}
/**
* Terminates the aggregation and return the final result.
*/
public Double terminate() {
// This is SQL standard - average of zero items should be null.
return state.mCount == 0 ? null : Double.valueOf(state.mSum
/ state.mCount);
}
}
private UDAFExampleAvg() {
// prevent instantiation
}
}
3)UDTF( http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udtf/)
package org.apache.hadoop.hive.contrib.udtf.example;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
/**
* GenericUDTFExplode2.
*
*/
@Description(name = "explode2",
value = "_FUNC_(a) - like explode, but outputs two identical columns (for testing purposes)")
public class GenericUDTFExplode2 extends GenericUDTF {
ListObjectInspector listOI = null;
@Override
public void close() throws HiveException {
}
@Override
public StructObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException {
if (args.length != 1) {
throw new UDFArgumentException("explode() takes only one argument");
}
if (args[0].getCategory() != ObjectInspector.Category.LIST) {
throw new UDFArgumentException("explode() takes an array as a parameter");
}
listOI = (ListObjectInspector) args[0];
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("col1");
fieldNames.add("col2");
fieldOIs.add(listOI.getListElementObjectInspector());
fieldOIs.add(listOI.getListElementObjectInspector());
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}
Object forwardObj[] = new Object[2];
@Override
public void process(Object[] o) throws HiveException {
List<?> list = listOI.getList(o[0]);
for (Object r : list) {
forwardObj[0] = r;
forwardObj[1] = r;
forward(forwardObj);
}
}
@Override
public String toString() {
return "explode";
}
}
Ref: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF