最近感受了Hive的udf函数的强大威力了,不仅可以使用很多已经有的udf函数,还可以自己定义符合业务场景的udf函数,下面就说一下如何写udf/udaf/udtf函数,算是一个入门介绍吧。
First, you need to create a new class that extends UDF, with one or more methods named evaluate.
[html]
view plain
copy
- package com.example.hive.udf;
-
- import org.apache.hadoop.hive.ql.exec.UDF;
- import org.apache.hadoop.io.Text;
-
- /**
-  * Example Hive UDF that converts a string to lower case.
-  */
- public final class Lower extends UDF {
-   /**
-    * Lower-cases the given text.
-    *
-    * @param s the input value; may be null
-    * @return a new Text holding the lower-cased string, or null when s is null
-    */
-   public Text evaluate(final Text s) {
-     if (s == null) {
-       return null;
-     }
-     // Locale.ROOT keeps the result independent of the JVM default locale
-     // (under a Turkish locale, for example, "I" would otherwise become a dotless i).
-     return new Text(s.toString().toLowerCase(java.util.Locale.ROOT));
-   }
- }
After compiling your code to a jar, you need to add this to the hive classpath.
- add jar my_jar.jar;
Once hive is started up with your jars in the classpath, the final step is to register your function:
[html]
view plain
copy
- create temporary function my_lower as 'com.example.hive.udf.Lower';
上面主要描述了实现一个udf的过程:首先实现一个继承UDF的类,然后编译为jar并加入到hive的classpath中,最后创建一个临时函数,供hive查询中调用。
下面这个表格可以更加清晰地看出udf/udaf/udtf之间的区别(简单来说:UDF是一行输入对应一行输出,UDAF是多行输入聚合为一行输出,UDTF是一行输入产生多行输出)。
Show几个例子:
1) UDF (参考:http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udf/)
- package org.apache.hadoop.hive.contrib.udf.example;
- import org.apache.hadoop.hive.ql.exec.UDF;
- /**
- * UDFExampleAdd.
- *
- */
- public class UDFExampleAdd extends UDF {
- public Integer evaluate(Integer... a) {
- int total = 0;
- for (Integer element : a) {
- if (element != null) {
- total += element;
- }
- }
- return total;
- }
- public Double evaluate(Double... a) {
- double total = 0;
- for (Double element : a) {
- if (element != null) {
- total += element;
- }
- }
- return total;
- }
- }
2)UDAF( http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udaf/ )
- package org.apache.hadoop.hive.contrib.udaf.example;
- import org.apache.hadoop.hive.ql.exec.UDAF;
- import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
- /**
-  * A simple example UDAF that computes an average.
-  *
-  * It is meant to be easy to read and to serve as a template for writing new
-  * UDAFs.
-  *
-  * Note that Hive internally uses a different mechanism (called GenericUDAF) to
-  * implement built-in aggregation functions, which are harder to program but
-  * more efficient.
-  */
- public final class UDAFExampleAvg extends UDAF {
-   /**
-    * Internal aggregation state for the average: a row count and a running sum.
-    *
-    * A dedicated state class is only needed when the state cannot be
-    * represented by a primitive. The state may also contain fields with types
-    * like ArrayList<String> and HashMap<String,Double> if needed.
-    */
-   public static class UDAFAvgState {
-     private long mCount;
-     private double mSum;
-   }
-   /**
-    * The class that performs the aggregation. Hive automatically looks for all
-    * internal classes of the UDAF that implement UDAFEvaluator.
-    */
-   public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {
-     UDAFAvgState state;
-     /**
-      * Creates the evaluator with a freshly reset state.
-      */
-     public UDAFExampleAvgEvaluator() {
-       state = new UDAFAvgState();
-       init();
-     }
-     /**
-      * Resets the aggregation state to "no rows seen".
-      */
-     public void init() {
-       state.mCount = 0;
-       state.mSum = 0;
-     }
-     /**
-      * Consumes one row of original data.
-      *
-      * The number and type of arguments need to be the same as when this UDAF
-      * is called from the Hive command line.
-      *
-      * @param o the value of the current row; null values are ignored
-      * @return always true
-      */
-     public boolean iterate(Double o) {
-       if (o == null) {
-         return true;
-       }
-       state.mSum += o;
-       state.mCount++;
-       return true;
-     }
-     /**
-      * Ends a partial aggregation and hands back its state. If the state were a
-      * primitive, a primitive Java class like Integer or String would be
-      * returned instead.
-      *
-      * @return the current state, or null when no rows were aggregated
-      */
-     public UDAFAvgState terminatePartial() {
-       // This is SQL standard - average of zero items should be null.
-       if (state.mCount == 0) {
-         return null;
-       }
-       return state;
-     }
-     /**
-      * Folds a partial aggregation into this one.
-      *
-      * This function should always have a single argument which has the same
-      * type as the return value of terminatePartial().
-      *
-      * @param o the partial state to merge; null is ignored
-      * @return always true
-      */
-     public boolean merge(UDAFAvgState o) {
-       if (o == null) {
-         return true;
-       }
-       state.mSum += o.mSum;
-       state.mCount += o.mCount;
-       return true;
-     }
-     /**
-      * Ends the aggregation and produces the final result.
-      *
-      * @return the sum divided by the count, or null when no rows were aggregated
-      */
-     public Double terminate() {
-       // This is SQL standard - average of zero items should be null.
-       if (state.mCount == 0) {
-         return null;
-       }
-       return Double.valueOf(state.mSum / state.mCount);
-     }
-   }
-   private UDAFExampleAvg() {
-     // prevent instantiation
-   }
- }
3)UDTF( http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udtf/ )
- package org.apache.hadoop.hive.contrib.udtf.example;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.hadoop.hive.ql.exec.Description;
- import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
- import org.apache.hadoop.hive.ql.metadata.HiveException;
- import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
- import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
- import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
- import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
- import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
- /**
-  * GenericUDTFExplode2.
-  *
-  * Example UDTF that takes a single array argument and forwards one output row
-  * per array element, with the element repeated in two columns named "col1"
-  * and "col2".
-  */
- @Description(name = "explode2",
-     value = "_FUNC_(a) - like explode, but outputs two identical columns (for testing purposes)")
- public class GenericUDTFExplode2 extends GenericUDTF {
-   // Inspector for the array argument; assigned in initialize().
-   ListObjectInspector listOI = null;
-   // Reused output row holding the two identical column values.
-   Object[] forwardObj = new Object[2];
-   @Override
-   public void close() throws HiveException {
-     // Nothing to clean up.
-   }
-   /**
-    * Validates the argument list and describes the output row.
-    *
-    * @param args inspectors for the call arguments; exactly one LIST-category
-    *        inspector is expected
-    * @return a struct inspector with two fields, "col1" and "col2", both typed
-    *         as the list's element type
-    * @throws UDFArgumentException if the argument count is not one or the
-    *         argument is not a list
-    */
-   @Override
-   public StructObjectInspector initialize(ObjectInspector[] args)
-       throws UDFArgumentException {
-     if (args.length != 1) {
-       throw new UDFArgumentException("explode() takes only one argument");
-     }
-     if (args[0].getCategory() != ObjectInspector.Category.LIST) {
-       throw new UDFArgumentException("explode() takes an array as a parameter");
-     }
-     listOI = (ListObjectInspector) args[0];
-     List<String> fieldNames = new ArrayList<String>();
-     List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
-     fieldNames.add("col1");
-     fieldNames.add("col2");
-     fieldOIs.add(listOI.getListElementObjectInspector());
-     fieldOIs.add(listOI.getListElementObjectInspector());
-     return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
-         fieldOIs);
-   }
-   /**
-    * Forwards one two-column row for every element of the input array.
-    *
-    * @param o the call arguments; o[0] is read through the list inspector
-    * @throws HiveException if forwarding a row fails
-    */
-   @Override
-   public void process(Object[] o) throws HiveException {
-     List<?> list = listOI.getList(o[0]);
-     if (list == null) {
-       // A NULL array value yields no list to iterate; emit no rows instead of
-       // failing with a NullPointerException.
-       return;
-     }
-     for (Object r : list) {
-       forwardObj[0] = r;
-       forwardObj[1] = r;
-       forward(forwardObj);
-     }
-   }
-   @Override
-   public String toString() {
-     return "explode";
-   }
- }