写hive的udf函数

最新推荐文章于 2024-07-31 21:45:00 发布

anningzhu

最新推荐文章于 2024-07-31 21:45:00 发布

阅读量1k

点赞数

hive 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

最近体会到了Hive的udf函数的强大威力：不仅可以使用很多已有的udf函数，还可以自己定义符合业务场景的udf函数。下面就说一下如何写udf/udaf/udtf函数，算是一个入门介绍吧。

First, you need to create a new class that extends UDF, with one or more methods named evaluate.


  
  
   
   
    
    
     
     [html] 
     
     view plain
     
      copy
     
     
     
      
    
    
   
   
   
   package com.example.hive.udf;  
  
import org.apache.hadoop.hive.ql.exec.UDF;  
import org.apache.hadoop.io.Text;  
  
public final class Lower extends UDF {  
  public Text evaluate(final Text s) {  
    if (s == null) { return null; }  
    return new Text(s.toString().toLowerCase());  
  }  
}

After compiling your code to a jar, you need to add this to the hive classpath.

[html]view plaincopy 
   
 add jar my_jar.jar;  

Once hive is started up with your jars in the classpath, the final step is to register your function:


  
  
   
   
    
    
     
     [html] 
     
     view plain
     
      copy
     
     
     
      
    
    
   
   
   
   create temporary function my_lower as 'com.example.hive.udf.Lower';

上面主要描述了实现一个udf的过程：首先自然是实现一个UDF类，然后编译为jar并加入到hive的classpath中，最后创建一个临时函数名，以便在hive中调用。

下面这个表格可以更加清晰地看出udf/udaf/udtf之间的区别（简单来说：UDF是一行输入对应一行输出，UDAF是多行输入聚合为一行输出，UDTF是一行输入产生多行输出）。

Show几个例子：

1） UDF （参考：http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udf/）

[html]view plaincopy 
   
 package org.apache.hadoop.hive.contrib.udf.example;  
   
 import org.apache.hadoop.hive.ql.exec.UDF;  
   
 /**  
  * UDFExampleAdd.  
  *  
  */  
 public class UDFExampleAdd extends UDF {  
   
   public Integer evaluate(Integer... a) {  
     int total = 0;  
     for (Integer element : a) {  
       if (element != null) {  
         total += element;  
       }  
     }  
     return total;  
   }  
   
   public Double evaluate(Double... a) {  
     double total = 0;  
     for (Double element : a) {  
       if (element != null) {  
         total += element;  
       }  
     }  
     return total;  
   }  
   
 }  

2）UDAF（ http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udaf/ ）

[html]view plaincopy 
   
 package org.apache.hadoop.hive.contrib.udaf.example;  
   
 import org.apache.hadoop.hive.ql.exec.UDAF;  
 import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;  
   
 /**  
  * This is a simple UDAF that calculates average.  
  *   
  * It should be very easy to follow and can be used as an example for writing  
  * new UDAFs.  
  *   
  * Note that Hive internally uses a different mechanism (called GenericUDAF) to  
  * implement built-in aggregation functions, which are harder to program but  
  * more efficient.  
  *   
  */  
 public final class UDAFExampleAvg extends UDAF {  
   
   /**  
    * The internal state of an aggregation for average.  
    *   
    * Note that this is only needed if the internal state cannot be represented  
    * by a primitive.  
    *   
    * The internal state can also contains fields with types like  
    * ArrayList<String> and HashMap<String,Double> if needed.  
    */  
   public static class UDAFAvgState {  
     private long mCount;  
     private double mSum;  
   }  
   
   /**  
    * The actual class for doing the aggregation. Hive will automatically look  
    * for all internal classes of the UDAF that implements UDAFEvaluator.  
    */  
   public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {  
   
     UDAFAvgState state;  
   
     public UDAFExampleAvgEvaluator() {  
       super();  
       state = new UDAFAvgState();  
       init();  
     }  
   
     /**  
      * Reset the state of the aggregation.  
      */  
     public void init() {  
       state.mSum = 0;  
       state.mCount = 0;  
     }  
   
     /**  
      * Iterate through one row of original data.  
      *   
      * The number and type of arguments need to the same as we call this UDAF  
      * from Hive command line.  
      *   
      * This function should always return true.  
      */  
     public boolean iterate(Double o) {  
       if (o != null) {  
         state.mSum += o;  
         state.mCount++;  
       }  
       return true;  
     }  
   
     /**  
      * Terminate a partial aggregation and return the state. If the state is a  
      * primitive, just return primitive Java classes like Integer or String.  
      */  
     public UDAFAvgState terminatePartial() {  
       // This is SQL standard - average of zero items should be null.  
       return state.mCount == 0 ? null : state;  
     }  
   
     /**  
      * Merge with a partial aggregation.  
      *   
      * This function should always have a single argument which has the same  
      * type as the return value of terminatePartial().  
      */  
     public boolean merge(UDAFAvgState o) {  
       if (o != null) {  
         state.mSum += o.mSum;  
         state.mCount += o.mCount;  
       }  
       return true;  
     }  
   
     /**  
      * Terminates the aggregation and return the final result.  
      */  
     public Double terminate() {  
       // This is SQL standard - average of zero items should be null.  
       return state.mCount == 0 ? null : Double.valueOf(state.mSum  
           / state.mCount);  
     }  
   }  
   
   private UDAFExampleAvg() {  
     // prevent instantiation  
   }  
   
 }  

3）UDTF（ http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udtf/ ）

[html]view plaincopy 
   
 package org.apache.hadoop.hive.contrib.udtf.example;  
   
 import java.util.ArrayList;  
 import java.util.List;  
   
 import org.apache.hadoop.hive.ql.exec.Description;  
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;  
 import org.apache.hadoop.hive.ql.metadata.HiveException;  
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;  
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;  
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;  
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;  
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;  
   
 /**  
  * GenericUDTFExplode2.  
  *  
  */  
 @Description(name = "explode2",  
     value = "_FUNC_(a) - like explode, but outputs two identical columns (for testing purposes)")  
 public class GenericUDTFExplode2 extends GenericUDTF {  
   
   ListObjectInspector listOI = null;  
   
   @Override  
   public void close() throws HiveException {  
   }  
   
   @Override  
   public StructObjectInspector initialize(ObjectInspector[] args)  
       throws UDFArgumentException {  
   
     if (args.length != 1) {  
       throw new UDFArgumentException("explode() takes only one argument");  
     }  
   
     if (args[0].getCategory() != ObjectInspector.Category.LIST) {  
       throw new UDFArgumentException("explode() takes an array as a parameter");  
     }  
     listOI = (ListObjectInspector) args[0];  
   
     ArrayList<String> fieldNames = new ArrayList<String>();  
     ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();  
     fieldNames.add("col1");  
     fieldNames.add("col2");  
     fieldOIs.add(listOI.getListElementObjectInspector());  
     fieldOIs.add(listOI.getListElementObjectInspector());  
     return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,  
         fieldOIs);  
   }  
   
   Object forwardObj[] = new Object[2];  
   
   @Override  
   public void process(Object[] o) throws HiveException {  
   
     List<?> list = listOI.getList(o[0]);  
     for (Object r : list) {  
       forwardObj[0] = r;  
       forwardObj[1] = r;  
       forward(forwardObj);  
     }  
   }  
   
   @Override  
   public String toString() {  
     return "explode";  
   }  
 }  

anningzhu

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
写hive的udf函数

最近感受了Hive的udf函数的强大威力了，不仅可以使用很多已经有的udf函数，还可以自己定义符合业务场景的udf函数，下面就说一下如何写udf/udaf/udtf函数，算是一个入门介绍吧。First, you need to create a new class that extends UDF, with one or more methods named evaluate.
复制链接

扫一扫