Hive UDTF

UDTF: a user-defined table-generating function. A table-generating function takes zero or more inputs and produces multiple columns or multiple rows of output.
For example:

select array(1,2,3);

The result is:
[1,2,3]
Now let's use a table-generating function, explode().

Syntax	Description
explode(ARRAY array)	Returns zero or more rows, one row for each element of the input array.
explode(MAP map)	Returns zero or more rows, one row for each key-value pair in the map: one column holds the key and the other holds the value.
select explode(array(1,2,3));

The result is one row per array element:
col
---
1
2
3
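The map form works the same way; a minimal sketch (the column aliases k and v are illustrative):

select explode(map('a',1,'b',2)) as (k, v);

This returns two rows, ('a', 1) and ('b', 2), with the key in one column and the value in the other.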
Our custom UDTF needs to implement similar behavior: turning zero or more inputs into multiple rows of output.

GenericUDTF Interface

A UDTF must extend the GenericUDTF abstract class and implement its initialize, process, and close methods. Hive calls initialize to determine the types of the incoming arguments and the type of each column of the table the UDTF generates (that is, the input and output types). initialize must return an object inspector describing the fields of the generated rows. Once initialize() has been called, Hive passes the UDTF's arguments to process(), which produces row objects and forwards them to the downstream operators. Finally, when all rows have been passed out of the UDTF, Hive calls close().
Example:

package myhive.functions.UDTF;

import com.util.SEtimeToHour;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

/**
 * @Description:    Suppose a user watched from "2019-03-23 20:17:34" to "2019-03-23 22:17:34";
 *                  compute how many seconds were watched within each hour, splitting the
 *                  input interval ("2019-03-23 20:17:34", "2019-03-23 22:17:34") on hour
 *                  boundaries. Output: the hour and the seconds watched in that hour:
 *                  hour	viewsecond
 *                  20	2546
 *                  21	3600
 *                  22	1054
 * @Author:         choosing
 * @Date:           2019/4/20 11:08
 */
public class DateToHour extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length != 2) { // the function must be called with exactly two arguments
            throw new UDFArgumentLengthException("DateToHour takes exactly two arguments");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
                || args[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            // Category distinguishes PRIMITIVE, LIST, MAP, STRUCT and UNION; both arguments must be primitive strings
            throw new UDFArgumentException("DateToHour takes strings as parameters");
        }

        ArrayList<String> fieldNames = new ArrayList<String>(); // column names of the generated table
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(); // object inspectors giving the type of each column of the generated rows
        fieldNames.add("hour"); // the hour of day
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); // PRIMITIVE (string) type
        fieldNames.add("viewsecond"); // seconds watched within that hour
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); // PRIMITIVE (string) type
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); // struct object inspector describing the output rows
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // args holds the values the function was called with
        List<String[]> result;
        try {
            result = SEtimeToHour.setimeToHour(args[0].toString(), args[1].toString()); // turn the arguments into row objects
        } catch (ParseException e) {
            throw new HiveException("Unparseable timestamp", e); // fail the query rather than NPE on a null result
        }
        for (String[] str : result) {
            forward(str); // forward one row object; each loop iteration emits one row
        }

    }

    @Override
    public void close() throws HiveException {

    }

}

package com.util;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

/**
 * @Description:    Splits a time range such as 2019-03-23 20:17:34 .. 2019-03-23 22:17:34 into per-hour viewing seconds
 * @Author:         choosing
 * @Date:           2019/4/20 11:25
 */
public class SEtimeToHour {

    public static List<String[]> setimeToHour(String startTime, String endtime) throws ParseException {
        // the list of (hour, seconds) rows to return
        List<String[]> list = new ArrayList<String[]>();
        // seconds watched
        long viewsecond = 0;

        // parse the timestamp strings
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date startDate = dateFormat.parse(startTime);
        Date endDate = dateFormat.parse(endtime);
        // convert the start and end Dates to Calendars and extract their hour-of-day values
        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(startDate);
        int startHour = startCalendar.get(Calendar.HOUR_OF_DAY);

        Calendar endCalendar = Calendar.getInstance();
        endCalendar.setTime(endDate);
        int endHour = endCalendar.get(Calendar.HOUR_OF_DAY);

        if (endHour - startHour == 0) {
            // start and end fall within the same hour
            long startSec = startDate.getTime() / 1000;
            long endSec = endDate.getTime() / 1000;
            viewsecond = endSec - startSec;
            list.add(SEtimeToHour.toStringArray(startHour, viewsecond));
            return list;
        } else {
            // start and end fall in different hours (assumed to be on the same day)
            for (int i = startHour, j = 1; i <= endHour; i++, j++) {
                if (j == 1) {
                    // seconds watched in the first hour, up to the next hour boundary
                    int minute = 60 - startCalendar.get(Calendar.MINUTE);
                    int second = startCalendar.get(Calendar.SECOND);
                    viewsecond = minute * 60 - second; // e.g. 20:17:34 -> 43 * 60 - 34 = 2546
                    list.add(SEtimeToHour.toStringArray(i, viewsecond));
                } else if (i != endHour) {
                    // a full hour in between
                    viewsecond = 60 * 60;
                    list.add(SEtimeToHour.toStringArray(i, viewsecond));
                } else {
                    // seconds watched in the last hour
                    int minute = endCalendar.get(Calendar.MINUTE);
                    int second = endCalendar.get(Calendar.SECOND);
                    viewsecond = minute * 60 + second;
                    list.add(SEtimeToHour.toStringArray(i, viewsecond));
                }
            }
            return list;
        }
    }

    public static String[] toStringArray(int hour,long viewsecond){
        String[] str = new String[2];
        str[0] = String.valueOf(hour);
        str[1] = String.valueOf(viewsecond);
        return str;
    }

    /*  public static void main(String[] args) throws ParseException {
        String startTime = "2019-02-01 12:52:27";
        String endTime = "2019-02-01 13:19:31";
        List<String[]> list = SEtimeToHour.setimeToHour(startTime, endTime);
        for (String[] str : list) {
           System.out.println("hour:"+str[0]+"  viewsecond:"+str[1]);
        }
    }*/
}
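Once both classes are packaged into a jar, the UDTF can be registered and called from Hive. A sketch, assuming a jar path, the function name date_to_hour, and a view_log table with start_time/end_time columns (all hypothetical):

add jar /path/to/myhive-udtf.jar;   -- hypothetical path
create temporary function date_to_hour as 'myhive.functions.UDTF.DateToHour';

-- called directly:
select date_to_hour('2019-03-23 20:17:34', '2019-03-23 22:17:34');

-- joined back to the source rows with LATERAL VIEW (view_log is a hypothetical table):
select t.uid, v.hour, v.viewsecond
from view_log t
lateral view date_to_hour(t.start_time, t.end_time) v as hour, viewsecond;

A UDTF cannot be mixed with other expressions in the same SELECT list; LATERAL VIEW is the standard way to combine its output with columns of the source table.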

Official example:

package org.apache.hadoop.hive.contrib.udtf.example;
 
import java.util.ArrayList;
 
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 
 
/**
 * GenericUDTFCount2 outputs the number of rows seen, twice. It's output twice
 * to test outputting of rows on close with lateral view.
 * e.g. for a one-row table: select GenericUDTFCount2() from dual; yields:
 * col1
 * ------
 * 1
 * 1
 */
public class GenericUDTFCount2 extends GenericUDTF {
 
  Integer count = Integer.valueOf(0);
  Object forwardObj[] = new Object[1];
 
  @Override
  public void close() throws HiveException {
    forwardObj[0] = count;
    forward(forwardObj);
    forward(forwardObj); // forward the row object a second time
  }
 
  @Override
  public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
    ArrayList<String> fieldNames = new ArrayList<String>();
    ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
    fieldNames.add("col1"); // name of the single output column
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); // type of the output column
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
        fieldOIs); // return the struct object inspector
  }
 
  @Override
  public void process(Object[] args) throws HiveException {
    count = Integer.valueOf(count.intValue() + 1); // increment the count; note the input arguments themselves are not used
  }
 
}
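Because GenericUDTFCount2 only forwards rows from close(), running it over a table shows the end-of-input behavior. A sketch, assuming the contrib jar is on the classpath and src is an existing table (the function name and table are hypothetical):

create temporary function udtfCount2 as 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount2';

select udtfCount2() from src;

For a src table with N rows, process() runs N times and close() then forwards the accumulated count twice, so the query returns two rows, each containing N.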

The GenericUDTF abstract class

package org.apache.hadoop.hive.ql.udf.generic;
 
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 
/**
 * A Generic User-defined Table Generating Function (UDTF)
 *
 * Generates a variable number of output rows for a single input row. Useful for
 * explode(array)...
 */
 
public abstract class GenericUDTF {
  Collector collector = null;
 
  /**
   * Initialize this GenericUDTF. This will be called only once per instance.
   *
   * @param args
   *          An array of ObjectInspectors for the arguments
   * @return A StructObjectInspector for output. The output struct represents a
   *         row of the table where the fields of the struct are the columns. The
   *         field names are unimportant as they will be overridden by user
   *         supplied column aliases.
   */
  public abstract StructObjectInspector initialize(ObjectInspector[] argOIs)
      throws UDFArgumentException;

  /**
   * Give a set of arguments for the UDTF to process.
   *
   * @param o
   *          object array of arguments
   */
  public abstract void process(Object[] args) throws HiveException;

  /**
   * Called to notify the UDTF that there are no more rows to process.
   * Clean up code or additional forward() calls can be made here.
   */
  public abstract void close() throws HiveException;

  /**
   * Associates a collector with this UDTF. Can't be specified in the
   * constructor as the UDTF may be initialized before the collector has been
   * constructed.
   *
   * @param collector
   */
  public final void setCollector(Collector collector) {
    this.collector = collector;
  }

  /**
   * Passes an output row to the collector.
   *
   * @param o
   * @throws HiveException
   */
  protected final void forward(Object o) throws HiveException {
    collector.collect(o);
  }
 
}
