UDTF: user-defined table-generating function. A table-generating function takes zero or more inputs and produces multiple columns or multiple rows of output.
For example:
select array(1,2,3);
returns a single row holding the whole array:
[1,2,3]
To turn that array into rows, we use a table-generating function, explode():
Syntax | Description |
---|---|
explode(ARRAY array) | Returns zero or more rows, one row for each element of the input array. |
explode(MAP map) | Returns zero or more rows, one row for each key-value pair of the input map, with one column holding the key and another holding the value. |
select explode(array(1,2,3));
The result is:
1
2
3
Our own UDTF has to implement the same kind of behavior: turn zero or more inputs into multiple rows of output.
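In practice a UDTF is usually joined back to the source rows with LATERAL VIEW, since a bare UDTF call cannot be mixed with other expressions in the SELECT list. A minimal sketch, assuming a hypothetical table orders with an array column items:

SELECT order_id, item
FROM orders
LATERAL VIEW explode(items) t AS item;

Each element of items becomes its own output row, still paired with the order_id it came from.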
GenericUDTF Interface
A UDTF must extend the GenericUDTF abstract class and implement its initialize, process, and close methods. Hive calls initialize to determine the types of the incoming arguments and the type of each column of the table the UDTF generates (i.e. the input and output types); initialize must return an object inspector describing the fields of the generated rows. Once initialize() has run, Hive feeds the UDTF's arguments to process(), which produces row objects and forwards them to the downstream operators. Finally, after all rows have been passed through, Hive calls close().
Example:
package myhive.functions.UDTF;
import com.util.SEtimeToHour;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
/**
* @Description: Suppose a user watches from "2019-03-23 20:17:34" to "2019-03-23 22:17:34";
* compute how many seconds were watched within each hour.
* The input interval "2019-03-23 20:17:34" .. "2019-03-23 22:17:34" is split by hour:
* select date_to_hour("2019-03-23 20:17:34","2019-03-23 22:17:34");
* (date_to_hour is a hypothetical alias; the actual name is chosen at CREATE FUNCTION time)
* Output: hour of day, seconds watched
* hour viewsecond
* 20   2546
* 21   3600
* 22   1054
* @Author: choosing
* @Date: 2019/4/20 11:08
*/
public class DateToHour extends GenericUDTF {
@Override
public StructObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException {
if (args.length != 2) {// the function must be called with exactly two arguments
throw new UDFArgumentLengthException("DateToHour takes exactly two arguments");
}
if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
|| args[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
// Category distinguishes PRIMITIVE, LIST, MAP, STRUCT and UNION inspectors; both arguments must be primitive (strings)
throw new UDFArgumentException("DateToHour takes strings as parameters");
}
ArrayList<String> fieldNames = new ArrayList<String>();// column names of the generated table
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();// object inspectors, i.e. the type of each column of a generated row
fieldNames.add("hour");// the hour of day
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);// PRIMITIVE (string) type
fieldNames.add("viewsecond");// seconds watched within that hour
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);// PRIMITIVE (string) type
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);// the struct object inspector describing output rows
}
@Override
public void process(Object[] args) throws HiveException {
// args holds the evaluated arguments passed to the function call
List<String[]> result;
try {
result = SEtimeToHour.setimeToHour(args[0].toString(), args[1].toString());// split the interval into per-hour rows
} catch (ParseException e) {
throw new HiveException("DateToHour: cannot parse input timestamps", e);// fail the query rather than continue with no rows
}
for (String[] str : result) {
forward(str);// forward one row object; each loop iteration emits one row
}
}
@Override
public void close() throws HiveException {
}
}
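To try the function, package the class into a jar and register it in Hive. A hedged usage sketch: the jar path, the alias date_to_hour, and the table view_log are assumptions, not part of the original code:

ADD JAR /path/to/myhive-udtf.jar;    -- assumed jar name and path
CREATE TEMPORARY FUNCTION date_to_hour AS 'myhive.functions.UDTF.DateToHour';

-- a bare UDTF call must be the only expression in the SELECT list:
SELECT date_to_hour('2019-03-23 20:17:34', '2019-03-23 22:17:34');

-- to keep other columns alongside the generated rows, use LATERAL VIEW:
SELECT user_id, t.hour, t.viewsecond
FROM view_log
LATERAL VIEW date_to_hour(start_time, end_time) t AS hour, viewsecond;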
package com.util;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
* @Description: Splits a time span such as 2019-03-23 20:17:34 .. 2019-03-23 22:17:34 into its per-hour distribution.
* @Author: choosing
* @Date: 2019/4/20 11:25
*/
public class SEtimeToHour {
public static List<String[]> setimeToHour(String startTime, String endtime) throws ParseException {
// the rows to return
List<String[]> list = new ArrayList<String[]>();
// seconds watched within the current hour
long viewsecond = 0;
// parse the timestamp strings
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date startDate = dateFormat.parse(startTime);
Date endDate = dateFormat.parse(endtime);
// convert both Dates to Calendars and read the hour-of-day of the start and end times
// (note: the code below assumes both timestamps fall on the same calendar day)
Calendar startCalendar = Calendar.getInstance();
startCalendar.setTime(startDate);
int startHour = startCalendar.get(Calendar.HOUR_OF_DAY);
Calendar endCalendar = Calendar.getInstance();
endCalendar.setTime(endDate);
int endHour = endCalendar.get(Calendar.HOUR_OF_DAY);
if (endHour == startHour) {
// start and end fall within the same hour
long startSec = startDate.getTime() / 1000;
long endSec = endDate.getTime() / 1000;
viewsecond = endSec - startSec;
list.add(SEtimeToHour.toStringArray(startHour, viewsecond));
return list;
} else {
// start and end fall in different hours
for (int i = startHour, j = 1; i <= endHour; i++, j++) {
if (j == 1) {
// first hour: seconds remaining from the start time to the end of that hour
int minute = startCalendar.get(Calendar.MINUTE);
int second = startCalendar.get(Calendar.SECOND);
viewsecond = 3600 - minute * 60 - second;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
} else if (i != endHour) {
// a complete hour in the middle of the interval
viewsecond = 60 * 60;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
} else {
// last hour: seconds from the top of that hour to the end time
int minute = endCalendar.get(Calendar.MINUTE);
int second = endCalendar.get(Calendar.SECOND);
viewsecond = minute * 60 + second;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
}
}
return list;
}
}
// packs (hour, seconds) into the two string columns of one output row
public static String[] toStringArray(int hour, long viewsecond) {
String[] str = new String[2];
str[0] = String.valueOf(hour);
str[1] = String.valueOf(viewsecond);
return str;
}
/* public static void main(String[] args) throws ParseException {
String startTime = "2019-02-01 12:52:27";
String endTime = "2019-02-01 13:19:31";
List<String[]> list = SEtimeToHour.setimeToHour(startTime, endTime);
for (String[] str : list) {
System.out.println("hour:"+str[0]+" viewsecond:"+str[1]);
}
}*/
}
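As a sanity check of the hour splitting for the example above: the first hour contributes 3600 - 17*60 - 34 = 2546 seconds, hour 21 contributes a full 3600, and the last hour contributes 17*60 + 34 = 1054; in total 2546 + 3600 + 1054 = 7200 seconds, exactly the two hours between the two timestamps.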
Official example (GenericUDTFCount2 from Hive's contrib module):
package org.apache.hadoop.hive.contrib.udtf.example;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* GenericUDTFCount2 outputs the number of rows seen, twice. It's output twice
* to test outputting of rows on close with lateral view.
* That is, select GenericUDTFCount2() from dual; returns:
* col1
* ------
* 1
* 1
*/
public class GenericUDTFCount2 extends GenericUDTF {
Integer count = Integer.valueOf(0);
Object forwardObj[] = new Object[1];
@Override
public void close() throws HiveException {
forwardObj[0] = count;
forward(forwardObj);
forward(forwardObj);// the same row object is forwarded a second time
}
@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("col1");//确定行对象其中字段的字段名
fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);//确定行对象其中字段的字段类型
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);//返回对象监控器
}
@Override
public void process(Object[] args) throws HiveException {
count = Integer.valueOf(count.intValue() + 1);// increment the count; note that the input arguments are never inspected
}
}
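A hedged registration sketch; the contrib jar path and the alias count2 are assumptions that depend on your installation:

ADD JAR /path/to/hive-contrib.jar;   -- assumed path to Hive's contrib jar
CREATE TEMPORARY FUNCTION count2 AS 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount2';

-- process() only counts the input rows; both output rows are emitted from close():
SELECT count2() FROM src;            -- a table with N rows yields two rows, each holding N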
The GenericUDTF abstract class
package org.apache.hadoop.hive.ql.udf.generic;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
/**
* A Generic User-defined Table Generating Function (UDTF)
*
* Generates a variable number of output rows for a single input row. Useful for
* explode(array)...
*/
public abstract class GenericUDTF {
Collector collector = null;
/**
* Initialize this GenericUDTF. This will be called only once per instance.
*
* @param args
* An array of ObjectInspectors for the arguments
* @return A StructObjectInspector for output. The output struct represents a
* row of the table where the fields of the struct are the columns. The
* field names are unimportant as they will be overridden by user
* supplied column aliases.
*/
public abstract StructObjectInspector initialize(ObjectInspector[] argOIs)
throws UDFArgumentException;
/**
* Give a set of arguments for the UDTF to process.
*
* @param args
*          object array of arguments
*/
public abstract void process(Object[] args) throws HiveException;
/**
* Called to notify the UDTF that there are no more rows to process.
* Clean up code or additional forward() calls can be made here.
*/
public abstract void close() throws HiveException;
/**
* Associates a collector with this UDTF. Can't be specified in the
* constructor as the UDTF may be initialized before the collector has been
* constructed.
*
* @param collector
*/
public final void setCollector(Collector collector) {
this.collector = collector;
}
/**
* Passes an output row to the collector.
*
* @param o
* @throws HiveException
*/
protected final void forward(Object o) throws HiveException {
collector.collect(o);
}
}