UDTF: user-defined table-generating function. A table-generating function takes zero or more inputs and produces multiple columns or multiple rows of output.
For example:
select array(1,2,3);
returns a single row holding the whole array:
[1,2,3]
To turn that array into rows, we use a table-generating function, explode():
Syntax | Description |
---|---|
explode(ARRAY array) | Returns zero or more rows, one row for each element of the input array. |
explode(MAP map) | Returns zero or more rows, one row for each key-value pair of the input map, with one column holding the key and another holding the value. |
select explode(array(1,2,3));
The result is:
1
2
3
Our own UDTF has to implement the same kind of behavior: turn zero or more inputs into multiple rows of output.
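In practice a UDTF is usually joined back to the source rows with LATERAL VIEW, since a bare UDTF call cannot be mixed with other expressions in the SELECT list. A minimal sketch, assuming a hypothetical table orders with an array column items:

SELECT order_id, item
FROM orders
LATERAL VIEW explode(items) t AS item;

Each element of items becomes its own output row, still paired with the order_id it came from.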
GenericUDTF Interface
A UDTF must extend the GenericUDTF abstract class and implement its initialize, process, and close methods. Hive calls initialize to determine the types of the incoming arguments and the type of each column of the table the UDTF generates (i.e. the input and output types); initialize must return an object inspector describing the fields of the generated rows. Once initialize() has run, Hive feeds the UDTF's arguments to process(), which produces row objects and forwards them to the downstream operators. Finally, after all rows have been passed through, Hive calls close().
Example:
package myhive.functions.UDTF;
import com.util.SEtimeToHour;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
/**
* @Description: Suppose a user watches from "2019-03-23 20:17:34" to "2019-03-23 22:17:34";
* compute how many seconds were watched within each hour.
* The input interval "2019-03-23 20:17:34" .. "2019-03-23 22:17:34" is split by hour:
* select date_to_hour("2019-03-23 20:17:34","2019-03-23 22:17:34");
* (date_to_hour is a hypothetical alias; the actual name is chosen at CREATE FUNCTION time)
* Output: hour of day, seconds watched
* hour viewsecond
* 20   2546
* 21   3600
* 22   1054
* @Author: choosing
* @Date: 2019/4/20 11:08
*/
public class DateToHour extends GenericUDTF {
@Override
public StructObjectInspector initialize(ObjectInspector[] args)
throws UDFArgumentException {
if (args.length != 2) {// the function must be called with exactly two arguments
throw new UDFArgumentLengthException("DateToHour takes exactly two arguments");
}
if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
|| args[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
// Category distinguishes PRIMITIVE, LIST, MAP, STRUCT and UNION inspectors; both arguments must be primitive (strings)
throw new UDFArgumentException("DateToHour takes strings as parameters");
}
ArrayList<String> fieldNames = new ArrayList<String>();// column names of the generated table
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();// object inspectors, i.e. the type of each column of a generated row
fieldNames.add("hour");// the hour of day
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);// PRIMITIVE (string) type
fieldNames.add("viewsecond");// seconds watched within that hour
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);// PRIMITIVE (string) type
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);// the struct object inspector describing output rows
}
@Override
public void process(Object[] args) throws HiveException {
// args holds the evaluated arguments passed to the function call
List<String[]> result;
try {
result = SEtimeToHour.setimeToHour(args[0].toString(), args[1].toString());// split the interval into per-hour rows
} catch (ParseException e) {
throw new HiveException("DateToHour: cannot parse input timestamps", e);// fail the query rather than continue with no rows
}
for (String[] str : result) {
forward(str);// forward one row object; each loop iteration emits one row
}
}
@Override
public void close() throws HiveException {
}
}
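To try the function, package the class into a jar and register it in Hive. A hedged usage sketch: the jar path, the alias date_to_hour, and the table view_log are assumptions, not part of the original code:

ADD JAR /path/to/myhive-udtf.jar;    -- assumed jar name and path
CREATE TEMPORARY FUNCTION date_to_hour AS 'myhive.functions.UDTF.DateToHour';

-- a bare UDTF call must be the only expression in the SELECT list:
SELECT date_to_hour('2019-03-23 20:17:34', '2019-03-23 22:17:34');

-- to keep other columns alongside the generated rows, use LATERAL VIEW:
SELECT user_id, t.hour, t.viewsecond
FROM view_log
LATERAL VIEW date_to_hour(start_time, end_time) t AS hour, viewsecond;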
package com.util;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
* @Description: Splits a time span such as 2019-03-23 20:17:34 .. 2019-03-23 22:17:34 into its per-hour distribution.
* @Author: choosing
* @Date: 2019/4/20 11:25
*/
public class SEtimeToHour {
public static List<String[]> setimeToHour(String startTime, String endtime) throws ParseException {
// the rows to return
List<String[]> list = new ArrayList<String[]>();
// seconds watched within the current hour
long viewsecond = 0;
// parse the timestamp strings
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date startDate = dateFormat.parse(startTime);
Date endDate = dateFormat.parse(endtime);
// convert both Dates to Calendars and read the hour-of-day of the start and end times
// (note: the code below assumes both timestamps fall on the same calendar day)
Calendar startCalendar = Calendar.getInstance();
startCalendar.setTime(startDate);
int startHour = startCalendar.get(Calendar.HOUR_OF_DAY);
Calendar endCalendar = Calendar.getInstance();
endCalendar.setTime(endDate);
int endHour = endCalendar.get(Calendar.HOUR_OF_DAY);
if (endHour == startHour) {
// start and end fall within the same hour
long startSec = startDate.getTime() / 1000;
long endSec = endDate.getTime() / 1000;
viewsecond = endSec - startSec;
list.add(SEtimeToHour.toStringArray(startHour, viewsecond));
return list;
} else {
// start and end fall in different hours
for (int i = startHour, j = 1; i <= endHour; i++, j++) {
if (j == 1) {
// first hour: seconds remaining from the start time to the end of that hour
int minute = startCalendar.get(Calendar.MINUTE);
int second = startCalendar.get(Calendar.SECOND);
viewsecond = 3600 - minute * 60 - second;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
} else if (i != endHour) {
// a complete hour in the middle of the interval
viewsecond = 60 * 60;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
} else {
// last hour: seconds from the top of that hour to the end time
int minute = endCalendar.get(Calendar.MINUTE);
int second = endCalendar.get(Calendar.SECOND);
viewsecond = minute * 60 + second;
list.add(SEtimeToHour.toStringArray(i, viewsecond));
}
}
return list;
}
}
// packs (hour, seconds) into the two string columns of one output row
public static String[] toStringArray(int hour, long viewsecond) {
String[] str = new String[2];
str[0] = String.valueOf(hour);
str[1] = String.valueOf(viewsecond);
return str;
}
/* public static void main(String[] args) throws ParseException {
String startTime = "2019-02-01 12:52:27";
String endTime = "2019-02-01 13:19:31";
List<String[]> list = SEtimeToHour.setimeToHour(startTime, endTime);
for (String[] str : list) {
System.out.println("hour:"+str[0]+" viewsecond:"+str[1]);
}
}*/
}
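As a sanity check of the hour splitting for the example above: the first hour contributes 3600 - 17*60 - 34 = 2546 seconds, hour 21 contributes a full 3600, and the last hour contributes 17*60 + 34 = 1054; in total 2546 + 3600 + 1054 = 7200 seconds, exactly the two hours between the two timestamps.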
Official example (GenericUDTFCount2 from Hive's contrib module):
package org.apache.hadoop.hive.contrib.udtf.example;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* GenericUDTFCount2 outputs the number of rows seen, twice. It's output twice
* to test outputting of rows on close with lateral view.
* That is, select GenericUDTFCount2() from dual; returns:
* col1
* ------
* 1
* 1
*/
public class GenericUDTFCount2 extends GenericUDTF {
Integer count = Integer.valueOf(0);
Object forwardObj[] = new Object[1];
@Override
public void close() throws HiveException {
forwardObj[0] = count;
forward(forwardObj);
forward(forwardObj);// the same row object is forwarded a second time
}
@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("col1");//确定行对象其中字段的字段名
fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);//确定行对象其中字段的字段类型
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);//返回对象监控器
}
@Override
public void process(Object[] args) throws HiveException {
count = Integer.valueOf(count.intValue() + 1);// increment the count; note that the input arguments are never inspected
}
}
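A hedged registration sketch; the contrib jar path and the alias count2 are assumptions that depend on your installation:

ADD JAR /path/to/hive-contrib.jar;   -- assumed path to Hive's contrib jar
CREATE TEMPORARY FUNCTION count2 AS 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount2';

-- process() only counts the input rows; both output rows are emitted from close():
SELECT count2() FROM src;            -- a table with N rows yields two rows, each holding N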
The GenericUDTF abstract class
package org.apache.hadoop.hive.ql.udf.generic;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
/**
* A Generic User-defined Table Generating Function (UDTF)
*
* Generates a variable number of output rows for a single input row. Useful for
* explode(array)...
*/
public abstract class GenericUDTF {
Collector collector = null;
/**
* Initialize this GenericUDTF. This will be called only once per instance.
*
* @param args
* An array of ObjectInspectors for the arguments
* @return A StructObjectInspector for output. The output struct represents a
* row of the table where the fields of the struct are the columns. The
* field names are unimportant as they will be overridden by user
* supplied column aliases.
*/
public abstract StructObjectInspector initialize(ObjectInspector[] argOIs)
throws UDFArgumentException;
/**
* Give a set of arguments for the UDTF to process.
*
* @param args
*          object array of arguments
*/
public abstract void process(Object[] args) throws HiveException;
/**
* Called to notify the UDTF that there are no more rows to process.
* Clean up code or additional forward() calls can be made here.
*/
public abstract void close() throws HiveException;
/**
* Associates a collector with this UDTF. Can't be specified in the
* constructor as the UDTF may be initialized before the collector has been
* constructed.
*
* @param collector
*/
public final void setCollector(Collector collector) {
this.collector = collector;
}
/**
* Passes an output row to the collector.
*
* @param o
* @throws HiveException
*/
protected final void forward(Object o) throws HiveException {
collector.collect(o);
}
}