目录
一、UDF 介绍
UDF(user defined functions)是用户定义函数,UDF操作作用于单个数据行,并且产生单个数据行作为输出。大多数函数都属于这一类(比如数学函数和字符串函数)。
实现UDF需要继承特定类UDF或GenericUDF二选一。
- org.apache.hadoop.hive.ql.exec.UDF,处理并返回基本数据类型,int、string、boolean、double等;(也可以返回复杂数据类型)
- org.apache.hadoop.hive.ql.udf.generic.GenericUDF,可处理并返回复杂数据类型,如Map、List、Array等,同时支持嵌套;
二、UDF和GenericUDF 对应函数源码对比
1、substr函数介绍
string substr(string|binary A, int start).
Returns the substring or slice of the byte array of A starting from start position till the end of string A.
For example, substr('foobar', 4) results in 'bar'.
substr函数对应源码类UDFSubstr ,UDFSubstr 源码如下:
/**
 * UDFSubstr.
 *
 * Hive's substr/substring UDF, supporting both Text (string) and
 * BytesWritable (binary) inputs. Positions are 1-based; a negative position
 * counts backwards from the end of the input.
 */
@Description(name = "substr,substring",
    value = "_FUNC_(str, pos[, len]) - returns the substring of str that"
    + " starts at pos and is of length len or" +
    "_FUNC_(bin, pos[, len]) - returns the slice of byte array that"
    + " starts at pos and is of length len",
    extended = "pos is a 1-based index. If pos<0 the starting position is"
    + " determined by counting backwards from the end of str.\n"
    + "Example:\n "
    + " > SELECT _FUNC_('Facebook', 5) FROM src LIMIT 1;\n"
    + " 'book'\n"
    + " > SELECT _FUNC_('Facebook', -5) FROM src LIMIT 1;\n"
    + " 'ebook'\n"
    + " > SELECT _FUNC_('Facebook', 5, 1) FROM src LIMIT 1;\n"
    + " 'b'")
@VectorizedExpressions({StringSubstrColStart.class, StringSubstrColStartLen.class})
public class UDFSubstr extends UDF {

  // Scratch buffers reused across rows to avoid a per-row allocation.
  private final int[] index;
  private final Text r;

  public UDFSubstr() {
    index = new int[2];
    r = new Text();
  }

  /**
   * substr for string input.
   *
   * @param t   input text
   * @param pos 1-based start position; negative counts from the end
   * @param len maximum length of the substring
   * @return the substring, an empty Text when the requested range is empty
   *         or out of bounds, or null when any argument is null
   */
  public Text evaluate(Text t, IntWritable pos, IntWritable len) {
    if ((t == null) || (pos == null) || (len == null)) {
      return null;
    }
    r.clear();
    if (len.get() <= 0) {
      return r;
    }
    String s = t.toString();
    // Fix: the original declared a local "int[] index" here, shadowing the
    // shared field that makeIndex() writes into; use a distinct name.
    int[] bounds = makeIndex(pos.get(), len.get(), s.length());
    if (bounds == null) {
      return r;
    }
    r.set(s.substring(bounds[0], bounds[1]));
    return r;
  }

  /**
   * Translates (pos, len) into [start, end) offsets for an input of
   * inputLen units, writing into (and returning) the shared {@link #index}
   * buffer. Returns null when pos lies outside the input.
   */
  private int[] makeIndex(int pos, int len, int inputLen) {
    if (Math.abs(pos) > inputLen) {
      return null;
    }
    int start, end;
    if (pos > 0) {
      start = pos - 1;          // 1-based -> 0-based
    } else if (pos < 0) {
      start = inputLen + pos;   // count backwards from the end
    } else {
      start = 0;                // pos == 0 behaves like pos == 1
    }
    if ((inputLen - start) < len) {
      end = inputLen;           // clamp to the end of the input
    } else {
      end = start + len;
    }
    index[0] = start;
    index[1] = end;
    return index;
  }

  // Sentinel "len" for the two-argument overloads: take everything to the end.
  private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);

  public Text evaluate(Text s, IntWritable pos) {
    return evaluate(s, pos, maxValue);
  }

  /**
   * substr for binary input; same position semantics as the Text overload.
   * Returns an empty BytesWritable for empty/out-of-range requests, or null
   * when any argument is null.
   */
  public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len) {
    if ((bw == null) || (pos == null) || (len == null)) {
      return null;
    }
    if (len.get() <= 0) {
      return new BytesWritable();
    }
    int[] bounds = makeIndex(pos.get(), len.get(), bw.getLength());
    if (bounds == null) {
      return new BytesWritable();
    }
    return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), bounds[0], bounds[1]));
  }

  public BytesWritable evaluate(BytesWritable bw, IntWritable pos) {
    return evaluate(bw, pos, maxValue);
  }
}
2、size函数介绍
int size(Map&lt;K,V&gt;)/size(Array&lt;T&gt;).
Returns the number of elements in the map type./Returns the number of elements in the array type.
For example, size(array(1,2,3,4)) results in 4.
size函数对应源码类GenericUDFSize ,GenericUDFSize 源码如下:
/**
 * GenericUDFSize.
 *
 * Implements size(map) / size(array): returns the element count of a MAP or
 * LIST argument, and -1 for a NULL (void-typed) argument.
 */
@Description(name = "size", value = "_FUNC_(a) - Returns the size of a")
public class GenericUDFSize extends GenericUDF {

  // Inspector for the single argument, captured during initialize().
  private transient ObjectInspector argOI;
  // Reused result holder; -1 is the size of a NULL map/list.
  private final transient IntWritable sizeResult = new IntWritable(-1);

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments)
      throws UDFArgumentException {
    if (arguments.length != 1) {
      throw new UDFArgumentLengthException(
          "The function SIZE only accepts 1 argument.");
    }
    final Category category = arguments[0].getCategory();
    final boolean isVoid =
        arguments[0].getTypeName().equals(serdeConstants.VOID_TYPE_NAME);
    // Accept MAP, LIST, or the void type (a NULL literal argument).
    if (!(category == Category.MAP || category == Category.LIST || isVoid)) {
      throw new UDFArgumentTypeException(0, "\""
          + Category.MAP.toString().toLowerCase() + "\" or \""
          + Category.LIST.toString().toLowerCase()
          + "\" is expected at function SIZE, " + "but \""
          + arguments[0].getTypeName() + "\" is found");
    }
    argOI = arguments[0];
    // The result is always a writable int.
    return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    final Object data = arguments[0].get();
    switch (argOI.getCategory()) {
      case MAP:
        sizeResult.set(((MapObjectInspector) argOI).getMapSize(data));
        break;
      case LIST:
        sizeResult.set(((ListObjectInspector) argOI).getListLength(data));
        break;
      default:
        // void type: argument is the NULL literal, whose size is -1
        if (argOI.getTypeName().equals(serdeConstants.VOID_TYPE_NAME)) {
          sizeResult.set(-1);
        }
        break;
    }
    return sizeResult;
  }

  @Override
  public String getDisplayString(String[] children) {
    assert (children.length == 1);
    return getStandardDisplayString("size", children);
  }
}
三、ObjectInspector 接口
我们都知道hql最后会转为MapReduce作业来执行。而我们之前单独写MR的时候,需要写一个Map类和Reduce类,在写这些类的时候我们需要指定输入和输出参数的数据类型(不是Java的基本数据类型,是经过Hadoop封装的XxxWritable类型,比如int类型,要写成IntWritable,String类型要写成Text)。因此,ObjectInspector 的作用就是使hive不拘泥于一种特定数据格式,使得数据流 1)在输入端和输出端切换不同的输入/输出格式 2)在不同的Operator上使用不同的数据格式。(在自定义函数中是在初始化方法中配置的),以便hive将hql转为MR程序。
1、ObjectInspector 接口
/**
 * ObjectInspector helps us to look into the internal structure of a complex
 * object.
 *
 * A (probably configured) ObjectInspector instance stands for a specific type
 * and a specific way to store the data of that type in the memory.
 *
 * For native java Object, we can directly access the internal structure through
 * member fields and methods. ObjectInspector is a way to delegate that
 * functionality away from the Object, so that we have more control on the
 * behavior of those actions.
 *
 * An efficient implementation of ObjectInspector should rely on factory, so
 * that we can make sure the same ObjectInspector only has one instance. That
 * also makes sure hashCode() and equals() methods of java.lang.Object directly
 * works for ObjectInspector as well.
 */
public interface ObjectInspector extends Cloneable {
/**
 * Category: the top-level kind of type described by an inspector.
 */
public static enum Category {
PRIMITIVE, LIST, MAP, STRUCT, UNION
};
/**
 * Returns the name of the data type that is inspected by this
 * ObjectInspector. This is used to display the type information to the user.
 *
 * For primitive types, the type name is standardized. For other types, the
 * type name can be something like "list&lt;int&gt;", "map&lt;int,string&gt;", java class
 * names, or user-defined type names similar to typedef.
 */
String getTypeName();
/**
 * An ObjectInspector must inherit from one of the following interfaces if
 * getCategory() returns: PRIMITIVE: PrimitiveObjectInspector LIST:
 * ListObjectInspector MAP: MapObjectInspector STRUCT: StructObjectInspector.
 */
Category getCategory();
}
2、接口:PrimitiveObjectInspector ; 实现类:WritableIntObjectInspector、WritableStringObjectInspector 等
package org.apache.hadoop.hive.serde2.objectinspector;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
/**
 * PrimitiveObjectInspector.
 *
 * Inspector for PRIMITIVE-category data: maps between Hive's primitive
 * categories, their Hadoop Writable classes, and plain Java classes.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface PrimitiveObjectInspector extends ObjectInspector {
/**
 * The primitive data types supported by Hive (this enum is important!).
 */
enum PrimitiveCategory {
VOID, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING,
DATE, TIMESTAMP, TIMESTAMPLOCALTZ, BINARY, DECIMAL, VARCHAR, CHAR,
INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, UNKNOWN
}
PrimitiveTypeInfo getTypeInfo();
/**
 * Get the primitive category of the PrimitiveObjectInspector.
 */
PrimitiveCategory getPrimitiveCategory();
/**
 * Get the primitive Writable class (MapReduce input/output types are the
 * XxxWritable wrappers, e.g. IntWritable, Text).
 */
Class&lt;?&gt; getPrimitiveWritableClass();
/**
 * Return o as a primitive writable Object: if o is already writable it is
 * returned directly, otherwise it is converted to a writable first.
 */
Object getPrimitiveWritableObject(Object o);
/**
 * Get the plain Java class for this primitive type.
 */
Class&lt;?&gt; getJavaPrimitiveClass();
/**
 * Get the value as a plain Java object instance.
 */
Object getPrimitiveJavaObject(Object o);
/**
 * Get a copy of the Object in the same class, so the return value can be
 * stored independently of the parameter.
 *
 * If the Object is a Primitive Java Object, we just return the parameter
 * since Primitive Java Object is immutable.
 */
Object copyObject(Object o);
/**
 * Whether the ObjectInspector prefers to return a Primitive Writable Object
 * instead of a Primitive Java Object. This can be useful for determining the
 * most efficient way to getting data out of the Object.
 */
boolean preferWritable();
/**
 * The precision of the underlying data.
 */
int precision();
/**
 * The scale of the underlying data.
 */
int scale();
}
3、接口:ListObjectInspector; 实现类:StandardListObjectInspector
package org.apache.hadoop.hive.serde2.objectinspector;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import java.util.List;
/**
 * ListObjectInspector.
 *
 * Inspector for LIST-category data: exposes the element inspector and
 * element/length access for a concrete list object.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface ListObjectInspector extends ObjectInspector {
// ** Methods that do not need a data object **
ObjectInspector getListElementObjectInspector();
// ** Methods that need a data object **
/**
 * returns null for null list, out-of-the-range index.
 */
Object getListElement(Object data, int index);
/**
 * returns -1 for data = null.
 */
int getListLength(Object data);
/**
 * returns null for data = null.
 *
 * Note: This method should not return a List object that is reused by the
 * same ListObjectInspector, because it's possible that the same
 * ListObjectInspector will be used in multiple places in the code.
 *
 * However it's OK if the List object is part of the Object data.
 */
List&lt;?&gt; getList(Object data);
}
4、接口:MapObjectInspector ;实现类:StandardMapObjectInspector
package org.apache.hadoop.hive.serde2.objectinspector;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import java.util.Map;
/**
 * MapObjectInspector.
 *
 * Inspector for MAP-category data: exposes the key/value inspectors and
 * lookup/size access for a concrete map object.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface MapObjectInspector extends ObjectInspector {
// ** Methods that do not need a data object **
// Map Type
ObjectInspector getMapKeyObjectInspector();
ObjectInspector getMapValueObjectInspector();
// ** Methods that need a data object **
// In this function, key has to be of the same structure as the Map expects.
// Most cases key will be primitive type, so it's OK.
// In rare cases that key is not primitive, the user is responsible for
// defining
// the hashCode() and equals() methods of the key class.
Object getMapValueElement(Object data, Object key);
/**
 * returns null for data = null.
 *
 * Note: This method should not return a Map object that is reused by the same
 * MapObjectInspector, because it's possible that the same MapObjectInspector
 * will be used in multiple places in the code.
 *
 * However it's OK if the Map object is part of the Object data.
 */
Map&lt;?, ?&gt; getMap(Object data);
/**
 * returns -1 for NULL map.
 */
int getMapSize(Object data);
}
5、常用Factory 和 Utils
5.1 PrimitiveObjectInspectorFactory 是创建新的PrimitiveObjectInspector实例的主要方法:一般用于创建原始数据类型。
getPrimitiveJavaObjectInspector
/**
* Returns the PrimitiveJavaObjectInspector for the PrimitiveCategory.
* 返回PrimitiveCategory的PrimitiveJavaObjectInspector。
*
* @param primitiveCategory input to be looked up.
*/
public static AbstractPrimitiveJavaObjectInspector getPrimitiveJavaObjectInspector(
PrimitiveCategory primitiveCategory);
与之对应的Writable类型:
getPrimitiveWritableObjectInspector
/**
* Returns the PrimitiveWritableObjectInspector for the PrimitiveCategory.
*
* @param primitiveCategory primitive category input to be looked up.
*/
public static AbstractPrimitiveWritableObjectInspector getPrimitiveWritableObjectInspector(
PrimitiveCategory primitiveCategory);
ObjectInspectorFactory 是创建新的ObjectInspector实例的主要方法:一般用于创建集合数据类型。
List(参数是内部元素的对象检查器)
public static StandardListObjectInspector getStandardListObjectInspector(
ObjectInspector listElementObjectInspector)
Map(参数是内部元素的,键/值对象检查器)
public static StandardMapObjectInspector getStandardMapObjectInspector(
ObjectInspector mapKeyObjectInspector,
ObjectInspector mapValueObjectInspector)
Struct(参数是内部字段的字段名字符串列表,以及对应的对象检查器列表。该方法还有一个重载方法,带有可选的第三个参数 List&lt;?&gt; value,表示对应字段的值)
public static StandardStructObjectInspector getStandardStructObjectInspector(
List<String> structFieldNames,
List<ObjectInspector> structFieldObjectInspectors)
5.2 ObjectInspectorUtils 工具类:一般用于将已有的数据类型转换为标准数据类型。
getStandardObjectInspector 根据传入的对象类型,获取标准对象类型
/**
 * Get the corresponding standard ObjectInspector for an ObjectInspector.
 *
 * The returned ObjectInspector can be used to inspect the standard object.
 */
public static ObjectInspector getStandardObjectInspector(ObjectInspector oi) {
return getStandardObjectInspector(oi, ObjectInspectorCopyOption.DEFAULT);
}
四、实现自定义 UDF 和 GenericUDF
1、UDF
基础UDF的函数读取和返回基本类型,即Hadoop和Hive的基本类型。如,Text、IntWritable、LongWritable、DoubleWritable等。
继承UDF类必须实现evaluate方法,支持定义多个evaluate方法不同参数列表用于处理不同类型数据。
继承UDF类实现字符串拼接样例:
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
import org.apache.hadoop.io.IntWritable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Example basic UDF: concatenates two strings.
 *
 * Returns null when either argument is null, matching SQL NULL semantics.
 */
@Description(
    name="UDFExample",
    value="UDFExample(String str1,String str2) - string Concatenation",
    extended = "Example :\n >select UDFExample('Hello', ' World');\n >Hello World\n"
)
public class UDFExample extends UDF {
  /**
   * Concatenates str1 and str2.
   *
   * @param str1 left operand; may be null
   * @param str2 right operand; may be null
   * @return str1 + str2, or null if either argument is null
   */
  public String evaluate(String str1, String str2) {
    // Guard clause: SQL NULL in -> NULL out.
    if (str1 == null || str2 == null) {
      return null;
    }
    return str1 + str2;
  }
}
2、GenericUDF
GenericUDF相比与UDF功能更丰富,支持所有参数类型,参数类型由ObjectInspector封装;
参数Writable类由DeferredObject封装,使用时简单类型可直接从Writable获取,复杂类型可由ObjectInspector解析。
继承GenericUDF必须实现如下3个接口:
1)初始化,ObjectInspector为数据类型封装类,无实际参数值,返回结果类型;校验输入参数类型, 指定输出结果类型 单节点运行周期内,最开始执行一次
// Called once per task before any rows are processed: validate the argument
// types (ObjectInspectors carry types only, no values) and return the
// inspector describing the result type.
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
return null;
}
2)DeferredObject封装实际参数的对应Writable类;处理输入内容生成输出结果 单节点运行周期中内执行多次, 执行次数与数据行数相等
// Called once per input row: the DeferredObjects wrap the actual argument
// (Writable) values; compute and return the result object.
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
return null;
}
3)函数信息;异常退出时输出标识内容 单节点运行周期内,仅在Hive UDF发生异常时执行一次
// Returns the identifying string Hive shows for this UDF (e.g. in explain
// plans and error reporting).
public String getDisplayString(String[] strings) {
return null;
}
继承GenericUDF类实现百分比计算样例:
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;
/**
 * Example GenericUDF: takes two int arguments and returns a
 * map&lt;string,string&gt; of {numerator, denominator, percentage}.
 */
@Description(
    name="GenericUDFExample",
    value="GenericUDFExample(...) - count int or long type numbers",
    extended = "Example :\n >select GenericUDFExample(3, 5);\n >{numerator=3,denominator=5,percentage=60%}\n"
)
public class GenericUDFExample extends GenericUDF {

  /**
   * Validates that exactly two Hive INT arguments are supplied and declares
   * the return type as map&lt;string,string&gt; (writable Text inspectors).
   */
  @Override
  public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
    // Exactly two arguments are required.
    if (objectInspectors.length != 2) {
      throw new UDFArgumentLengthException("Input Args Length Error !!!");
    }
    // Both arguments must be the Hive primitive INT type.
    checkIntArgument(objectInspectors, 0, "函数第一个参数为int类型");
    checkIntArgument(objectInspectors, 1, "函数第二个参数为int类型");
    // Result type: map<string,string> with writable (Text) keys and values.
    return ObjectInspectorFactory.getStandardMapObjectInspector(
        PrimitiveObjectInspectorFactory.writableStringObjectInspector,
        PrimitiveObjectInspectorFactory.writableStringObjectInspector);
  }

  // Throws UDFArgumentException with errorMsg unless argument i is a
  // primitive-category INT.
  private static void checkIntArgument(ObjectInspector[] ois, int i, String errorMsg)
      throws UDFArgumentException {
    if (!ois[i].getCategory().equals(ObjectInspector.Category.PRIMITIVE)
        || !PrimitiveObjectInspector.PrimitiveCategory.INT.equals(
            ((PrimitiveObjectInspector) ois[i]).getPrimitiveCategory())) {
      throw new UDFArgumentException(errorMsg);
    }
  }

  /**
   * Called once per row. Returns null when either argument is SQL NULL
   * (the original called get().toString() unconditionally and would throw
   * NullPointerException on NULL input).
   */
  @Override
  public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
    Object arg0 = deferredObjects[0].get();
    Object arg1 = deferredObjects[1].get();
    if (arg0 == null || arg1 == null) {
      return null; // propagate SQL NULL instead of crashing
    }
    return intToPrecent(arg0.toString(), arg1.toString());
  }

  // NOTE: method name kept as "intToPrecent" (sic, "percent" intended) so
  // existing callers keep working.
  public Map<Text, Text> intToPrecent(String i1, String i2) {
    int numerator = Integer.parseInt(i1);
    int denominator = Integer.parseInt(i2);
    // Cast before dividing; integer division would truncate to 0 or 1.
    double ratio = (double) numerator / denominator;
    DecimalFormat df = new DecimalFormat("0%");
    // Keys/values must be Text (Hadoop's writable string) to match the
    // writableStringObjectInspector declared in initialize(); plain String
    // values fail at runtime with a "String cannot be cast to Text" error.
    Map<Text, Text> ret = new HashMap<>();
    ret.put(new Text("numerator"), new Text(i1));
    ret.put(new Text("denominator"), new Text(i2));
    ret.put(new Text("percentage"), new Text(df.format(ratio)));
    return ret;
  }

  /** Identifying string shown in explain plans / error reporting. */
  @Override
  public String getDisplayString(String[] strings) {
    return "GOOD";
  }
}
五、打包上传使用
1、打包
maven Lifecycle介绍:https://www.cnblogs.com/lzbbbb/p/16519116.html
clean | 删除当前项目的target目录(可自行尝试) |
compile | 编译项目的源代码(将src/main中的java代码编译成class文件,输出到target目录下) |
package | 把class文件,resources文件打包成jar包(也可以是war包),生成的jar包位于target目录下 |
2、上传
hdfs dfs -put /user/GenericUDFExampleTest.jar hdfs://user/udf
3、使用
--添加jar包名
add jar hdfs://user/udf/UDFTest-1.0-SNAPSHOT.jar;
--创建临时函数
create temporary function udftest as 'udf.UDFExample'; --''为类路径,根据实际情况填写
create temporary function gudftest as 'udf.GenericUDFExample';
-- 调用
SELECT udftest("3","5")
,gudftest(3,5);
hive官方wiki:LanguageManual UDF - Apache Hive - Apache Software Foundation