需求
需求描述
注意: 对于同一id在同一时间点只有一次修改
需求分析
- 需要分组排序
- 需要分组之内列转行集合
- 根据新批次号、分组排序结果、原批次号和新批次号进行计算(自定义GenericUDF函数)
demo 样例
生成数据
DDL:
create table t0411(
id bigint,
mod_date string,
src string,
cur string
)
插入数据:
insert into t0411
values
(1,'2022-01-08','b1','c1'),
(1,'2022-01-07','a1','b1'),
(1,'2022-01-05','b','c'),
(1,'2022-01-04','a','b'),
(1,'2022-01-06','c','d'),
(1,'2022-01-09','a2','b2'),
(2,'2022-01-08','b1','c1'),
(2,'2022-01-07','a1','b1'),
(2,'2022-01-05','b','c'),
(2,'2022-01-04','a','b'),
(2,'2022-01-06','c','d'),
(2,'2022-01-09','a2','b2');
GenericUDF函数
package org.pony.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.List;
/**
* @author ljj
* @date 2022/04/11
* hdfs dfs -put udf.jar /user/hive/udf/
* create function get_latest_ver as 'org.pony.hive.udf.GenericUDFGetLatestVer' using jar 'hdfs:/user/hive/udf/udf.jar';
* select udf.get_latest_ver('b',1,Array('a','b'),Array('b','b1'))
*/
@Description(name = "udf.get_latest_ver",
value = "_FUNC_(cur,num,srcs,curs) - returns the latest version.",
extended = "return the string data type.")
public class GenericUDFGetLatestVer extends GenericUDF {
private transient ListObjectInspector listOI1;
private transient ListObjectInspector listOI2;
private transient StringObjectInspector elementOI1;
private transient IntObjectInspector elementOI2;
private transient ObjectInspector[] argumentOIs;
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
if (arguments.length != 4) {
throw new UDFArgumentLengthException("get_latest_ver takes 4 arguments: string,int array<string>,array<string>");
}
this.argumentOIs = arguments;
if (isVoidType(arguments)) {
return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
}
ObjectInspector a = arguments[0];
ObjectInspector b = arguments[1];
ObjectInspector c = arguments[2];
ObjectInspector d = arguments[3];
this.listOI1 = (ListObjectInspector) c;
this.listOI2 = (ListObjectInspector) d;
this.elementOI1 = (StringObjectInspector) a;
this.elementOI2 = (IntObjectInspector) b;
// 1. 检查是否接收到正确的参数类型
if (!(a instanceof StringObjectInspector) || !(b instanceof IntObjectInspector) || !(c instanceof ListObjectInspector) || !(d instanceof ListObjectInspector)) {
throw new UDFArgumentException("first argument must be a string,second argument must be a int, third and fourth argument must be a list / array");
}
// 2. 检查listOI1是否包含的元素都是string, listOI2是否包含的元素都是int
if (!(listOI1.getListElementObjectInspector() instanceof WritableStringObjectInspector) || !(listOI2.getListElementObjectInspector() instanceof WritableStringObjectInspector)) {
throw new UDFArgumentException("third and fourth argument must be a list/array of strings");
}
// 返回类型是boolean,所以我们提供了正确的object inspector
return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
}
@Override
public String getDisplayString(String[] children) {
return "返回最新的版本:get_latest_ver(string,int,array<string>,array<string>)";
}
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
if (isVoidType(argumentOIs)) {
return null;
}
String cur = this.elementOI1.getPrimitiveJavaObject(arguments[0].get());
Object intObjectInspector = arguments[1].get();
List<Text> srcs = (List<Text>) this.listOI1.getList(arguments[2].get());
List<Text> curs = (List<Text>) this.listOI2.getList(arguments[3].get());
// 检查空值
if (intObjectInspector == null || cur == null || srcs == null || curs == null) {
return null;
}
int num = this.elementOI2.get(intObjectInspector);
while (num < srcs.size()) {
cur = srcs.get(num).toString().equals(cur) ? curs.get(num).toString() : cur;
num++;
}
return cur;
}
protected boolean isVoidType(ObjectInspector[] arguments) {
for (ObjectInspector oi : arguments) {
if (oi.getCategory() == ObjectInspector.Category.PRIMITIVE) {
if (((PrimitiveObjectInspector) oi).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.VOID) {
return true;
}
}
}
return false;
}
public static void main(String[] args) throws Exception {
// 建立需要的模型
GenericUDFGetLatestVer example = new GenericUDFGetLatestVer();
ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
ObjectInspector stringOI1 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
ObjectInspector intOI1 = PrimitiveObjectInspectorFactory.javaIntObjectInspector;
ObjectInspector listOI1 = ObjectInspectorFactory.getStandardListObjectInspector(stringOI1);
StringObjectInspector resultInspector = (StringObjectInspector) example.initialize(new ObjectInspector[]{stringOI, intOI1, listOI1, listOI1});
// create the actual UDF arguments
List<Text> list1 = new ArrayList<Text>() {{
add(new Text("a"));
add(new Text("b"));
add(new Text("c"));
add(new Text("a1"));
add(new Text("b1"));
add(new Text("a2"));
}};
List<Text> list2 = new ArrayList<Text>() {{
add(new Text("b"));
add(new Text("c"));
add(new Text("d"));
add(new Text("b1"));
add(new Text("c1"));
add(new Text("b2"));
}};
// 测试结果
// 存在的值
Object result = example.evaluate(new DeferredObject[]{new DeferredJavaObject("b2"), new DeferredJavaObject(6),
new DeferredJavaObject(list1), new DeferredJavaObject(list2)});
System.out.println(result);
System.out.println(resultInspector.getPrimitiveJavaObject(result));
}
}
创建hive函数
上传hdfs:
hdfs dfs -put -f udf.jar /user/hive/udf
创建永久函数:
--创建函数
create database udf;
use udf;
create function udf.get_latest_ver as 'org.pony.hive.udf.GenericUDFGetLatestVer' using jar 'hdfs:/user/hive/udf/udf.jar';
-- 查看函数
show functions like udf.get_latest_ver;
-- 删除函数
DROP FUNCTION IF EXISTS udf.get_latest_ver;
-- 查看函数拓展信息
desc function extended udf.get_latest_ver;
使用udf函数
- 案例1:
select udf.get_latest_ver('b',1,Array('a','b'),Array('b','b1'))
结果:
- 案例2:
with tmp as (
select
id,
mod_date,
src,
cur,
row_number() over(partition by id order by mod_date asc) as num,
collect_set(src) over(partition by id ) as srcs,
collect_set(cur) over(partition by id ) as curs
from db01.t0411
)
select
*,
udf.get_latest_ver(cur,num,srcs,curs) as latest_ver
from tmp;
结果: