hive窗口之分组范围内计算

需求

需求描述

在这里插入图片描述注意: 对于同一id在同一时间点只有一次修改

需求分析

  • 需要分组排序
  • 需要分组之内列转行集合
  • 根据新批次号、分组排序结果、原批次号和新批次号进行计算(自定义GenericUDF函数)

demo 样例

生成数据

DDL:

create table t0411(
id bigint,
mod_date string,
src string,
cur string
)

插入数据:

 insert into t0411
    values
    (1,'2022-01-08','b1','c1'),
    (1,'2022-01-07','a1','b1'),
    (1,'2022-01-05','b','c'),
    (1,'2022-01-04','a','b'),
    (1,'2022-01-06','c','d'),
    (1,'2022-01-09','a2','b2'),
    (2,'2022-01-08','b1','c1'),
    (2,'2022-01-07','a1','b1'),
    (2,'2022-01-05','b','c'),
    (2,'2022-01-04','a','b'),
    (2,'2022-01-06','c','d'),
    (2,'2022-01-09','a2','b2');

GenericUDF函数

package org.pony.hive.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.List;

/**
 * @author ljj
 * @date 2022/04/11
 * hdfs dfs -put udf.jar /user/hive/udf/
 * create function get_latest_ver as 'org.pony.hive.udf.GenericUDFGetLatestVer' using jar 'hdfs:/user/hive/udf/udf.jar';
 * select udf.get_latest_ver('b',1,Array('a','b'),Array('b','b1'))
 */
@Description(name = "udf.get_latest_ver",
        value = "_FUNC_(cur,num,srcs,curs) - returns the latest version.",
        extended = "return the string data type.")
public class GenericUDFGetLatestVer extends GenericUDF {
    private transient ListObjectInspector listOI1;
    private transient ListObjectInspector listOI2;
    private transient StringObjectInspector elementOI1;
    private transient IntObjectInspector elementOI2;
    private transient ObjectInspector[] argumentOIs;

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 4) {
            throw new UDFArgumentLengthException("get_latest_ver takes 4 arguments: string,int array<string>,array<string>");
        }
        this.argumentOIs = arguments;

        if (isVoidType(arguments)) {
            return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        }
        ObjectInspector a = arguments[0];
        ObjectInspector b = arguments[1];
        ObjectInspector c = arguments[2];
        ObjectInspector d = arguments[3];
        this.listOI1 = (ListObjectInspector) c;
        this.listOI2 = (ListObjectInspector) d;
        this.elementOI1 = (StringObjectInspector) a;
        this.elementOI2 = (IntObjectInspector) b;

        // 1. 检查是否接收到正确的参数类型
        if (!(a instanceof StringObjectInspector) || !(b instanceof IntObjectInspector) || !(c instanceof ListObjectInspector) || !(d instanceof ListObjectInspector)) {
            throw new UDFArgumentException("first  argument must be a string,second  argument must be a int, third and fourth argument must be a list / array");
        }

        // 2. 检查listOI1是否包含的元素都是string, listOI2是否包含的元素都是int
        if (!(listOI1.getListElementObjectInspector() instanceof WritableStringObjectInspector) || !(listOI2.getListElementObjectInspector() instanceof WritableStringObjectInspector)) {
            throw new UDFArgumentException("third and fourth argument must be a list/array of strings");
        }
        // 返回类型是boolean,所以我们提供了正确的object inspector
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    @Override
    public String getDisplayString(String[] children) {
        return "返回最新的版本:get_latest_ver(string,int,array<string>,array<string>)";
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        if (isVoidType(argumentOIs)) {
            return null;
        }
        String cur = this.elementOI1.getPrimitiveJavaObject(arguments[0].get());
        Object intObjectInspector = arguments[1].get();

        List<Text> srcs = (List<Text>) this.listOI1.getList(arguments[2].get());
        List<Text> curs = (List<Text>) this.listOI2.getList(arguments[3].get());

        // 检查空值
        if (intObjectInspector == null || cur == null || srcs == null || curs == null) {
            return null;
        }
        int num = this.elementOI2.get(intObjectInspector);

        while (num < srcs.size()) {
            cur = srcs.get(num).toString().equals(cur) ? curs.get(num).toString() : cur;
            num++;
        }
        return cur;
    }

    protected boolean isVoidType(ObjectInspector[] arguments) {
        for (ObjectInspector oi : arguments) {
            if (oi.getCategory() == ObjectInspector.Category.PRIMITIVE) {
                if (((PrimitiveObjectInspector) oi).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.VOID) {
                    return true;
                }
            }
        }
        return false;
    }

    public static void main(String[] args) throws Exception {
        // 建立需要的模型
        GenericUDFGetLatestVer example = new GenericUDFGetLatestVer();
        ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        ObjectInspector stringOI1 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
        ObjectInspector intOI1 = PrimitiveObjectInspectorFactory.javaIntObjectInspector;
        ObjectInspector listOI1 = ObjectInspectorFactory.getStandardListObjectInspector(stringOI1);
        StringObjectInspector resultInspector = (StringObjectInspector) example.initialize(new ObjectInspector[]{stringOI, intOI1, listOI1, listOI1});

        // create the actual UDF arguments
        List<Text> list1 = new ArrayList<Text>() {{
            add(new Text("a"));
            add(new Text("b"));
            add(new Text("c"));
            add(new Text("a1"));
            add(new Text("b1"));
            add(new Text("a2"));
        }};
        List<Text> list2 = new ArrayList<Text>() {{
            add(new Text("b"));
            add(new Text("c"));
            add(new Text("d"));
            add(new Text("b1"));
            add(new Text("c1"));
            add(new Text("b2"));
        }};

        // 测试结果
        // 存在的值
        Object result = example.evaluate(new DeferredObject[]{new DeferredJavaObject("b2"), new DeferredJavaObject(6),
                new DeferredJavaObject(list1), new DeferredJavaObject(list2)});
        System.out.println(result);
        System.out.println(resultInspector.getPrimitiveJavaObject(result));

    }
}

创建hive函数

上传hdfs:

hdfs dfs -put -f udf.jar /user/hive/udf

创建永久函数:


--创建函数
create database udf;
use udf;
create function udf.get_latest_ver as 'org.pony.hive.udf.GenericUDFGetLatestVer' using jar 'hdfs:/user/hive/udf/udf.jar';
-- 查看函数
show functions like udf.get_latest_ver;
-- 删除函数
DROP FUNCTION IF EXISTS udf.get_latest_ver;
-- 查看函数拓展信息
desc function extended udf.get_latest_ver;

使用udf函数

  • 案例1:
select udf.get_latest_ver('b',1,Array('a','b'),Array('b','b1'))

结果:
在这里插入图片描述

  • 案例2:
with tmp as (
    select 
        id, 
        mod_date,
        src,
        cur,
        row_number() over(partition by id order by mod_date asc) as num,
        collect_set(src) over(partition by id ) as srcs,
        collect_set(cur) over(partition by id ) as curs
    from db01.t0411
)
select 
   *,
   udf.get_latest_ver(cur,num,srcs,curs) as latest_ver
from tmp;

结果:

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值