Parsing event logs with custom Hive UDF and UDTF functions

1 Sample Data

1566461617106|{
    "cm":{
        "ln":"-96.6",
        "sv":"V2.1.6",
        "os":"8.1.3",
        "g":"REX44511@gmail.com",
        "mid":"992",
        "nw":"WIFI",
        "l":"pt",
        "vc":"5",
        "hw":"750*1134",
        "ar":"MX",
        "uid":"992",
        "t":"1566442996963",
        "la":"13.9",
        "md":"HTC-8",
        "vn":"1.1.2",
        "ba":"HTC",
        "sr":"R"
    },
    "ap":"app",
    "et":[
        {
            "ett":"1566438762478",
            "en":"newsdetail",
            "kv":{
                "entry":"3",
                "goodsid":"235",
                "news_staytime":"6",
                "loading_time":"3",
                "action":"1",
                "showtype":"2",
                "category":"48",
                "type1":""
            }
        },
        {
            "ett":"1566449474310",
            "en":"ad",
            "kv":{
                "entry":"2",
                "show_style":"5",
                "action":"3",
                "detail":"",
                "source":"3",
                "behavior":"2",
                "content":"1",
                "newstype":"7"
            }
        },
        {
            "ett":"1566389641833",
            "en":"notification",
            "kv":{
                "ap_time":"1566387527480",
                "action":"1",
                "type":"2",
                "content":""
            }
        },
        {
            "ett":"1566398913612",
            "en":"active_foreground",
            "kv":{
                "access":"1",
                "push_id":"1"
            }
        },
        {
            "ett":"1566430994142",
            "en":"praise",
            "kv":{
                "target_id":6,
                "id":0,
                "type":2,
                "add_time":"1566371286636",
                "userid":4
            }
        }
    ]
}

timestamp|{
    cm:{}   // common fields
    ap:""   // data source (the string "app" in the sample)
    et:[{}] // array of event logs
}

2 Requirements Analysis

2.1 Define a UDF to extract a given field's value from the JSON

A UDF (user-defined function) takes one row in and produces one row out.

package com.gc.function;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDF extends UDF {
    // Parse an event-log line and return the value of the given JSON key.
    public String evaluate(String json, String jsonkey) {
        String result = "";
        if (StringUtils.isEmpty(json) || StringUtils.isEmpty(jsonkey)) {
            return result;
        }
        // The line layout is "timestamp|{json}".
        String[] split = json.split("\\|");
        if (split.length < 2) {
            // Malformed line: no JSON payload after the separator.
            return result;
        }
        try {
            JSONObject jsonObject = new JSONObject(split[1].trim());
            if (jsonObject.has(jsonkey)) {
                // Top-level fields such as "ap" and "et".
                result = jsonObject.getString(jsonkey);
            } else if ("st".equals(jsonkey)) {
                // "st" maps to the timestamp in front of the "|".
                result = split[0];
            } else {
                // Otherwise look the key up among the common fields.
                JSONObject cm = jsonObject.getJSONObject("cm");
                if (cm.has(jsonkey)) {
                    result = cm.getString(jsonkey);
                }
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return result;
    }

    public static void main(String[] args) {
        String key="1566461617106|{\"cm\":{\"ln\":\"-92.7\",\"sv\":\"V2.6.0\",\"os\":\"8.1.2\",\"g\":\"H8694YIR@gmail.com\",\"mid\":\"988\",\"nw\":\"3G\",\"l\":\"es\",\"vc\":\"2\",\"hw\":\"1080*1920\",\"ar\":\"MX\",\"uid\":\"988\",\"t\":\"1566432661347\",\"la\":\"-42.1\",\"md\":\"Huawei-4\",\"vn\":\"1.2.0\",\"ba\":\"Huawei\",\"sr\":\"O\"},\"ap\":\"app\",\"et\":[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]}";
        String et = new GetValueUDF().evaluate(key, "et");
        System.out.println("et = " + et);
    }
}

Note: the method must be named evaluate and must declare a return value (returning null is allowed). You can see why inside Hive's UDF class: UDF -> constructor -> DefaultUDFMethodResolver -> the code below.
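
For reference, the resolver body in question looks roughly like this (paraphrased from the Hive 1.2.x source, so formatting may differ between versions); it uses reflection to find a method literally named evaluate whose parameters match the call:

@Override
public Method getEvalMethod(List<TypeInfo> argClasses) throws UDFArgumentException {
    // Resolve the "evaluate" method on the UDF class whose signature
    // matches the argument types used at the call site.
    return FunctionRegistry.getMethodInternal(udfClass, "evaluate", false, argClasses);
}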

2.2 Define a UDTF to extract each event from the event array

package com.gc.function;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by guochao on 2019/8/24.
 */
public class GetValueUDTF extends GenericUDTF {
    // Core logic: the single input argument is the "et" JSON array,
    // and each element of the array is emitted as one output row.
    @Override
    public void process(Object[] objects) {
        // Multiple arguments may be passed in; this UDTF only uses the first.
        if (objects != null && objects.length != 0) {
            try {
                JSONArray array = new JSONArray(objects[0].toString());
                for (int i = 0; i < array.length(); i++) {
                    JSONObject jsonObject = array.getJSONObject(i);
                    // Emit one row per event that carries an event name ("en").
                    if (null != jsonObject && jsonObject.has("en")) {
                        String eventName = jsonObject.getString("en");
                        String[] result = new String[]{eventName, array.getString(i)};
                        try {
                            forward(result);
                        } catch (HiveException e) {
                            // Skip the event that failed to forward and keep going.
                            System.out.println(e.getMessage());
                        }
                    }
                }
            } catch (JSONException e) {
                System.out.println(e.getMessage());
                e.printStackTrace();
            }
        }
    }

    @Override
    public void close() {
    }

    // Initialization: declare the names and types of the output columns.
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<String> fieldNames = new ArrayList<String>();
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

//    public static void main(String[] args) {
//        new GetValueUDTF().process(new Object[]{"[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]"});
//    }
}

Note: you must also implement the initialize method, which defines the names and types of the output columns.
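
A side note on the commented-out main above: calling process outside of Hive ends in a NullPointerException, because forward hands each row to a Collector that only Hive normally installs. A minimal standalone test is still possible by wiring up a Collector by hand; the sketch below assumes the GetValueUDTF class above (setCollector and the Collector interface are part of Hive's GenericUDTF API, and the sample array is a shortened copy of the "et" data from section 1):

import org.apache.hadoop.hive.ql.udf.generic.Collector;

public class GetValueUDTFTest {
    public static void main(String[] args) {
        GetValueUDTF udtf = new GetValueUDTF();
        // Install a Collector by hand so forward() has somewhere to send rows.
        udtf.setCollector(new Collector() {
            @Override
            public void collect(Object input) {
                Object[] row = (Object[]) input;
                System.out.println(row[0] + " -> " + row[1]);
            }
        });
        // One event from the sample data in section 1.
        udtf.process(new Object[]{
                "[{\"ett\":\"1566398913612\",\"en\":\"active_foreground\",\"kv\":{\"access\":\"1\",\"push_id\":\"1\"}}]"});
    }
}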

3 Function Testing

3.1 Package the project and upload the jar to HDFS

 hadoop fs -mkdir /user/hive/jars
 hadoop fs -put /opt/module/hive1.2.1/lib/hive-function.jar /user/hive/jars
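
To confirm the upload landed where expected:

 hadoop fs -ls /user/hive/jars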

3.2 Create the specified functions

3.2.1 Start HDFS and YARN

Hadoop cluster start script:

hadoop-shell start

3.2.2 Start Hive

3.2.3 Create the functions

create [temporary] function [dbname.]function_name AS class_name [USING JAR 'hdfs_jar_path'];

-- function for the common fields
create function gmall.base_analizer as 'com.gc.function.GetValueUDF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
-- function for the "et" event array
create function gmall.event_analizer as 'com.gc.function.GetValueUDTF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
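
As a quick sanity check, the new entries should show up in the function list (describe function with a db-qualified name is assumed to work here; on some Hive 1.x versions a plain show functions is the more reliable check):

show functions;
describe function gmall.base_analizer;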

3.3 Test the functions

select base_analizer(line, "sv") as t,
       event_name,
       event_json
from gmall.ods_event_log
lateral view event_analizer(base_analizer(line, "et")) tmp as event_name, event_json
limit 3;

The test run produces the following output:

+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
|    t    |     event_name     |                                                                     event_json                                                                      |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
| V2.0.3  | display            | {"ett":"1566522643775","en":"display","kv":{"goodsid":"0","action":"1","extend1":"1","place":"0","category":"58"}}                                  |
| V2.0.3  | loading            | {"ett":"1566496875003","en":"loading","kv":{"extend2":"","loading_time":"2","action":"3","extend1":"","type":"3","type1":"433","loading_way":"1"}}  |
| V2.0.3  | active_foreground  | {"ett":"1566462987798","en":"active_foreground","kv":{"access":"1","push_id":"3"}}                                                                  |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+

Summary:

  • event_analizer is a UDTF (one row in, many rows out) and must be used together with lateral view
  • base_analizer is a UDF (one row in, one row out)
  • as "t" fails to execute; write it as as t instead
