一 数据样例
1566461617106|{
"cm":{
"ln":"-96.6",
"sv":"V2.1.6",
"os":"8.1.3",
"g":"REX44511@gmail.com",
"mid":"992",
"nw":"WIFI",
"l":"pt",
"vc":"5",
"hw":"750*1134",
"ar":"MX",
"uid":"992",
"t":"1566442996963",
"la":"13.9",
"md":"HTC-8",
"vn":"1.1.2",
"ba":"HTC",
"sr":"R"
},
"ap":"app",
"et":[
{
"ett":"1566438762478",
"en":"newsdetail",
"kv":{
"entry":"3",
"goodsid":"235",
"news_staytime":"6",
"loading_time":"3",
"action":"1",
"showtype":"2",
"category":"48",
"type1":""
}
},
{
"ett":"1566449474310",
"en":"ad",
"kv":{
"entry":"2",
"show_style":"5",
"action":"3",
"detail":"",
"source":"3",
"behavior":"2",
"content":"1",
"newstype":"7"
}
},
{
"ett":"1566389641833",
"en":"notification",
"kv":{
"ap_time":"1566387527480",
"action":"1",
"type":"2",
"content":""
}
},
{
"ett":"1566398913612",
"en":"active_foreground",
"kv":{
"access":"1",
"push_id":"1"
}
},
{
"ett":"1566430994142",
"en":"praise",
"kv":{
"target_id":6,
"id":0,
"type":2,
"add_time":"1566371286636",
"userid":4
}
}
]
}
时间戳|{
cm:{}// 公共字段
ap:{} //数据来源
et:[{}] //事件日志集合
}
二 需求分析
2.1 定义UDF函数抽取JSON中对应的字段的值
UDF :用户定义函数一进一出
package com.gc.function;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
/**
* Created by guochao on 201924.
*/
public class GetValueUDF extends UDF {
//解析事件日志 json 数据 jsonkey kety的值
public String evaluate(String json,String jsonkey){
String result = new String();
if (StringUtils.isEmpty(json) || StringUtils.isEmpty(jsonkey)){
return result;
}
String[] split = json.split("\\|");
try {
JSONObject jsonObject = new JSONObject(split[1].trim());
if(jsonObject.has(jsonkey)){ //事件中的其它字段
result= jsonObject.getString(jsonkey);
}else if("st".equals(jsonkey)){
result= split[0];//时间字段
}else{
//解析公共字段的值
JSONObject cm = jsonObject.getJSONObject("cm");//公共字段
if (cm.has(jsonkey)) {
result =cm.getString(jsonkey);
}
}
} catch (JSONException e) {
e.printStackTrace();
}
return result;
}
public static void main(String[] args) {
String key="1566461617106|{\"cm\":{\"ln\":\"-92.7\",\"sv\":\"V2.6.0\",\"os\":\"8.1.2\",\"g\":\"H8694YIR@gmail.com\",\"mid\":\"988\",\"nw\":\"3G\",\"l\":\"es\",\"vc\":\"2\",\"hw\":\"1080*1920\",\"ar\":\"MX\",\"uid\":\"988\",\"t\":\"1566432661347\",\"la\":\"-42.1\",\"md\":\"Huawei-4\",\"vn\":\"1.2.0\",\"ba\":\"Huawei\",\"sr\":\"O\"},\"ap\":\"app\",\"et\":[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]}";
String ln = new GetValueUDF().evaluate(key, "et");
System.out.println("ln = " + ln);
}
}
注意: 方法名必须为evaluate 且必须有返回值,可以返回NUll,可在UDF的类中查看到 UDF—>构造方法---->DefaultUDFMethodResolver–>下面的这段代码
2.2 定义UDTF函数取出event事件集合中的每个事件
package com.gc.function;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.List;
/**
* Created by guochao on 2019/8/24.
*/
public class GetValueUDTF extends GenericUDTF {
//处理对应的逻辑
// 传入的数据类型为[[]]数组
@Override
public void process(Object[] objects) {
//可以传入多个 在此处是一进多出 所以只取第一个
if (objects!=null && objects.length!=0) {
try {
JSONArray array = new JSONArray(objects[0].toString());
String [] result=null;
for (int i=0;i<array.length();i++){
JSONObject jsonObject = array.getJSONObject(i);
// 获取到每个事件的事件类型
if (null!=jsonObject && jsonObject.has("en")) {
String event_name = jsonObject.getString("en");
result= new String[]{event_name,array.getString(i)};
try {
System.out.println("event_name = " + event_name);
System.out.println("jsonObject = " + jsonObject);
forward(result);
} catch (HiveException e) {
System.out.println(e.getMessage());
continue;
}
}
}
} catch (JSONException e) {
System.out.println(e.getMessage());
e.printStackTrace();
}
}
}
@Override
public void close() {
}
//初始化 设指定输出参数的名称和输出的类型
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
List<String> list = new ArrayList<String>();
List<ObjectInspector> fileds = new ArrayList<ObjectInspector>();
list.add("event_name");
fileds.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
list.add("event_json");
fileds.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(list, fileds);
}
// public static void main(String[] args) {
// new GetValueUDTF().process(new Object[]{"[{\"ett\":\"1566413607222\",\"en\":\"newsdetail\",\"kv\":{\"entry\":\"1\",\"goodsid\":\"234\",\"news_staytime\":\"6\",\"loading_time\":\"0\",\"action\":\"3\",\"showtype\":\"3\",\"category\":\"56\",\"type1\":\"\"}},{\"ett\":\"1566452826270\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"9\",\"action\":\"3\",\"extend1\":\"\",\"type\":\"2\",\"type1\":\"\",\"loading_way\":\"2\"}},{\"ett\":\"1566367597434\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1566424725932\",\"action\":\"4\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1566370724555\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}},{\"ett\":\"1566413177347\",\"en\":\"comment\",\"kv\":{\"p_comment_id\":1,\"addtime\":\"1566460762369\",\"praise_count\":401,\"other_id\":7,\"comment_id\":3,\"reply_count\":87,\"userid\":5,\"content\":\"遗甜港蹦辐铣\"}},{\"ett\":\"1566369789031\",\"en\":\"praise\",\"kv\":{\"target_id\":1,\"id\":7,\"type\":4,\"add_time\":\"1566440220563\",\"userid\":6}}]"});
// }
}
注意:还需要实现initialize方法,在此方法中完成输出结果的类型和名称定义
三 函数测试
3.1 打包项目上传到HDFS上
hadoop fs -mkdir /user/hive/jars
hadoop fs -put /opt/module/hive1.2.1/lib/hive-function.jar /user/hive/jars
3.2 创建指定的函数
3.2.1 启动hdfs,yarn
hadoop 群起脚本
hadoop-shell start
3.2.2 启动hive
3.2.3创建函数
create [temporary] function [dbname.]function_name AS class_name;
// 处理公共字段的函数
create function gmall.base_analizer as 'com.gc.function.GetValueUDF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
// 处理event 数组的函数
create function gmall.event_analizer as 'com.gc.function.GetValueUDTF' using jar 'hdfs://hadoop102:9000/user/hive/jars/hive-function.jar';
3.3 测试函数
select base_analizer(line,"sv") as t ,
event_name,
event_json
from gmall.ods_event_log lateral view event_analizer(base_analizer(line,"et")) tmp as event_name,event_json limit 3;
测试运行结果如下
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
| t | event_name | event_json |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
| V2.0.3 | display | {"ett":"1566522643775","en":"display","kv":{"goodsid":"0","action":"1","extend1":"1","place":"0","category":"58"}} |
| V2.0.3 | loading | {"ett":"1566496875003","en":"loading","kv":{"extend2":"","loading_time":"2","action":"3","extend1":"","type":"3","type1":"433","loading_way":"1"}} |
| V2.0.3 | active_foreground | {"ett":"1566462987798","en":"active_foreground","kv":{"access":"1","push_id":"3"}} |
+---------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+--+
总结:
- event_analizer 函数为一进多出,需配合 lateral view 一起使用
- base_analizer 函数为一进一出
- as “t” 无法执行,需替换成 as t