(1) Create a Maven project: hivefunction
(2) Create a package: com.atguigu.hive.udtf
(3) Add the following dependency:
<dependencies>
    <!-- Hive dependency -->
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.2</version>
    </dependency>
</dependencies>
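Note: the UDTF code below also imports org.json.JSONArray. This class is normally available through the hive-exec artifact; if the import does not resolve in your build, add the org.json dependency explicitly (the version here is illustrative, not from the original tutorial):

<dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20180813</version>
</dependency>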
(4) Write the code:
package com.atguigu.hive.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;

import java.util.ArrayList;
import java.util.List;

public class ExplodeJSONArray extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // 1. Validate the arguments: exactly one parameter is expected
        if (argOIs.getAllStructFieldRefs().size() != 1) {
            throw new UDFArgumentException("explode_json_array requires exactly one argument");
        }
        // 2. The argument must be of type string
        if (!"string".equals(argOIs.getAllStructFieldRefs().get(0).getFieldObjectInspector().getTypeName())) {
            throw new UDFArgumentException("The argument of explode_json_array must be of type string");
        }
        // 3. Define the name and type of the output column
        List<String> fieldNames = new ArrayList<String>();
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("items");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] objects) throws HiveException {
        // 1. Read the input value
        String jsonArray = objects[0].toString();
        // 2. Parse the string into a JSON array
        JSONArray actions = new JSONArray(jsonArray);
        // 3. Emit one output row per element of the array
        for (int i = 0; i < actions.length(); i++) {
            String[] result = new String[1];
            result[0] = actions.getString(i);
            forward(result);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
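Before packaging, the UDTF can be smoke-tested locally. The sketch below is hypothetical and not part of the original tutorial (the demo class name and sample input are made up): it wires in a collector that prints each forwarded row and calls process() directly; initialize() is skipped here because process() does not use the argument inspectors.

package com.atguigu.hive.udtf;

// Hypothetical local smoke test for ExplodeJSONArray
public class ExplodeJSONArrayDemo {
    public static void main(String[] args) throws Exception {
        ExplodeJSONArray udtf = new ExplodeJSONArray();
        // Print every row the UDTF forwards instead of handing it to Hive
        udtf.setCollector(row -> System.out.println(((Object[]) row)[0]));
        // Made-up input: a JSON array of three strings
        udtf.process(new Object[]{"[\"a\", \"b\", \"c\"]"});
        // Expected output: a, b, c -- one line per array element
    }
}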
4) Create the function
(1) Package the project into a jar.
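One way to build it, assuming Maven is available on the command line:

mvn clean package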
(2) Upload hivefunction-1.0-SNAPSHOT.jar to /opt/module on hadoop102, then upload the jar to the /user/hive/jars path on HDFS:
[atguigu@hadoop102 module]$ hadoop fs -mkdir -p /user/hive/jars
[atguigu@hadoop102 module]$ hadoop fs -put hivefunction-1.0-SNAPSHOT.jar /user/hive/jars
(3) Create a permanent function and associate it with the compiled Java class:
hive (gmall)>
create function explode_json_array
as 'com.atguigu.hive.udtf.ExplodeJSONArray'
using jar 'hdfs://hadoop102:8020/user/hive/jars/hivefunction-1.0-SNAPSHOT.jar';
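A quick sanity check of the new function (the JSON literal is made-up sample data; this should return one row per array element, i.e. a, b and c):

hive (gmall)>
select explode_json_array('["a","b","c"]');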
(4) Note: what if you modify the custom function and rebuild the jar? Simply replace the old jar on the HDFS path with the new one, then restart the Hive client, as shown below.
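For example, assuming the rebuilt jar is in the current directory (-f overwrites the old file on HDFS):

[atguigu@hadoop102 module]$ hadoop fs -put -f hivefunction-1.0-SNAPSHOT.jar /user/hive/jars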
5) Load the data
hive (gmall)>
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
insert overwrite table dwd_action_log partition(dt='2020-06-14')
select
get_json_object(line,'$.common.ar'),
get_json_object(line,'$.common.ba'),
get_json_object(line,'$.common.ch'),
get_json_object(line,'$.common.md'),
get_json_object(line,'$.common.mid'),
get_json_object(line,'$.common.os'),
get_json_object(line,'$.common.uid'),
get_json_object(line,'$.common.vc'),
get_json_object(line,'$.page.during_time'),
get_json_object(line,'$.page.item'),
get_json_object(line,'$.page.item_type'),
get_json_object(line,'$.page.last_page_id'),
get_json_object(line,'$.page.page_id'),
get_json_object(line,'$.page.sourceType'),
get_json_object(action,'$.action_id'),
get_json_object(action,'$.item'),
get_json_object(action,'$.item_type'),
get_json_object(action,'$.ts')
from ods_log
lateral view explode_json_array(get_json_object(line,'$.actions')) tmp as action
where dt='2020-06-14'
and get_json_object(line,'$.actions') is not null;
6) Inspect the data
hive (gmall)>
select * from dwd_action_log where dt='2020-06-14' limit 2;