hive0.13的自定义UDTF函数的应用
代码的编写
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.csm.data.udf.hive;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* 将一个字段,它的值为
* 流行|90后|粤语|
* 这样的字段,转换为
* 流行
* 90后
* 粤语
* 这样的三行
* 本方法只能支持
* select csm_explode(tag) from t_test这种单字段的形式
*
*/
@Description(name = "csm_explode", value = "_FUNC_(a) - separates the elements of string with '|' into multiple rows")
public class CSMUDTFExplode extends GenericUDTF {
    /*
     * Explodes a single '|'-delimited string column into one output row per
     * element. Example: the value "流行|90后|粤语|" produces three rows:
     * 流行 / 90后 / 粤语 (a trailing '|' adds no row, because String.split
     * discards trailing empty strings).
     *
     * Only the single-column form is supported:
     *   SELECT csm_explode(tag) FROM t_test
     */

    @Override
    public void close() throws HiveException {
        // No per-query resources are held, so there is nothing to release.
    }

    /**
     * Validates the argument list and declares the output schema:
     * a single string column named "tag".
     *
     * @param args object inspectors for the call-site arguments
     * @return struct inspector describing the one-column output row
     * @throws UDFArgumentException if not called with exactly one argument
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentException("csm_explode() takes only one argument");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // Name of the single column each forwarded row carries.
        fieldNames.add("tag");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(
                fieldNames, fieldOIs);
    }

    /**
     * Splits the input value on '|' and forwards one row per element.
     * A SQL NULL input produces no rows, matching the behavior of the
     * built-in explode() (the original code threw a NullPointerException here).
     *
     * @param o the row's argument values; o[0] is the string to split
     * @throws HiveException if forwarding a row fails
     */
    @Override
    public void process(Object[] o) throws HiveException {
        // Guard against SQL NULL: Hive passes null for NULL column values.
        if (o == null || o.length == 0 || o[0] == null) {
            return;
        }
        String input = o[0].toString();
        String[] parts = input.split("\\|");
        // forward() emits one row; each element of the array is one column.
        for (String part : parts) {
            forward(new String[] { part });
        }
    }

    @Override
    public String toString() {
        // Return the registered function name (was "") so EXPLAIN plans
        // and debug output identify this UDTF.
        return "csm_explode";
    }
}
这个自定义函数最好打包成一个单独的jar包。导出runnable jar时需要指定一个含main函数的类作为默认入口——虽然这个类在Hive中实际用不上,只是满足打包要求。
写了一个ATest作为有main函数的类
作为java application运行一下
运行结果不重要,只要确定main函数有效就可以了
导出Jar包:
这里选择导出为runnable jar包
指定刚刚定义的main函数类作为jar包的默认入口
忽略这个警告:
导出jar包时,它所依赖的Hadoop/Hive相关jar包不需要一并上传到服务器上——目标服务器已具备Hadoop环境,这些依赖在运行时由环境提供。
使用scp工具上传jar包
原始的Hive表数据:
注册自定义函数为临时函数
使用该自定义函数