目录
UDF(User-Defined-Function)
一进一出
开发示例:
1.创建maven项目, 引入依赖
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.6</version>
</dependency>
</dependencies>
2.编写自定义类,继承GenericUDF类,并重写initialize和evaluate方法。
package hive.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* 自定义UDF函数
*
* 需要继承GenericUDF类,重写抽象方法。
*
* 函数的调用: select my_len('abcd');
* UDF
*/
public class StringLengthUDF extends GenericUDF {
/**
* 初始化方法 判断传入到参数的个数,类型等,约定函数的返回值类型
* @param objectInspectors 传入到函数的参数类型
* @return
* @throws UDFArgumentException
*/
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
//简单判断
//判断参数的个数
if(objectInspectors.length != 1){
throw new UDFArgumentLengthException("Input Args Length Error!!!");
}
//判断类型
if(!objectInspectors[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){
throw new UDFArgumentTypeException(0,"Input Args Type Error!!!");
}
//约定函数的返回值类型
return PrimitiveObjectInspectorFactory.javaIntObjectInspector; //int
}
/**
* 函数的逻辑处理
* @param deferredObjects 传入到函数的参数值
* @return
* @throws HiveException
*/
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
//获取参数
Object o = deferredObjects[0].get();
return o.toString().length();
}
@Override
public String getDisplayString(String[] strings) {
return "";
}
}
3.maven 打包,上传到服务器 /opt/module/hive/datas目录
重命名jar包,为了后续方便书写
4.将自定义的UDF函数jar包,在hive命令窗口,添加到hive 的class path中:
hive (default)> add jar /opt/module/hive/datas/myudf.jar ;
Added [/opt/module/hive/datas/myudf.jar] to class path
Added resources: [/opt/module/hive/datas/myudf.jar]
5.注册函数名称,绑定自定义的UDF类(hive.udf.StringLengthUDF)
temporary 可选,表示该函数仅在本次连接(会话)内有效,断开连接后自动失效。
hive (default)> create temporary function my_len as "hive.udf.StringLengthUDF";
6.使用
hive (default)> select my_len('adsdfsdf');
OK
_c0
8
Time taken: 0.022 seconds, Fetched: 1 row(s)
hive (default)> select my_len(3123123);
OK
_c0
7
Time taken: 0.033 seconds, Fetched: 1 row(s)
--异常报错示例:
--语句括号不匹配(如缺少右括号)时的语法解析错误:
FAILED: ParseException line 1:34 mismatched input '<EOF>' expecting ) near ')' in function specification
hive (default)> select my_len(split('a,b,v,d',','));
FAILED: SemanticException [Error 10016]: Line 1:14 Argument type mismatch '','': Input Args Type Error!!!
hive (default)> select my_len(3123123,3123);
FAILED: SemanticException [Error 10015]: Line 1:7 Arguments length mismatch '3123': Input Args Length Error!!!
UDAF(User-Defined-Aggregation Function)
聚集函数,多进一出
类似于:count/max/min
UDTF(User-Defined Table-Generating Functions)
一进多出
如lateral view explode()
需求:
hive(default)> select myudtf("hello,world,hadoop,hive", ",");
hello
world
hadoop
hive
1.自定义类,继承GenericUDTF类,重写initialize和Process方法
package hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* 自定义UDTF函数
*
* 需要继承 GenericUDTF类,并重写抽象方法
*
* 函数的使用: select myudtf('hive,hadoop,flume,kafka',',');
*
* 结果: word
* hive
* hadoop
* flume
* kafka
*
* 扩展:
* select myudtf2('hadoop-niupi,java-lihai,songsong-kuai,dahai-lang',',','-');
* 结果: word1 word2
* hadoop niupi
* java lihai
* songsong kuai
* dahai lang
*
*/
/**
 * Custom UDTF that splits a delimited string into one row per token.
 *
 * Extends {@link GenericUDTF}; must override initialize, process and close.
 *
 * Invocation: select myudtf('hive,hadoop,flume,kafka', ',');
 *
 * Result (single column "word"):
 *   hive
 *   hadoop
 *   flume
 *   kafka
 */
public class StringSplitUDTF extends GenericUDTF {

    // Output-row buffer, reused across process() calls to avoid per-row allocation.
    private final ArrayList<String> outList = new ArrayList<>();

    /**
     * Validates the argument count and declares the output columns
     * (names and types of the generated table).
     *
     * @param argOIs struct inspector describing the call's arguments
     * @return struct inspector for the single output column "word" (string)
     * @throws UDFArgumentException when the argument count is wrong
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Exactly two arguments are required: the data string and the separator.
        List<? extends StructField> fieldRefs = argOIs.getAllStructFieldRefs();
        if (fieldRefs.size() != 2) {
            throw new UDFArgumentException("Input Args Length Error!!!");
        }
        // Declare the output column name...
        ArrayList<String> fieldNames = new ArrayList<>();
        fieldNames.add("word");
        // ...and its type (string).
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits the first argument by the second and forwards one row per token.
     *
     * @param objects the argument values: [data, separator]
     * @throws HiveException on forwarding failure
     */
    @Override
    public void process(Object[] objects) throws HiveException {
        // A NULL data string or NULL separator yields no output rows.
        // Fix: the original called toString() unconditionally and threw
        // a NullPointerException for NULL input.
        if (objects[0] == null || objects[1] == null) {
            return;
        }
        String argsData = objects[0].toString();
        String argsSplit = objects[1].toString();
        // Note: String.split treats the separator as a regex (same as before).
        String[] words = argsData.split(argsSplit);
        for (String word : words) {
            // The buffer is reused, so clear it before building each row.
            outList.clear();
            outList.add(word);
            // Emit one row per token.
            forward(outList);
        }
    }

    /** No resources to release. */
    @Override
    public void close() throws HiveException {
    }
}
2.打包
3.将打好的jar包添加到linux中
4.将上传的jar包添加到hive的环境中,创建自定义函数
hive (default)> add jar /opt/module/hive/datas/myudtf.jar;
Added [/opt/module/hive/datas/myudtf.jar] to class path
Added resources: [/opt/module/hive/datas/myudtf.jar]
hive (default)> create temporary function myudtf as "hive.udtf.StringSplitUDTF";
OK
Time taken: 0.104 seconds
hive (default)> select myudtf("hive,hadoop,flume,kafka",",");
OK
word
hive
hadoop
flume
kafka
Time taken: 0.278 seconds, Fetched: 4 row(s)
测试
注意:如果自定义函数的jar包有问题,在hive中重复执行 add jar 来更新jar包不会生效;需要断开本次连接、重新进入,新的jar包方能生效。
扩展测试:返回两列
package hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* 自定义UDTF函数
*
* 需要继承 GenericUDTF类,并重写抽象方法
*
* 函数的使用: select myudtf('hive,hadoop,flume,kafka',',');
*
* 结果: word
* hive
* hadoop
* flume
* kafka
*
* 扩展:
* select myudtf2('hadoop-niupi,java-lihai,songsong-kuai,dahai-lang',',','-');
* 结果: word1 word2
* hadoop niupi
* java lihai
* songsong kuai
* dahai lang
*
*/
/**
 * Custom UDTF that splits a string into rows and each row into two columns.
 *
 * Extends {@link GenericUDTF}; must override initialize, process and close.
 *
 * Invocation:
 *   select myudtf2('hadoop-niupi,java-lihai,songsong-kuai,dahai-lang', ',', '-');
 *
 * Result (columns "word1", "word2"):
 *   hadoop    niupi
 *   java      lihai
 *   songsong  kuai
 *   dahai     lang
 */
public class StringSplitUDTF2 extends GenericUDTF {

    /** Number of output columns declared in initialize(). */
    private static final int NUM_COLS = 2;

    // Output-row buffer, reused across process() calls to avoid per-row allocation.
    private final ArrayList<String> outList = new ArrayList<>();

    /**
     * Validates the argument count and declares the two output columns.
     *
     * @param argOIs struct inspector describing the call's arguments
     * @return struct inspector for output columns "word1","word2" (both string)
     * @throws UDFArgumentException when the argument count is wrong
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Exactly three arguments: data string, row separator, column separator.
        List<? extends StructField> fieldRefs = argOIs.getAllStructFieldRefs();
        if (fieldRefs.size() != 3) {
            throw new UDFArgumentException("Input Args Length Error!!!");
        }
        // Declare the output column names...
        ArrayList<String> fieldNames = new ArrayList<>();
        fieldNames.add("word1");
        fieldNames.add("word2");
        // ...and their types (both string).
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits the data into rows by the row separator, each row into columns
     * by the column separator, and forwards one two-column row at a time.
     *
     * @param objects the argument values: [data, rowSeparator, colSeparator]
     * @throws HiveException on forwarding failure
     */
    @Override
    public void process(Object[] objects) throws HiveException {
        // NULL in any argument yields no output rows.
        // Fix: the original called toString() unconditionally and threw
        // a NullPointerException for NULL input.
        if (objects[0] == null || objects[1] == null || objects[2] == null) {
            return;
        }
        String argsData = objects[0].toString();  // e.g. 'hadoop-niupi,java-lihai,...'
        String rowsSplit = objects[1].toString(); // e.g. ","
        String colsSplit = objects[2].toString(); // e.g. "-"
        // Note: String.split treats both separators as regexes (same as before).
        String[] rows = argsData.split(rowsSplit);
        for (String row : rows) {
            // The buffer is reused, so clear it before building each row.
            outList.clear();
            String[] cols = row.split(colsSplit);
            // Emit exactly NUM_COLS columns: pad missing values with NULL and
            // drop extras. Fix: the original forwarded however many tokens the
            // split produced, so a malformed row (e.g. "hadoop" with no "-")
            // forwarded a wrongly-sized struct row.
            for (int i = 0; i < NUM_COLS; i++) {
                outList.add(i < cols.length ? cols[i] : null);
            }
            forward(outList);
        }
    }

    /** No resources to release. */
    @Override
    public void close() throws HiveException {
    }
}
hive (default)> add jar /opt/module/hive/datas/myudtf2.jar;
Added [/opt/module/hive/datas/myudtf2.jar] to class path
Added resources: [/opt/module/hive/datas/myudtf2.jar]
hive (default)> create temporary function myudtf2 as 'hive.udtf.StringSplitUDTF2';
OK
Time taken: 0.449 seconds
hive (default)> select myudtf2('hadoop-niup','songsong-lihai','dahai-kuai',',','-');
FAILED: UDFArgumentException Input Args Length Error!!!
hive (default)> select myudtf2('hadoop-niup,songsong-lihai,dahai-kuai',',','-');
OK
word1 word2
hadoop niup
songsong lihai
dahai kuai
Time taken: 0.194 seconds, Fetched: 3 row(s)