1、UDF
UDF是一种常见的自定义函数,用于对单个输入参数进行计算。以下是一个求平方的UDF例子:
package com.example.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
@Description(name = "square", value = "_FUNC_(x) - Returns the square of x",
extended = "Example:\n > SELECT square(3) FROM src LIMIT 1;\n 9")
public class Square extends UDF {
    /**
     * Returns n squared, or null on null input (SQL NULL semantics).
     */
    public DoubleWritable evaluate(final DoubleWritable n) {
        if (n == null) return null;
        final double v = n.get();
        return new DoubleWritable(v * v);
    }

    /**
     * String overload: parses the text as a double and returns its square.
     * Returns null for null input or unparseable text rather than failing the query.
     */
    public DoubleWritable evaluate(final Text str) {
        if (str == null) return null;
        try {
            // Parse once — the original called Double.parseDouble on the same string twice.
            final double v = Double.parseDouble(str.toString());
            return new DoubleWritable(v * v);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
</dependencies>
2、UDAF
UDAF是一种用于聚合计算的自定义函数,例如求平均值、最大值等。以下是一个求平均值的UDAF例子:
package com.example.hive.udaf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
@Description(name = "my_avg", value = "_FUNC_(x) - Returns the average of a set of numbers",
extended = "Example:\n > SELECT my_avg(column_name) FROM table_name;")
public class MyAvg extends UDAF {
    /**
     * Partial aggregation state: running sum plus element count.
     * Both must be carried between tasks — shipping only the partial average
     * (as the original did) makes merge() compute an average of averages,
     * which is wrong whenever partitions hold different numbers of rows.
     */
    public static class PartialResult {
        public double sum;
        public long count;
    }

    public static class AvgEvaluator implements UDAFEvaluator {
        private double sum = 0;
        private long count = 0;

        /** Resets the accumulator before (re)use. */
        public void init() {
            sum = 0;
            count = 0;
        }

        /** Accumulates one input row; NULL values are skipped (SQL AVG semantics). */
        public boolean iterate(DoubleWritable value) {
            if (value != null) {
                sum += value.get();
                count++;
            }
            return true;
        }

        /**
         * Emits the partial state (sum, count) for this task, or null if
         * no non-null input was seen. The original returned sum/count here
         * and merge() then did sum += avg; count++ — an average-of-averages bug.
         */
        public PartialResult terminatePartial() {
            if (count == 0) return null;
            PartialResult partial = new PartialResult();
            partial.sum = sum;
            partial.count = count;
            return partial;
        }

        /** Folds another task's partial state into this accumulator. */
        public boolean merge(PartialResult other) {
            if (other != null) {
                sum += other.sum;
                count += other.count;
            }
            return true;
        }

        /** Final result: sum/count, or null when no non-null input existed. */
        public DoubleWritable terminate() {
            if (count == 0) return null;
            return new DoubleWritable(sum / count);
        }
    }
}
3、UDTF
UDTF是一种用户自定义的表生成函数,用于从单个输入行中生成多个输出行。以下是一个将一行文本按逗号分隔并输出的UDTF例子:
package com.example.hive.udtf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.List;
@Description(name = "split_rows",
value = "_FUNC_(str) - Explode a string into multiple rows",
extended = "Example:\n > SELECT * FROM split_rows('a,b,c');\n a\n b\n c")
public class SplitRowsUDTF extends GenericUDTF {
    // Object inspector for the single primitive (string) argument, set in initialize().
    private PrimitiveObjectInspector inputOI = null;

    @Override
    public void close() throws HiveException {
        // No buffered rows to flush.
    }

    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Exactly one argument is accepted.
        if (argOIs.getAllStructFieldRefs().size() != 1) {
            throw new UDFArgumentLengthException("The function split_rows() takes exactly one argument.");
        }
        ObjectInspector oi = argOIs.getAllStructFieldRefs().get(0).getObjectInspector();
        if (oi.getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "The function split_rows() takes a string as parameter.");
        }
        inputOI = (PrimitiveObjectInspector) oi;
        // Output schema: a single string column named "col1".
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // getPrimitiveJavaObject may yield a String (java OI) or a Text (writable OI);
        // the original hard cast to Text threw ClassCastException for String inputs.
        Object raw = inputOI.getPrimitiveJavaObject(args[0]);
        if (raw == null) {
            return; // NULL input produces no output rows (original NPE'd here).
        }
        String input = raw.toString();
        for (String value : input.split(",")) {
            forward(new Object[]{value});
        }
    }
}