UDAF
A custom average (AVG) function. A UDAF aggregates many input rows into one output value; the class below reimplements avg() over a double column.
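Hive drives a UDAF evaluator in one of four modes: PARTIAL1 (raw rows in, partial aggregate out, on the map side), PARTIAL2 (partial in, partial out, in a combiner), FINAL (partial in, final result out, on the reduce side), and COMPLETE (raw rows in, final result out, no reduce phase). init() therefore returns the {sum, count} partial struct in PARTIAL1/PARTIAL2 and a plain double in FINAL/COMPLETE, and merge() must read back exactly what terminatePartial() emits.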
package cn.kgc.kb11.gudaf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.DoubleWritable;
import java.util.ArrayList;
import java.util.List;
/**
* @author zhouhu
* @Date
* @Description custom average UDAF
*/
public class TestAVG implements GenericUDAFResolver2 {
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        ObjectInspector[] paramOis = info.getParameterObjectInspectors();
        if (paramOis.length != 1) {
            throw new UDFArgumentLengthException("exactly one numeric argument is expected");
        }
        return new AvgTest();
    }
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
        // legacy entry point; Hive calls the GenericUDAFParameterInfo overload above
        return new AvgTest();
    }
    public static class AvgTest extends GenericUDAFEvaluator {
        DoubleObjectInspector ooi;        // reads the raw double input column
        StructObjectInspector partialOi;  // reads the {sum, count} partial struct
        StructField sumField;
        StructField countField;
        DoubleWritable dw;                // reusable output object for the final average
        @Override
        public AggregationBuffer getNewAggregationBuffer() throws HiveException {
            AvgAgg agg = new AvgAgg();
            reset(agg);
            return agg;
        }
        @Override
        public void reset(AggregationBuffer agg) throws HiveException {
            AvgAgg a = (AvgAgg) agg;
            a.sum = 0.0;
            a.count = 0;
        }
        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
            super.init(m, parameters);
            dw = new DoubleWritable(0.0);
            if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
                // input is the raw double column
                ooi = (DoubleObjectInspector) parameters[0];
            } else {
                // input is the {sum, count} struct produced by terminatePartial()
                partialOi = (StructObjectInspector) parameters[0];
                sumField = partialOi.getStructFieldRef("sum");
                countField = partialOi.getStructFieldRef("count");
            }
            if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) {
                // partial output: a struct {sum: double, count: int}
                List<String> fieldNames = new ArrayList<>();
                fieldNames.add("sum");
                fieldNames.add("count");
                List<ObjectInspector> structFieldOis = new ArrayList<>();
                structFieldOis.add(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector);
                structFieldOis.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
                return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, structFieldOis);
            }
            // final output: the average as a single double
            return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
        }
        @Override
        public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
            // called once per input row: there is a single numeric column,
            // so accumulate its value and bump the count
            if (parameters == null || parameters[0] == null) {
                return; // skip NULLs, like the built-in avg()
            }
            AvgAgg a = (AvgAgg) agg;
            a.sum += ooi.get(parameters[0]);
            a.count++;
        }
        @Override
        public Object terminatePartial(AggregationBuffer agg) throws HiveException {
            // emit {sum, count}, matching the struct ObjectInspector returned by init()
            AvgAgg a = (AvgAgg) agg;
            return new Object[]{a.sum, a.count};
        }
        @Override
        public void merge(AggregationBuffer agg, Object partial) throws HiveException {
            if (partial == null) {
                return;
            }
            // read sum and count back out of the partial struct via its ObjectInspector,
            // which works for any struct encoding (standard, lazy-binary, ...)
            AvgAgg a = (AvgAgg) agg;
            Object sumObj = partialOi.getStructFieldData(partial, sumField);
            Object countObj = partialOi.getStructFieldData(partial, countField);
            a.sum += ((DoubleObjectInspector) sumField.getFieldObjectInspector()).get(sumObj);
            a.count += ((IntObjectInspector) countField.getFieldObjectInspector()).get(countObj);
        }
        @Override
        public Object terminate(AggregationBuffer agg) throws HiveException {
            AvgAgg a = (AvgAgg) agg;
            if (a.count == 0) {
                return null; // no non-NULL input: the average is undefined
            }
            dw.set(a.sum / a.count);
            return dw;
        }
        @AggregationType(estimable = true)
        static class AvgAgg extends AbstractAggregationBuffer {
            double sum = 0.0;
            int count = 0;
            @Override
            public int estimate() {
                return 12; // 8-byte double + 4-byte int
            }
        }
    }
}
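To try the UDAF, package the class into a jar and register it in Hive. A minimal sketch, assuming a hypothetical jar path, function name my_avg, and an employees(dept string, salary double) table:

ADD JAR /path/to/hive-udf.jar;
CREATE TEMPORARY FUNCTION my_avg AS 'cn.kgc.kb11.gudaf.TestAVG';
-- behaves like the built-in avg(): one double per group
SELECT dept, my_avg(salary) FROM employees GROUP BY dept;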
UDTF
Splitting one row into multiple rows (similar to the built-in explode() function)
package cn.kgc.kb11.gudtf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import java.util.ArrayList;
import java.util.List;
/**
* @author zhouhu
* @Date
* @Description custom explode-style UDTF
*/
@Description(
        name = "splToLines",
        value = "splToLines(str, separator) - splits str on separator and returns one row per token.",
        extended = "Example:\n" +
                "  SELECT splToLines('a,b,c', ',');\n" +
                "returns three rows: 'a', 'b', 'c'."
)
public class TestUDTF extends GenericUDTF {
    StringObjectInspector strOi;          // reads the input string
    StringObjectInspector separatorChar;  // reads the separator string
    // 1. validate the input types
    // 2. define the output schema
    // Hive calls initialize() first; it returns the shape of the UDTF's output rows
    // (how many columns, their names and types).
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
        if (inputFields.size() != 2) {
            throw new UDFArgumentException("splToLines takes exactly two arguments: str, separator");
        }
        strOi = (StringObjectInspector) inputFields.get(0).getFieldObjectInspector();
        separatorChar = (StringObjectInspector) inputFields.get(1).getFieldObjectInspector();
        // one output column, named "values", of type string
        ArrayList<String> name = new ArrayList<>();
        name.add("values");
        ArrayList<ObjectInspector> listOis = new ArrayList<>();
        listOis.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(name, listOis);
    }
    // After initialization, Hive calls process() once per input row; the real work
    // happens here. Each forward() call emits one output row; to emit several
    // columns, put the column values into one array and pass that array to forward().
    @Override
    public void process(Object[] args) throws HiveException {
        if (args[0] == null || args[1] == null) {
            return; // nothing to emit for NULL input
        }
        String str = strOi.getPrimitiveJavaObject(args[0]);
        String separator = separatorChar.getPrimitiveJavaObject(args[1]);
        // note: String.split() treats the separator as a regular expression
        Object[] forwardObj = new Object[1];
        for (String line : str.split(separator)) {
            forwardObj[0] = line;
            forward(forwardObj); // one output row per token
        }
    }
    // close() is called last; release any resources held by the UDTF here.
    @Override
    public void close() throws HiveException {
    }
}
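Registration follows the same pattern as the UDAF above; the jar path and the t(id int, tags string) table below are hypothetical:

ADD JAR /path/to/hive-udf.jar;
CREATE TEMPORARY FUNCTION splToLines AS 'cn.kgc.kb11.gudtf.TestUDTF';
-- direct call: one string in, three rows out
SELECT splToLines('a,b,c', ',');
-- LATERAL VIEW keeps other columns alongside the generated rows
SELECT t.id, v.val
FROM t LATERAL VIEW splToLines(t.tags, ',') v AS val;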