一、需求背景:
源表oracle抽取数据到hive中,源表没有更新时间字段,也无法协调部门配置oracle redo log或者CDC,但是需要获取到变化的数据,并打上标签。
此UDF技术点:每张表字段数不固定,如何写不定长函数,像concat,当然可以参考hive源码如何实现concat内置函数,下面使用HashMap存放函数不定长入参。
二、代码实现:
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.jasper.tagplugins.jstl.core.ForEach;
import org.apache.log4j.Logger;
/**
 * Labels rows produced by a FULL JOIN of an old and a new table snapshot as
 * "delete", "insert", "unchange", "update" or unknown, by comparing column
 * values pairwise. Arguments are passed as (old.col1, new.col1, old.col2,
 * new.col2, ...), which is why a variable-length argument list is accepted
 * and stored in maps keyed by argument position.
 *
 * @author zhifeiji
 * @date 2020-11-09
 */
@Description(name = "GenericUDFTableCompair",
    value = "_FUNC_(tab1.col1, tab2.col1, tab1.col2, tab2.col2, ...) "
        + "- Returns delete, unchange, insert, update or unknow",
    extended = "Example:\n"
        + "  select GenericUDFTableCompair(a.name, b.name, a.addr, b.addr)\n"
        + "  from test_old a\n"
        + "  full join test_new b\n"
        + "  on a.id = b.id;")
public class GenericUDFTableCompair extends GenericUDF {

    private static final Logger LOG = Logger.getLogger(GenericUDFTableCompair.class);

    // Internal per-pair classification labels. Declared once so that the
    // counting loop can compare with equals() instead of == (the original
    // compared with ==, which only worked because of string interning).
    private static final String NEW_COL_NULL = "newColNull";
    private static final String OLD_COL_NULL = "oldColNull";
    private static final String UNCHANGE = "unchange";

    // One StringObjectInspector per argument position (variable-length input).
    private final Map<Integer, StringObjectInspector> hashmapOI =
            new HashMap<Integer, StringObjectInspector>();
    // Classification of each column pair for the current row.
    private final Map<Integer, String> hpRet = new HashMap<Integer, String>();
    // Reused output object: avoids allocating a new Text for every row.
    private final Text text = new Text();

    /**
     * Validates the argument list and records a StringObjectInspector for
     * each argument.
     *
     * @param arguments inspectors for the call site's arguments, in pairs
     *                  (old column, new column)
     * @return a writable-string inspector: the UDF always returns a label
     * @throws UDFArgumentException if the count is zero or odd, or any
     *                              argument is not a primitive string
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Arguments come in (old, new) pairs, so the count must be even and
        // non-zero. (The original accepted zero arguments, which made
        // evaluate() report "delete" for every row.)
        if (arguments.length == 0 || arguments.length % 2 != 0) {
            throw new UDFArgumentException("The number of arguments must be even!");
        }
        hashmapOI.clear();
        for (int i = 0; i < arguments.length; i++) {
            // Validate BEFORE casting: the original cast to
            // StringObjectInspector first, so a non-string argument raised
            // ClassCastException instead of a descriptive UDFArgumentException,
            // and the later category check ran on the already-cast inspector.
            if (arguments[i].getCategory() != Category.PRIMITIVE) {
                throw new UDFArgumentException("The arguments must be PRIMITIVE!");
            }
            if (!(arguments[i] instanceof StringObjectInspector)) {
                throw new UDFArgumentException(
                        "Argument " + (i + 1) + " must be of string type!");
            }
            hashmapOI.put(i, (StringObjectInspector) arguments[i]);
        }
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }

    /**
     * Classifies one joined row by comparing each (old, new) column pair.
     *
     * @param arguments deferred values, in the same pair order as initialize
     * @return a Text label: "update" as soon as any pair differs; otherwise
     *         "delete" / "insert" / "unchange" when ALL pairs agree on that
     *         state, or a fallback message for mixed states
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Clear state from the previous row; the early "update" return below
        // could otherwise leave stale entries behind.
        hpRet.clear();
        final int pairs = arguments.length / 2;
        for (int num = 0; num < pairs; num++) {
            // Use the inspector matching each argument position. The original
            // indexed BOTH columns with hashmapOI.get(num), which only worked
            // because every inspector happened to be a StringObjectInspector.
            String oldCol = hashmapOI.get(2 * num)
                    .getPrimitiveJavaObject(arguments[2 * num].get());
            String newCol = hashmapOI.get(2 * num + 1)
                    .getPrimitiveJavaObject(arguments[2 * num + 1].get());
            if (newCol == null || newCol.trim().isEmpty()) {
                // New-side value missing: candidate "delete".
                hpRet.put(num, NEW_COL_NULL);
            } else if (oldCol == null || oldCol.trim().isEmpty()) {
                // Old-side value missing: candidate "insert".
                hpRet.put(num, OLD_COL_NULL);
            } else if (!newCol.trim().equals(oldCol.trim())) {
                // Any differing pair marks the whole row as an update.
                text.set("update");
                return text;
            } else {
                // Both present and equal. (The original had a further "unknow"
                // else-branch here, but it was unreachable: the two preceding
                // conditions are exhaustive once both values are non-null.)
                hpRet.put(num, UNCHANGE);
            }
        }
        // Count each classification with equals(), not ==.
        int deleteCnt = 0;
        int insertCnt = 0;
        int unchangeCnt = 0;
        for (int num = 0; num < pairs; num++) {
            String state = hpRet.get(num);
            LOG.info(state);
            if (NEW_COL_NULL.equals(state)) {
                deleteCnt++;
            } else if (OLD_COL_NULL.equals(state)) {
                insertCnt++;
            } else if (UNCHANGE.equals(state)) {
                unchangeCnt++;
            }
        }
        LOG.info("size=====" + pairs);
        LOG.info("deleteCnt=====" + deleteCnt);
        LOG.info("insertCnt=====" + insertCnt);
        LOG.info("unchangeCnt=====" + unchangeCnt);
        // A row gets a definite label only when ALL pairs agree; mixed states
        // fall through to the fallback message.
        if (deleteCnt == pairs) {
            text.set("delete");
        } else if (insertCnt == pairs) {
            text.set("insert");
        } else if (unchangeCnt == pairs) {
            text.set("unchange");
        } else {
            text.set("Unknow!Please contact the developer!");
        }
        return text;
    }

    /**
     * Renders the call for EXPLAIN output. The original concatenated the
     * children with no separator; join with commas so it reads like the call
     * site.
     */
    @Override
    public String getDisplayString(String[] children) {
        assert (children.length > 0);
        StringBuilder sb = new StringBuilder("GenericUDFTableCompair(");
        for (int i = 0; i < children.length; i++) {
            if (i > 0) {
                sb.append(", ");
            }
            sb.append(children[i]);
        }
        sb.append(")");
        return sb.toString();
    }
}
三、验证函数
1.打包上传centos;
2.启动hadoop和hive,启动hive之前删除hive.log日志,方便查看日志调试,hive.log路径可配置;
3.将上传到centos本地的jar包在hive命令行中加载至分布式缓存,并注册为临时函数;
add jar /home/hadoop/GenericUDFTableCompair.jar;
create temporary function GenericUDFTableCompair as 'com.paic.gbd.udfarray.GenericUDFTableCompair';
4.验证函数功能。
两表数据如下
使用函数,对增删改和不变的数据打上标签
select a.name,b.name,a.addr,b.addr,GenericUDFTableCompair(a.name,b.name,a.addr,b.addr) flag from test_old a
full join test_new b
on a.id=b.id
;
结果如下