Hive UDF—实现表比对

2 篇文章 0 订阅

一、需求背景:

源表数据从 Oracle 抽取到 Hive 中,源表没有更新时间字段,也无法协调相关部门配置 Oracle redo log 或 CDC,但需要识别出发生变化的数据,并打上标签。

此 UDF 的技术点在于:每张表字段数不固定,如何编写变长参数函数(类似 concat)。当然可以参考 Hive 源码中 concat 内置函数的实现方式;下面的实现使用 HashMap 存放不定长的函数入参。

二、代码实现:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.jasper.tagplugins.jstl.core.ForEach;
import org.apache.log4j.Logger;

/**
 * @author zhifeiji
 * @file_name GenericUDFTableCompair.java
 * @date 2020年11月9日
 */
@Description(name="GenericUDFTableCompair",
value="_FUNC_(tab1.col1,tab2.col1,tab1.col2,tab2.col2,...) -Returns delete,unchange,insert,unknow",
extended ="Example:\n"+"select GenericUDFTableCompair(a.name,b.name,a.addr,b.addr) from test_old a "+
"full join test_new b "+
"on a.id=b.id;"
)
public class GenericUDFTableCompair extends GenericUDF{
	public static final Logger LOG = Logger.getLogger(GenericUDFTableCompair.class);

	/**
	 * ObjectInspectors for the variable-length argument list, keyed by argument
	 * position. A HashMap is used so the UDF can accept any (even) number of
	 * columns, the same way concat accepts a variable number of arguments.
	 */
	private final Map<Integer, StringObjectInspector> hashmapOI = new HashMap<Integer, StringObjectInspector>();

	/**
	 * Per-row classification of each (old,new) column pair:
	 * "newColNull" / "oldColNull" / "unchange". Cleared at the start of every
	 * evaluate() call so state can never leak between rows.
	 */
	private final Map<Integer, String> hpRet = new HashMap<Integer, String>();

	// Reusable output holder.
	private Text text = null;

	/**
	 * Validates the argument list and records the per-argument inspectors.
	 * Arguments must come in (old,new) pairs of primitive string columns.
	 *
	 * @param arguments inspectors for the call-site arguments
	 * @return a writable-string inspector: the UDF emits one label per row
	 * @throws UDFArgumentException if the count is zero/odd or a type is not a string
	 */
	@Override
	public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
		// Columns are compared pairwise, so require at least one pair and an even count.
		if (arguments.length == 0 || arguments.length % 2 != 0) {
			throw new UDFArgumentException("The number of arguments must be even!");
		}
		hashmapOI.clear();
		for (int num = 0; num < arguments.length; num++) {
			// Check the category BEFORE casting: a non-string argument must surface
			// as a clear UDFArgumentException, not a ClassCastException.
			if (!arguments[num].getCategory().equals(Category.PRIMITIVE)) {
				throw new UDFArgumentException("The arguments must be PRIMITIVE!");
			}
			if (!(arguments[num] instanceof StringObjectInspector)) {
				throw new UDFArgumentException("Argument " + (num + 1) + " must be a string column!");
			}
			hashmapOI.put(num, (StringObjectInspector) arguments[num]);
		}
		// Return type: a single label string per row.
		return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
	}

	/**
	 * Classifies one full-outer-join row. Arguments are interleaved as
	 * (old1,new1,old2,new2,...). Returns:
	 * "update"   - any pair differs (both sides non-blank),
	 * "delete"   - every new-side column is null/blank (row missing from new table),
	 * "insert"   - every old-side column is null/blank (row missing from old table),
	 * "unchange" - every pair is equal,
	 * otherwise a fallback "Unknow!..." marker (mixed null pattern).
	 */
	@Override
	public Object evaluate(DeferredObject[] arguments) throws HiveException {
		// Reset per-row state; the same UDF instance is reused across rows.
		hpRet.clear();
		final int pairs = arguments.length / 2;
		for (int num = 0; num < pairs; num++) {
			// Use the inspector that matches each argument's own position
			// (the original code reused index `num` for both sides of the pair).
			String oldCol = hashmapOI.get(2 * num).getPrimitiveJavaObject(arguments[2 * num].get());
			String newCol = hashmapOI.get(2 * num + 1).getPrimitiveJavaObject(arguments[2 * num + 1].get());

			// Compare the pair. Note: when BOTH sides are null/blank the pair is
			// labelled "newColNull", preserving the original precedence.
			if (newCol == null || newCol.trim().length() == 0) {
				hpRet.put(num, "newColNull");
			} else if (oldCol == null || oldCol.trim().length() == 0) {
				hpRet.put(num, "oldColNull");
			} else if (!newCol.trim().equals(oldCol.trim())) {
				// Any single differing pair makes the whole row an update.
				text = new Text("update");
				return text;
			} else {
				hpRet.put(num, "unchange");
			}
		}

		// Tally the pair labels; compare with equals(), never ==.
		int deleteCnt = 0;
		int insertCnt = 0;
		int unchangeCnt = 0;
		for (int num = 0; num < pairs; num++) {
			String label = hpRet.get(num);
			LOG.info(label);
			if ("newColNull".equals(label)) {
				++deleteCnt;
			}
			if ("oldColNull".equals(label)) {
				++insertCnt;
			}
			if ("unchange".equals(label)) {
				++unchangeCnt;
			}
		}
		LOG.info("size=====" + pairs);
		LOG.info("deleteCnt=====" + deleteCnt);
		LOG.info("insertCnt=====" + insertCnt);
		LOG.info("unchangeCnt=====" + unchangeCnt);

		// A row is delete/insert/unchange only when ALL pairs agree on the label.
		if (deleteCnt == pairs) {
			text = new Text("delete");
		} else if (insertCnt == pairs) {
			text = new Text("insert");
		} else if (unchangeCnt == pairs) {
			text = new Text("unchange");
		} else {
			// Mixed null pattern (e.g. some new columns blank, others unchanged).
			text = new Text("Unknow!Please contact the developer!");
		}
		return text;
	}

	/**
	 * Human-readable rendering of the call for EXPLAIN output,
	 * e.g. GenericUDFTableCompair(a.name, b.name).
	 */
	@Override
	public String getDisplayString(String[] children) {
		assert (children.length > 0);
		StringBuilder sb = new StringBuilder();
		sb.append("GenericUDFTableCompair(");
		for (int i = 0; i < children.length; i++) {
			if (i > 0) {
				sb.append(", ");
			}
			sb.append(children[i]);
		}
		sb.append(")");
		return sb.toString();
	}

}

三、验证函数

1.打包上传centos;

2.启动hadoop和hive,启动hive之前删除hive.log日志,方便查看日志调试,hive.log路径可配置;

3.将上传到centos本地的jar包在hive命令行中加载至分布式缓存,并注册为临时函数;

add jar /home/hadoop/GenericUDFTableCompair.jar;
create temporary function GenericUDFTableCompair as 'com.paic.gbd.udfarray.GenericUDFTableCompair';

4.验证函数功能。

两表数据如下

使用函数,对增删改和不变的数据打上标签

select a.name,b.name,a.addr,b.addr,GenericUDFTableCompair(a.name,b.name,a.addr,b.addr) flag from test_old a 
full join test_new b 
on a.id=b.id
;

结果如下

 

 

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值