Hive UDF: Batch Writing to Redis

This article presents a custom Hive UDF (user-defined function), redis_batch_hset, for batch-writing data into Redis. The UDF takes three arguments: the Redis server's host and port, the field name to use as the HSET key, and an array of maps. Internally it uses the Jedis client with a Pipeline for efficient batch operations. A usage example and a unit test show how to group data in a Hive query and write it to Redis with this UDF.

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

@Description(name = "redis_batch_hset",
        value = "_FUNC_(host_and_port, keyField, array<map<string,string>>) - Return ret"
)
public class RedisBatchHSetUDF extends GenericUDF {

    private HostAndPort hostAndPort;
    private String keyField;
    private Object writableKeyField; // actually an org.apache.hadoop.io.Text instance
    private StandardListObjectInspector paramsListInspector;
    private StandardMapObjectInspector paramsElementInspector;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000);
             Pipeline pipeline = jedis.pipelined()
        ) {
            for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
                Object row = paramsListInspector.getListElement(arg0[2].get(), i);
                Map<?, ?> map = paramsElementInspector.getMap(row);
                // Object obj = ObjectInspectorUtils.copyToStandardJavaObject(row, paramsElementInspector); // convert to a plain Java map; otherwise keys/values are Hadoop Writable objects
                if (map.containsKey(writableKeyField)) {
                    String did = map.get(writableKeyField).toString();
                    Map<String, String> data = new HashMap<>();
                    for (Map.Entry<?, ?> entry : map.entrySet()) {
                        // skip the key field itself, plus null/empty values
                        if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null && !"".equals(entry.getValue().toString())) {
                            data.put(entry.getKey().toString(), entry.getValue().toString());
                        }
                    }
                    pipeline.hmset(did, data);
                }
            }
            pipeline.sync();
            return new IntWritable(1);
        } catch (IOException e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "redis_batch_hset(redis_host_and_port, keyField, array<map<string,string>>)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        if (arg0.length != 3) {
            throw new UDFArgumentException("Expecting three arguments: host_and_port, keyField, array<map<string,string>>");
        }
        // validate the first argument: a constant string "host:port"
        if (arg0[0].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[0] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis host:port must be constant");
            }
            ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
            String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
            hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
        }
        // validate the second argument: a constant string naming the key field
        if (arg0[1].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[1] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis hset key must be constant");
            }
            ConstantObjectInspector keyFieldOI = (ConstantObjectInspector) arg0[1];
            keyField = keyFieldOI.getWritableConstantValue().toString();
            writableKeyField = keyFieldOI.getWritableConstantValue();
        }
        // validate the third argument: array<map<string,string>>
        if (arg0[2].getCategory() != Category.LIST) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        ListObjectInspector third = (ListObjectInspector) arg0[2];
        if (third.getListElementObjectInspector().getCategory() != Category.MAP) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        paramsListInspector = ObjectInspectorFactory.getStandardListObjectInspector(third.getListElementObjectInspector());
        paramsElementInspector = (StandardMapObjectInspector) third.getListElementObjectInspector();
        System.out.println(paramsElementInspector.getMapKeyObjectInspector().getCategory());
        System.out.println(paramsElementInspector.getMapValueObjectInspector().getCategory());
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }
}

How to register the function is not covered here; see the earlier MySQL UDF article. For reference, a typical registration is sketched below.
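The jar path in this sketch is a placeholder for your environment; the class name matches the listing above (which declares no package), and the permanent-function form assumes the xydb database used in the query below:

ADD JAR hdfs:///path/to/redis-batch-hset-udf.jar;
CREATE TEMPORARY FUNCTION redis_batch_hset AS 'RedisBatchHSetUDF';

-- or, to match the xydb.redis_batch_hset call used below:
CREATE FUNCTION xydb.redis_batch_hset AS 'RedisBatchHSetUDF'
  USING JAR 'hdfs:///path/to/redis-batch-hset-udf.jar';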

The usage is fairly strict:

SELECT g,
       xydb.redis_batch_hset('192.168.78.87:6379', 'did',
           collect_list(map('did', concat('xyzs.profile.', did), 'sex', sex, 'age', age))) AS result
FROM
(
    SELECT did, pmod(abs(hash(did)), 1000) AS g, sex, age
    FROM data_mining.adl_xyzs_user_profile_day_new
    WHERE ds = '2017-08-02' AND isnotnull(sex) AND isnotnull(age)
) a
GROUP BY g;

redis_batch_hset(redis_host_and_port, keyField, array<map<string,string>>)

The first argument is the host:port of the Redis server to connect to. Our Redis has no password and uses the default db0, so a single host:port is enough; if your setup differs, adjust the code yourself.
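If your Redis does require authentication or a non-default database, a minimal sketch of how the write path could be adapted is shown below. The auth()/select() calls are standard Jedis API, but the password and db index are placeholders, not part of the original UDF:

import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

import java.util.Map;

public class RedisAuthSketch {

    // Hypothetical variant of the UDF's write path with password/db support.
    // The original UDF assumes no password and the default db0.
    static void writeWithAuth(String host, int port, String password, int dbIndex,
                              String key, Map<String, String> fields) {
        try (Jedis jedis = new Jedis(host, port, 10000, 60000)) {
            if (password != null && !password.isEmpty()) {
                jedis.auth(password); // only needed when requirepass is configured
            }
            jedis.select(dbIndex);    // the original code implicitly stays on db0
            Pipeline pipeline = jedis.pipelined();
            pipeline.hmset(key, fields); // the same call the UDF issues per row
            pipeline.sync();
        }
    }
}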

The third argument is the data to write: an array whose elements are maps.

The second argument is the field name, within each map of the third argument, whose value is used as the Redis HSET key; in the example it is did. The did field must exist in the map, otherwise that row is skipped.
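For example, with keyField = 'did', a map such as map('did','xyzs.profile.123','sex','1','age','25') (the did value here is hypothetical) makes the UDF issue HMSET xyzs.profile.123 sex 1 age 25: the did entry supplies the hash key and is not stored as a hash field, and null or empty values are dropped.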

did is the unique user identifier. Because the daily volume is fairly large (more than 300,000 dids to load per day), the rows can be grouped by a mod of did: here they are split into 1,000 groups, each written to Redis through its own pipeline. Each group then holds only a few hundred rows, and the pipelined batch submission improves Redis write throughput.

Finally, a simple unit test:

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Test;

public class RedisBatchHsetUDFTest {

    private RedisBatchHSetUDF udf;

    @Before
    public void before() {
        udf = new RedisBatchHSetUDF();
    }

    @Test
    public void test() throws HiveException {
        Text host = new Text("name87:6379");
        Text keyField = new Text("did");
        // constant object inspectors for the host:port and key field arguments
        ObjectInspector hostAndPort = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, host);
        ObjectInspector key = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, keyField);
        // object inspectors for the array<map<string,string>> argument
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("map<string,string>");
        StandardMapObjectInspector elementOI = (StandardMapObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
        StandardListObjectInspector listIO = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
        ObjectInspector resultoi = udf.initialize(new ObjectInspector[]{hostAndPort, key, listIO});

        // build one row: {did: xiaojuntest, sex: 1, age: 85}
        Object list = listIO.create(1);
        Object row = elementOI.create();
        elementOI.put(row, new Text("did"), new Text("xiaojuntest"));
        elementOI.put(row, new Text("sex"), new IntWritable(1));
        elementOI.put(row, new Text("age"), new Text("85"));
        listIO.set(list, 0, row);

        Object result = udf.evaluate(new GenericUDF.DeferredObject[]{new GenericUDF.DeferredJavaObject(host), new GenericUDF.DeferredJavaObject(keyField), new GenericUDF.DeferredJavaObject(list)});
        System.out.println(result);
    }

    // @Test
    // public void test2() {
    //     TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("array<map<string,string>>");
    //     StandardListObjectInspector objInsp = (StandardListObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    //     System.out.println(objInsp);
    // }
}
