import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
@Description(name = "redis_batch_hset",
        value = "_FUNC_(host_and_port, keyField, array<map<string,string>>) - batch HSET the maps into Redis, returns 1"
)
public class RedisBatchHSetUDF extends GenericUDF {
private HostAndPort hostAndPort;
private String keyField;
private Object writableKeyField; // actually an org.apache.hadoop.io.Text instance
private StandardListObjectInspector paramsListInspector;
private StandardMapObjectInspector paramsElementInspector;
@Override
public Object evaluate(DeferredObject[] arg0) throws HiveException {
try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000);
Pipeline pipeline = jedis.pipelined()
) {
for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
Object row = paramsListInspector.getListElement(arg0[2].get(), i);
Map<?, ?> map = paramsElementInspector.getMap(row);
// Object obj = ObjectInspectorUtils.copyToStandardJavaObject(row, paramsElementInspector); // convert to a plain Java map; otherwise the keys/values are Hadoop Writable objects
if (map.containsKey(writableKeyField)) {
String did = map.get(writableKeyField).toString();
Map<String, String> data = new HashMap<>();
for (Map.Entry<?, ?> entry : map.entrySet()) {
if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null && !"".equals(entry.getValue().toString())) {
data.put(entry.getKey().toString(), entry.getValue().toString());
}
}
pipeline.hmset(did, data); // queue one HMSET per row; commands are sent when pipeline.sync() is called
}
}
pipeline.sync();
return new IntWritable(1);
} catch (IOException e) {
e.printStackTrace();
throw new HiveException(e);
}
}
@Override
public String getDisplayString(String[] arg0) {
return "redis_batch_hset(redishost_and_port,keyField, array>)";
}
@Override
public ObjectInspector initialize(ObjectInspector[] arg0)
throws UDFArgumentException {
if (arg0.length != 3) {
throw new UDFArgumentException(" Expecting two arguments: array> ");
}
// validate the first argument: redis host:port, must be a string constant
if (arg0[0].getCategory() == Category.PRIMITIVE
&& ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
if (!(arg0[0] instanceof ConstantObjectInspector)) {
throw new UDFArgumentException("redis host:port must be constant");
}
ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
}
// validate the second argument: the key field name, must be a string constant
if (arg0[1].getCategory() == Category.PRIMITIVE
&& ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
if (!(arg0[1] instanceof ConstantObjectInspector)) {
throw new UDFArgumentException("redis hset key must be constant");
}
ConstantObjectInspector keyFieldOI = (ConstantObjectInspector) arg0[1];
keyField = keyFieldOI.getWritableConstantValue().toString();
writableKeyField = keyFieldOI.getWritableConstantValue();
}
// validate the third argument: must be an array of maps
if (arg0[2].getCategory() != Category.LIST) {
throw new UDFArgumentException(" Expecting an array> field as third argument ");
}
ListObjectInspector third = (ListObjectInspector) arg0[2];
if (third.getListElementObjectInspector().getCategory() != Category.MAP) {
throw new UDFArgumentException(" Expecting an array> field as third argument ");
}
paramsListInspector = ObjectInspectorFactory.getStandardListObjectInspector(third.getListElementObjectInspector());
paramsElementInspector = (StandardMapObjectInspector) third.getListElementObjectInspector();
System.out.println(paramsElementInspector.getMapKeyObjectInspector().getCategory());
System.out.println(paramsElementInspector.getMapValueObjectInspector().getCategory());
return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
}
}
How to register the UDF is not covered here; see the earlier MySQL UDF article.
The usage is fairly strict:
SELECT g,xydb.redis_batch_hset('192.168.78.87:6379','did',collect_list(map('did',concat('xyzs.profile.',did),'sex',sex,'age',age))) AS result
FROM
(
SELECT did,pmod(abs(hash(did)),1000) AS g,sex,age FROM data_mining.adl_xyzs_user_profile_day_new WHERE ds='2017-08-02' AND isnotnull(sex) AND isnotnull(age)
) a
GROUP BY g;
redis_batch_hset(redishost_and_port, keyField, array<map<string,string>>)
The first argument is the host:port of the Redis instance to connect to. Our Redis has no password and uses db0 by default, so a single host:port is enough; otherwise adjust the code yourself (a sketch of that change follows the argument descriptions below).
The third argument is the data to write: an array whose elements are maps.
The second argument is the name of the field, within each map of the third argument, that is used as the Redis hset key. In the example this is did; the field name must exist in each map, otherwise that row is ignored.
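For a Redis that requires a password or a non-default db, one possible change (a sketch only, not part of the original code; the password and db index below are hypothetical placeholders) is to authenticate and select the db before creating the pipeline inside evaluate():
// Sketch: authenticate and pick a db before the pipeline is created.
// "yourPassword" and db index 1 are placeholders for illustration only.
try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000)) {
    jedis.auth("yourPassword"); // omit if Redis has no password
    jedis.select(1);            // use the desired db instead of the default db0
    Pipeline pipeline = jedis.pipelined();
    // ... the existing hmset loop and pipeline.sync() stay unchanged ...
}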
did is the unique user identifier. Since the daily volume is fairly large (over 300,000 dids to write per day), the rows can be grouped by did mod N; here I split them into 1000 groups with pmod(abs(hash(did)),1000), so each group (and thus each array passed to the UDF) holds only a few hundred rows, and each group is written to Redis through a pipeline, using batched submission to improve write performance.
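If a single array ever became too large for one pipelined flush, the same idea could be pushed inside the write loop by syncing the pipeline every fixed number of rows. A standalone sketch of that pattern (a hypothetical helper, not part of the UDF above; the batch size is an assumption):
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;
public class PipelinedHsetWriter {
    // Writes each row as a Redis hash keyed by row.get(keyField),
    // syncing the pipeline every batchSize rows to keep memory bounded.
    public static void write(String host, int port, String keyField,
                             List<Map<String, String>> rows, int batchSize) {
        try (Jedis jedis = new Jedis(host, port, 10000, 60000)) {
            Pipeline pipeline = jedis.pipelined();
            int pending = 0;
            for (Map<String, String> row : rows) {
                String key = row.get(keyField);
                if (key == null) {
                    continue; // same behaviour as the UDF: skip rows missing the key field
                }
                Map<String, String> data = new HashMap<>(row);
                data.remove(keyField); // do not store the key field itself in the hash
                pipeline.hmset(key, data);
                if (++pending % batchSize == 0) {
                    pipeline.sync(); // flush a full batch of queued commands
                }
            }
            pipeline.sync(); // flush whatever is left
        }
    }
}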
Finally, a simple test case:
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Test;
public class RedisBatchHsetUDFTest {
private RedisBatchHSetUDF udf;
@Before
public void before() {
udf = new RedisBatchHSetUDF();
}
@Test
public void test() throws HiveException {
Text host = new Text("name87:6379");
Text keyField = new Text("did");
ObjectInspector hostAndPort = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, host);
ObjectInspector key = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, keyField);
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("map");
StandardMapObjectInspector elementOI = (StandardMapObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
StandardListObjectInspector listIO = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
ObjectInspector resultoi = udf.initialize(new ObjectInspector[]{hostAndPort, key, listIO});
Object list = listIO.create(1);
Object row = elementOI.create();
elementOI.put(row, new Text("did"), new Text("xiaojuntest"));
elementOI.put(row, new Text("sex"), new IntWritable(1));
elementOI.put(row, new Text("age"), new Text("85"));
listIO.set(list, 0, row);
Object result = udf.evaluate(new GenericUDF.DeferredObject[]{new GenericUDF.DeferredJavaObject(host), new GenericUDF.DeferredJavaObject(keyField), new GenericUDF.DeferredJavaObject(list)});
System.out.println(result);
}
// @Test
// public void test2() {
// TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("array>");
// StandardListObjectInspector objInsp = (StandardListObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
// System.out.println(objInsp);
// }
}