import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
@Description(name = "redis_batch_hset",
        value = "_FUNC_(host_and_port, keyField, array<map<string,string>>) - batch HSET the maps into Redis, returns 1"
)
public class RedisBatchHSetUDF extends GenericUDF {
private HostAndPort hostAndPort;
private String keyField;
private Object writableKeyField; // actually an org.apache.hadoop.io.Text instance
private StandardListObjectInspector paramsListInspector;
private StandardMapObjectInspector paramsElementInspector;
@Override
public Object evaluate(DeferredObject[] arg0) throws HiveException {
try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000);
Pipeline pipeline = jedis.pipelined()
) {
for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
Object row = paramsListInspector.getListElement(arg0[2].get(), i);
Map<?, ?> map = paramsElementInspector.getMap(row);
// Object obj = ObjectInspectorUtils.copyToStandardJavaObject(row, paramsElementInspector); // convert to a plain Java map; otherwise the keys/values are Hadoop Writable objects
if (map.containsKey(writableKeyField)) {
String did = map.get(writableKeyField).toString();
Map<String, String> data = new HashMap<>();
for (Map.Entry<?, ?> entry : map.entrySet()) {
if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null && !"".equals(entry.getValue().toString())) {
data.put(entry.getKey().toString(), entry.getValue().toString());
}
}
pipeline.hmset(did, data); // queue one HMSET per row; commands are sent when pipeline.sync() is called
}
}
pipeline.sync();
return new IntWritable(1);
} catch (IOException e) {
e.printStackTrace();
throw new HiveException(e);
}
}
@Override
public String getDisplayString(String[] arg0) {
return "redis_batch_hset(redishost_and_port,keyField, array>)";
}
@Override
public ObjectInspector initialize(ObjectInspector[] arg0)
throws UDFArgumentException {
if (arg0.length != 3) {
throw new UDFArgumentException(" Expecting two arguments: array> ");
}
// validate the first argument: redis host:port, must be a string constant
if (arg0[0].getCategory() == Category.PRIMITIVE
&& ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
if (!(arg0[0] instanceof ConstantObjectInspector)) {
throw new UDFArgumentException("redis host:port must be constant");
}
ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
}
// validate the second argument: the key field name, must be a string constant
if (arg0[1].getCategory() == Category.PRIMITIVE
&& ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
if (!(arg0[1] instanceof ConstantObjectInspector)) {
throw new UDFArgumentException("redis hset key must be constant");
}
ConstantObjectInspector keyFieldOI = (ConstantObjectInspector) arg0[1];
keyField = keyFieldOI.getWritableConstantValue().toString();
writableKeyField = keyFieldOI.getWritableConstantValue();
}
// validate the third argument: must be an array of maps
if (arg0[2].getCategory() != Category.LIST) {
throw new UDFArgumentException(" Expecting an array> field as third argument ");
}
ListObjectInspector third = (ListObjectInspector) arg0[2];
if (third.getListElementObjectInspector().getCategory() != Category.MAP) {
throw new UDFArgumentException(" Expecting an array> field as third argument ");
}
paramsListInspector = ObjectInspectorFactory.getStandardListObjectInspector(third.getListElementObjectInspector());
paramsElementInspector = (StandardMapObjectInspector) third.getListElementObjectInspector();
System.out.println(paramsElementInspector.getMapKeyObjectInspector().getCategory());
System.out.println(paramsElementInspector.getMapValueObjectInspector().getCategory());
return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
}
}
How to register the UDF is not covered here; see the earlier MySQL UDF article.
The usage is fairly strict:
SELECT g,xydb.redis_batch_hset('192.168.78.87:6379','did',collect_list(map('did',concat('xyzs.profile.',did),'sex',sex,'age',age))) AS result
FROM
(
SELECT did,pmod(abs(hash(did)),1000) AS g,sex,age FROM data_mining.adl_xyzs_user_profile_day_new WHERE ds='2017-08-02' AND isnotnull(sex) AND isnotnull(age)
) a
GROUP BY g;
redis_batch_hset(redishost_and_port, keyField, array<map<string,string>>)
The first argument is the host:port of the Redis instance to connect to. Our Redis has no password and uses db0 by default, so a single host:port is enough; otherwise adjust the code yourself (a sketch of that change follows the argument descriptions below).
The third argument is the data to write: an array whose elements are maps.
The second argument is the name of the field, within each map of the third argument, that is used as the Redis hset key. In the example this is did; the field name must exist in each map, otherwise that row is ignored.
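For a Redis that requires a password or a non-default db, one possible change (a sketch only, not part of the original code; the password and db index below are hypothetical placeholders) is to authenticate and select the db before creating the pipeline inside evaluate():
// Sketch: authenticate and pick a db before the pipeline is created.
// "yourPassword" and db index 1 are placeholders for illustration only.
try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000)) {
    jedis.auth("yourPassword"); // omit if Redis has no password
    jedis.select(1);            // use the desired db instead of the default db0
    Pipeline pipeline = jedis.pipelined();
    // ... the existing hmset loop and pipeline.sync() stay unchanged ...
}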
did is the unique user identifier. Since the daily volume is fairly large (over 300,000 dids to write per day), the rows can be grouped by did mod N; here I split them into 1000 groups with pmod(abs(hash(did)),1000), so each group (and thus each array passed to the UDF) holds only a few hundred rows, and each group is written to Redis through a pipeline, using batched submission to improve write performance.
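If a single array ever became too large for one pipelined flush, the same idea could be pushed inside the write loop by syncing the pipeline every fixed number of rows. A standalone sketch of that pattern (a hypothetical helper, not part of the UDF above; the batch size is an assumption):
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;
public class PipelinedHsetWriter {
    // Writes each row as a Redis hash keyed by row.get(keyField),
    // syncing the pipeline every batchSize rows to keep memory bounded.
    public static void write(String host, int port, String keyField,
                             List<Map<String, String>> rows, int batchSize) {
        try (Jedis jedis = new Jedis(host, port, 10000, 60000)) {
            Pipeline pipeline = jedis.pipelined();
            int pending = 0;
            for (Map<String, String> row : rows) {
                String key = row.get(keyField);
                if (key == null) {
                    continue; // same behaviour as the UDF: skip rows missing the key field
                }
                Map<String, String> data = new HashMap<>(row);
                data.remove(keyField); // do not store the key field itself in the hash
                pipeline.hmset(key, data);
                if (++pending % batchSize == 0) {
                    pipeline.sync(); // flush a full batch of queued commands
                }
            }
            pipeline.sync(); // flush whatever is left
        }
    }
}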
Finally, a simple test case:
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Test;
public class RedisBatchHsetUDFTest {
private RedisBatchHSetUDF udf;
@Before
public void before() {
udf = new RedisBatchHSetUDF();
}
@Test
public void test() throws HiveException {
Text host = new Text("name87:6379");
Text keyField = new Text("did");
ObjectInspector hostAndPort = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, host);
ObjectInspector key = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, keyField);
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("map");
StandardMapObjectInspector elementOI = (StandardMapObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
StandardListObjectInspector listIO = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
ObjectInspector resultoi = udf.initialize(new ObjectInspector[]{hostAndPort, key, listIO});
Object list = listIO.create(1);
Object row = elementOI.create();
elementOI.put(row, new Text("did"), new Text("xiaojuntest"));
elementOI.put(row, new Text("sex"), new IntWritable(1));
elementOI.put(row, new Text("age"), new Text("85"));
listIO.set(list, 0, row);
Object result = udf.evaluate(new GenericUDF.DeferredObject[]{new GenericUDF.DeferredJavaObject(host), new GenericUDF.DeferredJavaObject(keyField), new GenericUDF.DeferredJavaObject(list)});
System.out.println(result);
}
// @Test
// public void test2() {
// TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("array>");
// StandardListObjectInspector objInsp = (StandardListObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
// System.out.println(objInsp);
// }
}