Hive Built-in Function hash() – Source Code Analysis
When computing hash values, Hive's hash() supports arguments of many different types, and any number of them (for example, hash(col1, col2)). So how is this implemented under the hood?
Let's start with what the source code of the built-in hash() function looks like.
The hash built-in function is implemented in the class:
org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash
1. The initialize method
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentTypeException {
  argumentOIs = arguments;
  return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
}
During initialization no validation is performed on the arguments at all: the method simply stores the ObjectInspectors and returns a writable int inspector. In other words, the analysis/compile phase always passes, no matter what arguments you hand to hash().
2. The evaluate method
private final IntWritable result = new IntWritable();

@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
  Object[] fieldValues = new Object[arguments.length];
  // pull the value out of each argument
  for (int i = 0; i < arguments.length; i++) {
    fieldValues[i] = arguments[i].get();
  }
  int r = ObjectInspectorUtils.getBucketHashCode(fieldValues, argumentOIs);
  result.set(r);
  return result;
}
As you can see, evaluate receives an array of arguments, which is why hash() can take any number of parameters. The actual work is delegated to a single call:
int r = ObjectInspectorUtils.getBucketHashCode(fieldValues, argumentOIs);
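To see initialize and evaluate working together, here is a minimal test harness, a sketch only: the class name HashUdfDemo and the sample values 'abc' and 7 are mine, while the Hive classes are the ones quoted above. It drives GenericUDFHash directly, roughly equivalent to SELECT hash('abc', 7):

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class HashUdfDemo {
  public static void main(String[] args) throws Exception {
    GenericUDFHash udf = new GenericUDFHash();

    // initialize: no argument validation, always advertises an int return type
    udf.initialize(new ObjectInspector[] {
        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
        PrimitiveObjectInspectorFactory.javaIntObjectInspector
    });

    // evaluate: hand over the concrete values as deferred objects
    DeferredObject[] values = new DeferredObject[] {
        new DeferredJavaObject("abc"),
        new DeferredJavaObject(7)
    };
    System.out.println(udf.evaluate(values)); // prints the IntWritable holding the combined hash
  }
}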
The getBucketHashCode method
Let's step into this method and see what it actually does.
public static int getBucketHashCode(Object[] bucketFields, ObjectInspector[] bucketFieldInspectors) {
  int hashCode = 0;
  // walk over the passed-in objects, call hashCode on each one and combine the results
  for (int i = 0; i < bucketFields.length; ++i) {
    int fieldHash = hashCode(bucketFields[i], bucketFieldInspectors[i]);
    hashCode = 31 * hashCode + fieldHash;
  }
  return hashCode;
}
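The combining rule (multiply the running hash by 31, then add the next field's hash) is the same one java.util.List.hashCode() uses. Below is a self-contained sketch of just this combining step, with no Hive dependencies; the class and method names are mine:

public class BucketHashSketch {
  // same combining rule as getBucketHashCode, applied to pre-computed field hashes
  static int combine(int... fieldHashes) {
    int hashCode = 0;
    for (int fieldHash : fieldHashes) {
      hashCode = 31 * hashCode + fieldHash;
    }
    return hashCode;
  }

  public static void main(String[] args) {
    // with the version quoted here, hash('abc', 7) combines the per-field hashes like this:
    // the STRING hash of an ASCII string equals String.hashCode(), the INT hash is the value itself
    int h = combine("abc".hashCode(), 7);
    System.out.println(h); // 31 * 96354 + 7 = 2986981
  }
}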
The hashCode method
getBucketHashCode calls hashCode on every passed-in object, so how is hashCode itself written?
public static int hashCode(Object o, ObjectInspector objIns) {
  if (o == null) {
    return 0;
  } else {
    int r;
    ObjectInspector keyOI;
    int i;
    switch (objIns.getCategory()) {
    // Hive primitive types
    case PRIMITIVE:
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objIns;
      long a;
      switch (poi.getPrimitiveCategory()) {
      case VOID:
        return 0;
      case BOOLEAN:
        return ((BooleanObjectInspector) poi).get(o) ? 1 : 0;
      case BYTE:
        return ((ByteObjectInspector) poi).get(o);
      case SHORT:
        return ((ShortObjectInspector) poi).get(o);
      case INT:
        return ((IntObjectInspector) poi).get(o);
      case LONG:
        a = ((LongObjectInspector) poi).get(o);
        return (int) (a >>> 32 ^ a);
      case FLOAT:
        return Float.floatToIntBits(((FloatObjectInspector) poi).get(o));
      case DOUBLE:
        a = Double.doubleToLongBits(((DoubleObjectInspector) poi).get(o));
        return (int) (a >>> 32 ^ a);
      case STRING:
        // byte-wise 31-multiplier; same result as String.hashCode() for ASCII strings
        Text t = ((StringObjectInspector) poi).getPrimitiveWritableObject(o);
        r = 0;
        for (i = 0; i < t.getLength(); ++i) {
          r = r * 31 + t.getBytes()[i];
        }
        return r;
      case CHAR:
        return ((HiveCharObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
      case VARCHAR:
        return ((HiveVarcharObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
      case BINARY:
        return ((BinaryObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
      case DATE:
        return ((DateObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
      case TIMESTAMP:
        TimestampWritable ts = ((TimestampObjectInspector) poi).getPrimitiveWritableObject(o);
        return ts.hashCode();
      case INTERVAL_YEAR_MONTH:
        HiveIntervalYearMonthWritable intervalYearMonth =
            ((HiveIntervalYearMonthObjectInspector) poi).getPrimitiveWritableObject(o);
        return intervalYearMonth.hashCode();
      case INTERVAL_DAY_TIME:
        HiveIntervalDayTimeWritable intervalDayTime =
            ((HiveIntervalDayTimeObjectInspector) poi).getPrimitiveWritableObject(o);
        return intervalDayTime.hashCode();
      case DECIMAL:
        return ((HiveDecimalObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
      default:
        throw new RuntimeException("Unknown type: " + poi.getPrimitiveCategory());
      }
    // LIST (array) type
    case LIST:
      r = 0;
      ListObjectInspector listOI = (ListObjectInspector) objIns;
      keyOI = listOI.getListElementObjectInspector();
      for (i = 0; i < listOI.getListLength(o); ++i) {
        r = 31 * r + hashCode(listOI.getListElement(o, i), keyOI);
      }
      return r;
    // MAP type
    case MAP:
      r = 0;
      MapObjectInspector mapOI = (MapObjectInspector) objIns;
      keyOI = mapOI.getMapKeyObjectInspector();
      ObjectInspector valueOI = mapOI.getMapValueObjectInspector();
      Map<?, ?> map = mapOI.getMap(o);
      // entries are combined with +, so the result does not depend on map order
      for (Entry<?, ?> entry : map.entrySet()) {
        r += hashCode(entry.getKey(), keyOI) ^ hashCode(entry.getValue(), valueOI);
      }
      return r;
    // STRUCT type
    case STRUCT:
      r = 0;
      StructObjectInspector structOI = (StructObjectInspector) objIns;
      List<? extends StructField> fields = structOI.getAllStructFieldRefs();
      for (StructField field : fields) {
        r = 31 * r + hashCode(structOI.getStructFieldData(o, field), field.getFieldObjectInspector());
      }
      return r;
    // UNION type
    case UNION:
      UnionObjectInspector uOI = (UnionObjectInspector) objIns;
      byte tag = uOI.getTag(o);
      return hashCode(uOI.getField(o), (ObjectInspector) uOI.getObjectInspectors().get(tag));
    default:
      throw new RuntimeException("Unknown type: " + objIns.getTypeName());
    }
  }
}
As you can see, hashCode looks at the category reported by each argument's ObjectInspector, computes a type-specific hash value, and returns it.
The data types supported by hash() therefore include the primitive types as well as LIST (array), MAP, STRUCT and UNION. Passing any other type results in a RuntimeException:
("Unknown type: " + objIns.getTypeName())
TIPS: in Hive, hash() can be applied to values of all the types above, but be careful when porting queries to Spark: Spark SQL ships its own hash() (a Murmur3-based implementation), so it does not return the same values as Hive's hash(), and code that assumes the two match will misbehave.

To summarize: this article walked through Hive's built-in hash() function, from the initialize method to the evaluate method, and in particular the details of getBucketHashCode and hashCode. The source code shows that hash() accepts arguments of many types and computes its result through ObjectInspectorUtils.getBucketHashCode, handling each data type differently. Keep in mind that the same function name behaves differently in Spark.