Hi everyone, I'm 老六. Today let's walk through the source code of concat, one of Hive's built-in functions.
1. Function Usage
Official Hive UDF documentation: LanguageManual UDF - Apache Hive - Apache Software Foundation (https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF).
Per that documentation, concat(str1, str2, ... strN) returns the concatenation of its string arguments; with binary arguments, concat(bin1, bin2, ... binN) returns the concatenation of the underlying bytes. It returns NULL if any argument is NULL.
2. Usage Example
The function is straightforward to use. A minimal example, adapted from the example in the UDF's built-in documentation (shown in the @Description annotation below):
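SELECT concat('abc', 'def');
SELECT concat('abc', NULL, 'def');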
Result:
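abcdef
NULL

The second query returns NULL because concat returns NULL whenever any argument is NULL.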
3. Source Code Analysis

The class below is GenericUDFConcat, from the org.apache.hadoop.hive.ql.udf.generic package of the Hive source tree.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.generic;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupConcatColCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatStringScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatCharScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatVarCharScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.CharScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VarCharScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter.StringConverter;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.BytesWritable;
/**
* GenericUDFConcat.
*/
@Description(name = "concat",
value = "_FUNC_(str1, str2, ... strN) - returns the concatenation of str1, str2, ... strN or "+
"_FUNC_(bin1, bin2, ... binN) - returns the concatenation of bytes in binary data " +
" bin1, bin2, ... binN",
extended = "Returns NULL if any argument is NULL.\n"
+ "Example:\n"
+ " > SELECT _FUNC_('abc', 'def') FROM src LIMIT 1;\n"
+ " 'abcdef'")
@VectorizedExpressions({StringGroupConcatColCol.class,
StringGroupColConcatStringScalar.class,
StringGroupColConcatCharScalar.class, StringGroupColConcatVarCharScalar.class,
StringScalarConcatStringGroupCol.class,
CharScalarConcatStringGroupCol.class, VarCharScalarConcatStringGroupCol.class})
public class GenericUDFConcat extends GenericUDF {
  private transient ObjectInspector[] argumentOIs;
  private transient StringConverter[] stringConverters;
  private transient PrimitiveCategory returnType = PrimitiveCategory.STRING;
  private transient BytesWritable[] bw;
  private transient GenericUDFUtils.StringHelper returnHelper;

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    // Loop through all the inputs to determine the appropriate return type/length.
    // Return type:
    //   All CHAR inputs: return CHAR
    //   All VARCHAR inputs: return VARCHAR
    //   All CHAR/VARCHAR inputs: return VARCHAR
    //   All BINARY inputs: return BINARY
    //   Otherwise return STRING
    argumentOIs = arguments;
    PrimitiveCategory currentCategory;
    PrimitiveObjectInspector poi;
    boolean fixedLengthReturnValue = true;
    int returnLength = 0; // Only for char/varchar return types
    for (int idx = 0; idx < arguments.length; ++idx) {
      if (arguments[idx].getCategory() != Category.PRIMITIVE) {
        throw new UDFArgumentException("CONCAT only takes primitive arguments");
      }
      poi = (PrimitiveObjectInspector) arguments[idx];
      currentCategory = poi.getPrimitiveCategory();
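      // The first argument's category seeds the return type; later arguments may widen it.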
      if (idx == 0) {
        returnType = currentCategory;
      }
      switch (currentCategory) {
        case BINARY:
          fixedLengthReturnValue = false;
          if (returnType != currentCategory) {
            // mix of binary/non-binary args
            returnType = PrimitiveCategory.STRING;
          }
          break;
        case CHAR:
        case VARCHAR:
          if (!fixedLengthReturnValue) {
            returnType = PrimitiveCategory.STRING;
          }
          if (fixedLengthReturnValue && currentCategory == PrimitiveCategory.VARCHAR) {
            returnType = PrimitiveCategory.VARCHAR;
          }
          break;
        default:
          returnType = PrimitiveCategory.STRING;
          fixedLengthReturnValue = false;
          break;
      }
      // If all arguments are of known length then we can keep track of the max
      // length of the return type. However if the return length exceeds the
      // max length for the char/varchar, then the return type reverts to string.
      if (fixedLengthReturnValue) {
        returnLength += GenericUDFUtils.StringHelper.getFixedStringSizeForType(poi);
        if ((returnType == PrimitiveCategory.VARCHAR
                && returnLength > HiveVarchar.MAX_VARCHAR_LENGTH)
            || (returnType == PrimitiveCategory.CHAR
                && returnLength > HiveChar.MAX_CHAR_LENGTH)) {
          returnType = PrimitiveCategory.STRING;
          fixedLengthReturnValue = false;
        }
      }
    }
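    // All arguments were BINARY: concatenate the raw bytes, no string conversion needed.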
    if (returnType == PrimitiveCategory.BINARY) {
      bw = new BytesWritable[arguments.length];
      return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector;
    } else {
      // treat all inputs as string, the return value will be converted to the appropriate type.
      createStringConverters();
      returnHelper = new GenericUDFUtils.StringHelper(returnType);
      BaseCharTypeInfo typeInfo;
      switch (returnType) {
        case STRING:
          return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
        case CHAR:
          typeInfo = TypeInfoFactory.getCharTypeInfo(returnLength);
          return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo);
        case VARCHAR:
          typeInfo = TypeInfoFactory.getVarcharTypeInfo(returnLength);
          return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo);
        default:
          throw new UDFArgumentException("Unexpected CONCAT return type of " + returnType);
      }
    }
  }

  private void createStringConverters() {
    stringConverters = new StringConverter[argumentOIs.length];
    for (int idx = 0; idx < argumentOIs.length; ++idx) {
      stringConverters[idx] = new StringConverter((PrimitiveObjectInspector) argumentOIs[idx]);
    }
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    if (returnType == PrimitiveCategory.BINARY) {
      return binaryEvaluate(arguments);
    } else {
      return returnHelper.setReturnValue(stringEvaluate(arguments));
    }
  }

  public Object binaryEvaluate(DeferredObject[] arguments) throws HiveException {
    int len = 0;
    for (int idx = 0; idx < arguments.length; ++idx) {
      bw[idx] = ((BinaryObjectInspector) argumentOIs[idx])
          .getPrimitiveWritableObject(arguments[idx].get());
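      // SQL semantics: a single NULL argument makes the whole result NULL.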
      if (bw[idx] == null) {
        return null;
      }
      len += bw[idx].getLength();
    }
    byte[] out = new byte[len];
    int curLen = 0;
    // Need to iterate twice since BytesWritable doesn't support append.
    for (BytesWritable bytes : bw) {
      System.arraycopy(bytes.getBytes(), 0, out, curLen, bytes.getLength());
      curLen += bytes.getLength();
    }
    return new BytesWritable(out);
  }

  public String stringEvaluate(DeferredObject[] arguments) throws HiveException {
    StringBuilder sb = new StringBuilder();
    for (int idx = 0; idx < arguments.length; ++idx) {
      String val = null;
      if (arguments[idx] != null) {
        val = (String) stringConverters[idx].convert(arguments[idx].get());
      }
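      // A null converted value means the argument was NULL; like the binary path,
      // concat short-circuits and returns NULL.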
      if (val == null) {
        return null;
      }
      sb.append(val);
    }
    return sb.toString();
  }
}
4. Summary
The methods to focus on are initialize(), evaluate(), binaryEvaluate(), and stringEvaluate(). initialize() does the preparation: it validates that every argument is a primitive type, sets up the string converters, and works out the return type, laying the groundwork for the evaluation phase. evaluate() contains the dispatch logic: depending on the return type resolved during initialization, it delegates to either binaryEvaluate() or stringEvaluate(), and those two methods implement the actual concatenation. Both are fairly simple and worth reading through on your own.
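To tie these pieces together, here is a minimal test-harness sketch of my own (the class name ConcatDemo is invented; it assumes hive-exec and its dependencies are on the classpath). It drives the UDF directly in the same two-phase way the Hive runtime does: initialize() once with the argument ObjectInspectors, then evaluate() per row.

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ConcatDemo {
  public static void main(String[] args) throws HiveException {
    GenericUDFConcat udf = new GenericUDFConcat();

    // Two STRING arguments: initialize() resolves the return type to STRING.
    ObjectInspector[] stringArgs = {
        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
        PrimitiveObjectInspectorFactory.javaStringObjectInspector
    };
    udf.initialize(stringArgs);

    // evaluate() returns a writable Text; printing it gives "abcdef".
    DeferredObject[] row = {
        new DeferredJavaObject("abc"), new DeferredJavaObject("def")
    };
    System.out.println(udf.evaluate(row));

    // A NULL argument makes the whole result NULL.
    DeferredObject[] rowWithNull = {
        new DeferredJavaObject("abc"), new DeferredJavaObject(null)
    };
    System.out.println(udf.evaluate(rowWithNull)); // prints "null"

    // Mixed CHAR/VARCHAR inputs: the return type widens to VARCHAR and the
    // declared lengths are summed, so this prints "varchar(5)".
    ObjectInspector[] charArgs = {
        PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
            TypeInfoFactory.getVarcharTypeInfo(2)),
        PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
            TypeInfoFactory.getCharTypeInfo(3))
    };
    System.out.println(new GenericUDFConcat().initialize(charArgs).getTypeName());
  }
}

The first two calls exercise the string path; the last call runs only initialize(), to show the char/varchar length bookkeeping from the loop in initialize() resolving to varchar(5).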