Hi everyone, I'm 老六. Today let's walk through the source code of concat, one of Hive's built-in functions.
1. Function Usage
Official Hive UDF documentation: LanguageManual UDF - Apache Hive - Apache Software Foundation (https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF).
Per that documentation, concat(str1, str2, ... strN) returns the concatenation of its string arguments; with binary arguments, concat(bin1, bin2, ... binN) returns the concatenation of the underlying bytes. It returns NULL if any argument is NULL.
2. Usage Example
The function is straightforward to use. A minimal example, adapted from the example in the UDF's built-in documentation (shown in the @Description annotation below):
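SELECT concat('abc', 'def');
SELECT concat('abc', NULL, 'def');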
Result:
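abcdef
NULL

The second query returns NULL because concat returns NULL whenever any argument is NULL.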
3. Source Code Analysis

The class below is GenericUDFConcat, from the org.apache.hadoop.hive.ql.udf.generic package of the Hive source tree.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.generic;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupConcatColCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatStringScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatCharScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringGroupColConcatVarCharScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.CharScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VarCharScalarConcatStringGroupCol;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter.StringConverter;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.BytesWritable;
/**
* GenericUDFConcat.
*/
@Description(name = "concat",
value = "_FUNC_(str1, str2, ... strN) - returns the concatenation of str1, str2, ... strN or "+
"_FUNC_(bin1, bin2, ... binN) - returns the concatenation of bytes in binary data " +
" bin1, bin2, ... binN",
extended = "Returns NULL if any argument is NULL.\n"
+ "Example:\n"
+ " > SELECT _FUNC_('abc', 'def') FROM src LIMIT 1;\n"
+ " 'abcdef'")
@VectorizedExpressions({StringGroupConcatColCol.class,
StringGroupColConcatStringScalar.class,
StringGroupColConcatCharScalar.class, StringGroupColConcatVarCharScalar.class,
StringScalarConcatStringGroupCol.class,
CharScalarConcatStringGroupCol.class, VarCharScalarConcatStringGroupCol.class})
public class GenericUDFConcat extends GenericUDF {
  private transient ObjectInspector[] argumentOIs;
  private transient StringConverter[] stringConverters;
  private transient PrimitiveCategory returnType = PrimitiveCategory.STRING;
  private transient BytesWritable[] bw;
  private transient GenericUDFUtils.StringHelper returnHelper;

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    // Loop through all the inputs to determine the appropriate return type/length.
    // Return type:
    //   All CHAR inputs: return CHAR
    //   All VARCHAR inputs: return VARCHAR
    //   All CHAR/VARCHAR inputs: return VARCHAR
    //   All BINARY inputs: return BINARY
    //   Otherwise return STRING
    argumentOIs = arguments;
    PrimitiveCategory currentCategory;
    PrimitiveObjectInspector poi;
    boolean fixedLengthReturnValue = true;
    int returnLength = 0; // Only for char/varchar return types
    for (int idx = 0; idx < arguments.length; ++idx) {
      if (arguments[idx].getCategory() != Category.PRIMITIVE) {
        throw new UDFArgumentException("CONCAT only takes primitive arguments");
      }
      poi = (PrimitiveObjectInspector) arguments[idx];
      currentCategory = poi.getPrimitiveCategory();
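      // The first argument's category seeds the return type; later arguments may widen it.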
      if (idx == 0) {
        returnType = currentCategory;
      }
      switch (currentCategory) {
        case BINARY:
          fixedLengthReturnValue = false;
          if (returnType != currentCategory) {
            // mix of binary/non-binary args
            returnType = PrimitiveCategory.STRING;
          }
          break;
        case CHAR:
        case VARCHAR:
          if (!fixedLengthReturnValue) {
            returnType = PrimitiveCategory.STRING;
          }
          if (fixedLengthReturnValue && currentCategory == PrimitiveCategory.VARCHAR) {
            returnType = PrimitiveCategory.VARCHAR;
          }
          break;
        default:
          returnType = PrimitiveCategory.STRING;
          fixedLengthReturnValue = false;
          break;
      }
      // If all arguments are of known length then we can keep track of the max
      // length of the return type. However if the return length exceeds the
      // max length for the char/varchar, then the return type reverts to string.
      if (fixedLengthReturnValue) {
        returnLength += GenericUDFUtils.StringHelper.getFixedStringSizeForType(poi);
        if ((returnType == PrimitiveCategory.VARCHAR
                && returnLength > HiveVarchar.MAX_VARCHAR_LENGTH)
            || (returnType == PrimitiveCategory.CHAR
                && returnLength > HiveChar.MAX_CHAR_LENGTH)) {
          returnType = PrimitiveCategory.STRING;
          fixedLengthReturnValue = false;
        }
      }
    }
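    // All arguments were BINARY: concatenate the raw bytes, no string conversion needed.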
    if (returnType == PrimitiveCategory.BINARY) {
      bw = new BytesWritable[arguments.length];
      return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector;
    } else {
      // treat all inputs as string, the return value will be converted to the appropriate type.
      createStringConverters();
      returnHelper = new GenericUDFUtils.StringHelper(returnType);
      BaseCharTypeInfo typeInfo;
      switch (returnType) {
        case STRING:
          return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
        case CHAR:
          typeInfo = TypeInfoFactory.getCharTypeInfo(returnLength);
          return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo);
        case VARCHAR:
          typeInfo = TypeInfoFactory.getVarcharTypeInfo(returnLength);
          return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo);
        default:
          throw new UDFArgumentException("Unexpected CONCAT return type of " + returnType);
      }
    }
  }

  private void createStringConverters() {
    stringConverters = new StringConverter[argumentOIs.length];
    for (int idx = 0; idx < argumentOIs.length; ++idx) {
      stringConverters[idx] = new StringConverter((PrimitiveObjectInspector) argumentOIs[idx]);
    }
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    if (returnType == PrimitiveCategory.BINARY) {
      return binaryEvaluate(arguments);
    } else {
      return returnHelper.setReturnValue(stringEvaluate(arguments));
    }
  }

  public Object binaryEvaluate(DeferredObject[] arguments) throws HiveException {
    int len = 0;
    for (int idx = 0; idx < arguments.length; ++idx) {
      bw[idx] = ((BinaryObjectInspector) argumentOIs[idx])
          .getPrimitiveWritableObject(arguments[idx].get());
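      // SQL semantics: a single NULL argument makes the whole result NULL.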
      if (bw[idx] == null) {
        return null;
      }
      len += bw[idx].getLength();
    }
    byte[] out = new byte[len];
    int curLen = 0;
    // Need to iterate twice since BytesWritable doesn't support append.
    for (BytesWritable bytes : bw) {
      System.arraycopy(bytes.getBytes(), 0, out, curLen, bytes.getLength());
      curLen += bytes.getLength();
    }
    return new BytesWritable(out);
  }

  public String stringEvaluate(DeferredObject[] arguments) throws HiveException {
    StringBuilder sb = new StringBuilder();
    for (int idx = 0; idx < arguments.length; ++idx) {
      String val = null;
      if (arguments[idx] != null) {
        val = (String) stringConverters[idx].convert(arguments[idx].get());
      }
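      // A null converted value means the argument was NULL; like the binary path,
      // concat short-circuits and returns NULL.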
      if (val == null) {
        return null;
      }
      sb.append(val);
    }
    return sb.toString();
  }
}
4. Summary
The methods to focus on are initialize(), evaluate(), binaryEvaluate(), and stringEvaluate(). initialize() does the preparation: it validates that every argument is a primitive type, sets up the string converters, and works out the return type, laying the groundwork for the evaluation phase. evaluate() contains the dispatch logic: depending on the return type resolved during initialization, it delegates to either binaryEvaluate() or stringEvaluate(), and those two methods implement the actual concatenation. Both are fairly simple and worth reading through on your own.
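To tie these pieces together, here is a minimal test-harness sketch of my own (the class name ConcatDemo is invented; it assumes hive-exec and its dependencies are on the classpath). It drives the UDF directly in the same two-phase way the Hive runtime does: initialize() once with the argument ObjectInspectors, then evaluate() per row.

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ConcatDemo {
  public static void main(String[] args) throws HiveException {
    GenericUDFConcat udf = new GenericUDFConcat();

    // Two STRING arguments: initialize() resolves the return type to STRING.
    ObjectInspector[] stringArgs = {
        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
        PrimitiveObjectInspectorFactory.javaStringObjectInspector
    };
    udf.initialize(stringArgs);

    // evaluate() returns a writable Text; printing it gives "abcdef".
    DeferredObject[] row = {
        new DeferredJavaObject("abc"), new DeferredJavaObject("def")
    };
    System.out.println(udf.evaluate(row));

    // A NULL argument makes the whole result NULL.
    DeferredObject[] rowWithNull = {
        new DeferredJavaObject("abc"), new DeferredJavaObject(null)
    };
    System.out.println(udf.evaluate(rowWithNull)); // prints "null"

    // Mixed CHAR/VARCHAR inputs: the return type widens to VARCHAR and the
    // declared lengths are summed, so this prints "varchar(5)".
    ObjectInspector[] charArgs = {
        PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
            TypeInfoFactory.getVarcharTypeInfo(2)),
        PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
            TypeInfoFactory.getCharTypeInfo(3))
    };
    System.out.println(new GenericUDFConcat().initialize(charArgs).getTypeName());
  }
}

The first two calls exercise the string path; the last call runs only initialize(), to show the char/varchar length bookkeeping from the loop in initialize() resolving to varchar(5).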