第93课:Hive中的内置函数、UDF、UDAF实战

//hive的内置函数

向下取整

hive> select floor(salary) from employeeforhaving;

取对数（自然对数）

select log(salary) from employeeforhaving;

取负数

select negative(salary) from employeeforhaving;


UDF 

package com.dt.spark.hive;

import java.util.Locale;

import org.apache.hadoop.hive.ql.exec.UDF;

import org.apache.hadoop.io.Text;


public final class HiveUDF extends UDF {

/**

* 在这个方法中实现任意符合业务处理需求的代码,而hive自带的函数没有实现的

* @param s

* @return

*/

  public Text evaluate(final Text s) {

    if (s == null) { return null; }

    return new Text(s.toString().toLowerCase());

  }

}

打包

 lowerCase.jar

加到上下文:

hive> Add jar /bigdata/learn_data/lowerCase.jar;

Added [/bigdata/learn_data/lowerCase.jar] to class path

Added resources: [/bigdata/learn_data/lowerCase.jar]


//起别名

hive> CREATE TEMPORARY FUNCTION tolower AS 'com.dt.spark.hive.HiveUDF';

//接下来直接使用:

hive> select tolower(address) from employeeforhaving;


UDAF

聚合操作就可以迭代多次


package com.dt.spark.hive;


import org.apache.hadoop.hive.ql.exec.UDAF;

import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

/**

 * 多行字符串拼接为一行函数

 * @author jiudu

 */

/**
 * Example Hive UDAF that concatenates multiple rows of strings into a single
 * delimited string (similar to MySQL's GROUP_CONCAT).
 *
 * @author jiudu
 */
public class HiveUDAF extends UDAF {

  public static class ConcatUDAFEvaluator implements UDAFEvaluator {

    /** Intermediate aggregation state shipped between mappers and reducers. */
    public static class PartialResult {

      String result;

      String delimiter;

    }

    private PartialResult partial;

    /**
     * Initialization hook: resets the aggregation state before a new group
     * is processed.
     */
    public void init() {
      partial = null;
    }

    /**
     * Core per-row iteration: appends {@code value} to the running result,
     * separated by the delimiter.
     *
     * @param value the column value for the current row; null rows are skipped
     * @param deli  the delimiter between values; null/empty falls back to ","
     * @return always true (Hive's convention for a successful step)
     */
    public boolean iterate(String value, String deli) {
      if (value == null) {
        return true;
      }
      if (partial == null) {
        partial = new PartialResult();
        // String literals are interned; wrapping them in new String(...) as
        // the original did only creates garbage.
        partial.result = "";
        partial.delimiter = (deli == null || deli.isEmpty()) ? "," : deli;
      }
      if (partial.result.length() > 0) {
        partial.result = partial.result.concat(partial.delimiter);
      }
      partial.result = partial.result.concat(value);
      return true;
    }

    /**
     * Returns the current partial state so downstream tasks can continue the
     * aggregation (e.g. map-side combine before the reducer).
     *
     * @return the partial result accumulated so far; may be null
     */
    public PartialResult terminatePartial() {
      return partial;
    }

    /**
     * Reducer-side merge of a partial result received from a mapper. The
     * parameter type matches the return type of {@link #terminatePartial()}.
     *
     * @param other a mapper's partial result; null is ignored
     * @return always true
     */
    public boolean merge(PartialResult other) {
      if (other == null) {
        return true;
      }
      if (partial == null) {
        partial = new PartialResult();
        partial.result = other.result;
        partial.delimiter = other.delimiter;
      } else {
        if (partial.result.length() > 0) {
          partial.result = partial.result.concat(partial.delimiter);
        }
        partial.result = partial.result.concat(other.result);
      }
      return true;
    }

    /**
     * Produces the final aggregation result for the group.
     *
     * <p>Returns null (SQL NULL) when no non-null rows were aggregated; the
     * original dereferenced {@code partial} unconditionally and threw
     * NullPointerException for an empty/all-null group.
     *
     * @return the concatenated string, or null when there was no input
     */
    public String terminate() {
      return partial == null ? null : partial.result;
    }

  }

}

打成pinjie.jar

hive> add jar /bigdata/learn_data/pinjie.jar;

hive> CREATE TEMPORARY FUNCTION pinjie AS 'com.dt.spark.hive.HiveUDAF';

hive> select pinjie(name,"111") from employeeforhaving;




展开阅读全文

没有更多推荐了,返回首页