Hive简易自定义函数详解
简介:
Hive自定义函数有三种
UDF 一进一出
UDAF 多进一出 一般是 聚合操作
UDTF 一进多出
步骤:
1.创建项目
2.编辑pom.xml 引入相应的JAR包
3.定义一个JAVA类 继承UDF
4.重写 evaluate 方法
5.根据自己的需求指定传入的参数 以及返回值 。同时编译逻辑代码
6.打成jar包 并添加到Hive中
输入 add jar /root/spark_scala_maven.jar(jar包在linux上的位置)
7.创建函数
临时函数 create temporary function 函数名字 as 'UDF类的全类名' ;
永久函数 create function add_prefix as 'com.test.AddPrefix' using jar 'hdfs:///wh/test/addPrefix.jar'
代码:这个是简单的自定义的UDF 模式匹配的方法
package FunctionUDF;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
public class CaseWhenDeptNameUDF extends UDF {

    /**
     * Hive UDF: maps a comma-separated department path to a business-division label.
     *
     * <p>The second comma-separated segment of {@code deptPath} is matched against
     * known division names; everything else falls back to the default label
     * {@code "旧事业部架构"} (old org structure).
     *
     * <p>Hive requires the method to be named exactly {@code evaluate}.
     *
     * @param deptPath comma-separated department path, may be null/blank
     * @return the matched division label, or "旧事业部架构" when the path is
     *         blank, has no comma, has no second segment, or matches nothing
     */
    public String evaluate(String deptPath) {
        // Blank input or old-style path without a comma -> default label.
        if (StringUtils.isBlank(deptPath) || !deptPath.contains(",")) {
            return "旧事业部架构";
        }
        String[] parts = deptPath.split(",");
        // Guard: a path like "X," splits to a single element; the original
        // code would throw ArrayIndexOutOfBoundsException on parts[1] here.
        if (parts.length < 2) {
            return "旧事业部架构";
        }
        String pdept = parts[1];
        if (pdept.contains("菏泽")) {
            return "菏泽事业部";
        }
        if (pdept.contains("邯郸")) {
            return "邯郸事业部";
        }
        if (pdept.contains("济南")) {
            return "济南事业部";
        }
        if (pdept.contains("郑州")) {
            return "郑州事业部";
        }
        // No known division matched.
        return "旧事业部架构";
    }
}
打成jar包
上传到Linux
如果你需要创建永久函数 ,需要上传到HDFS
临时函数 就在linux中的绝对路径下即可
登录HIve客户端
创建临时函数
add jar /a/b/xxxx.jar;
create temporary function add_prefix as 'FunctionUDF.CaseWhenDeptNameUDF';
创建永久函数
1) 先将jar包上传到HDFS
hadoop fs -put xxx.jar /hdfd/hd/
2)创建永久函数
create function 函数名 as '全类名' using jar 'hdfs:///hdfd/hd/xxx.jar'
完整 pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.doiteu</groupId>
    <artifactId>demo1Spark</artifactId>
    <version>1.0-SNAPSHOT</version>
    <!-- Shared version constants -->
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.3</spark.version>
        <hadoop.version>2.7.7</hadoop.version>
        <hive.version>2.3.5</hive.version>
        <encoding>UTF-8</encoding>
    </properties>
    <dependencies>
        <!-- Scala standard library -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Spark core (RDD API) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark SQL; version now driven by ${spark.version} instead of a
             hard-coded 2.3.3 so all Spark artifacts stay in lockstep -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- Spark Streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark <-> Hadoop, kept for reference -->
        <!-- <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>-->
        <!-- Spark <-> HDFS, kept for reference -->
        <!-- <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
        -->
        <!-- Hive UDF API (the UDF base class lives in hive-exec) -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- MySQL JDBC driver, declared ONCE. The original POM declared this
             artifact twice (5.1.48 and 5.1.47), which triggers Maven
             duplicate-dependency warnings; the later declaration (5.1.47)
             was the effective one, so that version is kept. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!-- Spark <-> Hive integration; aligned to ${spark.version}
             (the original mixed 2.3.4 with 2.3.3 core artifacts) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Hive JDBC driver; aligned with hive-exec (original used 1.1.0
             against hive-exec 2.3.5, a likely version mismatch) -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <!-- Spark Streaming integration with Kafka 0.10+ -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Redis client, declared ONCE (the original declared jedis twice) -->
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>3.0.1</version>
        </dependency>
        <!-- JSON parsing -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.57</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.4.9</version>
        </dependency>
        <dependency>
            <groupId>org.apache.phoenix</groupId>
            <artifactId>phoenix-core</artifactId>
            <version>4.14.3-HBase-1.4</version>
        </dependency>
        <!-- Alibaba druid connection pool, kept for reference
             (the original had two near-identical commented copies) -->
        <!-- https://mvnrepository.com/artifact/com.alibaba/druid -->
        <!-- <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.1.20</version>
        </dependency>-->
    </dependencies>
    <build>
        <pluginManagement>
            <plugins>
                <!-- Scala compiler plugin -->
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <!-- Java compiler plugin -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Fat-jar (shade) plugin; strips signature files so the
                 merged jar is not rejected as tampered -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
刚开始写 你肯定会遇到很多问题
遇到问题不要慌 留言即可