一、创建一个java项目
对应的pom文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.credithc</groupId>
<artifactId>hive_udf_v1.0</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Compiler level and plugin version are declared once here and referenced below,
     so the build section can never drift out of sync with them again. -->
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
</properties>
<!-- CDH builds of hadoop/hive are only published in Cloudera's own repository.
     Apache builds would not need this. Use https: plain-http repositories are
     blocked by default since Maven 3.8.1. -->
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop-mr</artifactId>
<version>5.6.3</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop-hive</artifactId>
<version>5.6.3</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.0.0</version>
</dependency>
<!-- junit: unit-testing framework, test scope only -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<!-- was hardcoded to 3.5.1, contradicting the 3.7.0 property above -->
<version>${maven-compiler-plugin.version}</version>
<configuration>
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
</configuration>
</plugin>
<!-- maven-resources-plugin is a build plugin, not a library; it was wrongly
     declared under <dependencies>, which would put it on the compile classpath. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.4.3</version>
</plugin>
</plugins>
</build>
</project>
二、UDF函数创建开发:
package com.credithc.rc.kg.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by glin on 2018/11/1 0001. com.credithc.rc.kg.udf.MessageDecodeUdf
*/
public class MessageDecodeUdf extends UDF{
public MessageDecodeUdf(){
}
public String evaluate(String str,String params) {
if(StringUtils.isEmpty(str)||StringUtils.isEmpty(params))
return null;
String re = null;
try {
switch (params) {
//时间抽取
case "time":
re = parserTime(str);
break;
//银行名称抽取
case "bankName":
re = parserBankName(str);
break;
}
}catch (Exception e){
}
return re;
}
public String parserTime(String str){
//提取时间
Pattern p0 =Pattern.compile("\\d{4}年\\d{1,2}月\\d{1,2}日|\\d{1,2}月\\d{1,2}日|\\d{4}[-|/|.]\\d{1,2}[-|/|.]\\d{1,2}");
//时间匹配
Matcher m0 = p0.matcher(str);
if(m0.find()){
return m0.group(0);
}else{
return null;
}
}
public String parserBankName(String str){
//提取[]里的内容
Pattern p1 = Pattern.compile("\\[(.+?银行)\\]");
//银行
Matcher m1 = p1.matcher(str);
if(m1.find()){
return m1.group(1);
}else{
return null;
}
}
}
public static void main(String[] args) {
MessageDecodeUdf dd = new MessageDecodeUdf();
System.out.println(dd.evaluate(" 。下载“中国建设银行”手机银行APP 。[建设银行]", "time"));
}
}
测试运行结果:
三、导出 jar包:
四、上传hive测试:
找到该函数,鼠标右键选择Copy Reference 获得该函数的全路径:com.credithc.rc.kg.udf.MessageDecodeUdf
-- 先将jar包上传到HDFS,再在hive会话中将其加入classpath
hdfs dfs -put /home/sd/test/hive_udf_v1.0-1.0-SNAPSHOT.jar /user/sd/hive_udf/
add jar hdfs:///user/sd/hive_udf/hive_udf_v1.0-1.0-SNAPSHOT.jar; -- 将上传的jar包导入到classpath变量里
list jars; -- 查看导入的jar包
create temporary function message_udf as 'com.credithc.rc.kg.udf.MessageDecodeUdf'; -- 创建一个临时函数,关联该jar包
使用测试:
select message_udf ( str, params) from kkkk;