- hive如何注册udf
- spark如何注册udf
- 注册udf的注意事项
1. spark注册udf
'
1
// 1 Registering this way makes the UDF usable from both the DataFrame DSL and SQL.
//   Note: the registered name (used in SQL) and the returned handle (used in the
//   DSL) must refer to the same function.
val simpleUDF = spark.udf.register("simpleUDF", (v: Int) => v * v)
// val simpleUDF = spark.udf.register("simpleUDF", v2 _)
// 2 Build the demo data source.
val df = Seq(("id1", 1), ("id2", 4), ("id3", 5)).toDF("id", "value")
// 3 Use the UDF from the DataFrame DSL via the returned handle.
df.select($"id", simpleUDF($"value").as("v2")).show()
// 4 Use the UDF from SQL: requires a temp view and the registered name.
df.createOrReplaceTempView("df")
spark.sql("select id id2,simpleUDF(value) v2 from df").show()
'
2 上面注册方式报错时
// Registers "isNullOrBlankUdf": flags strings that should be treated as blank —
// null, whitespace-only, or one of a few sentinel words (an empty quoted
// string, a literal \n, or "null"), compared case-insensitively.
val isNullOrBlankUdf: UserDefinedFunction = SparkTool.spark.udf.register("isNullOrBlankUdf", (str: String) => {
  str match {
    case null => true
    case s =>
      // Normalize to lower case before comparing against the sentinel words.
      val lowered = s.toLowerCase
      // Comma-separated list of special values that count as blank.
      val sentinels = "\"\",\\n,null".split(",")
      s.trim.isEmpty || sentinels.contains(lowered)
  }
})
2. hive注册udf
1 开发udf包
1)官方文档
https://spark.apache.org/docs/3.2.0/sql-ref-syntax-ddl-create-function.html#content
2)pom文件设置
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.transsion</groupId>
    <artifactId>bigdata-udfs</artifactId>
    <version>0.1</version>
    <name>bigdata-udfs</name>
    <properties>
        <scala.version>2.12.8</scala.version>
        <filename>user_agent</filename>
        <!-- For testing use: compile; for production use: provided
             (the cluster already supplies these jars at runtime). -->
        <scope.version>provided</scope.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>${scope.version}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>2.3.4</version>
            <scope>${scope.version}</scope>
        </dependency>
    </dependencies>
    <build>
        <!-- Fixed: was "$(unknown)" — Maven property syntax is ${...} and the
             property declared above is "filename", so the built jar is named
             after it (e.g. user_agent.jar). -->
        <finalName>${filename}</finalName>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <resources>
            <resource>
                <!-- NOTE(review): Maven convention is src/main/resources; confirm
                     the project directory really is named "resource" before changing. -->
                <directory>src/main/resource</directory>
                <includes>
                    <!-- Include all resources in this folder and its subfolders. -->
                    <include>**/*.*</include>
                </includes>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.8</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>3.0.2</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <!-- Enable to build a fat jar with all dependencies bundled. -->
            <!-- <plugin>-->
            <!-- <groupId>org.apache.maven.plugins</groupId>-->
            <!-- <artifactId>maven-assembly-plugin</artifactId>-->
            <!-- <executions>-->
            <!-- <execution>-->
            <!-- <id>make-a-jar</id>-->
            <!-- <phase>package</phase>-->
            <!-- <goals>-->
            <!-- <goal>single</goal>-->
            <!-- </goals>-->
            <!-- <configuration>-->
            <!-- <descriptorRefs>-->
            <!-- <descriptorRef>jar-with-dependencies</descriptorRef>-->
            <!-- </descriptorRefs>-->
            <!-- </configuration>-->
            <!-- </execution>-->
            <!-- </executions>-->
            <!-- </plugin>-->
        </plugins>
    </build>
</project>
3)udfdemo
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.Map;
/**
* 格式化json串中的key值: 字符串替换法
**/
/**
 * Hive UDF: removes the given keys from a JSON object string.
 *
 * evaluate(json_str, keys) parses json_str into a Map, drops every key listed
 * in the comma-separated "keys" argument, and returns the re-serialized JSON.
 * Malformed or null JSON input yields "{}".
 */
public class KvReplace extends UDF {
    public String evaluate(String json_str, String keys) {
        // Parse the JSON string into a Map; any parse failure maps to "{}".
        Map<String, Object> ori_map;
        try {
            ori_map = JSON.parseObject(json_str, Map.class);
        } catch (Exception e) {
            return "{}";
        }
        // Null result (e.g. input was the literal "null"): nothing to strip.
        if (null == ori_map) {
            return "{}";
        }
        // Guard: the original code NPE'd on keys.split when keys was null.
        // Treat null/empty keys as "remove nothing".
        if (null == keys || keys.isEmpty()) {
            return JSON.toJSONString(ori_map);
        }
        // Delete each requested key/value pair.
        for (String col : keys.split(",")) {
            ori_map.remove(col);
        }
        // Return the re-serialized JSON string.
        return JSON.toJSONString(ori_map);
    }
}
2 注册函数的步骤
1 查看jar包存放位置
hadoop fs -ls /var/jars/
2 jar包存在时删除
hadoop fs -rm /var/jars/spark-udfs2.jar
3 上传jar包到hdfs
hadoop fs -put spark-udfs2.jar /var/jars/
4 启动spark-sql或hive客户端
spark-sql \
-S \
--name ShyTestError \
--master yarn \
--deploy-mode client \
--num-executors 1 \
--executor-memory 4G \
--executor-cores 1 \
--driver-memory 1G \
--conf spark.dynamicAllocation.enabled=false \
--conf spark.sql.session.timeZone=UTC \
--conf spark.default.parallelism=2 \
--conf spark.sql.shuffle.partitions=2
5 注册临时函数
定位到类名即可, 可以同时上传依赖jar包
create function tranadm.user_agent as 'com.ssjt.bigdata.user_agent' using jar 'hdfs:///var/jars/spark-udfs2.jar',jar 'hdfs:///var/jars/fastjson-1.2.62.jar';
6 验证udf
重开一个窗口, 启动spark-sql验证
select tranadm.user_agent("101110","");
7 删除udf
drop function tranadm.user_agent;
3. UDF参考
1. udf中传入数组参数
import org.apache.hadoop.hive.ql.exec.UDF
/**
* 格式化json串中的key值: 字符串替换法
*/
/**
 * Hive UDF: renames keys in a JSON string by first-occurrence replacement.
 *
 * Each rule has the form "srcKey:tgtKey"; for every rule, the first occurrence
 * of "srcKey": in the JSON text is rewritten to "tgtKey":.
 */
class KeyReplaceFirst extends UDF {
  /**
   * @param eparma JSON string to rewrite (returned unchanged when null)
   * @param rules  rename rules, one "src:tgt" entry per element; note that
   *               Hive passes array arguments as a java.util.ArrayList
   *               (fully qualified here — the original relied on a missing
   *               `import java.util`)
   * @return the rewritten JSON string
   */
  def evaluate(eparma: String, rules: java.util.ArrayList[String]): String = {
    // 1 Null input or null rules: nothing to do.
    if (null == eparma || null == rules) {
      eparma
    } else {
      // JavaConverters is the supported replacement for the deprecated
      // JavaConversions implicit conversions.
      import scala.collection.JavaConverters._
      // 2 Apply each rename rule in order.
      var result = eparma
      for (rule <- rules.asScala) {
        val parts = rule.split(":")
        // replaceFirst interprets its pattern argument as a regex, so keys
        // containing metacharacters (e.g. '.', '$') must be quoted literally;
        // likewise the replacement must escape '$' and '\'.
        val srcKey = java.util.regex.Pattern.quote("\"" + parts(0) + "\":")
        val tgtKey = java.util.regex.Matcher.quoteReplacement("\"" + parts(1) + "\":")
        result = result.replaceFirst(srcKey, tgtKey)
      }
      // 3 Return the rewritten string.
      result
    }
  }
}
2. udf返回Map类型
import is.tagomor.woothee.Classifier
import org.apache.hadoop.hive.ql.exec.UDF
import java.util
/**
* 解析User Agent信息
*/
/**
 * Hive UDF wrapping Woothee's User-Agent classifier.
 */
class UserAgentParse extends UDF {
  /**
   * @param ua raw User-Agent header value
   * @return the classification result as a Java map (Hive-compatible Map type)
   */
  def evaluate(ua: String): util.Map[String, String] = Classifier.parse(ua)
}
4. 注册建议
1. jar尽量存储在云上(s3/oss)而不是hdfs, 这样换集群不用重新注册, 迁云时也便于统一迁移.