A Flink user-defined table function (UDTF) that converts a column into rows: it explodes a JSON array column of the form [{"key1":"value1","key2":"value2"...}] into one output row per object.
Code
Java
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import org.apache.flink.util.StringUtils;
import org.apache.log4j.Logger;

@FunctionHint(output = @DataTypeHint("ROW<drugUniversalName STRING, specifications STRING, goodsUnit STRING, " +
        "location STRING, instruction STRING, consumption STRING, consumptionUnit STRING, frequency STRING, " +
        "prescriptionsAmount STRING, prescriptionsUnit STRING, goodsNo STRING>"))
public class JsonArrayParseUDF extends TableFunction<Row> {

    private static final Logger logger = Logger.getLogger(JsonArrayParseUDF.class);

    public void eval(String json) {
        if (StringUtils.isNullOrWhitespaceOnly(json)) {
            return;
        }
        try {
            Gson gson = new Gson();
            JsonArray jsonArray = gson.fromJson(json, JsonArray.class);
            // Emit one row per object in the JSON array (column-to-row conversion).
            for (JsonElement jsonElement : jsonArray) {
                JsonObject jsonObject = jsonElement.getAsJsonObject();
                String drugUniversalName = invalidate(jsonObject.get("drugUniversalName"));
                String specifications = invalidate(jsonObject.get("specifications"));
                String goodsUnit = invalidate(jsonObject.get("goodsUnit"));
                String location = invalidate(jsonObject.get("location"));
                String instruction = invalidate(jsonObject.get("instruction"));
                String consumption = invalidate(jsonObject.get("consumption"));
                String consumptionUnit = invalidate(jsonObject.get("consumptionUnit"));
                String frequency = invalidate(jsonObject.get("frequency"));
                String prescriptionsAmount = invalidate(jsonObject.get("prescriptionsAmount"));
                String prescriptionsUnit = invalidate(jsonObject.get("prescriptionsUnit"));
                String goodsNo = invalidate(jsonObject.get("goodsNo"));
                collect(Row.of(drugUniversalName, specifications, goodsUnit, location, instruction, consumption,
                        consumptionUnit, frequency, prescriptionsAmount, prescriptionsUnit, goodsNo));
            }
        } catch (Exception e) {
            logger.error("json parse failed: " + e.getMessage());
        }
    }

    // Returns "" for missing or JSON-null fields; getAsString() would throw
    // on a JsonNull, so both cases are checked.
    public String invalidate(JsonElement jsonElement) {
        if (jsonElement != null && !jsonElement.isJsonNull()) {
            return jsonElement.getAsString();
        }
        return "";
    }
}
Scala
import java.util

import com.google.gson.{Gson, JsonArray, JsonElement}
import org.apache.flink.table.annotation.{DataTypeHint, FunctionHint}
import org.apache.flink.table.functions.TableFunction
import org.apache.flink.types.Row
import org.apache.flink.util.StringUtils
import org.slf4j.{Logger, LoggerFactory}

// Java annotation arguments must be compile-time constants, so the type string
// is written as a literal concatenation (a stripMargin call is not allowed here).
@FunctionHint(output = new DataTypeHint(
  "ROW<drugUniversalName STRING, specifications STRING, goodsUnit STRING, location STRING, " +
    "instruction STRING, consumption STRING, consumptionUnit STRING, frequency STRING, " +
    "prescriptionsAmount STRING, prescriptionsUnit STRING, goodsNo STRING>"))
class JsonArrayParseUDTF extends TableFunction[Row] {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def eval(str: String): Unit = {
    if (!StringUtils.isNullOrWhitespaceOnly(str)) {
      try {
        val gson = new Gson
        val array: JsonArray = gson.fromJson(str, classOf[JsonArray])
        val iterator: util.Iterator[JsonElement] = array.iterator()
        // Emit one row per object in the JSON array.
        while (iterator.hasNext) {
          val jsonObject = iterator.next().getAsJsonObject
          val drugUniversalName = invalidate(jsonObject.get("drugUniversalName"))
          val specifications = invalidate(jsonObject.get("specifications"))
          val goodsUnit = invalidate(jsonObject.get("goodsUnit"))
          val location = invalidate(jsonObject.get("location"))
          val instruction = invalidate(jsonObject.get("instruction"))
          val consumption = invalidate(jsonObject.get("consumption"))
          val consumptionUnit = invalidate(jsonObject.get("consumptionUnit"))
          val frequency = invalidate(jsonObject.get("frequency"))
          val prescriptionsAmount = invalidate(jsonObject.get("prescriptionsAmount"))
          val prescriptionsUnit = invalidate(jsonObject.get("prescriptionsUnit"))
          val goodsNo = invalidate(jsonObject.get("goodsNo"))
          collect(Row.of(drugUniversalName, specifications, goodsUnit, location, instruction, consumption,
            consumptionUnit, frequency, prescriptionsAmount, prescriptionsUnit, goodsNo))
        }
      } catch {
        case e: Exception => logger.error(s"json parse failed: ${e.getMessage}")
      }
    }
  }

  // Returns "" for missing or JSON-null fields.
  def invalidate(jsonElement: JsonElement): String =
    if (jsonElement != null && !jsonElement.isJsonNull) jsonElement.getAsString else ""
}
After packaging, place the thin jar (the one without bundled dependencies) into Flink's lib directory.
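A minimal sketch of the build-and-deploy step, assuming the module's thin jar is named realtime-datalake-1.0.jar and FLINK_HOME points at the Flink installation (both names are placeholders):
mvn clean package
# take the thin jar, not the *-jar-with-dependencies one produced by the assembly plugin
cp target/realtime-datalake-1.0.jar $FLINK_HOME/lib/
# restart the cluster (or the SQL client session) so the new jar is picked up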
Registering the function
Register the UDF permanently in the Flink SQL client
# Configure sql-client-defaults.yaml
#==============================================================================
# User-defined functions
#==============================================================================
# Define scalar, aggregate, or table functions here.
functions:
  - name: unnest_udtf
    from: class
    class: com.zero.job.JsonArrayParseUDF
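sql-client-defaults.yaml is the environment file the SQL client reads on every startup, so a function declared here is available in each new session; that is what makes this registration "permanent".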
Verify that the function is registered:
bin/sql-client.sh embedded
show functions;
Alternatively, register the function temporarily, either in the SQL client or in SQL:
create temporary function unnest_udtf as 'com.zero.job.JsonArrayParseUDF';
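A function created this way is temporary: it lives only for the current session and must be re-created after the client restarts. The implementing class still has to be on the classpath, e.g. via the jar in Flink/lib.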
Registering in code
// Java
tableEnv.createTemporarySystemFunction("unnest_udtf", JsonArrayParseUDF.class);
// Scala
tableEnv.createTemporarySystemFunction("unnest_udtf", classOf[JsonArrayParseUDF])
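createTemporarySystemFunction registers the function globally, without a catalog/database namespace, and can even shadow built-in functions; use createTemporaryFunction instead to register it only inside the current catalog and database.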
Using the function
insert into dwd_ai_b_info
select
cast(id as string) as dl_uuid,
id,
created_at,
drugUniversalName as druguniversalname,
goodsNo as goodsno,
prescriptionsAmount as prescriptionsamount,
prescriptionsUnit as prescriptionsunit,
consumption as consumption,
consumptionUnit as consumptionunit,
specifications,
frequency,
instruction,
goodsUnit as goodsunit,
location,
date_format(localtimestamp, 'yyyyMMdd') as dt
from
(
select
id,
created_at,
rp -- the field to parse; selecting it explicitly in the subquery makes the intent clearer
from dwd_ai_a_info
) as a
left join
lateral table(unnest_udtf(`rp`)) as t(drugUniversalName, specifications, goodsUnit, location, instruction, consumption, consumptionUnit, frequency, prescriptionsAmount, prescriptionsUnit, goodsNo)
on true;
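The `left join lateral table(...) on true` form keeps every row of dwd_ai_a_info even when the UDTF emits nothing for it (empty or unparsable rp), filling the function's columns with NULL. If such rows may simply be dropped, a plain cross join is enough; a minimal sketch against the same tables:
select
  id,
  created_at,
  drugUniversalName,
  goodsNo
from dwd_ai_a_info,
  lateral table(unnest_udtf(`rp`)) as t(drugUniversalName, specifications, goodsUnit, location, instruction, consumption, consumptionUnit, frequency, prescriptionsAmount, prescriptionsUnit, goodsNo);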
Note
A custom scalar UDF is implemented the same way; only the base class differs. For example:
class AddDimExt2Ext extends ScalarFunction {
  def eval(channelId: Int, dimExt: String, ext: String): String = {
    channelId match {
      // JsonParseUtils is a project-internal helper that upserts a key/value pair into a JSON string
      case 5 | 32 => JsonParseUtils.upsertKeyValueToJsonString(ext, "dim_ext", dimExt)
      case _ => ext
    }
  }
}
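The difference in use: a ScalarFunction maps each input row to exactly one value and is called directly in the select list, whereas a TableFunction can emit zero or more rows per input and is invoked through lateral table.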
// Add ext to click_ext
tableEnv.createTemporarySystemFunction("add_dim_ext", classOf[AddDimExt2Ext])
select
add_dim_ext(coalesce(channel_id, 0), dim_sv_conf.EXT, ext) as ext
from ...
Appendix
Local test
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.log4j.Level
import org.slf4j.{Logger, LoggerFactory}

object TestFlinkUDTF {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass.getSimpleName)

  def main(args: Array[String]): Unit = {
    org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.WARN)
    val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    // val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Parallelism 1 for local testing
    env.setParallelism(1)
    // Fixed-delay restart strategy (restart attempts, delay between attempts in ms)
    // env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 3))
    // Checkpoint every 5 seconds
    env.enableCheckpointing(5000)
    val checkpointConfig = env.getCheckpointConfig
    // Allow only one checkpoint at a time
    checkpointConfig.setMaxConcurrentCheckpoints(1)
    val fsState = "file:///F:/workspace/realtime-datalake/src/main/resources/cp"
    env.setStateBackend(new FsStateBackend(fsState))

    def settings: EnvironmentSettings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    // Create the StreamTableEnvironment and register the UDTF
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, settings)
    tableEnv.createTemporarySystemFunction("unnest_udtf", classOf[JsonArrayParseUDF])

    val sourceTableDDL =
      """
        |create table source_tb(
        |  id string primary key not enforced,
        |  json string
        |) with (
        |  'connector' = 'filesystem',
        |  'path' = 'file:///F:/workspace/realtime-datalake/src/main/resources/source_tb',
        |  'format' = 'csv'
        |)
        |""".stripMargin
    tableEnv.executeSql(sourceTableDDL)

    val insertDML =
      """
        |insert into source_tb values
        |('id1','[{"drugUniversalName":"达格列净片","specifications":"10mg*10片*3板","goodsUnit":"盒","location":"","instruction":"口服","consumption":"2.0","consumptionUnit":"片","frequency":"1日1次","prescriptionsAmount":"5","prescriptionsUnit":"盒","countDosage":"","description":"(安达唐)达格列净片10mg*10片*3板阿斯利康","goodsNo":"1029990"}]'),
        |('id2','')
        |""".stripMargin
    tableEnv.executeSql(insertDML)

    val printTableDDL =
      """
        |create table print_sink_tb(
        |  id string,
        |  drugUniversalName string,
        |  goodsNo string,
        |  prescriptionsAmount string,
        |  prescriptionsUnit string,
        |  consumption string,
        |  consumptionUnit string,
        |  specifications string,
        |  frequency string,
        |  instruction string,
        |  goodsUnit string,
        |  location string
        |) with (
        |  'connector' = 'print'
        |)
        |""".stripMargin
    tableEnv.executeSql(printTableDDL)

    // Note: the alias list after lateral table(...) renames fields by position,
    // so it must follow the order declared in the UDTF's @FunctionHint.
    val queryDML =
      """
        |insert into print_sink_tb
        |select
        |  id,
        |  drugUniversalName,
        |  goodsNo,
        |  prescriptionsAmount,
        |  prescriptionsUnit,
        |  consumption,
        |  consumptionUnit,
        |  specifications,
        |  frequency,
        |  instruction,
        |  goodsUnit,
        |  location
        |from source_tb left join
        |  lateral table(unnest_udtf(json)) as t(drugUniversalName, specifications, goodsUnit, location, instruction, consumption, consumptionUnit, frequency, prescriptionsAmount, prescriptionsUnit, goodsNo)
        |  on true
        |""".stripMargin
    tableEnv.executeSql(queryDML)
    // executeSql submits each INSERT as its own job, so no env.execute() is needed here.
  }
}
POM file (relevant fragments)
<properties>
<!-- project compiler -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- sdk -->
<java.version>1.8</java.version>
<scala.version>2.11.12</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<flink.version>1.12.3</flink.version>
<hoodie.version>0.8.0</hoodie.version>
<json4s.version>3.6.9</json4s.version>
<gson.version>2.8.7</gson.version>
<!-- maven compiler-->
<scala.maven.plugin.version>4.2.0</scala.maven.plugin.version>
<maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
<maven.assembly.plugin.version>3.1.1</maven.assembly.plugin.version>
<!-- <scope.type>provided</scope.type>-->
<scope.type>compile</scope.type>
</properties>
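The scope.type property is a build switch: keep it at compile for local runs so Flink classes are on the classpath, and flip it to provided when packaging for the cluster so that Flink's own dependencies are not bundled into the jar.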
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${gson.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-client</artifactId>
<version>${hoodie.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-bundle_${scala.binary.version}</artifactId>
<version>${hoodie.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.10.0</version>
<scope>${scope.type}</scope>
</dependency>
<!-- Flink Dependency -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-orc_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
<version>${json4s.version}</version>
<scope>${scope.type}</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiles Scala sources into class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${scala.maven.plugin.version}</version>
<executions>
<execution>
<goals>
<!-- Bound to Maven's compile phase -->
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>${maven.assembly.plugin.version}</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>