三、代码过程
由于Flink一般在流式环境中使用,故这里数据源使用Kafka,并以建立动态表的形式实现,以更好地贴近实际的业务环境。
工具类:
package com.test.UDTF;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
-
@author: Rango
-
@create: 2021-05-04 16:50
-
@description: 建立函数,继承TableFunction并建立eval方法
/
@FunctionHint(output = @DataTypeHint(“ROW”))
public class KeywordUDTF extends TableFunction {
//按官方文档说明,须按eval命名
public void eval(String value){
List stringList = analyze(value);
for (String s : stringList) {
Row row = new Row(1);
row.setField(0,s);
collect(row);
}
}
//自定义分词方式
public List analyze(String text){
//字符串转文件流
StringReader sr = new StringReader(text);
//建立分词器对象
IKSegmenter ik = new IKSegmenter(sr,true);
//ik分词后对象为Lexeme
Lexeme lex = null;
//分词后转入列表
List keywordList = new ArrayList<>();
while(true){
try {
if ((lex = ik.next())!=null){
keywordList.add(lex.getLexemeText());
}else{
break;
}
} catch(IOException e) {
e.printStackTrace();
}
}return keywordList;
}
}
实现类
package com.test.UDTF;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/ -
@author: Rango
-
@create: 2021-05-04 17:11
-
@description:
**/
public class KeywordStatsApp {
public static void main(String[] args) throws Exception {
//建立环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build(); StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings); //注册函数 tableEnv.createTemporaryFunction("ik_analyze", KeywordUDTF.class); //建立动态表 tableEnv.executeSql("CREATE TABLE wordtable (" + "word STRING" + ") WITH ('connector' = 'kafka'," + "'topic' = 'keywordtest'," + "'properties.bootstrap.servers' = 'hadoop102:9092'," + "'properties.group.id' = 'keyword_stats_app'," + "'format' = 'json')"); //未切分效果 Table wordTable = tableEnv.sqlQuery("select word from wordtable"); //利用自定义函数对文本进行分切,切分后计为1,方便后续统计使用 Table wordTable1 = tableEnv.sqlQuery("select splitword,1 ct from wordtable," + "LATERAL TABLE(ik_analyze(word)) as T(splitword)"); tableEnv.toAppendStream(wordTable, Row.class).print("原格式>>>"); tableEnv.toAppendStream(wordTable1, Row.class).print("使用UDTF函数效果>>>"); env.execute();
}
}
补充下依赖(pom.xml):

<properties>
    <java.version>1.8</java.version>
    <flink.version>1.12.0</flink.version>
    <scala.version>2.12</scala.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
USB Microphone https://www.soft-voice.com/
Wooden Speakers https://www.zeshuiplatform.com/
亚马逊测评 www.yisuping.cn
深圳网站建设www.sz886.com