FlinkSQL: Row-to-Column Transformation with a Custom UDTF (IK Analyzer)

3. Code Walkthrough
Since Flink is usually deployed in streaming scenarios, Kafka is used as the data source here and a dynamic table is built on top of it, which better reflects a real production setup.
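For testing, JSON messages matching the table schema defined later in this section can be produced to the keywordtest topic. A sample payload (the sentence itself is just an illustration, not from the original post):

    {"word":"Apache Flink是一个框架和分布式处理引擎"}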

The utility class:
package com.test.UDTF;

import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * @author: Rango
 * @create: 2021-05-04 16:50
 * @description: Define the function by extending TableFunction and providing an eval method
 */
@FunctionHint(output = @DataTypeHint("ROW<word STRING>"))
public class KeywordUDTF extends TableFunction<Row> {
    // Per the official documentation, the method must be named eval
    public void eval(String value) {
        List<String> stringList = analyze(value);
        for (String s : stringList) {
            Row row = new Row(1);
            row.setField(0, s);
            collect(row);
        }
    }

    // Custom segmentation logic
    public List<String> analyze(String text) {
        // Wrap the string in a character stream
        StringReader sr = new StringReader(text);
        // Create the segmenter; true enables smart (coarse-grained) mode,
        // false would produce fine-grained, maximum-word segmentation
        IKSegmenter ik = new IKSegmenter(sr, true);
        // IK emits each token as a Lexeme
        Lexeme lex = null;
        // Collect the tokens into a list
        List<String> keywordList = new ArrayList<>();
        while (true) {
            try {
                if ((lex = ik.next()) != null) {
                    keywordList.add(lex.getLexemeText());
                } else {
                    break;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return keywordList;
    }
}
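Before wiring the function into SQL, it can help to sanity-check analyze() on its own, since it has no dependency on the Flink runtime. A minimal sketch (the AnalyzeDemo class and the sample sentence are hypothetical, not from the original post):

package com.test.UDTF;

// Hypothetical standalone check: run the IK segmentation logic outside
// Flink and print each token that eval() would emit as a row.
public class AnalyzeDemo {
    public static void main(String[] args) {
        KeywordUDTF udtf = new KeywordUDTF();
        for (String word : udtf.analyze("Apache Flink是一个框架和分布式处理引擎")) {
            System.out.println(word);
        }
    }
}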
The implementation class:
package com.test.UDTF;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * @author: Rango
 * @create: 2021-05-04 17:11
 * @description: Register the UDTF and apply it in FlinkSQL
 */
public class KeywordStatsApp {
    public static void main(String[] args) throws Exception {
        // Set up the environments
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

        // Register the function
        tableEnv.createTemporaryFunction("ik_analyze", KeywordUDTF.class);

        // Create the dynamic table backed by Kafka
        tableEnv.executeSql("CREATE TABLE wordtable (" +
                "word STRING" +
                ") WITH ('connector' = 'kafka'," +
                "'topic' = 'keywordtest'," +
                "'properties.bootstrap.servers' = 'hadoop102:9092'," +
                "'properties.group.id' = 'keyword_stats_app'," +
                "'format' = 'json')");

        // Result before splitting
        Table wordTable = tableEnv.sqlQuery("select word from wordtable");
        // Use the custom function to split the text; each token is paired
        // with a constant 1 to make downstream counting easy
        Table wordTable1 = tableEnv.sqlQuery("select splitword, 1 ct from wordtable," +
                "LATERAL TABLE(ik_analyze(word)) as T(splitword)");

        tableEnv.toAppendStream(wordTable, Row.class).print("raw input>>>");
        tableEnv.toAppendStream(wordTable1, Row.class).print("after UDTF>>>");

        env.execute();
    }
}
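The constant ct column hints at the natural next step: counting occurrences of each split word. A minimal sketch of that follow-up (not part of the original post; it would go inside main() before env.execute()):

// Hypothetical continuation: aggregate the exploded words into per-word counts.
Table keywordCount = tableEnv.sqlQuery(
        "select splitword, sum(ct) word_ct from (" +
        "select splitword, 1 ct from wordtable, " +
        "LATERAL TABLE(ik_analyze(word)) as T(splitword)" +
        ") group by splitword");
// A grouped aggregate on an unbounded stream updates previously emitted
// rows, so a retract stream is required; toAppendStream would throw here.
tableEnv.toRetractStream(keywordCount, Row.class).print("word count>>>");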
Finally, the dependencies in the pom:

<properties>
    <java.version>1.8</java.version>
    <flink.version>1.12.0</flink.version>
    <scala.version>2.12</scala.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>

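The IK classes (org.wltea.analyzer.*) used by KeywordUDTF come from a separate artifact that also needs to be on the classpath. One commonly used coordinate (an assumption; the original post does not show which artifact it used):

    <!-- IK analyzer; this artifact choice is an assumption, not from the original post -->
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>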