This is the final article in this series. Building on the basic business-processing model from the previous post (https://blog.csdn.net/xxkalychen/article/details/117190236?spm=1001.2014.3001.5501), we add a Chinese word segmentation feature.
1. Add the pom dependency.
<dependency>
    <groupId>com.huaban</groupId>
    <artifactId>jieba-analysis</artifactId>
    <version>1.0.2</version>
</dependency>
2. Create the test class. We make a small change on top of the version from the previous post.
package com.chris.flink;

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;
import com.huaban.analysis.jieba.JiebaSegmenter;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/22 7:23
 * Use for:
 * Explain: Flink streaming job that processes data from Kafka and writes the results to ElasticSearch
 */
public class KafkaToESTest {
    //name of the index in ElasticSearch
    public static final String INDEX_NAME = "topic_flink";
    //local cache of the existing counts
    private static ConcurrentHashMap<String, Long> wordCountMap = new ConcurrentHashMap<>(16);
    //jieba segmenter for Chinese word segmentation
    private static JiebaSegmenter jiebaSegmenter;

    static {
        ElasticSearchUtil.createIndex(INDEX_NAME);
        ElasticSearchUtil.initWordCountMap(INDEX_NAME, wordCountMap);
        jiebaSegmenter = new JiebaSegmenter();
    }

    public static void main(String[] args) throws Exception {
        new KafkaToESTest().execute(args);
    }

    /**
     * Initialization: turn the old data cached from ElasticSearch into a stream
     *
     * @param env the execution environment
     * @return a stream of the previously stored word counts
     */
    private DataStreamSource<Tuple2<String, Long>> init(StreamExecutionEnvironment env) {
        List<Tuple2<String, Long>> wordCountList = new ArrayList<>(wordCountMap.size());
        wordCountMap.forEach((key, value) -> wordCountList.add(new Tuple2<>(key, value)));
        //avoid an empty collection
        if (wordCountList.isEmpty()) {
            wordCountList.add(new Tuple2<>("flink", 0L));
        }
        return env.fromCollection(wordCountList);
    }

    private void execute(String[] args) throws Exception {
        //get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        //configure Kafka
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "flink.chris.com:9092");
        properties.put("group.id", "flink_group_1");

        //initialization
        DataStreamSource<Tuple2<String, Long>> initStream = init(env);

        //read data from Kafka
        DataStreamSource<String> streamSource = env.addSource(new FlinkKafkaConsumer<>("topic_flink", new SimpleStringSchema(), properties));

        //word count computation
        SingleOutputStreamOperator<Tuple2<String, Long>> operator = streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
            /**
             * map step
             * @param value input data: a sentence with words separated by spaces
             * @param out collector for the mapped results
             * @throws Exception
             */
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
                //split into words on spaces
                String[] words = value.split(" ");
                //count word occurrences and emit them to the collector
                Arrays.stream(words)
                        //Chinese word segmentation
                        .flatMap(word -> jiebaSegmenter.process(word, JiebaSegmenter.SegMode.SEARCH).stream())
                        .map(segToken -> segToken.word)
                        //trim leading and trailing whitespace
                        .map(String::trim)
                        //filter out empty strings
                        .filter(word -> !"".equals(word))
                        //emit to the collector
                        .forEach(word -> out.collect(new Tuple2<>(word, 1L)));
            }
        });

        //union with the initialization stream, key by the first tuple field (the word), and sum the second field
        SingleOutputStreamOperator<Tuple2<String, Long>> resultOperator = operator.union(initStream).keyBy(new KeySelector<Tuple2<String, Long>, Object>() {
            @Override
            public Object getKey(Tuple2<String, Long> value) throws Exception {
                return value.f0;
            }
        }).sum(1);
        resultOperator.print();

        //collect the WordCount results
        resultOperator.map(new MapFunction<Tuple2<String, Long>, WordCount>() {
            @Override
            public WordCount map(Tuple2<String, Long> value) throws Exception {
                WordCount wordCount = new WordCount(value.f0, value.f1);
                //write to ElasticSearch
                if (wordCount.getCount() > 0) {
                    ElasticSearchUtil.addWordCount(wordCount, INDEX_NAME);
                }
                return wordCount;
            }
        });

        env.execute();
    }
}
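The class references the WordCount model and the ElasticSearchUtil helper from the earlier posts in this series; they are not repeated here. For readers joining at this point, a minimal sketch of what the WordCount model might look like, inferred from the constructor and getCount() calls above (the field names are assumptions; the real class may differ):

package com.chris.flink.model;

//a minimal sketch of the WordCount model; field names are assumed
//from the WordCount(word, count) constructor and getCount() usage above
public class WordCount {
    private String word;   //the word being counted (name assumed)
    private Long count;    //its accumulated count (name assumed)

    public WordCount(String word, Long count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() { return word; }
    public Long getCount() { return count; }
}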
The only change is in the flatMap step: after splitting on spaces, we run Chinese word segmentation on each token. The job can now handle both Chinese and English input.
Now start the servers: ZooKeeper, Kafka, and ElasticSearch. Package the program, upload the jar, submit the job, and send some data to Kafka.
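For sending test data, you can use Kafka's bundled console producer, or a minimal Java producer like the sketch below (it assumes the kafka-clients dependency is on the classpath and reuses the broker address and topic from the job above; the sample sentence is arbitrary):

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class TestProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "flink.chris.com:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            //send one mixed Chinese/English test sentence to the topic the job consumes
            producer.send(new ProducerRecord<>("topic_flink", "hello flink 中文分词测试"));
        }
    }
}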
A tribute to Yuan Longping! Rest in peace, Mr. Yuan!!