package com.drunk02;
// Import the required Java / Flink / jieba packages.
import com.huaban.analysis.jieba.JiebaSegmenter;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
public class wordcloud1 {
    /**
     * Flink streaming job: reads raw text from the Kafka topic "content",
     * tokenizes it with jieba, filters stop words, keeps a running word count,
     * and sinks the counts into the ClickHouse table {@code content_cloud}.
     *
     * @param args CLI arguments, parsed via {@link ParameterTool} (currently unused
     *             beyond construction — kept for forward compatibility)
     * @throws Exception propagated from Flink execution, JDBC setup, or driver loading
     */
    public static void main(String[] args) throws Exception {
        // Streaming environment runs the pipeline; the batch environment is used only
        // to read the stop-word file eagerly on the client before the job is submitted.
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        final ExecutionEnvironment env2 = ExecutionEnvironment.getExecutionEnvironment();

        // Parse user parameters and configure the Kafka consumer.
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "192.168.88.161:9092,192.168.88.162:9092,192.168.88.163:9092");
        properties.setProperty("group.id", "w1");

        // Load stop words once on the client. The resulting HashSet is captured by
        // (and serialized into) the tokenizing flatMap closure below.
        String stopWordsFilePath = "data/stopwords.txt";
        Set<String> stopWordsSet = new HashSet<>();
        DataSource<String> stopWordsStream = env2.readTextFile(stopWordsFilePath);
        stopWordsStream.collect().forEach(stopWordsSet::add);
        System.out.println("The size of stopWordsSet is: " + stopWordsSet.size());

        // Kafka source: one String record per message from topic "content".
        FlinkKafkaConsumer<String> myConsumer = new FlinkKafkaConsumer<>(
                "content",
                new SimpleStringSchema(),
                properties);
        DataStream<String> stream = env.addSource(myConsumer);

        // Tokenize each input line with jieba and drop stop words.
        SingleOutputStreamOperator<String> splitStream = stream.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String s, Collector<String> collector) throws Exception {
                // NOTE(review): constructing a JiebaSegmenter per record is expensive;
                // consider a RichFlatMapFunction that builds it once in open().
                JiebaSegmenter segmenter = new JiebaSegmenter();
                for (String word : segmenter.sentenceProcess(s)) {
                    if (!stopWordsSet.contains(word)) {
                        collector.collect(word);
                    }
                }
            }
        });

        // Running word count: (word, 1) pairs keyed by word, summed over the stream.
        // The whitespace split also discards any purely-whitespace tokens that the
        // segmenter may emit (split() drops all-empty results for such input).
        DataStream<Tuple2<String, Long>> counts = splitStream
                .flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
                    @Override
                    public void flatMap(String value, Collector<Tuple2<String, Long>> out) {
                        for (String word : value.split("\\s")) {
                            out.collect(new Tuple2<>(word, 1L));
                        }
                    }
                })
                .keyBy(0)
                .sum(1);

        // Narrow Long counts to Integer so they match the ClickHouse column type below.
        DataStream<Tuple2<String, Integer>> countsInt = counts
                .map(t -> new Tuple2<>(t.f0, t.f1.intValue()))
                .returns(Types.TUPLE(Types.STRING, Types.INT));
        // Print running counts for observability.
        countsInt.print();

        // Prepare the ClickHouse target table before the job starts.
        String tableName = "content_cloud";
        String cleanTableQuery = "TRUNCATE TABLE " + tableName;
        String checkTableQuery = "CREATE TABLE IF NOT EXISTS " + tableName
                + " (word String, count INTEGER,ts DateTime DEFAULT now()) ENGINE = MergeTree() ORDER BY ts";
        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        // try-with-resources guarantees the JDBC handles are closed even if a DDL fails.
        try (Connection connection = DriverManager.getConnection("jdbc:clickhouse://192.168.88.161:8123/default");
             Statement statement = connection.createStatement()) {
            // BUGFIX: the table must exist before it can be truncated. The original
            // code ran TRUNCATE first, which fails on a fresh database.
            statement.executeUpdate(checkTableQuery);
            statement.executeUpdate(cleanTableQuery);
        }

        // Sink the running counts into ClickHouse. Batch size 1 flushes every record;
        // raise it for throughput at the cost of latency.
        countsInt.addSink(JdbcSink.sink(
                // Single format call — the original nested String.format twice, which
                // was redundant (the inner call had already substituted the table name).
                String.format("INSERT INTO %s (word, count) VALUES (?, ?)", tableName),
                (ps, t) -> {
                    ps.setString(1, t.f0);
                    ps.setInt(2, t.f1);
                },
                JdbcExecutionOptions.builder()
                        .withBatchSize(1)
                        .withMaxRetries(3)
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:clickhouse://192.168.88.161:8123/default")
                        .withDriverName("ru.yandex.clickhouse.ClickHouseDriver")
                        .build()));

        // Launch the streaming job.
        env.execute("content_cloud");
    }
}
/*
 * Notes:
 * IDEA may flag the ClickHouse SQL strings as errors even though the program runs
 * correctly — the IDE simply does not recognize ClickHouse's SQL dialect.
 * Adapt the dependencies to your environment; for a Maven project the required
 * artifacts can be found on Maven Central.
 * Questions and suggestions are welcome.
 */