This post implements word-frequency counting on Flink: data is read from Kafka, segmented into words, counted, and the results are written to ClickHouse.

package com.drunk02;
//Import the required Java packages.
import com.huaban.analysis.jieba.JiebaSegmenter;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
public class wordcloud1 {
    public static void main(String[] args) throws Exception {
        //Create the StreamExecutionEnvironment for the streaming job and a batch ExecutionEnvironment (used below to load the stop words).
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        final ExecutionEnvironment env2 = ExecutionEnvironment.getExecutionEnvironment();
        //Parse user parameters (not used further in this example).
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        Properties properties = new Properties();
        //Set the Kafka connection properties.
        properties.setProperty("bootstrap.servers", "192.168.88.161:9092,192.168.88.162:9092,192.168.88.163:9092");
        properties.setProperty("group.id", "w1");
        //Read the stop words from a file and store them in stopWordsSet.
        String stopWordsFilePath = "data/stopwords.txt";
        Set<String> stopWordsSet = new HashSet<>();
        DataSource<String> stopWordsStream = env2.readTextFile(stopWordsFilePath);
        stopWordsStream.collect().forEach(stopWordsSet::add);
        System.out.println("The size of stopWordsSet is: " + stopWordsSet.size());
        //Create a FlinkKafkaConsumer as the source, connected to the Kafka topic "content".
        FlinkKafkaConsumer<String> myConsumer = new FlinkKafkaConsumer<>(
                "content",
                new SimpleStringSchema(),
                properties);
        //Add the Kafka consumer as a source of the streaming environment.
        DataStream<String> stream = env.addSource(myConsumer);
        //Segment each incoming line with the jieba segmenter and filter out stop words.
        SingleOutputStreamOperator<String> splitStream = stream.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String s, Collector<String> collector) throws Exception {
                JiebaSegmenter segmenter = new JiebaSegmenter();
                for (String word : segmenter.sentenceProcess(s)) {
                    if (!stopWordsSet.contains(word)) {
                        collector.collect(word);
                    }
                }
            }
        });
        //Count word frequencies on the segmented stream; the result is a DataStream<Tuple2<String, Long>>.
        DataStream<Tuple2<String, Long>> counts = splitStream
                .flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
                    @Override
                    public void flatMap(String value, Collector<Tuple2<String, Long>> out) {
                        for (String word : value.split("\\s")) {
                            out.collect(new Tuple2<>(word, 1L));
                        }
                    }
                })
                .keyBy(0)
                .sum(1);
        //Convert Tuple2<String, Long> to Tuple2<String, Integer> so it can be written to ClickHouse later.
        DataStream<Tuple2<String, Integer>> countsInt = counts
                .map(t -> new Tuple2<>(t.f0, t.f1.intValue()))
                .returns(Types.TUPLE(Types.STRING, Types.INT));
        //Print the running word counts.
        countsInt.print();
        //ClickHouse table name and the DDL / maintenance statements.
        String tableName = "content_cloud";
        String truncateTableQuery = "TRUNCATE TABLE " + tableName;
        String createTableQuery = "CREATE TABLE IF NOT EXISTS " + tableName
                + " (word String, count INTEGER, ts DateTime DEFAULT now()) ENGINE = MergeTree() ORDER BY ts";
        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        //Connect to ClickHouse, create the table if it does not exist, then empty it.
        try (Connection connection = DriverManager.getConnection("jdbc:clickhouse://192.168.88.161:8123/default");
             Statement statement = connection.createStatement()) {
            statement.executeUpdate(createTableQuery);
            statement.executeUpdate(truncateTableQuery);
        }
        //Write the word counts to ClickHouse.
        countsInt.addSink(JdbcSink.sink(
                String.format("INSERT INTO %s (word, count) VALUES (?, ?)", tableName),
                (ps, t) -> {
                    ps.setString(1, t.f0);
                    ps.setInt(2, t.f1);
                },
                JdbcExecutionOptions.builder()
                        .withBatchSize(1)
                        .withMaxRetries(3)
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:clickhouse://192.168.88.161:8123/default")
                        .withDriverName("ru.yandex.clickhouse.ClickHouseDriver")
                        .build()));
        //Execute the Flink job.
        env.execute("content_cloud");
    }
}
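
To give the job something to consume, sentences have to be published to the `content` topic first. Below is a minimal test-producer sketch, assuming the same brokers as in the job and the standard `kafka-clients` dependency; the class name and sample sentences are only illustrative.

```
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import java.util.Properties;

public class ContentTestProducer {
    public static void main(String[] args) {
        // Producer configuration: same brokers as the Flink job above.
        Properties props = new Properties();
        props.setProperty("bootstrap.servers",
                "192.168.88.161:9092,192.168.88.162:9092,192.168.88.163:9092");
        props.setProperty("key.serializer", StringSerializer.class.getName());
        props.setProperty("value.serializer", StringSerializer.class.getName());

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // A couple of sample sentences for jieba to segment.
            String[] samples = {
                    "Flink 读取 Kafka 数据进行词频统计",
                    "统计结果保存在 ClickHouse 中"
            };
            for (String sentence : samples) {
                producer.send(new ProducerRecord<>("content", sentence));
            }
            producer.flush();
        }
    }
}
```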

Note:

        In IDEA the SQL strings may be highlighted in red even though the program runs fine; this is likely because IDEA's SQL inspection does not fully support the ClickHouse dialect.

        The dependencies need to be adapted to your environment; for a Maven project they can be found on Maven Central (roughly flink-streaming-java, flink-connector-kafka, flink-connector-jdbc, com.huaban:jieba-analysis, and ru.yandex.clickhouse:clickhouse-jdbc).
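
        Once the dependencies are in place and the job is running, the sink output can be spot-checked over the same ClickHouse JDBC driver. A minimal sketch, assuming the table created above; the class name is only illustrative, and since the job appends a new row for every count update, the query takes the latest (maximum) count per word:

```
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ContentCloudCheck {
    public static void main(String[] args) throws Exception {
        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        // Each update from the job inserts a new row, so group by word and take the maximum count.
        try (Connection conn = DriverManager.getConnection("jdbc:clickhouse://192.168.88.161:8123/default");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(
                     "SELECT word, max(count) AS cnt FROM content_cloud GROUP BY word ORDER BY cnt DESC LIMIT 20")) {
            while (rs.next()) {
                System.out.println(rs.getString("word") + "\t" + rs.getLong("cnt"));
            }
        }
    }
}
```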

 

Feel free to leave a comment if you have any other questions or suggestions.
