Flink Practice Exercise
1. Requirements
@Desc: Create two streams.
Stream 1:
"id,eventId,cnt"
1,event01,3
1,event02,2
2,event02,4
Stream 2:
"id,gender,city"
1,male,shanghai
2,female,beijing
Requirements:
1. Expand each record of stream 1.
   For example, one record 1,event01,3
   must be expanded into 3 records:
   1,event01,<random 1>
   1,event01,<random 2>
   1,event01,<random 3>
2. Enrich stream 1 with the matching stream 2 data (gender, city);
   records that fail the join go to a side output, the rest to the main stream.
3. Key the main stream by gender and, per gender, output the record holding the largest random number.
4. Write the side-output results to the file system in Parquet format.
5. Write the main-stream results to MySQL with idempotent updates.
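The code below imports EventCount, EventSum, and Message from com.yang.flink.vo, but those classes are not shown. Here is a minimal sketch of what they presumably look like, inferred from how the code uses them; note that Flink POJO serialization and ParquetAvroWriters.forReflectRecord both need a public no-arg constructor plus getters/setters, and the field name must match the "cnt" referenced by maxBy("cnt").

// Sketch of the value classes (assumptions inferred from usage; the real
// com.yang.flink.vo classes may differ).
public class EventCount {
    private int id;
    private String eventId;
    private int cnt;

    // No-arg constructor: required by Flink POJO rules and Parquet reflection
    public EventCount() {}

    public EventCount(int id, String eventId, int cnt) {
        this.id = id;
        this.eventId = eventId;
        this.cnt = cnt;
    }

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public int getCnt() { return cnt; }
    public void setCnt(int cnt) { this.cnt = cnt; }
}

// Message (id, gender, city) and EventSum (id, eventId, cnt, gender, city)
// follow the same pattern: private fields, no-arg + full constructors, getters/setters.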
- Full code example:
package com.yang.flink.test;
import com.yang.flink.vo.EventCount;
import com.yang.flink.vo.EventSum;
import com.yang.flink.vo.Message;
import org.apache.commons.lang3.RandomUtils;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.connector.jdbc.JdbcStatementBuilder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetWriterFactory;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
 * Requirements recap (see above): expand each stream-1 record into cnt copies,
 * each carrying a random number; enrich them with gender/city from broadcast
 * stream 2, routing join misses to a side output; per gender, keep the record
 * with the largest random number; sink the main stream to MySQL with idempotent
 * upserts and the side output to the file system as Parquet.
 */
public class FlinkTest01 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is required: the Parquet FileSink below uses
        // OnCheckpointRollingPolicy, so files are only finalized on checkpoints.
        env.enableCheckpointing(5000);

        // Create stream 1 from a socket source: "id,eventId,cnt"
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9991);
        // Requirement 1: expand each record into cnt records, each with a random number in [10, 100)
        SingleOutputStreamOperator<EventCount> stream1 = streamSource1.process(new ProcessFunction<String, EventCount>() {
            @Override
            public void processElement(String value, Context ctx, Collector<EventCount> out) throws Exception {
                String[] split = value.split(",");
                for (int i = 0; i < Integer.parseInt(split[2]); i++) {
                    out.collect(new EventCount(Integer.parseInt(split[0]), split[1], RandomUtils.nextInt(10, 100)));
                }
            }
        });
        // Create stream 2 from a socket source: "id,gender,city"
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9992);
        SingleOutputStreamOperator<Message> stream2 = streamSource2.map(data -> {
            String[] split = data.split(",");
            return new Message(Integer.parseInt(split[0]), split[1], split[2]);
        });

        // Broadcast stream 2 so every parallel task sees the full id -> Message map
        MapStateDescriptor<Integer, Message> messageStateDescriptor = new MapStateDescriptor<>("message", Integer.class, Message.class);
        // Side-output tag for stream-1 records that fail the join
        OutputTag<EventCount> tOutputTag = new OutputTag<EventCount>("noconn", TypeInformation.of(EventCount.class));
        BroadcastStream<Message> stream2Broadcast = stream2.broadcast(messageStateDescriptor);
        // Requirement 2: connect the two streams and join on id against the broadcast state
        SingleOutputStreamOperator<EventSum> process = stream1.connect(stream2Broadcast).process(new BroadcastProcessFunction<EventCount, Message, EventSum>() {
            @Override
            public void processElement(EventCount value, ReadOnlyContext ctx, Collector<EventSum> out) throws Exception {
                ReadOnlyBroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                Message message = broadcastState.get(value.getId());
                if (message != null) {
                    // Join hit: emit the enriched record to the main stream
                    out.collect(new EventSum(value.getId(), value.getEventId(), value.getCnt(), message.getGender(), message.getCity()));
                } else {
                    // Join miss: route the raw stream-1 record to the side output
                    ctx.output(tOutputTag, value);
                }
            }

            @Override
            public void processBroadcastElement(Message value, Context ctx, Collector<EventSum> out) throws Exception {
                // Every broadcast element updates the shared id -> Message state
                BroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                broadcastState.put(value.getId(), value);
            }
        });
        // Requirement 3: key by gender and keep the record with the largest random number per gender
        SingleOutputStreamOperator<EventSum> mainResult = process.keyBy(data -> data.getGender()).maxBy("cnt");

        // Requirement 5: sink the main stream to MySQL; ON DUPLICATE KEY UPDATE makes the write idempotent
        SinkFunction<EventSum> jdbcSink = JdbcSink.sink(
                "insert into t_eventuser values (?, ?, ?, ?, ?) on duplicate key update eventId = ?, cnt = ?, gender = ?, city = ?",
                new JdbcStatementBuilder<EventSum>() {
                    @Override
                    public void accept(PreparedStatement stmt, EventSum eventUserInfo) throws SQLException {
                        // Parameters 1-5 fill the insert, 6-9 fill the update clause
                        stmt.setInt(1, eventUserInfo.getId());
                        stmt.setString(2, eventUserInfo.getEventId());
                        stmt.setInt(3, eventUserInfo.getCnt());
                        stmt.setString(4, eventUserInfo.getGender());
                        stmt.setString(5, eventUserInfo.getCity());
                        stmt.setString(6, eventUserInfo.getEventId());
                        stmt.setInt(7, eventUserInfo.getCnt());
                        stmt.setString(8, eventUserInfo.getGender());
                        stmt.setString(9, eventUserInfo.getCity());
                    }
                },
                JdbcExecutionOptions.builder()
                        .withMaxRetries(3)
                        .withBatchSize(1) // flush every record so updates appear immediately
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:mysql://localhost:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8")
                        .withUsername("root")
                        .withPassword("123456")
                        .build()
        );
        mainResult.addSink(jdbcSink);
        // Requirement 4: sink the side output (join misses) to the file system as Parquet
        ParquetWriterFactory<EventCount> parquetWriterFactory = ParquetAvroWriters.forReflectRecord(EventCount.class);
        // Bulk formats can only roll on checkpoint, hence enableCheckpointing above
        FileSink<EventCount> bulkSink = FileSink.forBulkFormat(new Path("d:/sidesink/"), parquetWriterFactory)
                .withBucketAssigner(new DateTimeBucketAssigner<EventCount>("yyyy-MM-dd--HH"))
                .withRollingPolicy(OnCheckpointRollingPolicy.build())
                .withOutputFileConfig(OutputFileConfig.builder().withPartPrefix("yang").withPartSuffix(".parquet").build())
                .build();
        process.getSideOutput(tOutputTag).sinkTo(bulkSink);

        env.execute();
    }
}
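Two things the example assumes but does not show. First, the upsert is only idempotent if t_eventuser has a unique key; since the ON DUPLICATE KEY UPDATE clause updates every column except id, id is presumably the primary key. A possible DDL (table layout and column types are assumptions):

-- Hypothetical schema for the sink table; adjust types/lengths as needed.
create table t_eventuser (
    id      int primary key,  -- the upsert key that ON DUPLICATE KEY UPDATE fires on
    eventId varchar(64),
    cnt     int,
    gender  varchar(16),
    city    varchar(64)
);

Second, to test the job, open the two socket sources on hadoop102 first (e.g. nc -lk 9991 and nc -lk 9992) and feed the stream-2 lines before the stream-1 lines: broadcast state gives no ordering guarantee across the two streams, so stream-1 records that arrive before their matching broadcast entry will land in the side output as join misses.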