Flink-9: Flink Exercise

1. Requirements overview

Goal: create two streams.

Stream 1:
"id,eventId,cnt"
1,event01,3
1,event02,2
2,event02,4

Stream 2:
"id,gender,city"
1, male, shanghai
2, female, beijing

Requirements:
1. Expand each record of stream 1.
   For example, the record 1,event01,3
   becomes 3 records:
   1,event01,random1
   1,event01,random2
   1,event01,random3
2. Enrich each stream 1 record with the matching stream 2 data (gender, city).
   Stream 1 records that find no match go to a side output; matched records go to the main stream.
3. Key the main stream by gender and, per gender, output the record holding the largest random number.
4. Write the side-output records to the file system as Parquet files.
5. Write the main-stream result to MySQL with idempotent updates (upsert).
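
The example below carries these records in three value classes from com.yang.flink.vo (EventCount, Message, EventSum) that the post does not show. A minimal sketch of what they would need to look like, with fields inferred from the record layouts above and the getters the code calls (the actual definitions may differ):

// Minimal POJO sketches (assumed; the real com.yang.flink.vo classes are not shown in the post).
// Each class lives in its own file. Flink's POJO serializer and ParquetAvroWriters.forReflectRecord
// both want a public no-arg constructor plus getters/setters for the private fields.
public class EventCount {
    private int id;
    private String eventId;
    private int cnt; // random value after expansion

    public EventCount() {}
    public EventCount(int id, String eventId, int cnt) { this.id = id; this.eventId = eventId; this.cnt = cnt; }

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public int getCnt() { return cnt; }
    public void setCnt(int cnt) { this.cnt = cnt; }
}

public class Message {
    private int id;
    private String gender;
    private String city;

    public Message() {}
    public Message(int id, String gender, String city) { this.id = id; this.gender = gender; this.city = city; }

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getGender() { return gender; }
    public void setGender(String gender) { this.gender = gender; }
    public String getCity() { return city; }
    public void setCity(String city) { this.city = city; }
}

public class EventSum {
    private int id;
    private String eventId;
    private int cnt;
    private String gender;
    private String city;

    public EventSum() {}
    public EventSum(int id, String eventId, int cnt, String gender, String city) {
        this.id = id; this.eventId = eventId; this.cnt = cnt; this.gender = gender; this.city = city;
    }

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public int getCnt() { return cnt; }
    public void setCnt(int cnt) { this.cnt = cnt; }
    public String getGender() { return gender; }
    public void setGender(String gender) { this.gender = gender; }
    public String getCity() { return city; }
    public void setCity(String city) { this.city = city; }
}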

  • Code example:
package com.yang.flink.test;

import com.yang.flink.vo.EventCount;
import com.yang.flink.vo.EventSum;
import com.yang.flink.vo.Message;
import org.apache.commons.lang3.RandomUtils;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.connector.jdbc.JdbcStatementBuilder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetWriterFactory;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Requirements:
 *  Create two streams.
 *  Stream 1:
 *  "id,eventId,cnt"
 *  1,event01,3
 *  1,event02,2
 *  2,event02,4
 *  Stream 2:
 *  "id,gender,city"
 *  1, male, shanghai
 *  2, female, beijing
 *
 *  1. Expand each record of stream 1.
 *     For example, the record 1,event01,3 becomes 3 records:
 *       1,event01,random1
 *       1,event01,random2
 *       1,event01,random3
 *  2. Enrich each stream 1 record with the matching stream 2 data (gender, city);
 *     records that fail to join go to a side output, the rest to the main stream.
 *  3. Key the main stream by gender and output, per gender, the record with the largest random number.
 *  4. Write the side-output records to the file system as Parquet files.
 *  5. Write the main-stream result to MySQL with idempotent updates.
 */
public class FlinkTest01 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is required here: OnCheckpointRollingPolicy (used by the Parquet FileSink below)
        // only finalizes in-progress part files when a checkpoint completes
        env.enableCheckpointing(5000);

        // Create data stream 1 ("id,eventId,cnt") from a socket source
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9991);
        SingleOutputStreamOperator<EventCount> stream1 = streamSource1.process(new ProcessFunction<String, EventCount>() {

            @Override
            public void processElement(String value, Context ctx, Collector<EventCount> out) throws Exception {
                String[] split = value.split(",");
                // Expand one record into `cnt` records, each carrying a random value in [10, 100)
                for (int i = 0; i < Integer.parseInt(split[2]); i++) {
                    out.collect(new EventCount(Integer.parseInt(split[0]), split[1], RandomUtils.nextInt(10, 100)));
                }
            }
        });

        // Create data stream 2 ("id,gender,city") from a socket source
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9992);
        SingleOutputStreamOperator<Message> stream2 = streamSource2.map(data -> {
            String[] split = data.split(",");
            return new Message(Integer.parseInt(split[0]), split[1], split[2]);
        });
        // Broadcast stream 2 so every parallel instance can look up (gender, city) by id
        MapStateDescriptor<Integer, Message> messageStateDescriptor = new MapStateDescriptor<>("message", Integer.class, Message.class);
        // Side-output tag for stream-1 records that find no matching stream-2 record
        OutputTag<EventCount> tOutputTag = new OutputTag<EventCount>("noconn", TypeInformation.of(EventCount.class));
        BroadcastStream<Message> stream2Broadcast = stream2.broadcast(messageStateDescriptor);

        // Connect stream 1 with the broadcast stream and join the two
        SingleOutputStreamOperator<EventSum> process = stream1.connect(stream2Broadcast).process(new BroadcastProcessFunction<EventCount, Message, EventSum>() {
            @Override
            public void processElement(EventCount value, ReadOnlyContext ctx, Collector<EventSum> out) throws Exception {

                ReadOnlyBroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                Message message = broadcastState.get(value.getId());
                if (message != null) {
                    // Join succeeded: emit the enriched record to the main stream
                    out.collect(new EventSum(value.getId(), value.getEventId(), value.getCnt(), message.getGender(), message.getCity()));
                } else {
                    // Join failed: route the original record to the side output
                    ctx.output(tOutputTag, value);
                }
            }

            @Override
            public void processBroadcastElement(Message value, Context ctx, Collector<EventSum> out) throws Exception {
                // Store every stream-2 record in broadcast state, keyed by id
                BroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                broadcastState.put(value.getId(), value);

            }
        });
        // Key the main stream by gender and keep, per gender, the record with the largest cnt seen so far
        SingleOutputStreamOperator<EventSum> mainResult = process.keyBy(data -> data.getGender()).maxBy("cnt");

        // Write the main-stream result to MySQL with an idempotent upsert
        /*mainResult.print("main");*/
        SinkFunction<EventSum> jdbcSink = JdbcSink.sink(
                // Idempotent upsert: relies on `id` being the primary (or a unique) key of t_eventuser
                "insert into t_eventuser values(?,?,?,?,?) on duplicate key update eventId=? , cnt =? ,gender =? ,city = ?",
                new JdbcStatementBuilder<EventSum>() {
                    @Override
                    public void accept(PreparedStatement stmt, EventSum eventUserInfo) throws SQLException {
                        stmt.setInt(1, eventUserInfo.getId());
                        stmt.setString(2, eventUserInfo.getEventId());
                        stmt.setInt(3, eventUserInfo.getCnt());
                        stmt.setString(4, eventUserInfo.getGender());
                        stmt.setString(5, eventUserInfo.getCity());

                        stmt.setString(6, eventUserInfo.getEventId());
                        stmt.setInt(7, eventUserInfo.getCnt());
                        stmt.setString(8, eventUserInfo.getGender());
                        stmt.setString(9, eventUserInfo.getCity());
                    }
                },
                JdbcExecutionOptions.builder()
                        .withMaxRetries(3)
                        .withBatchSize(1)
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:mysql://localhost:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8")
                        .withUsername("root")
                        .withPassword("123456")
                        .build()
        );
        mainResult.addSink(jdbcSink);


        // Write the side-output records to the file system as Parquet files
        /*process.getSideOutput(tOutputTag).print("side");*/

        // Build a Parquet writer factory via Avro reflection on the EventCount class
        ParquetWriterFactory<EventCount> parquetWriterFactory = ParquetAvroWriters.forReflectRecord(EventCount.class);

        // Use the factory to build a bulk-format FileSink that writes columnar Parquet files
        FileSink<EventCount> bulkSink = FileSink.forBulkFormat(new Path("d:/sidesink/"), parquetWriterFactory)
                .withBucketAssigner(new DateTimeBucketAssigner<EventCount>("yyyy-MM-dd--HH"))
                .withRollingPolicy(OnCheckpointRollingPolicy.build())
                .withOutputFileConfig(OutputFileConfig.builder().withPartPrefix("yang").withPartSuffix(".parquet").build())
                .build();
        process.getSideOutput(tOutputTag).sinkTo(bulkSink);


        env.execute();
    }
}
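
The upsert statement above only behaves idempotently if t_eventuser has id as its primary (or a unique) key. The post does not show the table DDL, so the following plain-JDBC setup snippet is a sketch with an assumed schema (column types are guesses that match the POJO fields):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// One-off setup sketch with an assumed schema; `id` as PRIMARY KEY is what makes
// the ON DUPLICATE KEY UPDATE statement in the job an idempotent upsert.
public class CreateEventUserTable {
    public static void main(String[] args) throws Exception {
        String url = "jdbc:mysql://localhost:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8";
        try (Connection conn = DriverManager.getConnection(url, "root", "123456");
             Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(
                    "CREATE TABLE IF NOT EXISTS t_eventuser (" +
                    "  id      INT PRIMARY KEY," +
                    "  eventId VARCHAR(64)," +
                    "  cnt     INT," +
                    "  gender  VARCHAR(16)," +
                    "  city    VARCHAR(64)" +
                    ")");
        }
    }
}

With the table in place, the job can be driven by listening on the two source ports on hadoop102 (for example nc -lk 9991 and nc -lk 9992) and typing lines such as 1,event01,3 and 1,male,shanghai.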
