Flink Practice Exercise
1. Requirements
@Desc: Create two streams.
Stream 1:
"id,eventId,cnt"
1,event01,3
1,event02,2
2,event02,4
Stream 2:
"id,gender,city"
1,male,shanghai
2,female,beijing
Requirements:
1. Expand each record of stream 1.
   For example, one record 1,event01,3
   must be expanded into 3 records:
   1,event01,<random 1>
   1,event01,<random 2>
   1,event01,<random 3>
2. Enrich stream 1 with the matching stream 2 data (gender, city);
   records that fail the join go to a side output, the rest to the main stream.
3. Key the main stream by gender and, per gender, output the record holding the largest random number.
4. Write the side-output results to the file system in Parquet format.
5. Write the main-stream results to MySQL with idempotent updates.
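The code below imports EventCount, EventSum, and Message from com.yang.flink.vo, but those classes are not shown. Here is a minimal sketch of what they presumably look like, inferred from how the code uses them; note that Flink POJO serialization and ParquetAvroWriters.forReflectRecord both need a public no-arg constructor plus getters/setters, and the field name must match the "cnt" referenced by maxBy("cnt").

// Sketch of the value classes (assumptions inferred from usage; the real
// com.yang.flink.vo classes may differ).
public class EventCount {
    private int id;
    private String eventId;
    private int cnt;

    // No-arg constructor: required by Flink POJO rules and Parquet reflection
    public EventCount() {}

    public EventCount(int id, String eventId, int cnt) {
        this.id = id;
        this.eventId = eventId;
        this.cnt = cnt;
    }

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getEventId() { return eventId; }
    public void setEventId(String eventId) { this.eventId = eventId; }
    public int getCnt() { return cnt; }
    public void setCnt(int cnt) { this.cnt = cnt; }
}

// Message (id, gender, city) and EventSum (id, eventId, cnt, gender, city)
// follow the same pattern: private fields, no-arg + full constructors, getters/setters.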
- Full code example:
package com.yang.flink.test;
import com.yang.flink.vo.EventCount;
import com.yang.flink.vo.EventSum;
import com.yang.flink.vo.Message;
import org.apache.commons.lang3.RandomUtils;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.connector.jdbc.JdbcStatementBuilder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetWriterFactory;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
 * Requirements recap (see above): expand each stream-1 record into cnt copies,
 * each carrying a random number; enrich them with gender/city from broadcast
 * stream 2, routing join misses to a side output; per gender, keep the record
 * with the largest random number; sink the main stream to MySQL with idempotent
 * upserts and the side output to the file system as Parquet.
 */
public class FlinkTest01 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpointing is required: the Parquet FileSink below uses
        // OnCheckpointRollingPolicy, so files are only finalized on checkpoints.
        env.enableCheckpointing(5000);

        // Create stream 1 from a socket source: "id,eventId,cnt"
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9991);
        // Requirement 1: expand each record into cnt records, each with a random number in [10, 100)
        SingleOutputStreamOperator<EventCount> stream1 = streamSource1.process(new ProcessFunction<String, EventCount>() {
            @Override
            public void processElement(String value, Context ctx, Collector<EventCount> out) throws Exception {
                String[] split = value.split(",");
                for (int i = 0; i < Integer.parseInt(split[2]); i++) {
                    out.collect(new EventCount(Integer.parseInt(split[0]), split[1], RandomUtils.nextInt(10, 100)));
                }
            }
        });
        // Create stream 2 from a socket source: "id,gender,city"
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9992);
        SingleOutputStreamOperator<Message> stream2 = streamSource2.map(data -> {
            String[] split = data.split(",");
            return new Message(Integer.parseInt(split[0]), split[1], split[2]);
        });

        // Broadcast stream 2 so every parallel task sees the full id -> Message map
        MapStateDescriptor<Integer, Message> messageStateDescriptor = new MapStateDescriptor<>("message", Integer.class, Message.class);
        // Side-output tag for stream-1 records that fail the join
        OutputTag<EventCount> tOutputTag = new OutputTag<EventCount>("noconn", TypeInformation.of(EventCount.class));
        BroadcastStream<Message> stream2Broadcast = stream2.broadcast(messageStateDescriptor);
        // Requirement 2: connect the two streams and join on id against the broadcast state
        SingleOutputStreamOperator<EventSum> process = stream1.connect(stream2Broadcast).process(new BroadcastProcessFunction<EventCount, Message, EventSum>() {
            @Override
            public void processElement(EventCount value, ReadOnlyContext ctx, Collector<EventSum> out) throws Exception {
                ReadOnlyBroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                Message message = broadcastState.get(value.getId());
                if (message != null) {
                    // Join hit: emit the enriched record to the main stream
                    out.collect(new EventSum(value.getId(), value.getEventId(), value.getCnt(), message.getGender(), message.getCity()));
                } else {
                    // Join miss: route the raw stream-1 record to the side output
                    ctx.output(tOutputTag, value);
                }
            }

            @Override
            public void processBroadcastElement(Message value, Context ctx, Collector<EventSum> out) throws Exception {
                // Every broadcast element updates the shared id -> Message state
                BroadcastState<Integer, Message> broadcastState = ctx.getBroadcastState(messageStateDescriptor);
                broadcastState.put(value.getId(), value);
            }
        });
        // Requirement 3: key by gender and keep the record with the largest random number per gender
        SingleOutputStreamOperator<EventSum> mainResult = process.keyBy(data -> data.getGender()).maxBy("cnt");

        // Requirement 5: sink the main stream to MySQL; ON DUPLICATE KEY UPDATE makes the write idempotent
        SinkFunction<EventSum> jdbcSink = JdbcSink.sink(
                "insert into t_eventuser values (?, ?, ?, ?, ?) on duplicate key update eventId = ?, cnt = ?, gender = ?, city = ?",
                new JdbcStatementBuilder<EventSum>() {
                    @Override
                    public void accept(PreparedStatement stmt, EventSum eventUserInfo) throws SQLException {
                        // Parameters 1-5 fill the insert, 6-9 fill the update clause
                        stmt.setInt(1, eventUserInfo.getId());
                        stmt.setString(2, eventUserInfo.getEventId());
                        stmt.setInt(3, eventUserInfo.getCnt());
                        stmt.setString(4, eventUserInfo.getGender());
                        stmt.setString(5, eventUserInfo.getCity());
                        stmt.setString(6, eventUserInfo.getEventId());
                        stmt.setInt(7, eventUserInfo.getCnt());
                        stmt.setString(8, eventUserInfo.getGender());
                        stmt.setString(9, eventUserInfo.getCity());
                    }
                },
                JdbcExecutionOptions.builder()
                        .withMaxRetries(3)
                        .withBatchSize(1) // flush every record so updates appear immediately
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:mysql://localhost:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8")
                        .withUsername("root")
                        .withPassword("123456")
                        .build()
        );
        mainResult.addSink(jdbcSink);
        // Requirement 4: sink the side output (join misses) to the file system as Parquet
        ParquetWriterFactory<EventCount> parquetWriterFactory = ParquetAvroWriters.forReflectRecord(EventCount.class);
        // Bulk formats can only roll on checkpoint, hence enableCheckpointing above
        FileSink<EventCount> bulkSink = FileSink.forBulkFormat(new Path("d:/sidesink/"), parquetWriterFactory)
                .withBucketAssigner(new DateTimeBucketAssigner<EventCount>("yyyy-MM-dd--HH"))
                .withRollingPolicy(OnCheckpointRollingPolicy.build())
                .withOutputFileConfig(OutputFileConfig.builder().withPartPrefix("yang").withPartSuffix(".parquet").build())
                .build();
        process.getSideOutput(tOutputTag).sinkTo(bulkSink);

        env.execute();
    }
}
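Two things the example assumes but does not show. First, the upsert is only idempotent if t_eventuser has a unique key; since the ON DUPLICATE KEY UPDATE clause updates every column except id, id is presumably the primary key. A possible DDL (table layout and column types are assumptions):

-- Hypothetical schema for the sink table; adjust types/lengths as needed.
create table t_eventuser (
    id      int primary key,  -- the upsert key that ON DUPLICATE KEY UPDATE fires on
    eventId varchar(64),
    cnt     int,
    gender  varchar(16),
    city    varchar(64)
);

Second, to test the job, open the two socket sources on hadoop102 first (e.g. nc -lk 9991 and nc -lk 9992) and feed the stream-2 lines before the stream-1 lines: broadcast state gives no ordering guarantee across the two streams, so stream-1 records that arrive before their matching broadcast entry will land in the side output as join misses.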