// DWM layer: dual-stream join (order table joined with order-detail table).
// Official documentation:
// https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/stream/operators/joining.html
// Implementation:
package com.yyds.app.dwm;
import com.alibaba.fastjson.JSONObject;
import com.yyds.bean.OrderDetail;
import com.yyds.bean.OrderInfo;
import com.yyds.bean.OrderWide;
import com.yyds.utils.MyKafkaUtils;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.text.SimpleDateFormat;
/**
 * DWM-layer order wide table: interval-joins the order stream
 * (dwd_order_info) with the order-detail stream (dwd_order_detail)
 * and emits joined OrderWide records.
 */
public class OrderWideApp {

    /** Pattern of the create_time field produced by the DWD layer. */
    private static final String CREATE_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Converts a "yyyy-MM-dd HH:mm:ss" string into epoch milliseconds.
     * <p>A new SimpleDateFormat is created per call because the class is not
     * thread-safe and this runs inside (potentially parallel) operator instances.
     *
     * @param createTime timestamp string taken from the DWD record
     * @return epoch milliseconds of the full date-time
     * @throws Exception if the string does not match the expected pattern
     */
    private static long toTs(String createTime) throws Exception {
        // BUG FIX: the original code parsed with the pattern "yyyy-MM-dd".
        // SimpleDateFormat parses leniently, so the time-of-day was silently
        // dropped and every event timestamp was truncated to midnight,
        // breaking event-time alignment between the two joined streams.
        return new SimpleDateFormat(CREATE_TIME_PATTERN).parse(createTime).getTime();
    }

    public static void main(String[] args) throws Exception {
        // TODO 1: set up the execution environment
        System.setProperty("HADOOP_USER_NAME", "root");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Checkpoint every 5 seconds with exactly-once semantics.
        env.enableCheckpointing(5000L);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // Checkpoint timeout (left disabled as in the original):
        //env.getCheckpointConfig().setAlignmentTimeout(10000L);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
        // Restart strategy (disabled for now):
        // env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3,5000L));
        // Retain the last checkpoint when the job is cancelled.
        env.getCheckpointConfig().enableExternalizedCheckpoints(
                CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
        );
        // State backend on HDFS.
        env.setStateBackend(new FsStateBackend("hdfs://centos01:8020/flinkCDC/ck"));

        // TODO 2: read the order and order-detail topics from Kafka (DWD layer),
        // map each JSON line to its JavaBean, extract the event timestamp and
        // generate watermarks.
        String orderInfoSourceTopic = "dwd_order_info";
        String orderDetailSourceTopic = "dwd_order_detail";
        String orderWideSinkTopic = "dwm_order_wide";
        String groupId = "order_wide_group";

        SingleOutputStreamOperator<OrderInfo> orderInfoStreamOperator = env
                .addSource(MyKafkaUtils.getKafkaConsumer(orderInfoSourceTopic, groupId))
                .map(line -> {
                    OrderInfo orderInfo = JSONObject.parseObject(line, OrderInfo.class);
                    // Derive date / hour fields from create_time ("yyyy-MM-dd HH:mm:ss").
                    String create_time = orderInfo.getCreate_time();
                    String[] dateTimeArr = create_time.split(" ");
                    orderInfo.setCreate_date(dateTimeArr[0]);
                    orderInfo.setCreate_hour(dateTimeArr[1].split(":")[0]);
                    orderInfo.setCreate_ts(toTs(create_time));
                    return orderInfo;
                })
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<OrderInfo>forMonotonousTimestamps()
                                .withTimestampAssigner(new SerializableTimestampAssigner<OrderInfo>() {
                                    @Override
                                    public long extractTimestamp(OrderInfo element, long recordTimestamp) {
                                        return element.getCreate_ts();
                                    }
                                }));

        SingleOutputStreamOperator<OrderDetail> orderDetailStreamOperator = env
                .addSource(MyKafkaUtils.getKafkaConsumer(orderDetailSourceTopic, groupId))
                .map(line -> {
                    OrderDetail orderDetail = JSONObject.parseObject(line, OrderDetail.class);
                    // Event timestamp from the full create_time string.
                    orderDetail.setCreate_ts(toTs(orderDetail.getCreate_time()));
                    return orderDetail;
                })
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<OrderDetail>forMonotonousTimestamps()
                                .withTimestampAssigner(new SerializableTimestampAssigner<OrderDetail>() {
                                    @Override
                                    public long extractTimestamp(OrderDetail element, long recordTimestamp) {
                                        return element.getCreate_ts();
                                    }
                                }));

        // TODO 3: interval join — each order is joined with every detail whose
        // event time lies within [-5s, +5s] of the order's event time, which
        // tolerates small skew between when the two DWD records were produced.
        SingleOutputStreamOperator<OrderWide> joinedOrderWideDS = orderInfoStreamOperator
                .keyBy(orderInfo -> orderInfo.getId())
                .intervalJoin(orderDetailStreamOperator.keyBy(orderDetail -> orderDetail.getOrder_id()))
                .between(Time.seconds(-5), Time.seconds(5)) // maximum allowed skew between the streams
                .process(new ProcessJoinFunction<OrderInfo, OrderDetail, OrderWide>() {
                    @Override
                    public void processElement(OrderInfo orderInfo, OrderDetail orderDetail, Context ctx, Collector<OrderWide> out) throws Exception {
                        out.collect(new OrderWide(orderInfo, orderDetail));
                    }
                });
        joinedOrderWideDS.print("joinedOrderWideDS-------------");

        // TODO 4: enrich with dimension information
        // TODO 5: write the wide records to Kafka (orderWideSinkTopic)
        // TODO 6: submit the job
        env.execute("OrderWideApp");
    }
}