简介
使用 left join 之后将连接结果写入 Kafka
工具类
/**
 * Helper for assembling the {@code WITH (...)} connector clauses of Flink SQL
 * Kafka table DDL statements.
 */
public class KafkaUtil {

    /** Kafka broker address shared by every generated DDL clause. */
    private static final String BOOTSTRAP_SERVERS = "master:9092";

    /** Utility class — not meant to be instantiated. */
    private KafkaUtil() {
    }

    /**
     * Builds the connector clause for a plain Kafka source table.
     *
     * @param topic   source topic to read from
     * @param groupId Kafka consumer group id
     * @return the assembled {@code with (...)} clause for a Kafka source table
     */
    public static String getKafkaDDL(String topic, String groupId) {
        return " with ('connector' = 'kafka', " +
                " 'topic' = '" + topic + "'," +
                " 'properties.bootstrap.servers' = '" + BOOTSTRAP_SERVERS + "', " +
                " 'properties.group.id' = '" + groupId + "', " +
                " 'format' = 'json', " +
                " 'scan.startup.mode' = 'group-offsets')";
    }

    /**
     * Builds the connector clause for an upsert-kafka sink table.
     *
     * @param topic target topic to write to
     * @return the assembled {@code WITH (...)} clause for an upsert-kafka sink
     */
    public static String getUpsertKafkaDDL(String topic) {
        return "WITH ( " +
                " 'connector' = 'upsert-kafka', " +
                " 'topic' = '" + topic + "', " +
                " 'properties.bootstrap.servers' = '" + BOOTSTRAP_SERVERS + "', " +
                " 'key.format' = 'json', " +
                " 'value.format' = 'json' " +
                ")";
    }
}
实现类模板
/**
 * Demo: left-joins two socket streams with Flink SQL and writes the result to
 * an upsert-kafka sink, to observe the retraction behavior of a left join.
 *
 * <p>Each socket line has the form {@code "<key> <seconds>"}, e.g. {@code "a 6"}.
 */
public class DoubleJoin {
    public static void main(String[] args) throws Exception {
        // Obtain the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Parallelism must be 1 to observe the effect: with more than one
        // subtask, partitions that receive no data keep a watermark of
        // negative infinity, and since an operator's watermark is the minimum
        // across its inputs, the overall watermark would never advance.
        env.setParallelism(1);

        // First socket source. Tuple field 0 is the key, field 1 the event
        // time in epoch milliseconds.
        DataStreamSource<String> socketTextStream = env.socketTextStream("master", 9996);
        SingleOutputStreamOperator<Tuple2<String, Long>> initData = socketTextStream.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] s = value.split(" ");
                // Console input such as "a 15" carries seconds; multiply by
                // 1000 to get an epoch-millisecond timestamp.
                return Tuple2.of(s[0], Long.parseLong(s[1]) * 1000L);
            }
        });

        // Second socket source, same "<key> <seconds>" format.
        DataStreamSource<String> socketTextStream2 = env.socketTextStream("master", 9997);
        SingleOutputStreamOperator<Tuple2<String, Long>> initData2 = socketTextStream2.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] s = value.split(" ");
                // Same conversion as the first stream: seconds -> milliseconds.
                return Tuple2.of(s[0], Long.parseLong(s[1]) * 1000L);
            }
        });

        // Switch to Flink SQL via the Table API.
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        tableEnv.createTemporaryView("t1", initData);
        tableEnv.createTemporaryView("t2", initData2);
        // Expire join state that has been idle for 10 seconds so the
        // unbounded join does not accumulate state forever.
        tableEnv.getConfig().setIdleStateRetention(Duration.ofSeconds(10));
        Table table = tableEnv.sqlQuery("select * from t1 left join t2 on t1.f0=t2.f0");
        tableEnv.createTemporaryView("t", table);

        // Create the upsert-kafka sink table. Its columns must match the
        // query schema of "select * from t1 left join t2 on t1.f0=t2.f0":
        // [f0: STRING, f1: BIGINT, f00: STRING, f10: BIGINT]
        tableEnv.executeSql("CREATE TABLE test( " +
                " f0 STRING, " +
                " f1 BIGINT, " +
                " f00 STRING, " +
                " f10 BIGINT, " +
                // upsert-kafka requires a primary key on the sink table.
                " PRIMARY KEY (f0) NOT ENFORCED " +
                ")" + KafkaUtil.getUpsertKafkaDDL("test"));

        // Write the left-join result into the "test" topic. executeSql
        // submits its own job, so no env.execute() is needed; calling it
        // would throw "No operators defined in streaming topology" because
        // no DataStream sink was ever added to env.
        tableEnv.executeSql("insert into test select * from t")
                .print();
    }
}
输入数据
nc -lk 9996
a 6
a 6
nc -lk 9997
a 1
a 6
消费
./kafka-console-consumer.sh --bootstrap-server master:9092 --topic test
得到的数据
{"f0":"a","f1":6000,"f00":null,"f10":null}
null
null
{"f0":"a","f1":6000,"f00":"a","f10":6000}
{"f0":"a","f1":6000,"f00":"a","f10":6000}
结论
可以看到:如果 left join 起初没有匹配到右表数据(右侧字段为 null),那么当匹配数据到来时会发生撤回(retract)——upsert-kafka 先针对之前输出的每条结果写入一条删除记录(控制台消费时显示为 null),然后再输出连接成功后的完整数据。