前言
之前的文章中讲过了Flink 流流join
也讲过了翻车版本的流批join。为什么翻车了,那篇文章也说了。如果事实表和维度表进行join,Flink会认为这是一个批处理程序。也就是说程序会自己暂停。
流批join的需求还蛮多的,比如我们有一张用户点击网站的数据,还有一张用户表在MySQL中,我们需要关联MySQL中的数据来丰富实时流数据,这就需要用到流批join了。
数据
MySQL数据
模拟维度表。
Kafka数据
模拟点击流。
代码
异步io函数
package it.kenn.asyncio;
import it.kenn.pojo.Click;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @date 2020-12-27
* 异步io测试
* 参考文档:https://www.cnblogs.com/zz-ksw/p/13228642.html
*/
/**
 * Async I/O function that enriches a {@link Click} event with the user's
 * dimension attributes (name, age, gender) looked up from MySQL.
 *
 * <p>Output tuple layout: (user_name, age, gender, site, time).
 *
 * <p>NOTE(review): the JDBC call in {@link #asyncInvoke} is still executed
 * synchronously on the calling thread — a true non-blocking client (or a
 * dedicated executor) would be needed to get real async throughput.
 */
public class AsyncDatabaseRequest extends RichAsyncFunction<Click, Tuple5<String, Integer, String, String, String>> {
    // One JDBC connection per parallel subtask; transient because JDBC
    // connections are not serializable and must be created in open().
    private transient Connection client;

    @Override
    public void open(Configuration parameters) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        client = DriverManager.getConnection("jdbc:mysql://localhost:3306/aspirin?useSSL=false", "root", "root");
        // No setAutoCommit(false) here: the function only issues read-only
        // SELECTs and never commits, so disabling auto-commit would just
        // leave transactions open on the MySQL side.
    }

    @Override
    public void close() throws Exception {
        // Guard against open() having failed before the connection was created.
        if (client != null) {
            client.close();
        }
    }

    /**
     * Looks up the dimension row for {@code input.getUser()} and completes the
     * future with the joined record, or with an empty list when no row matches
     * (inner-join semantics: unmatched clicks are dropped).
     */
    @Override
    public void asyncInvoke(Click input, ResultFuture<Tuple5<String, Integer, String, String, String>> resultFuture) throws Exception {
        List<Tuple5<String, Integer, String, String, String>> list = new ArrayList<>();
        // PreparedStatement with a bound parameter instead of string
        // concatenation: the original query was vulnerable to SQL injection
        // through the user name. try-with-resources also closes the statement
        // and result set, which the original code leaked on every invocation.
        try (PreparedStatement statement = client.prepareStatement(
                "select user_name, age, gender from user_data_for_join where user_name = ?")) {
            statement.setString(1, input.getUser());
            try (ResultSet resultSet = statement.executeQuery()) {
                if (resultSet.next()) {
                    String name = resultSet.getString("user_name");
                    int age = resultSet.getInt("age");
                    String gender = resultSet.getString("gender");
                    list.add(Tuple5.of(name, age, gender, input.getSite(), input.getTime()));
                }
            }
        }
        // Hand the (possibly empty) result back to Flink.
        resultFuture.complete(list);
    }

    @Override
    public void timeout(Click input, ResultFuture<Tuple5<String, Integer, String, String, String>> resultFuture) throws Exception {
        // Complete with an empty result so a slow lookup merely drops the
        // record. Leaving the future incomplete (as the original did) makes
        // Flink fail the whole job with a TimeoutException.
        resultFuture.complete(Collections.emptyList());
    }
}
主函数
package it.kenn.asyncio;
import it.kenn.pojo.Click;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.util.concurrent.TimeUnit;
import static org.apache.flink.table.api.Expressions.$;
/**
* 利用异步io实现事实表与维表的join操作
*/
/**
 * Joins a Kafka click stream (fact table) with a MySQL dimension table
 * using Flink's async I/O operator.
 */
public class AsyncJoin {
    // DDL for the Kafka-backed source table carrying the raw click events.
    private static final String KAFKA_DDL = "CREATE TABLE KafkaTable (\n" +
            " `user` STRING,\n" +
            " `site` STRING,\n" +
            " `time` STRING\n" +
            ") WITH (\n" +
            " 'connector' = 'kafka',\n" +
            " 'topic' = 'test-old',\n" +
            " 'properties.bootstrap.servers' = 'localhost:9092',\n" +
            " 'properties.group.id' = 'testGroup',\n" +
            " 'scan.startup.mode' = 'earliest-offset',\n" +
            " 'format' = 'json'\n" +
            ")";

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Register the Kafka source and project the three click columns.
        tableEnv.executeSql(KAFKA_DDL);
        Table clicks = tableEnv.from("KafkaTable").select($("user"), $("site"), $("time"));

        // Bridge from the Table API to a DataStream of Click POJOs.
        DataStream<Click> clickStream = tableEnv.toAppendStream(clicks, Click.class);

        // AsyncDataStream.unorderedWait is what actually wires the MySQL
        // lookup into the stream: per-record timeout of 1000 ms, at most
        // 100 requests in flight at once.
        SingleOutputStreamOperator<Tuple5<String, Integer, String, String, String>> enriched =
                AsyncDataStream.unorderedWait(clickStream, new AsyncDatabaseRequest(), 1000, TimeUnit.MILLISECONDS, 100);

        enriched.print("async:");
        env.execute("AsyncIOFunctionTest");
    }
}
结果
总结
上面这是一个简单的demo,与异步io相关的很多细节、参数等都还没有说到。
参考文章:https://www.cnblogs.com/zz-ksw/p/13228642.html
和官网:https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/operators/asyncio.html