文章目录
异步 I/O 是 Flink 为了降低与外部系统(REST Server / HBase / MySQL 等)频繁交互时的延迟而提供的一个特性。
官方文档见
- https://ci.apache.org/projects/flink/flink-docs-master/dev/stream/operators/asyncio.html
- https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=65870673
- https://docs.google.com/document/d/1Lr9UYXEz6s6R_3PWg3bZQLF3upGaNEkc0rQCFSzaYDI/edit#
中文博客也有很多介绍:
- http://wuchong.me/blog/2017/05/17/flink-internals-async-io/
- https://blog.icocoro.me/2019/05/26/1905-apache-flinkv2-asyncio/
这里只提供代码示例。
先简单说一下业务逻辑:有一个 scoreDataStream 流,流中是 Score 信息,需要根据 Score 中的 stu_id 发起 HTTP 请求获取对应的 Student 信息,然后将两者组合输出。代码中使用 httpasyncclient 以回调的方式实现异步请求。
- AsyncHttpRequest 算子,继承 RichAsyncFunction<IN, OUT>
import com.google.gson.Gson;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.flink.util.Preconditions;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.impl.nio.conn.PoolingNHttpClientConnectionManager;
import org.apache.http.impl.nio.reactor.DefaultConnectingIOReactor;
import org.apache.http.impl.nio.reactor.IOReactorConfig;
import org.apache.http.nio.reactor.ConnectingIOReactor;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Collections;
import java.util.concurrent.CancellationException;
/**
 * An implementation of the 'AsyncFunction' that sends one non-blocking HTTP GET per
 * {@link Score} and completes the Flink {@code ResultFuture} from the HTTP callback.
 *
 * <p>The HTTP client is created in {@link #open(Configuration)} and released in
 * {@link #close()}. Every callback path completes the result future exactly once,
 * either with a {@code (Score, Student)} pair or exceptionally.
 */
class AsyncHttpRequest extends RichAsyncFunction<Score, Tuple2<Score, Student>> {

    // Static so the logger is not part of the function's serialized state.
    private static final Logger LOG = LoggerFactory.getLogger(AsyncHttpRequest.class);

    // Shared JSON parser; static so it is not part of the function's serialized state.
    private static final Gson GSON = new Gson();

    /** The asynchronous HTTP client that issues concurrent requests with callbacks. */
    private transient CloseableHttpAsyncClient client;

    // Connect timeout in ms (time allowed for the TCP three-way handshake).
    private int connectionTimeOut;
    // Socket timeout in ms (time allowed for the HTTP response to arrive).
    private int socketTimeOut;
    // Timeout for leasing a connection from the pool (unlimited by default; requests
    // wait here once all connections are in use).
    private int connectionRequestTimeOut = -1;
    // Total size of the connection pool.
    private int poolMaxTotal;
    // Maximum number of connections per host.
    private int maxPerRoute;
    // Keep-alive connections by default.
    private boolean keepAlive = true;

    /**
     * Creates the function.
     *
     * @param capacity          used as both the total pool size and the per-route limit; must be > 0
     * @param connectionTimeOut connect timeout in ms; must be > 0
     * @param socketTimeOut     socket (read) timeout in ms; must be > 0
     * @throws IllegalArgumentException if any argument is not positive
     */
    public AsyncHttpRequest(int capacity, int connectionTimeOut, int socketTimeOut) {
        Preconditions.checkArgument(capacity > 0);
        Preconditions.checkArgument(connectionTimeOut > 0);
        Preconditions.checkArgument(socketTimeOut > 0);
        this.maxPerRoute = this.poolMaxTotal = capacity;
        this.connectionTimeOut = connectionTimeOut;
        this.socketTimeOut = socketTimeOut;
    }

    /** Overrides the connect timeout (ms); returns {@code this} for chaining. */
    public AsyncHttpRequest setConnectionTimeout(int connectionTimeOut) {
        this.connectionTimeOut = connectionTimeOut;
        return this;
    }

    /** Overrides the socket timeout (ms); returns {@code this} for chaining. */
    public AsyncHttpRequest setSocketTimeout(int socketTimeOut) {
        this.socketTimeOut = socketTimeOut;
        return this;
    }

    /** Overrides the connection-lease timeout (ms); returns {@code this} for chaining. */
    public AsyncHttpRequest setConnectionRequestTimeOut(int connectionRequestTimeOut) {
        this.connectionRequestTimeOut = connectionRequestTimeOut;
        return this;
    }

    /** Overrides the total pool size; returns {@code this} for chaining. */
    public AsyncHttpRequest setPoolMaxTotal(int poolMaxTotal) {
        this.poolMaxTotal = poolMaxTotal;
        return this;
    }

    /** Overrides the per-route connection limit; returns {@code this} for chaining. */
    public AsyncHttpRequest setMaxPerRoute(int maxPerRoute) {
        this.maxPerRoute = maxPerRoute;
        return this;
    }

    /**
     * Builds and starts the asynchronous HTTP client from the configured timeouts and
     * pool limits.
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(connectionTimeOut)
                .setSocketTimeout(socketTimeOut)
                .setConnectionRequestTimeout(connectionRequestTimeOut)
                .build();

        // Configure the I/O reactor threads.
        IOReactorConfig ioReactorConfig = IOReactorConfig.custom()
                .setIoThreadCount(Runtime.getRuntime().availableProcessors())
                .setSoKeepAlive(keepAlive)
                .build();

        // Configure the connection pool size.
        ConnectingIOReactor ioReactor = new DefaultConnectingIOReactor(ioReactorConfig);
        PoolingNHttpClientConnectionManager connManager =
                new PoolingNHttpClientConnectionManager(ioReactor);
        connManager.setMaxTotal(poolMaxTotal);
        connManager.setDefaultMaxPerRoute(maxPerRoute);

        client = HttpAsyncClients.custom()
                .setConnectionManager(connManager)
                .setDefaultRequestConfig(requestConfig)
                .build();
        client.start();
    }

    /** Closes the HTTP client if {@link #open(Configuration)} created one. */
    @Override
    public void close() throws Exception {
        // open() may have failed before assigning the client; avoid an NPE in that case.
        if (client != null) {
            client.close();
        }
    }

    /**
     * Requests the {@link Student} for the given score's student id and completes
     * {@code resultFuture} from the HTTP callback.
     *
     * @param score        the input element; its {@code getStuId()} is used as the query parameter
     * @param resultFuture completed with a single {@code (score, student)} pair on HTTP 200,
     *                     or exceptionally on a non-200 status, a read/parse failure,
     *                     a failed request or a cancelled request
     */
    @Override
    public void asyncInvoke(Score score, final ResultFuture<Tuple2<Score, Student>> resultFuture) throws Exception {
        // Build the HTTP request here.
        final HttpGet httpGet = new HttpGet("http://localhost:12345?stu_id=" + score.getStuId());

        // Asynchronous client: the outcome is delivered through the callback.
        client.execute(httpGet, new FutureCallback<HttpResponse>() {
            @Override
            public void completed(HttpResponse httpResponse) {
                int statusCode = httpResponse.getStatusLine().getStatusCode();
                if (statusCode != 200) {
                    LOG.error("got wrong status code" + statusCode);
                    resultFuture.completeExceptionally(
                            new RuntimeException("got wrong status code" + statusCode));
                    return;
                }

                Student student;
                try {
                    // UTF-8 is only the fallback used when the response declares no charset.
                    String body = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
                    // NOTE: Gson returns null for an empty body, so student may be null here.
                    student = GSON.fromJson(body, Student.class);
                } catch (IOException | RuntimeException e) {
                    // RuntimeException covers malformed JSON and a missing entity; without it
                    // the exception would escape the callback and the future would never complete.
                    LOG.error("failed to parse http response into Student", e);
                    resultFuture.completeExceptionally(e);
                    // Do not fall through and complete the future a second time.
                    return;
                }

                resultFuture.complete(Collections.singleton(new Tuple2<>(score, student)));
            }

            @Override
            public void failed(Exception e) {
                LOG.error("failed to get httpresult", e);
                resultFuture.completeExceptionally(e);
            }

            @Override
            public void cancelled() {
                LOG.error("http request cancelled:" + httpGet);
                resultFuture.completeExceptionally(
                        new CancellationException("http request cancelled:" + httpGet));
            }
        });
    }
}
- 定义 scoreDataStream 流, 这里可以从 kafka 中读取
// Placeholder: obtain the stream of Score records here (for example by reading from Kafka).
DataStream<Score> scoreDataStream = ...
- AsyncDataStream.unorderedWait / AsyncDataStream.orderedWait 对 scoreDataStream 流进行处理
// Maximum number of elements each subtask may process asynchronously at the same time
int capacity = ...
// Maximum time (ms) an element may wait in the async I/O operator; if the client has not
// returned a result within this time, an exception is thrown
int timeout = ...
// Size of the HTTP connection pool of each subtask
int poolMaxTotal = ...
// HTTP connect timeout in ms (time allowed for the TCP three-way handshake)
int connectionTimeOut = ...
// Apply AsyncDataStream.unorderedWait/orderedWait to scoreDataStream to obtain mergedStream
DataStream<Tuple2<Score, Student>> mergedStream =
        AsyncDataStream.unorderedWait(
                scoreDataStream,
                // Constructor arguments: pool size, connect timeout, socket timeout.
                // The operator timeout is reused as the socket timeout here.
                new AsyncHttpRequest(poolMaxTotal, connectionTimeOut, timeout),
                timeout, TimeUnit.MILLISECONDS,
                // The last argument of unorderedWait is the operator's in-flight capacity.
                capacity);
// mergedStream can now be processed further
- 附 Score 和 Student 类
Score.java
/**
 * A score record: the student's name, the score value and the id of the student it
 * belongs to.
 *
 * <p>The public fields and the no-argument constructor are kept as-is; presumably the
 * streaming framework / JSON library relies on them (verify against callers).
 */
public class Score {
    public String name;
    public int score;
    public int stu_id;

    /** Creates an empty record with default field values. */
    public Score() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param name   the student's name
     * @param score  the score value
     * @param stu_id the id of the student this score belongs to
     */
    public Score(String name, int score, int stu_id) {
        this.name = name;
        this.score = score;
        this.stu_id = stu_id;
    }

    /** Returns the student's name. */
    public String getName() {
        return name;
    }

    /** Returns the score value. */
    public int getScore() {
        return score;
    }

    /** Returns the id of the student this score belongs to. */
    public int getStuId() {
        return stu_id;
    }

    /**
     * Returns a text form such as {@code Score{stu_id=7, name='Tom', score=90}}.
     * The numeric score is printed without quotes (the previous form opened a quote
     * before the score and never closed it).
     */
    @Override
    public String toString() {
        return "Score{" +
                "stu_id=" + stu_id +
                ", name='" + name + '\'' +
                ", score=" + score +
                '}';
    }
}
Student.java
/**
 * A student record with id, name, password and age, exposed through public fields
 * as well as getters and setters.
 */
public class Student {
    public int id;
    public String name;
    public String password;
    public int age;

    /** Creates an empty record with default field values. */
    public Student() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param id       the student id
     * @param name     the student's name
     * @param password the student's password
     * @param age      the student's age
     */
    public Student(int id, String name, String password, int age) {
        this.id = id;
        this.name = name;
        this.password = password;
        this.age = age;
    }

    /** Returns the student id. */
    public int getId() {
        return this.id;
    }

    /** Sets the student id. */
    public void setId(int id) {
        this.id = id;
    }

    /** Returns the student's name. */
    public String getName() {
        return this.name;
    }

    /** Sets the student's name. */
    public void setName(String name) {
        this.name = name;
    }

    /** Returns the student's password. */
    public String getPassword() {
        return this.password;
    }

    /** Sets the student's password. */
    public void setPassword(String password) {
        this.password = password;
    }

    /** Returns the student's age. */
    public int getAge() {
        return this.age;
    }

    /** Sets the student's age. */
    public void setAge(int age) {
        this.age = age;
    }

    /**
     * Returns a text form listing every field.
     * NOTE(review): the password is included in the output; be careful when logging.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Student{");
        text.append("id=").append(id);
        text.append(", name='").append(name).append('\'');
        text.append(", password='").append(password).append('\'');
        text.append(", age=").append(age);
        text.append('}');
        return text.toString();
    }
}