flink asyncio 理论与实现

weixin_45626756

已于 2023-07-09 09:05:15 修改

阅读量128

点赞数

分类专栏： flink 异步IO 文章标签： flink

于 2021-04-17 11:30:44 首次发布

本文链接：https://blog.csdn.net/weixin_45626756/article/details/115763416

版权

flink 同时被 2 个专栏收录

9 篇文章 1 订阅

订阅专栏

异步IO

2 篇文章 0 订阅

订阅专栏

异步IO

维表JOIN：
flink流处理，经常需要和外部系统交互，用维表补全事实表字段。
默认情况下，MapFunction中单个并行度只能用同步的方式去交互（请求外部存储、IO阻塞、等待请求返回、继续下一个请求），网络IO消耗大量时间。为了提高效率，可以增加MapFunction的并行度，但增加并行度就会消耗更多的资源，并不是很好的解决方案。
因此，出现了flink的Async IO，减少了整体网络等待的开销。
flink与外部数据建立连接时，使用asyncio异步访问数据库，提高写库性能。减少了整体等待外部交互系统的等待耗时，降低时延，提高系统吞吐量

在这里插入图片描述 flink中的异步IO分为两种，有序和无序，这里的有序和无序不是指写库的顺序，既然是异步写库，写库顺序就自然是无序的。主要是发送到下游OutPut的顺序，有序会按照接受顺序发送，无序就是谁先完成写库谁先发送到下游。

有序Ordered：
record异步写库时进入一个queue中，守护线程emitter不断地拉取头部已完成写库操作的数据，如果头部record还没有完成写库，就会阻塞，因此该方式效率会低一些。
在这里插入图片描述无序 Unordered
维护两个队列（UncompletedQueue、UnorderedElementQueue），record开始异步写库时进入UncompletedQueue，写库完成后进入UnorderedElementQueue，Emitter守护线程不断地拉取UnorderedElementQueue中的record。
注意：
1、外部数据源必须支持异步客户端，如果客户端是线程安全的(多个客户端可以一起使用)，可以不加transient关键字。否则，最好加上transient，不对其进行初始化，在open方法中，为每一个TaskManager初始化一个客户端。（以下示例中，数据库连接虽然没加transient修饰，但是连接通过DbManager类维护，DbManager中的连接使用了static修饰，每个TaskManager中只有唯一一个connection。）
2、TimeOut参数控制请求最长等待时间。默认，异步请求超时时，会引发异常重启或停止作业，如果要处理超时，可以重写AsyncFunction#timeout方法
3、Capacity参数控制请求并发数(默认100)，到达上限会触发反压
4、Async IO可以和缓存结合起来，减少请求外部存储的次数，提高效率

/**
 * Wraps {@code dstream} in an unordered async operator that runs the configured transform.
 *
 * <p>The transform defaults to {@code DefaultTransform}. When {@code getFunc()} returns a non-blank
 * class name, that class is loaded reflectively and instantiated through its no-argument constructor
 * instead.
 *
 * @param dstream   the input stream
 * @param jobConfig job configuration handed to the async function
 * @return the async-transformed stream, named after {@code getName()}
 * @throws Exception if the configured class cannot be loaded, is not an {@code AdamTransformFunc},
 *                   or cannot be instantiated
 */
public DataStream<AdamData> apply(DataStream<AdamData> dstream, JobConfig jobConfig) throws Exception {
        AdamTransformFunc adamTransformFunc = new DefaultTransform();
        String className = getFunc();
        if (StringUtils.isNotBlank(className)) {
            // asSubclass verifies the type up front (no unchecked cast), and
            // getDeclaredConstructor().newInstance() replaces the deprecated Class#newInstance().
            adamTransformFunc = Class.forName(className)
                    .asSubclass(AdamTransformFunc.class)
                    .getDeclaredConstructor()
                    .newInstance();
        }
        return AsyncDataStream.unorderedWait(dstream, new AdamAsync(jobConfig, this, adamTransformFunc),
                timeout, TimeUnit.MILLISECONDS).name(getName());
    }

/**
 * Async operator that runs an {@code AdamTransformFunc} for each record on a private thread pool.
 *
 * <p>Not every external store offers an asynchronous client, so instead of requiring one this class
 * submits the (possibly blocking) transform to an {@link ExecutorService} through
 * {@link CompletableFuture} and reports the outcome to Flink's {@code ResultFuture}.
 *
 * <p>Every invocation completes its {@code ResultFuture} exactly once: with the transformed records,
 * with an empty collection when there is nothing to emit, or exceptionally when the transform fails.
 */
public class AdamAsync extends RichAsyncFunction<AdamData, AdamData> {

    private static final long serialVersionUID = -2718020292411805149L;

    private JobConfig jobConfig;
    private AsyncConfig operator;
    private AdamTransformFunc adamTransformFunc;
    /** Created in {@link #open}; transient so it is not serialized together with the function. */
    private transient ExecutorService threadPool;

    /**
     * @param jobConfig         job configuration passed to logging, {@code DbManager} and the transform
     * @param operator          operator configuration (supplies the thread count and the stage name)
     * @param adamTransformFunc the transform executed for every record
     */
    public AdamAsync(JobConfig jobConfig, AsyncConfig operator, AdamTransformFunc adamTransformFunc) {
        this.jobConfig = jobConfig;
        this.operator = operator;
        this.adamTransformFunc = adamTransformFunc;
    }

    /**
     * Creates the worker pool, initializes logging and the shared connections, then opens the transform.
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        threadPool = Executors.newFixedThreadPool(operator.getThreadNum());
        Log.init(jobConfig);
        DbManager.init(jobConfig);
        if (adamTransformFunc != null) {
            adamTransformFunc.open(jobConfig, operator);
        }
    }

    /**
     * Releases everything acquired in {@link #open}.
     *
     * <p>Tolerates a partially completed {@code open} (null pool or null transform) and always
     * delegates to {@code super.close()}, even when an earlier step throws.
     */
    @Override
    public void close() throws Exception {
        try {
            if (threadPool != null) {
                threadPool.shutdown();
            }
            DbManager.close();
            if (adamTransformFunc != null) {
                adamTransformFunc.close();
            }
        } finally {
            super.close();
        }
    }

    /**
     * Submits the transform of {@code input} to the thread pool and completes {@code resultFuture}
     * with its outcome.
     *
     * @param input        the incoming record; {@code null} yields an empty result
     * @param resultFuture completed with the transformed records, an empty collection, or the failure
     */
    @Override
    public void asyncInvoke(AdamData input, ResultFuture<AdamData> resultFuture) throws Exception {
        if (input == null) {
            // Still complete the future: an uncompleted future keeps its capacity slot until it times out.
            resultFuture.complete(java.util.Collections.<AdamData>emptyList());
            return;
        }
        CompletableFuture
                .supplyAsync(() -> transformCopy(input), threadPool)
                .whenComplete((result, error) -> {
                    // The future must always be completed, otherwise the record ends up as a timeout.
                    if (error != null) {
                        resultFuture.completeExceptionally(error);
                    } else {
                        resultFuture.complete(result);
                    }
                });
    }

    /**
     * Runs the transform on a deep copy of {@code input} and stamps the stage end time on each output.
     *
     * @return the non-null transformed records; empty when the transform returned {@code null}
     */
    private Collection<AdamData> transformCopy(AdamData input) {
        // Deep-copy the input: mutating the original inside the transform occasionally
        // caused checkpoint failures.
        AdamData inputClone = new AdamData();
        inputClone.data = JSONObject.parseObject(input.data.toJSONString());
        if (input.logCtx != null) {
            inputClone.logCtx = input.logCtx.clone();
            inputClone.logCtx.setCurrentStage(operator.getName(), null);
        }
        // The transform may talk to an external database; running it here keeps the task thread free.
        Collection<AdamData> result = adamTransformFunc.transform(inputClone, jobConfig, operator);
        if (result == null) {
            return java.util.Collections.emptyList();
        }
        return result.stream().filter(record -> record != null).map(record -> {
            if (record.logCtx != null) {
                record.logCtx.setCurrentStageEndTime();
            }
            return record;
        }).collect(Collectors.toList());
    }

}

一个TaskManager的所有task共用一个connection

/**
 * Process-wide registry of HBase connections and Redis pools, shared by every operator that calls
 * {@link #init}.
 *
 * <p>Resources are reference counted: each {@link #init} call takes one reference and each
 * {@link #close} call gives one back. The connections and pools are closed when the last reference
 * is released. All lifecycle methods synchronize on the class monitor.
 */
@Data
@Slf4j
public class DbManager {
    private static final Map<String, Connection> hbaseMap = new ConcurrentHashMap<>();
    private static final Map<String, JedisPool> redisMap = new ConcurrentHashMap<>();
    private static int hbaseRefCount = 0;
    private static int redisRefCount = 0;

    private static final int DEFAULT_POOL_SIZE = 10;

    /**
     * Looks up the shared HBase connection registered under {@code name}.
     *
     * @return the connection, or {@code null} (after logging an error) when none is registered
     */
    public static Connection getHbaseConnection(String name) {
        Connection conn = hbaseMap.get(name);
        if (conn == null) {
            log.error("hbase connector error, connector={}", name);
        }
        return conn;
    }

    /**
     * Looks up the shared Redis pool registered under {@code name}.
     *
     * @return the pool, or {@code null} (after logging an error) when none is registered
     */
    public static JedisPool getJedisPool(String name) {
        JedisPool pool = redisMap.get(name);
        if (pool == null) {
            log.error("redis connector error, connector={}", name);
        }
        return pool;
    }

    /** Releases one reference on both the HBase and the Redis resources. */
    public static synchronized void close() {
        closeHbase();
        closeRedis();
    }

    /**
     * Releases one HBase reference and closes every connection once no reference remains.
     *
     * <p>The counter is decremented first and then checked, so the close that balances the last
     * {@code init} is the one that releases the connections. A surplus close is a no-op.
     */
    private static synchronized void closeHbase() {
        if (hbaseRefCount > 0) {
            hbaseRefCount--;
        }
        if (hbaseRefCount > 0) {
            // Other operator threads still share these connections.
            return;
        }
        for (Connection conn : hbaseMap.values()) {
            try {
                conn.close();
            } catch (IOException e) {
                // Best effort: keep closing the remaining connections, but leave a trace.
                log.warn("failed to close hbase connection", e);
            }
        }
        hbaseMap.clear();
    }

    /**
     * Releases one Redis reference and closes every pool once no reference remains.
     *
     * <p>Uses the same decrement-then-check ordering as the HBase counterpart.
     */
    private static synchronized void closeRedis() {
        if (redisRefCount > 0) {
            redisRefCount--;
        }
        if (redisRefCount > 0) {
            // Other operator threads still share these pools.
            return;
        }
        for (JedisPool pool : redisMap.values()) {
            pool.close();
        }
        redisMap.clear();
    }

    /**
     * Takes one reference and creates any connection or pool from {@code jobConfig} that is not
     * registered yet.
     *
     * @param jobConfig job configuration; {@code null} is ignored and takes no reference
     * @throws IOException if an HBase connection cannot be created
     */
    public static synchronized void init(JobConfig jobConfig) throws IOException {
        if (jobConfig == null) {
            return;
        }
        initHbase(jobConfig.getHbaseConnectorConfigMap());
        initRedis(jobConfig.getRedisConnectorConfigMap());
    }

    /** Takes one HBase reference and registers every configured connection that is still missing. */
    private static synchronized void initHbase(Map<String, HbaseConnectorConfig> configMap) throws IOException {
        hbaseRefCount++;
        if (configMap == null) {
            return;
        }
        for (HbaseConnectorConfig config : configMap.values()) {
            addHbase(config);
        }
    }

    /** Takes one Redis reference and registers every configured pool that is still missing. */
    private static synchronized void initRedis(Map<String, RedisConnectorConfig> configMap) throws IOException {
        redisRefCount++;
        if (configMap == null) {
            return;
        }
        for (RedisConnectorConfig config : configMap.values()) {
            addRedis(config);
        }
    }

    /** Registers a connection for {@code config} unless one already exists under its name. */
    private static void addHbase(HbaseConnectorConfig config) throws IOException {
        if (hbaseMap.containsKey(config.getName())) {
            return;
        }
        hbaseMap.put(config.getName(), createHbaseConnection(config));
    }

    /**
     * Builds a new HBase connection from {@code config}.
     *
     * <p>Uses the configured pool size when it is numeric and {@code DEFAULT_POOL_SIZE} otherwise.
     *
     * @throws IOException if the connection cannot be created
     */
    public static Connection createHbaseConnection(HbaseConnectorConfig config) throws IOException {
        // http://hbase.apache.org/book.html#architecture.client
        // Per the official documentation a Connection is heavyweight, while Table, Admin and
        // RegionLocator are lightweight and should be created on demand.
        Configuration conf = HBaseConfiguration.create();
        conf.set(HConstants.ZOOKEEPER_QUORUM, config.getQuorum());
        conf.set(HConstants.ZOOKEEPER_CLIENT_PORT, config.getClientPort());
        if (StringUtils.isNumeric(config.getPoolSize())) {
            conf.set(HConstants.HBASE_CLIENT_IPC_POOL_SIZE, config.getPoolSize());
        } else {
            conf.set(HConstants.HBASE_CLIENT_IPC_POOL_SIZE, String.valueOf(DEFAULT_POOL_SIZE));
        }
        conf.set(HConstants.HBASE_CLIENT_RETRIES_NUMBER, "10");
        conf.set(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, "600000");
        // The Hadoop account and password have to be configured in the woater task.
        // conf.set("hbase.user.name", config.getUserName());
        // conf.set("hbase.user.password", config.getUserPassword());

        // In HBase 1.0 and later, HTable is deprecated in favor of Table. Table does not use autoflush. To do buffered writes, use the BufferedMutator class.
        return ConnectionFactory.createConnection(conf);
    }

    /**
     * Registers a Redis pool for {@code config} unless one already exists under its name.
     *
     * <p>Max-total and max-idle use the configured pool size or {@code DEFAULT_POOL_SIZE}; min-idle
     * uses the configured pool size or 0.
     */
    private static void addRedis(RedisConnectorConfig config) {
        if (redisMap.containsKey(config.getName())) {
            return;
        }
        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        jedisPoolConfig.setMaxTotal(config.getPoolSize() == null ? DEFAULT_POOL_SIZE : config.getPoolSize());
        jedisPoolConfig.setMaxIdle(config.getPoolSize() == null ? DEFAULT_POOL_SIZE : config.getPoolSize());
        jedisPoolConfig.setMinIdle(config.getPoolSize() == null ? 0 : config.getPoolSize());
        // jedisPoolConfig.setTestOnBorrow(true);
        jedisPoolConfig.setTimeBetweenEvictionRunsMillis(-1);
        JedisPool jedisPool = new JedisPool(jedisPoolConfig, config.getIp(), config.getPort());
        redisMap.put(config.getName(), jedisPool);
    }

}