Tech learning — using flink-cdc to feed parameter values into a processFunction, so that computation rules can be modified dynamically
1. Problem Analysis
At work we regularly run into requirements where certain parameters of a Flink job's computation rules must be adjustable at runtime. A typical example: if a user's login failures within a few consecutive seconds exceed a certain count, an alert should be raised, and both the time window and the count threshold must be configurable from a web page. The usual solution is a BroadcastStream; this post offers a new approach.
2. Implementation Approach
This time we use flink-cdc to solve the problem. First we define a local cache that holds the computation rules and parameters stored in the database. We then start a flink-cdc source to sync the database rows into the local cache, and every computation fetches its parameters from that cache. Whenever the database content is modified, the change is synced into local memory, and the computation parameters change accordingly.
3. Implementation
Database setup
/*
Navicat Premium Data Transfer
Source Server : 10.10.0.234
Source Server Type : MySQL
Source Server Version : 50738
Source Host : 10.10.0.234:3306
Source Schema : test
Target Server Type : MySQL
Target Server Version : 50738
File Encoding : 65001
Date: 25/08/2022 11:05:31
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for sys_rule
-- ----------------------------
DROP TABLE IF EXISTS `sys_rule`;
CREATE TABLE `sys_rule` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`rule` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of sys_rule
-- ----------------------------
INSERT INTO `sys_rule` VALUES (1, '{\"time\":5,\"count\":3}');
SET FOREIGN_KEY_CHECKS = 1;
Project structure
Dependency configuration
Parent POM dependencies
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.13.6</flink.version>
    <scala.binary.version>2.12</scala.binary.version>
    <target.java.version>1.8</target.java.version>
    <maven.compiler.source>${target.java.version}</maven.compiler.source>
    <maven.compiler.target>${target.java.version}</maven.compiler.target>
    <slf4j.version>1.7.7</slf4j.version>
    <log4j.version>1.2.17</log4j.version>
    <lombok.version>1.18.6</lombok.version>
    <hutool.version>5.6.6</hutool.version>
    <fastjson.version>1.2.25</fastjson.version>
    <kafka.version>2.8.0</kafka.version>
</properties>
<dependencies>
    <!-- Apache Flink dependencies -->
    <!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>${slf4j.version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>${log4j.version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>${lombok.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>${hutool.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>${fastjson.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_${scala.binary.version}</artifactId>
        <version>${kafka.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
Module dependencies
<properties>
    <mysql.version>8.0.20</mysql.version>
</properties>
<dependencies>
    <dependency>
        <groupId>com.ververica</groupId>
        <artifactId>flink-connector-mysql-cdc</artifactId>
        <version>2.1.1</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>${mysql.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
Writing the local cache
package com.learn.flink.cdc.cache;

import com.learn.flink.cdc.bean.RuleEntity;
import lombok.extern.slf4j.Slf4j;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

@Slf4j
public class CacheManager {

    // Thread-safe map: rule id -> rule entity, kept in sync with the database by flink-cdc
    private static final Map<Integer, RuleEntity> ruleEntityMap = new ConcurrentHashMap<>();

    public static RuleEntity getRuleById(Integer id) {
        return ruleEntityMap.get(id);
    }

    // Insert or update a rule (called by the CDC sink)
    public static void setRuleEntityMap(RuleEntity ruleEntity) {
        ruleEntityMap.put(ruleEntity.getId(), ruleEntity);
    }

    // Remove a rule when the corresponding row is deleted
    public static void deleteRule(RuleEntity ruleEntity) {
        ruleEntityMap.remove(ruleEntity.getId());
    }

    public static void printRuleEntityMap() {
        log.info(ruleEntityMap.toString());
    }
}
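The RuleEntity bean is not reproduced in this post; from its usage (getId(), getRule()) it simply mirrors one row of the sys_rule table. A minimal sketch, assuming Lombok as in the rest of the project:

package com.learn.flink.cdc.bean;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

// Mirrors one row of sys_rule: the primary key plus the JSON rule string
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RuleEntity {
    private Integer id;
    private String rule; // e.g. {"time":5,"count":3}
}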
Starting flink-cdc
// Configure the flink-cdc source
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
        .hostname("10.10.0.234")
        .port(3306)
        .databaseList("test") // database(s) to subscribe to
        .tableList("test.sys_rule")
        .username("root")
        .password("Mysql@123")
        .deserializer(new FormatSchema())
        .build();
DataStreamSink<String> mySQL_source = env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
        .setParallelism(1)
        // flush every change event into the local cache
        .addSink(new RefreshLocalCacheSink());
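FormatSchema and RefreshLocalCacheSink are project classes that are not reproduced here (see the repository link at the end). As a rough sketch of what the sink has to do, assuming FormatSchema flattens each Debezium change event into JSON of the form {"op":"...","data":{"id":1,"rule":"..."}} (the exact shape in the repository may differ):

package com.learn.flink.cdc.sink;

import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.learn.flink.cdc.bean.RuleEntity;
import com.learn.flink.cdc.cache.CacheManager;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

public class RefreshLocalCacheSink extends RichSinkFunction<String> {
    @Override
    public void invoke(String value, Context context) {
        // Assumed payload: {"op":"c|u|r|d","data":{"id":1,"rule":"{\"time\":5,\"count\":3}"}}
        JSONObject record = JSONUtil.parseObj(value);
        JSONObject data = record.getJSONObject("data");
        RuleEntity ruleEntity = new RuleEntity(data.getInt("id"), data.getStr("rule"));
        if ("d".equals(record.getStr("op"))) {
            // row deleted -> evict from the cache
            CacheManager.deleteRule(ruleEntity);
        } else {
            // snapshot read / insert / update -> upsert into the cache
            CacheManager.setRuleEntityMap(ruleEntity);
        }
        CacheManager.printRuleEntityMap();
    }
}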
Configuring the Flink job
// 1. Read the login events from Kafka
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", "10.10.0.234:9092");
properties.setProperty("group.id", "consumer");
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("auto.offset.reset", "latest");
DataStream<String> inputStream = env.addSource(new FlinkKafkaConsumer<String>("login", new SimpleStringSchema(), properties));
inputStream.print("input");

// 2. Parse each CSV line into a LoginEvent and assign event-time watermarks
DataStream<LoginEvent> loginEventStream = inputStream
        .map(line -> {
            String[] fields = line.split(",");
            return LoginEvent.builder()
                    .userId(Long.parseLong(fields[0]))
                    .ip(fields[1])
                    .loginState(fields[2])
                    .timestamp(Long.parseLong(fields[3]))
                    .build();
        })
        .assignTimestampsAndWatermarks(WatermarkStrategy
                .<LoginEvent>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<LoginEvent>() {
                    @Override
                    public long extractTimestamp(LoginEvent element, long recordTimestamp) {
                        return element.getTimestamp() * 1000L; // seconds -> milliseconds
                    }
                }));
loginEventStream.print("login_event");

// 3. Custom process function that detects consecutive login failures
SingleOutputStreamOperator<LoginFailWarning> warningStream = loginEventStream
        .keyBy(LoginEvent::getUserId)
        .process(new LoginFailDetectWarningCDC());
warningStream.print();

env.execute("login fail detect job");
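For local testing, messages on the `login` topic are plain comma-separated lines in the order userId,ip,loginState,timestamp (timestamp in seconds), matching the parser above. The following input is hypothetical; under the default rule {"time":5,"count":3} the three failures of user 1035 produce a warning once the last line has pushed the event-time watermark past the registered timer:

1035,83.149.11.115,fail,1558430842
1035,83.149.24.26,fail,1558430843
1035,83.149.30.92,fail,1558430844
1036,83.149.9.10,success,1558430850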
Core detection logic
package com.learn.flink.cdc.function;

import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.learn.flink.cdc.bean.LoginEvent;
import com.learn.flink.cdc.bean.LoginFailWarning;
import com.learn.flink.cdc.bean.RuleEntity;
import com.learn.flink.cdc.cache.CacheManager;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.collect.Lists;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.ArrayList;

@Slf4j
public class LoginFailDetectWarningCDC extends KeyedProcessFunction<Long, LoginEvent, LoginFailWarning> {

    // State: all login-fail events within the configured time window
    ListState<LoginEvent> loginFailEventListState;
    // State: timestamp of the registered timer
    ValueState<Long> timerTsState;
    // State: window length (seconds) in effect for the current timer
    ValueState<Long> timeState;

    @Override
    public void open(Configuration parameters) throws Exception {
        loginFailEventListState = getRuntimeContext().getListState(new ListStateDescriptor<LoginEvent>("login-fail-list", LoginEvent.class));
        timerTsState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("timer-ts", Long.class));
        timeState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("time", Long.class));
    }

    @Override
    public void processElement(LoginEvent value, Context ctx, Collector<LoginFailWarning> out) throws Exception {
        // Fetch the rule's time window from the local cache
        RuleEntity ruleEntity = CacheManager.getRuleById(1);
        log.info(ruleEntity.toString());
        JSONObject jsonObject = JSONUtil.parseObj(ruleEntity.getRule());
        Long time = jsonObject.getLong("time");
        // Check the type of the current login event
        if ("fail".equals(value.getLoginState())) {
            // 1. Failure event: append it to the list state
            loginFailEventListState.add(value);
            // If no timer is registered yet, register one `time` seconds later
            if (timerTsState.value() == null) {
                Long ts = (value.getTimestamp() + time) * 1000L;
                ctx.timerService().registerEventTimeTimer(ts);
                timerTsState.update(ts);
                timeState.update(time);
            }
        } else {
            // 2. Successful login: delete the timer, clear all state and start over
            if (timerTsState.value() != null) {
                ctx.timerService().deleteEventTimeTimer(timerTsState.value());
            }
            loginFailEventListState.clear();
            timerTsState.clear();
            timeState.clear();
        }
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<LoginFailWarning> out) throws Exception {
        log.info("timer fired");
        // The timer fired, i.e. no successful login arrived within the window; count the failures in the list state
        ArrayList<LoginEvent> loginFailEvents = Lists.newArrayList(loginFailEventListState.get());
        Integer failTimes = loginFailEvents.size();
        // Fetch the rule's maximum allowed failure count from the local cache
        RuleEntity ruleEntity = CacheManager.getRuleById(1);
        JSONObject jsonObject = JSONUtil.parseObj(ruleEntity.getRule());
        Integer maxFailTimes = jsonObject.getInt("count");
        if (failTimes >= maxFailTimes) {
            // The configured maximum was exceeded: emit a warning
            out.collect(LoginFailWarning.builder()
                    .userId(ctx.getCurrentKey())
                    .firstFailTime(loginFailEvents.get(0).getTimestamp())
                    .lastFailTime(loginFailEvents.get(failTimes - 1).getTimestamp())
                    .warningMsg("login fail in " + timeState.value() + "s for " + failTimes + " times, which is more than the configured " + maxFailTimes)
                    .build());
        }
        // Clear all state
        loginFailEventListState.clear();
        timerTsState.clear();
        timeState.clear();
    }
}
Results
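While the job is running, editing the row in sys_rule is picked up by flink-cdc and immediately changes the detection behaviour, with no job restart. For example, to widen the window to 10 seconds and raise the threshold to 5 failures:

UPDATE `sys_rule` SET `rule` = '{"time":10,"count":5}' WHERE `id` = 1;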
4. Known Issues
1. The project relies on flink-cdc, so you need to enable the MySQL binlog and configure which tables to sync.
2. The implementation runs two pipelines in parallel: the flink-cdc stream and the Flink job. If the Flink job starts processing before flink-cdc has finished initializing the cache, it throws an error and the job fails, so an initialization step is needed before the Flink job starts; the project does not provide one yet (a possible workaround is sketched after this list).
3. The local cache must be thread-safe, otherwise data inconsistencies will occur. Higher degrees of parallelism have not been tested, so it is unclear what problems may show up there; feel free to experiment yourself.
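A possible workaround for issue 2 (not part of the project): warm the cache up with a plain JDBC query before calling env.execute(). This sketch reuses the hypothetical all-args constructor of RuleEntity from above, and it only helps when the warm-up code, the CDC sink and the process function share a single JVM (e.g. a local mini-cluster run), which is also the limitation behind issue 3:

import com.learn.flink.cdc.bean.RuleEntity;
import com.learn.flink.cdc.cache.CacheManager;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

// Hypothetical warm-up step (not in the repository): preload all rules via JDBC
// so that CacheManager is already populated when the first event arrives.
public class RuleCacheInitializer {
    public static void preload() throws SQLException {
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://10.10.0.234:3306/test", "root", "Mysql@123");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT id, rule FROM sys_rule")) {
            while (rs.next()) {
                // Same upsert path the CDC sink uses
                CacheManager.setRuleEntityMap(new RuleEntity(rs.getInt("id"), rs.getString("rule")));
            }
        }
    }
}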
Project code: https://gitee.com/hdzxy/learn-flink.git