Integrating Flink with Kafka and Exposing Evaluation State via Queryable State
First, add two getter methods to the utility class to extract the application name and the login user name from a log line; concatenated, the two values serve as the state key.
For details on this utility class, see the earlier post 《接风控责任链之测试与数据抽取工具类》 (the risk-control chain-of-responsibility tests and the data-extraction utility class).
//Extract the application name of the login record (group 2 of LEGAL_PATTERN).
//Note: despite its name, this method returns the application name; the state key
//"张三:QQ" built downstream relies on this behavior, so the names are kept as-is.
public static String getUserIdentify(String input){
    Matcher matcher = EvaluateUtil.LEGAL_PATTERN.matcher(input);
    if(matcher.find()){
        return matcher.group(2);
    }
    return null;
}
//Extract the login user name (group 4 of LEGAL_PATTERN).
public static String getApplicationName(String input){
    Matcher matcher = EvaluateUtil.LEGAL_PATTERN.matcher(input);
    if(matcher.find()){
        return matcher.group(4);
    }
    return null;
}
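As a quick sanity check, the getters can be run against one of the SUCCESS lines from the test data at the end of this post. This is a hedged sketch: the expected outputs assume LEGAL_PATTERN captures the application name in group 2 and the user name in group 4, as the code above implies; adjust if your pattern's groups are ordered differently.

import com.baizhi.until.EvaluateUtil

object GetterSanityCheck {
  def main(args: Array[String]): Unit = {
    //Log line taken from the test section below (user agent shortened here; use the complete line when running)
    val line = "INFO 2020-04-03 10:12:01 QQ SUCCESS [张三] 6ebaf4ac780f40f486359f3ea6934620 \"103158\" jinan \"116.4,39.5\" [1250,14000,2000] \"Mozilla/5.0 ...\""
    println(EvaluateUtil.getUserIdentify(line))    //expected: QQ (the application name, despite the method name)
    println(EvaluateUtil.getApplicationName(line)) //expected: 张三 (the user name)
  }
}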
Next, add the previously built evaluation-model project as a dependency of this Flink integration project:
<!-- The evaluation model built in the earlier posts -->
<dependency>
    <groupId>com.baizhi</groupId>
    <artifactId>EvaluateModel</artifactId>
    <version>1.0-SNAPSHOT</version>
</dependency>
Other dependencies are added as needed:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.2.0</version>
</dependency>
<build>
    <plugins>
        <!-- Scala compiler plugin -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>4.0.1</version>
            <executions>
                <execution>
                    <id>scala-compile-first</id>
                    <phase>process-resources</phase>
                    <goals>
                        <goal>add-source</goal>
                        <goal>compile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <!-- Shade plugin to build the fat jar -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <!-- Java compiler plugin -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
            <executions>
                <execution>
                    <phase>compile</phase>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Integrating the Flink compute engine and connecting to Kafka
import java.util.Properties
import com.baizhi.enties.{EvaluateReport, HistoryData}
import com.baizhi.evaluate.impl.{AreaEvaluate, DeviceEvaluate, InputFeatureEvaluate, SimilarityEvaluate, SpeedEvaluate, TimeSlotEvaluate, TotalEvaluate}
import com.baizhi.evaluate.{Evaluate, EvaluateChain}
import com.baizhi.until.EvaluateUtil
import com.baizhi.update.impl.{GeoPointsUpdate, HistoryCities, HistoryDeviceInformations, HistoryLoginTime, HistoryLoginTimeSlot, HistoryOrdernessPasswords, LatestInputFeatures}
import com.baizhi.update.{Updater, UpdaterChain}
import org.apache.flink.api.common.functions.{ReduceFunction, RichMapFunction}
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.StateTtlConfig.{StateVisibility, UpdateType}
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, ReducingState, ReducingStateDescriptor, StateTtlConfig, ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.time.Time
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer}
import java.util.{ArrayList => JList}
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchema
import org.codehaus.jackson.map.ObjectMapper
object UserLoginEvaluateTopology {
  def main(args: Array[String]): Unit = {
    //1. Create the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //Checkpoint every 5s with exactly-once semantics
    env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE)
    //Checkpoint timeout: 4s
    env.getCheckpointConfig.setCheckpointTimeout(4000)
    //At least 2s between the end of one checkpoint and the start of the next; takes precedence over the checkpoint interval
    env.getCheckpointConfig.setMinPauseBetweenCheckpoints(2000)
    //Fail the job on any checkpoint failure (equivalent to setFailOnCheckpointingErrors(true))
    env.getCheckpointConfig.setTolerableCheckpointFailureNumber(0)
    //What happens to checkpoint data when the job is cancelled:
    //RETAIN_ON_CANCELLATION: keep the checkpoint data unless the job was cancelled with --savepoint
    //DELETE_ON_CANCELLATION: delete checkpoints on cancellation (not recommended)
    env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    //2. Set up the Kafka consumer properties
    val props = new Properties()
    props.setProperty("bootstrap.servers", "SparkTwo:9092") //Kafka brokers
    props.setProperty("group.id", "g1") //consumer group
    //3. Read the source (topic, deserializer, consumer properties)
    val lines = env.addSource(new FlinkKafkaConsumer[String]("userlogin", new SimpleStringSchema(), props))
    //Sink that writes evaluation reports back to Kafka
    val kafkaSink = new FlinkKafkaProducer[EvaluateReport]("default_topic",
      new UserDefinedKeyedSerializationSchema, props, Semantic.AT_LEAST_ONCE)
    //4. Apply the DataStream transformations
    lines.filter(EvaluateUtil.isLegal(_))
      .map(line =>
        (EvaluateUtil.getApplicationName(line) + ":" + EvaluateUtil.getUserIdentify(line), line)
      )
      .keyBy(t => t._1) //the tuple's first element is both the stream key and the KeyedState key
      .map(new UserHistoryDataMapFunction)
      .filter(_.getApplicationName != null)
      .addSink(kafkaSink)
    //5. Execute the streaming job
    env.execute("UserLoginEvaluateTopology")
  }
  //Serialization schema that routes reports to Kafka topics (see the note after the topology for the non-deprecated alternative)
  class UserDefinedKeyedSerializationSchema extends KeyedSerializationSchema[EvaluateReport] {
    override def serializeKey(t: EvaluateReport): Array[Byte] = null
    override def serializeValue(t: EvaluateReport): Array[Byte] = t.toString.getBytes()
    override def getTargetTopic(t: EvaluateReport): String = {
      if (t.getApplicationName.contains("QQ")) {
        return "topic_qq"
      } else if (t.getApplicationName.contains("wx")) {
        return "topic_wx"
      }
      "topic_all"
    }
  }
  class UserHistoryDataMapFunction extends RichMapFunction[(String, String), EvaluateReport] {
    //Holds the user's history state as a JSON String, so the object travels as JSON and stays decoupled from Flink's type system
    var historyDataState: ValueState[String] = _
    var evaluateReportState: MapState[String, String] = _
    //Counts the user's logins within one day
    var numLoginCountState: ReducingState[Int] = _
    var updates: JList[Updater] = _
    var evaluates: JList[Evaluate] = _
    override def open(parameters: Configuration): Unit = {
      //Descriptor for the history state
      val historyDataDescriptor = new ValueStateDescriptor[String]("HistoryData", createTypeInformation[String])
      //Make the history state queryable
      historyDataDescriptor.setQueryable("queryHistoryData")
      //Descriptor for the evaluation-report state
      val evaluateReportStateDescriptor = new MapStateDescriptor[String, String]("EvaluateReport", createTypeInformation[String], createTypeInformation[String])
      //Make the evaluation state queryable
      evaluateReportStateDescriptor.setQueryable("queryEvaluateReport")
      //Descriptor for the daily login counter
      val numLoginCountDescriptor = new ReducingStateDescriptor[Int]("numLoginCount", new ReduceFunction[Int] {
        override def reduce(value1: Int, value2: Int): Int = value1 + value2
      }, createTypeInformation[Int])
      //Enable the TTL feature on numLoginCount, scoped to one day
      val ttlConfig = StateTtlConfig.newBuilder(Time.days(1))
        .cleanupFullSnapshot() //drop expired entries when taking full snapshots
        .cleanupInRocksdbCompactFilter(1000) //cleanup strategy for the RocksDB backend
        .setUpdateType(UpdateType.OnCreateAndWrite) //the TTL timer is refreshed only on writes
        .setStateVisibility(StateVisibility.NeverReturnExpired) //never return expired values
        .build()
      //Attach the TTL config
      numLoginCountDescriptor.enableTimeToLive(ttlConfig)
      historyDataState = getRuntimeContext.getState(historyDataDescriptor) //history state
      numLoginCountState = getRuntimeContext.getReducingState(numLoginCountDescriptor) //daily login count
      evaluateReportState = getRuntimeContext.getMapState(evaluateReportStateDescriptor) //evaluation state
      //Build the update chain for the history data
      updates = new JList[Updater]()
      updates.add(new GeoPointsUpdate)
      updates.add(new HistoryCities)
      updates.add(new HistoryDeviceInformations)
      updates.add(new HistoryLoginTime)
      updates.add(new HistoryLoginTimeSlot)
      updates.add(new HistoryOrdernessPasswords)
      updates.add(new LatestInputFeatures)
      //Build the evaluation chain
      evaluates = new JList[Evaluate]()
      evaluates.add(new AreaEvaluate)
      evaluates.add(new DeviceEvaluate)
      evaluates.add(new InputFeatureEvaluate)
      evaluates.add(new SimilarityEvaluate(0.9))
      evaluates.add(new SpeedEvaluate(750.0))
      evaluates.add(new TimeSlotEvaluate(1))
      evaluates.add(new TotalEvaluate(2))
    }
    override def map(value: (String, String)): EvaluateReport = {
      //Fetch the user's history state
      val historyDataJson = historyDataState.value()
      //Jackson handles (de)serialization, so other applications can read the same stored state
      val mapper = new ObjectMapper()
      var historyData: HistoryData = null
      if (historyDataJson == null) {
        historyData = new HistoryData()
      } else {
        //Deserialize with Jackson
        historyData = mapper.readValue(historyDataJson, classOf[HistoryData])
      }
      //Empty fallback report; its applicationName is null, so the downstream filter discards it
      val emptyReport = new EvaluateReport()
      //Branch on the log record type
      if (EvaluateUtil.isLoginSuccess(value._2)) { //SUCCESS record: update historyData
        //Parse the record
        val loginSuccessData = EvaluateUtil.parseLoginSuccessData(value._2)
        val updaterChain = new UpdaterChain(updates)
        updaterChain.doChain(loginSuccessData, historyData)
        //Count this login towards the daily total
        numLoginCountState.add(1)
        //Serialize the updated history back into the state
        historyDataState.update(mapper.writeValueAsString(historyData))
      } else if (EvaluateUtil.isEvaluate(value._2)) { //EVALUATE record: run the evaluation chain
        val evaluateData = EvaluateUtil.parseEvaluateData(value._2)
        //Create a report
        val evaluateReport = new EvaluateReport(evaluateData.getApplicationName,
          evaluateData.getUserIdentify,
          evaluateData.getLoginSequence,
          evaluateData.getEvaluateTime,
          evaluateData.getCityName,
          evaluateData.getGeoPoint
        )
        //Build and run the evaluation chain
        val evaluateChain = new EvaluateChain(evaluates)
        evaluateChain.doChain(evaluateData, historyData, evaluateReport)
        //Store the report as JSON, keyed by login sequence (suggestion: use a date:loginSequence key and purge entries older than a week before each put)
        evaluateReportState.put(evaluateData.getLoginSequence, mapper.writeValueAsString(evaluateReport))
        return evaluateReport
      }
      emptyReport
    }
  }
}
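A side note on the sink: KeyedSerializationSchema is deprecated as of Flink 1.10. Below is a minimal sketch of the same topic-routing logic on the replacement KafkaSerializationSchema interface; the class name is mine and the snippet is untested against the evaluation model, so treat it as a reference rather than the post's implementation.

import java.lang.{Long => JLong}
import com.baizhi.enties.EvaluateReport
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema

class ReportKafkaSerializationSchema extends KafkaSerializationSchema[EvaluateReport] {
  override def serialize(t: EvaluateReport, timestamp: JLong): ProducerRecord[Array[Byte], Array[Byte]] = {
    //Same routing rule as getTargetTopic above
    val topic =
      if (t.getApplicationName.contains("QQ")) "topic_qq"
      else if (t.getApplicationName.contains("wx")) "topic_wx"
      else "topic_all"
    new ProducerRecord[Array[Byte], Array[Byte]](topic, t.toString.getBytes("UTF-8"))
  }
}

It plugs into the producer the same way: new FlinkKafkaProducer[EvaluateReport]("default_topic", new ReportKafkaSerializationSchema, props, Semantic.AT_LEAST_ONCE).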
Package the project and submit the fat jar through the Flink Web UI.
Note:
If the evaluation-model jar cannot be resolved, run install on the evaluation-model project;
this puts its jar into the local Maven repository so this project can reference it.
If the model project's code changes, re-run install so the jar in the repository stays current.
Appendix: querying the state
The stream job above maintains three pieces of state:
1. the history data
2. the number of logins per day
3. the evaluation reports
A business system can grade the risk of a user login by querying the evaluation report; this is the asynchronous approach.
Synchronous approach: the business system computes the login risk itself, which requires the compute layer to expose a "history data" query interface.
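A setup reminder that is easy to miss (my note, not part of the original walkthrough): queryable state must be enabled on the cluster before these clients can connect. For Flink 1.10 this means setting queryable-state.enable: true in flink-conf.yaml and copying the flink-queryable-state-runtime jar from the distribution's opt/ directory into lib/; the client programs below also need the flink-queryable-state-client-java artifact on their classpath.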
Note: the job ID to pass to the client is the ID of the running job, as shown in the Flink Web UI.
Querying the history data
import org.apache.flink.api.common.{ExecutionConfig, JobID}
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.queryablestate.client.QueryableStateClient
import org.apache.flink.api.scala._
object History {
  def main(args: Array[String]): Unit = {
    /**
     * Query the history state.
     */
    //Connect to the queryable-state proxy server
    val client = new QueryableStateClient("SparkTwo", 9069)
    val jobID = JobID.fromHexString("d7dea1883d218879af44bfe1e30f6194")
    val queryName = "queryHistoryData" //the queryable state name
    val queryKey = "张三:QQ" //the key to look up
    //Create a state descriptor
    val rsd = new ValueStateDescriptor[String]("queryHistoryData", TypeInformation.of(classOf[String]).createSerializer(new ExecutionConfig))
    //Issue the query
    val resultFuture = client.getKvState(jobID, queryName, queryKey, createTypeInformation[String], rsd)
    //Block until the result arrives
    val state: ValueState[String] = resultFuture.get()
    //Print it
    println("result: " + state.value())
    //Shut down the client
    client.shutdownAndWait()
  }
}
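Since getKvState returns a java.util.concurrent.CompletableFuture, the blocking get() above can be swapped for a callback when the caller should not stall. A minimal sketch (Scala 2.11 needs the explicit Consumer, and the client must stay alive until the callback fires before shutdownAndWait() is called):

import java.util.function.Consumer
import org.apache.flink.api.common.state.ValueState

resultFuture.thenAccept(new Consumer[ValueState[String]] {
  override def accept(state: ValueState[String]): Unit =
    println("async result: " + state.value())
})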
Querying the evaluation state
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.{ExecutionConfig, JobID}
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.queryablestate.client.QueryableStateClient
object ReadReportQueryableState {
  def main(args: Array[String]): Unit = {
    //Create a queryable-state client
    val client = new QueryableStateClient("pro1", 9069)
    //The job ID
    val id = JobID.fromHexString("d9751ed5b01767a4c83025351b58527e")
    //The queryable state name
    val name = "queryEvaluateReport"
    //Type information for the key
    val ty: TypeInformation[String] = createTypeInformation[String]
    //Create a state descriptor (the descriptor name is only a placeholder on the client side)
    val des = new MapStateDescriptor[String, String]("xxxxx",
      createTypeInformation[String].createSerializer(new ExecutionConfig),
      createTypeInformation[String].createSerializer(new ExecutionConfig))
    //Issue the query
    val result = client.getKvState(id, name, "张三:QQ", ty, des)
    //Block until the result arrives
    val state: MapState[String, String] = result.get()
    //Print the entries
    println(state.entries())
    //Shut down the client
    client.shutdownAndWait()
  }
}
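println(state.entries()) prints whatever the underlying collection's toString yields; for controlled output, the entries can be iterated explicitly, e.g. with the Java-to-Scala converters:

import scala.collection.JavaConverters._

for (e <- state.entries().asScala) {
  println(e.getKey + " -> " + e.getValue) //loginSequence -> evaluation report JSON
}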
Sending test data with the Kafka console producer
[root@SparkTwo kafka_2.11-2.2.0]# ./bin/kafka-console-producer.sh --broker-list SparkTwo:9092 --topic userlogin
>INFO 2020-04-03 10:12:01 QQ SUCCESS [张三] 6ebaf4ac780f40f486359f3ea6934620 "103158" jinan "116.4,39.5" [1250,14000,2000] "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
>INFO 2020-04-03 10:12:01 QQ SUCCESS [张三] 6ebaf4ac780f40f486359f3ea6934620 "103158" jinan "116.4,39.5" [1250,14000,2000] "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
>INFO 2020-04-03 10:12:00 WX SUCCESS [张三] 6ebaf4ac780f40f486359f3ea6934623 "456123" Zhengzhou "114.4,34.5" [1400,16000,2100] "Mozilla/8.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
>INFO 2020-04-02 10:50:00 QQ SUCCESS [张三] 6ebaf4ac780f40f486359f3ea6934622 "123456" Beijing "116.4,39.5" [1300,17000,2200] "Mozilla/7.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
>INFO 2020-04-02 10:52:00 QQ EVALUATE [张三] 6ebaf4ac780f40f486359f3ea6934622 "123456" Beijing "116.4,39.5" [1300,17000,2200] "Mozilla/7.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
>INFO 2020-04-03 10:54:01 WX EVALUATE [张三] 6ebaf4ac780f40f486359f3ea6934620 "123458" Zhengzhou "114.4,34.5" [1250,14000,2000] "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"