A small, concrete example of integrating Flink with MySQL.
In a real-time stream you usually need to judge, filter, or classify some of the data. Once a streaming job is running, you normally do not restart it unless the logic changes; but when those filtering and classification conditions change frequently, a very common solution is a configuration dimension table.
So we configure a dimension table in MySQL, and Flink periodically reads the dimension table and joins it with the real-time stream. This makes the job much more flexible.
Below is a fairly simple way to do this.
Connect to MySQL by implementing RichSourceFunction.
package source

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}

import scala.collection.mutable

/** Periodically reads a MySQL dimension table and emits the result as a mutable.Set[String]. */
class TimingWithMysql(sql: String) extends RichSourceFunction[mutable.Set[String]] {

  private var isRunning = true
  private val jdbcUrl = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2B8&useSSL=false"
  private val username = "root"
  private val password = "123456"
  private val driverName = "com.mysql.jdbc.Driver"
  private var conn: Connection = _
  private var ps: PreparedStatement = _

  private val pubcodeSet: mutable.Set[String] = mutable.Set()

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    Class.forName(driverName)                                    // load the MySQL JDBC driver
    conn = DriverManager.getConnection(jdbcUrl, username, password)
    ps = conn.prepareStatement(sql)                              // the SQL statement to run on each poll
  }

  override def run(sourceContext: SourceFunction.SourceContext[mutable.Set[String]]): Unit = {
    while (isRunning) {
      val resultSet: ResultSet = ps.executeQuery()
      while (resultSet.next()) {
        pubcodeSet.add(resultSet.getString("pub_code"))
      }
      resultSet.close()
      // Emit the whole set once per poll; note that the set only ever grows,
      // rows removed from MySQL are not dropped here.
      sourceContext.collect(pubcodeSet)
      Thread.sleep(60 * 1000)                                    // poll the dimension table once a minute
    }
  }

  override def cancel(): Unit = { isRunning = false }

  override def close(): Unit = {
    super.close()
    if (ps != null) ps.close()
    if (conn != null) conn.close()
  }
}
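The main method below references a TimeLinessInput type that is defined elsewhere in the project. For readability, here is a minimal sketch of what it might look like; the field names are taken from the code below, while the field types are assumptions:

// Assumed shape of the input record; field names come from the code below,
// the field types are guesses and may differ from the real project.
case class TimeLinessInput(pub_code: String, crawler_id: String, is_comment: String,
                           job_id: String, cts_diff: Long, push_time_diff: Long,
                           create_time_diff: Long, topic_str: String)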
Main method: broadcast the MySQL data and join it with the real-time stream.
package articletimeliness

import java.text.SimpleDateFormat
import java.util
import java.util.Properties

import com.alibaba.fastjson.JSON
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api._
import org.apache.flink.table.api.bridge.scala._
import org.apache.flink.types.Row
import org.apache.flink.util.Collector
import source.TimingWithMysql

import scala.collection.JavaConverters._
import scala.collection.mutable

object TimeLiness {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)
    env.setParallelism(1)

    // checkpoint configuration
    env.enableCheckpointing(1000L)
    val config: CheckpointConfig = env.getCheckpointConfig
    config.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    config.setCheckpointInterval(1000)
    config.setCheckpointTimeout(60000)
    config.setMinPauseBetweenCheckpoints(500)
    config.setMaxConcurrentCheckpoints(1)

    // periodically load the MySQL dimension table and broadcast it
    val CONFIG_KEYWORDS = new MapStateDescriptor[String, String](
      "config-keywords",
      BasicTypeInfo.STRING_TYPE_INFO,
      BasicTypeInfo.STRING_TYPE_INFO)
    // the selected column must match the one read in TimingWithMysql ("pub_code")
    val sql = "select pub_code from test where flag = '1'"
    // env.addSource(new TimingWithMysql(sql)).print()
    // BroadcastStream[mutable.Set[String]] carrying the dimension data
    val broadcastStream: BroadcastStream[mutable.Set[String]] = env
      .addSource(new TimingWithMysql(sql))
      .setParallelism(1)
      .broadcast(CONFIG_KEYWORDS)

    // Kafka source
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")
    properties.setProperty("group.id", "AllTimeLiness")
    properties.setProperty("zookeeper.connect", "localhost:2181")
    properties.setProperty("auto.offset.reset", "latest")
    // properties.setProperty("auto.offset.reset", "earliest")
    val topics = new util.LinkedList[String]
    topics.add("topic_1")
    topics.add("topic_2")
    val kafkaStream: DataStream[String] =
      env.addSource(new FlinkKafkaConsumer[String](topics, new SimpleStringSchema(), properties))

    val jsonStream = kafkaStream.map(data => {
      // ..... parse pub_code, crawler_id, is_comment, job_id, cts_diff,
      // push_time_diff, create_time_diff and topic_str from the JSON in `data` .....
      TimeLinessInput(pub_code, crawler_id, is_comment, job_id, cts_diff, push_time_diff, create_time_diff, topic_str)
    })

    // join the real-time stream with the broadcast dimension data
    val joinStream: DataStream[TimeLinessInput] = jsonStream
      .connect(broadcastStream)
      .process(new BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput] {

        var keywords: mutable.Set[String] = mutable.Set()

        override def processElement(in1: TimeLinessInput,
                                    readOnlyContext: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#ReadOnlyContext,
                                    collector: Collector[TimeLinessInput]): Unit = {
          // keep the pub_code only if it is present in the dimension table
          val pubcode = if (keywords.contains(in1.pub_code)) in1.pub_code else "other_pubcode"
          collector.collect(TimeLinessInput(pubcode, in1.crawler_id, in1.is_comment, in1.job_id,
            in1.cts_diff, in1.push_time_diff, in1.create_time_diff, in1.topic_str))
        }

        override def processBroadcastElement(in2: mutable.Set[String],
                                             context: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#Context,
                                             collector: Collector[TimeLinessInput]): Unit = {
          keywords = in2   // refresh the local copy of the dimension data
        }
      })

    joinStream.print()
    env.execute("Time Liness job")
  }
}
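The anonymous function above keeps the keyword set in a plain var, which works but is not checkpointed, and the CONFIG_KEYWORDS descriptor is only used to create the broadcast stream. As a minimal sketch (not the original code), the same join can be done through Flink's broadcast state, assuming the same CONFIG_KEYWORDS descriptor and TimeLinessInput case class are in scope:

// Sketch only: same logic as above, but the dimension data is kept in
// Flink's broadcast state instead of a plain field.
new BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput] {

  override def processElement(in1: TimeLinessInput,
                              ctx: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#ReadOnlyContext,
                              out: Collector[TimeLinessInput]): Unit = {
    val state = ctx.getBroadcastState(CONFIG_KEYWORDS)
    // keep the pub_code only if it is present in the broadcast dimension data
    val pubcode = if (state.contains(in1.pub_code)) in1.pub_code else "other_pubcode"
    out.collect(in1.copy(pub_code = pubcode))
  }

  override def processBroadcastElement(in2: mutable.Set[String],
                                       ctx: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#Context,
                                       out: Collector[TimeLinessInput]): Unit = {
    val state = ctx.getBroadcastState(CONFIG_KEYWORDS)
    state.clear()                               // drop the previous snapshot of the dimension table
    in2.foreach(code => state.put(code, ""))    // only the key matters here
  }
}

With broadcast state the dimension data is included in checkpoints and restored automatically after a failure, at the cost of a slightly more verbose pair of callbacks.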
