5. Flink Source (MySQL)

This section walks through a small example of integrating Flink with MySQL.

In a real-time stream you often need to evaluate, filter, or classify records. Once a streaming job is running, it is rarely restarted as long as the logic stays the same; but when the filtering and classification conditions change frequently, a very common solution comes to mind: a configuration dimension table.

Store the dimension table in MySQL and have Flink read it on a schedule and join it with the real-time stream; this makes the job far more flexible.
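For concreteness, the dimension table could be seeded like this. This is a hypothetical setup sketch: the table name test, the name column (holding pub_code values), and the flag toggle column are assumptions chosen to match the query used later in the job.

import java.sql.DriverManager

// Hypothetical one-off seeding job for the dimension table; the schema is an
// assumption matching the query "select name from test where flag = '1'" below.
object SeedDimTable {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2B8&useSSL=false",
      "root", "123456")
    val st = conn.createStatement()
    st.executeUpdate("create table if not exists test (name varchar(64), flag varchar(1))")
    // flag = '1' marks a pub_code as active; flip it to change the whitelist at runtime
    st.executeUpdate("insert into test (name, flag) values ('pub_001', '1'), ('pub_002', '0')")
    st.close()
    conn.close()
  }
}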

The following is a fairly simple way to implement this.
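First the build. Here is a build.sbt sketch for reference; the versions are assumptions (Flink 1.11.x with the Blink planner, and an old mysql-connector-java matching the com.mysql.jdbc.Driver class used below), so match them to your cluster.

libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-streaming-scala"        % "1.11.2" % Provided,
  "org.apache.flink" %% "flink-table-api-scala-bridge" % "1.11.2" % Provided,
  "org.apache.flink" %% "flink-table-planner-blink"    % "1.11.2" % Provided,
  "org.apache.flink" %% "flink-connector-kafka"        % "1.11.2",
  "mysql"            %  "mysql-connector-java"         % "5.1.49",
  "com.alibaba"      %  "fastjson"                     % "1.2.75"
)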

Connect to MySQL by implementing a RichSourceFunction:

package source

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}

import scala.collection.mutable

class TimingWithMysql(sql: String) extends RichSourceFunction[mutable.Set[String]] {

  private var isRunning = true

  private val jdbcUrl = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2B8&useSSL=false"
  private val username = "root"
  private val password = "123456"
  private val driverName = "com.mysql.jdbc.Driver" // use com.mysql.cj.jdbc.Driver with connector 8.x
  private var conn: Connection = _
  private var ps: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    Class.forName(driverName) // load the JDBC driver
    conn = DriverManager.getConnection(jdbcUrl, username, password)
    ps = conn.prepareStatement(sql) // the SQL statement to run on every refresh
  }

  override def run(sourceContext: SourceFunction.SourceContext[mutable.Set[String]]): Unit = {
    while (isRunning) {
      // rebuild the set on every poll so rows deleted in MySQL also drop out
      val pubcodeSet: mutable.Set[String] = mutable.Set()
      val resultSet = ps.executeQuery()
      while (resultSet.next()) {
        pubcodeSet.add(resultSet.getString(1)) // read the single selected column
      }
      resultSet.close()
      sourceContext.collect(pubcodeSet) // emit the complete set as one stream element
      Thread.sleep(60 * 1000) // refresh once per minute
    }
  }

  override def cancel(): Unit = { isRunning = false }

  override def close(): Unit = {
    super.close()
    if (ps != null) ps.close()
    if (conn != null) conn.close()
  }

}
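A quick way to sanity-check the source on its own, before wiring it into the full job. This is a sketch; it assumes the dimension table above exists and a local Flink environment.

import org.apache.flink.streaming.api.scala._
import source.TimingWithMysql

object SourceSmokeTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1) // one polling instance is enough for a dimension table
    env.addSource(new TimingWithMysql("select name from test where flag = '1'"))
      .print() // prints the full pub_code set once per minute
    env.execute("dim-table smoke test")
  }
}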

Main method: broadcast the MySQL data and join it with the real-time stream:
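One note first: the TimeLinessInput case class is not shown in the original; from the way its fields are used below, its definition is presumably along these lines (field types here are guesses):

// Hypothetical reconstruction -- the original article does not show this class.
case class TimeLinessInput(pub_code: String, crawler_id: String, is_comment: String,
                           job_id: String, cts_diff: Long, push_time_diff: Long,
                           create_time_diff: Long, topic_str: String)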

package articletimeliness

import java.text.SimpleDateFormat
import java.util
import java.util.Properties

import com.alibaba.fastjson.JSON
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api._
import org.apache.flink.table.api.bridge.scala._

import scala.collection.JavaConverters._

import org.apache.flink.types.Row
import org.apache.flink.util.Collector
import source.TimingWithMysql

import scala.collection.mutable

object TimeLiness {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings) // not used in the code shown here
    env.setParallelism(1)

    // checkpoint every second; retain externalized checkpoints when the job is cancelled
    env.enableCheckpointing(1000L)

    val config: CheckpointConfig = env.getCheckpointConfig
    config.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    config.setCheckpointTimeout(60000)
    config.setMinPauseBetweenCheckpoints(500)
    config.setMaxConcurrentCheckpoints(1)


    // Periodically load the MySQL dimension table and broadcast it.
    // broadcast() requires a MapStateDescriptor; this simple version keeps the
    // set in a field of the process function instead of reading broadcast state.
    val CONFIG_KEYWORDS = new MapStateDescriptor[String, String](
      "config-keywords",
      BasicTypeInfo.STRING_TYPE_INFO,
      BasicTypeInfo.STRING_TYPE_INFO)

    val sql = "select name from test where flag = '1'"

    // For a quick check of the source alone: env.addSource(new TimingWithMysql(sql)).print()


    // Broadcast the dimension data: BroadcastStream[mutable.Set[String]]
    val broadcastStream: BroadcastStream[mutable.Set[String]] = env
      .addSource(new TimingWithMysql(sql))
      .setParallelism(1)
      .broadcast(CONFIG_KEYWORDS)


    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")
    properties.setProperty("group.id", "AllTimeLiness")
    properties.setProperty("zookeeper.connect", "localhost:2181") // ignored by modern Kafka consumers
    properties.setProperty("auto.offset.reset", "latest")
    // properties.setProperty("auto.offset.reset", "earliest")

    val topics = new util.LinkedList[String]
    topics.add("topic_1")
    topics.add("topic_2")
   

    // SimpleStringSchema deserializes each Kafka record to a String
    val kafkaStream: DataStream[String] = env.addSource(
      new FlinkKafkaConsumer[String](topics, new SimpleStringSchema(), properties))
   

    val jsonStream = kafkaStream.map(data => {
      // ..... (JSON parsing elided in the original: extract the fields below
      // from `data`, e.g. with fastjson's JSON.parseObject)
      TimeLinessInput(pub_code, crawler_id, is_comment, job_id, cts_diff, push_time_diff, create_time_diff, topic_str)
    })
   

    // Connect the JSON stream with the broadcast stream: every parallel subtask
    // receives each broadcast element, so `keywords` always holds the latest whitelist.
    val joinStream: DataStream[TimeLinessInput] = jsonStream
      .connect(broadcastStream)
      .process(new BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput] {

        // latest dimension-table snapshot; a plain field, so it is not checkpointed
        var keywords: mutable.Set[String] = mutable.Set()
        var pubcode: String = null

        override def processElement(in1: TimeLinessInput,
                                    readOnlyContext: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String],
                                      TimeLinessInput]#ReadOnlyContext, collector: Collector[TimeLinessInput]): Unit = {
          // keep known pub_codes, fold everything else into a catch-all bucket
          if (keywords.contains(in1.pub_code)) {
            pubcode = in1.pub_code
          } else {
            pubcode = "other_pubcode"
          }
          collector.collect(TimeLinessInput(pubcode, in1.crawler_id, in1.is_comment, in1.job_id, in1.cts_diff, in1.push_time_diff, in1.create_time_diff, in1.topic_str))
        }

        override def processBroadcastElement(in2: mutable.Set[String],
                                             context: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#Context,
                                             collector: Collector[TimeLinessInput]): Unit = {
          keywords = in2 // swap in the new snapshot on every refresh from MySQL
        }
      })


    joinStream.print()
    env.execute("Time Liness job")
  }

}
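A closing note on the design: the process function above keeps the broadcast set in a plain field, which is simple but means the whitelist is lost on restart until the next MySQL refresh arrives. A variant that writes the codes into Flink's broadcast state through the already-declared CONFIG_KEYWORDS descriptor would look roughly like this (a sketch, not the article's code; it also assumes TimeLinessInput is a case class so copy() is available):

new BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput] {

  override def processElement(in1: TimeLinessInput,
                              ctx: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#ReadOnlyContext,
                              out: Collector[TimeLinessInput]): Unit = {
    // read-only view of the broadcast state on the element side
    val state = ctx.getBroadcastState(CONFIG_KEYWORDS)
    val code = if (state.contains(in1.pub_code)) in1.pub_code else "other_pubcode"
    out.collect(in1.copy(pub_code = code))
  }

  override def processBroadcastElement(in2: mutable.Set[String],
                                       ctx: BroadcastProcessFunction[TimeLinessInput, mutable.Set[String], TimeLinessInput]#Context,
                                       out: Collector[TimeLinessInput]): Unit = {
    val state = ctx.getBroadcastState(CONFIG_KEYWORDS)
    state.clear() // replace the whole whitelist on each refresh
    in2.foreach(code => state.put(code, ""))
  }
}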
