Flink集成MySQL：数据提取与存储

Oz_Mood

已于 2024-03-25 19:13:41 修改

阅读量366

点赞数 4

文章标签： flink mysql 大数据

于 2024-03-25 19:10:10 首次发布

本文链接：https://blog.csdn.net/Oz_Mood/article/details/137022755

版权

本文介绍了如何使用ApacheFlink处理流式数据，从MySQL数据库提取数据并将其与Kafka数据合并，通过双流JOIN策略整合后存入ClickHouse数据库，展示了数据处理的关键步骤和代码实例。

摘要由CSDN通过智能技术生成

分为两个部分主要是从mysql的数据库中取出数据和将数据插入到mysql数据库中。

从MySQL数据库中提取数据并与Kafka数据合并

1 创建一个类继承RichMapFunction[IN,OUT] (这里的in就是进来的数据类型，Out是处理完的数据类型，这里最好创建一个样例类，用来包装最后想要的数据)

class MySqlFunction extends RichMapFunction[String, Order_Event]

2 定义mysql的连接并将想要查询的数据放入到一个HashMap中

	//定义SQL的连接、预编译器,给定初始值占位符    
	private var connection: Connection = _
	// 用户信息缓存
    private val cache = new util.HashMap[String, (String, String, Double)]()

    override def open(parameters: Configuration): Unit = {
      // 定义mysql数据库连接url和驱动程序及账号、密码
      val url = "jdbc:mysql://192.168.45.13:3306/ds_db01?useSSL=false"
      val driver = "com.mysql.jdbc.Driver"
      val username = "root"
      val password = "123456"

      // 加载Mysql JDBC驱动
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)

      // 执行SQL语句
      val sql = "select customer_id,customer_name,mobile_phone,customer_money from customer_inf"
      // 执行预编译 SQL 语句
      val stmt = connection.prepareStatement(sql)
	  // 执行查询
      val rs = stmt.executeQuery()

      // 把数据放入到cache中
      while (rs.next()) {
        val id = rs.getString("customer_id")
        val consignee = rs.getString("customer_name")
        val consignee_tel = rs.getString("mobile_phone")
        val final_total_amount = rs.getDouble("customer_money")
        cache.put(id, (consignee, consignee_tel, final_total_amount))
      }
      connection.close()
    }

3 将kafka数据流的数据和mysql中查询到的数据包装成最后想要输出的样例类

    override def map(in: Order_Master_Data): Order_Event = {
      val data = in.getData
      val feight_fee = data.shipping_money
      val id = data.customer_id.toString
      Order_Event(id, cache.get(id)._1, cache.get(id)._2, cache.get(id)._3, feight_fee)
    }
  }

将数据存入MySQL数据库

1 创建一个类继承RichSinkFunction[IN] (这里放的是进入的数据类型)

 class MyJDBCSinkProcessFunction extends RichSinkFunction[Order_Event]

2 定义mysql的数据连接以及定义预编译sql

     private var connection: Connection = _
     private var stmt: PreparedStatement = _
 
     override def open(parameters: Configuration): Unit = {
       val url = "jdbc:mysql://192.168.45.13:3306/ds_db01?useSSL=false"
       val driver = "com.mysql.jdbc.Driver"
       val username = "root"
       val password = "123456"
 
       Class.forName(driver)
       connection = DriverManager.getConnection(url, username, password)
 
 
       val sql = "insert into order_info(id,consignee,consignee_tel,final_total_amount,feight_fee) values(?,?,?,?,?)"
       stmt = connection.prepareStatement(sql)
     }

3 设置预编译语句中的数据，最后执行update SQL

     override def invoke(value: Order_Event, context: SinkFunction.Context): Unit = {
       stmt.setString(1, value.id)
       stmt.setString(2, value.consignee)
       stmt.setString(3, value.consignee_tel)
       stmt.setDouble(4, value.final_total_amount)
       stmt.setDouble(5, value.feight_fee)
 
       // 执行update SQL
       stmt.executeUpdate()
     }
4 关闭连接

 override def close(): Unit = {
   stmt.close()
   connection.close()
 }

我们这里举个题目做个例子

采用双流JOIN的方式（本系统稳定，无需担心数据迟到与丢失的问题,建议使用滚动窗口），结合订单信息和订单详细信息，拼接成如下表所示格式，其中包含订单id、订单总金额、商品数，将数据存入ClickHouse数据库shtd_result的orderpostiveaggr表中（表结构如下），然后在Linux的ClickHouse命令行中根据sn降序排序，查询出前5条。

完整的代码：

object Flink9 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "192.168.45.13:9092,192.168.45.14:9092,192.168.45.15:9092")
    properties.setProperty("group.id", "pt-test9")

    val kafkaConsumer = new FlinkKafkaConsumer[String]("order", new SimpleStringSchema(), properties)

    val order_Stream = env.addSource(kafkaConsumer)
      .filter(_.length > 150)
      .map(data => {
        val str = data.replaceAll("NULL", "0")
        val gson = new Gson()
        gson.fromJson(str, classOf[Order_Detail_Data])
      })
      .filter(data => {
        val table_name = data.getTable
        table_name.equals("order_detail")
      })
      .keyBy(r => 1)
      .process(new KeyedProcessFunction[Int, Order_Detail_Data, util.HashMap[String, (Double, Long)]] {
        lazy val totalMap: MapState[String, (Double, Long)] = getRuntimeContext.getMapState(new MapStateDescriptor[String, (Double, Long)]("totalMap", classOf[String], classOf[(Double, Long)]))

        override def processElement(i: Order_Detail_Data, context: KeyedProcessFunction[Int, Order_Detail_Data, util.HashMap[String, (Double, Long)]]#Context, collector: Collector[util.HashMap[String, (Double, Long)]]): Unit = {
          val map = new util.HashMap[String, (Double, Long)]()
          totalMap.keys().forEach(data => map.put(data, (totalMap.get(data)._1, totalMap.get(data)._2)))

          val data = i.getData
          val order_sn = data.order_sn
          if (map.containsKey(order_sn)) {
            map.put(order_sn, (map.get(order_sn)._1 + data.product_price, map.get(order_sn)._2 + data.product_cnt))
          } else {
            map.put(order_sn, (data.product_price, data.product_cnt))
          }

          collector.collect(map)

        }
      })

    order_Stream.addSink(new RichSinkFunction[util.HashMap[String, (Double, Long)]] {
      //定义SQL的连接、预编译器,给定初始值占位符
      var connection: Connection = _
      var stmt: PreparedStatement = _

      override def open(parameters: Configuration): Unit = {
        // 定义连接url和驱动程序及账号、密码
        val url = "jdbc:clickhouse://192.168.45.13/shtd_result?useSSL=false"
        val driver = "ru.yandex.clickhouse.ClickHouseDriver"
        val username = "default"
        val password = "123456"


        Class.forName(driver)
        connection = DriverManager.getConnection(url, username, password)

        val sql = "insert into orderpostiveaggr(sn,orderprice,orderdetailcount) values(?,?,?)"
        stmt = connection.prepareStatement(sql)
      }

      override def invoke(value: util.HashMap[String, (Double, Long)], context: SinkFunction.Context): Unit = {
        value.forEach { case (key, (double, long)) => {
          stmt.setString(1, key)
          stmt.setDouble(2, double)
          stmt.setLong(3, long)
        }
          stmt.executeUpdate()
        }
      }
    })

    env.execute()
  }
}