flink实时计算kafka数据,存储到Mysql(Scala版本)

Flink实时计算kafka数据,存储到Mysql(Scala版本)

本文是在基于搭建好集群执行
需要配置如下
1、完整CDH集群
2、Mysql开启binlog日志,如没开启参考此链接:Mysql开启binlog日志
3、启动Maxwell,如没有安装参考此链接:Maxwell安装及配置

项目说明

  1. kafka实时接收Maxwell监控到的日志
  2. 使用flink实时消费kakfa数据,处理json日志并拿到想要字段进行滚动窗口计算
  3. 把计算出来的数据存入Mysql数据库(也可以换成其他数据库,比如Tidb,具体看需求)

部分kafka数据样例(插入,更新,删除三条样例数据)

{"database":"test","table":"person","type":"insert","ts":1638343178,"xid":10873875,"commit":true,"data":{"id":69,"name":"sd","age":null}}
{"database":"test","table":"person","type":"delete","ts":1638341838,"xid":10838691,"commit":true,"data":{"id":32,"name":"rr","age":3}}
{"database":"test","table":"person","type":"update","ts":1638343457,"xid":10877187,"commit":true,"data":{"id":66,"name":"de","age":null},"old":{"name":"的"}}

项目需求

  1. flink实时统计当天新增日志中有(insert)操作人数
  2. 写入到 person_count 表
  3. 包含字段(数据库名称,表名,执行操作,日期,人数)

执行成功图

在这里插入图片描述

架构图

在这里插入图片描述

代码案例一

package com.jt.flink.Count

import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala.DataStream
import org.apache.kafka.clients.consumer.ConsumerConfig
import com.alibaba.fastjson.JSON
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
import java.util.Properties
import com.jt.util.ConfigUtil
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer


object FlinkCountKafkaDemo1 {
  def main(args: Array[String]): Unit = {
    //创建流处理环境
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //之后开启CheckPointing可以开启重启策略
    environment.enableCheckpointing(5000)
    //设置重启策略为,出现三次异常重启3次,隔10秒一次
    environment.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000))
    //系统异常退出或者人为退出,不删除checkpoint数据
    environment.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    //设置Checkpoint模式(与Kafka整合,要设置Checkpoint模式为Exactly_Once)
    environment.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    environment.setParallelism(1)
//    environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    //配置kafka信息
    val properties = new Properties()
    properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "srv191:9092,srv192:9092,srv193:9092")
    properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "console-consumer-3145")
    properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    //如果没有记录偏移量,第一次从最开始消费:earliest 从最新的位置开始消费:latest
    properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest")
    //kafka的消费者,不自动提交偏移量
    properties.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false")

    // 获取topic数据
    val valueTopic: DataStream[String] = environment.addSource(new FlinkKafkaConsumer[String]("kafka", new SimpleStringSchema(), properties))
    //查看获取到得topic数据
//    valueTopic.map(t => t.toString).print()

    val dataStream: DataStreamSink[((String, String, String, String), Int)] = valueTopic.map(line => {
      val database = JSON.parseObject(line).getString("database").toString.replaceAll("\\\"", "")
      val table = JSON.parseObject(line).getString("table").toString.replaceAll("\\\"", "")
      val Type = JSON.parseObject(line).getString("type").toString.replaceAll("\\\"", "")
      val dataTs = JSON.parseObject(line).getString("ts").toLong
      val time = new SimpleDateFormat("yyyy-MM-dd").format(dataTs * 1000L)
      (database, table, Type, time)
    })
      .keyBy(k => k._3)
      /**
       * 过滤Type!=insert的数据
       * 并且把元组内容当做Key,1作为value
       */
      .filter(_._3 == "insert")
      .map(a => (a, 1))
      /**
       * 分组算子:
       *      1、0或者1代表的是下标,就是上面DataStreamSink返回的二元组下标,
       *      2、0代表上面返回元组中的数据
       *      3、1代表的是元组数据中出现的次数
       */
      .keyBy(0)
      //设置5秒滚动窗口,每隔5秒计算一次窗口内数据
      .timeWindow(Time.seconds(5))
      .sum(1)//聚合累加算子
      .addSink(new MysqlSink)
//      .print()

    //提交flink任务job
    environment.execute()
  }
  class MysqlSink extends RichSinkFunction[((String, String, String,String), Int)] {

    //获取配置文件
    val driver = ConfigUtil.getString("mysql-driver")
    val url = ConfigUtil.getString("mysql-url")
    val user = ConfigUtil.getString("mysql-user")
    val password = ConfigUtil.getString("mysql-password")

    private var connection: Connection = null
    private var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      //1:加载驱动
      Class.forName(driver)
      //2:创建连接
      connection = DriverManager.getConnection(url, user, password)
      //3:获得执行语句
      val sql = "insert into person_count(db,TableName,TypeTable,timeStamp,person_count) values(?,?,?,?,?);"
      ps = connection.prepareStatement(sql)
    }

    override def invoke(value: ((String, String, String,String), Int)): Unit = {
      try {
        //4.组装数据,执行插入操作
        ps.setString(1, value._1._1)
        ps.setString(2, value._1._2)
        ps.setString(3, value._1._3)
        ps.setString(4, value._1._4)
        ps.setInt(5, value._2)
        ps.executeUpdate()
      } catch {
        case e: Exception => println(e.getMessage)
      }
    }

    //关闭连接操作
    override def close(): Unit = {
      if (connection != null) {
        connection.close()
      }
      if (ps != null) {
        ps.close()
      }
    }
  }
}

代码案例二

package com.jt.flink.Count

import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala.DataStream
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import java.util.Properties
import com.jt.util.ConfigUtil
import org.apache.flink.streaming.util.serialization.JSONKeyValueDeserializationSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object FlinkCountKafkaDemo2 {
  def main(args: Array[String]): Unit = {
    //创建flink流计算执行环境
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //开启CheckPointing可以开启重启策略
    environment.enableCheckpointing(5000)
    //设置重启策略为,出现三次异常重启3次,隔10秒一次
    environment.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000))
    //系统异常退出或者人为退出,不删除checkpoint数据
    environment.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    //设置Checkpoint模式(与Kafka整合,要设置Checkpoint模式为Exactly_Once)端到端的一致性
    environment.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    //测试环境设置并行度为1,生产环境可以调大也可以不设置
    environment.setParallelism(1)
    //创建事件创建时间
    //    environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    //配置kafka信息
    val properties = new Properties()
    //配置kafkaIP地址以及kafka端口号
    properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "srv191:9092,srv192:9092,srv193:9092")
    //设置kakfa消费者组名称
    properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "console-consumer-3145")
    //开启序列化配置
    properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    //开启反序列化配置
    properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
    //如果没有记录偏移量,第一次从最开始消费:earliest 最新消息进行消费:latest
    properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
    //kafka的消费者,不自动提交偏移量
    properties.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false")
    //添加数据来源为kafka
    val valueTopic: DataStream[ObjectNode] = environment.addSource(new FlinkKafkaConsumer[ObjectNode]("kafka", new JSONKeyValueDeserializationSchema(false), properties))
    /**
     * 打印输出kafka接收到的json数据
     * {"key":{"database":"test","table":"person","pk.id":170},
     * "value":{"database":"test","table":"person","type":"insert","ts":1638855746,"xid":7811455,"commit":true,
     * "data":{"id":170,"name":"1","age":333}}}
     */
//    valueTopic.map(t => t.toString).print()
    val dataStream: DataStreamSink[((String, String, String, String), Int)] =
      valueTopic.map(t =>{
        /**
         * 处理获取到json数据以及想要拿到字段
         * 注意:本次使用的方法只支持K,V解析获取字段
         * 如想直接根据Key获取可根据在上方添加kafka数据方法里面直接编写如下:
         * environment.addSource(new FlinkKafkaConsumer[ObjectNode]("kafka", new SimpleStringSchema(), properties))
         * val database = JSON.parseObject(line).getString("database")
         * 需要添加alibaba>fastjson依赖
         */
        val node = t.get("value")
        val database = node.get("database").toString.replaceAll("\\\"", "")
        val table = node.get("table").toString.replaceAll("\\\"", "")
        val Type = node.get("type").toString.replaceAll("\\\"", "")
        val ts = node.get("ts").asLong()
        val time = new SimpleDateFormat("yyyy-MM-dd").format(ts * 1000L)
        (database, table, Type, time)
      })
        //根据Type进行分组把相同的Key放进同一个里面
        .keyBy(k => k._3)
        //过滤除insert以外所有的类型操作
        .filter(_._3 == "insert")
        //计数
        .map(a => (a, 1))
        // 分组算子  0或1代表的是下标,就是上面DataStreamSink返回的二元组下标
        // 0代表上面返回元组中的数据
        // 1代表的是元组数据中出现的次数
        .keyBy(0)
        //定义一个5秒钟滚动窗口,每5秒钟统计一次
        .timeWindow(Time.seconds(5))
        //聚合
        .sum(1)
        //添加自定义Mysqlsink
//        .addSink(new MysqlSink)
        .print()

    //提交flink任务job
    environment.execute()

  }
}
class MysqlSink extends RichSinkFunction[((String, String, String, String), Int)] {
  //获取配置文件
  val driver = ConfigUtil.getString("mysql-driver")
  val url = ConfigUtil.getString("mysql-url")
  val user = ConfigUtil.getString("mysql-user")
  val password = ConfigUtil.getString("mysql-password")

  private var connection: Connection = null
  private var ps: PreparedStatement = null

  override def open(parameters: Configuration): Unit = {
    //1:加载驱动
    Class.forName(driver)
    //2:创建连接
    connection = DriverManager.getConnection(url, user, password)
    //3:获得执行语句
    val sql = "insert into person_count(db,TableName,TypeTable,timeStamp,person_count) values(?,?,?,?,?);"
    ps = connection.prepareStatement(sql)
  }

  override def invoke(value: ((String, String, String, String), Int)): Unit = {
    try {
      //4.组装数据,执行插入操作
      ps.setString(1, value._1._1)
      ps.setString(2, value._1._2)
      ps.setString(3, value._1._3)
      ps.setString(4, value._1._4)
      ps.setInt(5, value._2)
      ps.executeUpdate()
    } catch {
      case e: Exception => println(e.getMessage)
    }
  }
  //关闭连接操作
  override def close(): Unit = {
    if (connection != null) {
      connection.close()
    }
    if (ps != null) {
      ps.close()
    }
  }
}

建议使用第二方案代码,flink自带的解析json字符串反序列化有问题

封装的驱动连接如下

package com.jt.util;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.configuration.reloading.FileChangedReloadingStrategy;
import org.apache.log4j.Logger;

public class ConfigUtil {
    private static final Logger logger = Logger.getLogger(ConfigUtil.class);
    private static PropertiesConfiguration config = null;

    private ConfigUtil() {

    }

    static {
        try {
            //初始化配置
            config = new PropertiesConfiguration("config.properties");
            //文件修改之后自动加载
            config.setReloadingStrategy(new FileChangedReloadingStrategy());
            //配置文件自动保存
            config.setAutoSave(true);
        } catch (ConfigurationException ex) {
            logger.error(ex.getMessage());
        }
    }

    /**
     * 获取String参数
     *
     * @param key
     * @return
     */
    public static String getString(String key) {
        return config.getString(key, "");
    }

    /**
     * 获取String数组参数
     *
     * @param key
     * @return
     */
    public static String[] getStringArray(String key) {
        return config.getStringArray(key);
    }

    /**
     * 获取Integer参数
     *
     * @param key
     * @return
     */
    public static Integer getInteger(String key) {
        return config.getInteger(key, 0);
    }

    /**
     * 获取Double参数
     *
     * @param key
     * @return
     */
    public static Double getDouble(String key) {
        return config.getDouble(key, 0D);
    }

    /**
     * 获取Long参数
     *
     * @param key
     * @return
     */
    public static Long getLong(String key) {
        return config.getLong(key, 0L);
    }

    /**
     * 新增或修改参数
     *
     * @param key
     * @param value
     */
    public static void setProperty(String key, Object value) {
        config.setProperty(key, value);
    }
}

配置文件

mysql-driver=com.mysql.jdbc.Driver
mysql-url=jdbc:mysql://xxxx.xxxx.xxxx.xxxx:3306/test01?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull
mysql-user=username
mysql-password=password

#datasource parameters
initSize=1
maxSize=4

pom.xml依赖配置

<dependencies>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.66</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>1.10.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_2.12</artifactId>
            <version>1.11.0</version>
        </dependency>


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-cep_2.11</artifactId>
            <version>1.11.2</version>
        </dependency>

        <!-- json 依赖 -->
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>commons-beanutils</groupId>
            <artifactId>commons-beanutils</artifactId>
            <version>1.7.0</version>
        </dependency>
        <dependency>
            <groupId>commons-collections</groupId>
            <artifactId>commons-collections</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>net.sf.ezmorph</groupId>
            <artifactId>ezmorph</artifactId>
            <version>1.0.3</version>
        </dependency>



        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
<!--            <scope>provided</scope>-->
        </dependency>



        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>


        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.3</version>
        </dependency>


        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.48</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- 指定编译java的插件 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <!-- 指定编译scala的插件 -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>

        </plugins>
    </build>
  • 4
    点赞
  • 40
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
首先,你需要在 Scala 代码中引入以下依赖: ```scala libraryDependencies += "org.apache.flink" %% "flink-scala" % flinkVersion libraryDependencies += "org.apache.flink" %% "flink-streaming-scala" % flinkVersion libraryDependencies += "org.apache.flink" %% "flink-connector-kafka" % flinkVersion libraryDependencies += "org.apache.flink" %% "flink-connector-hive" % flinkVersion libraryDependencies += "org.apache.flink" %% "flink-connector-jdbc" % flinkVersion ``` 然后,你可以使用以下代码来消费 Kafka 数据: ```scala import org.apache.flink.streaming.api.scala._ import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer val env = StreamExecutionEnvironment.getExecutionEnvironment val kafkaConsumer = new FlinkKafkaConsumer[String]("topic", new SimpleStringSchema(), properties) val stream = env.addSource(kafkaConsumer) // 对数据进行处理 val result = stream.map(...) ``` 其中,`properties` 是一个 `Properties` 对象,用于配置 Kafka 的连接信息。 接下来,你需要将处理后的数据写入到 Hive 和 Doris 中。可以使用以下代码: ```scala import org.apache.flink.table.api.bridge.scala._ import org.apache.flink.table.catalog.hive.HiveCatalog import org.apache.flink.streaming.api.scala.StreamTableEnvironment val tableEnv = StreamTableEnvironment.create(env) val hiveCatalog = new HiveCatalog("myHiveCatalog", "myDatabase", "/path/to/hive/conf", "2.3.4") tableEnv.registerCatalog("myHiveCatalog", hiveCatalog) tableEnv.useCatalog("myHiveCatalog") tableEnv.executeSql("CREATE TABLE myHiveTable (...) WITH (...)") result.toTable(tableEnv, "myResultTable") tableEnv.executeSql("INSERT INTO myHiveTable SELECT * FROM myResultTable") val jdbcUrl = "jdbc:mysql://localhost:3306/my_database" tableEnv.executeSql(s"CREATE TABLE myDorisTable (...) WITH (...)") tableEnv.executeSql(s"INSERT INTO myDorisTable SELECT * FROM myResultTable") ``` 其中,`myHiveCatalog` 是 Hive 的 Catalog 名称,`myDatabase` 是 Hive 中的数据库名称,`/path/to/hive/conf` 是 Hive 的配置文件所在路径,`2.3.4` 是 Hive 的版本号。 `myHiveTable` 和 `myDorisTable` 是你要写入数据的表名,`(...)` 是表的列定义和其他属性,`myResultTable` 是处理后的数据表名。 `jdbcUrl` 是 Doris 数据库的连接信息,你需要根据实际情况进行修改。 你需要将上述代码中的 `...` 替换为实际的处理逻辑和表定义。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值