Flink SQL MySQL Dimension Table Join

This post shows how to process Kafka data with Flink SQL: cleanse the records, enrich them through a dimension table join, and write the results back to Kafka. After setting up the Flink environment, configuring checkpointing, registering UDFs, and creating the Kafka source and sink tables plus the MySQL dimension table, the job consumes from Kafka, performs a temporal join against the MySQL dimension table, and publishes the processed data back to Kafka.

Requirement: consume data from Kafka, cleanse it and fill in missing fields via a dimension table join, then write the result data to Kafka.

import java.time.Duration

import com.sm.function.udf._
import com.sm.common.conf.PropManager
import com.sm.constants.Constants
import com.sm.utils.FlinkUtils
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.time.Time
import org.apache.flink.streaming.api.environment.{CheckpointConfig, ExecutionCheckpointingOptions}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.log4j.Level
import org.slf4j.LoggerFactory

/**
  * Flink SQL dimension table join:
  * consume from Kafka, join against a MySQL dimension table, and write the result to Kafka.
  *
  * create by LiuJinHe 2020/10/22
  */
object LoginEtlKafka2Kafka {
  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.INFO)

    // Initialize the streaming environment
    // For local testing with the web UI, the flink-runtime-web dependency is required:
    //   val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Single thread for local testing:
    //    env.setParallelism(1)

    //    val params = ParameterTool.fromArgs(args)
    //    env.getConfig.setGlobalJobParameters(params)

    // Restart on failure with a fixed delay: up to 10 attempts, 3 seconds apart
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, Time.seconds(3)))

    // Processing time: event handling time is determined by the system clock
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    // Create the StreamTableEnvironment
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, FlinkUtils.getSettings)

    // Checkpoint settings
    val tableConfig = tableEnv.getConfig.getConfiguration
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE)
    // Checkpoint interval: take a checkpoint every 60 seconds; the sink commits after each checkpoint completes
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(60))
    // Checkpoint timeout: a checkpoint that does not complete within 60 seconds is discarded
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT, Duration.ofSeconds(60))
    // Minimum pause between checkpoints: at least 30 seconds apart
    tableConfig.set(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS, Duration.ofSeconds(30))
    // Allow only one checkpoint in flight at a time
    tableConfig.set(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS, Integer.valueOf(1))
    // Retain externalized checkpoints when the job is cancelled manually
    tableConfig.set(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT,
      CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    // Minimum and maximum idle state retention time
    tableEnv.getConfig.setIdleStateRetentionTime(Time.hours(12), Time.hours(24))

    tableConfig.setString("table.exec.mini-batch.enabled", "true")
    tableConfig.setString("table.exec.mini-batch.allow-latency", "2s")
    tableConfig.setString("table.exec.mini-batch.size", "5000")
    tableConfig.setString("table.optimizer.distinct-agg.split.enabled", "true")

    // Values for local testing
    //    val odsLoginTopic = "ods_login"
    //    val bootstrapServers = "192.168.100.39:9092"
    //    val groupId = "flink-test-group"
    //    val startMode = "group-offsets"
    //    val dwdSdkUserLogin = "dwd_sdk_user_login"

    /**
      * Load configuration
      */
    val odsUserLoginTopic = PropManager.getProp(Constants.ODS_USER_LOGIN_TOPIC)
    val dwdUserLoginTopic = PropManager.getProp(Constants.DWD_USER_LOGIN_TOPIC)
    val bootstrapServers = PropManager.getProp(Constants.BOOTSTRAP_SERVERS)
    val ConsumerGroupId = PropManager.getProp(Constants.CONSUMER_GROUP_ID)
    val ProducerGroupId = PropManager.getProp(Constants.PRODUCER_GROUP_ID)
    val startMode = PropManager.getProp(Constants.STARTUP_MODE)


    /**
      * kafka source table
      */
    tableEnv.executeSql("drop table if exists login_kafka_Source")

    val kafkaSourceDDL =
      s"""
         |create table $odsUserLoginTopic (
         | _game_id int,
         | _login_name string,
         | _time bigint,
         | _time_server bigint,
         | _device_key string,
         | _os string,
         | _imei string,
         | ts as to_timestamp(from_unixtime(_time / 1000, 'yyyy-MM-dd HH:mm:ss')),
         | proc_time as proctime()
         |) with (
         | 'connector' = 'kafka',
         | 'topic' = '$odsUserLoginTopic',
         | 'properties.bootstrap.servers' = '$bootstrapServers',
         | 'properties.group.id' = '$ConsumerGroupId',
         | 'scan.startup.mode' = '$startMode',
         | 'format' = 'json',
         | 'json.fail-on-missing-field' = 'false',
         | 'json.ignore-parse-errors' = 'true'
         |)
      """.stripMargin
    tableEnv.executeSql(kafkaSourceDDL)

    /**
      * dwd (Kafka sink table)
      */
    tableEnv.executeSql(s"drop table if exists $dwdUserLoginTopic")

    val kafkaSinkDDL =
      s"""
         |create table $dwdUserLoginTopic (
         |  cp_game_id int,
         |  game_id int,
         |  login_account string,
         |  `time` string,
         |  time_server string,
         |  device_key string,
         |  os string,
         |  primary key (`time`, game_id, login_account) not enforced
         |) with (
         | 'connector' = 'kafka',
         | 'topic' = '$dwdUserLoginTopic',
         | 'properties.bootstrap.servers' = '$bootstrapServers',
         | 'properties.group.id' = '$ProducerGroupId',
         | 'sink.partitioner' = 'round-robin',
         | 'format' = 'json',
         | 'json.fail-on-missing-field' = 'false',
         | 'json.ignore-parse-errors' = 'true'
         |)
    """.stripMargin
    tableEnv.executeSql(kafkaSinkDDL)

    /**
      * DIM (MySQL dimension table)
      */
    val dimSourceDDL =
      """
        |create table dim_mysql_games (
        |  ID bigint,
        |  CP_GAME_ID int
        |) with (
        | 'connector' = 'jdbc',
        | 'url' = 'jdbc:mysql://cdh-slave04:3306/game_platform?useSSL=false&useUnicode=true&characterEncoding=UTF-8&characterSetResults=UTF-8&zeroDateTimeBehavior=CONVERT_TO_NULL&serverTimezone=UTC',
        | 'username' = 'root',
        | 'password' = '123456',
        | 'table-name' = 'games',
        | 'driver' = 'com.mysql.cj.jdbc.Driver',
        | 'scan.fetch-size' = '200',
        | 'lookup.cache.max-rows' = '2000',
        | 'lookup.cache.ttl' = '300s'
        |)
      """.stripMargin
    tableEnv.executeSql(dimSourceDDL)

    /**
      * Register user-defined functions
      */
    tableEnv.createTemporarySystemFunction("ParseDeviceCode", classOf[ParseDeviceCode])
    tableEnv.createTemporarySystemFunction("ParseDeviceKey", classOf[ParseDeviceKey])
    tableEnv.createTemporarySystemFunction("ParseOsVersion", classOf[ParseOsVersion])
    tableEnv.createTemporarySystemFunction("ParseDeviceId", classOf[ParseDeviceId])
    tableEnv.createTemporarySystemFunction("ParseMd5DeviceId", classOf[ParseMd5DeviceId])

    /**
      * temporal join
      */
    val queryDDL =
      s"""
         |insert into $dwdUserLoginTopic
         |select
         | dim.CP_GAME_ID as cp_game_id,
         | _game_id as game_id,
         | coalesce(_login_name, '') as login_account,
         | from_unixtime(_time / 1000, 'yyyy-MM-dd HH:mm:ss') as `time`,
         | from_unixtime(_time_server / 1000, 'yyyy-MM-dd HH:mm:ss') as time_server,
         | ParseDeviceKey(_device_key, _os, _imei) as device_key,
         | _os as os
         |from $odsUserLoginTopic as login
         | left join dim_mysql_games for system_time as of login.proc_time as dim
         | on dim.ID = login._game_id
      """.stripMargin
    tableEnv.executeSql(queryDDL)
  }
}

 

Configuration loading 

By default, the configuration is loaded from application.properties under resources; an example properties file is sketched after the PropManager code below.

import com.typesafe.config.{Config, ConfigFactory}

/**
  * Configuration loader backed by Typesafe Config.
  *
  * create by LiuJinHe 2020/3/11
  */
object PropManager {
  final val config: Config = ConfigFactory.load
  def getProp(key: String): String = config.getString(key).trim
}
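
For reference, a minimal application.properties sketch matching the lookups above. The key names are assumptions (the Constants definitions are not shown); the values reuse the test values from the commented-out block in the job.

# application.properties (hypothetical key names)
ods.user.login.topic=ods_login
dwd.user.login.topic=dwd_sdk_user_login
bootstrap.servers=192.168.100.39:9092
consumer.group.id=flink-test-group
producer.group.id=flink-test-group
startup.mode=group-offsets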

 

Custom UDF

import com.mysql.cj.util.StringUtils
import com.sm.common.utils.MD5Utils
import org.apache.flink.table.functions.ScalarFunction

/**
  * Custom UDF: parse the device key.
  *
  * create by LiuJinHe 2020/9/22
  */
class ParseDeviceKey extends ScalarFunction {
  def eval(deviceKey: String, os: String, imei: String): String = {
    if (!StringUtils.isNullOrEmpty(deviceKey)) {
      deviceKey.toLowerCase
    } else if (os == "1") {
      ""
    } else {
      MD5Utils.MD5Encode(imei).toLowerCase
    }
  }
}
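
A quick sanity check of the UDF's branching, as a sketch: it assumes the project's com.sm.common module (MD5Utils) is on the classpath, and ParseDeviceKeyCheck is just a throwaway driver, not part of the job.

object ParseDeviceKeyCheck extends App {
  val parse = new ParseDeviceKey
  println(parse.eval("ABC123", "2", "860000000000000")) // non-empty device key -> lowercased: "abc123"
  println(parse.eval(null, "1", "860000000000000"))     // empty key and os == "1" -> ""
  println(parse.eval("", "2", "860000000000000"))       // otherwise -> lowercased MD5 of the imei
}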

pom

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.12</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <flink.version>1.11.2</flink.version>
        <hive.version>2.1.1-cdh6.2.0</hive.version>
        <hadoop.version>3.0.0-cdh6.2.0</hadoop.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>com.sm.common</groupId>
            <artifactId>common</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>

        <!-- Flink Dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>com.alibaba.ververica</groupId>
            <artifactId>flink-sql-connector-mysql-cdc</artifactId>
            <version>1.1.0</version>
            <scope>provided</scope>
        </dependency>

        <!--        other Dependency-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.16</version>
            <!--            <scope>provided</scope>-->
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compiles the Scala sources to class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.2.0</version>
                <executions>
                    <execution>
                        <goals>
                            <!-- Bind to Maven's compile phase -->
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 

To submit to the cluster: place the following jars under $FLINK_HOME/lib, restart the cluster, and mark all of the dependencies above as <scope>provided</scope> when packaging.

flink-connector-jdbc_2.11-1.11.2.jar
flink-sql-connector-kafka_2.11-1.11.2.jar
flink-sql-connector-mysql-cdc-1.1.0.jar    # MySQL CDC; already bundles mysql-connector-java

 
