The business requirement was to sync data from Oracle into StarRocks. We first evaluated Flink CDC, but after it had been running for a while the Oracle instance started running out of memory. After going through the related issues and documentation, we confirmed the cause was the outdated Debezium version bundled in Flink CDC 2.3; see this issue for details: https://github.com/ververica/flink-cdc-connectors/issues/815
So we switched to a newer Debezium release plus Kafka Connect to get the change data into Kafka, and then consume those Kafka messages with Flink SQL to achieve near real-time synchronization. (The local test below uses Debezium's embedded engine instead of a full Kafka Connect cluster, but the idea is the same.)
For the local proof of concept I used a MySQL source as the test case.
First start a local ZooKeeper and Kafka; the versions used here are ZooKeeper 3.4.6 and Kafka 2.1.
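For reference, the embedded-engine program below needs the Debezium embedded engine, the MySQL connector and a Kafka client on the classpath. A minimal sbt sketch; the version numbers are assumptions on my side and should be aligned with your own environment:

// build.sbt (sketch) -- versions are illustrative assumptions, not the ones from the original setup
libraryDependencies ++= Seq(
  "io.debezium"      % "debezium-api"             % "1.9.7.Final",
  "io.debezium"      % "debezium-embedded"        % "1.9.7.Final",
  "io.debezium"      % "debezium-connector-mysql" % "1.9.7.Final",
  "org.apache.kafka" % "kafka-clients"            % "2.1.0"
)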
The following program captures the MySQL binlog changes and sends them to Kafka:
package debezium_cdc

import io.debezium.engine.DebeziumEngine.{ChangeConsumer, CompletionCallback}
import io.debezium.engine.format.Json
import io.debezium.engine.{ChangeEvent, DebeziumEngine}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import java.util
import java.util.Properties
import java.util.concurrent.{ExecutorService, Executors, TimeUnit}
import scala.collection.JavaConverters.asScalaBufferConverter

/**
 * Capture MySQL binlog changes with the Debezium embedded engine and send them to Kafka.
 *
 * @author zhangyunhao
 */
object MysqlDebeziumEngine {

  def main(args: Array[String]): Unit = {
    val bootstrapList = "localhost:9092"
    val topicName = "test_zyh_kafka_cdc"

    // Kafka producer configuration
    val kafkaProps = new Properties()
    kafkaProps.put("bootstrap.servers", bootstrapList)
    kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](kafkaProps)

    // Debezium configuration
    val props: Properties = new Properties()
    // Engine-level settings
    props.setProperty("name", "engine")
    props.setProperty("offset.storage", "org.apache.kafka.connect.storage.FileOffsetBackingStore")
    props.setProperty("offset.storage.file.filename", "/Users/zhangyunhao/IdeaProjects/flink_explore/offsets.log")
    props.setProperty("offset.flush.interval.ms", "6000")
    // Emit the full schema + payload envelope; the Flink debezium-json format relies on it later
    props.setProperty("converter.schemas.enable", "true")
    // MySQL connector settings
    props.setProperty("connector.class", "io.debezium.connector.mysql.MySqlConnector")
    props.setProperty("database.hostname", "127.0.0.1")
    props.setProperty("database.port", "3306")
    props.setProperty("database.user", "root")
    props.setProperty("database.password", "123456")
    props.setProperty("database.server.id", "85744") // any id unique among the MySQL replication clients
    props.setProperty("database.server.name", "my-app-connector") // arbitrary logical server name
    props.setProperty("database.include.list", "db_test") // database to capture
    props.setProperty("table.include.list", "db_test.stu_test") // table to capture
    props.setProperty("snapshot.mode", "schema_only") // capture the schema only, skip the initial data snapshot
    props.setProperty("decimal.handling.mode", "double")
    props.setProperty("database.history",
      "io.debezium.relational.history.FileDatabaseHistory")
    props.setProperty("database.history.file.filename",
      "/Users/zhangyunhao/IdeaProjects/flink_explore/dbhistory.log")

    try {
      // Build the engine. DebeziumEngine implements Closeable and can be closed explicitly.
      val engine: DebeziumEngine[ChangeEvent[String, String]] =
        DebeziumEngine.create(classOf[Json])
          .using(props)
          // Batch handler: log each change event, forward it to Kafka, then mark it processed
          .notifying(
            new ChangeConsumer[ChangeEvent[String, String]] {
              override def handleBatch(list: util.List[ChangeEvent[String, String]],
                                       recordCommitter: DebeziumEngine.RecordCommitter[ChangeEvent[String, String]]): Unit = {
                for (changeEvent <- list.asScala) {
                  println("event key: " + changeEvent.key())
                  println("event value: " + changeEvent.value())
                  // Send the change event to Kafka
                  producer.send(new ProducerRecord[String, String](topicName, changeEvent.key(), changeEvent.value()))
                  producer.flush()
                  recordCommitter.markProcessed(changeEvent)
                }
                recordCommitter.markBatchFinished()
              }
            }
          )
          // Completion callback: surface the error message if the engine stops abnormally
          .using(new CompletionCallback {
            override def handle(success: Boolean, message: String, error: Throwable): Unit = {
              if (!success && error != null) {
                println("----------error------")
                println(message)
                error.printStackTrace()
              }
            }
          })
          .build()

      // Run the engine asynchronously
      val executor: ExecutorService = Executors.newSingleThreadExecutor()
      executor.execute(engine)

      // Graceful shutdown: shutdown() stops accepting new tasks and lets the submitted task finish
      executor.shutdown()
      // Keep waiting until the engine terminates
      while (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
        println("Waiting another 10 seconds for the embedded engine to shut down")
      }
    } catch {
      case _: InterruptedException =>
        Thread.currentThread().interrupt()
    }
  }
}
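Because converter.schemas.enable is set to true, every message written to test_zyh_kafka_cdc should be a full Debezium JSON envelope (a schema section plus a payload with before/after, source and op fields), which is what the Flink source below expects with 'debezium-json.schema-include' = 'true'.

One thing the program above does not do is stop the engine by itself: the awaitTermination loop simply keeps waiting while the engine runs. A minimal sketch of a JVM shutdown hook that closes the engine on exit, assuming the engine and executor values from the code above (not part of the original program):

// Sketch only: close the embedded engine when the JVM exits, then let the executor drain.
Runtime.getRuntime.addShutdownHook(new Thread(new Runnable {
  override def run(): Unit = {
    engine.close()          // DebeziumEngine is Closeable; this stops the polling loop
    executor.shutdown()
    executor.awaitTermination(30, TimeUnit.SECONDS) // give in-flight batches a chance to finish
  }
}))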
Consume the corresponding messages with Flink SQL:
package debezium_cdc

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object BinlogParseApp {

  def main(args: Array[String]): Unit = {
    val fsSettings: EnvironmentSettings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val fsEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // fsEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    fsEnv.setParallelism(1)
    val tEnv: StreamTableEnvironment = StreamTableEnvironment.create(fsEnv, fsSettings)

    // Kafka source in debezium-json format.
    // Note: a primary key must be declared here, otherwise delete messages cannot be handled.
    // For the connectors that currently support the debezium-json format, see
    // https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/connectors/table/formats/overview/
    val sourceSql =
      """
        |CREATE TABLE topic_products (
        |  id BIGINT,
        |  name STRING,
        |  PRIMARY KEY (id) NOT ENFORCED
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'test_zyh_kafka_cdc',
        |  'properties.bootstrap.servers' = 'localhost:9092',
        |  'properties.group.id' = 'testGroup_zyh',
        |  'format' = 'debezium-json',
        |  'debezium-json.schema-include' = 'true'
        |)
        |""".stripMargin
    tEnv.executeSql(sourceSql)
    // tEnv.executeSql("select * from topic_products").print()

    // JDBC sink table pointing at the mirror table stu_test2
    val sinkSql =
      """
        |CREATE TABLE user_sink_table (
        |  id BIGINT,
        |  name STRING,
        |  PRIMARY KEY (id) NOT ENFORCED
        |) WITH (
        |  'connector' = 'jdbc',
        |  'url' = 'jdbc:mysql://localhost:3306/db_test',
        |  'table-name' = 'stu_test2',
        |  'username' = 'root',
        |  'password' = '123456',
        |  'sink.buffer-flush.interval' = '5s',
        |  'sink.buffer-flush.max-rows' = '100'
        |)
        |""".stripMargin
    tEnv.executeSql(sinkSql)

    // Continuously copy the changelog from the Kafka source into the JDBC sink
    val insertSql =
      """
        |insert into user_sink_table
        |select
        |  id,
        |  name
        |from topic_products
        |""".stripMargin
    tEnv.executeSql(insertSql)
  }
}
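For reference, the Flink job roughly needs the table planner, the Kafka connector, the JSON format (which provides debezium-json), the JDBC connector and a MySQL driver on its classpath. A hedged sbt sketch for Flink 1.13.x; the versions and the Scala 2.12 suffix are assumptions on my side:

// build.sbt (sketch) -- artifact versions are illustrative assumptions for Flink 1.13.x / Scala 2.12
libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-streaming-scala"        % "1.13.6",
  "org.apache.flink" %% "flink-clients"                % "1.13.6",
  "org.apache.flink" %% "flink-table-api-scala-bridge" % "1.13.6",
  "org.apache.flink" %% "flink-table-planner-blink"    % "1.13.6",
  "org.apache.flink" %% "flink-connector-kafka"        % "1.13.6",
  "org.apache.flink" %  "flink-json"                   % "1.13.6",
  "org.apache.flink" %% "flink-connector-jdbc"         % "1.13.6",
  "mysql"            %  "mysql-connector-java"         % "8.0.28"
)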
Run INSERT / UPDATE / DELETE statements against db_test.stu_test and the corresponding changes appear in the sink table db_test.stu_test2.
⚠️: the source table of the Flink SQL job must declare a primary key, otherwise delete messages cannot be handled.
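To reproduce the test end to end, a MySQL script along the following lines can be used; the column layout of stu_test / stu_test2 is an assumption derived from the id and name fields mapped in the Flink job above:

-- Assumed test tables; only id and name are mapped in the Flink job
CREATE TABLE db_test.stu_test  (id BIGINT PRIMARY KEY, name VARCHAR(64));
CREATE TABLE db_test.stu_test2 (id BIGINT PRIMARY KEY, name VARCHAR(64));

-- With snapshot.mode = schema_only, only changes made after the engine starts are captured
INSERT INTO db_test.stu_test VALUES (1, 'tom'), (2, 'jerry');
UPDATE db_test.stu_test SET name = 'tom_2' WHERE id = 1;
DELETE FROM db_test.stu_test WHERE id = 2;

-- Verify that the sink table mirrors the changes
SELECT * FROM db_test.stu_test2;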