We had a business requirement to sync a table from one database into another, but the source database's binlog could not be read and only a read-only account was available, so this tool was developed. The approach is loosely coupled and suits data sync between databases of many types: only the JDBC driver and connection settings need to change, and it can achieve second-level sync latency.
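For example, to read from PostgreSQL instead of MySQL, only the source-side settings in the configuration file from step 7 would change (a sketch for illustration; it assumes the PostgreSQL JDBC driver has been added to the pom, and the host, port, database and credentials are placeholders):
source.driver=org.postgresql.Driver
source.url=jdbc:postgresql://localhost:5432/test
source.user=readonly_user
source.password=123456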
Everything below uses MySQL as the example.
1. Create a table that stores each application's incremental-sync offset
CREATE TABLE `offset_tab` (
  `app_name` varchar(100) DEFAULT NULL,
  `last_offset` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
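When no offset row exists, the receiver in step 5 falls back to the current time, but the starting point can also be seeded explicitly. A minimal example, using the app.name value spark_cdc from the configuration in step 7 and an arbitrary starting timestamp (note offset_tab has no key, so keep exactly one row per app_name):
INSERT INTO offset_tab (app_name, last_offset) VALUES ('spark_cdc', '2024-01-01 00:00:00');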
2. Create the source table test and the target table test2
CREATE TABLE `test` (
  `id` int(11) DEFAULT NULL,
  `name` varchar(100) DEFAULT NULL,
  `create_time` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `test2` (
  `id` int(11) DEFAULT NULL,
  `name` varchar(100) DEFAULT NULL,
  `create_time` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
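To exercise the pipeline end to end, a few sample rows can be inserted into the source table; create_time must follow the yyyy-MM-dd HH:mm:ss pattern that the receiver parses (the values below are made up for illustration):
INSERT INTO test (id, name, create_time) VALUES (1, 'alice', '2024-01-01 10:00:00');
INSERT INTO test (id, name, create_time) VALUES (2, 'bob', '2024-01-01 10:00:30');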
3. Add the required Maven dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.fbq</groupId>
  <artifactId>spark-streaming-cdc</artifactId>
  <version>1.0-SNAPSHOT</version>
  <properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.38</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.53</version>
    </dependency>
  </dependencies>
  <build>
    <finalName>SparkStreamingCdc</finalName>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.4</version>
        <configuration>
          <source>8</source>
          <target>8</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>3.1.0</version>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.fbq.SparkStreamingCdc</mainClass>
              <addClasspath>true</addClasspath> <!-- generate a Class-Path attribute in the jar's manifest -->
              <classpathPrefix>lib/</classpathPrefix> <!-- classpath prefix, i.e. the directory holding the dependency jars -->
            </manifest>
          </archive>
        </configuration>
      </plugin>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>4.5.0</version>
        <executions>
          <execution>
            <id>scala-compile</id>
            <goals>
              <goal>compile</goal>
            </goals>
            <configuration>
              <!-- includes lists the source files to compile -->
              <includes>
                <include>**/*.scala</include>
              </includes>
            </configuration>
          </execution>
          <execution>
            <id>scala-test-compile</id>
            <goals>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
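With the manifest above, the job can be built with mvn package and submitted roughly as follows (a sketch: the resource flags and paths depend on your cluster, the MySQL driver jar still has to be shipped, e.g. via --jars, and --files places app.properties in the container's working directory so it can be passed as args(0)):
spark-submit \
  --class com.fbq.SparkStreamingCdc \
  --master yarn \
  --deploy-mode cluster \
  --files app.properties \
  target/SparkStreamingCdc.jar app.properties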
4. Define a case class that holds one row of data together with its offset
import org.apache.spark.sql.Row
case class JdbcRecord(row: Row, offset: String)
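Because every column is fetched with rs.getString in the receiver below, the Row carries strings only, which is also why the schema built in step 6 declares every field as StringType. A constructed example (values made up for illustration):
val record = JdbcRecord(Row.fromSeq(Seq("1", "alice", "2024-01-01 10:00:00")), "2024-01-01 10:05:00")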
5. Implement a custom Receiver
import org.apache.log4j.Logger
import org.apache.spark.sql.Row
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

import java.sql._
import java.text.SimpleDateFormat
import java.util.{Date, Properties}

@SerialVersionUID(1L)
class JdbcSource(pro: Properties)
  extends Receiver[JdbcRecord](StorageLevel.MEMORY_AND_DISK_2) {

  @transient private lazy val log = Logger.getLogger(getClass.getSimpleName)
  @transient private var sourceConnection: Connection = _
  @transient private var metadataConnection: Connection = _
  @transient private var stmt: PreparedStatement = _
  @transient private var offset: String = _

  override def onStart(): Unit = {
    try {
      // initialize the connection to the source database
      Class.forName(pro.getProperty("source.driver"))
      val url = pro.getProperty("source.url")
      val user = pro.getProperty("source.user")
      val password = pro.getProperty("source.password")
      sourceConnection = DriverManager.getConnection(url, user, password)
      // initialize the connection to the metadata (offset) database
      Class.forName(pro.getProperty("meta.driver"))
      val metaUrl = pro.getProperty("meta.url")
      val metaUser = pro.getProperty("meta.user")
      val metaPassword = pro.getProperty("meta.password")
      metadataConnection = DriverManager.getConnection(metaUrl, metaUser, metaPassword)
      receive()
    } catch {
      case e@(_: SQLException | _: ClassNotFoundException) =>
        // let Spark restart the receiver instead of leaving it dead
        restart("Error connecting to the source or metadata database", e)
    }
  }

  /**
   * Read the incremental data and hand it to Spark.
   */
  private def receive(): Unit = {
    // read on a separate thread so that onStart() returns promptly;
    // the work must live in run(), otherwise it executes in the
    // anonymous class's constructor on the calling thread
    new Thread("jdbc-source-reader") {
      override def run(): Unit = {
        setOffset()
        val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        // the new offset is the timestamp at which this batch is read
        val newOffset: String = format.format(new Date())
        // fetch the incremental rows from the source database
        stmt = sourceConnection.prepareStatement(pro.getProperty("source.sql"))
        stmt.setTimestamp(1, Timestamp.valueOf(offset))
        log.info("old offset: " + offset + ", new offset: " + newOffset)
        val rs: ResultSet = stmt.executeQuery
        // the column names of the source table
        val columns: Array[String] = pro.getProperty("source.column").split(",")
        while (rs.next) {
          // read the current row column by column
          val data = columns.map(columnName => rs.getString(columnName))
          // hand the row and its offset to Spark
          store(JdbcRecord(Row.fromSeq(data.toSeq), newOffset))
        }
        rs.close()
        Thread.sleep(pro.getProperty("window.time.dual").toLong * 1000)
        // restart the receiver so the next batch is read with the updated offset
        restart("Fetching the next batch")
      }
    }.start()
  }

  /**
   * Load the last saved offset for this application from the metadata
   * database, falling back to the current time when none exists yet.
   */
  private def setOffset(): Unit = {
    val offsetSql = "select last_offset from offset_tab where app_name=?"
    val offsetStmt = metadataConnection.prepareStatement(offsetSql)
    offsetStmt.setString(1, pro.getProperty("app.name"))
    val rs = offsetStmt.executeQuery
    if (rs.next()) {
      offset = rs.getString("last_offset")
    } else {
      // no saved offset yet: start syncing from now
      val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      offset = format.format(new Date())
    }
    rs.close()
    offsetStmt.close()
  }

  override def onStop(): Unit = {
    try {
      if (stmt != null) stmt.close()
      if (sourceConnection != null) sourceConnection.close()
      if (metadataConnection != null) metadataConnection.close()
    } catch {
      case e: SQLException =>
        e.printStackTrace()
    }
  }
}
6. Define the main class that reads from the source database and writes to the target database
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import java.io.FileInputStream
import java.sql.DriverManager
import java.util.Properties

object SparkStreamingCdc {
  def main(args: Array[String]): Unit = {
    // path of the configuration file, passed as the first argument
    val profileName = args(0)
    val pro: Properties = new Properties()
    pro.load(new FileInputStream(profileName))
    val spark = SparkSession.builder()
      // the master is normally supplied via spark-submit; "local[*]" is handy for local testing
      .master("yarn-cluster")
      //.master("local[*]")
      .config("spark.dynamicAllocation.enabled", false)
      .enableHiveSupport()
      .getOrCreate()
    val windowTime = pro.getProperty("window.time.dual").toLong
    spark.sparkContext.setLogLevel("INFO")
    val ssc = new StreamingContext(spark.sparkContext, Seconds(windowTime))
    val source = new JdbcSource(pro)
    val updateList = ssc.receiverStream(source).window(Seconds(windowTime), Seconds(windowTime))
    val columns = pro.getProperty("source.column").split(",")
    // every column is read as a string, so the schema is all StringType
    val schema: StructType = StructType(columns.map(column => StructField(column, StringType, nullable = true)))
    updateList.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        // convert the RDD of JdbcRecord rows into a DataFrame
        val frame = spark.sqlContext.createDataFrame(rdd.map(item => item.row), schema)
        // prepare the statement that persists the latest offset
        val updateOffsetSql = "update offset_tab set last_offset=? where app_name=?"
        val url = pro.getProperty("meta.url")
        val user = pro.getProperty("meta.user")
        val password = pro.getProperty("meta.password")
        Class.forName(pro.getProperty("meta.driver"))
        val connection = DriverManager.getConnection(url, user, password)
        val statement = connection.prepareStatement(updateOffsetSql)
        statement.setString(1, rdd.first().offset)
        statement.setString(2, pro.getProperty("app.name"))
        val prop = new java.util.Properties
        prop.setProperty("user", pro.getProperty("target.user"))
        prop.setProperty("password", pro.getProperty("target.password"))
        prop.setProperty("driver", pro.getProperty("target.driver"))
        frame.write.mode("append").jdbc(pro.getProperty("target.url"), pro.getProperty("target.table"), prop)
        // commit the new offset only after the batch has been written
        statement.executeUpdate()
        statement.close()
        connection.close()
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
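A note on delivery semantics: because the incremental predicate is create_time >= ?, rows whose create_time equals the saved offset are read again in the next batch, so the pipeline is at-least-once and the target can receive duplicates. A partial mitigation is to deduplicate each batch before writing (a sketch, assuming id identifies a row); duplicates across batches would still need a key or upsert logic on the target side:
val deduped = frame.dropDuplicates("id")
deduped.write.mode("append").jdbc(pro.getProperty("target.url"), pro.getProperty("target.table"), prop)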
7. Add the configuration file
# application name
app.name=spark_cdc
# sync interval, in seconds
window.time.dual=60
# source db
# source table (informational; the query below embeds the table name)
source.table=test
# JDBC driver class of the source database
source.driver=com.mysql.jdbc.Driver
# JDBC url of the source database
source.url=jdbc:mysql://localhost:3306/test?serverTimezone=Asia/Shanghai
# account and password of the (read-only) source database
source.user=root
source.password=123456
# SQL that reads the incremental data; the placeholder is bound to the saved offset
source.sql=select id,name,create_time from test where create_time >= ?
# columns of the source table
source.column=id,name,create_time
# offset db: account and password of the database that holds offset_tab
meta.driver=com.mysql.jdbc.Driver
meta.url=jdbc:mysql://localhost:3306/test?serverTimezone=Asia/Shanghai
meta.user=root
meta.password=123456
# target db: driver class, account and password, and the table to write into (test2 from step 2)
target.driver=com.mysql.jdbc.Driver
target.url=jdbc:mysql://localhost:3306/test?serverTimezone=Asia/Shanghai
target.user=root
target.password=123456
target.table=test2
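Since the incremental query filters on create_time in every batch, an index on that column keeps the source-side cost low as the table grows; an optional, suggested addition:
ALTER TABLE test ADD INDEX idx_create_time (create_time);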