A Scheduled Incremental Sync Solution for SQL Server Based on Spark + SQL Server CDC
The launcher class, responsible for the initialization and setup work.
package com.etl;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SQLContext;
import java.sql.Connection;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import static java.lang.Thread.sleep;
public class SqlServerETLMysql {
    /**
     * Entry point: applies recorded DDL changes, snapshots the list of
     * CDC-enabled tables, then starts three worker threads for the sync.
     * @param args unused
     * @throws Exception on any extraction or load failure
     */
    public static void main(String[] args) throws Exception {
        // Keep Spark/Hadoop logging down to errors only
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR);
        Logger.getLogger("org.apache.hadoop").setLevel(Level.ERROR);
        // Initialize the Spark environment
        JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf()
                .setAppName("SparkSQLETL")
                .setMaster("yarn")
                .set("spark.app.id", "1000")
                .set("spark.driver.memory", "2g")
                .set("spark.executor.memory", "2g")
                .set("spark.executor.instances", "32")
                .set("spark.executor.cores", "3")
                .set("spark.default.parallelism", "10"));
        SQLContext sqlContext = new SQLContext(sparkContext);
        String dateStr = dateStr();
        // Apply DDL changes recorded by CDC
        Dataset<String> ddlDF = ddlHistory(sqlContext);
        if (ddlDF.count() != 0) {
            Map<String, Integer> sum = SparkTransform.ddlHistory(ddlDF);
            System.out.println(dateStr + " schema changes applied, " + sum.get("sum") + " in total");
            System.out.println(dateStr + " columns added: " + sum.get("addSum"));
            System.out.println(dateStr + " columns dropped: " + sum.get("dropSum"));
            System.out.println(dateStr + " columns altered: " + sum.get("alterSum"));
        }
        // Read the cdc.lsn_time_mapping table, which maps LSNs to transaction commit times
        Dataset ltmDF = SqlServerExtract.lsnTimeMapping(sqlContext);
        // Record which tables in the database have CDC enabled
        writeFile(sqlContext, dateStr);
        // Start three worker threads that share a single SparkRun task
        SparkRun sparkRun = new SparkRun(sqlContext, ltmDF, dateStr);
        Thread sparkRun1 = new Thread(sparkRun);
        Thread sparkRun2 = new Thread(sparkRun);
        Thread sparkRun3 = new Thread(sparkRun);
        sparkRun1.setName("Thread-1");
        sparkRun2.setName("Thread-2");
        sparkRun3.setName("Thread-3");
        sparkRun1.start();
        sleep(1000);
        sparkRun2.start();
        sleep(1000);
        sparkRun3.start();
    }
    /**
     * Fetches the CDC-enabled tables in the database and writes their names to a
     * directory named after the database plus the current date-time; the directory
     * also doubles as the marker that this scheduled run has started.
     *
     * @param sqlContext the initialized Spark SQL context
     * @param dateStr    the current date-time string
     * @throws Exception on extraction or write failure
     */
    public static void writeFile(SQLContext sqlContext, String dateStr) throws Exception {
        // Read the sys.tables catalog view
        Dataset readDF = SqlServerExtract.kaiqi(sqlContext, "sys.tables");
        // Keep only the names of the tables that have CDC enabled
        readDF = readDF.select("name").where("is_tracked_by_cdc = 1");
        // Write the table names out as CSV
        readDF.write().csv("CDCTableName" + dateStr);
    }
    /**
     * Fetches the DDL operation history for CDC-enabled tables.
     *
     * @param sqlContext the initialized Spark SQL context
     * @return the DDL history records
     * @throws Exception on extraction failure
     */
    public static Dataset ddlHistory(SQLContext sqlContext) throws Exception {
        // Read cdc.ddl_history, which logs DDL statements run against CDC-enabled
        // tables. The original listing is cut off here; the body below is a minimal
        // reconstruction that reuses the generic SqlServerExtract reader seen above.
        Dataset readDF = SqlServerExtract.kaiqi(sqlContext, "cdc.ddl_history");
        return readDF;
    }

    /**
     * Builds the current date-time string used in file names and log output.
     * (Not shown in the original excerpt; the format is assumed.)
     */
    public static String dateStr() {
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
    }
}
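
The SqlServerExtract helper used throughout main is not shown in this section. With Spark's built-in JDBC data source, a generic reader like the one it exposes would plausibly look like the sketch below; the method names kaiqi and lsnTimeMapping are taken from the call sites above, while the connection URL, credentials, and driver settings are placeholder assumptions.

import java.util.Properties;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class SqlServerExtract {
    // Placeholder connection details; substitute your own server, database and credentials.
    private static final String URL =
            "jdbc:sqlserver://localhost:1433;databaseName=SourceDB";
    private static final Properties PROPS = new Properties();
    static {
        PROPS.put("user", "etl_user");
        PROPS.put("password", "etl_password");
        PROPS.put("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver");
    }

    /** Generic table reader; the launcher calls this with "sys.tables" and the like. */
    public static Dataset<Row> kaiqi(SQLContext sqlContext, String table) throws Exception {
        return sqlContext.read().jdbc(URL, table, PROPS);
    }

    /** Reads cdc.lsn_time_mapping, which maps LSN ranges to commit times. */
    public static Dataset<Row> lsnTimeMapping(SQLContext sqlContext) throws Exception {
        return sqlContext.read().jdbc(URL, "cdc.lsn_time_mapping", PROPS);
    }
}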
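
Likewise, the SparkRun class that the three threads execute is defined elsewhere. Because a single instance is shared by all three threads, it most likely pulls table names from a thread-safe work queue. The sketch below illustrates that pattern under stated assumptions: the work queue, the cdc.dbo_<table>_CT change-table naming, and the MySQL connection details are all illustrative, and the LSN bounding that ltmDF would drive in the real job is omitted.

import java.util.Properties;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;

public class SparkRun implements Runnable {
    private final SQLContext sqlContext;
    private final Dataset<Row> ltmDF; // cdc.lsn_time_mapping; would bound the LSN range per run
    private final String dateStr;
    // Shared, thread-safe queue of CDC-enabled table names, assumed to be
    // filled from the CSV that writeFile() produced.
    private static final Queue<String> TABLES = new ConcurrentLinkedQueue<>();

    // Placeholder MySQL target connection; substitute real values.
    private static final String MYSQL_URL = "jdbc:mysql://localhost:3306/TargetDB";
    private static final Properties MYSQL_PROPS = new Properties();
    static {
        MYSQL_PROPS.put("user", "etl_user");
        MYSQL_PROPS.put("password", "etl_password");
    }

    public SparkRun(SQLContext sqlContext, Dataset<Row> ltmDF, String dateStr) {
        this.sqlContext = sqlContext;
        this.ltmDF = ltmDF;
        this.dateStr = dateStr;
    }

    @Override
    public void run() {
        String table;
        // poll() is atomic, so no two threads ever pick up the same table.
        while ((table = TABLES.poll()) != null) {
            try {
                // Each CDC-enabled table dbo.X has a change table named cdc.dbo_X_CT.
                Dataset<Row> changes =
                        SqlServerExtract.kaiqi(sqlContext, "cdc.dbo_" + table + "_CT");
                // Append the captured changes to the matching MySQL table.
                changes.write().mode(SaveMode.Append).jdbc(MYSQL_URL, table, MYSQL_PROPS);
            } catch (Exception e) {
                System.out.println(Thread.currentThread().getName()
                        + " failed to sync " + table + ": " + e.getMessage());
            }
        }
    }
}

Under this reading, the one-second sleep between the start() calls in main presumably just staggers the three workers so they do not hit the source database at exactly the same moment.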