1. Import strategy: load only each table's new and changed data
1.1 Importing data from ES (Elasticsearch)
package com.mingzhi.universal
import com.mingzhi.IDate
import com.mingzhi.utils.{SparkUtils, TableUtils}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.elasticsearch.spark.sql.EsSparkSQL
/**
* Load new and changed documents from the wfs_order_list_index index
*/
object load_new_change_from_es {
def main(args: Array[String]): Unit = {
val conf: SparkUtils.DefConf = SparkUtils.getDefConf
val builder = SparkSession.builder()
if (System.getProperties.getProperty("os.name").contains("Windows")) {
conf.host = "192.168.0.207"
conf.from_db = "wfs_order_list_index"
conf.from_table = "_doc"
conf.to_db = "paascloud"
conf.to_table = conf.from_db
conf.dt = "2020-06-16"
conf.dt1 = "2020-06-17"
conf.createTime = "orderCreateTime"
conf.updateTime = "orderUpdateTime"
conf.format = "parquet" // local-run default; in production the format comes in as the 10th argument
builder.master("local[*]")
} else {
if (args.length != 10) {
throw new IllegalArgumentException("10 arguments required: ES host, ES index, ES type, " +
"target database, target table, start date, end date, " +
"record create-time field, record update-time field, storage format (parquet or json)")
}
conf.host = args(0)
conf.from_db = args(1)
conf.from_table = args(2)
conf.to_db = args(3)
conf.to_table = args(4)
conf.dt = args(5)
conf.dt1 = args(6)
conf.createTime = args(7)
conf.updateTime = args(8)
conf.format = args(9)
}
builder
.config("es.nodes", conf.host)
.config("es.port", "9200")
.config("es.mapping.date.rich", false)
.config("dfs.replication", 1)
.config("hive.exec.dynamic.partition.mode", "nonstrict")
.appName(conf.from_table)
.enableHiveSupport()
val spark: SparkSession = builder.getOrCreate()
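// IDate.invoke(conf.dt, conf.dt1) drives the per-day callback below; judging from its usage here it
// walks the dates between dt and dt1 and calls onDate once per day, so each day is loaded into its own ODS partition.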
new IDate {
override def onDate(dt: String): Unit = {
println("回调日期:" + dt)
process(spark, conf.from_db, conf.from_table, conf.to_db, conf.to_table, dt, conf.createTime, conf.updateTime, conf.format)
}
}.invoke(conf.dt, conf.dt1)
spark.stop()
}
private def process(spark: SparkSession, from_db: String, from_table: String, to_db: String, to_table: String, dt: String, ct: String, ut: String, format: String) = {
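// The bool/should query below is an OR of two range filters: keep documents whose create-time field (ct)
// OR update-time field (ut) falls within day dt. That is exactly the "new and changed" set for that day.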
val esQuery =
s"""
|{
| "query":
| {
| "bool":
| {
| "should":
| [
| {
| "range": {
| "$ct":
| {
| "gte": "$dt 00:00:00",
| "lte": "$dt 23:59:59"
| }
| }
| },
| {
| "range": {
| "$ut":
| {
| "gte": "$dt 00:00:00",
| "lte": "$dt 23:59:59"
| }
| }
| }
| ]
| }
| }
|}
|""".stripMargin
val resDF: DataFrame = EsSparkSQL.esDF(spark, s"/$from_db/$from_table", esQuery)
resDF.createOrReplaceTempView("t_temp")
/**
* Tag every row with the processing date dt; dt becomes the corresponding ODS partition.
*/
val frame = spark.sql(
s"""
|select '$dt' as dt, * from t_temp
|""".stripMargin)
frame.show(false)
// println(frame.count())
if (TableUtils.tableExists(spark, to_db, to_table)) {
spark.sql(s"ALTER TABLE $to_db.$to_table DROP IF EXISTS PARTITION (dt='$dt')")
println(s"${to_table}表存在,删除${dt}分区的数据")
} else {
println(s"${to_table}表不存在,do nothing")
}
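// Dropping the day's partition first and then appending makes the load idempotent per dt:
// re-running the job for the same day replaces only that partition.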
frame
.coalesce(1)
.write
.format(format)
.mode(SaveMode.Append)
.partitionBy("dt")
.saveAsTable(s"${to_db}.$to_table")
}
}
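TableUtils.tableExists is a project-local helper whose source is not shown here. A minimal sketch of an equivalent, assuming it only needs to check whether the target table is already registered in the Hive catalog, could be built on Spark's catalog API:

package com.mingzhi.utils

import org.apache.spark.sql.SparkSession

object TableUtils {
  // True if db.table is already registered in the (Hive) catalog.
  def tableExists(spark: SparkSession, db: String, table: String): Boolean =
    spark.catalog.tableExists(db, table)
}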
1.2 Importing data from MySQL
package com.mingzhi.universal
import com.mingzhi.IDate
import com.mingzhi.utils.{SparkUtils, TableUtils}
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
* Load new and changed rows from tbwork_order
*/
object load_new_change_from_mysql {
def main(args: Array[String]): Unit = {
val conf = SparkUtils.getDefConf
val builder = SparkSession.builder()
builder
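// spark.sql.parquet.writeLegacyFormat=true writes Parquet in the older Spark 1.4 / Hive-compatible layout
// (e.g. decimals as fixed-length byte arrays) so that Hive can read the table saved below.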
.config("spark.sql.parquet.writeLegacyFormat", "true")
.enableHiveSupport()
if (System.getProperties.getProperty("os.name").contains("Windows")) {
builder.master("local[*]")
conf.host = "192.168.0.201"
conf.from_db = "paascloud_wfs"
conf.from_table = "tbwork_order"
conf.to_db = "paascloud"
conf.to_table = conf.from_table
conf.user_name = "root"
conf.password = "/r,G)7r8kVTo"
conf.dt = "2020-06-19"
conf.dt1 = "2020-06-20"
conf.createTime = "create_time"
conf.updateTime = "update_time"
} else {
if (args.length != 11) {
throw new IllegalArgumentException("11 arguments required: MySQL host, source database, source table, " +
"target database, target table, user name, password, start date, end date, " +
"record create-time field, record update-time field")
}
conf.host = args(0)
conf.from_db = args(1)
conf.from_table = args(2)
conf.to_db = args(3)
conf.to_table = args(4)
conf.user_name = args(5)
conf.password = args(6)
conf.dt = args(7)
conf.dt1 = args(8)
conf.createTime = args(9)
conf.updateTime = args(10)
}
builder.appName(conf.from_table)
val spark = builder.getOrCreate()
val jdbcDF = spark.read
.format("jdbc")
.option("url", s"jdbc:mysql://${conf.host}:3306/${conf.from_db}")
.option("user", conf.user_name)
.option("password", conf.password)
.option("dbtable", conf.from_table)
.load()
jdbcDF.createOrReplaceTempView("t_temp")
new IDate {
override def onDate(dt: String): Unit = {
process(spark, "t_temp", conf.to_db, conf.to_table, dt, conf.createTime, conf.updateTime)
}
}.invoke(conf.dt, conf.dt1)
spark.stop()
}
private def process(spark: SparkSession, from_table: String, to_db: String, to_table: String, dt: String, ct: String, ut: String) = {
val frame = spark.sql(
s"""
|select '$dt' as dt,*
|from $from_table
|where date_format($ct,'yyyy-MM-dd')='$dt' or date_format($ut,'yyyy-MM-dd')='$dt'
|""".stripMargin)
frame.show(false)
// println(frame.count())
if (TableUtils.tableExists(spark, to_db, to_table)) {
spark.sql(
s"""
|alter table $to_db.$to_table drop if exists partition(dt='$dt')
|""".stripMargin)
println(s"${to_table}表存在,删除${dt}分区的数据")
} else {
println(s"${to_table}表不存在,do nothing")
}
frame
.coalesce(1)
.write
.format("parquet")
.mode(SaveMode.Append)
.partitionBy("dt")
.saveAsTable(s"${to_db}.$to_table")
}
}
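Note that the JDBC read above loads the whole tbwork_order table and filters it per day in Spark. For large tables the per-day filter can be pushed down to MySQL by passing a subquery as the dbtable option. A minimal sketch, assuming MySQL syntax for the date comparison (the alias t_inc is only illustrative, and for multi-day ranges the query would have to be rebuilt inside the IDate callback for each dt):

// Push the "new or changed on day dt" filter into MySQL instead of scanning the full table in Spark.
val pushdownQuery =
  s"""(select * from ${conf.from_table}
     | where date(${conf.createTime}) = '${conf.dt}'
     |    or date(${conf.updateTime}) = '${conf.dt}') t_inc""".stripMargin

val incDF = spark.read
  .format("jdbc")
  .option("url", s"jdbc:mysql://${conf.host}:3306/${conf.from_db}")
  .option("user", conf.user_name)
  .option("password", conf.password)
  .option("dbtable", pushdownQuery)
  .load()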