importorg.apache.spark.sql.{AnalysisException, DataFrame}importorg.apache.spark.sql.types.StructType
importorg.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcOptionsInWrite}importorg.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.{createConnectionFactory, getInsertStatement, savePartition}importorg.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}importjava.sql.Connection
object JdbcUtilsEnhance {

  /**
   * Writes `df` into the target table using a MySQL upsert
   * (INSERT ... ON DUPLICATE KEY UPDATE) instead of a plain INSERT.
   * Mirrors the structure of Spark's `JdbcUtils.saveTable`, reusing
   * `savePartition` with the statement produced by [[getUpdateStatement]].
   *
   * @param df              the DataFrame whose rows are written
   * @param tableSchema     schema of the existing target table, if known;
   *                        used to normalize column names
   * @param isCaseSensitive whether column-name resolution is case sensitive
   * @param options         JDBC write options (url, table, batchsize,
   *                        isolationLevel, numPartitions, ...)
   */
  def updateTable(
      df: DataFrame,
      tableSchema: Option[StructType],
      isCaseSensitive: Boolean,
      options: JdbcOptionsInWrite): Unit = {
    val url = options.url
    val table = options.table
    val dialect = JdbcDialects.get(url)
    val rddSchema = df.schema
    val getConnection: () => Connection = createConnectionFactory(options)
    val batchSize = options.batchSize
    val isolationLevel = options.isolationLevel
    val updateStmt = getUpdateStatement(table, rddSchema, tableSchema, isCaseSensitive, dialect)

    // Honor a user-requested partition count, but only coalesce downwards;
    // never shuffle to increase parallelism for a JDBC write.
    val repartitionedDF = options.numPartitions match {
      case Some(n) if n <= 0 =>
        throw new IllegalArgumentException(
          s"Invalid value `$n` for parameter `${JDBCOptions.JDBC_NUM_PARTITIONS}` in table writing " +
            "via JDBC. The minimum value is 1.")
      case Some(n) if n < df.rdd.getNumPartitions => df.coalesce(n)
      case _ => df
    }
    repartitionedDF.rdd.foreachPartition { iterator =>
      savePartition(
        getConnection, table, iterator, rddSchema, updateStmt, batchSize, dialect, isolationLevel,
        options)
    }
  }

  /**
   * Builds the MySQL upsert statement:
   * {{{
   * INSERT INTO t (c1,c2) VALUES (?,?)
   * ON DUPLICATE KEY UPDATE
   * c1=VALUES(c1),c2=VALUES(c2)
   * }}}
   *
   * The generated statement follows `rddSchema`'s column order but uses
   * `tableSchema`'s column names when available: case-sensitive DBMSs like
   * PostgreSQL/Oracle require the existing column casing rather than the
   * RDD's column names.
   *
   * @throws Exception if a DataFrame column cannot be resolved against the
   *                   existing table schema
   */
  def getUpdateStatement(
      table: String,
      rddSchema: StructType,
      tableSchema: Option[StructType],
      isCaseSensitive: Boolean,
      dialect: JdbcDialect): String = {
    // Resolve each DataFrame column to a quoted identifier, keeping the list
    // as a Seq so the columns / placeholders / update clauses are all derived
    // from the same source (no fragile re-splitting of a joined string, which
    // would break on quoted identifiers containing commas).
    val columnSeq: Seq[String] = tableSchema match {
      case None =>
        rddSchema.fields.map(x => dialect.quoteIdentifier(x.name)).toSeq
      case Some(schema) =>
        val columnNameEquality = if (isCaseSensitive) {
          org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
        } else {
          org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution
        }
        val tableColumnNames = schema.fieldNames
        rddSchema.fields.map { col =>
          val normalizedName = tableColumnNames.find(f => columnNameEquality(f, col.name)).getOrElse {
            throw new Exception(s"""Column "${col.name}" not found in schema $tableSchema""")
          }
          dialect.quoteIdentifier(normalizedName)
        }.toSeq
    }
    val columns = columnSeq.mkString(",")
    val placeholders = columnSeq.map(_ => "?").mkString(",")
    val updates = columnSeq.map(col => s"$col=VALUES($col)").mkString(",")
    s"""INSERT INTO $table ($columns) VALUES ($placeholders)
       |ON DUPLICATE KEY UPDATE
       |$updates
       |""".stripMargin
  }
}
importorg.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
importorg.apache.spark.sql.execution.datasources.jdbc.{JdbcOptionsInWrite, JdbcRelationProvider, JdbcUtils}importorg.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * JDBC relation provider that replaces Spark's plain-INSERT write path with
 * the MySQL upsert implemented in `JdbcUtilsEnhance.updateTable`. Each
 * `SaveMode` keeps its standard semantics; only the row-writing statement
 * differs from the stock `JdbcRelationProvider`.
 */
class MysqlUpdateRelationProvider extends JdbcRelationProvider {

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JdbcOptionsInWrite(parameters)
    val isCaseSensitive = sqlContext.sparkSession.sessionState.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      if (JdbcUtils.tableExists(conn, options)) {
        mode match {
          case SaveMode.Overwrite
              if options.isTruncate && isCascadingTruncateTable(options.url) == Some(false) =>
            // Truncate preserves the existing table definition; only rows are replaced.
            truncateTable(conn, options)
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            JdbcUtilsEnhance.updateTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.Overwrite =>
            // Truncate not requested/safe: drop and recreate from the DataFrame schema.
            dropTable(conn, options.table, options)
            createTable(conn, options.table, df.schema, isCaseSensitive, options)
            JdbcUtilsEnhance.updateTable(df, Some(df.schema), isCaseSensitive, options)

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            JdbcUtilsEnhance.updateTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new Exception(
              s"Table or view '${options.table}' already exists. " +
                s"SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // Table already exists: Ignore means leave existing data untouched
            // and simply return the relation below.
            ()
        }
      } else {
        createTable(conn, options.table, df.schema, isCaseSensitive, options)
        JdbcUtilsEnhance.updateTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }
    // Delegate to the read-path overload to expose the just-written table.
    createRelation(sqlContext, parameters)
  }
}