项目场景:
提示:这里简述项目相关背景:
项目场景:spark upsert mysql数据,想通过foreachpartition 执行的结果来进行流程控制。
问题描述
def insertOrUpdateDFtoDB(tableName: String, resultDateFrame: DataFrame, updateColumns: Array[String]): Boolean = {
var count = 0
var status = true
val colNumbsers = resultDateFrame.columns.length
println("colNumbsers长度: "+resultDateFrame.columns.length.toString)
println("updateColumns长度: "+updateColumns.length.toString)
var sql = getInsertOrUpdateSql(tableName, resultDateFrame.columns, updateColumns)
val columnDataTypes = resultDateFrame.schema.fields.map(_.dataType)
println(s"\n$sql")
resultDateFrame.foreachPartition((partitionRecords:Iterator[Row]) => {
val conn = getOnlineConn()
val prepareStatement = conn.prepareStatement(sql)
try {
conn.setAutoCommit(false)
partitionRecords.foreach(record => {
//设置需要插入的字段
for (i <- 1 to colNumbsers) {
val value = record.get(i - 1)
val dateType = columnDataTypes(i - 1)
if (value != null) {
prepareStatement.setString(i, value.toString)
dateType match {
case _: ByteType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: ShortType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: IntegerType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: LongType => prepareStatement.setLong(i, record.getAs[Long](i - 1))
case _: BooleanType => prepareStatement.setBoolean(i, record.getAs[Boolean](i - 1))
case _: FloatType => prepareStatement.setFloat(i, record.getAs[Float](i - 1))
case _: DoubleType => prepareStatement.setDouble(i, record.getAs[Double](i - 1))
case _: StringType => prepareStatement.setString(i, record.getAs[String](i - 1))
case _: TimestampType => prepareStatement.setTimestamp(i, record.getAs[Timestamp](i - 1))
// case _: DateType => prepareStatement.setDate(i, record.getAs[Date](i - 1))
}
} else {
println(i,updateColumns(i-1),record.getString(updateColumns.length-1))
}
}
//设置需要 更新的字段值
for (i <- 1 to updateColumns.length) {
val fieldIndex = record.fieldIndex(updateColumns(i - 1))
val value = record.get(i-1)
val dataType = columnDataTypes(fieldIndex)
// println(s"\n更新字段值属性索引: $fieldIndex")//,属性值:$value,属性类型:$dataType")
if (value != null) {
dataType match {
case _: ByteType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: ShortType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: IntegerType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: LongType => prepareStatement.setLong(colNumbsers + i, record.getAs[Long](fieldIndex))
case _: BooleanType => prepareStatement.setBoolean(colNumbsers + i, record.getAs[Boolean](fieldIndex))
case _: FloatType => prepareStatement.setFloat(colNumbsers + i, record.getAs[Float](fieldIndex))
case _: DoubleType => prepareStatement.setDouble(colNumbsers + i, record.getAs[Double](fieldIndex))
case _: StringType => prepareStatement.setString(colNumbsers + i, record.getAs[String](fieldIndex))
case _: TimestampType => prepareStatement.setTimestamp(colNumbsers + i, record.getAs[Timestamp](fieldIndex))
// case _: DateType => prepareStatement.setDate(colNumbsers + i, record.getAs[Date](fieldIndex))
}
}
}
prepareStatement.addBatch()
count += 1
})
//批次大小为2000
if (count % 2000 == 0) {
prepareStatement.executeBatch()
conn.commit()
}
} catch {
case e: Exception =>
status = false
println(s"@@ ${e}")
} finally {
prepareStatement.executeBatch()
conn.commit()
prepareStatement.close()
conn.close()
}
})
status
}
想通过 最后的 status 来获取 更新插入mysql 的 结果状态,但是 status 一直是true ,即使 走了 catch 里面的 逻辑 最后也是true,就没办法判断过程中有没有数据丢失的或者因 格式不行 导致的 失败的
原因分析:
在Spark中,每个分区的处理是并行的,因此对于共享变量的修改可能会发生竞争条件 导致 变量不发生改变。
解决方案:
为了避免这种情况,可以使用累加器(accumulator)来收集分区内的结果,并在分区结束后获取一个状态。
def insertOrUpdateDFtoDB(sc:SparkContext,tableName: String, resultDateFrame: DataFrame, updateColumns: Array[String]): Boolean = {
var count = 0
var status = true
val colNumbsers = resultDateFrame.columns.length
println("colNumbsers长度: "+resultDateFrame.columns.length.toString)
println("updateColumns长度: "+updateColumns.length.toString)
var sql = getInsertOrUpdateSql(tableName, resultDateFrame.columns, updateColumns)
val columnDataTypes = resultDateFrame.schema.fields.map(_.dataType)
println(s"\n$sql")
//声明累加器
val acc = sc.longAccumulator("acc_count")
resultDateFrame.foreachPartition((partitionRecords:Iterator[Row]) => {
val conn = getOnlineConn()
var acc_count = 1
val prepareStatement = conn.prepareStatement(sql)
try {
conn.setAutoCommit(false)
partitionRecords.foreach(record => {
//设置需要插入的字段
for (i <- 1 to colNumbsers) {
val value = record.get(i - 1)
val dateType = columnDataTypes(i - 1)
if (value != null) {
prepareStatement.setString(i, value.toString)
dateType match {
case _: ByteType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: ShortType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: IntegerType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
case _: LongType => prepareStatement.setLong(i, record.getAs[Long](i - 1))
case _: BooleanType => prepareStatement.setBoolean(i, record.getAs[Boolean](i - 1))
case _: FloatType => prepareStatement.setFloat(i, record.getAs[Float](i - 1))
case _: DoubleType => prepareStatement.setDouble(i, record.getAs[Double](i - 1))
case _: StringType => prepareStatement.setString(i, record.getAs[String](i - 1))
case _: TimestampType => prepareStatement.setTimestamp(i, record.getAs[Timestamp](i - 1))
// case _: DateType => prepareStatement.setDate(i, record.getAs[Date](i - 1))
}
}
}
//设置需要 更新的字段值
for (i <- 1 to updateColumns.length) {
val fieldIndex = record.fieldIndex(updateColumns(i - 1))
val value = record.get(i-1)
val dataType = columnDataTypes(fieldIndex)
// println(s"\n更新字段值属性索引: $fieldIndex")//,属性值:$value,属性类型:$dataType")
if (value != null) {
dataType match {
case _: ByteType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: ShortType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: IntegerType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
case _: LongType => prepareStatement.setLong(colNumbsers + i, record.getAs[Long](fieldIndex))
case _: BooleanType => prepareStatement.setBoolean(colNumbsers + i, record.getAs[Boolean](fieldIndex))
case _: FloatType => prepareStatement.setFloat(colNumbsers + i, record.getAs[Float](fieldIndex))
case _: DoubleType => prepareStatement.setDouble(colNumbsers + i, record.getAs[Double](fieldIndex))
case _: StringType => prepareStatement.setString(colNumbsers + i, record.getAs[String](fieldIndex))
case _: TimestampType => prepareStatement.setTimestamp(colNumbsers + i, record.getAs[Timestamp](fieldIndex))
// case _: DateType => prepareStatement.setDate(colNumbsers + i, record.getAs[Date](fieldIndex))
}
}
}
prepareStatement.addBatch()
count += 1
})
//批次大小为2000
if (count % 2000 == 0) {
prepareStatement.executeBatch()
conn.commit()
}
acc.add(acc_count) //累加器+1
} catch {
case e: Exception =>
println(s"@@ ${e}")
} finally {
prepareStatement.executeBatch()
conn.commit()
prepareStatement.close()
conn.close()
}
})
val result = acc.value
if(result==5){ //5 是 分区数量,取决于 foreachpartition 有多少分区数
status =true
}else{
status =false
}
status
}