spark生产过程遇到的问题(累加器相关)

项目场景:

提示:这里简述项目相关背景:

项目场景:spark upsert mysql数据,想通过foreachpartition 执行的结果来进行流程控制。


问题描述

def insertOrUpdateDFtoDB(tableName: String, resultDateFrame: DataFrame, updateColumns: Array[String]): Boolean = {
    var count = 0
    var status = true
    val colNumbsers = resultDateFrame.columns.length
    println("colNumbsers长度:  "+resultDateFrame.columns.length.toString)
    println("updateColumns长度:  "+updateColumns.length.toString)
    var sql = getInsertOrUpdateSql(tableName, resultDateFrame.columns, updateColumns)
    val columnDataTypes = resultDateFrame.schema.fields.map(_.dataType)
    println(s"\n$sql")
    resultDateFrame.foreachPartition((partitionRecords:Iterator[Row]) => {
      val conn = getOnlineConn()
      val prepareStatement = conn.prepareStatement(sql)
      try {
        conn.setAutoCommit(false)
        partitionRecords.foreach(record => {
          //设置需要插入的字段
          for (i <- 1 to colNumbsers) {
            val value = record.get(i - 1)
            val dateType = columnDataTypes(i - 1)
            if (value != null) {
              prepareStatement.setString(i, value.toString)
              dateType match {
                case _: ByteType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: ShortType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: IntegerType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: LongType => prepareStatement.setLong(i, record.getAs[Long](i - 1))
                case _: BooleanType => prepareStatement.setBoolean(i, record.getAs[Boolean](i - 1))
                case _: FloatType => prepareStatement.setFloat(i, record.getAs[Float](i - 1))
                case _: DoubleType => prepareStatement.setDouble(i, record.getAs[Double](i - 1))
                case _: StringType => prepareStatement.setString(i, record.getAs[String](i - 1))
                case _: TimestampType => prepareStatement.setTimestamp(i, record.getAs[Timestamp](i - 1))
                //  case _: DateType => prepareStatement.setDate(i, record.getAs[Date](i - 1))
              }
            } else {
              println(i,updateColumns(i-1),record.getString(updateColumns.length-1))
            }
          }
          //设置需要 更新的字段值
          for (i <- 1 to updateColumns.length) {
            val fieldIndex = record.fieldIndex(updateColumns(i - 1))
            val value = record.get(i-1)
            val dataType = columnDataTypes(fieldIndex)
            // println(s"\n更新字段值属性索引: $fieldIndex")//,属性值:$value,属性类型:$dataType")
            if (value != null) {
              dataType match {
                case _: ByteType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: ShortType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: IntegerType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: LongType => prepareStatement.setLong(colNumbsers + i, record.getAs[Long](fieldIndex))
                case _: BooleanType => prepareStatement.setBoolean(colNumbsers + i, record.getAs[Boolean](fieldIndex))
                case _: FloatType => prepareStatement.setFloat(colNumbsers + i, record.getAs[Float](fieldIndex))
                case _: DoubleType => prepareStatement.setDouble(colNumbsers + i, record.getAs[Double](fieldIndex))
                case _: StringType => prepareStatement.setString(colNumbsers + i, record.getAs[String](fieldIndex))
                case _: TimestampType => prepareStatement.setTimestamp(colNumbsers + i, record.getAs[Timestamp](fieldIndex))
                //  case _: DateType => prepareStatement.setDate(colNumbsers + i, record.getAs[Date](fieldIndex))
              }
            }
          }
          prepareStatement.addBatch()
          count += 1
        })
        //批次大小为2000
        if (count % 2000 == 0) {
          prepareStatement.executeBatch()
          conn.commit()
        }
      } catch {
        case e: Exception =>
          status = false
          println(s"@@  ${e}")
      } finally {
        prepareStatement.executeBatch()
        conn.commit()
        prepareStatement.close()
        conn.close()
      }
    })
    status
  }

想通过 最后的 status 来获取 更新插入mysql 的 结果状态,但是 status 一直是true ,即使 走了 catch 里面的 逻辑 最后也是true,就没办法判断过程中有没有数据丢失的或者因 格式不行 导致的 失败的


原因分析:

在Spark中,每个分区的处理是并行的,因此对于共享变量的修改可能会发生竞争条件 导致 变量不发生改变。


解决方案:

为了避免这种情况,可以使用累加器(accumulator)来收集分区内的结果,并在分区结束后获取一个状态。

def insertOrUpdateDFtoDB(sc:SparkContext,tableName: String, resultDateFrame: DataFrame, updateColumns: Array[String]): Boolean = {
    var count = 0
    var status = true
    val colNumbsers = resultDateFrame.columns.length
    println("colNumbsers长度:  "+resultDateFrame.columns.length.toString)
    println("updateColumns长度:  "+updateColumns.length.toString)
    var sql = getInsertOrUpdateSql(tableName, resultDateFrame.columns, updateColumns)
    val columnDataTypes = resultDateFrame.schema.fields.map(_.dataType)
    println(s"\n$sql")

    //声明累加器
    val acc = sc.longAccumulator("acc_count")
    resultDateFrame.foreachPartition((partitionRecords:Iterator[Row]) => {
      val conn = getOnlineConn()
      var acc_count = 1
      val prepareStatement = conn.prepareStatement(sql)
      try {
        conn.setAutoCommit(false)
        partitionRecords.foreach(record => {
          //设置需要插入的字段
          for (i <- 1 to colNumbsers) {
            val value = record.get(i - 1)
            val dateType = columnDataTypes(i - 1)
            if (value != null) {
              prepareStatement.setString(i, value.toString)
              dateType match {
                case _: ByteType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: ShortType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: IntegerType => prepareStatement.setInt(i, record.getAs[Int](i - 1))
                case _: LongType => prepareStatement.setLong(i, record.getAs[Long](i - 1))
                case _: BooleanType => prepareStatement.setBoolean(i, record.getAs[Boolean](i - 1))
                case _: FloatType => prepareStatement.setFloat(i, record.getAs[Float](i - 1))
                case _: DoubleType => prepareStatement.setDouble(i, record.getAs[Double](i - 1))
                case _: StringType => prepareStatement.setString(i, record.getAs[String](i - 1))
                case _: TimestampType => prepareStatement.setTimestamp(i, record.getAs[Timestamp](i - 1))
                //  case _: DateType => prepareStatement.setDate(i, record.getAs[Date](i - 1))
              }
            }
          }
          //设置需要 更新的字段值
          for (i <- 1 to updateColumns.length) {
            val fieldIndex = record.fieldIndex(updateColumns(i - 1))
            val value = record.get(i-1)
            val dataType = columnDataTypes(fieldIndex)
            // println(s"\n更新字段值属性索引: $fieldIndex")//,属性值:$value,属性类型:$dataType")
            if (value != null) {
              dataType match {
                case _: ByteType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: ShortType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: IntegerType => prepareStatement.setInt(colNumbsers + i, record.getAs[Int](fieldIndex))
                case _: LongType => prepareStatement.setLong(colNumbsers + i, record.getAs[Long](fieldIndex))
                case _: BooleanType => prepareStatement.setBoolean(colNumbsers + i, record.getAs[Boolean](fieldIndex))
                case _: FloatType => prepareStatement.setFloat(colNumbsers + i, record.getAs[Float](fieldIndex))
                case _: DoubleType => prepareStatement.setDouble(colNumbsers + i, record.getAs[Double](fieldIndex))
                case _: StringType => prepareStatement.setString(colNumbsers + i, record.getAs[String](fieldIndex))
                case _: TimestampType => prepareStatement.setTimestamp(colNumbsers + i, record.getAs[Timestamp](fieldIndex))
                //  case _: DateType => prepareStatement.setDate(colNumbsers + i, record.getAs[Date](fieldIndex))
              }
            }
          }
          prepareStatement.addBatch()
          count += 1
        })
        //批次大小为2000
        if (count % 2000 == 0) {
          prepareStatement.executeBatch()
          conn.commit()
        }
        acc.add(acc_count) //累加器+1
      } catch {
        case e: Exception =>
          println(s"@@  ${e}")
      } finally {
        prepareStatement.executeBatch()
        conn.commit()
        prepareStatement.close()
        conn.close()
      }
    })
    

val result = acc.value
    if(result==5){ //5 是 分区数量,取决于 foreachpartition 有多少分区数 
      status =true
    }else{
      status =false
    }

    status
  }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值