Spark 1.6: internal bug in uncacheTable for temp tables, fixed in Spark 2.0 and later

Problem: calling uncacheTable on an earlier, no-longer-needed temp table causes an "unknown accumulator id" error the next time a select is run against a temp table registered later; if instead you uncacheTable the temp table that was just registered and then query the earlier one, there is no problem. The error looks like this:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(18, 0)
java.util.NoSuchElementException: key not found: 13
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.mutable.HashMap.apply(HashMap.scala:64)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1085)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1081)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at org.apache.spark.scheduler.DAGScheduler.updateAccumulators(DAGScheduler.scala:1081)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1151)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1637)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
        ... (same stack trace as above)

If the RDD is built manually with makeRDD and then converted with toDF before registering the temp table, this problem does not occur; the exact reason is still unclear (the affected path produces a MapPartitionsRDD).
My current workaround is to compute everything needed with an action first and only then call clearCache, or simply not to evict the temp table from memory at all; see the sketch below.
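
A minimal sketch of that safe ordering, assuming a Spark 1.6 HiveContext named sqlContext and a hypothetical cached table t_loginDF (this is not the exact job from this post):

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

def safeUncacheOrdering(sqlContext: HiveContext): Unit = {
  // derived RDD whose lineage reads the cached temp table (MapPartitionsRDD)
  val derived: RDD[String] =
    sqlContext.sql("SELECT device_id FROM t_loginDF").map(_.getString(0))

  // 1) run every action that depends on the cached table first ...
  val results = derived.collect()

  // 2) ... and only afterwards drop the cache; per this post, clearCache is safe,
  //    and uncacheTable is safe once no later transformation re-reads the cached scan
  sqlContext.uncacheTable("t_loginDF")
  // sqlContext.clearCache()

  results.foreach(println)
}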

Root-cause analysis: first, the core problem code:
val halfHourDev: RDD[(Long, Long, String, String, String, String)] =
  // sc.makeRDD(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
  // sc.parallelize(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")),4)
  // sqlContext.sql("SELECT f_region_id,f_city_id,cast(f_device_type as STRING) as f_device_type,f_device_id,cast(f_login_time as STRING) as f_login_time,cast(f_logout_time as STRING) as f_logout_time from t_loginDF")
  sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
    .map(x => // count online devices in half-hour slices; flatMap makes the fan-out easy
      (x.getAs[Long]("city_id"),
       x.getAs[Long]("region_id"),
       x.getAs[String]("device_type"),
       x.getAs[String]("device_id"),
       x.getAs[String]("login_time"),
       x.getAs[String]("logout_time")))
halfHourDev.collect().foreach(println)
// sqlContext.clearCache()
// sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when the temp tables are queried afterwards
val day2 = dt.transformDateStr("20180402")
halfHourDev.flatMap(v =>
    dt.divideTime(day2, v._5, v._6).map(vv => (v._1, v._2, v._3, v._4, vv._1, vv._2)))
  .toDF("city_id", "region_id", "device_type", "device_id", "hour", "timerange")
  .registerTempTable("t_half_hour_dev")
sqlContext.cacheTable("t_half_hour_dev")
sqlContext.sql("select * from t_half_hour_dev").show(2)
sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when the temp tables are queried afterwards
The core problem:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
......
halfHourDev is an RDD created from a temp table (only an RDD built from an external data set, i.e. a MapPartitionsRDD, triggers this problem; a ParallelCollectionRDD created with makeRDD does not). If some other unused temp table is uncached with uncacheTable before the flatMap, the error above occurs; a standalone uncacheTable does not trigger it, nor does an uncacheTable placed after the flatMap.
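
To check which concrete RDD class each construction path yields, you can print the RDD's class or its lineage; a small sketch, assuming the sc, sqlContext and t_loginDF from the repro below (the printed class names are what Spark 1.6 typically reports, not guaranteed verbatim):

val fromSql     = sqlContext.sql("SELECT device_id FROM t_loginDF").map(_.getString(0))
val fromMakeRdd = sc.makeRDD(Seq("1000003697"))

println(fromSql.getClass.getSimpleName)     // MapPartitionsRDD      -> the affected path
println(fromMakeRdd.getClass.getSimpleName) // ParallelCollectionRDD -> the unaffected path
println(fromSql.toDebugString)              // full lineage; shows the cached table scan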

Reference code to reproduce the problem:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._

object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("test")
      .setMaster("local[1]")
    conf.set("spark.cores.max", "1")
    conf.set("spark.executor.memory", "1G")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    sc.setLogLevel("WARN")
    spark_sql_bug(sqlContext, sc)
    println("completed")
    sc.stop()
  }

  def spark_sql_bug(sqlContext: HiveContext, sc: SparkContext): Unit = {
    import sqlContext.implicits._
    val r = sc.makeRDD(Seq(("555500", "555501", "1", "1000003697", "2018-04-02 23:05:14", "2018-04-03 02:40:20")))
      .toDF("city_id", "region_id", "device_type", "device_id", "login_time", "logout_time")
      .withColumn("f_home_id", lit(93L)).withColumn("f_city_id", lit(440300L)).withColumn("f_region_id", lit(440303L))
    r.registerTempTable("t_loginDF")
    sqlContext.cacheTable("t_loginDF")
    val halfHourDev: RDD[(Long, Long, String, String, String, String)] = // only a MapPartitionsRDD triggers it; a ParallelCollectionRDD is fine
      // sc.makeRDD(Seq((555500L, 555501L, "1", "1000003697", "2018-04-02 23:05:14", "2018-04-03 02:40:20")))
      // sc.parallelize(Seq((555500L, 555501L, "1", "1000003697", "2018-04-02 23:05:14", "2018-04-03 02:40:20")), 4)
      // sqlContext.sql("SELECT region_id,city_id,cast(device_type as STRING) as device_type,device_id,cast(login_time as STRING) as login_time,cast(logout_time as STRING) as logout_time from t_loginDF")
      sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
        .map(x => // count online devices in half-hour slices; flatMap makes the fan-out easy
          (x.getAs[Long]("city_id"),
           x.getAs[Long]("region_id"),
           x.getAs[String]("device_type"),
           x.getAs[String]("device_id"),
           x.getAs[String]("login_time"),
           x.getAs[String]("logout_time")))
        .persist()
    halfHourDev.collect().foreach(println)
    // sqlContext.clearCache() // clearCache is fine
    sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when the temp table is queried below
    halfHourDev.flatMap(v =>
        Set((23, 60))
          .map(vv => (v._1, v._2, v._3, v._4, vv._1, vv._2)))
      .toDF("city_id", "region_id", "device_type", "device_id", "hour", "timerange")
      .registerTempTable("t_half_hour_dev")
    sqlContext.cacheTable("t_half_hour_dev")
    sqlContext.sql("select * from t_half_hour_dev").show(2) // the action triggers the problem
    // sqlContext.uncacheTable("t_loginDF") // uncacheTable here is fine
    println("uncacheTable triggers the error")
  }
}
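
For reference, a rough Spark 2.x rewrite of the same repro, assuming a plain SparkSession and the 2.x catalog/temp-view API; per the title, the accumulator bookkeeping issue no longer surfaces there, but this is a sketch rather than a verified excerpt of the original job:

import org.apache.spark.sql.SparkSession

object Test2x {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("test").master("local[1]")
      .getOrCreate()
    import spark.implicits._

    spark.sparkContext.makeRDD(Seq(("555500", "555501", "1", "1000003697",
        "2018-04-02 23:05:14", "2018-04-03 02:40:20")))
      .toDF("city_id", "region_id", "device_type", "device_id", "login_time", "logout_time")
      .createOrReplaceTempView("t_loginDF")
    spark.catalog.cacheTable("t_loginDF")

    // same construction as the 1.6 repro: an RDD derived from a SQL query over the cached table
    val halfHourDev = spark.sql(
        "SELECT 440303L as region_id, 440300L as city_id, " +
        "'1' as device_type, '1000003697' as device_id from t_loginDF")
      .rdd
      .map(x => (x.getAs[Long]("city_id"), x.getAs[Long]("region_id"),
                 x.getAs[String]("device_type"), x.getAs[String]("device_id")))
      .persist()
    halfHourDev.collect().foreach(println)

    spark.catalog.uncacheTable("t_loginDF") // same position as the problematic call in the 1.6 repro

    halfHourDev.map(v => (v._1, v._2, v._3, v._4, 23, 60))
      .toDF("city_id", "region_id", "device_type", "device_id", "hour", "timerange")
      .createOrReplaceTempView("t_half_hour_dev")
    spark.catalog.cacheTable("t_half_hour_dev")
    spark.sql("select * from t_half_hour_dev").show(2) // no accumulator error expected in 2.x

    spark.stop()
  }
}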


