问题:前面的无用临时表的uncacheTable会导致
下次select后面其他临时表的时候出现unknown accumulator id;若是uncacheTable后面刚注册的临时表,再查前面的临时表则没问题。报错如下:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(18, 0)
java.util.NoSuchElementException: key not found: 13
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.mutable.HashMap.apply(HashMap.scala:64)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1085)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1081)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at org.apache.spark.scheduler.DAGScheduler.updateAccumulators(DAGScheduler.scala:1081)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1151)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1637)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.mutable.HashMap.apply(HashMap.scala:64)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1085)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1081)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at org.apache.spark.scheduler.DAGScheduler.updateAccumulators(DAGScheduler.scala:1081)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1151)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1637)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
手动makeRDD后toDF再注册临时表无此类问题,原因暂时未明(疑与MapPartitionsRDD有关)。
我的解决方式目前是:先用action算出要用的结果后再clearCache,或者干脆不清理临时表缓存。
原因分析:先看核心问题代码,如下:
// Minimal excerpt of the problem: halfHourDev is an RDD derived from the cached
// temp table t_loginDF. Uncaching that table BEFORE the flatMap job below runs
// leads the DAGScheduler to report "unknown accumulator id" (see trace above).
val halfHourDev: RDD[(Long, Long, String, String, String, String)] =
// sc.makeRDD(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
// sc.parallelize(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")),4)
// sqlContext.sql("SELECT f_region_id,f_city_id,cast(f_device_type as STRING) as f_device_type,f_device_id,cast(f_login_time as STRING) as f_login_time,cast(f_logout_time as STRING) as f_logout_time from t_loginDF")
sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
.map(x=> // count online devices per half-hour slice; flatMap makes the fan-out easy
( x.getAs[Long]("city_id"),
x.getAs[Long]("region_id"),
x.getAs[String]("device_type"),
x.getAs[String]("device_id"),
x.getAs[String]("login_time"),
x.getAs[String]("logout_time")) )
halfHourDev.collect().foreach(println)
// sqlContext.clearCache()
// sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when temp tables are queried later
val day2=dt.transformDateStr("20180402")
halfHourDev.flatMap(v=>
dt.divideTime(day2,v._5,v._6).map(vv=>(v._1,v._2,v._3,v._4,vv._1,vv._2 )))
.toDF("city_id","region_id","device_type","device_id","hour","timerange")
.registerTempTable("t_half_hour_dev")
sqlContext.cacheTable("t_half_hour_dev")
sqlContext.sql("select * from t_half_hour_dev").show(2)
sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when temp tables are queried later
核心问题:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
......
halfHourDev是通过临时表创建的RDD(由外部数据集创建的MapPartitionsRDD才会引发此问题,makeRDD创建的ParallelCollectionRDD不会引发此问题)。如果在flatMap之前uncacheTable其它无用临时表,会导致上述error问题(直接uncacheTable本表不会引发此问题;在flatMap之后再uncacheTable也不会引发此问题)。
参考复现问题代码如下:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
/**
 * Minimal reproduction of the "unknown accumulator id" error seen when an
 * unrelated cached temp table is uncached between the creation and the
 * flatMap-based use of an RDD derived from it.
 *
 * NOTE: in the original paste the commented-out alternatives inside
 * spark_sql_bug had been line-wrapped, leaving their continuation lines
 * (555500L, ...) uncommented — a compile error. They are rejoined onto
 * single comment lines here.
 */
object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("test")
      .setMaster("local[1]")
    conf.set("spark.cores.max", "1")
    conf.set("spark.executor.memory","1G")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)
    sc.setLogLevel("WARN")
    spark_sql_bug(sqlContext, sc)
    println("completed")
    sc.stop()
  }

  /**
   * Caches a temp table, derives an RDD from it, uncaches the table, then
   * runs an action on a descendant of that RDD — which triggers the
   * "unknown accumulator id" / NoSuchElementException in the DAGScheduler.
   */
  def spark_sql_bug(sqlContext: HiveContext, sc: SparkContext): Unit = {
    import sqlContext.implicits._
    val r = sc.makeRDD(Seq(("555500","555501","1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
      .toDF("city_id","region_id","device_type","device_id","login_time","logout_time")
      .withColumn("f_home_id",lit(93L)).withColumn("f_city_id",lit(440300L)).withColumn("f_region_id",lit(440303L))
    r.registerTempTable("t_loginDF")
    sqlContext.cacheTable("t_loginDF")
    // Only a MapPartitionsRDD built from the temp table triggers the bug; a
    // ParallelCollectionRDD (makeRDD/parallelize alternatives below) does not.
    val halfHourDev: RDD[(Long, Long, String, String, String, String)] =
      // sc.makeRDD(Seq((555500L,555501L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
      // sc.parallelize(Seq((555500L,555501L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")),4)
      // sqlContext.sql("SELECT region_id,city_id,cast(device_type as STRING) as device_type,device_id,cast(login_time as STRING) as login_time,cast(logout_time as STRING) as logout_time from t_loginDF")
      sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
        .map(x => // count online devices per half-hour slice; flatMap makes the fan-out easy
          ( x.getAs[Long]("city_id"),
            x.getAs[Long]("region_id"),
            x.getAs[String]("device_type"),
            x.getAs[String]("device_id"),
            x.getAs[String]("login_time"),
            x.getAs[String]("logout_time")) )
        .persist()
    halfHourDev.collect().foreach(println)
    // sqlContext.clearCache() // clearCache does NOT trigger the problem
    sqlContext.uncacheTable("t_loginDF") // causes "unknown accumulator id" when temp tables are queried later
    halfHourDev.flatMap(v =>
      Set((23, 60))
        .map(vv => (v._1, v._2, v._3, v._4, vv._1, vv._2)))
      .toDF("city_id","region_id","device_type","device_id","hour","timerange")
      .registerTempTable("t_half_hour_dev")
    sqlContext.cacheTable("t_half_hour_dev")
    sqlContext.sql("select * from t_half_hour_dev").show(2) // this action triggers the error
    // sqlContext.uncacheTable("t_loginDF") // uncaching HERE (after the action) is fine
    println("uncacheTable 引发 error")
  }
}