问题:前面的无用临时表的uncacheTable会导致
下次select后面其他临时表的时候出现unknown accumulator id;若是uncacheTable后面刚注册的临时表,再查前面的临时表则没问题。报错如下:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(18, 0)
java.util.NoSuchElementException: key not found: 13
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.mutable.HashMap.apply(HashMap.scala:64)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1085)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1081)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at org.apache.spark.scheduler.DAGScheduler.updateAccumulators(DAGScheduler.scala:1081)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1151)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1637)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.mutable.HashMap.apply(HashMap.scala:64)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1085)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$updateAccumulators$1.apply(DAGScheduler.scala:1081)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
at org.apache.spark.scheduler.DAGScheduler.updateAccumulators(DAGScheduler.scala:1081)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1151)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1637)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
手动makeRDD后toDF再注册临时表无此类问题,原因暂时未明(疑与MapPartitionsRDD有关)。
我的解决方式目前是:先用action算出要用的结果后再clearCache,或者干脆不清理临时表缓存。
原因分析:先看核心问题代码,如下:
// Minimal excerpt of the problem: halfHourDev is an RDD derived from the cached
// temp table t_loginDF. Uncaching that table BEFORE the flatMap job below runs
// leads the DAGScheduler to report "unknown accumulator id" (see trace above).
val halfHourDev: RDD[(Long, Long, String, String, String, String)] =
// sc.makeRDD(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
// sc.parallelize(Seq((440300L,440303L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")),4)
// sqlContext.sql("SELECT f_region_id,f_city_id,cast(f_device_type as STRING) as f_device_type,f_device_id,cast(f_login_time as STRING) as f_login_time,cast(f_logout_time as STRING) as f_logout_time from t_loginDF")
sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
.map(x=> // count online devices per half-hour slice; flatMap makes the fan-out easy
( x.getAs[Long]("city_id"),
x.getAs[Long]("region_id"),
x.getAs[String]("device_type"),
x.getAs[String]("device_id"),
x.getAs[String]("login_time"),
x.getAs[String]("logout_time")) )
halfHourDev.collect().foreach(println)
// sqlContext.clearCache()
// sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when temp tables are queried later
val day2=dt.transformDateStr("20180402")
halfHourDev.flatMap(v=>
dt.divideTime(day2,v._5,v._6).map(vv=>(v._1,v._2,v._3,v._4,vv._1,vv._2 )))
.toDF("city_id","region_id","device_type","device_id","hour","timerange")
.registerTempTable("t_half_hour_dev")
sqlContext.cacheTable("t_half_hour_dev")
sqlContext.sql("select * from t_half_hour_dev").show(2)
sqlContext.uncacheTable("t_loginDF") // this causes "unknown accumulator id" when temp tables are queried later
核心问题:
18/04/04 03:53:56 WARN Accumulators: Ignoring accumulator update for unknown accumulator id 13
18/04/04 03:53:56 ERROR DAGScheduler: Failed to update accumulators for ResultTask(21, 2)
java.util.NoSuchElementException: key not found: 13
......
halfHourDev是通过临时表创建的RDD(由外部数据集创建的MapPartitionsRDD才会引发此问题,makeRDD创建的ParallelCollectionRDD不会引发此问题)。如果在flatMap之前uncacheTable其它无用临时表,会导致上述error问题(直接uncacheTable本表不会引发此问题;在flatMap之后再uncacheTable也不会引发此问题)。
参考复现问题代码如下:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
/**
 * Minimal reproduction of the "unknown accumulator id" error seen when an
 * unrelated cached temp table is uncached between the creation and the
 * flatMap-based use of an RDD derived from it.
 *
 * NOTE: in the original paste the commented-out alternatives inside
 * spark_sql_bug had been line-wrapped, leaving their continuation lines
 * (555500L, ...) uncommented — a compile error. They are rejoined onto
 * single comment lines here.
 */
object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("test")
      .setMaster("local[1]")
    conf.set("spark.cores.max", "1")
    conf.set("spark.executor.memory","1G")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)
    sc.setLogLevel("WARN")
    spark_sql_bug(sqlContext, sc)
    println("completed")
    sc.stop()
  }

  /**
   * Caches a temp table, derives an RDD from it, uncaches the table, then
   * runs an action on a descendant of that RDD — which triggers the
   * "unknown accumulator id" / NoSuchElementException in the DAGScheduler.
   */
  def spark_sql_bug(sqlContext: HiveContext, sc: SparkContext): Unit = {
    import sqlContext.implicits._
    val r = sc.makeRDD(Seq(("555500","555501","1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
      .toDF("city_id","region_id","device_type","device_id","login_time","logout_time")
      .withColumn("f_home_id",lit(93L)).withColumn("f_city_id",lit(440300L)).withColumn("f_region_id",lit(440303L))
    r.registerTempTable("t_loginDF")
    sqlContext.cacheTable("t_loginDF")
    // Only a MapPartitionsRDD built from the temp table triggers the bug; a
    // ParallelCollectionRDD (makeRDD/parallelize alternatives below) does not.
    val halfHourDev: RDD[(Long, Long, String, String, String, String)] =
      // sc.makeRDD(Seq((555500L,555501L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")))
      // sc.parallelize(Seq((555500L,555501L,"1","1000003697","2018-04-02 23:05:14","2018-04-03 02:40:20")),4)
      // sqlContext.sql("SELECT region_id,city_id,cast(device_type as STRING) as device_type,device_id,cast(login_time as STRING) as login_time,cast(logout_time as STRING) as logout_time from t_loginDF")
      sqlContext.sql("SELECT 440303L as region_id,440300L as city_id,'1' as device_type,'1000003697' as device_id,'2018-04-02 23:05:14' as login_time,'2018-04-03 02:40:20' as logout_time from t_loginDF")
        .map(x => // count online devices per half-hour slice; flatMap makes the fan-out easy
          ( x.getAs[Long]("city_id"),
            x.getAs[Long]("region_id"),
            x.getAs[String]("device_type"),
            x.getAs[String]("device_id"),
            x.getAs[String]("login_time"),
            x.getAs[String]("logout_time")) )
        .persist()
    halfHourDev.collect().foreach(println)
    // sqlContext.clearCache() // clearCache does NOT trigger the problem
    sqlContext.uncacheTable("t_loginDF") // causes "unknown accumulator id" when temp tables are queried later
    halfHourDev.flatMap(v =>
      Set((23, 60))
        .map(vv => (v._1, v._2, v._3, v._4, vv._1, vv._2)))
      .toDF("city_id","region_id","device_type","device_id","hour","timerange")
      .registerTempTable("t_half_hour_dev")
    sqlContext.cacheTable("t_half_hour_dev")
    sqlContext.sql("select * from t_half_hour_dev").show(2) // this action triggers the error
    // sqlContext.uncacheTable("t_loginDF") // uncaching HERE (after the action) is fine
    println("uncacheTable 引发 error")
  }
}