Task not serializable是Spark开发过程最令人头疼的问题之一,这里记录下出现这个问题的两个实例,一个是自己遇到的,另一个是stackoverflow上看到。等有时间了再仔细探究出现Task not serialiazable的各种原因以及出现问题后如何快速定位问题的所在,至少目前阶段碰到此类问题,没有什么章法
1.
package spark.examples.streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import scala.collection.mutable
object NetCatStreamingWordCount3 {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("NetCatWordCount")
conf.setMaster("local[3]")
val ssc = new StreamingContext(conf, Seconds(5))
val lines = ssc.socketTextStream("localhost", 9999)
lines.foreachRDD(rdd => {
rdd.foreachPartition(partitionIterable=> {
val map = mutable.Map[String, String]()
while(partitionIterable.hasNext) {
val v = partitionIterable.next()
map += v ->v
}
map.foreach(entry => {
if (entry._1.equals("abc")) {
return; //return语句导致Task无法序列化,两个字:诡异,三个字:太诡异
}
})
})
})
ssc.start()
ssc.awaitTermination()
}
}
异常信息:
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)
at org.apache.spark.SparkContext.clean(SparkContext.scala:1622)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:805)
at spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1.apply(NetCatStreamingWordCount3.scala:15)
at spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1.apply(NetCatStreamingWordCount3.scala:14)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:534)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:534)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:42)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:32)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:175)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
Caused by: java.io.NotSerializableException: java.lang.Object
Serialization stack:
- object not serializable (class: java.lang.Object, value: java.lang.Object@143d53c)
- field (class: spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1, name: nonLocalReturnKey1$1, type: class java.lang.Object)
- object (class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1, <function1>)
- field (class: spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1$$anonfun$apply$1, name: $outer, type: class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1)
- object (class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:38)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:164)
... 20 more
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)
at org.apache.spark.SparkContext.clean(SparkContext.scala:1622)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:805)
at spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1.apply(NetCatStreamingWordCount3.scala:15)
at spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1.apply(NetCatStreamingWordCount3.scala:14)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:534)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:534)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:42)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:32)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:176)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:175)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
Caused by: java.io.NotSerializableException: java.lang.Object
Serialization stack:
- object not serializable (class: java.lang.Object, value: java.lang.Object@143d53c)
- field (class: spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1, name: nonLocalReturnKey1$1, type: class java.lang.Object)
- object (class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1, <function1>)
- field (class: spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1$$anonfun$apply$1, name: $outer, type: class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1)
- object (class spark.examples.streaming.NetCatStreamingWordCount3$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:38)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:164)
... 20 more
2.
package spark.examples.rdd
import org.apache.spark.{SparkConf, SparkContext}
object TaskNotSerializationTest {
def main(args: Array[String]) {
new Testing().runJob
}
}
class Testing {
val conf = new SparkConf().setMaster("local").setAppName("TaskNotSerializationTest")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List(1, 2, 3))
def runJob = {
rdd.map(someFunc).collect().foreach(println)
}
def someFunc(a: Int) = a + 1
}
异常信息:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)
at org.apache.spark.SparkContext.clean(SparkContext.scala:1622)
at org.apache.spark.rdd.RDD.map(RDD.scala:286)
at spark.examples.rdd.Testing.runJob(TaskNotSerializationTest.scala:20)
at spark.examples.rdd.TaskNotSerializationTest$.main(TaskNotSerializationTest.scala:10)
at spark.examples.rdd.TaskNotSerializationTest.main(TaskNotSerializationTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:134)
Caused by: java.io.NotSerializableException: spark.examples.rdd.Testing
Serialization stack:
- object not serializable (class: spark.examples.rdd.Testing, value: spark.examples.rdd.Testing@b8972)
- field (class: spark.examples.rdd.Testing$$anonfun$runJob$1, name: $outer, type: class spark.examples.rdd.Testing)
- object (class spark.examples.rdd.Testing$$anonfun$runJob$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:38)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:164)
... 11 more
第二个问题:stackoverflow上有比较详细的讨论:
http://stackoverflow.com/questions/22592811/task-not-serializable-java-io-notserializableexception-when-calling-function-ou