Error message:
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD.$anonfun$foreach$1(RDD.scala:926)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:925)
at com.kzw.bigdata.spark.Streaming02.SparkStreaming01$.$anonfun$main$4(SparkStreamingClosures.scala:24)
at com.kzw.bigdata.spark.Streaming02.SparkStreaming01$.$anonfun$main$4$adapted(SparkStreamingClosures.scala:20)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2(DStream.scala:628)
at org.apache.spark.streaming.dstream.DStream.$anonfun$foreachRDD$2$adapted(DStream.scala:628)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$2(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream.$anonfun$generateJob$1(ForEachDStream.scala:51)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.$anonfun$run$1(JobScheduler.scala:257)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:257)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.NotSerializableException: java.lang.Object
Serialization stack:
- object not serializable (class: java.lang.Object, value: java.lang.Object@3a774eea)
- writeObject data (class: java.util.HashMap)
- object (class java.util.HashMap, {utf-8=java.lang.Object@3a774eea, UTF-8=java.lang.Object@3a774eea, US-ASCII=com.mysql.jdbc.SingleByteCharsetConverter@17971925})
- field (class: com.mysql.jdbc.ConnectionImpl, name: charsetConverterMap, type: interface java.util.Map)
- object (class com.mysql.jdbc.JDBC4Connection, com.mysql.jdbc.JDBC4Connection@1de36d8b)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 1)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class com.kzw.bigdata.spark.Streaming02.SparkStreaming01$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic com/kzw/bigdata/spark/Streaming02/SparkStreaming01$.$anonfun$main$5$adapted:(Ljava/sql/Connection;Lscala/Tuple2;)Ljava/lang/Object;, instantiatedMethodType=(Lscala/Tuple2;)Ljava/lang/Object;, numCaptured=1])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class com.kzw.bigdata.spark.Streaming02.SparkStreaming01$$$Lambda$1035/951772831, com.kzw.bigdata.spark.Streaming02.SparkStreaming01$$$Lambda$1035/951772831@d1c2ddc)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
... 26 more
The error message makes it clear: a com.mysql.jdbc connection object ends up in the task closure and cannot be serialized.
The faulty source code:
val sparkConf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
val ssc = new StreamingContext(sparkConf, Seconds(5))
val lines = ssc.socketTextStream("hadoop001", 9999)
val wc = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)

wc.foreachRDD(rdd => {
  // this block runs on the driver
  val connection = MysqlUtil.getConnection()
  rdd.foreach(pair => {
    // this closure runs on the executors, so `connection` has to be serialized and shipped
    val sql = s"insert into wc(word,cnt) values('${pair._1}', ${pair._2})"
    connection.createStatement().execute(sql)
  })
  MysqlUtil.closeResourse(connection)
})
Why does this seemingly fine code fail?
The official documentation actually explains it quite clearly:
foreachRDD: the most commonly used output operator. It applies a function func to every RDD of the DStream. That function can push the data in each RDD to an external system, for example saving it to files or writing it into a database. Note that the function itself runs on the driver, and the RDD actions defined inside it are then applied to the DStream's RDDs.
Now the error is clear: connection is obtained on the driver, while rdd.foreach() runs on the workers, and a connection object can essentially never be used across machines, so the task fails.
This touches on an important Spark concept: the closure.
A closure is a function that references a variable defined outside of it. Spark serializes the closure, together with every variable it captures, and ships it to the executors; if any captured variable is not serializable, the job fails with exactly this exception.
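The same problem appears whenever a closure captures something that cannot be serialized, not just JDBC connections. A minimal sketch of the pattern (the NotSerializable class and the rdd here are hypothetical, purely for illustration):

class NotSerializable { def tag(s: String): String = s + "!" }

val helper = new NotSerializable        // created on the driver
rdd.map(word => helper.tag(word))       // the closure captures `helper` -> Task not serializable

// Fix: create the object inside the closure, so it is instantiated on the executor
// instead of being serialized and shipped from the driver.
rdd.mapPartitions { iter =>
  val helper = new NotSerializable
  iter.map(helper.tag)
}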
Solutions:
1. Create a connection for every single record (as in the snippet below). Drawback: very inefficient; opening and closing a connection per record carries a large resource overhead.
dstream.foreachRDD { rdd =>
  rdd.foreach { record =>
    val connection = createNewConnection()
    connection.send(record)
    connection.close()
  }
}
2. Use rdd.foreachPartition to create one connection per data partition, use it to push that partition's records to the external system, and release it when done.
// The recommended example from the official documentation:
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    // ConnectionPool is a static, lazily initialized pool of connections
    val connection = ConnectionPool.getConnection()
    partitionOfRecords.foreach(record => connection.send(record))
    ConnectionPool.returnConnection(connection) // return to the pool for future reuse
  }
}
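ConnectionPool in the example above is not something Spark provides; you have to supply it yourself. A minimal sketch of one possible implementation, assuming commons-dbcp2 is on the classpath and using placeholder connection settings (connection.send() in the official example is likewise pseudocode; with JDBC you would execute a statement instead):

import java.sql.Connection
import org.apache.commons.dbcp2.BasicDataSource

object ConnectionPool {
  // lazily initialized once per executor JVM
  private lazy val dataSource: BasicDataSource = {
    val ds = new BasicDataSource()
    ds.setDriverClassName("com.mysql.jdbc.Driver")
    ds.setUrl("jdbc:mysql://hadoop001:3306/spark") // placeholder URL
    ds.setUsername("root")
    ds.setPassword("password")
    ds
  }

  def getConnection(): Connection = dataSource.getConnection

  // with a pooled DataSource, close() simply returns the connection to the pool
  def returnConnection(conn: Connection): Unit = conn.close()
}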
// My own version, modeled on the example above:
wc.foreachRDD(rdd => {
  rdd.foreachPartition(partition => {
    // to write to a different store, only this connection logic needs to change
    val connection = MysqlUtil.getConnection()
    partition.foreach(x => {
      val sql = s"insert into wc_test(word,cnt) values('${x._1}', ${x._2})"
      connection.createStatement().execute(sql)
    })
    MysqlUtil.closeResourse(connection)
  })
})
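MysqlUtil is my own helper and is not shown above; a minimal sketch of what such a helper might look like, using plain DriverManager and placeholder credentials (the real implementation may differ):

import java.sql.{Connection, DriverManager}

object MysqlUtil {
  Class.forName("com.mysql.jdbc.Driver")

  // placeholder URL/user/password; the real MysqlUtil might read these from a config file
  def getConnection(): Connection =
    DriverManager.getConnection("jdbc:mysql://hadoop001:3306/spark", "root", "password")

  def closeResourse(connection: Connection): Unit =
    if (connection != null) connection.close()
}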
The scalikejdbc approach:
DBs.setupAll()
wc.foreachRDD(rdd => {
  rdd.foreachPartition(partition => {
    partition.foreach(pair => {
      DB.autoCommit { implicit session =>
        SQL("insert into wc_test(word,cnt) values(?,?)").bind(pair._1, pair._2).update().apply()
      }
    })
  })
})
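DBs.setupAll() reads its connection settings from application.conf (the db.default.driver / db.default.url / db.default.user / db.default.password keys used by scalikejdbc-config). Keep in mind that with local[2] the driver and executors share one JVM, so this driver-side setup is enough; on a real cluster the pool also has to be initialized in the executor JVMs. If you would rather skip the config file, the default pool can be set up programmatically; a minimal sketch with placeholder credentials:

import scalikejdbc._

Class.forName("com.mysql.jdbc.Driver")
ConnectionPool.singleton("jdbc:mysql://hadoop001:3306/spark", "root", "password")

// DB.autoCommit { implicit session => ... } then runs against this default pool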