用 IDEA 执行 Spark ML 任务,参考官方的示例代码,但运行时报错如下:
org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:176)
org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
lineReg$.main(lineReg.scala:42)
lineReg.main(lineReg.scala)
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$1(RDD.scala:850)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:849)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:630)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3383)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2544)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3364)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2551)
at org.apache.spark.sql.Dataset.first(Dataset.scala:2558)
at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:321)
at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:183)
at scala.util.Try$.apply(Try.scala:209)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)
at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:319)
at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:176)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
at lineReg$.main(lineReg.scala:42)
at lineReg.main(lineReg.scala)
Caused by: java.io.NotSerializableException: scala.runtime.LazyRef
Serialization stack:
- object not serializable (class: scala.runtime.LazyRef, value: LazyRef thunk)
- element of array (index: 2)
- array (class [Ljava.lang.Object;, size 3)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.catalyst.expressions.ScalaUDF, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/catalyst/expressions/ScalaUDF.$anonfun$f$2:(Lscala/Function1;Lorg/apache/spark/sql/catalyst/expressions/Expression;Lscala/runtime/LazyRef;Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, instantiatedMethodType=(Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, numCaptured=3])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$1885/853264965, org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$1885/853264965@6604f246)
- field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
- object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(named_struct(square, square#35)))
- field (class: org.apache.spark.sql.catalyst.expressions.Alias, name: child, type: class org.apache.spark.sql.catalyst.expressions.Expression)
- object (class org.apache.spark.sql.catalyst.expressions.Alias, UDF(named_struct(square, square#35)) AS features#41)
- element of array (index: 3)
- array (class [Ljava.lang.Object;, size 4)
- field (class: scala.collection.mutable.ArrayBuffer, name: array, type: class [Ljava.lang.Object;)
- object (class scala.collection.mutable.ArrayBuffer, ArrayBuffer(square#35, price#36, rand#37, UDF(named_struct(square, square#35)) AS features#41))
- field (class: org.apache.spark.sql.execution.ProjectExec, name: projectList, type: interface scala.collection.Seq)
- object (class org.apache.spark.sql.execution.ProjectExec, Project [square#35, price#36, rand#37, UDF(named_struct(square, square#35)) AS features#41]
+- Sort [rand#37 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(rand#37 ASC NULLS FIRST, 200)
+- *(1) Project [_1#31 AS square#35, _2#32 AS price#36, _3#33 AS rand#37]
+- *(1) SerializeFromObject [assertnotnull(input[0, scala.Tuple3, true])._1 AS _1#31, assertnotnull(input[0, scala.Tuple3, true])._2 AS _2#32, assertnotnull(input[0, scala.Tuple3, true])._3 AS _3#33]
+- *(1) MapElements lineReg$$$Lambda$1793/897098843@1dd443c1, obj#30: scala.Tuple3
+- *(1) DeserializeToObject createexternalrow(square#11.toString, price#12.toString, StructField(square,StringType,true), StructField(price,StringType,true)), obj#29: org.apache.spark.sql.Row
+- *(1) FileScan csv [square#11,price#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/BaiduNetdiskDownload/资料与代码spark/coding-271/ml/house.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<square:string,price:string>
)
- field (class: org.apache.spark.sql.execution.SortExec, name: child, type: class org.apache.spark.sql.execution.SparkPlan)
- object (class org.apache.spark.sql.execution.SortExec, Sort [square#35 ASC NULLS FIRST, price#36 ASC NULLS FIRST, rand#37 ASC NULLS FIRST, features#41 ASC NULLS FIRST], false, 0
+- Project [square#35, price#36, rand#37, UDF(named_struct(square, square#35)) AS features#41]
+- Sort [rand#37 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(rand#37 ASC NULLS FIRST, 200)
+- *(1) Project [_1#31 AS square#35, _2#32 AS price#36, _3#33 AS rand#37]
+- *(1) SerializeFromObject [assertnotnull(input[0, scala.Tuple3, true])._1 AS _1#31, assertnotnull(input[0, scala.Tuple3, true])._2 AS _2#32, assertnotnull(input[0, scala.Tuple3, true])._3 AS _3#33]
+- *(1) MapElements lineReg$$$Lambda$1793/897098843@1dd443c1, obj#30: scala.Tuple3
+- *(1) DeserializeToObject createexternalrow(square#11.toString, price#12.toString, StructField(square,StringType,true), StructField(price,StringType,true)), obj#29: org.apache.spark.sql.Row
+- *(1) FileScan csv [square#11,price#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/E:/BaiduNetdiskDownload/资料与代码spark/coding-271/ml/house.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<square:string,price:string>
)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 12)
- element of array (index: 1)
- array (class [Ljava.lang.Object;, size 3)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.execution.WholeStageCodegenExec, functionalInterfaceMethod=scala/Function2.apply:(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/execution/WholeStageCodegenExec.$anonfun$doExecute$4$adapted:(Lorg/apache/spark/sql/catalyst/expressions/codegen/CodeAndComment;[Ljava/lang/Object;Lorg/apache/spark/sql/execution/metric/SQLMetric;Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, instantiatedMethodType=(Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, numCaptured=3])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$1413/744918058, org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$1413/744918058@7f41d979)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
... 36 more
查看异常堆栈,发现关键的报错信息是:
java.io.NotSerializableException: scala.runtime.LazyRef
查看文档:
https://stackoverflow.com/questions/61198637/spark-error-java-io-notserializableexception-scala-runtime-lazyref/61220420
有如下回复:
答复大意是:Scala 版本要在 2.12.8 及以上(2.12.8 之前的版本中 scala.runtime.LazyRef 没有实现 Serializable,闭包序列化时就会抛出上述异常),于是查看了 IDEA 中项目使用的 Scala 版本。
很显然,我们的 Scala 版本过低,调整为 2.12.10 后重新运行,问题解决。