Spark 2.1.1 and the Scala version incompatibility problem

Today, while splitting up the following dataset:

area,perimeter,compactness,lengthOfKernel,widthOfKernel,asymmetryCoefficient,lengthOfKernelGroove
15.26,14.84,0.871,5.763,3.312,2.221,5.22
14.88,14.57,0.8811,5.554,3.333,1.018,4.956
14.29,14.09,0.905,5.291,3.337,2.699,4.825
13.84,13.94,0.8955,5.324,3.379,2.259,4.805
16.14,14.99,0.9034,5.658,3.562,1.355,5.175
14.38,14.21,0.8951,5.386,3.312,2.462,4.956
14.69,14.49,0.8799,5.563,3.259,3.586,5.219
14.11,14.1,0.8911,5.42,3.302,2.7,5
16.63,15.46,0.8747,6.053,3.465,2.04,5.877
16.44,15.25,0.888,5.884,3.505,1.969,5.533
15.26,14.85,0.8696,5.714,3.242,4.543,5.314
14.03,14.16,0.8796,5.438,3.201,1.717,5.001
13.89,14.02,0.888,5.439,3.199,3.986,4.738
13.78,14.06,0.8759,5.479,3.156,3.136,4.872
13.74,14.05,0.8744,5.482,3.114,2.932,4.825
14.59,14.28,0.8993,5.351,3.333,4.185,4.781
13.99,13.83,0.9183,5.119,3.383,5.234,4.781
15.69,14.75,0.9058,5.527,3.514,1.599,5.046

I got the following error:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2101)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1950)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at homework.Seed$.main(Seed.scala:21)
	at homework.Seed.main(Seed.scala)
Caused by: java.io.NotSerializableException: java.lang.Object
Serialization stack:
	- object not serializable (class: java.lang.Object, value: java.lang.Object@118102ee)
	- element of array (index: 0)
	- array (class [Ljava.lang.Object;, size 1)
	- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
	- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class homework.Seed$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic homework/Seed$.$anonfun$main$2:(Ljava/lang/Object;Ljava/lang/String;)Lscala/collection/Iterable;, instantiatedMethodType=(Ljava/lang/String;)Lscala/collection/Iterable;, numCaptured=1])
	- writeReplace data (class: java.lang.invoke.SerializedLambda)
	- object (class homework.Seed$$$Lambda$3/2139501486, homework.Seed$$$Lambda$3/2139501486@446c3920)
	- field (class: org.apache.spark.rdd.RDD$$anonfun$flatMap$1$$anonfun$apply$6, name: cleanF$2, type: interface scala.Function1)
	- object (class org.apache.spark.rdd.RDD$$anonfun$flatMap$1$$anonfun$apply$6, <function3>)
	- field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
	- object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[3] at flatMap at Seed.scala:21)
	- field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1, name: $outer, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1, <function0>)
	- field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$collect$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 12 more
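The key part of this error is the serialization stack: the flatMap closure was compiled into a SerializedLambda whose capturedArgs array contains a bare java.lang.Object, and java.lang.Object does not implement java.io.Serializable, so Spark fails while serializing the task. The same failure mode can be reproduced without Spark; the following is a hypothetical standalone sketch (not the code from this post) that only illustrates the mechanism of a closure capturing a non-serializable value:

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

object CaptureDemo {
  def main(args: Array[String]): Unit = {
    // A captured value that does NOT implement java.io.Serializable
    val captured = new Object()
    // The function closes over `captured`, so serializing the function
    // means serializing `captured` as well
    val f: String => Int = s => s.length + captured.hashCode()

    val oos = new ObjectOutputStream(new ByteArrayOutputStream())
    oos.writeObject(f) // java.io.NotSerializableException: java.lang.Object
  }
}

What makes the real error confusing is that, as you can see below, my code never captures an Object like that explicitly; the stray captured argument only shows up because of the version mismatch described at the end of this post.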

Here is my code:

import org.apache.spark.{SparkConf, SparkContext}

case class SEED(area: Double, perimeter: Double, compactness: Double, lengthOfKernel: Double,
                widthOfKernel: Double, asymmetryCoefficient: Double, lengthOfKernelGroove: Double)

object Seed {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("filter")
    val sc = new SparkContext(conf)
    val root = Seed.getClass.getResource("/")
    val data = sc.textFile(root + "seeds.csv")

    // Keep only lines that start with a digit, i.e. drop the header row
    val regex = """^\d""".r
    val result = data.filter(regex.findFirstIn(_) != None)
      .flatMap { str =>
        // Split the CSV row and keep it only if area > 15
        val info = str.split(",")
        if (info(0).toDouble > 15)
          Some(SEED(info(0).toDouble, info(1).toDouble, info(2).toDouble, info(3).toDouble,
            info(4).toDouble, info(5).toDouble, info(6).toDouble))
        else
          None
      }
    result.collect().foreach(println)
  }
}

I went over the code carefully and could not find anything wrong with it. After further digging, it turned out to be an incompatibility between the Scala version I was using and the Spark version. The Spark dependency I was using was:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.1</version>
</dependency>

together with Scala SDK 2.12.2.

Spark 2.1.1 is compiled against Scala 2.11.8 (hence the _2.11 suffix on the artifact), so after switching the Scala SDK to 2.11.8 the problem was solved.
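To catch this kind of mismatch early, a quick sanity check is to compare the Scala library that is actually on the classpath with what the Spark artifact expects. Here is a minimal sketch (the "2.11." prefix check simply mirrors the _2.11 suffix of spark-core_2.11):

import scala.util.Properties

object VersionCheck {
  def main(args: Array[String]): Unit = {
    // Scala library version actually on the classpath at runtime
    println(s"Scala library: ${Properties.versionNumberString}")
    // Spark version on the classpath
    println(s"Spark version: ${org.apache.spark.SPARK_VERSION}")

    // spark-core_2.11 is built for the Scala 2.11 binary family; fail fast otherwise
    require(Properties.versionNumberString.startsWith("2.11."),
      s"spark-core_2.11 needs a Scala 2.11.x library, found ${Properties.versionNumberString}")
  }
}

For a Spark distribution (rather than a Maven project), running spark-shell --version should likewise report something like "Using Scala version 2.11.8", which matches the scala.version property in the build configuration below.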

For reference, here are all the version properties from the Spark 2.1.1 build's pom.xml:

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <java.version>1.7</java.version>
    <maven.version>3.3.9</maven.version>
    <sbt.project.name>spark</sbt.project.name>
    <slf4j.version>1.7.16</slf4j.version>
    <log4j.version>1.2.17</log4j.version>
    <hadoop.version>2.2.0</hadoop.version>
    <protobuf.version>2.5.0</protobuf.version>
    <yarn.version>${hadoop.version}</yarn.version>
    <flume.version>1.6.0</flume.version>
    <zookeeper.version>3.4.5</zookeeper.version>
    <curator.version>2.4.0</curator.version>
    <hive.group>org.spark-project.hive</hive.group>
    <!-- Version used in Maven Hive dependency -->
    <hive.version>1.2.1.spark2</hive.version>
    <!-- Version used for internal directory structure -->
    <hive.version.short>1.2.1</hive.version.short>
    <derby.version>10.12.1.1</derby.version>
    <parquet.version>1.8.1</parquet.version>
    <hive.parquet.version>1.6.0</hive.parquet.version>
    <jetty.version>9.2.16.v20160414</jetty.version>
    <javaxservlet.version>3.1.0</javaxservlet.version>
    <chill.version>0.8.0</chill.version>
    <ivy.version>2.4.0</ivy.version>
    <oro.version>2.0.8</oro.version>
    <codahale.metrics.version>3.1.2</codahale.metrics.version>
    <avro.version>1.7.7</avro.version>
    <avro.mapred.classifier>hadoop2</avro.mapred.classifier>
    <jets3t.version>0.7.1</jets3t.version>
    <aws.kinesis.client.version>1.6.1</aws.kinesis.client.version>
    <!-- the producer is used in tests -->
    <aws.kinesis.producer.version>0.10.2</aws.kinesis.producer.version>
    <!--  org.apache.httpcomponents/httpclient-->
    <commons.httpclient.version>4.5.2</commons.httpclient.version>
    <commons.httpcore.version>4.4.4</commons.httpcore.version>
    <!--  commons-httpclient/commons-httpclient-->
    <httpclient.classic.version>3.1</httpclient.classic.version>
    <commons.math3.version>3.4.1</commons.math3.version>
    <!-- managed up from 3.2.1 for SPARK-11652 -->
    <commons.collections.version>3.2.2</commons.collections.version>
    <scala.version>2.11.8</scala.version>
    <scala.binary.version>2.11</scala.binary.version>
    <codehaus.jackson.version>1.9.13</codehaus.jackson.version>
    <fasterxml.jackson.version>2.6.5</fasterxml.jackson.version>
    <snappy.version>1.1.2.6</snappy.version>
    <netlib.java.version>1.1.2</netlib.java.version>
    <calcite.version>1.2.0-incubating</calcite.version>
    <commons-codec.version>1.10</commons-codec.version>
    <commons-io.version>2.4</commons-io.version>
    <!-- org.apache.commons/commons-lang/-->
    <commons-lang2.version>2.6</commons-lang2.version>
    <!-- org.apache.commons/commons-lang3/-->
    <commons-lang3.version>3.5</commons-lang3.version>
    <datanucleus-core.version>3.2.10</datanucleus-core.version>
    <janino.version>3.0.0</janino.version>
    <jersey.version>2.22.2</jersey.version>
    <joda.version>2.9.3</joda.version>
    <jodd.version>3.5.2</jodd.version>
    <jsr305.version>1.3.9</jsr305.version>
    <libthrift.version>0.9.3</libthrift.version>
    <antlr4.version>4.5.3</antlr4.version>
    <jpam.version>1.1</jpam.version>
    <selenium.version>2.52.0</selenium.version>
    <paranamer.version>2.8</paranamer.version>
    <maven-antrun.version>1.8</maven-antrun.version>
    <commons-crypto.version>1.0.0</commons-crypto.version>

    <test.java.home>${java.home}</test.java.home>
    <test.exclude.tags></test.exclude.tags>

    <!-- When using different JDKs for the build, we can't use Zinc for the jdk8 part. -->
    <useZincForJdk8>true</useZincForJdk8>

    <!-- Package to use when relocating shaded classes. -->
    <spark.shade.packageName>org.spark_project</spark.shade.packageName>

    <!-- Modules that copy jars to the build directory should do so under this location. -->
    <jars.target.dir>${project.build.directory}/scala-${scala.binary.version}/jars</jars.target.dir>

    <!-- Allow modules to enable / disable certain build plugins easily. -->
    <build.testJarPhase>prepare-package</build.testJarPhase>
    <build.copyDependenciesPhase>none</build.copyDependenciesPhase>

    <!--
      Dependency scopes that can be overridden by enabling certain profiles. These profiles are
      declared in the projects that build assemblies.

      For other projects the scope should remain as "compile", otherwise they are not available
      during compilation if the dependency is transivite (e.g. "graphx/" depending on "core/" and
      needing Hadoop classes in the classpath to compile).
    -->
    <flume.deps.scope>compile</flume.deps.scope>
    <hadoop.deps.scope>compile</hadoop.deps.scope>
    <hive.deps.scope>compile</hive.deps.scope>
    <parquet.deps.scope>compile</parquet.deps.scope>
    <parquet.test.deps.scope>test</parquet.test.deps.scope>

    <!--
      Overridable test home. So that you can call individual pom files directly without
      things breaking.
    -->
    <spark.test.home>${session.executionRootDirectory}</spark.test.home>

    <PermGen>64m</PermGen>
    <MaxPermGen>512m</MaxPermGen>
    <CodeCacheSize>512m</CodeCacheSize>
  </properties>

Version compatibility really, really matters!
