Solving a Spark serialization problem

Recently my company has been building an e-commerce recommendation system. One piece of it is a score-conversion step: user behavior on the e-commerce site is converted into corresponding rating data, which is then fed into the algorithms provided by Spark MLlib.

While implementing the score conversion I ran into a serialization problem, so today I took the time to sort out how serialization issues arise in Spark.

Spark version: 2.1.0, with the following Maven dependencies:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>


Let's start with the crudest version of the code.

import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SparkSession}

/**
  * Created by xiaoxiao_zhang on 2017/5/9.
  * Score conversion:
  * convert user behavior (browse, favorite, add to cart, purchase) into the corresponding rating.
  */
class ScoreConvert1(sc:SparkContext) extends Serializable{

//  def covert2Score(sourceRdd: RDD[String]) = {
  def covert2Score() = {
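    // the source file is read through the SparkContext held as a field of this class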
    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")
    val scoreRdd = sourceRdd.map(t => {
      val splits = t.split(",")
      if (splits.length >= 3) {
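        // behavior_type: 1 = browse, 2 = favorite, 3 = add to cart, 4 = purchase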
        val actionScore = splits(2) match {
          case "1" => 1
          case "2" => 3
          case "3" => 6
          case "4" => 8
          case _ => 0.0
        }
        Array(splits(0), splits(1), actionScore).mkString(",")
      } else {
        null
      }
    })
    scoreRdd
  }

  def parseRating(str: String): Rating1 = {
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }

  def fillWithScore(scores: RDD[String], spark: SparkSession) {
    import spark.implicits._
    val ratings = scores
      .map(parseRating)
      .toDF()

    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("itemId")
      .setRatingCol("rating")
    val model: ALSModel = als.fit(training)
    model.itemFactors.show()

    println("-----1111-------")
    val userFactors = model.userFactors
    val itemFactors = model.itemFactors

    userFactors.show(truncate = false)
    println("------2222-------")
    model.itemFactors.show(truncate = false)

    val joinedDF = userFactors.join(itemFactors,Seq("features"),"inner")
    println("------3333-------")
    joinedDF.show(10)
  }

}


object ScoreConvert1 {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("ScoreConvert").setMaster("local[5]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")
    val props = new Properties()
    props.load(new FileInputStream(Constant.profileName))

    val spark = SparkSession.builder().getOrCreate()

    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")
    val convert: ScoreConvert1 = new ScoreConvert1(sc)
    val scores: RDD[String] = convert.covert2Score()
    println("scores-----" + scores.take(3).toList)
    convert.fillWithScore(scores, spark)

  }
}

case class Rating1(userId: Int, itemId: Int, rating: Float)

Running it immediately throws the following error:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.map(RDD.scala:369)
at com.allyes.awise.eng.score.ScoreConvert1.fillWithScore(ScoreConvert1.scala:53)
at com.allyes.awise.eng.score.ScoreConvert1$.main(ScoreConvert1.scala:97)
at com.allyes.awise.eng.score.ScoreConvert1.main(ScoreConvert1.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
- object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@63649129)
- field (class: com.allyes.awise.eng.score.ScoreConvert1, name: sc, type: class org.apache.spark.SparkContext)
- object (class com.allyes.awise.eng.score.ScoreConvert1, com.allyes.awise.eng.score.ScoreConvert1@2423696d)
- field (class: com.allyes.awise.eng.score.ScoreConvert1$$anonfun$2, name: $outer, type: class com.allyes.awise.eng.score.ScoreConvert1)
- object (class com.allyes.awise.eng.score.ScoreConvert1$$anonfun$2, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 17 more

Process finished with exit code 1


The error message is clear: the SparkContext is not serializable. The SparkContext is the execution entry point of a Spark program and is not meant to be serialized at all. Let's look at the line the error points to:

def fillWithScore(scores: RDD[String], spark: SparkSession) {
  import spark.implicits._
  val ratings = scores
    .map(parseRating)
    .toDF()
Inside the RDD's map we call the parseRating method.

This method does nothing more than split the string and wrap the fields in a case class, which at first glance looks perfectly harmless.

After talking it over with colleagues and reading some material online, the real rule became clear: any external object used by code running on the executors must be serializable. Here parseRating is a method of ScoreConvert1, and fillWithScore is a member method of the same class.

When the executors invoke parseRating, every member variable, member method, and external variable of the enclosing class has to be serializable as well. So even though ScoreConvert1 itself extends Serializable, the SparkContext passed into its constructor does not, and that is exactly what triggers the error!
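To make the capture explicit, here is a minimal sketch (a hypothetical Holder class, not the project code) of the same pattern: calling an instance method inside map drags the whole instance, and with it the SparkContext field, into the task closure.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

class Holder(sc: SparkContext) extends Serializable {
  // sc is used in a method body, so the compiler keeps it as a field of the instance
  def load(): RDD[String] = sc.textFile("data/sample.txt")

  // an instance method: calling it requires a reference to `this`
  def parse(s: String): Int = s.split(",")(0).toInt

  // rdd.map(parse) is shorthand for rdd.map(x => this.parse(x)), so the closure
  // captures `this` and with it the non-serializable SparkContext field, which
  // produces the same "Task not serializable" error as above
  def run(rdd: RDD[String]): RDD[Int] = rdd.map(parse)
}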


Now that the cause of the error is clear, let's improve the code.

Method 1: move the work done in parseRating directly into the map lambda. With that change, ScoreConvert1 no longer needs to be serializable at all.
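A minimal sketch of Method 1 applied to fillWithScore (only the parsing part changes; the ALS code that follows stays exactly as in the listing above):

def fillWithScore(scores: RDD[String], spark: SparkSession): Unit = {
  import spark.implicits._
  // the parsing now lives inside the lambda, so the closure does not reference
  // any member of ScoreConvert1 and the class does not need to be serializable
  val ratings = scores.map { str =>
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }.toDF()

  // ... ALS training and factor inspection unchanged
}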



Method 2: keep using the parseRating function. In that case every member variable, method, and external variable of ScoreConvert1 must be serializable. The change is to move the file-reading step outside the class and pass the resulting RDD into the method as a parameter; ScoreConvert1 itself must still extend Serializable. The changes are shown in the revised code below.





Also, there is no point in passing sc into ScoreConvert1 anymore, because nothing inside the class uses it now.


The improved version looks like this:

import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SparkSession}

/**
  * Created by xiaoxiao_zhang on 2017/5/9.
  * Score conversion:
  * convert user behavior (browse, favorite, add to cart, purchase) into the corresponding rating.
  */
class ScoreConvert1 extends Serializable {

  def covert2Score(sourceRdd: RDD[String]) = {
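    // the RDD is now passed in from outside; the class no longer holds a SparkContext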
    val scoreRdd = sourceRdd.map(t => {
      val splits = t.split(",")
      if (splits.length >= 3) {
        val actionScore = splits(2) match {
          case "1" => 1
          case "2" => 3
          case "3" => 6
          case "4" => 8
          case _ => 0.0
        }
        Array(splits(0), splits(1), actionScore).mkString(",")
      } else {
        null
      }
    })
    scoreRdd
  }

  def parseRating(str: String): Rating1 = {
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }

  def fillWithScore(scores: RDD[String], spark: SparkSession) {
    import spark.implicits._
    val ratings = scores
      .map(parseRating)
      .toDF()

    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("itemId")
      .setRatingCol("rating")
    val model: ALSModel = als.fit(training)
    model.itemFactors.show()

    println("-----1111-------")
    val userFactors = model.userFactors
    val itemFactors = model.itemFactors

    userFactors.show(truncate = false)
    println("------2222-------")
    model.itemFactors.show(truncate = false)

    val joinedDF = userFactors.join(itemFactors,Seq("features"),"inner")
    println("------3333-------")
    joinedDF.show(10)
  }
}


object ScoreConvert1 {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("ScoreConvert").setMaster("local[5]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")
    val props = new Properties()
    props.load(new FileInputStream(Constant.profileName))

    val spark = SparkSession.builder().getOrCreate()

    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")
    val convert: ScoreConvert1 = new ScoreConvert1()
    val scores: RDD[String] = convert.covert2Score(sourceRdd)
    println("scores-----" + scores.take(3).toList)
    convert.fillWithScore(scores, spark)

  }
}

case class Rating1(userId: Int, itemId: Int, rating: Float)


Summary: whenever you use something external inside an RDD operation such as .map or .filter, that external thing must be serializable. Here "external" can mean a member variable of the class the function lives in, a member method of that class, or any other object brought in through the class constructor.

Otherwise, move the work the function does directly inside the map itself, so that no object needs to be serialized at all. With that, the serialization problem that had troubled me for two days is finally solved.
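Putting the rule into one compact sketch (a hypothetical Converter class, not the project code): the first variant pulls the whole enclosing instance into the closure through a member-method call, while the second keeps everything local to the lambda.

import org.apache.spark.rdd.RDD

class Converter(val tag: String) {
  def parse(s: String): Int = s.trim.toInt

  // calls a member method, so the closure captures `this`: Converter (and every
  // field it holds, here `tag`) must then be serializable, or the task fails
  def viaMethod(rdd: RDD[String]): RDD[Int] = rdd.map(parse)

  // everything the lambda needs is created inside it, so nothing from the class
  // is captured and Converter does not need to be serializable
  def inlined(rdd: RDD[String]): RDD[Int] = rdd.map(_.trim.toInt)
}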


Written on 2017-05-11, during the recommendation system project.


