Recently my company has been working on an e-commerce recommendation system, which includes a score-conversion feature: user behaviors on the site are turned into rating data, which is then fed into the algorithms provided by Spark MLlib.
While implementing the score conversion I ran into a serialization error, so today I took the time to sort out how serialization works in Spark.
Spark version: 2.1.0. Maven dependencies:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>${spark.version}</version>
</dependency>
Let's start from the first (and worst) version of the code:
import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
  * Created by xiaoxiao_zhang on 2017/5/9.
  * Score conversion:
  * map user behaviors (view, favorite, add-to-cart, purchase) to ratings.
  */
/**
  * Created by xiaoxiao_zhang on 2017/5/11.
  */
class ScoreConvert1(sc: SparkContext) extends Serializable {

  // def covert2Score(sourceRdd: RDD[String]) = {
  def covert2Score() = {
    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")
    val scoreRdd = sourceRdd.map(t => {
      val splits = t.split(",")
      if (splits.length >= 3) {
        val actionScore = splits(2) match {
          case "1" => 1
          case "2" => 3
          case "3" => 6
          case "4" => 8
          case _ => 0.0
        }
        Array(splits(0), splits(1), actionScore).mkString(",")
      } else {
        null
      }
    })
    scoreRdd
  }

  def parseRating(str: String): Rating1 = {
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }

  def fillWithScore(scores: RDD[String], spark: SparkSession) {
    import spark.implicits._
    val ratings = scores
      .map(parseRating)
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("itemId")
      .setRatingCol("rating")
    val model: ALSModel = als.fit(training)

    model.itemFactors.show()
    println("-----1111-------")

    val userFactors = model.userFactors
    val itemFactors = model.itemFactors
    userFactors.show(truncate = false)
    println("------2222-------")
    model.itemFactors.show(truncate = false)

    val joinedDF = userFactors.join(itemFactors, Seq("features"), "inner")
    println("------3333-------")
    joinedDF.show(10)
  }
}

object ScoreConvert1 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("ScoreConvert").setMaster("local[5]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")

    val props = new Properties()
    props.load(new FileInputStream(Constant.profileName)) // project-specific config file, not shown here

    val spark = SparkSession.builder().getOrCreate()
    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")

    val convert: ScoreConvert1 = new ScoreConvert1(sc)
    val scores: RDD[String] = convert.covert2Score()
    println("scores-----" + scores.take(3).toList)
    convert.fillWithScore(scores, spark)
  }
}

case class Rating1(userId: Int, itemId: Int, rating: Float)
Running it immediately produces the following error:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
	at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
	at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
	at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
	at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.map(RDD.scala:369)
	at com.allyes.awise.eng.score.ScoreConvert1.fillWithScore(ScoreConvert1.scala:53)
	at com.allyes.awise.eng.score.ScoreConvert1$.main(ScoreConvert1.scala:97)
	at com.allyes.awise.eng.score.ScoreConvert1.main(ScoreConvert1.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
	- object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@63649129)
	- field (class: com.allyes.awise.eng.score.ScoreConvert1, name: sc, type: class org.apache.spark.SparkContext)
	- object (class com.allyes.awise.eng.score.ScoreConvert1, com.allyes.awise.eng.score.ScoreConvert1@2423696d)
	- field (class: com.allyes.awise.eng.score.ScoreConvert1$$anonfun$2, name: $outer, type: class com.allyes.awise.eng.score.ScoreConvert1)
	- object (class com.allyes.awise.eng.score.ScoreConvert1$$anonfun$2, <function1>)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
	... 17 more

Process finished with exit code 1
The message is clear: SparkContext is not serializable. SparkContext is the entry point of a Spark application and was never meant to be serialized and shipped to executors. Let's look at the line the stack trace points to, inside fillWithScore:
def fillWithScore(scores: RDD[String], spark: SparkSession) {
  import spark.implicits._
  val ratings = scores
    .map(parseRating)
    .toDF()

After the map on the RDD, the function being called is parseRating,
and that method does nothing but split a string and pack the fields into a case class, so at first glance there is nothing wrong with it.
After talking with colleagues and reading some material online, I learned that any external object referenced by code running on an executor has to be serializable. Here parseRating is a member method of the class ScoreConvert1, and fillWithScore is also a member method of ScoreConvert1.
When an executor calls parseRating, the whole enclosing instance is captured in the closure, so every member variable, member method, and constructor argument of that class must be serializable. In other words, even though ScoreConvert1 extends Serializable, its constructor argument sc (the SparkContext) is not serializable, and that is exactly what the error reports!
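To see the mechanism in isolation, here is a minimal, self-contained sketch; the Config/Job names are made up for illustration and are not from the project. The member method stands in for parseRating, and the non-serializable Config field stands in for the SparkContext:

import org.apache.spark.{SparkConf, SparkContext}

// Stand-in for a non-serializable dependency (the role SparkContext plays in this post).
class Config(val factor: Int) // note: does NOT extend Serializable

class Job(conf: Config) extends Serializable {
  // Member method: using it inside map captures `this`, and with it the `conf` field.
  def scale(x: Int): Int = x * conf.factor

  def run(sc: SparkContext): Unit = {
    // Fails with org.apache.spark.SparkException: Task not serializable,
    // caused by java.io.NotSerializableException: Config
    println(sc.parallelize(1 to 10).map(scale).collect().mkString(","))
  }
}

object Job {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("demo").setMaster("local[2]"))
    new Job(new Config(3)).run(sc)
  }
}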
Now that the cause is clear, let's improve the code.
Option 1: move the work done by parseRating directly into the map closure. Then nothing from ScoreConvert1 is captured and the class does not need to be serializable at all; see the sketch right after this.
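A minimal sketch of option 1, assuming the same Rating1 case class and comma-separated input as above; only the beginning of fillWithScore changes, the ALS code below it stays as before:

def fillWithScore(scores: RDD[String], spark: SparkSession): Unit = {
  import spark.implicits._
  // Parse inline: the closure touches only its own local variables and the
  // top-level case class Rating1, so the enclosing class is never captured
  // and ScoreConvert1 no longer needs to extend Serializable.
  val ratings = scores.map { str =>
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }.toDF()
  // ... the rest of the ALS code stays exactly the same
}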
Option 2: keep calling the parseRating function. In that case everything reachable through ScoreConvert1 (member variables, member methods, constructor arguments) must be serializable. The fix is to move the file-reading code out of the class and pass the resulting RDD in as a parameter, so the class no longer holds the SparkContext; ScoreConvert1 itself still has to extend Serializable. The changes are shown in the full listing below.
Also note that passing sc into ScoreConvert1 no longer makes sense, because nothing inside the class uses it any more.
The improved version:
import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
  * Created by xiaoxiao_zhang on 2017/5/9.
  * Score conversion:
  * map user behaviors (view, favorite, add-to-cart, purchase) to ratings.
  */
/**
  * Created by xiaoxiao_zhang on 2017/5/11.
  */
class ScoreConvert1 extends Serializable { // no SparkContext in the constructor any more

  def covert2Score(sourceRdd: RDD[String]) = { // the RDD is now passed in from main
    val scoreRdd = sourceRdd.map(t => {
      val splits = t.split(",")
      if (splits.length >= 3) {
        val actionScore = splits(2) match {
          case "1" => 1
          case "2" => 3
          case "3" => 6
          case "4" => 8
          case _ => 0.0
        }
        Array(splits(0), splits(1), actionScore).mkString(",")
      } else {
        null
      }
    })
    scoreRdd
  }

  def parseRating(str: String): Rating1 = {
    val fields = str.split(",")
    Rating1(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
  }

  def fillWithScore(scores: RDD[String], spark: SparkSession) {
    import spark.implicits._
    val ratings = scores
      .map(parseRating)
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("itemId")
      .setRatingCol("rating")
    val model: ALSModel = als.fit(training)

    model.itemFactors.show()
    println("-----1111-------")

    val userFactors = model.userFactors
    val itemFactors = model.itemFactors
    userFactors.show(truncate = false)
    println("------2222-------")
    model.itemFactors.show(truncate = false)

    val joinedDF = userFactors.join(itemFactors, Seq("features"), "inner")
    println("------3333-------")
    joinedDF.show(10)
  }
}

object ScoreConvert1 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("ScoreConvert").setMaster("local[5]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")

    val props = new Properties()
    props.load(new FileInputStream(Constant.profileName)) // project-specific config file, not shown here

    val spark = SparkSession.builder().getOrCreate()
    val sourceRdd: RDD[String] = sc.textFile("data/tianchi_mobile_recommend_train_user.csv")

    val convert: ScoreConvert1 = new ScoreConvert1()
    val scores: RDD[String] = convert.covert2Score(sourceRdd)
    println("scores-----" + scores.take(3).toList)
    convert.fillWithScore(scores, spark)
  }
}

case class Rating1(userId: Int, itemId: Int, rating: Float)
Summary: whenever you reference something external inside an RDD operation such as .map or .filter, that thing must be serializable. The "external" objects include the member variables of the enclosing class, its member methods, and any other objects captured through its constructor.
Otherwise, move the logic directly inside the map closure so that no outer object needs to be serialized. With that, the serialization problem that had bothered me for two days was finally solved.
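One more idiom along the same lines, shown here only as a hedged sketch with made-up names (it is not used in this post): when a closure needs just the value of a member variable, copy it into a local val first, so the task captures only that value instead of the whole enclosing object.

import org.apache.spark.rdd.RDD

// Illustrative class; it deliberately does NOT extend Serializable.
class Scorer {
  val weight: Double = 2.0

  def scale(nums: RDD[Double]): RDD[Double] = {
    // Copy the member into a local val: the closure captures only the Double `w`,
    // not `this`, so Scorer itself never has to be serialized.
    val w = weight
    nums.map(_ * w)
  }
}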
Written on 2017-05-11, recommendation system project.