Problem description

Compilation fails with the following error:

Error:(350, 43) Unable to find encoder for type stored in a Dataset. Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._ Support for serializing other types will be added in future releases.
    val userDataSet = spark.createDataset(usersForDSRDD)
The code that triggers the problem:
import org.apache.spark
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

/**
  * FileName: RDD_Movie_Users_Analyzer
  * Author: hadoop
  * Email: 3165845957@qq.com
  * Date: 2019-05-19 4:59 PM
  * Description:
  */
object RDD_Movie_Users_Analyzer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]")
      .setAppName("RDD_Movie_Users_Analyzer")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val path = "file:///home/hadoop/movierecommend/dataset/"
    // user.dat: UserID|Age|Gender|Occupation|Zip-code
    val usersRDD = sc.textFile(path + "user.dat")
    // movies.dat: MovieID::Title::Genres
    val moviesRDD = sc.textFile(path + "movies.dat")
    // ratings.dat: UserID::MovieID::Rating::Timestamp
    val ratingsRDD = sc.textFile(path + "ratings.dat")

    // RDD of (MovieID, Title)
    val movieInfo = moviesRDD.map(_.split("::")).map(x => (x(0), x(1))).cache()
    // RDD of (UserID, MovieID, Rating)
    val ratings = ratingsRDD.map(_.split("::")).map(x => (x(0), x(1), x(2))).cache()
    // RDD of (UserID, Gender)
    val usersGender = usersRDD.map(_.split("\\|")).map(x => (x(0), x(2)))

    dataSetOps(usersRDD, ratingsRDD, spark)
    spark.stop()
  }

  /**
    * Movie review analysis implemented with Dataset.
    * @param usersRDD   user info RDD, format UserID|Age|Gender|Occupation|Zip-code
    * @param ratingsRDD user movie ratings, format UserID::MovieID::Rating::Timestamp
    * @param spark      SparkSession
    */
  def dataSetOps(usersRDD: RDD[String], ratingsRDD: RDD[String], spark: SparkSession): Unit = {
    import spark.implicits._

    // Case class User to wrap the user data (declared inside the method)
    case class User(UserID: String, Gender: String, Age: String, OccupationID: String, Zip_Code: String)
    // Case class Rating to wrap the rating data (declared inside the method)
    case class Rating(UserID: String, MovieID: String, Rating: Double, Timestamp: String)

    // Wrap the user data in the User case class
    val usersForDSRDD = usersRDD.map(_.split("\\|"))
      .map(line => User(line(0).trim, line(2).trim, line(1).trim, line(3).trim, line(4).trim))
    // Finally create the Dataset; this is the line the compiler rejects
    val userDataSet = spark.createDataset(usersForDSRDD)
    userDataSet.show(10)

    // Wrap the rating data in the Rating case class
    val ratingsForDSRDD = ratingsRDD.map(_.split("::"))
      .map(line => Rating(line(0).trim, line(1).trim, line(2).trim.toDouble, line(3).trim))
    val ratingsDataSet = spark.createDataset(ratingsForDSRDD)

    // The code below is almost identical to the DataFrame version (just swap DataFrame for Dataset)
    ratingsDataSet.filter("MovieID = 1193").join(userDataSet, "UserID")
      .select("Gender", "Age")
      .groupBy("Gender", "Age")
      .count()
      .orderBy($"Gender".desc, $"Age")
      .show(10)
  }
}
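The failure has nothing to do with the movie data itself; it comes from where User and Rating are declared. Because they are local to dataSetOps, the implicits imported via spark.implicits._ cannot derive an Encoder for them, so createDataset does not compile. The same situation can be reproduced with a minimal, hypothetical sketch (the Point class and EncoderScopeBroken object below are made up for illustration):

import org.apache.spark.sql.SparkSession

object EncoderScopeBroken {
  def demo(spark: SparkSession): Unit = {
    import spark.implicits._
    // A case class declared inside a method: no implicit Encoder can be derived for it,
    // so the next line fails to compile with
    // "Unable to find encoder for type stored in a Dataset".
    case class Point(x: Int, y: Int)
    spark.createDataset(Seq(Point(1, 2))).show()
  }
}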
Solution
Two things are needed: keep the implicit conversions in scope (import spark.implicits._), and move the custom case classes out of dataSetOps so that they are members of the object rather than local definitions. Spark cannot derive an Encoder for a case class declared inside a method; the case class has to be visible at the top level, which is exactly what the error message is complaining about.
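Boiled down, the working structure looks like this (the same made-up Point sketch as above, with the case class moved to where an encoder can be derived):

import org.apache.spark.sql.{Dataset, SparkSession}

object EncoderScopeFixed {
  // Defined at the object's top level, so spark.implicits._ can derive an Encoder[Point].
  case class Point(x: Int, y: Int)

  def demo(spark: SparkSession): Unit = {
    import spark.implicits._
    val ds: Dataset[Point] = spark.createDataset(Seq(Point(1, 2)))
    ds.show()
  }
}

The full corrected listing for the movie analyzer: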
import org.apache.spark
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

/**
  * FileName: RDD_Movie_Users_Analyzer
  * Author: hadoop
  * Email: 3165845957@qq.com
  * Date: 2019-05-19 4:59 PM
  * Description:
  */
object RDD_Movie_Users_Analyzer {

  // Case class User to wrap the user data (now a member of the object, so an Encoder can be derived)
  case class User(UserID: String, Gender: String, Age: String, OccupationID: String, Zip_Code: String)
  // Case class Rating to wrap the rating data
  case class Rating(UserID: String, MovieID: String, Rating: Double, Timestamp: String)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]")
      .setAppName("RDD_Movie_Users_Analyzer")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val path = "file:///home/hadoop/movierecommend/dataset/"
    // user.dat: UserID|Age|Gender|Occupation|Zip-code
    val usersRDD = sc.textFile(path + "user.dat")
    // movies.dat: MovieID::Title::Genres
    val moviesRDD = sc.textFile(path + "movies.dat")
    // ratings.dat: UserID::MovieID::Rating::Timestamp
    val ratingsRDD = sc.textFile(path + "ratings.dat")

    // RDD of (MovieID, Title)
    val movieInfo = moviesRDD.map(_.split("::")).map(x => (x(0), x(1))).cache()
    // RDD of (UserID, MovieID, Rating)
    val ratings = ratingsRDD.map(_.split("::")).map(x => (x(0), x(1), x(2))).cache()
    // RDD of (UserID, Gender)
    val usersGender = usersRDD.map(_.split("\\|")).map(x => (x(0), x(2)))

    dataSetOps(usersRDD, ratingsRDD, spark)
    spark.stop()
  }

  /**
    * Movie review analysis implemented with Dataset.
    * @param usersRDD   user info RDD, format UserID|Age|Gender|Occupation|Zip-code
    * @param ratingsRDD user movie ratings, format UserID::MovieID::Rating::Timestamp
    * @param spark      SparkSession
    */
  def dataSetOps(usersRDD: RDD[String], ratingsRDD: RDD[String], spark: SparkSession): Unit = {
    import spark.implicits._

    // Wrap the user data in the User case class
    val usersForDSRDD = usersRDD.map(_.split("\\|"))
      .map(line => User(line(0).trim, line(2).trim, line(1).trim, line(3).trim, line(4).trim))
    // Finally create the Dataset (this compiles now that User is a top-level case class)
    val userDataSet = spark.createDataset(usersForDSRDD)
    userDataSet.show(10)

    // Wrap the rating data in the Rating case class
    val ratingsForDSRDD = ratingsRDD.map(_.split("::"))
      .map(line => Rating(line(0).trim, line(1).trim, line(2).trim.toDouble, line(3).trim))
    val ratingsDataSet = spark.createDataset(ratingsForDSRDD)

    // The code below is almost identical to the DataFrame version (just swap DataFrame for Dataset)
    ratingsDataSet.filter("MovieID = 1193").join(userDataSet, "UserID")
      .select("Gender", "Age")
      .groupBy("Gender", "Age")
      .count()
      .orderBy($"Gender".desc, $"Age")
      .show(10)
  }
}
With these two changes the code compiles and runs, and the problem is solved.
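On a related note, once the case class lives at the top level you can also supply the encoder explicitly instead of relying on the implicit lookup, using the Encoders.product factory that is already imported in the listing above. A sketch of that variant (the ExplicitEncoderSketch object and createUserDataset helper are made up for illustration):

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

object ExplicitEncoderSketch {
  // Top-level case class, mirroring User in the fixed listing above.
  case class User(UserID: String, Gender: String, Age: String, OccupationID: String, Zip_Code: String)

  def createUserDataset(spark: SparkSession, usersForDSRDD: RDD[User]): Dataset[User] = {
    // Encoders.product builds an Encoder for a top-level Product (case class) type.
    val userEncoder: Encoder[User] = Encoders.product[User]
    // Pass the encoder explicitly instead of relying on import spark.implicits._
    spark.createDataset(usersForDSRDD)(userEncoder)
  }
}

Either way, the essential requirement is the same: the case class must be declared where Spark's encoder machinery can see it, not inside a method.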