Solving the "Unable to find encoder for type stored in a Dataset" problem


Problem description

Error:(350, 43) Unable to find encoder for type stored in a Dataset.  Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._  Support for serializing other types will be added in future releases.
    val  userDataSet = spark.createDataset(usersForDSRDD)

The code that produces the error:

import org.apache.spark
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

/**
  * FileName: RDD_Movie_Users_Analyzer
  * Author:   hadoop
  * Email:    3165845957@qq.com
  * Date:     19-5-19 4:59 PM
  * Description:
  *
  */
object RDD_Movie_Users_Analyzer {

  def main(args: Array[String]): Unit = {
    val  conf  = new SparkConf().setMaster("local[*]")
      .setAppName("RDD_Movie_Users_Analyzer")

    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val path = "file:///home/hadoop/movierecommend/dataset/"

    //user.dat UserID|Age|Gender|Occupation|Zip-code
    val usersRDD = sc.textFile(path + "user.dat")
    //movies.dat MovieId::Title::Genres
    val moviesRDD = sc.textFile(path+"movies.dat")
    //ratings.dat UserID::MovieID::Rating::Timestamp
    val ratingsRDD = sc.textFile(path + "ratings.dat")

    //RDD: MovieID,Title
    val movieInfo = moviesRDD.map(_.split("::")).map(x=>(x(0),x(1))).cache()
    //RDD: UserId,MovieId,Rating
    val ratings = ratingsRDD.map(_.split("::")).map(x=>(x(0),x(1),x(2))).cache()
    //UserID,Gender
    val usersGender = usersRDD.map(_.split("\\|")).map(x=>(x(0),x(2)))

   
    dataSetOps(usersRDD,ratingsRDD,spark)

    spark.stop()

  }


  /**
    * Movie-review analysis implemented with Dataset
    * @param usersRDD user info RDD: UserID|Age|Gender|Occupation|Zip-code
    * @param ratingsRDD user movie-rating RDD: UserID::MovieID::Rating::Timestamp
    * @param spark SparkSession
    */
  def dataSetOps(usersRDD:RDD[String],ratingsRDD:RDD[String],spark:SparkSession): Unit ={
    //Movie-review analysis implemented with Dataset
    import spark.implicits._
    //Case class User encapsulates one user record
    case class User(UserID:String,Gender:String,Age:String,OccupationID:String, Zip_Code:String)

    //Case class Rating encapsulates one user rating record
    case class Rating(UserID:String,MovieID:String,Rating:Double,Timestamp:String)

    //Wrap the user data in the User case class
    val usersForDSRDD = usersRDD.map(_.split("\\|")).
      map(line=>User(line(0).trim,line(2).trim,line(1).trim,line(3).trim,line(4).trim))

    //Finally create the Dataset
    val  userDataSet = spark.createDataset(usersForDSRDD)
    userDataSet.show(10)
    //Wrap the rating data in the Rating case class
    val ratingsForDSRDD = ratingsRDD.map(_.split("::"))
    .map(line=>Rating(line(0).trim,line(1).trim,line(2).trim.toDouble,line(3).trim))

    val ratingsDataSet = spark.createDataset(ratingsForDSRDD)
    //The code below is almost identical to the DataFrame version (just swap DataFrame for Dataset)
    ratingsDataSet.filter("MovieID = 1193").join(userDataSet,"UserID")
      .select("Gender","Age")
      .groupBy("Gender","Age")
      .count()
      .orderBy($"Gender".desc,$"Age")
      .show(10)
  }
}
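
The root cause is that the two case classes, User and Rating, are declared inside the dataSetOps method. With the classes local to a method, the implicit product encoders brought in by spark.implicits._ cannot be resolved for them, so spark.createDataset has no Encoder[User] or Encoder[Rating] and compilation fails with the error above. A minimal sketch of the pattern (the names here are illustrative, not from the original project):

import org.apache.spark.sql.SparkSession

object EncoderScopeSketch {
  // Declared at object level: spark.implicits._ can supply an implicit encoder.
  case class PointOk(x: Int, y: Int)

  def run(spark: SparkSession): Unit = {
    import spark.implicits._

    // Declared inside the method: the implicit encoder lookup fails for it.
    case class PointBad(x: Int, y: Int)

    val ok = spark.createDataset(Seq(PointOk(1, 2)))      // compiles and runs
    // val bad = spark.createDataset(Seq(PointBad(1, 2))) // "Unable to find encoder ..."
    ok.show()
  }
}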

Solution

First make sure the implicit conversions are imported (import spark.implicits._), then declare the custom case classes at the object (top) level rather than inside the method:

import org.apache.spark
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

/**
  * FileName: RDD_Movie_Users_Analyzer
  * Author:   hadoop
  * Email:    3165845957@qq.com
  * Date:     19-5-19 4:59 PM
  * Description:
  *
  */
object RDD_Movie_Users_Analyzer {
   //Case class User encapsulates one user record (now declared at object level)
    case class User(UserID:String,Gender:String,Age:String,OccupationID:String, Zip_Code:String)

    //Case class Rating encapsulates one user rating record (now declared at object level)
    case class Rating(UserID:String,MovieID:String,Rating:Double,Timestamp:String)

  def main(args: Array[String]): Unit = {
    val  conf  = new SparkConf().setMaster("local[*]")
      .setAppName("RDD_Movie_Users_Analyzer")

    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val path = "file:///home/hadoop/movierecommend/dataset/"

    //user.dat UserID|Age|Gender|Occupation|Zip-code
    val usersRDD = sc.textFile(path + "user.dat")
    //movies.dat MovieId::Title::Genres
    val moviesRDD = sc.textFile(path+"movies.dat")
    //ratings.dat UserID::MovieID::Rating::Timestamp
    val ratingsRDD = sc.textFile(path + "ratings.dat")

    //RDD: MovieID,Title
    val movieInfo = moviesRDD.map(_.split("::")).map(x=>(x(0),x(1))).cache()
    //RDD: UserId,MovieId,Rating
    val ratings = ratingsRDD.map(_.split("::")).map(x=>(x(0),x(1),x(2))).cache()
    //UserID,Gender
    val usersGender = usersRDD.map(_.split("\\|")).map(x=>(x(0),x(2)))

   
    dataSetOps(usersRDD,ratingsRDD,spark)

    spark.stop()

  }


  /**
    * Movie-review analysis implemented with Dataset
    * @param usersRDD user info RDD: UserID|Age|Gender|Occupation|Zip-code
    * @param ratingsRDD user movie-rating RDD: UserID::MovieID::Rating::Timestamp
    * @param spark SparkSession
    */
  def dataSetOps(usersRDD:RDD[String],ratingsRDD:RDD[String],spark:SparkSession): Unit ={
    //Movie-review analysis implemented with Dataset
    import spark.implicits._

    //Wrap the user data in the User case class
    val usersForDSRDD = usersRDD.map(_.split("\\|")).
      map(line=>User(line(0).trim,line(2).trim,line(1).trim,line(3).trim,line(4).trim))

    //Finally create the Dataset
    val  userDataSet = spark.createDataset(usersForDSRDD)
    userDataSet.show(10)
    //Wrap the rating data in the Rating case class
    val ratingsForDSRDD = ratingsRDD.map(_.split("::"))
    .map(line=>Rating(line(0).trim,line(1).trim,line(2).trim.toDouble,line(3).trim))

    val ratingsDataSet = spark.createDataset(ratingsForDSRDD)
    //The code below is almost identical to the DataFrame version (just swap DataFrame for Dataset)
    ratingsDataSet.filter("MovieID = 1193").join(userDataSet,"UserID")
      .select("Gender","Age")
      .groupBy("Gender","Age")
      .count()
      .orderBy($"Gender".desc,$"Age")
      .show(10)
  }
}

With the case classes moved to object level, the encoders can be found, the code compiles, and both Datasets are created successfully.
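
Two other ways to obtain the Dataset once the case classes live at object level, shown only as a sketch inside dataSetOps (where import spark.implicits._ is already in scope):

import org.apache.spark.sql.Encoders

// Option 1: use the implicit toDS() conversion provided by spark.implicits._
val userDataSetViaToDS = usersForDSRDD.toDS()

// Option 2: pass the product encoder explicitly instead of relying on the implicit lookup
val userDataSetViaEncoder = spark.createDataset(usersForDSRDD)(Encoders.product[User])

Both still go through the same product encoder machinery, so keeping the case classes at object level remains the important part.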

More related issues are covered at: http://mangocool.com/1477619031890.html
