Spark创建Dataset的几种方式

一:通过createDataset(seq,list,rdd)

import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, SparkSession}

/**
 * Demonstrates building Datasets with `SparkSession.createDataset` from a
 * Scala `Seq`, a `List` and an `RDD`.
 */
object CreateDataset {

  /**
   * Creates a session against the `local[4]` master, builds three Datasets
   * and prints each of them with `show()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .getOrCreate()

    try {
      // The implicit encoders required by createDataset come from the session.
      import spark.implicits._

      val sc: SparkContext = spark.sparkContext

      // Dataset built from a Seq (here the range 1 to 10).
      val seqDs: Dataset[Int] = spark.createDataset(1 to 10)
      // Dataset built from a List of pairs.
      val listDs: Dataset[(String, Int)] =
        spark.createDataset(List(("a", 1), ("b", 2), ("c", 3)))
      // Dataset built from an RDD of triples.
      val rddDs: Dataset[(String, Int, Int)] =
        spark.createDataset(sc.parallelize(List(("a", 1, 2), ("b", 2, 3), ("c", 3, 4))))

      seqDs.show()
      listDs.show()
      rddDs.show()
    } finally {
      // Stop the session even when one of the actions above fails, so the
      // underlying SparkContext is not left running.
      spark.stop()
    }
  }

}

二:通过case class

1.通过case class样例类创建一个seq、list、Array、RDD,再.toDS转化为Dataset

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

import scala.collection.mutable

/**
 * Demonstrates creating Datasets of case-class instances by calling `.toDS()`
 * on a `Seq`, a `List` and an `RDD`, then joining them.
 */
object CreateDataSetByCaseClass {

  /** A labelled point with two coordinates. */
  case class Point(label: String, x: Double, y: Double)

  /** A category; its `name` is compared with `Point.label` in the joins below. */
  case class Category(id: Long, name: String)

  /**
   * Builds the same two Datasets three times (from Seq, List and RDD) and
   * prints the result of joining them each time.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .getOrCreate()

    try {
      // The implicit conversions that provide .toDS() come from the session.
      import spark.implicits._

      val sc: SparkContext = spark.sparkContext

      // Seq of case-class instances -> Dataset
      val points: Dataset[Point] =
        Seq(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7)).toDS()
      val categories: Dataset[Category] =
        Seq(Category(1, "bar"), Category(2, "foo")).toDS()
      // Join condition: note the three '=' signs; `===` is a method that
      // compares two columns, not the ordinary `==` operator.
      points.join(categories, points("label") === categories("name")).show()

      // List of case-class instances -> Dataset
      val points2: Dataset[Point] =
        List(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7)).toDS()
      val categories2: Dataset[Category] =
        List(Category(1, "bar"), Category(2, "foo")).toDS()
      points2.join(categories2, points2("label") === categories2("name")).show()

      // RDD of case-class instances -> Dataset
      val points3: Dataset[Point] =
        sc.parallelize(List(Point("bar", 2.6, 3.5), Point("foo", 4.0, 3.7))).toDS()
      val categories3: Dataset[Category] =
        sc.parallelize(List(Category(1, "bar"), Category(2, "foo"))).toDS()
      points3.join(categories3, points3("label") === categories3("name")).show()
    } finally {
      // Stop the session even when a join/show fails, so the underlying
      // SparkContext is not left running.
      spark.stop()
    }
  }
}

2.先创建RDD,再把RDD和样例类进行关联,最后通过.toDS转化为Dataset

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

import scala.collection.mutable

/**
 * Demonstrates creating RDDs of plain tuples first, mapping each tuple onto a
 * case class, and then converting the result to a Dataset with `.toDS()`.
 *
 * NOTE(review): this object has the same name as the object in the previous
 * snippet; if both are meant to live in one package, one of them must be
 * renamed — confirm how the snippets are intended to be compiled.
 */
object CreateDataSetByCaseClass {

  /** A labelled point with two coordinates. */
  case class Point(label: String, x: Double, y: Double)

  /** A category; its `name` is compared with `Point.label` in the join below. */
  case class Category(id: Long, name: String)

  /**
   * Builds two tuple RDDs, maps them to `Point` / `Category`, converts them
   * to Datasets and prints their join.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .getOrCreate()

    try {
      // The implicit conversions that provide .toDS() come from the session.
      import spark.implicits._

      val sc: SparkContext = spark.sparkContext

      // Raw point data as an RDD of (label, x, y) tuples.
      val pointRdd: RDD[(String, Double, Double)] =
        sc.parallelize(List(("bar", 2.6, 3.5), ("foo", 4.0, 3.7)))
      // Raw category data as an RDD of (id, name) tuples.
      val categoriesRdd: RDD[(Int, String)] =
        sc.parallelize(List((1, "bar"), (2, "foo")))

      // Attach the case-class schema to each RDD, then convert to Datasets.
      val pointsDS: Dataset[Point] =
        pointRdd.map { case (label, x, y) => Point(label, x, y) }.toDS()
      // The ids are Int in the RDD but Long in Category; widen explicitly.
      val categoriesDs: Dataset[Category] =
        categoriesRdd.map { case (id, name) => Category(id.toLong, name) }.toDS()

      // Join the two Datasets on label == name and print the result.
      pointsDS.join(categoriesDs, pointsDS("label") === categoriesDs("name")).show()
    } finally {
      // Stop the session even when the job fails, so the underlying
      // SparkContext is not left running.
      spark.stop()
    }
  }

}

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值