/**
 * Demo of the six conversions among RDD, DataFrame and Dataset in Spark SQL.
 *
 * Reads "name age" lines from a local text file and round-trips them through
 * every conversion pair, printing intermediate results with show()/println.
 */
object Rdd2DataFrame {

  // Windows needs HADOOP_HOME (winutils.exe) for local Spark file access.
  System.setProperty("hadoop.home.dir", "D:\\hadoop\\hadoop-2.9.2")

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("rdd2dataframe").getOrCreate()
    val lineRdd = spark.sparkContext.textFile("D:\\test.txt")

    // RDD -> DataFrame
    val df = rdd2DataFrame(lineRdd, spark)
    // DataFrame -> RDD
    dataFrame2Rdd(df)
    // RDD -> Dataset
    val ds = rdd2Dataset(lineRdd, spark)
    // Dataset -> RDD
    dataset2Rdd(ds)
    // DataFrame -> Dataset
    val ds1 = dataFrame2DataSet(df)
    // Dataset -> DataFrame
    dataset2DataFrame(ds1)

    spark.close()
  }

  /**
   * RDD -> DataFrame: parses each "name age" line into (name, age) and
   * converts via toDF. Returns the resulting DataFrame after printing it.
   *
   * @param lineRdd raw text lines, expected format "name age"
   * @param spark   active session (needed for spark.implicits)
   */
  def rdd2DataFrame(lineRdd: RDD[String], spark: SparkSession): DataFrame = {
    // Alternative: build Row objects and supply an explicit StructType schema.
    /*
    val personRdd = lineRdd.map(line => Row(line.split(" ")(0), line.split(" ")(1).toInt))
    val fields = List(StructField("name", StringType), StructField("age", IntegerType))
    val schema = StructType(fields)
    val personDF = spark.createDataFrame(personRdd, schema)
    */
    // Chosen approach: tuple RDD + toDF via implicits.
    val personRdd = lineRdd.map { line =>
      val parts = line.split(" ") // split once per record instead of twice
      (parts(0), parts(1).toInt)
    }
    import spark.implicits._
    val personDF = personRdd.toDF("name", "age")
    personDF.show()
    personDF
  }

  /** DataFrame -> RDD[Row]: obtained directly from the .rdd accessor; prints each row. */
  def dataFrame2Rdd(df: sql.DataFrame): Unit = {
    val rdd = df.rdd
    rdd.foreach(println)
  }

  /**
   * RDD -> Dataset[Person]: parses each line into a Person and converts via toDS.
   * Returns the resulting Dataset after printing it.
   */
  def rdd2Dataset(lineRdd: RDD[String], spark: SparkSession): Dataset[Person] = {
    val personRdd = lineRdd.map { line =>
      val parts = line.split(" ") // split once per record instead of twice
      Person(parts(0), parts(1).toInt)
    }
    import spark.implicits._
    // Kryo-based serialization alternative:
    // implicit val personEncoder = org.apache.spark.sql.Encoders.kryo[Person]
    // ExpressionEncoder alternative (the Spark SQL recommended encoder):
    // implicit val personEncoder: Encoder[Person] = ExpressionEncoder()
    // Alternative: val personDS = spark.createDataset(personRdd)
    val personDS = personRdd.toDS()
    personDS.show()
    personDS
  }

  /** Dataset -> RDD: obtained directly from the .rdd accessor; prints each element. */
  def dataset2Rdd(ds: Dataset[Person]): Unit = {
    val rdd = ds.rdd
    rdd.foreach(println)
  }

  /**
   * DataFrame -> Dataset[Person] via .as[Person]; requires an Encoder[Person]
   * in implicit scope. Returns the typed Dataset after printing it.
   */
  def dataFrame2DataSet(df: sql.DataFrame): Dataset[Person] = {
    implicit val personEncoder: Encoder[Person] = ExpressionEncoder()
    val personDs = df.as[Person]
    personDs.show()
    personDs
  }

  /** Dataset -> DataFrame via toDF with explicit column names; prints the result. */
  def dataset2DataFrame(ds: Dataset[Person]): Unit = {
    val df = ds.toDF("name", "age")
    df.show()
  }
}

/** Record type backing the typed Dataset conversions above. */
case class Person(name: String, age: Int)
Spark RDD、DataFrame、Dataset 三者互转
最新推荐文章于 2023-04-26 16:39:24 发布