/**
 * Sample record used by the demo: one string field plus a nested record.
 *
 * @param a plain string value
 * @param b nested [[TestIn]] record
 */
case class Test(
  a: String,
  b: TestIn
)
/**
 * Nested sample record carrying a single string field.
 *
 * @param c plain string value
 */
case class TestIn(
  c: String
)
object Test {

  /**
   * Demo entry point: add a dynamic column to a dataset.
   *
   * Builds a small RDD of `(segment, Test)` pairs, turns it into a DataFrame whose
   * columns are the fields of `Test` (`a`, `b`) followed by a trailing `segment`
   * string column, and prints it with `show()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("test")
      .master("local[*]")
      // .enableHiveSupport()
      .getOrCreate()

    try {
      import spark.implicits._

      val segmentColumn = "segment"
      val valueColumn = "vo"

      val rdd = spark.sparkContext.parallelize(Seq(
        ("202002100000", Test("v1", TestIn("v2"))),
        ("202002100005", Test("v3", TestIn("v4")))
      ))

      // Keep each key together with its value object and convert once. This avoids
      // splitting the RDD in two and re-joining it with `zip` (which relies on both
      // sides having identical partitioning), and it avoids running a job via
      // `first` just to discover the schema, which would also fail on empty input.
      val df = rdd
        .toDF(segmentColumn, valueColumn)
        // "vo.*" expands the struct into its top-level fields (a, b);
        // the extra column is appended after them.
        .select(s"$valueColumn.*", segmentColumn)

      df.show()
    } finally {
      // Release the session even when one of the steps above throws.
      spark.stop()
    }
  }
}
// 为样例类 RDD 动态加入一个字段，并将其转换为 DataFrame
最新推荐文章于 2022-08-18 11:08:42 发布