关于在计算过程中动态添加字段的处理方式,参考网上的方案整理如下,作个记录。
package com.ku.test
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable
object TestAddField0 {

  /**
   * Demonstrates appending dynamically-configured fields to each Row of a
   * DataFrame by splitting a packed string column and rebuilding the Row
   * with `GenericRowWithSchema`.
   *
   * Fixes over the original write-up:
   *  - `fieldsConf` is a `LinkedHashMap`: a plain `HashMap` has no defined
   *    iteration order, so the schema fields added from it could come out in
   *    a different order than the values appended to the row buffer.
   *  - Each token is coerced to its declared `DataType` (IntegerType gets a
   *    real Int), so the new schema no longer claims IntegerType for a
   *    String value.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // Sample input: "age_gender_city" packed into a single string column.
    val sourceFile: DataFrame = Seq(
      (1, "24_男_上海市"),
      (2, "33_女_北京市")
    ).toDF("id", "message")
    sourceFile.show()

    // LinkedHashMap preserves insertion order, so the field order here is
    // guaranteed to match the order of the tokens split out of `message`.
    val fieldsConf = new mutable.LinkedHashMap[String, DataType]()
    fieldsConf += (("age", IntegerType))
    fieldsConf += (("gender", StringType))
    fieldsConf += (("address", StringType))

    val dataSF: RDD[Row] = sourceFile.rdd.map { row =>
      val message = row.getAs[String]("message")
      // Existing row values; the new fields are appended after them.
      val buffer = Row.unapplySeq(row).get.toBuffer

      // Pair each token with its configured field, converting the token so
      // the stored value actually matches the declared DataType.
      message.split("_").zip(fieldsConf.toSeq).foreach {
        case (token, (_, IntegerType)) => buffer.append(token.toInt)
        case (token, _)                => buffer.append("new_" + token)
      }

      // Extend the original schema with the configured fields (same order
      // as the values appended above).
      var schemaNew: StructType = row.schema
      fieldsConf.foreach { case (name, dataType) =>
        schemaNew = schemaNew.add(name, dataType)
      }

      // GenericRowWithSchema is the Row subclass that carries a schema,
      // letting downstream code access the new fields by name.
      new GenericRowWithSchema(buffer.toArray, schemaNew).asInstanceOf[Row]
    }
    dataSF.foreach(row => println(row))
  }
}
两次输出内容如下:
+---+------------+
| id| message|
+---+------------+
| 1|24_男_上海市|
| 2|33_女_北京市|
+---+------------+
[1,24_男_上海市,new_24,new_男,new_上海市]
[2,33_女_北京市,new_33,new_女,new_北京市]
Process finished with exit code 0