import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

/**
 * Renames the DataFrame's columns by index position to match the target table,
 * and force-casts any column whose name matches but whose data type differs.
 *
 * @param df      source DataFrame
 * @param sqlName target table name
 * @return        the converted DataFrame
 */
def castDf(df: DataFrame, sqlName: String): DataFrame = {
  // Read the target table's schema without pulling any data
  // (assumes a SparkSession named `sparkSession` is in scope).
  val targetSql: String = s"select * from ${sqlName} limit 0"
  val schemaDf = sparkSession.sql(targetSql)
  val schemaMap = schemaDf.schema.fields.map(struct => (struct.name, struct.dataType)).toMap

  var newDF = df
  val sourceFields = df.schema.fieldNames
  val targetFields = schemaDf.schema.fieldNames
  println(s"castDf column counts --> source: ${sourceFields.length}, target: ${targetFields.length}")

  // Rename source columns to the target column name at the same index position
  // (use the smaller of the two counts to avoid an out-of-bounds index on a mismatch).
  val size = math.min(sourceFields.length, targetFields.length)
  Range(0, size)
    .filter(i => !sourceFields(i).equals(targetFields(i)))
    .foreach(i => newDF = newDF.withColumnRenamed(sourceFields(i), targetFields(i)))

  val sourceMap = newDF.schema.fields.map(struct => (struct.name, struct.dataType)).toMap

  // Only columns present in both schemas with differing data types need a cast.
  targetFields
    .filter(f => sourceMap.contains(f) && !schemaMap.get(f).equals(sourceMap.get(f)))
    .foreach { field =>
      newDF = newDF.withColumn(field, col(field).cast(schemaMap.getOrElse(field, StringType)))
    }
  newDF
}
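A minimal usage sketch of how this method might be wired into a write job. The source path, the table name dw.events, and the SparkSession value spark are placeholders, not part of the original code; the only assumption is that the target already exists as a Parquet-backed table. insertInto resolves columns by position, which is why the positional rename plus cast above is done first.

// Hypothetical usage; path and table name are placeholders.
val rawDf = spark.read.parquet("/data/staging/events")   // source data with arbitrary names/types
val aligned = castDf(rawDf, "dw.events")                  // rename by position and cast to the target schema
aligned.write
  .mode("append")
  .insertInto("dw.events")                                // insertInto matches columns positionally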