Spark Business Development - Column Splitting
Input data
id,data
1,"Ming,20,15552211521"
2,"hong,19,13287994007"
3,"zhi,21,15552211523"
Output data
+---+----+---+-----------+
| id| 列1|列2| 列3|
+---+----+---+-----------+
| 1|Ming| 20|15552211521|
| 2|hong| 19|13287994007|
| 3| zhi| 21|15552211523|
+---+----+---+-----------+
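The result rests on two Spark SQL building blocks: functions.split turns a delimited string column into an array column, and Column.getItem(i) extracts the i-th element of that array. A minimal standalone illustration of the mechanism (a sketch assuming a local SparkSession, not the post's code):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, split}

val spark = SparkSession.builder().master("local[*]").appName("split-demo").getOrCreate()
import spark.implicits._

val demo = Seq((1, "Ming,20,15552211521")).toDF("id", "data")

// split yields an array column; getItem(i) extracts element i of the array
demo.select(
  col("id"),
  split(col("data"), ",").getItem(0).as("列1"),
  split(col("data"), ",").getItem(1).as("列2"),
  split(col("data"), ",").getItem(2).as("列3")
).show()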
Program code
package com.cch.bigdata.spark.process.split

import com.cch.bigdata.spark.process.AbstractTransform
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}

import scala.collection.mutable.ListBuffer

class Spliter extends AbstractTransform {

  // The column to split, the separator, and the names/aliases of the new columns
  private val column = "data"
  private val separator = ","
  private val new_column_names = Array[String]("c1", "c2", "c3")
  private val new_column_cnames = Array[String]("列1", "列2", "列3")

  override def process(): Unit = {
    if (column.isEmpty) {
      throw new RuntimeException("The column to split must not be empty")
    }
    if (separator.isEmpty) {
      throw new RuntimeException("The split separator must not be empty")
    }
    if (new_column_names.isEmpty) {
      throw new RuntimeException("The new column names must not be empty")
    }

    // loadCsv and spark are provided by AbstractTransform
    val df: DataFrame = loadCsv("src/main/resources/csv/split.csv", spark)
    val names: Array[String] = df.schema.fieldNames

    // Split the target column into a temporary array column
    val splitColumnName = "split_column"
    val splitDataFrame: DataFrame = df.withColumn(
      splitColumnName,
      functions.split(col(column), separator)
    )

    // Keep every original column except the one being split
    val list: ListBuffer[Column] = new ListBuffer()
    names.foreach(c => {
      if (!c.equals(column)) {
        list.append(col(c))
      }
    })

    // Append each array element as a new column, aliased with its display name
    new_column_names.indices.foreach(index => {
      list.append(col(splitColumnName).getItem(index).as(new_column_cnames(index)))
    })

    // The temporary array column is not in the select list, so no drop is needed
    splitDataFrame.select(list: _*).show()
  }

  override def getAppName(): String = "Column Splitting"
}

object Spliter {
  def main(args: Array[String]): Unit = {
    new Spliter().process()
  }
}
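For comparison, the same output can be produced more compactly with selectExpr, using SQL's split function and array indexing (a sketch assuming the df loaded in process() above):

// Equivalent, more compact form; backticks quote the non-ASCII aliases
df.selectExpr(
  "id",
  "split(data, ',')[0] as `列1`",
  "split(data, ',')[1] as `列2`",
  "split(data, ',')[2] as `列3`"
).show()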
Parameter descriptions
- column: the name of the column to split
- separator: the delimiter on which the column's values are split
- new_column_names: the internal names assigned to the columns produced by the split
- new_column_cnames: display aliases for the new columns; these become the column headers in the output, so the array must have one entry per split field
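One caveat: the separator passed to functions.split is interpreted as a Java regular expression, not a literal string. A plain comma is safe as written, but separators such as | or . must be escaped, for example:

// The separator is a regex: an unescaped "|" would split between every
// character, so escape it (or wrap it with java.util.regex.Pattern.quote)
functions.split(col("data"), "\\|")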