import org.apache.spark.sql.SparkSession
/**
 * Row/column transformation: merge multiple rows into one row per key.
 *
 * Example:
 * +--------+----+-----+        +--------+--------+-----+
 * |username|item|price|        |username|item    |price|
 * +--------+----+-----+        +--------+--------+-----+
 * |zhangsan|   A|    1|        |zhangsan|A,B,C,D |   12|
 * |zhangsan|   B|    2|        |lisi    |A,B,C   |   16|
 * |zhangsan|   C|    3|        |wangwu  |C       |    8|
 * |    lisi|   A|    4|        +--------+--------+-----+
 * |    lisi|   C|    5|
 * |zhangsan|   D|    6|
 * |    lisi|   B|    7|
 * |  wangwu|   C|    8|
 * +--------+----+-----+
 *
 * Spark SQL functions used:
 *  - collect_list(item)      : collects the grouped values of `item` into an array (duplicates kept).
 *  - collect_set(item)       : same as collect_list, but duplicates are removed.
 *  - concat(x, y, ...)       : concatenates strings.
 *  - concat_ws(sep, array)   : joins array elements with the separator, returning a string.
 *  - split(col, sep)         : splits a string column on the separator, returning an array.
 *  - explode(array)          : expands one row holding an array into one row per element.
 */
object RowColumnTransfer1 {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local").appName("test").getOrCreate()
    // NOTE(review): hard-coded Windows path — consider taking it from args/config for portability.
    val df = session.read.option("header", true).csv("T:/code/spark_scala/data/spark/rowcolumnData")
    df.createTempView("temp1")
    // Step 1: per user, collect all items into an array and sum the prices.
    session.sql(
      """
        | select
        | username,collect_list(item) as cl ,sum(price) as totalprice
        | from temp1
        | group by username
      """.stripMargin).createTempView("temp2")
    // Step 2: flatten the item array into a single comma-separated string.
    session.sql(
      """
        | select
        | username,concat_ws(",",cl) as cw,totalprice
        | from temp2
      """.stripMargin).createTempView("temp3")
    // Step 3: split the string back and explode — one output row per item,
    // with the group's total price repeated on every row.
    session.sql(
      """
        | select
        | username,explode(split(cw,",")) as item ,totalprice
        | from temp3
      """.stripMargin).show(100)
    // Fix: release Spark resources — the session was never stopped before.
    session.stop()
  }
}
Result:
+--------+----+----------+
|username|item|totalprice|
+--------+----+----------+
| wangwu| C| 8.0|
|zhangsan| A| 12.0|
|zhangsan| B| 12.0|
|zhangsan| C| 12.0|
|zhangsan| D| 12.0|
| lisi| A| 16.0|
| lisi| C| 16.0|
| lisi| B| 16.0|
+--------+----+----------+
Second approach:
import org.apache.spark.sql.SparkSession
/**
 * Spark SQL row/column transformation: pivot item rows into per-item columns, then back.
 *
 * +--------+----+-----+
 * |username|item|price|        +--------+----+----+---+----+
 * +--------+----+-----+        |username|   A|   B|  C|   D|
 * |zhangsan|   A|    1|        +--------+----+----+---+----+
 * |zhangsan|   B|    2|        |  wangwu|null|null|  8|null|
 * |zhangsan|   C|    3|        |zhangsan|   1|   2|  3|   6|
 * |    lisi|   A|    4|        |    lisi|   4|   7|  5|null|
 * |    lisi|   C|    5|        +--------+----+----+---+----+
 * |zhangsan|   D|    6|
 * |    lisi|   B|    7|
 * |  wangwu|   C|    8|
 * +--------+----+-----+
 *
 * Spark SQL functions used:
 *  - str_to_map(col, sep1, sep2): splits the string on sep1 into entries, then each entry
 *    on sep2 into key/value pairs, producing a Map column.
 *  - map(k1, v1, k2, v2, ...)   : builds a map from alternating key/value arguments.
 */
object RowColumnTransfer2 {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().appName("test").master("local").getOrCreate()
    session.sparkContext.setLogLevel("Error")
    // NOTE(review): hard-coded Windows path — consider taking it from args/config for portability.
    val frame = session.read
      .option("header", true)
      .csv("T:/code/spark_scala/data/spark/rowcolumnData")
    frame.createTempView("temp1")
    // Step 1: per user, encode each (item, price) pair as "item,price" and join
    // the pairs with '#' into one string, e.g. "A,1#B,2#C,3#D,6".
    session.sql(
      """
        | select
        | username,concat_ws("#",collect_list(concat(item,",",price))) as cw
        | from temp1
        | group by username
      """.stripMargin).createTempView("temp2")
    // Step 2: turn the encoded string into a Map(item -> price).
    session.sql(
      """
        | select
        | username,str_to_map(cw,"#",",") as mp
        | from temp2
      """.stripMargin).createTempView("temp3")
    // Step 3: pivot — pull each known item key out of the map into its own column.
    session.sql(
      """
        | select username ,mp['A'] as A,mp['B'] as B ,mp['C'] as C ,mp['D'] as D
        | from temp3
      """.stripMargin).createTempView("temp4")
    // Step 4: unpivot back to rows via map + explode, dropping absent (null) items.
    session.sql(
      """
        | select username,item,price
        | from
        | (select
        | username,explode(map("A",A,"B",B,"C",C,"D",D)) as (item,price)
        | from temp4) ttt
        | where price is not null
      """.stripMargin).show(100, false)
    // Fix: release Spark resources — the session was never stopped before.
    session.stop()
  }
}
+--------+----+-----+
|username|item|price|
+--------+----+-----+
|wangwu |C |8 |
|zhangsan|A |1 |
|zhangsan|B |2 |
|zhangsan|C |3 |
|zhangsan|D |6 |
|lisi |A |4 |
|lisi |B |7 |
|lisi |C |5 |
+--------+----+-----+