首先创建一份测试数据
# Build a Spark session and a small test DataFrame of (subject, student, score).
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# FIX: the original split the builder chain across bare lines, which is a
# SyntaxError in Python -- a multi-line method chain must be wrapped in
# parentheses (or each line ended with a backslash).
spark = (
    SparkSession.builder
    .appName('TestAPP')
    .enableHiveSupport()
    .getOrCreate()
)

# One row per (item=subject, name=student, score); ordered by subject so the
# show() output below groups rows by subject.
df = spark.createDataFrame(
    [('math', 'alice', 88), ('chinese', 'alice', 92), ('english', 'alice', 77),
     ('math', 'bob', 65), ('chinese', 'bob', 87), ('english', 'bob', 90),
     ('math', 'cary', 67), ('chinese', 'cary', 33), ('english', 'cary', 24),
     ('math', 'josh', 77), ('chinese', 'josh', 87), ('english', 'josh', 90)],
    ['item', 'name', 'score'],
).orderBy('item')
df.show()
+-------+-----+-----+
| item| name|score|
+-------+-----+-----+
|chinese| cary| 33|
|chinese|alice| 92|
|chinese| josh| 87|
|chinese| bob| 87|
|english| josh| 90|
|english| bob| 90|
|english| cary| 24|
|english|alice| 77|
| math|alice| 88|
| math| bob| 65|
| math| cary| 67|
| math| josh| 77|
+-------+-----+-----+
1.行转列通过pivot
有以下两种方式
方法一:
# Rows-to-columns (pivot): register the DataFrame as a temp view, then use the
# SQL PIVOT clause to turn each student name into its own column.
df.createOrReplaceTempView('scores')
pivot_sql = '''select * from scores
pivot
(
sum(score) for
name in ('alice','bob','cary','josh')
)
'''
df_pivot = spark.sql(pivot_sql)
df_pivot.show()
+-------+-----+---+----+----+
| item|alice|bob|cary|josh|
+-------+-----+---+----+----+
|chinese| 92| 87| 33| 87|
|english| 77| 90| 24| 90|
| math| 88| 65| 67| 77|
+-------+-----+---+----+----+
方法二:
# Rows-to-columns via the DataFrame API: groupBy + pivot gives the same result
# as the SQL PIVOT clause above.
from pyspark.sql import functions as F

grouped = df.groupBy('item')
grouped.pivot('name').sum('score').show()
+-------+-----+---+----+----+
| item|alice|bob|cary|josh|
+-------+-----+---+----+----+
|chinese| 92| 87| 33| 87|
|english| 77| 90| 24| 90|
| math| 88| 65| 67| 77|
+-------+-----+---+----+----+
2. 列转行通过stack
# Columns-to-rows (unpivot): stack() folds the four per-student columns back
# into (name, score) rows, one output row per listed (label, column) pair.
df_pivot.createOrReplaceTempView('v_pivot')
unpivot_sql = '''select item,
stack(4, 'alice', alice, 'bob', bob, 'cary', cary, 'josh', josh) as (name, score )
from v_pivot
'''
df_unpivot1 = spark.sql(unpivot_sql)
df_unpivot1.show()
+-------+-----+-----+
| item| name|score|
+-------+-----+-----+
|chinese|alice| 92|
|chinese| bob| 87|
|chinese| cary| 33|
|chinese| josh| 87|
|english|alice| 77|
|english| bob| 90|
|english| cary| 24|
|english| josh| 90|
| math|alice| 88|
| math| bob| 65|
| math| cary| 67|
| math| josh| 77|
+-------+-----+-----+
实际生产上,需要转换的列很多的时候,可以通过如下方式处理
# All value columns, i.e. everything except the 'item' key column.
# FIX: the original test was `col not in 'item'`, which is a SUBSTRING check --
# it would also silently drop any column whose name happens to be a substring
# of 'item' (e.g. 'it', 'te', 'em'). Use a direct inequality instead.
app_columns = [col for col in df_pivot.columns if col != 'item']
app_columns
['alice', 'bob', 'cary', 'josh']
# Build the stack() argument list dynamically: "'col', col" for each column.
app_stack_str = ','.join("'%s', %s" % (name, name) for name in app_columns)
app_stack_str
"'alice', alice,'bob', bob,'cary', cary,'josh', josh"
# Cast every column to string first so stack() sees one uniform type, then
# unpivot with a dynamically generated stack() expression.
df_pivot1 = df_pivot.select(
    *(df_pivot[c].cast("string") for c in df_pivot.columns)
)
stack_expr = "stack(%s, %s) as (feature, value)" % (len(app_columns), app_stack_str)
df_unpivot1 = df_pivot1.selectExpr("item", stack_expr)
df_unpivot1.show()
+-------+-------+-----+
| item|feature|value|
+-------+-------+-----+
|chinese| alice| 92|
|chinese| bob| 87|
|chinese| cary| 33|
|chinese| josh| 87|
|english| alice| 77|
|english| bob| 90|
|english| cary| 24|
|english| josh| 90|
| math| alice| 88|
| math| bob| 65|
| math| cary| 67|
| math| josh| 77|
+-------+-------+-----+