pyspark dataframe 列的合并与拆分,单行转多行

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row, SQLContext

from pyspark.sql.functions import udf, col, concat, concat_ws, explode, split
from pyspark.sql.types import MapType, IntegerType, StringType, FloatType, ArrayType
from pyspark.ml.feature import VectorAssembler
from itertools import chain
from pyspark.ml.linalg import Vectors, VectorUDT
import json

spark = SparkSession.builder.appName('test1') \
        .master('local') \
        .config('spark.executor.memory','4g') \
        .config('spark.executor.cores', '4') \
        .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

构建示例数据

test_score = spark.createDataFrame([['王强', '{"语文":119,"数学":125,"外语":120,"物理":96,"化学":70,"生物":40}'],
                                    ['刘明', '{"语文":105,"数学":121,"外语":110,"物理":71,"化学":74,"生物":45}'],
                                    ['张磊', '{"语文":110,"数学":103,"外语":124,"物理":86,"化学":52,"生物":55}']],['name','score'])
test_score.show(5,truncate=False,vertical=False)
test_score.printSchema()
+----+----------------------------------------------------------------+
|name|score                                                           |
+----+----------------------------------------------------------------+
|王强|{"语文":119,"数学":125,"外语":120,"物理":96,"化学":70,"生物":40}|
|刘明|{"语文":105,"数学":121,"外语":110,"物理":71,"化学":74,"生物":45}|
|张磊|{"语文":110,"数学":103,"外语":124,"物理":86,"化学":52,"生物":55}|
+----+----------------------------------------------------------------+

root
 |-- name: string (nullable = true)
 |-- score: string (nullable = true)

将数据中 score 列转换成映射

def udf_array_to_map(array):
    if array is None:
        return array
    return dict((i, j) for i,j in json.loads(array).items())
                                                
score_udf = udf(lambda x: udf_array_to_map(x), MapType(StringType(), IntegerType(), True))

test_score_map = test_score.withColumn('score_map', score_udf(col('score'))).select('name','score_map')
test_score_map.show(truncate=False,vertical=False)
test_score_map.printSchema()
+----+---------------------------------------------------------------------------+
|name|score_map                                                                  |
+----+---------------------------------------------------------------------------+
|王强|[物理 -> 96, 生物 -> 40, 化学 -> 70, 数学 -> 125, 语文 -> 119, 外语 -> 120]|
|刘明|[物理 -> 71, 生物 -> 45, 化学 -> 74, 数学 -> 121, 语文 -> 105, 外语 -> 110]|
|张磊|[物理 -> 86, 生物 -> 55, 化学 -> 52, 数学 -> 103, 语文 -> 110, 外语 -> 124]|
+----+---------------------------------------------------------------------------+

root
 |-- name: string (nullable = true)
 |-- score_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)

利用 explode 函数将 score_map 列中每行分成多行

test_score_explode = test_score_map.select('name', explode('score_map').alias('subject', 'score'))
test_score_explode .show()
test_score_explode.printSchema()
+----+-------+-----+
|name|subject|score|
+----+-------+-----+
|王强|   物理|   96|
|王强|   生物|   40|
|王强|   化学|   70|
|王强|   数学|  125|
|王强|   语文|  119|
|王强|   外语|  120|
|刘明|   物理|   71|
|刘明|   生物|   45|
|刘明|   化学|   74|
|刘明|   数学|  121|
|刘明|   语文|  105|
|刘明|   外语|  110|
|张磊|   物理|   86|
|张磊|   生物|   55|
|张磊|   化学|   52|
|张磊|   数学|  103|
|张磊|   语文|  110|
|张磊|   外语|  124|
+----+-------+-----+

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = false)
 |-- score: integer (nullable = true)

score_map 中的数据拆分成多个列

test_score_unfold = test_score_map.withColumn('语文', test_score_map['score_map'].getItem('语文'))\
    .withColumn('数学', test_score_map['score_map'].getItem('数学'))\
    .withColumn('外语', test_score_map['score_map'].getItem('外语'))\
    .withColumn('物理', test_score_map['score_map'].getItem('物理'))\
    .withColumn('化学', test_score_map['score_map'].getItem('化学'))\
    .withColumn('生物', test_score_map['score_map'].getItem('生物'))\
    .select('name','语文','数学','外语','物理','化学','生物')

test_score_unfold.show()
test_score_unfold.printSchema()
+----+----+----+----+----+----+----+
|name|语文|数学|外语|物理|化学|生物|
+----+----+----+----+----+----+----+
|王强| 119| 125| 120|  96|  70|  40|
|刘明| 105| 121| 110|  71|  74|  45|
|张磊| 110| 103| 124|  86|  52|  55|
+----+----+----+----+----+----+----+

root
 |-- name: string (nullable = true)
 |-- 语文: integer (nullable = true)
 |-- 数学: integer (nullable = true)
 |-- 外语: integer (nullable = true)
 |-- 物理: integer (nullable = true)
 |-- 化学: integer (nullable = true)
 |-- 生物: integer (nullable = true)

concat 将多列合并成一列

test_score_fold = test_score_unfold.withColumn("score_concat", concat(test_score_unfold['语文'], \
                                                test_score_unfold['数学'], test_score_unfold['外语'],\
                                                test_score_unfold['物理'], test_score_unfold['化学'],\
                                                test_score_unfold['生物'] )).select("name","score_concat")
test_score_fold.show()
test_score_fold.printSchema()
+----+---------------+
|name|   score_concat|
+----+---------------+
|王强|119125120967040|
|刘明|105121110717445|
|张磊|110103124865255|
+----+---------------+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = true)

concat_ws 将多列合并成一列 concat_ws可选择分割符

test_score_fold_ = test_score_unfold.withColumn("score_concat", concat_ws(',',test_score_unfold['语文'], \
                                                test_score_unfold['数学'], test_score_unfold['外语'],\
                                                test_score_unfold['物理'], test_score_unfold['化学'],\
                                                test_score_unfold['生物'] )).select("name","score_concat")
test_score_fold_.show()
test_score_fold_.printSchema()
+----+--------------------+
|name|        score_concat|
+----+--------------------+
|王强|119,125,120,96,70,40|
|刘明|105,121,110,71,74,45|
|张磊|110,103,124,86,52,55|
+----+--------------------+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)

split 将上面的数据转化成 array

test_score_split = test_score_fold_.withColumn("score_array", split(test_score_fold_['score_concat'], ","))
test_score_split.show(truncate=False,vertical=False)
test_score_split.printSchema()
+----+--------------------+---------------------------+
|name|score_concat        |score_array                |
+----+--------------------+---------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|
+----+--------------------+---------------------------+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)
 |-- score_array: array (nullable = false)
 |    |-- element: string (containsNull = true)

另一种方法

def div(data):
    return data.split(',')

udf_split = udf(div, ArrayType(StringType()))

test_score_split_ = test_score_fold_.withColumn('score_array',udf_split(col('score_concat')))
test_score_split_.show(truncate=False)
test_score_split_.printSchema()
+----+--------------------+---------------------------+
|name|score_concat        |score_array                |
+----+--------------------+---------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|
+----+--------------------+---------------------------+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)
 |-- score_array: array (nullable = true)
 |    |-- element: string (containsNull = true)

将 array 转成 vector

def to_vec(data):
    return Vectors.dense(data)

u_t = udf(to_vec, VectorUDT())

test_score_vector = test_score_split_.withColumn('score_vector',u_t(col('score_array')))
test_score_vector.show(truncate=False)
test_score_vector.printSchema()
root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)
 |-- score_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- score_vector: vector (nullable = true)

将 vector 类型 转成 array

def to_arr(data):
    return [float(i) for i in data]

udfage = udf(to_arr, ArrayType(FloatType()))

test_score_array = test_score_vector.withColumn('score_vector_to_array',udfage('score_vector'))
test_score_array.show(truncate=False)
test_score_array.printSchema()
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+
|name|score_concat        |score_array                |score_vector                      |score_vector_to_array                  |
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|[119.0,125.0,120.0,96.0,70.0,40.0]|[119.0, 125.0, 120.0, 96.0, 70.0, 40.0]|
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|[105.0,121.0,110.0,71.0,74.0,45.0]|[105.0, 121.0, 110.0, 71.0, 74.0, 45.0]|
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|[110.0,103.0,124.0,86.0,52.0,55.0]|[110.0, 103.0, 124.0, 86.0, 52.0, 55.0]|
+----+--------------------+---------------------------+----------------------------------+---------------------------------------+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)
 |-- score_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- score_vector: vector (nullable = true)
 |-- score_vector_to_array: array (nullable = true)
 |    |-- element: float (containsNull = true)

将上面转化后的数据变成多列

first_row = test_score_fold_.first()
numAttrs = len(first_row['score_concat'].split(","))
print("新增列的个数", numAttrs)
attrs = sc.parallelize(["语文","数学", "外语", "物理", "化学", "生物"]).zipWithIndex().collect()
print("列名:", attrs)
for name, index in attrs:
    df_split = df_split.withColumn(name, df_split['s'][index])

df_split.show(truncate=False,vertical=False)
新增列的个数 6
列名: [('语文', 0), ('数学', 1), ('外语', 2), ('物理', 3), ('化学', 4), ('生物', 5)]
+----+--------------------+---------------------------+----+----+----+----+----+----+
|name|score_concat        |s                          |语文|数学|外语|物理|化学|生物|
+----+--------------------+---------------------------+----+----+----+----+----+----+
|王强|119,125,120,96,70,40|[119, 125, 120, 96, 70, 40]|119 |125 |120 |96  |70  |40  |
|刘明|105,121,110,71,74,45|[105, 121, 110, 71, 74, 45]|105 |121 |110 |71  |74  |45  |
|张磊|110,103,124,86,52,55|[110, 103, 124, 86, 52, 55]|110 |103 |124 |86  |52  |55  |
+----+--------------------+---------------------------+----+----+----+----+----+----+

将单行数据变成多行

df_explode = test_score_fold_.withColumn("e", explode(split(test_score_fold_['score_concat'], ",")))
df_explode.show()
df_explode.printSchema()
+----+--------------------+---+
|name|        score_concat|  e|
+----+--------------------+---+
|王强|119,125,120,96,70,40|119|
|王强|119,125,120,96,70,40|125|
|王强|119,125,120,96,70,40|120|
|王强|119,125,120,96,70,40| 96|
|王强|119,125,120,96,70,40| 70|
|王强|119,125,120,96,70,40| 40|
|刘明|105,121,110,71,74,45|105|
|刘明|105,121,110,71,74,45|121|
|刘明|105,121,110,71,74,45|110|
|刘明|105,121,110,71,74,45| 71|
|刘明|105,121,110,71,74,45| 74|
|刘明|105,121,110,71,74,45| 45|
|张磊|110,103,124,86,52,55|110|
|张磊|110,103,124,86,52,55|103|
|张磊|110,103,124,86,52,55|124|
|张磊|110,103,124,86,52,55| 86|
|张磊|110,103,124,86,52,55| 52|
|张磊|110,103,124,86,52,55| 55|
+----+--------------------+---+

root
 |-- name: string (nullable = true)
 |-- score_concat: string (nullable = false)
 |-- e: string (nullable = true)

a = [Row(id='ID1', tokens=['one', 'two', 'two'], bigrams=['one two', 'two two'])]
b = spark.createDataFrame(a)
b.show()
b.printSchema()
+------------------+---+---------------+
|           bigrams| id|         tokens|
+------------------+---+---------------+
|[one two, two two]|ID1|[one, two, two]|
+------------------+---+---------------+

root
 |-- bigrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
def concat_(*args):
    return list(chain(*args))

lll = udf(concat_, ArrayType(StringType()))

c = b.withColumn('bein',lll(col("tokens"), col("bigrams"))) 
c.show(truncate=False)
c.printSchema()
+------------------+---+---------------+---------------------------------+
|bigrams           |id |tokens         |bein                             |
+------------------+---+---------------+---------------------------------+
|[one two, two two]|ID1|[one, two, two]|[one, two, two, one two, two two]|
+------------------+---+---------------+---------------------------------+

root
 |-- bigrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- bein: array (nullable = true)
 |    |-- element: string (containsNull = true)
  • 7
    点赞
  • 26
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

viziviuz

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值