spark 笔记 pyspark

pyspark.sql.functions 中函数应用 DataFrame 转变

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) a local single-node SparkSession named 'test1',
# then grab its underlying SparkContext for RDD operations below.
builder = SparkSession.builder
builder = builder.appName('test1')
builder = builder.master('local')
spark = builder.getOrCreate()
sc = spark.sparkContext

构建数据集

# One JSON document per student: name, gender, age plus three subject scores.
data = [
    '{"name":"Natalie","gender":"F","age":16,"English":80,"Math":56,"Chinese":72}',
    '{"name":"Logan","gender":"M","age":18,"English":82,"Math":88,"Chinese":91}',
    '{"name":"Joseph","gender":"M","age":16,"English":75,"Math":69,"Chinese":80}',
    '{"name":"Mason","gender":"F","age":17,"English":50,"Math":58,"Chinese":70}',
    '{"name":"Harper","gender":"M","age":15,"English":65,"Math":70,"Chinese":65}',
    '{"name":"Anthony","gender":"F","age":17,"English":66,"Math":71,"Chinese":75}',
]

# Wrap each JSON string in a one-element list so every RDD record becomes a
# single-column DataFrame row named 'info'.
df = sc.parallelize(data).map(lambda s: [s]).toDF(['info'])
df.show(truncate=False)
+----------------------------------------------------------------------------+
|info                                                                        |
+----------------------------------------------------------------------------+
|{"name":"Natalie","gender":"F","age":16,"English":80,"Math":56,"Chinese":72}|
|{"name":"Logan","gender":"M","age":18,"English":82,"Math":88,"Chinese":91}  |
|{"name":"Joseph","gender":"M","age":16,"English":75,"Math":69,"Chinese":80} |
|{"name":"Mason","gender":"F","age":17,"English":50,"Math":58,"Chinese":70}  |
|{"name":"Harper","gender":"M","age":15,"English":65,"Math":70,"Chinese":65} |
|{"name":"Anthony","gender":"F","age":17,"English":66,"Math":71,"Chinese":75}|

解析json json_tuple()

解析出来全为字符串

from pyspark.sql.functions import json_tuple

# json_tuple() extracts the listed keys from the JSON string in one pass;
# every extracted value comes back as a string column.
fields = ['name', 'gender', 'age', 'English', 'Math', 'Chinese']
df = df.select(json_tuple(df.info, *fields).alias(*fields))
df.show()
+-------+------+---+-------+----+-------+
|   name|gender|age|English|Math|Chinese|
+-------+------+---+-------+----+-------+
|Natalie|     F| 16|     80|  56|     72|
|  Logan|     M| 18|     82|  88|     91|
| Joseph|     M| 16|     75|  69|     80|
|  Mason|     F| 17|     50|  58|     70|
| Harper|     M| 15|     65|  70|     65|
|Anthony|     F| 17|     66|  71|     75|
+-------+------+---+-------+----+-------+

将多列合并成一列 concat_ws()

# NOTE(fix): `col` was used below but never imported anywhere in this script
# (would raise NameError); import it alongside concat_ws.
from pyspark.sql.functions import concat_ws, col

# Merge the three score columns into one comma-separated string column 'score'.
df = df.withColumn('score', concat_ws(',', col('English'), col('Math'), col('Chinese')))\
       .select('name', 'gender', 'age', 'score')
df.show()
+-------+------+---+--------+
|   name|gender|age|   score|
+-------+------+---+--------+
|Natalie|     F| 16|80,56,72|
|  Logan|     M| 18|82,88,91|
| Joseph|     M| 16|75,69,80|
|  Mason|     F| 17|50,58,70|
| Harper|     M| 15|65,70,65|
|Anthony|     F| 17|66,71,75|
+-------+------+---+--------+

增加一列（科目）：lit()（literal 的缩写，用于生成常量列）

from pyspark.sql.functions import lit

# lit() builds a constant Column: every row gets the same subject-list string.
subjects = lit('English,Math,Chinese')
df = df.withColumn('subject', subjects)
df.show()
+-------+------+---+--------+--------------------+
|   name|gender|age|   score|             subject|
+-------+------+---+--------+--------------------+
|Natalie|     F| 16|80,56,72|English,Math,Chinese|
|  Logan|     M| 18|82,88,91|English,Math,Chinese|
| Joseph|     M| 16|75,69,80|English,Math,Chinese|
|  Mason|     F| 17|50,58,70|English,Math,Chinese|
| Harper|     M| 15|65,70,65|English,Math,Chinese|
|Anthony|     F| 17|66,71,75|English,Math,Chinese|
+-------+------+---+--------+--------------------+

分割成列表 split()

# NOTE(fix): `col` is referenced below but was never imported in this script;
# add it to the import so the snippet actually runs.
from pyspark.sql.functions import split, col

# split() turns the comma-separated strings into array<string> columns.
df = df.select('name', 'gender', 'age',
               split(col('score'), ',').alias('score_array'),
               split(col('subject'), ',').alias('subject_array'))
df.show(truncate=False)
+-------+------+---+------------+------------------------+
|name   |gender|age|score_array |subject_array           |
+-------+------+---+------------+------------------------+
|Natalie|F     |16 |[80, 56, 72]|[English, Math, Chinese]|
|Logan  |M     |18 |[82, 88, 91]|[English, Math, Chinese]|
|Joseph |M     |16 |[75, 69, 80]|[English, Math, Chinese]|
|Mason  |F     |17 |[50, 58, 70]|[English, Math, Chinese]|
|Harper |M     |15 |[65, 70, 65]|[English, Math, Chinese]|
|Anthony|F     |17 |[66, 71, 75]|[English, Math, Chinese]|
+-------+------+---+------------+------------------------+

将两个数组对应成map map_from_arrays()

from pyspark.sql.functions import map_from_arrays

# Zip subject_array (keys) with score_array (values) into a single map column.
sub_score = map_from_arrays('subject_array', 'score_array')
df = df.select('name', 'gender', 'age', sub_score.alias('sub_score_map'))
df.show(truncate=False)
+-------+------+---+------------------------------------------+
|name   |gender|age|sub_score_map                             |
+-------+------+---+------------------------------------------+
|Natalie|F     |16 |[English -> 80, Math -> 56, Chinese -> 72]|
|Logan  |M     |18 |[English -> 82, Math -> 88, Chinese -> 91]|
|Joseph |M     |16 |[English -> 75, Math -> 69, Chinese -> 80]|
|Mason  |F     |17 |[English -> 50, Math -> 58, Chinese -> 70]|
|Harper |M     |15 |[English -> 65, Math -> 70, Chinese -> 65]|
|Anthony|F     |17 |[English -> 66, Math -> 71, Chinese -> 75]|
+-------+------+---+------------------------------------------+

将map 转换成多行 explode()

from pyspark.sql.functions import explode

# explode() on a map emits one row per entry; the two aliases name the
# key column ('sub') and the value column ('score').
exploded = explode('sub_score_map').name('sub', 'score')
df = df.select('name', 'gender', 'age', exploded)
df.show()
+-------+------+---+-------+-----+
|   name|gender|age|    sub|score|
+-------+------+---+-------+-----+
|Natalie|     F| 16|English|   80|
|Natalie|     F| 16|   Math|   56|
|Natalie|     F| 16|Chinese|   72|
|  Logan|     M| 18|English|   82|
|  Logan|     M| 18|   Math|   88|
|  Logan|     M| 18|Chinese|   91|
| Joseph|     M| 16|English|   75|
| Joseph|     M| 16|   Math|   69|
| Joseph|     M| 16|Chinese|   80|
|  Mason|     F| 17|English|   50|
|  Mason|     F| 17|   Math|   58|
|  Mason|     F| 17|Chinese|   70|
| Harper|     M| 15|English|   65|
| Harper|     M| 15|   Math|   70|
| Harper|     M| 15|Chinese|   65|
|Anthony|     F| 17|English|   66|
|Anthony|     F| 17|   Math|   71|
|Anthony|     F| 17|Chinese|   75|
+-------+------+---+-------+-----+

将两列对应成 map create_map()

from pyspark.sql.functions import create_map

# create_map() pairs the 'sub' column (key) with the 'score' column (value),
# producing a single-entry map per row.
pair = create_map('sub', 'score')
df = df.select('name', 'gender', 'age', pair.alias('map'))
df.show()
+-------+------+---+---------------+
|   name|gender|age|            map|
+-------+------+---+---------------+
|Natalie|     F| 16|[English -> 80]|
|Natalie|     F| 16|   [Math -> 56]|
|Natalie|     F| 16|[Chinese -> 72]|
|  Logan|     M| 18|[English -> 82]|
|  Logan|     M| 18|   [Math -> 88]|
|  Logan|     M| 18|[Chinese -> 91]|
| Joseph|     M| 16|[English -> 75]|
| Joseph|     M| 16|   [Math -> 69]|
| Joseph|     M| 16|[Chinese -> 80]|
|  Mason|     F| 17|[English -> 50]|
|  Mason|     F| 17|   [Math -> 58]|
|  Mason|     F| 17|[Chinese -> 70]|
| Harper|     M| 15|[English -> 65]|
| Harper|     M| 15|   [Math -> 70]|
| Harper|     M| 15|[Chinese -> 65]|
|Anthony|     F| 17|[English -> 66]|
|Anthony|     F| 17|   [Math -> 71]|
|Anthony|     F| 17|[Chinese -> 75]|
+-------+------+---+---------------+

按分组把列元素收集成数组 collect_list() collect_set()

from pyspark.sql.functions import collect_list, collect_set

# Group back by student: gender/age collapse via collect_set (duplicates
# removed), while collect_list keeps every per-subject single-entry map.
grouped = df.groupBy('name')
df = grouped.agg(collect_set('gender').alias('gender'),
                 collect_set('age').alias('age'),
                 collect_list('map').alias('map'))
df.show(truncate=False)
+-------+------+----+------------------------------------------------+
|name   |gender|age |map                                             |
+-------+------+----+------------------------------------------------+
|Natalie|[F]   |[16]|[[English -> 80], [Math -> 56], [Chinese -> 72]]|
|Mason  |[F]   |[17]|[[English -> 50], [Math -> 58], [Chinese -> 70]]|
|Logan  |[M]   |[18]|[[English -> 82], [Math -> 88], [Chinese -> 91]]|
|Anthony|[F]   |[17]|[[English -> 66], [Math -> 71], [Chinese -> 75]]|
|Joseph |[M]   |[16]|[[English -> 75], [Math -> 69], [Chinese -> 80]]|
|Harper |[M]   |[15]|[[English -> 65], [Math -> 70], [Chinese -> 65]]|
+-------+------+----+------------------------------------------------+

将数组中的 map 转成一个 map，并将 age、gender 列中元素取出

这里用到 udf 自定义函数,可以将列表中的map 转成一个map
element_at() 可以将列表中某个位置的元素取出来

from pyspark.sql.functions import udf, element_at
from pyspark.sql.types import MapType
from collections import ChainMap

def lal(ar):
    """Merge a sequence of mappings into one plain dict.

    NOTE(fix): the original called ``reduce`` without importing it — in
    Python 3 ``reduce`` lives in ``functools``, so the snippet raised
    NameError. ``ChainMap(*ar)`` layers the maps exactly like the chained
    ``reduce(lambda x, y: ChainMap(x, y), ar)`` did (earlier maps take
    precedence on duplicate keys), and ``dict()`` flattens the result.
    An empty input now yields ``{}`` instead of crashing.
    """
    return dict(ChainMap(*ar))

# NOTE(fix): `StringType` and `col` were used without being imported above
# (only MapType/udf/element_at were); bring them into scope here.
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# Wrap lal() as a UDF returning map<string,string> (valueContainsNull=True),
# then flatten the array-of-maps column in place and unwrap the one-element
# gender/age arrays (element_at uses 1-based indexing).
mm_udf = udf(lal, MapType(StringType(), StringType(), True))
df = df.withColumn('map', mm_udf(col('map')))\
    .withColumn('gender', element_at(col('gender'), 1))\
    .withColumn('age', element_at(col('age'), 1))
df.show(truncate=False)
+-------+------+---+------------------------------------------+
|name   |gender|age|map                                       |
+-------+------+---+------------------------------------------+
|Natalie|F     |16 |[English -> 80, Chinese -> 72, Math -> 56]|
|Mason  |F     |17 |[English -> 50, Chinese -> 70, Math -> 58]|
|Logan  |M     |18 |[English -> 82, Chinese -> 91, Math -> 88]|
|Anthony|F     |17 |[English -> 66, Chinese -> 75, Math -> 71]|
|Joseph |M     |16 |[English -> 75, Chinese -> 80, Math -> 69]|
|Harper |M     |15 |[English -> 65, Chinese -> 65, Math -> 70]|
+-------+------+---+------------------------------------------+

将map 中的键 和 值 分开成两列 map_values() map_keys()

from pyspark.sql.functions import map_values, map_keys

# Pull the map apart again: values become the 'score' array and keys
# become the 'subject' array.
df = (df
      .withColumn('score', map_values('map'))
      .withColumn('subject', map_keys('map'))
      .select('name', 'gender', 'age', 'score', 'subject'))
df.show(truncate=False)
+-------+------+---+------------+------------------------+
|name   |gender|age|score       |subject                 |
+-------+------+---+------------+------------------------+
|Natalie|F     |16 |[80, 72, 56]|[English, Chinese, Math]|
|Mason  |F     |17 |[50, 70, 58]|[English, Chinese, Math]|
|Logan  |M     |18 |[82, 91, 88]|[English, Chinese, Math]|
|Anthony|F     |17 |[66, 75, 71]|[English, Chinese, Math]|
|Joseph |M     |16 |[75, 80, 69]|[English, Chinese, Math]|
|Harper |M     |15 |[65, 65, 70]|[English, Chinese, Math]|
+-------+------+---+------------+------------------------+

将数组元素转成字符串 array_join()

# NOTE(fix): `col` is used below but was never imported in this script;
# add it to the import line.
from pyspark.sql.functions import array_join, col

# array_join() is the inverse of split(): glue the array elements back
# into a single comma-separated string.
df = df.withColumn('score', array_join(col('score'), ','))\
        .withColumn('subject', array_join(col('subject'), ','))
df.show()
+-------+------+---+--------+--------------------+
|   name|gender|age|   score|             subject|
+-------+------+---+--------+--------------------+
|Natalie|     F| 16|80,72,56|English,Chinese,Math|
|  Mason|     F| 17|50,70,58|English,Chinese,Math|
|  Logan|     M| 18|82,91,88|English,Chinese,Math|
|Anthony|     F| 17|66,75,71|English,Chinese,Math|
| Joseph|     M| 16|75,80,69|English,Chinese,Math|
| Harper|     M| 15|65,65,70|English,Chinese,Math|
+-------+------+---+--------+--------------------+

将列中元素取出形成新列

# NOTE(fix): `col` is used below but was never imported anywhere in this
# script; split/element_at were imported in earlier snippets.
from pyspark.sql.functions import col

# Re-split the comma-separated score string and fan each position out into
# its own column. element_at() is 1-indexed, and the order here must match
# the order the scores were joined in (English, Chinese, Math).
df = df.withColumn('score_array', split(col('score'), ','))
df = df.withColumn('English', element_at(col('score_array'), 1))\
        .withColumn('Chinese', element_at(col('score_array'), 2))\
        .withColumn('Math', element_at(col('score_array'), 3))\
        .select('name', 'gender', 'age', 'English', 'Chinese', 'Math')
df.show()
+-------+------+---+-------+-------+----+
|   name|gender|age|English|Chinese|Math|
+-------+------+---+-------+-------+----+
|Natalie|     F| 16|     80|     72|  56|
|  Mason|     F| 17|     50|     70|  58|
|  Logan|     M| 18|     82|     91|  88|
|Anthony|     F| 17|     66|     75|  71|
| Joseph|     M| 16|     75|     80|  69|
| Harper|     M| 15|     65|     65|  70|
+-------+------+---+-------+-------+----+

转成json

df.select('name','gender','age','English','Chinese', 'Math').toJSON().collect()
['{"name":"Natalie","gender":"F","age":"16","English":"80","Chinese":"72","Math":"56"}',
 '{"name":"Mason","gender":"F","age":"17","English":"50","Chinese":"70","Math":"58"}',
 '{"name":"Logan","gender":"M","age":"18","English":"82","Chinese":"91","Math":"88"}',
 '{"name":"Anthony","gender":"F","age":"17","English":"66","Chinese":"75","Math":"71"}',
 '{"name":"Joseph","gender":"M","age":"16","English":"75","Chinese":"80","Math":"69"}',
 '{"name":"Harper","gender":"M","age":"15","English":"65","Chinese":"65","Math":"70"}']

你会发现数据又一步步回到了原始状态

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

viziviuz

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值