使用 pyspark.sql.functions 中的函数对 DataFrame 进行转换
# Create the local Spark session and SparkContext used by every example below.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('test1')
    .master('local')
    .getOrCreate()
)
sc = spark.sparkContext
构建数据集
# Toy dataset: one JSON string per student record, loaded into a
# single-column DataFrame whose column is named 'info'.
data = ['{"name":"Natalie","gender":"F","age":16,"English":80,"Math":56,"Chinese":72}',
'{"name":"Logan","gender":"M","age":18,"English":82,"Math":88,"Chinese":91}',
'{"name":"Joseph","gender":"M","age":16,"English":75,"Math":69,"Chinese":80}',
'{"name":"Mason","gender":"F","age":17,"English":50,"Math":58,"Chinese":70}',
'{"name":"Harper","gender":"M","age":15,"English":65,"Math":70,"Chinese":65}',
'{"name":"Anthony","gender":"F","age":17,"English":66,"Math":71,"Chinese":75}']
df = spark.createDataFrame([(record,) for record in data], ['info'])
df.show(truncate=False)
+----------------------------------------------------------------------------+
|info |
+----------------------------------------------------------------------------+
|{"name":"Natalie","gender":"F","age":16,"English":80,"Math":56,"Chinese":72}|
|{"name":"Logan","gender":"M","age":18,"English":82,"Math":88,"Chinese":91} |
|{"name":"Joseph","gender":"M","age":16,"English":75,"Math":69,"Chinese":80} |
|{"name":"Mason","gender":"F","age":17,"English":50,"Math":58,"Chinese":70} |
|{"name":"Harper","gender":"M","age":15,"English":65,"Math":70,"Chinese":65} |
|{"name":"Anthony","gender":"F","age":17,"English":66,"Math":71,"Chinese":75}|
解析 json:json_tuple()
注意:解析出来的字段全为字符串类型
# Parse the JSON column with json_tuple(); every extracted field is a string.
from pyspark.sql.functions import json_tuple

fields = ('name', 'gender', 'age', 'English', 'Math', 'Chinese')
df = df.select(json_tuple(df.info, *fields).alias(*fields))
df.show()
+-------+------+---+-------+----+-------+
| name|gender|age|English|Math|Chinese|
+-------+------+---+-------+----+-------+
|Natalie| F| 16| 80| 56| 72|
| Logan| M| 18| 82| 88| 91|
| Joseph| M| 16| 75| 69| 80|
| Mason| F| 17| 50| 58| 70|
| Harper| M| 15| 65| 70| 65|
|Anthony| F| 17| 66| 71| 75|
+-------+------+---+-------+----+-------+
将多列合并成一列 concat_ws()
# Merge the three score columns into one comma-separated 'score' column.
# Fix: `col` is used below but was never imported anywhere in this script.
from pyspark.sql.functions import concat_ws, col
df = df.withColumn('score', concat_ws(',', col('English'), col('Math'), col('Chinese')))\
    .select('name', 'gender', 'age', 'score')
df.show()
+-------+------+---+--------+
| name|gender|age| score|
+-------+------+---+--------+
|Natalie| F| 16|80,56,72|
| Logan| M| 18|82,88,91|
| Joseph| M| 16|75,69,80|
| Mason| F| 17|50,58,70|
| Harper| M| 15|65,70,65|
|Anthony| F| 17|66,71,75|
+-------+------+---+--------+
增加一列 (科目) lit()
literal
# lit() ("literal") appends a constant column holding the subject list.
from pyspark.sql.functions import lit

df = df.select('*', lit('English,Math,Chinese').alias('subject'))
df.show()
+-------+------+---+--------+--------------------+
| name|gender|age| score| subject|
+-------+------+---+--------+--------------------+
|Natalie| F| 16|80,56,72|English,Math,Chinese|
| Logan| M| 18|82,88,91|English,Math,Chinese|
| Joseph| M| 16|75,69,80|English,Math,Chinese|
| Mason| F| 17|50,58,70|English,Math,Chinese|
| Harper| M| 15|65,70,65|English,Math,Chinese|
|Anthony| F| 17|66,71,75|English,Math,Chinese|
+-------+------+---+--------+--------------------+
分割成列表 split()
# Split the comma-separated strings into arrays.
# Fix: `col` is used below but was never imported anywhere in this script.
from pyspark.sql.functions import split, col
df = df.select('name', 'gender', 'age',
               split(col('score'), ',').alias('score_array'),
               split(col('subject'), ',').alias('subject_array'))
df.show(truncate=False)
+-------+------+---+------------+------------------------+
|name |gender|age|score_array |subject_array |
+-------+------+---+------------+------------------------+
|Natalie|F |16 |[80, 56, 72]|[English, Math, Chinese]|
|Logan |M |18 |[82, 88, 91]|[English, Math, Chinese]|
|Joseph |M |16 |[75, 69, 80]|[English, Math, Chinese]|
|Mason |F |17 |[50, 58, 70]|[English, Math, Chinese]|
|Harper |M |15 |[65, 70, 65]|[English, Math, Chinese]|
|Anthony|F |17 |[66, 71, 75]|[English, Math, Chinese]|
+-------+------+---+------------+------------------------+
将两个数组对应成map map_from_arrays()
# Zip the two arrays element-wise into a map: subject -> score.
from pyspark.sql.functions import map_from_arrays

df = df.select(
    'name', 'gender', 'age',
    map_from_arrays(df['subject_array'], df['score_array']).name('sub_score_map'),
)
df.show(truncate=False)
+-------+------+---+------------------------------------------+
|name |gender|age|sub_score_map |
+-------+------+---+------------------------------------------+
|Natalie|F |16 |[English -> 80, Math -> 56, Chinese -> 72]|
|Logan |M |18 |[English -> 82, Math -> 88, Chinese -> 91]|
|Joseph |M |16 |[English -> 75, Math -> 69, Chinese -> 80]|
|Mason |F |17 |[English -> 50, Math -> 58, Chinese -> 70]|
|Harper |M |15 |[English -> 65, Math -> 70, Chinese -> 65]|
|Anthony|F |17 |[English -> 66, Math -> 71, Chinese -> 75]|
+-------+------+---+------------------------------------------+
将map 转换成多行 explode()
# explode() turns each map entry into its own row (key column + value column).
from pyspark.sql.functions import explode

exploded = explode(df['sub_score_map']).alias('sub', 'score')
df = df.select('name', 'gender', 'age', exploded)
df.show()
+-------+------+---+-------+-----+
| name|gender|age| sub|score|
+-------+------+---+-------+-----+
|Natalie| F| 16|English| 80|
|Natalie| F| 16| Math| 56|
|Natalie| F| 16|Chinese| 72|
| Logan| M| 18|English| 82|
| Logan| M| 18| Math| 88|
| Logan| M| 18|Chinese| 91|
| Joseph| M| 16|English| 75|
| Joseph| M| 16| Math| 69|
| Joseph| M| 16|Chinese| 80|
| Mason| F| 17|English| 50|
| Mason| F| 17| Math| 58|
| Mason| F| 17|Chinese| 70|
| Harper| M| 15|English| 65|
| Harper| M| 15| Math| 70|
| Harper| M| 15|Chinese| 65|
|Anthony| F| 17|English| 66|
|Anthony| F| 17| Math| 71|
|Anthony| F| 17|Chinese| 75|
+-------+------+---+-------+-----+
将两列对应成 map create_map()
# create_map() pairs two columns into a single-entry map per row.
from pyspark.sql.functions import create_map

pair = create_map('sub', 'score').alias('map')
df = df.select('name', 'gender', 'age', pair)
df.show()
+-------+------+---+---------------+
| name|gender|age| map|
+-------+------+---+---------------+
|Natalie| F| 16|[English -> 80]|
|Natalie| F| 16| [Math -> 56]|
|Natalie| F| 16|[Chinese -> 72]|
| Logan| M| 18|[English -> 82]|
| Logan| M| 18| [Math -> 88]|
| Logan| M| 18|[Chinese -> 91]|
| Joseph| M| 16|[English -> 75]|
| Joseph| M| 16| [Math -> 69]|
| Joseph| M| 16|[Chinese -> 80]|
| Mason| F| 17|[English -> 50]|
| Mason| F| 17| [Math -> 58]|
| Mason| F| 17|[Chinese -> 70]|
| Harper| M| 15|[English -> 65]|
| Harper| M| 15| [Math -> 70]|
| Harper| M| 15|[Chinese -> 65]|
|Anthony| F| 17|[English -> 66]|
|Anthony| F| 17| [Math -> 71]|
|Anthony| F| 17|[Chinese -> 75]|
+-------+------+---+---------------+
按分组把列元素收集成数组 collect_list()
collect_set()
# Group by name and collect the per-row values back into arrays:
# collect_set() de-duplicates, collect_list() keeps every element.
from pyspark.sql.functions import collect_list, collect_set

df = df.groupBy('name').agg(
    collect_set('gender').alias('gender'),
    collect_set('age').alias('age'),
    collect_list('map').alias('map'),
)
df.show(truncate=False)
+-------+------+----+------------------------------------------------+
|name |gender|age |map |
+-------+------+----+------------------------------------------------+
|Natalie|[F] |[16]|[[English -> 80], [Math -> 56], [Chinese -> 72]]|
|Mason |[F] |[17]|[[English -> 50], [Math -> 58], [Chinese -> 70]]|
|Logan |[M] |[18]|[[English -> 82], [Math -> 88], [Chinese -> 91]]|
|Anthony|[F] |[17]|[[English -> 66], [Math -> 71], [Chinese -> 75]]|
|Joseph |[M] |[16]|[[English -> 75], [Math -> 69], [Chinese -> 80]]|
|Harper |[M] |[15]|[[English -> 65], [Math -> 70], [Chinese -> 65]]|
+-------+------+----+------------------------------------------------+
将数组中的多个 map 合并成一个 map,并将 age、gender 列中的单个元素取出
这里用到 udf 自定义函数,可以将列表中的map 转成一个map
element_at() 可以将列表中某个位置的元素取出来
# Merge the list of single-entry maps into one map per row via a UDF, and
# pull the lone gender/age element out of its collected array.
# Fixes: `reduce` (functools), `StringType` and `col` were all used below but
# never imported anywhere in this script.
from functools import reduce
from pyspark.sql.functions import udf, element_at, col
from pyspark.sql.types import MapType, StringType
from collections import ChainMap

def lal(ar):
    """Merge a list of dicts into one dict (earlier entries win on key clash)."""
    return dict(reduce(lambda x, y: ChainMap(x, y), ar))

mm_udf = udf(lal, MapType(StringType(), StringType(), True))
df = df.withColumn('map', mm_udf(col('map')))\
    .withColumn('gender', element_at(col('gender'), 1))\
    .withColumn('age', element_at(col('age'), 1))
df.show(truncate=False)
+-------+------+---+------------------------------------------+
|name |gender|age|map |
+-------+------+---+------------------------------------------+
|Natalie|F |16 |[English -> 80, Chinese -> 72, Math -> 56]|
|Mason |F |17 |[English -> 50, Chinese -> 70, Math -> 58]|
|Logan |M |18 |[English -> 82, Chinese -> 91, Math -> 88]|
|Anthony|F |17 |[English -> 66, Chinese -> 75, Math -> 71]|
|Joseph |M |16 |[English -> 75, Chinese -> 80, Math -> 69]|
|Harper |M |15 |[English -> 65, Chinese -> 65, Math -> 70]|
+-------+------+---+------------------------------------------+
将map 中的键 和 值 分开成两列 map_values()
map_keys()
# Split the map back apart: map_values() -> score array, map_keys() -> subject array.
from pyspark.sql.functions import map_values, map_keys

df = (df
      .withColumn('score', map_values('map'))
      .withColumn('subject', map_keys('map'))
      .select('name', 'gender', 'age', 'score', 'subject'))
df.show(truncate=False)
+-------+------+---+------------+------------------------+
|name |gender|age|score |subject |
+-------+------+---+------------+------------------------+
|Natalie|F |16 |[80, 72, 56]|[English, Chinese, Math]|
|Mason |F |17 |[50, 70, 58]|[English, Chinese, Math]|
|Logan |M |18 |[82, 91, 88]|[English, Chinese, Math]|
|Anthony|F |17 |[66, 75, 71]|[English, Chinese, Math]|
|Joseph |M |16 |[75, 80, 69]|[English, Chinese, Math]|
|Harper |M |15 |[65, 65, 70]|[English, Chinese, Math]|
+-------+------+---+------------+------------------------+
将数组元素转成字符串 array_join()
# Join the array elements back into comma-separated strings.
# Fix: `col` is used below but was never imported anywhere in this script.
from pyspark.sql.functions import array_join, col
df = df.withColumn('score', array_join(col('score'), ','))\
    .withColumn('subject', array_join(col('subject'), ','))
df.show()
+-------+------+---+--------+--------------------+
| name|gender|age| score| subject|
+-------+------+---+--------+--------------------+
|Natalie| F| 16|80,72,56|English,Chinese,Math|
| Mason| F| 17|50,70,58|English,Chinese,Math|
| Logan| M| 18|82,91,88|English,Chinese,Math|
|Anthony| F| 17|66,75,71|English,Chinese,Math|
| Joseph| M| 16|75,80,69|English,Chinese,Math|
| Harper| M| 15|65,65,70|English,Chinese,Math|
+-------+------+---+--------+--------------------+
将列中元素取出形成新列
# Re-split the score string and fan each element out into its own column.
# element_at() is 1-indexed. Fix: `col` is used below but was never imported
# anywhere in this script (split/element_at also re-imported for safety).
from pyspark.sql.functions import split, element_at, col
df = df.withColumn('score_array', split(col('score'), ','))
df = df.withColumn('English', element_at(col('score_array'), 1))\
    .withColumn('Chinese', element_at(col('score_array'), 2))\
    .withColumn('Math', element_at(col('score_array'), 3))\
    .select('name', 'gender', 'age', 'English', 'Chinese', 'Math')
df.show()
+-------+------+---+-------+-------+----+
| name|gender|age|English|Chinese|Math|
+-------+------+---+-------+-------+----+
|Natalie| F| 16| 80| 72| 56|
| Mason| F| 17| 50| 70| 58|
| Logan| M| 18| 82| 91| 88|
|Anthony| F| 17| 66| 75| 71|
| Joseph| M| 16| 75| 80| 69|
| Harper| M| 15| 65| 65| 70|
+-------+------+---+-------+-------+----+
转成json
# Serialize each row back to a JSON string and bring them to the driver.
final_cols = ['name', 'gender', 'age', 'English', 'Chinese', 'Math']
df.select(*final_cols).toJSON().collect()
['{"name":"Natalie","gender":"F","age":"16","English":"80","Chinese":"72","Math":"56"}',
'{"name":"Mason","gender":"F","age":"17","English":"50","Chinese":"70","Math":"58"}',
'{"name":"Logan","gender":"M","age":"18","English":"82","Chinese":"91","Math":"88"}',
'{"name":"Anthony","gender":"F","age":"17","English":"66","Chinese":"75","Math":"71"}',
'{"name":"Joseph","gender":"M","age":"16","English":"75","Chinese":"80","Math":"69"}',
'{"name":"Harper","gender":"M","age":"15","English":"65","Chinese":"65","Math":"70"}']
你会发现数据又一步步回到了原始状态