1. 创建map
# create_map builds a single MapType column from alternating key/value columns.
from pyspark.sql.functions import create_map

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.show()

# Varargs form; create_map equally accepts a single list of columns.
df.select(create_map(df.name, df.age).alias("map")).show()
# +-------------------+
# | map|
# +-------------------+
# |Map(John Doe -> 21)|
# +-------------------+
2. 创建列表
# array() packs the given columns into one ArrayType column per row.
from pyspark.sql.functions import array

# Column objects work the same as the column-name strings 'age', 'age'.
arr = array(df.age, df.age).alias("arr")
df.select(arr).show()
# +--------+
# | arr|
# +--------+
# |[21, 21]|
# +--------+
3. 元素存在判断
相当于 pandas 的 Series.isin;pandas 没有 notin,取反可用 ~Series.isin(...)。
# array_contains tests whether each row's array column holds the given value.
from pyspark.sql.functions import array_contains

df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])

# No alias, so the result column keeps the generated name shown below.
has_a = array_contains(df.data, "a")
df.select(has_a).show()
# +-----------------------+
# |array_contains(data, a)|
# +-----------------------+
# | true|
# | false|
# +-----------------------+
4. 数据拉直
这是我造的名词,大概意思是,如果col的值是列表之类的复合数据,则将每个数据单独赋予一行。
Returns a new row for each element in the given array or map
from pyspark.sql import Row
from pyspark.sql.functions import explode

# One row whose columns hold an array and a map.
eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
eDF.show()
# +---+---------+-----------+
# | a| intlist| mapfield|
# +---+---------+-----------+
# | 1|[1, 2, 3]|Map(a -> b)|
# +---+---------+-----------+

# Exploding an array emits one row per element.
eDF.select(explode(eDF.intlist).alias("anInt")).show()
# |anInt|
# +-----+
# | 1|
# | 2|
# | 3|
# +-----+

# Exploding a map emits one (key, value) row per entry.
eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
# +---+-----+
# |key|value|
# +---+-----+
# | a| b|
# +---+-----+
5. posexplode
# posexplode is explode plus a 0-based position for each emitted element.
from pyspark.sql import Row
from pyspark.sql.functions import posexplode

eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
eDF.show()
# +---+---------+-----------+
# | a| intlist| mapfield|
# +---+---------+-----------+
# | 1|[1, 2, 3]|Map(a -> b)|
# +---+---------+-----------+

# Without an alias the output columns default to pos and col.
eDF.select(posexplode(eDF.intlist)).show()
# +---+---+
# |pos|col|
# +---+---+
# | 0| 1|
# | 1| 2|
# | 2| 3|
# +---+---+
6. json操作
6.1. get_json_object
6.2. json_tuple
6.3. from_json
6.4. to_json
7. 列表排序
# sort_array sorts each row's array by the elements' natural ordering.
# Ascending is the default; pass asc=False for descending.
from pyspark.sql.functions import sort_array

df = spark.createDataFrame([([2, 1, 3],), ([1],), ([],)], ['data'])

df.select(sort_array(df.data).alias('r')).show()
# +---------+
# | r|
# +---------+
# |[1, 2, 3]|
# | [1]|
# | []|
# +---------+

df.select(sort_array(df.data, asc=False).alias('r')).show()
# +---------+
# | r|
# +---------+
# |[3, 2, 1]|
# | [1]|
# | []|
# +---------+