pyspark基础语法(六):集合操作

本文参考了master苏:pyspark系列--pandas与pyspark对比

from pyspark.sql.functions import current_date
# Import Spark-related packages
from pyspark.sql import SparkSession
import pandas as pd
# Build (or reuse) a SparkSession with Hive support; 'stastic' is the app name shown in the Spark UI
spark = SparkSession.builder.appName('stastic').enableHiveSupport().getOrCreate()
import pyspark.sql.functions
# Raw score data
# Sample score table: one row per student — (number, class, five subject scores).
score_columns = ['number', 'class', 'language', 'math', 'english', 'physic', 'chemical']
score_rows = [
    ('001', '1', 100, 87, 67, 83, 98),
    ('002', '2', 87, 81, 90, 83, 83),
    ('003', '3', 86, 91, 83, 89, 63),
    ('004', '2', 65, 87, 94, 73, 88),
    ('005', '1', 76, 62, 89, 81, 98),
    ('006', '3', 84, 82, 85, 73, 99),
    ('007', '3', 56, 76, 63, 72, 87),
    ('008', '1', 55, 62, 46, 78, 71),
    ('009', '2', 63, 72, 87, 98, 64),
]
test = spark.createDataFrame(score_rows, score_columns)
test.show()
+------+-----+--------+----+-------+------+--------+
|number|class|language|math|english|physic|chemical|
+------+-----+--------+----+-------+------+--------+
|   001|    1|     100|  87|     67|    83|      98|
|   002|    2|      87|  81|     90|    83|      83|
|   003|    3|      86|  91|     83|    89|      63|
|   004|    2|      65|  87|     94|    73|      88|
|   005|    1|      76|  62|     89|    81|      98|
|   006|    3|      84|  82|     85|    73|      99|
|   007|    3|      56|  76|     63|    72|      87|
|   008|    1|      55|  62|     46|    78|      71|
|   009|    2|      63|  72|     87|    98|      64|
+------+-----+--------+----+-------+------+--------+

1.创建map

# create_map builds a MapType column from alternating key/value columns;
# here each row becomes a single-entry map {number: math score}.
from pyspark.sql.functions import create_map

number_to_math = create_map([test.number, test.math])
test.select(number_to_math.alias("map")).show()
+-----------+
|        map|
+-----------+
|[001 -> 87]|
|[002 -> 81]|
|[003 -> 91]|
|[004 -> 87]|
|[005 -> 62]|
|[006 -> 82]|
|[007 -> 76]|
|[008 -> 62]|
|[009 -> 72]|
+-----------+

2.创建列表

# array() packs several columns into a single ArrayType column per row.
from pyspark.sql.functions import array

packed = array('number', 'math', 'english')
test.select(packed.alias("arr")).show()
+-------------+
|          arr|
+-------------+
|[001, 87, 67]|
|[002, 81, 90]|
|[003, 91, 83]|
|[004, 87, 94]|
|[005, 62, 89]|
|[006, 82, 85]|
|[007, 76, 63]|
|[008, 62, 46]|
|[009, 72, 87]|
+-------------+

3.判断元素是否存在

相当于 pandas 的 Series.isin(取反 ~isin 即为 notin 的效果)

# array_contains tests whether an array column holds a given value
# (pyspark's analogue of a membership check); null/empty arrays yield false here.
from pyspark.sql.functions import array_contains

df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])

has_a = array_contains(df.data, "a")
df.select(has_a).show()
+-----------------------+
|array_contains(data, a)|
+-----------------------+
|                   true|
|                  false|
+-----------------------+

4. 数据拉直

from pyspark.sql import Row
from pyspark.sql.functions import explode

# One-row frame with an array column and a map column to demonstrate explode.
eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
eDF.show()
+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|[a -> b]|
+---+---------+--------+
# Exploding an array column yields one output row per element.
array_elems = explode('intlist').alias("anInt")
eDF.select(array_elems).show()
+-----+
|anInt|
+-----+
|    1|
|    2|
|    3|
+-----+
# Exploding a map column yields one (key, value) row per entry.
map_entries = explode('mapfield').alias("key", "value")
eDF.select(map_entries).show()
+---+-----+
|key|value|
+---+-----+
|  a|    b|
+---+-----+

5. posexplode

# posexplode is like explode but also emits each element's position
# within the array (or each entry's index within the map).
from pyspark.sql import Row
from pyspark.sql.functions import posexplode

eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
eDF.show()
+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|[a -> b]|
+---+---------+--------+
# Output columns default to 'pos' (zero-based index) and 'col' (element value).
indexed_elems = posexplode('intlist')
eDF.select(indexed_elems).show()
+---+---+
|pos|col|
+---+---+
|  0|  1|
|  1|  2|
|  2|  3|
+---+---+

6.列表排序

# sort_array orders each row's array elements by their natural ordering;
# empty and single-element arrays pass through unchanged.
from pyspark.sql.functions import sort_array

df = spark.createDataFrame([([2, 1, 3],), ([1],), ([],)], ['data'])

ascending = sort_array(df.data)
df.select(ascending.alias('r')).show()
+---------+
|        r|
+---------+
|[1, 2, 3]|
|      [1]|
|       []|
+---------+
# asc=False sorts each array in descending order instead.
descending = sort_array(df.data, asc=False)
df.select(descending.alias('r')).show()
+---------+
|        r|
+---------+
|[3, 2, 1]|
|      [1]|
|       []|
+---------+

更多资讯关注微信公众号:DataScienceArt,不定期分送各种小福利。

d04d6ca28f9c99bf02c76efe8d3c82dc.png
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值