Intro
Using PySpark UDFs
Data construction
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pandas as pd

def get_or_create(app_name):
    # Build (or reuse) a SparkSession with a few common tuning options
    spark = (
        SparkSession.builder.appName(app_name)
        .config("spark.driver.maxResultSize", "10g")
        # Arrow-based pandas conversion (key renamed to
        # spark.sql.execution.arrow.pyspark.enabled in Spark 3.0+)
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.sql.crossJoin.enabled", "true")
        .config("spark.kryoserializer.buffer.max", "512m")
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel("ERROR")
    return spark

spark = get_or_create("spark")
df = pd.DataFrame({"name": ["A", "B"], "name1": ["A", "B"], "age": [10, 20]})
df_spark = spark.createDataFrame(df)
df_spark.show(truncate=False)
+----+-----+---+
|name|name1|age|
+----+-----+---+
|A |A |10 |
|B |B |20 |
+----+-----+---+
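Note that createDataFrame infers the schema from the pandas dtypes, so age arrives as a 64-bit long rather than an int. A quick check (the commented output is what I'd expect here):

df_spark.printSchema()
# root
#  |-- name: string (nullable = true)
#  |-- name1: string (nullable = true)
#  |-- age: long (nullable = true)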
udf
Usage 1
@F.udf(returnType=IntegerType())
def test(col1):
    res = col1 * 2
    return res

df_spark.withColumn('new_age', test("age")).show()
+----+-----+---+-------+
|name|name1|age|new_age|
+----+-----+---+-------+
| A| A| 10| 20|
| B| B| 20| 40|
+----+-----+---+-------+
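For a simple arithmetic transform like this you don't actually need a UDF: a built-in column expression produces the same result and stays in the JVM, avoiding per-row Python serialization. A minimal equivalent:

# Built-in expression, no Python round-trip per row
df_spark.withColumn('new_age', F.col("age") * 2).show()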
Usage 2
test1 = F.udf(lambda age: age * 2, IntegerType())
df_spark.withColumn('new_age', test1("age")).show()
+----+-----+---+-------+
|name|name1|age|new_age|
+----+-----+---+-------+
| A| A| 10| 20|
| B| B| 20| 40|
+----+-----+---+-------+
test2 = F.udf(lambda name, name1: name + name1, StringType())
df_spark.withColumn('new_name', test2("name", "name1")).show()
+----+-----+---+--------+
|name|name1|age|new_name|
+----+-----+---+--------+
| A| A| 10| AA|
| B| B| 20| BB|
+----+-----+---+--------+
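Likewise, string concatenation has a built-in equivalent, F.concat, which accepts any number of columns:

# Same result as test2, entirely in the JVM
df_spark.withColumn('new_name', F.concat("name", "name1")).show()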
pandas_udf deserves a closer look when I have time, along with PySpark optimization techniques in general. For now my approach is brute force: if a job is slow, add partitions and add memory.
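As a starting point, here is a minimal pandas_udf sketch (my own example, assuming Spark 3.x with pyarrow installed; double_age is a name I made up). It redoes the age * 2 example, but the function receives a whole pd.Series per batch instead of one value per row, which is usually much faster:

from pyspark.sql.functions import pandas_udf

@pandas_udf(IntegerType())
def double_age(age: pd.Series) -> pd.Series:
    # operates on a batch of rows at once, exchanged via Arrow
    return age * 2

df_spark.withColumn('new_age', double_age("age")).show()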
2022-08-05, Jiulonghu, Jiangning District, Nanjing