from pyspark.sql import SparkSession
from pyspark.sql import functions
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# Build the SparkSession explicitly. The bare `spark` global used here is
# only pre-defined inside the pyspark shell / notebooks; running this file
# as a plain script would raise NameError without this line.
spark = SparkSession.builder.appName("udf_demo").getOrCreate()

# Sample DataFrame: integer label paired with a one-letter sentence.
df = spark.createDataFrame(
    [(1, "a"), (2, "b"), (3, "c")]
).toDF("label", "sentence")
df.show()
# Expected output:
# +-----+--------+
# |label|sentence|
# +-----+--------+
# |    1|       a|
# |    2|       b|
# |    3|       c|
# +-----+--------+
def to_upper(s):
    """Return *s* upper-cased; pass None through (null-safe for UDF use)."""
    return None if s is None else s.upper()
# Wrap the plain Python function as a Spark SQL UDF returning StringType.
# Renamed from the misleading `toDateUDF` — it upper-cases strings, it does
# not convert dates. The old name is kept as an alias for compatibility.
to_upper_udf = udf(to_upper, StringType())
toDateUDF = to_upper_udf

# Apply the custom UDF: add `sentence1`, an upper-cased copy of `sentence`.
df.withColumn('sentence1', to_upper_udf('sentence')).show()
# Expected output:
# +-----+--------+---------+
# |label|sentence|sentence1|
# +-----+--------+---------+
# |    1|       a|        A|
# |    2|       b|        B|
# |    3|       c|        C|
# +-----+--------+---------+
# Same transformation as above, but building the UDF inline via F.udf and
# materializing the result into a new `col_name` column on df itself.
upper_str_udf = F.udf(to_upper, returnType=StringType())
df = df.withColumn('col_name', upper_str_udf('sentence'))
df.show()
# Expected output:
# +-----+--------+--------+
# |label|sentence|col_name|
# +-----+--------+--------+
# |    1|       a|       A|
# |    2|       b|       B|
# |    3|       c|       C|
# +-----+--------+--------+