# ref: SparkSQL-第四章-2-注册返回值是数组类型的UDF_哔哩哔哩_bilibili
# 定义RDD 转为spark的DataFrame, DSL、SQL风格分别处理
# SQL风格得先创建临时表(临时视图)
# show(truncate=False) 表示完全展示, 没有省略号
import pyspark.sql.functions as F
from pyspark.sql.types import *
import os
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import pandas as pd
import os
def str_split(data):
    """Split a space-separated line into a list of words.

    This is the plain Python function wrapped by the Spark UDFs in ``main``,
    where it is registered with return type ``ArrayType(StringType())``.

    Args:
        data: The string to split, or ``None`` for a NULL cell.

    Returns:
        The list produced by ``data.split(' ')``, or ``None`` when ``data``
        is ``None``.
    """
    # A NULL cell reaches a Python UDF as None; calling .split on it would
    # raise AttributeError and fail the whole job, so pass the NULL through.
    if data is None:
        return None
    return data.split(' ')


def main():
    """Demonstrate two ways of building a UDF that returns an array column.

    Builds a one-column DataFrame from an RDD, then applies ``str_split``
    through (1) ``spark.udf.register`` in both DSL and SQL style, and
    (2) ``F.udf`` in DSL style, printing each result with ``show``.
    """
    # NOTE(review): machine-specific Spark install path; adjust per machine.
    os.environ['SPARK_HOME'] = "/Users/jingjing.ma/Documents/spark-3.3.1-bin-hadoop3"

    spark = (
        SparkSession.builder
        .master('local[*]')
        .appName('try_test')
        .getOrCreate()
    )
    try:
        sc = spark.sparkContext

        # Create an RDD and convert it to a DataFrame with one column, 'line'.
        rdd = sc.parallelize([['hadoop java python'], ['hadoop flink spark']])
        df = rdd.toDF(['line'])

        # Approach 1: spark.udf.register. The name 'func1' is usable from
        # SQL, and the returned object (func2) is usable from the DSL.
        func2 = spark.udf.register('func1', str_split, ArrayType(StringType()))

        # DSL style
        df.select(func2(df['line'])).show()

        # SQL style needs a temp view. createOrReplaceTempView is used so a
        # re-run against a reused session (getOrCreate) does not fail with
        # "temporary view already exists".
        df.createOrReplaceTempView('lines')
        spark.sql('SELECT func1(line) FROM lines').show(truncate=False)

        # Approach 2: F.udf. The resulting UDF is usable from the DSL only.
        func3 = F.udf(str_split, ArrayType(StringType()))
        df.select(func3(df['line'])).show(truncate=False)
    finally:
        # Always stop the session, even if one of the steps above raised.
        spark.stop()


if __name__ == '__main__':
    main()