# Explode-based word count (a one-to-many, UDTF-style transform) in PySpark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Step-by-step word count: read raw lines, split each line on commas,
# explode the arrays into one row per word, then aggregate by word.
ss = SparkSession.builder.getOrCreate()

# Raw input: one DataFrame row per text line, in a single 'value' column.
df1 = ss.read.text(paths='/data/words.txt')
df1.show(truncate=False)

# Turn each comma-separated line into an array column.
split_frame = df1.select(F.split('value', ',').alias('words_list'))
split_frame.show(truncate=False)

# One output row per array element (the UDTF-style one-to-many step).
word_frame = split_frame.select(F.explode('words_list').alias('word'))
word_frame.show()

# Frequency per word, most common first.
word_frame.groupBy('word').count().orderBy('count', ascending=False).show()
# Same word count as above, condensed: split and explode in one select,
# then aggregate — written as a parenthesized method chain.
exploded_words = df1.select(F.explode(F.split('value', ',')).alias('word'))
(
    exploded_words
    .groupBy('word')
    .count()
    .orderBy('count', ascending=False)
    .show()
)