from pyspark.sql import SparkSession
from pyspark.sql.column import Column, _to_java_column, _to_seq
spark = SparkSession.builder.appName("scala_udf_test").getOrCreate()
sc = spark.sparkContext
def group_concat(col):
_groupConcat = sc._jvm.com.learning.GroupConcat.apply
return Column(_groupConcat(_to_seq(sc, [col], _to_java_column)))
def process():
rows = [
("k1", "a"),
("k1", "b"),
("k1", "c"),
("k2", "d"),
("k3", "e"),
("k3", "f"),
]
df = spark.createDataFrame(rows, ['key', 'value'])
df.show(50)
df.groupBy("key").agg(group_concat("value").alias("concat")).show()
if __name__ == "__main__":
process()
# Source: CSDN article "pyspark group_concat" (last updated 2023-05-30 18:19:58).