方法一:使用udf
# Sample frame: three rows, each with an integer `id` and an integer array `list`.
df = hc.createDataFrame(
    [
        [1, [1, 2, 3]],
        [2, [2, 3, 4]],
        [3, [3, 4, 5]],
    ],
    ['id', 'list'],
)

# Struct layout (id, list) used as the declared return type of the UDF below.
schema = StructType([
    StructField('id', IntegerType(), nullable=True),
    StructField('list', ArrayType(IntegerType()), nullable=True),
])
@F.udf(returnType=schema)
def func(_id, _list):
    """Pack an id and its list into a single two-field value.

    Args:
        _id: Value of the ``id`` column for the current row.
        _list: Value of the ``list`` column for the current row.

    Returns:
        A two-element list ``[_id, _list]``; the decorator declares its
        type as ``schema``, so position 0 is ``id`` and position 1 is ``list``.
    """
    return [_id, _list]
# Add `id_list`: the (id, list) struct produced by `func` for every row.
df2 = df.withColumn('id_list', func(F.col('id'), F.col('list')))
方法二:使用 rdd 的 map 也能实现
def map_func(x):
    """Build a Row holding ``id``, ``list`` and their pairing ``id_list``.

    Args:
        x: Record exposing ``id`` and ``list`` both by key (``x['id']``)
            and by attribute (``x.id``).

    Returns:
        A ``Row`` with fields ``id``, ``list`` and ``id_list``, where
        ``id_list`` is the tuple ``(id, list)``.
    """
    return Row(id=x.id, list=x.list, id_list=(x['id'], x['list']))
# Same result through the RDD API: map each row with `map_func`, then
# convert the mapped RDD back into a DataFrame and print it untruncated.
df_ = df.rdd.map(map_func).toDF()
df_.show(truncate=False)
然后对 id_list 做 collect_list 之后,还可以按每个元素第一个位置的值(即 id)进行排序
# Aggregate with no grouping keys: gather every `id_list` struct of df2
# into one array column named `id_lists`.
df3 = df2.groupby().agg(F.collect_list(F.col('id_list')).alias('id_lists'))
df3.show(truncate=False)
@F.udf(returnType=ArrayType(ArrayType(IntegerType())))
def fc(id_lists):
    """Return the list parts of ``id_lists``, ordered by id, largest first.

    Args:
        id_lists: Sequence of two-element items, with the sort key (the id)
            at index 0 and the payload list at index 1. May be ``None``.

    Returns:
        A new list containing element 1 of every item, ordered by element 0
        in descending order (items with equal keys keep their incoming
        order), or ``None`` when ``id_lists`` is ``None``.
    """
    # A null array has nothing to sort; pass the null through instead of
    # failing on a method call on None.
    if id_lists is None:
        return None
    # sorted() builds a new list, so the argument itself is never reordered.
    ordered = sorted(id_lists, key=lambda pair: pair[0], reverse=True)
    return [pair[1] for pair in ordered]
# Add `lists`: the collected lists reordered by `fc` (descending id).
df4 = df3.withColumn('lists', fc(F.col('id_lists')))