For how to iterate over the data in each group after a pyspark groupby, see this article:
https://blog.csdn.net/qq_42363032/article/details/118298108
After a pyspark groupby the result looks like the snippet below; in pandas, each small per-group df corresponds to one of these rows:
import pandas as pd
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

ss = SparkSession.builder.getOrCreate()

data = ss.createDataFrame(data)  # data: a pandas df with alpos_id, impressions, ecpm columns
da_gb = data.groupby('alpos_id').agg(
    fn.collect_list('impressions').alias('impressions_list'),  # all impressions of a group in one list
    fn.collect_list('ecpm').alias('ecpm_list')                 # all ecpm values of a group in one list
)
da_gb.show()
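For reference, the snippet assumes data starts out as a pandas df with alpos_id, impressions and ecpm columns; a made-up example consistent with the output shown further down:

# hypothetical input, values taken from the sample output below
data = pd.DataFrame({
    'alpos_id': ['0_2011082923279930', '0_2011082923279930', '0_3071297379437968'],
    'impressions': [222.0, 2269.0, 8.0],
    'ecpm': [14.4595, 14.0899, 73.75],
})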
Convert the pyspark grouped data, one row at a time, into a pandas df:
def row_dealwith(data):
    ids = list(data.keys())[0]  # the group id
    values = data.get(ids)      # the collected field values for this group
    lens = len(values)
    # print(ids, values[0], values[1])
    # build the id column: repeat the group id once per element
    ids_li = []
    for i in range(len(values[0])):
        ids_li.append(ids)
    # turn the horizontal layout (one list per field) into a vertical one (one row per element)
    zdict = {}
    zdict['alpos_id'] = ids_li
    for i in range(lens):
        zdict[i] = values[i]
    print(zdict)
    da_gb = pd.DataFrame(zdict)
    print(da_gb)

# da_gb is the Spark grouped df from above; pack each row into {group_id: [list1, list2]}
dardds = da_gb.rdd.map(lambda data: {data.alpos_id: [data.impressions_list, data.ecpm_list]})
dardds.foreach(row_dealwith)
'''
out:
{'alpos_id': ['0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930', '0_2011082923279930'], 0: [222.0, 2269.0, 212.0, 43.0, 29.0, 172.0, 192.0, 232.0, 288.0, 306.0, 328.0], 1: [14.4595, 14.0899, 14.3868, 12.5581, 12.069, 30.814, 14.1667, 12.6293, 15.5556, 8.5948, 11.2805]}
{'alpos_id': ['0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077', '0_3001461399082077'], 0: [0.2, 0.0, 0.142857142857142, 0.0, 0.181818181818181, 0.3, 0.3125, 0.0, 0.0], 1: [43.6990133333333, 40.1434533333333, 41.21348, 34.8579266666666, 35.2619666666666, 35.6953, 44.22308, 44.4453, 44.18604]}
{'alpos_id': ['0_3071297379437968'], 0: [8.0], 1: [73.75]}
{'alpos_id': ['0_3031798112278383', '0_3031798112278383', '0_3031798112278383', '0_3031798112278383'], 0: [4.0, 62.0, 58.0, 4.0], 1: [2.5, 6.9355, 9.3103, 5.0]}
'''
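Note that foreach runs row_dealwith on the executors, so the printed output above only appears on the driver console in local mode; on a cluster it would land in the executor logs. A minimal driver-side alternative, assuming the grouped result is small enough to collect:

# bring the grouped rows back to the driver, then convert each one
for row in dardds.collect():
    row_dealwith(row)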
Summary of the key code
def row_dealwith(data):
    ids = list(data.keys())[0]  # the group id
    values = data.get(ids)      # the collected field values for this group
    lens = len(values)
    # print(ids, values[0], values[1])
    # build the id column: repeat the group id once per element
    ids_li = []
    for i in range(len(values[0])):
        ids_li.append(ids)
    # turn the horizontal layout (one list per field) into a vertical one (one row per element)
    zdict = {}
    zdict['alpos_id'] = ids_li
    for i in range(lens):
        zdict[i] = values[i]
    print(zdict)
    da_gb = pd.DataFrame(zdict)
    print(da_gb)
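To see what row_dealwith produces in isolation, it can be called directly with a literal dict shaped like one grouped row (values copied from the sample output above):

# hypothetical standalone call; the dict mirrors one grouped row
row_dealwith({'0_3031798112278383': [[4.0, 62.0, 58.0, 4.0], [2.5, 6.9355, 9.3103, 5.0]]})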
def pyspark_gb(data):
    data = ss.createDataFrame(data)
    da_gb = data.groupby('alpos_id').agg(
        fn.collect_list('impressions').alias('impressions_list'),
        fn.collect_list('ecpm').alias('ecpm_list')
    )
    da_gb.show()
    dardds = da_gb.rdd.map(lambda data: {data.alpos_id: [data.impressions_list, data.ecpm_list]})
    # print(type(dardds))  # pyspark.rdd.PipelinedRDD
    # print(dardds.take(5))
    # dardds.foreach(lambda x: print(x))
    dardds.foreach(row_dealwith)
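As an aside, on Spark 3.0+ the same "one pandas df per group" effect is available more directly through groupBy().applyInPandas(), which hands every group to a function as a pandas DataFrame. A minimal sketch, assuming the three columns have the types given in the schema string:

def per_group(pdf):
    # pdf is the pandas df for one alpos_id group; any pandas logic goes here
    return pdf

sdf = ss.createDataFrame(data)  # same Spark DataFrame as in pyspark_gb
sdf.groupby('alpos_id').applyInPandas(
    per_group,
    schema='alpos_id string, impressions double, ecpm double'  # assumed column types
).show()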