源数据 (source data)
question_id id
0 17576 70391,70394
1 17576 70391,70392,70393,70394
2 17576 70391,70392
3 40430 155032,155033,155034
4 40430 155032,155033,155034,155035
5 40430 155033,155034,155035
6 40430 155032,155035
7 40430 155034,155035
8 40430 155032,155034
9 40430 155032,155034,155035
10 40430 155033,155034
11 40430 155032,155033
12 40430 155033,155035
13 40430 155032,155033,155035
pandas solution
# Split each comma-separated 'id' string into separate columns.
# expand=True makes str.split return a DataFrame with one column per piece
# (labelled 0, 1, 2, ...); rows with fewer pieces are padded with None.
# This is the expression that produces the "result" table shown below.
df['id'].str.split(',', expand=True)
result
0 1 2 3
0 70391 70394 None None
1 70391 70392 70393 70394
2 70391 70392 None None
3 155032 155033 155034 None
4 155032 155033 155034 155035
5 155033 155034 155035 None
6 155032 155035 None None
7 155034 155035 None None
8 155032 155034 None None
9 155032 155034 155035 None
10 155033 155034 None None
11 155032 155033 None None
12 155033 155035 None None
13 155032 155033 155035 None
# Note the expand=True: it makes str.split return a DataFrame with one column
# per piece, which is then joined back onto df by index.
df.join(
    df['id'].str.split(pat=',', expand=True),
)
question_id id 0 1 2 3
0 17576 70391,70394 70391 70394 None None
1 17576 70391,70392,70393,70394 70391 70392 70393 70394
2 17576 70391,70392 70391 70392 None None
3 40430 155032,155033,155034 155032 155033 155034 None
4 40430 155032,155033,155034,155035 155032 155033 155034 155035
5 40430 155033,155034,155035 155033 155034 155035 None
6 40430 155032,155035 155032 155035 None None
7 40430 155034,155035 155034 155035 None None
8 40430 155032,155034 155032 155034 None None
9 40430 155032,155034,155035 155032 155034 155035 None
10 40430 155033,155034 155033 155034 None None
11 40430 155032,155033 155032 155033 None None
12 40430 155033,155035 155033 155035 None None
13 40430 155032,155033,155035 155032 155033 155035 None
pyspark solution
# NOTE(review): F is presumably pyspark.sql.functions and df a Spark DataFrame
# with 'id', 'question_id' and 'count_num' columns -- confirm against the caller.

# Turn the comma-separated 'id' string into an array column named 'ss'.
tdf = df.select(
    F.split(F.col('id'), ',').alias('ss'),
    'question_id',
    'count_num',
)
tdf.orderBy('question_id').show()

# Explode the array so each element of 'ss' becomes its own row in column 'new'.
res = tdf.select(
    F.explode('ss').alias('new'),
    'question_id',
    'count_num',
)
res.orderBy('question_id').show()

# Group by (question_id, new); sum() with no column names aggregates the
# numeric columns (presumably count_num here).
res.groupBy('question_id', 'new').sum().orderBy('question_id').show()