Python处理诊断编码数据
需求:一个诊断名称可能对应多个诊断编码,对每个诊断名称只保留出现次数最多的那个编码(编码为空的记录不参与统计)。
import os

import pandas as pd
# Sample input: every row is (diagnosis name, diagnosis code); an empty
# string stands for a missing code.
df = pd.DataFrame(
    [
        ['糖尿病', '1'], ['糖尿病', '2'], ['糖尿病', '3'],
        ['糖尿病', '1'], ['糖尿病', ''], ['糖尿病', ''],
        ['高血压', '1'], ['高血压', '2'], ['高血压', '3'],
        ['高血压', '1'], ['高血压', ''], ['高血压', ''],
    ],
    columns=['diag_name', 'code'],
)
sheet_rows = df.shape[0]  # number of input rows
result_file = './result/diag_stand.csv'
# Column names of the working DataFrame.
df_col_names = ['diag_name', 'code']

# 1. Drop the rows whose "code" column is empty.
# A single boolean-mask selection replaces the former row-by-row
# DataFrame.append loop: append was removed in pandas 2.0 and copied the whole
# frame on every iteration. reset_index(drop=True) reproduces the fresh
# 0..n-1 index that append(..., ignore_index=True) used to build.
df_result = df.loc[df['code'] != '', df_col_names].reset_index(drop=True)

# 2. Group by (diag_name, code), count the occurrences into a "count" column,
# then sort by diag_name and count, both descending. Equal counts are ordered
# by ascending code so the row picked in step 3 is deterministic.
gp = (
    df_result.groupby(by=['diag_name', 'code'])
    .size()
    .reset_index(name='count')
    .sort_values(
        by=['diag_name', 'count', 'code'],
        ascending=[False, False, True],
    )
)

# 3. head(1) keeps the first row of each diag_name group, i.e. its most
# frequent code; the result is then ordered by count, descending. A stable
# sort keeps the diag_name order from step 2 among equal counts.
gp2 = (
    gp.groupby('diag_name')
    .head(1)
    .sort_values(by=['count'], ascending=False, kind='mergesort')
)

# to_csv does not create missing directories, so make sure the target exists.
result_dir = os.path.dirname(result_file)
if result_dir:
    os.makedirs(result_dir, exist_ok=True)
gp2.to_csv(result_file, index=False)
处理前数据
处理后数据
reference
1.Python技巧之对DataFrame进行多列排序
https://blog.csdn.net/m0_37637511/article/details/79901071