1. 统一变量统计格式
1.1 读入数据
# 导入程序包
import pandas as pd
import numpy as np
import re
import sys
import os
project_path = os.getcwd()
df_model=pd.read_excel(project_path+'/data/processed_data/df_11.4_人工删除无意义列.xlsx')
if 'Unnamed: 0' in df_model.columns:
df_model = df_model.drop(['Unnamed: 0'], axis=1)
# 删除id、时间变量
df_model=df_model.drop(['patient_id','case_no','drug_name','start_datetime','end_datetime','bmd_time'],axis=1)
df_model.columns
# 选取变量
# discrete_col = ['日剂量','gender', '基础疾病','情感稳定剂','抗精神病药','抗焦虑药','安眠药','行为干预治疗']
discrete_col = ['日剂量','gender','抗精神病药','抗焦虑药','行为干预治疗']
continuous_col=[x for x in df_model.columns if x not in discrete_col]
1.2 分类变量统计
# 求分类变量比例
df_discrete_stat=pd.DataFrame(columns=['变量名称','所有事件(%d)' % df_model.shape[0],'缺失率(%)'])
for i in discrete_col:
print(i)
# 缺失率
if df_model[i].isnull().sum()==0:
miss_rate='0%'
else:
miss_rate=df_model[i].isnull().sum()/df_model.shape[0]
miss_rate="%.2f%%" % (miss_rate * 100) # 百分数输出
df_discrete_stat.loc[df_discrete_stat.shape[0],['变量名称','缺失率(%)']]=[i+',n(%)',miss_rate]
# 分类变量单独统计
name_list=[]
num_perc_list=[]
df_model_stat=df_model[df_model[i].notnull()].sort_values([i],ascending=True)
# 二分类还是多分类变量
if df_model_stat[i].nunique() <=2:
if re.match('gender|性别',i):
name_list=['男','女']
else:
name_list=['是','否']
for name,value in zip(name_list,[1,0]):
print(name)
num=df_model_stat[df_model_stat[i]==value].shape[0]
percent=num/df_model.shape[0]
percent="%.2f%%" % (percent * 100)
num_percent=str(num)+'('+percent+')'
num_perc_list.append(num_percent)
else:
for value in sorted(df_model_stat[i].unique()):
print(value)
name_list.append(value)
num=df_model_stat[df_model_stat[i]==value].shape[0]
percent=num/df_model.shape[0]
percent="%.2f%%" % (percent * 100)
num_percent=str(num)+'('+percent+')'
num_perc_list.append(num_percent)
df_temp = pd.DataFrame(data={'变量名称':name_list,
'所有事件(%d)' % df_model.shape[0]:num_perc_list})
df_discrete_stat=pd.concat([df_discrete_stat,df_temp],axis=0)
df_discrete_stat=df_discrete_stat.reset_index(drop=True)
1.3 连续变量统计
# 统计全变量体系各变量的平均数、上下四分位数、缺失率
feature_quarter_list=[]
mean_quarter_list=[]
feature_std_list=[]
mean_std_list=[]
miss_list=[]
for i in continuous_col:
# 计算上下四分位、均值、标准差
try:
data = df_model[i].astype('float')
stat_result = pd.DataFrame(data.describe())
mean_value=stat_result.loc['mean',i]
up_quarter=stat_result.loc['25%',i]
down_quarter=stat_result.loc['75%',i]
std_value=stat_result.loc['std',i]
except:
mean_value=np.nan
up_quarter=np.nan
down_quarter=np.nan
# 计算缺失率
if df_model[i].isnull().sum()==0:
miss_rate='0%'
else:
miss_rate=df_model[i].isnull().sum()/df_model.shape[0]
miss_rate="%.2f%%" % (miss_rate * 100) # 百分数输出
miss_list.append(miss_rate)
# mean(quarter)
feature_quarter_list.append(i+',mean(IQR)')
mean_quarter_list.append('%.2f(%.2f-%.2f)' % (mean_value,up_quarter,down_quarter))
# mean(std)
feature_std_list.append(i+',mean±std')
mean_std_list.append('%.2f±%.2f' % (mean_value,std_value))
df_continuous_quarter=pd.DataFrame(data={'特征':feature_quarter_list,
'mean_quarter_list':mean_quarter_list,
'miss_list':miss_list})
df_continuous_std=pd.DataFrame(data={'特征':feature_std_list,
'mean_std_list':mean_std_list,
'miss_list':miss_list})
df_continous_stat=pd.concat([df_continuous_quarter,df_continuous_std],axis=1)
# 保存连续变量统计
df_continous_stat.to_excel(project_path+'/data/stat/df_2.2_连续变量统计_importance.xlsx')
于鹏-提取上下四分位代码
## 输出上下四分位
def desc_IQR(data):
# 提取describe
df_desc = data.describe().round(2).copy()
df_desc = df_desc[df_desc.index.isin(['25%','50%','75%'])]
df_desc = df_desc.astype('str')
# 拼凑格式
df_result = df_desc.loc['50%',df_desc.columns] +
'(' + df_desc.loc['25%',df_desc.columns] + '-' +
df_desc.loc['75%',df_desc.columns] + ')'
result = df_result.to_frame(name='IQR').reset_index().rename(columns={'index':'variables'})
return result