导入数据-------------------------
# Load the competition train and test sets (GBK-encoded Chinese CSV files).
train_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛训练集.csv', encoding='gbk')
test_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛测试集.csv', encoding='gbk')
# Sanity-check row/column counts of both sets.
print(train_df.shape, test_df.shape)
数据分析 -------------------------
# Report the number of missing values in every training column.
# Replaces eleven copy-pasted per-column prints with one loop; for each
# column, isnull().sum() equals the original
# len(train_df[train_df[col].isnull()][col]) but in a single pass.
print('------缺失值的数量------')
cols = ['编号', '性别', '出生年份', '体重指数', '舒张压', '糖尿病家族史',
        '口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度', '患有糖尿病标识']
for col in cols:
    print(f'{col}:', int(train_df[col].isnull().sum()))
# For every column, report how many distinct (non-null) values it contains.
for col_name in train_df.columns:
    distinct_count = train_df[col_name].value_counts().shape[0]
    print('列名:', col_name, '有', distinct_count, '种类别')
    print('===============================')
计算数据的相关性------------------
可视化相关数据
# --- Correlation overview and exploratory plots (display-only side effects) ---
# NOTE(review): as a plain script this bare expression is computed and
# discarded; it only renders inside a notebook cell.
train_df.corr()
# Pairwise scatter matrix of all numeric columns (SimHei font so Chinese
# column labels render correctly).
sns.set(context='notebook',font='SimHei',style='whitegrid')
sns.pairplot(train_df)
# Same matrix again, with linear-regression fits and histogram diagonals.
sns.set(context='notebook',font='SimHei',style='whitegrid')
sns.pairplot(train_df,kind='reg',diag_kind='hist')
# Class balance of the diabetes label, split by gender.
sns.countplot(x='患有糖尿病标识', hue='性别', data=train_df)
# Birth-year distribution per label value, split by gender.
sns.boxplot(y='出生年份', x='患有糖尿病标识', hue='性别', data=train_df)
简单逻辑回归预测----------------
import warnings
warnings.filterwarnings("ignore")

# --- Feature table for the baseline logistic-regression model ---
FEATURES = ['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压',
            '口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']
# BUG FIX: select with .copy() so the assignments below modify an
# independent frame instead of a view of train_df/test_df — the original
# mutated a column selection, which pandas flags as SettingWithCopy and
# may silently fail to propagate.
train_df_lg_x = train_df[FEATURES].copy()
test_df_lg_x = test_df[FEATURES].copy()

# Ordinal encoding of the family-history column; two spellings of the
# "uncle/aunt has diabetes" category appear in the raw data and map to 1.
dict_糖尿病家族史 = {
    '无记录': 0,
    '叔叔或姑姑有一方患有糖尿病': 1,
    '叔叔或者姑姑有一方患有糖尿病': 1,
    '父母有一方患有糖尿病': 2
}
train_df_lg_x['糖尿病家族史'] = train_df_lg_x['糖尿病家族史'].map(dict_糖尿病家族史)
test_df_lg_x['糖尿病家族史'] = test_df_lg_x['糖尿病家族史'].map(dict_糖尿病家族史)

# Fill missing diastolic pressure with 89 (author-chosen constant;
# presumably near the column mean — TODO confirm). Plain assignment
# instead of inplace fillna on a selection.
train_df_lg_x['舒张压'] = train_df_lg_x['舒张压'].fillna(89)
test_df_lg_x['舒张压'] = test_df_lg_x['舒张压'].fillna(89)

train_df_lg_y = train_df['患有糖尿病标识']
train_x = train_df_lg_x.values
train_y = train_df_lg_y.values
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import warnings

warnings.filterwarnings("ignore")

# Fit a baseline logistic regression and evaluate it on the TRAINING data
# (an optimistic estimate — no held-out set exists at this point).
clf = LogisticRegression(C=1, random_state=3).fit(train_x, train_y)
train_pred = clf.predict(train_x)

result = confusion_matrix(train_y, train_pred)
print('查看结果', result)

# BUG FIX: sklearn metrics take (y_true, y_pred) in that order. The
# original passed the prediction first, which for precision_score silently
# reports recall instead of precision.
precision = precision_score(train_y, train_pred)
print('模型准确度为', precision)
# F1 is the harmonic mean of precision and recall, so it is symmetric in
# its two arguments for binary labels; the corrected order is used anyway
# for consistency.
print('模型f1为', f1_score(train_y, train_pred))
输出:查看结果 [[2770 364] [585 1351]];模型准确度为 0.6978305785123967;模型 f1 为 0.740071213366201

划分数据集,调参优化
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation set.
train_x, vali_x, train_y, vali_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=31)

# Regularization strength C=0.07 with an L2 penalty and the saga solver.
# NOTE(review): the original comment said 0.05 but the code uses 0.07;
# also l1_ratio only applies to penalty='elasticnet' and is ignored here.
clf3 = LogisticRegression(C=0.07, penalty='l2', solver='saga',
                          l1_ratio=0.4, random_state=22).fit(train_x, train_y)
# BUG FIX: metric argument order is (y_true, y_pred); the original passed
# the prediction first, which swaps precision and recall.
precision3 = precision_score(vali_y, clf3.predict(vali_x))
print('模型准确度为', precision3)
# Original recorded run (with swapped arguments): 0.6884422110552764

# Stronger regularization: C=0.05 with the default solver.
clf2 = LogisticRegression(C=0.05, random_state=3).fit(train_x, train_y)
precision2 = precision_score(vali_y, clf2.predict(vali_x))
print('模型准确度为', precision2)
# Original recorded run: 0.7010050251256281

# F1 on the validation fold (bare expression — displays in a notebook).
# F1 is symmetric for binary labels, so the corrected order gives the
# same value as the recorded 0.7390728476821192.
f1_score(vali_y, clf2.predict(vali_x))
特征工程----------------
# 训练集
#男女体重平均指数
#男女舒张压平均指数
man_height_gap = train_df[train_df['性别']==1]['体重指数'].mean()
women_height_gap = train_df[train_df['性别']==0]['体重指数'].mean()
man_stress_gap = train_df[train_df['性别']==1]['舒张压'].mean()
women_stress_gap = train_df[train_df['性别']==0]['舒张压'].mean()
# test集
#男女体重平均指数
#男女舒张压平均指数
man_height_gap2 = test_df[test_df['性别']==1]['体重指数'].mean()
women_height_gap2 = test_df[test_df['性别']==0]['体重指数'].mean()
man_stress_gap2 = test_df[test_df['性别']==1]['舒张压'].mean()
women_stress_gap2 = test_df[test_df['性别']==0]['舒张压'].mean()
- 计算每个患者与其所属性别平均值的差异
# 处理训练集 train_clean = train_df train_clean['体重差异']=0 train_clean['舒张压差异'] = 0 train_clean['体重差异'].loc[train_clean['性别']==0]=train_clean['体重指数'].loc[train_clean['性别']==0]-women_height_gap train_clean['体重差异'].loc[train_clean['性别']==1]=train_clean['体重指数'].loc[train_clean['性别']==1]-man_height_gap train_clean['舒张压差异'].loc[train_clean['性别']==0]=train_clean['舒张压'].loc[train_clean['性别']==0]-women_stress_gap train_clean['舒张压差异'].loc[train_clean['性别']==1]=train_clean['舒张压'].loc[train_clean['性别']==1]-man_stress_gap # 处理test test_clean = test_df test_clean['体重差异']=0 test_clean['舒张压差异'] = 0 test_clean['体重差异'].loc[test_clean['性别']==0]=test_clean['体重指数'].loc[test_clean['性别']==0]-women_height_gap2 test_clean['体重差异'].loc[test_clean['性别']==1]=test_clean['体重指数'].loc[test_clean['性别']==1]-man_height_gap2 test_clean['舒张压差异'].loc[test_clean['性别']==0]=test_clean['舒张压'].loc[test_clean['性别']==0]-women_stress_gap2 test_clean['舒张压差异'].loc[test_clean['性别']==1]=test_clean['舒张压'].loc[test_clean['性别']==1]-man_stress_gap2 train_clean['舒张压差异'].fillna(0,inplace = True) test_clean['舒张压差异'].fillna(0,inplace = True)
# Shared ordinal encoding for the family-history column (two spellings of
# the uncle/aunt category map to the same code).
dict_糖尿病家族史 = {
    '无记录': 0,
    '叔叔或姑姑有一方患有糖尿病': 1,
    '叔叔或者姑姑有一方患有糖尿病': 1,
    '父母有一方患有糖尿病': 2
}


def _clean_inplace(df):
    """Apply the shared cleaning steps to one frame (mutates df in place).

    Steps: encode family history, fill missing diastolic pressure with 89,
    replace sentinel values in the glucose-tolerance column (-1) and the
    skinfold column (0) with the frame's own gender-specific mean, and
    convert birth year to age as of 2022.
    """
    df['糖尿病家族史'] = df['糖尿病家族史'].map(dict_糖尿病家族史)
    # The original repeated this fillna three times; once is enough.
    df['舒张压'] = df['舒张压'].fillna(89)

    men = df['性别'] == 1
    women = df['性别'] == 0

    # NOTE(review): as in the original, the means are computed with the
    # sentinel rows still included — excluding them first would arguably
    # be more correct but would change the resulting numbers.
    mean_man_antisuger = round(df.loc[men, '口服耐糖量测试'].mean(), 3)
    mean_women_antisuger = round(df.loc[women, '口服耐糖量测试'].mean(), 3)
    # Treat -1 as missing and replace with the gender mean.
    df.loc[women & (df['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_women_antisuger
    df.loc[men & (df['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_man_antisuger

    mean_man_skin = round(df.loc[men, '肱三头肌皮褶厚度'].mean(), 3)
    mean_women_skin = round(df.loc[women, '肱三头肌皮褶厚度'].mean(), 3)
    # Treat 0 as missing and replace with the gender mean.
    df.loc[women & (df['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_women_skin
    df.loc[men & (df['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_man_skin

    # Replace birth year by age.
    df['出生年份'] = 2022 - df['出生年份']


# The same pipeline ran twice (copy-pasted) in the original; apply the
# helper to each frame so train and test cannot drift apart.
_clean_inplace(train_clean)
_clean_inplace(test_clean)
在上述基础上,将训练集的 20% 划分为验证集,使用逻辑回归完成训练
train_x_clean = train_clean[['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度', '体重差异', '舒张压差异']] train_y_clean = train_clean['患有糖尿病标识'] test_x_clean = test_clean[['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度', '体重差异', '舒张压差异']] train_X,val_X,train_y,val_y = train_test_split(train_x_clean, train_y_clean, test_size=0.2,random_state=3) clf = LogisticRegression(C=1,random_state=3) clf.fit(train_X,train_y) prediction_lr = clf.predict(val_X) print('\nF1 score:',f1_score(val_y,prediction_lr))
输出:F1 score: 0.7385019710906702

精度提高到 73.85%
任务5:特征筛选-------------
- 使用树模型完成模型的训练,通过特征重要性筛选出Top5的特征
-
from sklearn.tree import DecisionTreeClassifier model_tree = DecisionTreeClassifier() model_tree.fit(train_X,train_y) features_import = pd.DataFrame(train_x.columns, columns=['feature']) features_import['importance'] = model_tree.feature_importances_ features_import.sort_values('importance', inplace=True,ascending=False) features_import
- 使用筛选出的特征和逻辑回归进行训练
-
train_x2 = train_clean[['体重指数', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度']] train_y2 = train_clean['患有糖尿病标识'] test_x2 = test_clean[['体重指数', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度']] train_X2,val_X2,train_y2,val_y2 = train_test_split(train_x2, train_y2, test_size=0.2,random_state=3) clf = LogisticRegression(C=1,random_state=3) clf.fit(train_X2,train_y2) prediction_lr = clf.predict(val_X2) print('\nF1 score:',f1_score(val_y2,prediction_lr))
输出:F1 score: 0.7347480106100797

- F1-score 略有下降
- 特征减少后模型可拟合的维度降低,准确率略有下降是合理的
任务6:高阶树模型------------------
- 将训练集的 20% 划分为验证集,使用 LightGBM 完成训练
- 尝试网格搜索调节 LightGBM 的参数
- 将步骤 4 调参之后的模型重新训练,将最新预测的结果文件提交到比赛
import lightgbm as lgb from sklearn.model_selection import GridSearchCV #使用网格调参数 param_grid = { 'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20,100,1000,2000,3000,4000]} clf_lgb = lgb.LGBMClassifier( max_depth=3, n_estimators=4000, n_jobs=-1, verbose=-1, verbosity=-1, learning_rate=0.1, ) clf_lgb = GridSearchCV(clf_lgb,param_grid) clf_lgb.fit(train_X,train_y) prediction_lr = clf_lgb.predict(val_X) print('\nF1 score:',f1_score(val_y,prediction_lr))
F1 score: 0.9529860228716646
- K-fold训练
-
from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score def run_model_cv(model, kf, X_tr, y, X_te, cate_col=None): train_pred = np.zeros( (len(X_tr), len(np.unique(y))) ) test_pred = np.zeros( (len(X_te), len(np.unique(y))) ) cv_clf = [] for tr_idx, val_idx in kf.split(X_tr, y): x_tr = X_tr.iloc[tr_idx]; y_tr = y.iloc[tr_idx] x_val = X_tr.iloc[val_idx]; y_val = y.iloc[val_idx] call_back = [ lgb.early_stopping(50), ] eval_set = [(x_val, y_val)] model.fit(x_tr, y_tr, eval_set=eval_set, callbacks=call_back, verbose=-1) cv_clf.append(model) train_pred[val_idx] = model.predict_proba(x_val) test_pred += model.predict_proba(X_te) test_pred /= kf.n_splits return train_pred, test_pred, cv_clf #数据集拆分 train_x_lgb,val_x_lgb,train_y_lgb,val_y_lgb = train_test_split(train_clean, train_clean, test_size=0.2,random_state=3) #数据集拆分 train_pred, test_pred, cv_clf = run_model_cv( clf_lgb, KFold(n_splits=5), train_x_lgb.drop(['编号','患有糖尿病标识'],axis=1), train_y_lgb['患有糖尿病标识'], val_x_lgb.drop(['编号','患有糖尿病标识'],axis=1),) test_pred_1 = [ i.argmax() for i in test_pred] print('准确率',accuracy_score(test_pred_1,val_y)) print('\nF1 score:',f1_score(val_y,test_pred_1))
准确率 0.965483234714004 F1 score: 0.9557522123893806
任务7:stacking---------------
# Stacking ensemble: four level-1 base models produce out-of-fold
# predictions that become the features of a level-2 logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    train_clean.drop(['编号', '患有糖尿病标识'], axis=1),
    train_clean['患有糖尿病标识'],
    test_size=0.5)

### Level-1 models
clfs = [
    GBDT(n_estimators=100),
    RF(n_estimators=100),
    ET(n_estimators=100),
    ADA(n_estimators=100),
]

# One column of stacked features per base model.
X_train_stack = np.zeros((X_train.shape[0], len(clfs)))
X_test_stack = np.zeros((X_test.shape[0], len(clfs)))

### 6-fold stacking
n_folds = 6
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1)
for model_idx, base_clf in enumerate(clfs):
    # Per-fold predictions on the held-out test half, averaged below.
    fold_test_preds = np.zeros((X_test.shape[0], n_folds))
    for fold_idx, (fit_idx, oof_idx) in enumerate(skf.split(X_train, y_train)):
        base_clf.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        # Out-of-fold predictions form the level-2 training features.
        X_train_stack[oof_idx, model_idx] = base_clf.predict(X_train.iloc[oof_idx])
        fold_test_preds[:, fold_idx] = base_clf.predict(X_test)
    # Average the per-fold test predictions into the level-2 test features.
    X_test_stack[:, model_idx] = fold_test_preds.mean(axis=1)

### Level-2 model: logistic regression on the stacked features.
clf_second = LogisticRegression(solver="lbfgs")
clf_second.fit(X_train_stack, y_train)
pred = clf_second.predict_proba(X_test_stack)[:, 1]
print('ROC', roc_auc_score(y_test, pred))