讯飞:糖尿病遗传风险预测Coggle挑战赛公开

导入数据-------------------------

# Load the competition train/test CSVs (Chinese headers, GBK-encoded).
_DATA_DIR = './糖尿病遗传风险预测挑战赛公开数据'
train_df = pd.read_csv(_DATA_DIR + '/比赛训练集.csv', encoding='gbk')
test_df = pd.read_csv(_DATA_DIR + '/比赛测试集.csv', encoding='gbk')
print(train_df.shape, test_df.shape)

数据分析 -------------------------

# Report per-column missing-value counts for the training set.
# The original repeated the same isnull filter ten times, once per column;
# a loop over the column names produces byte-identical output.
print('------缺失值的数量------')
for _col in ['编号', '性别', '出生年份', '体重指数', '舒张压', '糖尿病家族史',
             '口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度', '患有糖尿病标识']:
    # isnull().sum() counts NaN rows directly, no boolean-indexing round trip.
    print(_col + ':', train_df[_col].isnull().sum())

 

# For every training column, print how many distinct (non-NaN) values it takes.
for col_name in train_df.columns:
    n_unique = train_df[col_name].nunique()  # same as len(value_counts())
    print('列名:', col_name, '有', n_unique, '种类别')
    print('===============================')

 

计算数据的相关性------------------

可视化相关数据

# Pairwise Pearson correlations between the numeric columns.
# NOTE(review): outside a notebook the returned DataFrame is discarded
# unless printed/assigned — presumably this was run in Jupyter.
train_df.corr()

 

# Pairwise scatter matrix of all numeric columns (SimHei font for CJK labels).
sns.set(context='notebook',font='SimHei',style='whitegrid')
sns.pairplot(train_df)

# Same matrix with per-pair regression fits and histogram diagonals.
sns.set(context='notebook',font='SimHei',style='whitegrid')
sns.pairplot(train_df,kind='reg',diag_kind='hist')

# Label counts split by sex.
sns.countplot(x='患有糖尿病标识', hue='性别', data=train_df)

# Birth-year distribution by label and sex.
sns.boxplot(y='出生年份', x='患有糖尿病标识', hue='性别', data=train_df)

简单逻辑回归预测----------------

import warnings
warnings.filterwarnings("ignore")

# The eight raw predictor columns (order preserved from the original).
_FEATURE_COLS = ['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压',
                 '口服耐糖量测试', '胰岛素释放实验', '肱三头肌皮褶厚度']
# .copy() makes independent frames so the assignments below write real
# data instead of a view — without it pandas raises SettingWithCopyWarning
# (the very warning the filter above was muting) and may silently no-op.
train_df_lg_x = train_df[_FEATURE_COLS].copy()
test_df_lg_x = test_df[_FEATURE_COLS].copy()

# Ordinal encoding of family history; the raw data contains two spellings
# of the aunt/uncle category, both mapped to 1.
dict_糖尿病家族史 = {
    '无记录': 0,
    '叔叔或姑姑有一方患有糖尿病': 1,
    '叔叔或者姑姑有一方患有糖尿病': 1,
    '父母有一方患有糖尿病': 2
}
train_df_lg_x['糖尿病家族史'] = train_df_lg_x['糖尿病家族史'].map(dict_糖尿病家族史)
test_df_lg_x['糖尿病家族史'] = test_df_lg_x['糖尿病家族史'].map(dict_糖尿病家族史)

# Impute missing diastolic pressure with 89 (author's constant —
# presumably near the column mean; TODO confirm against the data).
train_df_lg_x['舒张压'] = train_df_lg_x['舒张压'].fillna(89)
test_df_lg_x['舒张压'] = test_df_lg_x['舒张压'].fillna(89)

train_df_lg_y = train_df['患有糖尿病标识']
train_x = train_df_lg_x.values
train_y = train_df_lg_y.values

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

# Fit a plain logistic-regression baseline on the full training set.
clf = LogisticRegression(C=1, random_state=3).fit(train_x, train_y)

# Evaluate on the training data itself (optimistic, but a quick sanity check).
train_pred = clf.predict(train_x)
result = confusion_matrix(train_y, train_pred)
print('查看结果', result)

# sklearn metric signature is (y_true, y_pred); the original call had the
# arguments swapped, which silently reports recall instead of precision.
precision = precision_score(train_y, train_pred)
print('模型准确度为', precision)
# F1 is the harmonic mean of precision and recall, so it is symmetric in
# the argument order and the original swap did not affect this value.
print('模型f1为', f1_score(train_y, train_pred))
查看结果 [[2770 364]
         [585 1351]]
模型准确度为 0.6978305785123967
模型f1为 0.740071213366201

划分数据集,调参优化

from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation split.
train_x, vali_x, train_y, vali_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=31)

# C=0.07 with an L2 penalty, saga solver, evaluated on the validation split.
# (The original comment claimed C=0.05; and l1_ratio was passed although it
# only applies to penalty='elasticnet' — it was ignored, so it is dropped.)
clf3 = LogisticRegression(C=0.07, penalty='l2', solver='saga',
                          random_state=22).fit(train_x, train_y)
# (y_true, y_pred) order — the original call was swapped, reporting recall.
precision3 = precision_score(vali_y, clf3.predict(vali_x))
print('模型准确度为', precision3)


# Stronger regularisation: C=0.05 with the default solver.
clf2 = LogisticRegression(C=0.05, random_state=3).fit(train_x, train_y)

precision2 = precision_score(vali_y, clf2.predict(vali_x))
print('模型准确度为', precision2)
# NOTE(review): the pasted result values in the original post were produced
# with the swapped argument order and will differ after this fix.

# F1 on the validation split (symmetric in argument order).
f1_score(vali_y, clf2.predict(vali_x))

特征工程----------------

# Per-sex mean BMI and mean diastolic pressure, computed separately for the
# train and test sets; used below to build "difference from the sex average"
# features.
def _mean_by_sex(df, col, sex):
    # Mean of `col` over rows whose 性别 code equals `sex`
    # (1 appears to be male, 0 female — assumed from usage; confirm).
    return df.loc[df['性别'] == sex, col].mean()

man_height_gap = _mean_by_sex(train_df, '体重指数', 1)
women_height_gap = _mean_by_sex(train_df, '体重指数', 0)
man_stress_gap = _mean_by_sex(train_df, '舒张压', 1)
women_stress_gap = _mean_by_sex(train_df, '舒张压', 0)

man_height_gap2 = _mean_by_sex(test_df, '体重指数', 1)
women_height_gap2 = _mean_by_sex(test_df, '体重指数', 0)
man_stress_gap2 = _mean_by_sex(test_df, '舒张压', 1)
women_stress_gap2 = _mean_by_sex(test_df, '舒张压', 0)
  • 计算每个患者与每个性别平均值的差异
  • # 处理训练集
    # --- Feature engineering on independent copies -------------------------
    # .copy() keeps the raw frames intact (the original aliased train_df and
    # mutated it in place).  All masked writes use .loc[mask, col]; the
    # original df[col].loc[mask] = ... form is chained indexing, which pandas
    # warns about and which may silently write into a temporary.
    train_clean = train_df.copy()
    train_clean['体重差异'] = 0
    train_clean['舒张压差异'] = 0
    # Difference from the per-sex averages computed earlier.
    m0 = train_clean['性别'] == 0
    m1 = train_clean['性别'] == 1
    train_clean.loc[m0, '体重差异'] = train_clean.loc[m0, '体重指数'] - women_height_gap
    train_clean.loc[m1, '体重差异'] = train_clean.loc[m1, '体重指数'] - man_height_gap
    train_clean.loc[m0, '舒张压差异'] = train_clean.loc[m0, '舒张压'] - women_stress_gap
    train_clean.loc[m1, '舒张压差异'] = train_clean.loc[m1, '舒张压'] - man_stress_gap

    # Same engineered features for the test set, using test-set averages.
    test_clean = test_df.copy()
    test_clean['体重差异'] = 0
    test_clean['舒张压差异'] = 0
    t0 = test_clean['性别'] == 0
    t1 = test_clean['性别'] == 1
    test_clean.loc[t0, '体重差异'] = test_clean.loc[t0, '体重指数'] - women_height_gap2
    test_clean.loc[t1, '体重差异'] = test_clean.loc[t1, '体重指数'] - man_height_gap2
    test_clean.loc[t0, '舒张压差异'] = test_clean.loc[t0, '舒张压'] - women_stress_gap2
    test_clean.loc[t1, '舒张压差异'] = test_clean.loc[t1, '舒张压'] - man_stress_gap2

    # Rows with missing 舒张压 produced NaN differences above; zero them,
    # as in the original (this runs before 舒张压 itself is imputed).
    train_clean['舒张压差异'] = train_clean['舒张压差异'].fillna(0)
    test_clean['舒张压差异'] = test_clean['舒张压差异'].fillna(0)

    # Ordinal encoding of family history (two spellings map to code 1).
    dict_糖尿病家族史 = {
        '无记录': 0,
        '叔叔或姑姑有一方患有糖尿病': 1,
        '叔叔或者姑姑有一方患有糖尿病': 1,
        '父母有一方患有糖尿病': 2
    }
    train_clean['糖尿病家族史'] = train_clean['糖尿病家族史'].map(dict_糖尿病家族史)
    # Impute missing diastolic pressure with the author's constant 89.
    # (The original repeated this line three times; once is enough.)
    train_clean['舒张压'] = train_clean['舒张压'].fillna(89)

    # 口服耐糖量测试: -1 is a sentinel for "missing"; replace with the
    # per-sex mean, rounded to 3 decimals as in the original.  Note the mean
    # is taken over the raw column, sentinels included (original behavior).
    mean_man_antisuger = round(train_clean.loc[train_clean['性别'] == 1, '口服耐糖量测试'].mean(), 3)
    mean_women_antisuger = round(train_clean.loc[train_clean['性别'] == 0, '口服耐糖量测试'].mean(), 3)
    train_clean.loc[(train_clean['性别'] == 0) & (train_clean['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_women_antisuger
    train_clean.loc[(train_clean['性别'] == 1) & (train_clean['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_man_antisuger

    # 肱三头肌皮褶厚度: 0 is a sentinel for "missing"; same per-sex mean fill.
    mean_man_skin = round(train_clean.loc[train_clean['性别'] == 1, '肱三头肌皮褶厚度'].mean(), 3)
    mean_women_skin = round(train_clean.loc[train_clean['性别'] == 0, '肱三头肌皮褶厚度'].mean(), 3)
    train_clean.loc[(train_clean['性别'] == 0) & (train_clean['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_women_skin
    train_clean.loc[(train_clean['性别'] == 1) & (train_clean['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_man_skin

    # Convert birth year to age (reference year 2022).
    train_clean['出生年份'] = 2022 - train_clean['出生年份']

    # --- Same cleaning applied to the test set -----------------------------
    test_clean['糖尿病家族史'] = test_clean['糖尿病家族史'].map(dict_糖尿病家族史)
    test_clean['舒张压'] = test_clean['舒张压'].fillna(89)

    mean_man_antisuger = round(test_clean.loc[test_clean['性别'] == 1, '口服耐糖量测试'].mean(), 3)
    mean_women_antisuger = round(test_clean.loc[test_clean['性别'] == 0, '口服耐糖量测试'].mean(), 3)
    test_clean.loc[(test_clean['性别'] == 0) & (test_clean['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_women_antisuger
    test_clean.loc[(test_clean['性别'] == 1) & (test_clean['口服耐糖量测试'] == -1), '口服耐糖量测试'] = mean_man_antisuger

    mean_man_skin = round(test_clean.loc[test_clean['性别'] == 1, '肱三头肌皮褶厚度'].mean(), 3)
    mean_women_skin = round(test_clean.loc[test_clean['性别'] == 0, '肱三头肌皮褶厚度'].mean(), 3)
    test_clean.loc[(test_clean['性别'] == 0) & (test_clean['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_women_skin
    test_clean.loc[(test_clean['性别'] == 1) & (test_clean['肱三头肌皮褶厚度'] == 0), '肱三头肌皮褶厚度'] = mean_man_skin

    test_clean['出生年份'] = 2022 - test_clean['出生年份']
    
    

    在上述基础上将训练集20%划分为验证集,使用逻辑回归完成训练

  • train_x_clean = train_clean[['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度', '体重差异', '舒张压差异']]
    train_y_clean = train_clean['患有糖尿病标识']
    test_x_clean = test_clean[['性别', '出生年份', '体重指数', '糖尿病家族史', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度', '体重差异', '舒张压差异']]
    train_X,val_X,train_y,val_y = train_test_split(train_x_clean, train_y_clean, test_size=0.2,random_state=3)
    
    clf = LogisticRegression(C=1,random_state=3)
    clf.fit(train_X,train_y)
    prediction_lr = clf.predict(val_X)
    
    print('\nF1 score:',f1_score(val_y,prediction_lr))
    F1 score: 0.7385019710906702
  • 精度有提高到73.85%

  • 任务5:特征筛选-------------

  • 使用树模型完成模型的训练,通过特征重要性筛选出Top5的特征
  • from sklearn.tree import DecisionTreeClassifier
    model_tree = DecisionTreeClassifier()
    model_tree.fit(train_X,train_y)
    
    
    features_import = pd.DataFrame(train_x.columns, columns=['feature'])
    features_import['importance'] = model_tree.feature_importances_ 
    features_import.sort_values('importance', inplace=True,ascending=False)
    
    features_import

  • 使用筛选出的特征和逻辑回归进行训练
  • train_x2 = train_clean[['体重指数', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度']]
    train_y2 = train_clean['患有糖尿病标识']
    test_x2 = test_clean[['体重指数', '舒张压', '口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度']]
    train_X2,val_X2,train_y2,val_y2 = train_test_split(train_x2, train_y2, test_size=0.2,random_state=3)
    
    clf = LogisticRegression(C=1,random_state=3)
    clf.fit(train_X2,train_y2)
    prediction_lr = clf.predict(val_X2)
    
    print('\nF1 score:',f1_score(val_y2,prediction_lr))
    
     
    F1 score: 0.7347480106100797
    
  • F1-score略有下降
  • 特征减少后,模型可拟合的特征维度降低,准确率略有下降是合理的
  • 任务6:高阶树模型------------------

  • 将训练集20%划分为验证集,使用LightGBM完成训练
  • 尝试调节搜索LightGBM的参数
  • 将步骤4调参之后的模型重新训练,将最新预测的结果文件提交到比赛
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV

    # Grid-search learning rate and tree count for LightGBM.
    param_grid = {
        'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [20, 100, 1000, 2000, 3000, 4000]}

    # BUG FIX: the original passed both verbose=-1 and verbosity=-1; they are
    # aliases for the same LightGBM parameter and trigger a conflict warning,
    # so only one is kept.  n_estimators / learning_rate set here are just
    # defaults — GridSearchCV overrides them from param_grid.
    clf_lgb = lgb.LGBMClassifier(
        max_depth=3,
        n_estimators=4000,
        n_jobs=-1,
        verbose=-1,
        learning_rate=0.1,
    )
    clf_lgb = GridSearchCV(clf_lgb, param_grid)
    clf_lgb.fit(train_X, train_y)
    prediction_lr = clf_lgb.predict(val_X)
    print('\nF1 score:', f1_score(val_y, prediction_lr))
    
    F1 score: 0.9529860228716646
  • K-fold训练
  • from sklearn.model_selection import KFold
    
    from sklearn.metrics import accuracy_score
    
    def run_model_cv(model, kf, X_tr, y, X_te, cate_col=None):
        train_pred = np.zeros( (len(X_tr), len(np.unique(y))) )
        test_pred = np.zeros( (len(X_te), len(np.unique(y))) )
    
        cv_clf = []
        for tr_idx, val_idx in kf.split(X_tr, y):
            x_tr = X_tr.iloc[tr_idx]; y_tr = y.iloc[tr_idx]
    
            x_val = X_tr.iloc[val_idx]; y_val = y.iloc[val_idx]
    
            call_back = [
                lgb.early_stopping(50),
            ]
            eval_set = [(x_val, y_val)]
            model.fit(x_tr, y_tr, eval_set=eval_set, callbacks=call_back, verbose=-1)
    
            cv_clf.append(model)
    
            train_pred[val_idx] = model.predict_proba(x_val)
            test_pred += model.predict_proba(X_te)
    
        test_pred /= kf.n_splits
        return train_pred, test_pred, cv_clf
    
    
    #数据集拆分
    train_x_lgb,val_x_lgb,train_y_lgb,val_y_lgb = train_test_split(train_clean, train_clean, test_size=0.2,random_state=3)
    
    #数据集拆分
    train_pred, test_pred, cv_clf = run_model_cv(
        clf_lgb, 
        KFold(n_splits=5),
        train_x_lgb.drop(['编号','患有糖尿病标识'],axis=1),
        train_y_lgb['患有糖尿病标识'],
        val_x_lgb.drop(['编号','患有糖尿病标识'],axis=1),)
    
    test_pred_1 = [ i.argmax() for i in test_pred]
    print('准确率',accuracy_score(test_pred_1,val_y))
    print('\nF1 score:',f1_score(val_y,test_pred_1))
    
    
    准确率 0.965483234714004
    
    F1 score: 0.9557522123893806

任务7:stacking---------------


# --- Stacking ensemble ------------------------------------------------------
# 50/50 split: first-level models are stacked via 6-fold CV on X_train and a
# logistic regression serves as the second-level (meta) model.
X_train, X_test, y_train, y_test = train_test_split(
    train_clean.drop(['编号', '患有糖尿病标识'], axis=1),
    train_clean['患有糖尿病标识'], test_size=0.5)

# First-level models.  GBDT/RF/ET/ADA are aliases imported elsewhere in the
# post — presumably GradientBoosting/RandomForest/ExtraTrees/AdaBoost
# classifiers; confirm against the missing import cell.
clfs = [GBDT(n_estimators=100),
        RF(n_estimators=100),
        ET(n_estimators=100),
        ADA(n_estimators=100)
]
X_train_stack = np.zeros((X_train.shape[0], len(clfs)))
X_test_stack = np.zeros((X_test.shape[0], len(clfs)))

# 6-fold stacking
n_folds = 6
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1)
for i, clf in enumerate(clfs):
    X_stack_test_n = np.zeros((X_test.shape[0], n_folds))

    for j, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        tr_x = X_train.iloc[train_index]
        tr_y = y_train.iloc[train_index]
        clf.fit(tr_x, tr_y)
        # Out-of-fold predictions become the level-2 training features.
        X_train_stack[test_index, i] = clf.predict(X_train.iloc[test_index])
        X_stack_test_n[:, j] = clf.predict(X_test)

    # BUG FIX: in the original this line was dedented out of the classifier
    # loop, so only the last model's column of X_test_stack was ever filled
    # (the rest stayed zero); it must run once per classifier i.
    X_test_stack[:, i] = X_stack_test_n.mean(axis=1)

# Second-level model: logistic regression on the stacked predictions.
clf_second = LogisticRegression(solver="lbfgs")
clf_second.fit(X_train_stack, y_train)
pred = clf_second.predict_proba(X_test_stack)[:, 1]
print('ROC', roc_auc_score(y_test, pred))

最后提升至 0.9613149971845755

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值