# 天池大数据竞赛——天池精准医疗大赛人工智能辅助糖尿病遗传风险预测赛后总结

## 六、预测算法

### 1. LightGBM

#### 相关代码如下，仍需要根据实际应用做出相应的更改。

# coding:utf-8
#Author: chenhao
#date: Jan.22.2018
#Description: Tianchi Medical solution train dataset with Lightgbm, use the boxcox to smooth the dataset

import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Modern installs need `from sklearn.model_selection import KFold` together
# with the KFold(n_splits=...) call signature at the CV loop below.
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from scipy import stats

data_path = 'data/'

# Competition CSVs are GB2312 (simplified Chinese) encoded.
train = pd.read_csv(data_path + 'd_train_20180102.csv', encoding='gb2312')
test = pd.read_csv(data_path + 'd_test_A_20180102.csv', encoding='gb2312')

def make_feat(train, test):
    """Build model-ready feature frames from the raw train/test tables.

    Steps: concatenate both splits so shared encodings are applied
    identically, encode gender, turn the check-up date into a day offset,
    split back by id, drop id + the five mostly-missing hepatitis-B columns,
    median-impute, then strip gross outliers from the training split only.

    Parameters
    ----------
    train, test : pd.DataFrame
        Raw competition tables; both must contain an 'id' column.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Processed (train_feat, test_feat); only train_feat keeps '血糖'.
    """
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    # Merge so the gender mapping / date offset are applied uniformly.
    data = pd.concat([train, test])

    # '??' is a malformed gender token seen in the raw data; map it to 0.
    data['性别'] = data['性别'].map({'男': 1, '女': 0, '??': 0})
    # Days elapsed since 2017-09-10. pd.Timestamp gives the same result as
    # dateutil.parser.parse while avoiding the extra dependency.
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - pd.Timestamp('2017-9-10')).dt.days

    train_feat = data[data.id.isin(train_id)]
    test_feat = data[data.id.isin(test_id)]

    # Drop id and the hepatitis-B panel, which is mostly missing.
    sparse_cols = ['id', '乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体']
    train_feat = train_feat.drop(sparse_cols, axis=1)
    test_feat = test_feat.drop(sparse_cols, axis=1)

    # Median-impute remaining gaps. Assign instead of the original
    # fillna(..., inplace=True): train_feat/test_feat are slices of `data`,
    # and in-place mutation of a slice is unreliable (SettingWithCopy /
    # copy-on-write in modern pandas).
    train_feat = train_feat.fillna(train_feat.median(axis=0))
    test_feat = test_feat.fillna(test_feat.median(axis=0))

    # Remove gross outliers from the training split. Upper thresholds were
    # hand-tuned by the author; the '*碱性磷酸酶' cut is still tentative.
    upper_limits = [
        ('*r-谷氨酰基转换酶', 600),
        ('白细胞计数', 20.06),
        ('单核细胞%', 20),
        ('*碱性磷酸酶', 340),
        ('*球蛋白', 60),
        ('嗜酸细胞%', 20),
        ('*天门冬氨酸氨基转换酶', 300),
        ('血小板计数', 700),
        ('*总蛋白', 100),
    ]
    for col, limit in upper_limits:
        train_feat = train_feat.drop(train_feat[train_feat[col] > limit].index)
    # One specific corrupt reading is removed by exact value.
    train_feat = train_feat.drop(train_feat[train_feat['*丙氨酸氨基转换酶'] == 498.89].index)

    return train_feat, test_feat

'''
#对训练数据进行平滑处理
train_feat['甘油三酯'], a = stats.boxcox(train_feat['甘油三酯'])
train_feat['*r-谷氨酰基转换酶'], b = stats.boxcox(train_feat['*r-谷氨酰基转换酶'])
train_feat['白球比例'], c = stats.boxcox(train_feat['白球比例'])
train_feat['*天门冬氨酸氨基转换酶'], d = stats.boxcox(train_feat['*天门冬氨酸氨基转换酶'])

test_feat['甘油三酯'], a1 = stats.boxcox(test_feat['甘油三酯'])
test_feat['*r-谷氨酰基转换酶'], a1 = stats.boxcox(test_feat['*r-谷氨酰基转换酶'])
test_feat['白球比例'], a1 = stats.boxcox(test_feat['白球比例'])
test_feat['*天门冬氨酸氨基转换酶'], a1 = stats.boxcox(test_feat['*天门冬氨酸氨基转换酶'])
#train_feat['甘油三酯'] += 2
#test_feat['甘油三酯'] += 2
'''

# Build the processed feature frames for both splits.
train_feat, test_feat = make_feat(train, test)

# Every remaining column except the target ('血糖', blood glucose) is a predictor.
predictors = [f for f in test_feat.columns if f not in ['血糖']]

def evalerror(pred, df):
    """Custom LightGBM eval function: half mean-squared-error (the
    competition's scoring metric).

    Parameters
    ----------
    pred : array-like
        Model predictions for the evaluation set.
    df : lgb.Dataset
        Dataset whose labels are compared against `pred` (anything with a
        `get_label()` returning a pandas Series works).

    Returns
    -------
    tuple
        (metric name, score, is_higher_better=False) as LightGBM expects.
    """
    label = df.get_label().values.copy()
    # 0.5 * MSE, computed with numpy so this helper needs no sklearn import.
    score = float(np.mean((label - pred) ** 2)) * 0.5
    return ('0.5mse', score, False)

print('开始训练...')

# Hyper-parameters from the author's original tuning run.
params = {
'learning_rate': 0.01,
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'mse',
# NOTE(review): 'sub_feature', 'colsample_bytree' and 'feature_fraction' are
# all aliases of the same LightGBM parameter; only one value (0.7) takes
# effect. Kept as-is to avoid changing behavior.
'sub_feature': 0.7,
'num_leaves': 60,
'colsample_bytree': 0.7,
'feature_fraction': 0.7,
'min_data': 100,
'min_hessian': 1,
'verbose': -1,
}

print('开始CV 5折训练...')
t0 = time.time()
# Out-of-fold predictions for the train split; one test-prediction column
# per fold, averaged at the end for the submission.
train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 5))
# Bug fix: sklearn.cross_validation's KFold(len(x), n_folds=...) API was
# removed in scikit-learn 0.20. Import the model_selection replacement
# locally so this block is self-contained regardless of the top import.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf.split(train_feat)):
    print('第{}次训练...'.format(i))
    train_feat1 = train_feat.iloc[train_index]
    train_feat2 = train_feat.iloc[test_index]
    lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'], categorical_feature=['性别'])
    lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'])
    # NOTE(review): verbose_eval / early_stopping_rounds moved to callbacks
    # in LightGBM 4.x — adjust if running a recent version.
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=100)
    # Feature importances of the current fold (overwritten each iteration;
    # kept for interactive inspection).
    feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    train_preds[test_index] += gbm.predict(train_feat2[predictors])
    test_preds[:, i] = gbm.predict(test_feat[predictors])
print('线下得分：    {}'.format(mean_squared_error(train_feat['血糖'], train_preds) * 0.5))
print('CV训练用时{}秒'.format(time.time() - t0))

submission = pd.DataFrame({'pred': test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None,index=False, float_format='%.3f')

### 2. XGBoost

#### 相关代码如下，仍需要根据实际应用做出相应的更改。

# coding:utf-8
#Author: chenhao
#date: Jan.21.2018
#Description: Tianchi Medical solution train dataset with XGBoost; besides, use the boxcox to smooth the dataset

import time
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
# NOTE(review): MinMaxScaler is imported but never used in this script.
from sklearn.preprocessing import MinMaxScaler
from dateutil.parser import parse
from scipy import stats

data_path = 'data/'

# Competition CSVs are GB2312 (simplified Chinese) encoded.
train = pd.read_csv(data_path + 'd_train_20180102.csv', encoding='gb2312')
test = pd.read_csv(data_path + 'd_test_A_20180102.csv', encoding='gb2312')

# Encode gender numerically. Consistency fix: also map the malformed '??'
# token to 0 as the LightGBM script does, instead of letting it fall
# through to NaN and be median-imputed below.
train['性别'] = train['性别'].map({'男': 1, '女': 0, '??': 0})
test['性别'] = test['性别'].map({'男': 1, '女': 0, '??': 0})

# Convert the check-up date to "days since 2017-09-10".
train['体检日期'] = (pd.to_datetime(train['体检日期']) - parse('2017-9-10')).dt.days
test['体检日期'] = (pd.to_datetime(test['体检日期']) - parse('2017-9-10')).dt.days

# Impute missing values with each split's own column medians.
train.fillna(train.median(axis=0), inplace=True)
test.fillna(test.median(axis=0), inplace=True)

# Remove gross outliers from the training split (same hand-tuned thresholds
# as the LightGBM script; the '*碱性磷酸酶' cut is still tentative).
train = train.drop(train[train['*r-谷氨酰基转换酶'] > 600].index)
train = train.drop(train[train['白细胞计数'] > 20.06].index)
train = train.drop(train[train['*丙氨酸氨基转换酶'] == 498.89].index)
train = train.drop(train[train['单核细胞%'] > 20].index)
train = train.drop(train[train['*碱性磷酸酶'] > 340].index)
train = train.drop(train[train['*球蛋白'] > 60].index)
train = train.drop(train[train['嗜酸细胞%'] > 20].index)
train = train.drop(train[train['*天门冬氨酸氨基转换酶'] > 300].index)
train = train.drop(train[train['血小板计数'] > 700].index)
train = train.drop(train[train['*总蛋白'] > 100].index)

# Split features/target; drop id and the five mostly-missing hepatitis-B
# columns.
train_y = train['血糖']
train_x = train.drop(['id','血糖','乙肝表面抗原','乙肝表面抗体','乙肝核心抗体','乙肝e抗原','乙肝e抗体'], axis=1)
test_x = test.drop(['id','乙肝表面抗原','乙肝表面抗体','乙肝核心抗体','乙肝e抗原','乙肝e抗体'], axis=1)

# Wrap the matrices in XGBoost's native data structure.
train_out = xgb.DMatrix(train_x, label=train_y)
test_out = xgb.DMatrix(test_x)

# Single model (no CV here), hence one prediction column.
test_preds = np.zeros((test_x.shape[0], 1))

params = {'booster': 'gbtree',
          # NOTE(review): 'reg:linear' was renamed 'reg:squarederror' in
          # XGBoost >= 0.83; adjust if a recent XGBoost is installed.
          'objective': 'reg:linear',
          'eval_metric': 'mae',
          'gamma': 0.1,
          'min_child_weight': 1.3,
          'max_depth': 5,
          'lambda': 10,              # L2 regularisation weight
          'subsample': 0.71,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'eta': 0.01,               # learning rate
          'tree_method': 'exact',
          'seed': 0,
          }

# Report the training error every boosting round via the watchlist.
watchlist = [(train_out, 'train')]
model = xgb.train(params, train_out, num_boost_round=3000, evals=watchlist)

test_preds[:, 0] = model.predict(test_out)
print(test_preds)

submission = pd.DataFrame({'pred': test_preds.mean(axis=1)})
# Bug fix: the submission frame was built but never written to disk; save it
# with the same timestamped format as the LightGBM and Keras scripts.
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None, index=False, float_format='%.3f')

### 3. Keras

#### 相关代码如下，仍需要根据实际应用做出相应的更改。

# coding=utf-8
'''
Author: chenhao
date: Jan.25.2018
Description: Train a simple BP neural network (Keras) on the pre-processed polynomial-feature dataset train_Drop_Delete_Log_Poly_keras.csv
'''
import pandas as pd
import datetime
import numpy as np
from dateutil.parser import parse

modelfile = 'modelweight.model' # save path for the trained network weights

data_path = 'data/'

# Pre-processed feature files (dropped/log/polynomial features), GB2312-encoded.
train = pd.read_csv(data_path + 'train_Drop_Delete_Log_Poly_keras.csv', encoding='gb2312')
test = pd.read_csv(data_path + 'test_Drop_Delete_Log_Poly_keras.csv', encoding='gb2312')

# 对数据简单处理
train_y = train['BS']
train_x = train.drop(['BS'],axis=1)
test_x = test

test_preds = np.zeros((test_x.shape[0],1))

train_x = train_x.as_matrix()
train_y = train_y.as_matrix()
test_x = test_x.as_matrix()

# 3. Build and train a simple BP (fully-connected) neural network.
from keras.models import Sequential
from keras.layers.core import Dense, Activation
model = Sequential()  # sequential (layer-by-layer) model
# Bug fix: the original called fit() on an EMPTY, uncompiled Sequential
# model, which raises immediately. Give it a minimal hidden layer plus a
# linear output unit and compile for regression before training.
# NOTE(review): layer sizes are a reasonable default, not tuned — confirm
# against the author's intended architecture.
model.add(Dense(64, input_dim=train_x.shape[1]))
model.add(Activation('relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# 'nb_epoch' is the Keras 1.x spelling; 'epochs' since Keras 2.
model.fit(train_x, train_y, epochs=1000, batch_size=6)  # train for 1000 epochs
model.save_weights(modelfile)  # persist the trained weights

# 4. Predict and (optionally) de-normalise the results.
#x = ((data[feature] - data_mean[feature])/data_std[feature]).as_matrix()
#data[u'L1_pred'] = model.predict(x) * data_std['L1'] + data_mean['L1']

# Predict on the test matrix; Keras returns shape (n_samples, 1).
test_preds = model.predict(test_x)
print(test_preds)

# Average across columns (a single column here) and write the submission CSV
# with a timestamped filename, matching the other scripts' format.
submission = pd.DataFrame({'pred': test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None, index=False, float_format='%.3f')