智算之道-肝炎预测

肝炎预测

1. 数据处理
对缺失较少的特征,使用众数或平均数填充;对缺失较多的特征(如 ALF),使用随机森林进行预测填充。

2. 模型训练
使用 CatBoost 进行预测,代码如下:

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
#Ignore RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

# Directory prefix for the competition data; not referenced again in the visible source.
path = '/home/kesci/data/competition_A/'
# NOTE(review): data_train is used here but never assigned in the visible
# source; the statement that loads it (presumably a read from `path`) seems to
# have been lost when the notebook was pasted -- confirm before running.
data_train.info()

import pandas as pd
# Widen pandas' console/notebook display limits and print floats with five
# decimals.  The fully-qualified option keys are required: abbreviated
# patterns such as 'max_columns' / 'max_row' also match the
# 'styler.render.*' options on pandas >= 1.4 and raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# Summary statistics of the training frame (shown when run as a notebook cell).
data_train.describe()

import pandas as pd
import numpy as np

# Display limits for the next cell.  Fully-qualified keys are used because the
# abbreviated patterns 'max_columns' / 'max_row' are ambiguous on
# pandas >= 1.4 (they also match 'styler.render.*') and raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
data_train.describe()
# Fill missing '身高' values in the training frame with the mean of the
# row's '区域' group.
#
# The original walked a hard-coded range(6000) and mixed positional (.iloc)
# with label (.at) access, so it only worked for a 6000-row frame with a
# default integer index.  The vectorised form below handles any length/index.

# Row count of the training frame (displayed in a notebook).
data_train.shape[0]

# Per-region means, taken before anything is filled.
data_shengao_west = data_train.loc[data_train['区域'] == 'west', '身高'].mean()
data_shengao_east = data_train.loc[data_train['区域'] == 'east', '身高'].mean()
data_shengao_north = data_train.loc[data_train['区域'] == 'north', '身高'].mean()
data_shengao_south = data_train.loc[data_train['区域'] == 'south', '身高'].mean()

_height_mean_by_region = {
    'west': data_shengao_west,
    'east': data_shengao_east,
    'north': data_shengao_north,
    'south': data_shengao_south,
}
# Rows whose region is not one of the four keys map to NaN and therefore stay
# unfilled, exactly as in the original loop.
data_train['身高'] = data_train['身高'].fillna(
    data_train['区域'].map(_height_mean_by_region)
)

# Exploratory: mean height per '性别' value (results are only displayed).
data_train['身高'][data_train['性别'] == 'F'].mean()
data_train['身高'][data_train['性别'] == 'M'].mean()
# Fill missing '体重' values in the training frame with the mean of the
# row's '区域' group.
#
# Replaces a loop over a hard-coded range(6000) that mixed .iloc and .at
# access; the vectorised fill works for any frame length and index.

# Per-region means, taken before anything is filled.
data_weight_west = data_train.loc[data_train['区域'] == 'west', '体重'].mean()
data_weight_east = data_train.loc[data_train['区域'] == 'east', '体重'].mean()
data_weight_north = data_train.loc[data_train['区域'] == 'north', '体重'].mean()
data_weight_south = data_train.loc[data_train['区域'] == 'south', '体重'].mean()

_weight_mean_by_region = {
    'west': data_weight_west,
    'east': data_weight_east,
    'north': data_weight_north,
    'south': data_weight_south,
}
# Regions outside the four keys map to NaN, so those rows remain unfilled.
data_train['体重'] = data_train['体重'].fillna(
    data_train['区域'].map(_weight_mean_by_region)
)

# Fill missing '体重指数' from '身高' using a per-'性别' linear formula, then
# fill missing '肥胖腰围' with 1.0 when '体重指数' > 30 and 0.0 otherwise.
#
# The original iterated a hard-coded range(6000) with mixed .iloc/.at access;
# boolean masks give the same result for any frame length and index.
#
# NOTE(review): (height - 80) * 0.7 / (height - 70) * 0.6 look like an
# ideal-body-weight heuristic rather than BMI (weight / height**2) -- the
# formulas are kept unchanged here; confirm units and intent.
_bmi_missing = data_train['体重指数'].isnull()

_female = _bmi_missing & (data_train['性别'] == 'F')
data_train.loc[_female, '体重指数'] = (data_train.loc[_female, '身高'] - 80) * 0.7

_male = _bmi_missing & (data_train['性别'] == 'M')
data_train.loc[_male, '体重指数'] = (data_train.loc[_male, '身高'] - 70) * 0.6

# A still-missing '体重指数' compares False against 30, so the flag becomes 0.0.
_flag_missing = data_train['肥胖腰围'].isnull()
data_train.loc[_flag_missing, '肥胖腰围'] = (
    data_train.loc[_flag_missing, '体重指数'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps.
num_columns = ['年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
               '好胆固醇', '坏胆固醇', '总胆固醇', '体育活动']
zero_to_one_columns = ['血脂异常', 'PVD', '家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳', '糖尿病', '教育', '未婚', '收入', '肥胖腰围',
               '性别', '区域', '护理来源', '视力不佳', '饮酒', '高血压', '肝炎']

# Fill missing '高血压' with its most frequent value.  This has to happen
# BEFORE label-encoding (astype(str) turns NaN into the literal 'nan' class),
# the result has to be assigned back, and mode() returns a Series, so its
# first element is used.  The original statement did none of these and was a
# no-op.
_hypertension_modes = data_train['高血压'].mode()
if not _hypertension_modes.empty:
    data_train['高血压'] = data_train['高血压'].fillna(_hypertension_modes.iloc[0])

# Remember the raw region label: once '区域' is integer-encoded below, a
# comparison with 'west' can never be true (the original rule was dead code).
_in_west = data_train['区域'] == 'west'

for col in tqdm(str_columns):  # tqdm shows a progress bar
    data_train[col] = LabelEncoder().fit_transform(data_train[col].astype(str))

# Rule-based ALF fill: a missing ALF becomes 0.0 when the encoded '高血压'
# is 0 (the first class in sorted order of the stringified values) or when
# the row's region is 'west'.  Remaining gaps are left for later imputation.
_alf_missing = data_train['ALF'].isnull()
_no_hypertension = data_train['高血压'] == 0.0
data_train.loc[_alf_missing & (_no_hypertension | _in_west), 'ALF'] = 0.0

data_train.info()

# Display limits.  Fully-qualified keys are required: the abbreviated
# patterns 'max_columns' / 'max_row' also match 'styler.render.*' on
# pandas >= 1.4 and raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Summary statistics and row count of the test frame (displayed in a notebook).
data_test.describe()
data_test.shape[0]

# Fill missing '身高' values in the test frame with the mean of the row's
# '区域' group.  Replaces a loop over a hard-coded range(2785) that mixed
# .iloc and .at access; the vectorised fill works for any length and index.
data_shengao_west = data_test.loc[data_test['区域'] == 'west', '身高'].mean()
data_shengao_east = data_test.loc[data_test['区域'] == 'east', '身高'].mean()
data_shengao_north = data_test.loc[data_test['区域'] == 'north', '身高'].mean()
data_shengao_south = data_test.loc[data_test['区域'] == 'south', '身高'].mean()

_height_mean_by_region = {
    'west': data_shengao_west,
    'east': data_shengao_east,
    'north': data_shengao_north,
    'south': data_shengao_south,
}
# Regions outside the four keys map to NaN, so those rows remain unfilled.
data_test['身高'] = data_test['身高'].fillna(
    data_test['区域'].map(_height_mean_by_region)
)

# Exploratory: mean height per '性别' value (results are only displayed).
data_test['身高'][data_test['性别'] == 'F'].mean()
data_test['身高'][data_test['性别'] == 'M'].mean()
# Fill missing '体重' values in the test frame with the mean of the row's
# '区域' group.
#
# Replaces a loop over a hard-coded range(2785) that mixed .iloc and .at
# access; the vectorised fill works for any frame length and index.

# Per-region means, taken before anything is filled.
data_weight_west = data_test.loc[data_test['区域'] == 'west', '体重'].mean()
data_weight_east = data_test.loc[data_test['区域'] == 'east', '体重'].mean()
data_weight_north = data_test.loc[data_test['区域'] == 'north', '体重'].mean()
data_weight_south = data_test.loc[data_test['区域'] == 'south', '体重'].mean()

_weight_mean_by_region = {
    'west': data_weight_west,
    'east': data_weight_east,
    'north': data_weight_north,
    'south': data_weight_south,
}
# Regions outside the four keys map to NaN, so those rows remain unfilled.
data_test['体重'] = data_test['体重'].fillna(
    data_test['区域'].map(_weight_mean_by_region)
)

# Fill missing '体重指数' from '身高' using a per-'性别' linear formula, then
# fill missing '肥胖腰围' with 1.0 when '体重指数' > 30 and 0.0 otherwise.
#
# The original iterated a hard-coded range(2785) with mixed .iloc/.at access;
# boolean masks give the same result for any frame length and index.
#
# NOTE(review): (height - 80) * 0.7 / (height - 70) * 0.6 look like an
# ideal-body-weight heuristic rather than BMI (weight / height**2) -- the
# formulas are kept unchanged here; confirm units and intent.
_bmi_missing = data_test['体重指数'].isnull()

_female = _bmi_missing & (data_test['性别'] == 'F')
data_test.loc[_female, '体重指数'] = (data_test.loc[_female, '身高'] - 80) * 0.7

_male = _bmi_missing & (data_test['性别'] == 'M')
data_test.loc[_male, '体重指数'] = (data_test.loc[_male, '身高'] - 70) * 0.6

# A still-missing '体重指数' compares False against 30, so the flag becomes 0.0.
_flag_missing = data_test['肥胖腰围'].isnull()
data_test.loc[_flag_missing, '肥胖腰围'] = (
    data_test.loc[_flag_missing, '体重指数'] > 30
).astype(float)
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps (no '肝炎' target in the
# test frame's list).
num_columns = ['年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
               '好胆固醇', '坏胆固醇', '总胆固醇', '体育活动']
zero_to_one_columns = ['血脂异常', 'PVD', '家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳', '糖尿病', '教育', '未婚', '收入', '肥胖腰围',
               '性别', '区域', '护理来源', '视力不佳', '饮酒', '高血压']

# Fill missing '高血压' with its most frequent value.  This has to happen
# BEFORE label-encoding (astype(str) turns NaN into the literal 'nan' class),
# the result has to be assigned back, and mode() returns a Series, so its
# first element is used.  The original statement did none of these and was a
# no-op.
_hypertension_modes = data_test['高血压'].mode()
if not _hypertension_modes.empty:
    data_test['高血压'] = data_test['高血压'].fillna(_hypertension_modes.iloc[0])

# Remember the raw region label: once '区域' is integer-encoded below, a
# comparison with 'west' can never be true (the original rule was dead code).
_in_west = data_test['区域'] == 'west'

for col in tqdm(str_columns):  # tqdm shows a progress bar
    data_test[col] = LabelEncoder().fit_transform(data_test[col].astype(str))

# Rule-based ALF fill: a missing ALF becomes 0.0 when the encoded '高血压'
# is 0 (the first class in sorted order of the stringified values) or when
# the row's region is 'west'.  Remaining gaps are left for later imputation.
_alf_missing = data_test['ALF'].isnull()
_no_hypertension = data_test['高血压'] == 0.0
data_test.loc[_alf_missing & (_no_hypertension | _in_west), 'ALF'] = 0.0

data_test.info()

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Column groups used by the preprocessing steps.
num_columns = ['年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
               '好胆固醇', '坏胆固醇', '总胆固醇', '体育活动']
zero_to_one_columns = ['血脂异常', 'PVD', '家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳', '糖尿病', '教育', '未婚', '收入', '肥胖腰围',
               '性别', '区域', '护理来源', '视力不佳', '饮酒', '高血压', '肝炎']

# Mode-fill the categorical and 0/1 columns of the training frame.
# Fixes relative to the original:
#   * Series.mode() returns a Series; passing it to fillna aligns on the
#     index and fills (at most) the first few rows, so its first element is
#     used as a scalar instead.
#   * the string columns are filled BEFORE label-encoding -- afterwards
#     astype(str) has already turned NaN into a 'nan' class and nothing is
#     left to fill.
#   * the result is assigned back rather than relying on chained
#     fillna(inplace=True), which does not update the frame under pandas
#     copy-on-write.
for col in str_columns + zero_to_one_columns:
    _modes = data_train[col].mode()
    if not _modes.empty:  # an all-NaN column has no mode; leave it untouched
        data_train[col] = data_train[col].fillna(_modes.iloc[0])

# Integer-encode the categorical columns via their string representation.
for col in str_columns:
    data_train[col] = LabelEncoder().fit_transform(data_train[col].astype(str))

# Mean-fill the numeric columns.
for col in num_columns:
    data_train[col] = data_train[col].fillna(data_train[col].mean())

data_train.info()
# Column groups used by the preprocessing steps (no '肝炎' target in the
# test frame's list).
num_columns = ['年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
               '好胆固醇', '坏胆固醇', '总胆固醇', '体育活动']
zero_to_one_columns = ['血脂异常', 'PVD', '家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳', '糖尿病', '教育', '未婚', '收入', '肥胖腰围',
               '性别', '区域', '护理来源', '视力不佳', '饮酒', '高血压']

# Mode-fill the categorical and 0/1 columns of the test frame.
# Fixes relative to the original:
#   * Series.mode() returns a Series; passing it to fillna aligns on the
#     index and fills (at most) the first few rows, so its first element is
#     used as a scalar instead.
#   * the string columns are filled BEFORE label-encoding -- afterwards
#     astype(str) has already turned NaN into a 'nan' class and nothing is
#     left to fill.
#   * the result is assigned back rather than relying on chained
#     fillna(inplace=True), which does not update the frame under pandas
#     copy-on-write.
for col in str_columns + zero_to_one_columns:
    _modes = data_test[col].mode()
    if not _modes.empty:  # an all-NaN column has no mode; leave it untouched
        data_test[col] = data_test[col].fillna(_modes.iloc[0])

# Integer-encode the categorical columns via their string representation.
for col in str_columns:
    data_test[col] = LabelEncoder().fit_transform(data_test[col].astype(str))

# Mean-fill the numeric columns.
for col in num_columns:
    data_test[col] = data_test[col].fillna(data_test[col].mean())

# The original printed data_train.info() here, although this cell only
# modifies data_test.
data_test.info()


from sklearn.ensemble import RandomForestRegressor
# Column groups fed to the ALF imputation below.
num_columns = [
    '年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
    '好胆固醇', '坏胆固醇', '总胆固醇', '体育活动',
]
zero_to_one_columns = ['血脂异常', 'PVD', '家庭高血压', '家族糖尿病']
str_columns = [
    '家族肝炎', '慢性疲劳', '糖尿病', '肥胖腰围',
    '性别', '区域', '饮酒', '高血压',
]

# (Original note here read "use PCA / principal component analysis"; no PCA
# step is present in this block.)

# 'ALF' comes first, followed by every feature column, in group order.
n = ['ALF', *num_columns, *zero_to_one_columns, *str_columns]
print(n)
### Impute the missing ALF values with a RandomForestRegressor
def set_missing_alf(df, columns=None):
    """Fill missing ``ALF`` values in *df* using a random-forest regressor.

    A ``RandomForestRegressor`` is fitted on the rows where ``ALF`` is
    present and its predictions are written into the rows where it is
    missing.  *df* is modified in place and also returned.

    Args:
        df: DataFrame containing ``'ALF'`` and every feature column.
        columns: Column names to use; ``'ALF'`` entries are ignored and the
            rest are the features.  Defaults to the module-level list ``n``
            (the original behaviour).

    Returns:
        The same DataFrame object, with missing ``ALF`` filled.

    Notes:
        * When no ``ALF`` value is missing, *df* is returned untouched; the
          original called ``predict`` on an empty array, which scikit-learn
          rejects.
        * Feature columns must be numeric and free of NaN for the fit to
          succeed -- this function does not check that.
    """
    if columns is None:
        columns = n  # module-level default: ['ALF', <feature columns...>]

    # Select features by name instead of relying on 'ALF' being column 0.
    feature_names = [c for c in columns if c != 'ALF']

    missing = df['ALF'].isnull()
    if not missing.any():
        return df

    # Split into rows with a known target and rows to be predicted.
    X_known = df.loc[~missing, feature_names].values
    y_known = df.loc[~missing, 'ALF'].values
    X_unknown = df.loc[missing, feature_names].values

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X_known, y_known)

    # Write the predictions back into the originally-missing cells.
    predicted_alf = rfr.predict(X_unknown)
    df.loc[missing, 'ALF'] = predicted_alf

    return df


data_train = set_missing_alf(data_train)
data_test = set_missing_alf(data_test)


import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

%matplotlib inline
warnings.filterwarnings('ignore')
# pd.set_option('display.max_rows',None)
# pd.set_option('display.max_columns',None)

# Label-encode the categorical columns.
# One encoder per column is fitted on the union of the train and test values
# so that a given value gets the same integer code in both frames.  (The
# original refitted the encoder on the test column, which yields different
# codes whenever the two frames do not contain the same set of values.)
for col in str_columns:
    _train_text = data_train[col].astype(str)
    _test_text = data_test[col].astype(str)
    _encoder = LabelEncoder().fit(pd.concat([_train_text, _test_text]))
    data_train[col] = _encoder.transform(_train_text)
    data_test[col] = _encoder.transform(_test_text)

# Min-max scale the numeric columns.
# The scaler is fitted on the training frame only and then applied to both
# frames, so train and test share one scale.  (The original fitted a second
# scaler on the test frame, mapping identical raw values to different scaled
# values.)
_scaler = MinMaxScaler().fit(data_train[num_columns])
data_train[num_columns] = _scaler.transform(data_train[num_columns])
data_test[num_columns] = _scaler.transform(data_test[num_columns])

# Every column except the target and the row identifier is a feature.
columns = [c for c in data_train.columns if c not in ['肝炎', 'ID']]

train_x, train_y = data_train[columns].values, data_train['肝炎'].values
test_x = data_test[columns].values
# NOTE(review): `submission` is not created anywhere in the visible source;
# it is presumably the sample-submission frame loaded earlier -- confirm.
submission['hepatitis'] = 0.0

# Stratified K-fold: fit on each fold and average the test-set probability
# of class 1 over the folds.
n_splits = 6
kfold = StratifiedKFold(n_splits=n_splits, shuffle=False)
model = CatBoostClassifier(
    iterations=600,      # tunable
    learning_rate=0.04,  # tunable
    loss_function='Logloss',
)
for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]
    model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)
    # Divide by the fold count (not a literal 6) so the average stays correct
    # if n_splits is changed.
    submission['hepatitis'] += model.predict_proba(test_x)[:, 1] / n_splits



04-24 463

01-25 21
02-23
10-08 9041
10-16 96
08-16 296