智算之道 - Hepatitis Prediction

  1. Data processing
    Columns with only a few missing values are filled with the mode or the mean; the heavily missing column (ALF) is estimated with a random forest (a condensed sketch of this strategy appears just before the full code below).

  2. Model training
    CatBoost is used for the prediction; the code follows:

Final preliminary-round result: top 1% out of roughly 1.1k teams
Accuracy: 0.85130823
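
For orientation, here is a condensed sketch of the filling strategy from step 1, written against the same column names; it is illustrative rather than the exact notebook code, which follows in full below:

# Illustrative sketch only: lightly-missing columns are filled with per-region means or modes;
# the heavily-missing ALF column is imputed later with a RandomForestRegressor.
import pandas as pd

def quick_fill(df):
    # per-region mean for height and weight, falling back to the overall mean
    for col in ['身高', '体重']:
        df[col] = df.groupby('区域')[col].transform(lambda s: s.fillna(s.mean()))
        df[col] = df[col].fillna(df[col].mean())
    # mode for binary/categorical flags
    for col in ['高血压', '肥胖腰围']:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df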

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
#Ignore RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility

import pandas as pd
import numpy as np 
from pandas import Series,DataFrame

path = '/home/kesci/data/competition_A/'
data_train = pd.read_csv(path + 'train_set.csv', engine='python', encoding='UTF-8')
data_test  = pd.read_csv(path + 'test_set.csv')
data_train.info()
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
data_train.head()
data_train.describe()
data_train.shape[0]
# mean 身高 (height) per region, used below to fill missing heights
data_shengao_west = data_train['身高'][data_train['区域']=='west'].mean()
data_shengao_west
data_shengao_east = data_train['身高'][data_train['区域']=='east'].mean()
data_shengao_east
data_shengao_north = data_train['身高'][data_train['区域']=='north'].mean()
data_shengao_north
data_shengao_south = data_train['身高'][data_train['区域']=='south'].mean()
data_shengao_south
# fill missing 身高 with the corresponding region's mean
for i in range(6000):
    if(data_train['区域'].iloc[i]=='west' ):
        if pd.isnull(data_train.at[i,'身高']):
            data_train.at[i,'身高']=data_shengao_west
    if(data_train['区域'].iloc[i]=='east'):
        if pd.isnull(data_train.at[i,'身高']):
            data_train.at[i,'身高']=data_shengao_east
    if(data_train['区域'].iloc[i]=='north'):
        if pd.isnull(data_train.at[i,'身高']):
            data_train.at[i,'身高']=data_shengao_north
    if(data_train['区域'].iloc[i]=='south'):
        if pd.isnull(data_train.at[i,'身高']):
            data_train.at[i,'身高']=data_shengao_south
data_train['身高'][data_train['性别']=='F'].mean()
data_train['身高'][data_train['性别']=='M'].mean()
# mean 体重 (weight) per region
data_weight_west = data_train['体重'][data_train['区域']=='west'].mean()
data_weight_west
data_weight_east = data_train['体重'][data_train['区域']=='east'].mean()
data_weight_east
data_weight_north = data_train['体重'][data_train['区域']=='north'].mean()
data_weight_north
data_weight_south = data_train['体重'][data_train['区域']=='south'].mean()
data_weight_south

# fill missing 体重 with the corresponding region's mean
for i in range(6000):
    if(data_train['区域'].iloc[i]=='west' ):
        if pd.isnull(data_train.at[i,'体重']):
            data_train.at[i,'体重']=data_weight_west
    if(data_train['区域'].iloc[i]=='east'):
        if pd.isnull(data_train.at[i,'体重']):
            data_train.at[i,'体重']=data_weight_east
    if(data_train['区域'].iloc[i]=='north'):
        if pd.isnull(data_train.at[i,'体重']):
            data_train.at[i,'体重']=data_weight_north
    if(data_train['区域'].iloc[i]=='south'):
        if pd.isnull(data_train.at[i,'体重']):
            data_train.at[i,'体重']=data_weight_south

# fill missing 体重指数 (BMI) with a simple height-based heuristic, split by gender
for i in range(6000):
    if(data_train['性别'].iloc[i]=='F' ):
        if pd.isnull(data_train.at[i,'体重指数']):
            data_train.at[i,'体重指数'] = (data_train.at[i,'身高']-80)*0.7
    if(data_train['性别'].iloc[i]=='M' ):
        if pd.isnull(data_train.at[i,'体重指数']):
            data_train.at[i,'体重指数'] = (data_train.at[i,'身高']-70)*0.6
# flag 肥胖腰围 (obese waist) as 1 when BMI exceeds 30, otherwise 0
for i in range(6000):
    if pd.isnull(data_train.at[i,'肥胖腰围']):
        if(data_train['体重指数'].iloc[i] > 30 ):
            data_train.at[i,'肥胖腰围']=1.0
        else:
            data_train.at[i,'肥胖腰围']=0.0
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD','家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳','糖尿病','教育','未婚','收入','肥胖腰围','性别','区域','护理来源','视力不佳','饮酒','高血压','肝炎']

# keep the raw 区域 strings so they can still be compared after label encoding
region_raw = data_train['区域'].astype(str).copy()
for i in tqdm(str_columns):  # tqdm shows a progress bar
    lbl = LabelEncoder()
    data_train[i] = lbl.fit_transform(data_train[i].astype(str))
data_train['高血压'] = data_train['高血压'].fillna(data_train['高血压'].mode()[0])
# heuristic fills from the original post: assume ALF = 0 when 高血压 is 0 or the region is west
for i in range(6000):
    if pd.isnull(data_train.at[i, 'ALF']):
        if data_train['高血压'].iloc[i] == 0.0:
            data_train.at[i, 'ALF'] = 0.0
for i in range(6000):
    if pd.isnull(data_train.at[i, 'ALF']):
        if region_raw.iloc[i] == 'west':  # 区域 is already encoded, so compare the raw strings
            data_train.at[i, 'ALF'] = 0.0
data_train.info()

pd.set_option('max_columns',1000)
pd.set_option('max_row',300)
pd.set_option('display.float_format', lambda x:'%.5f' % x)
data_test.head()
data_test.describe()
data_test.shape[0]
# repeat the same filling steps for the test set
data_shengao_west = data_test['身高'][data_test['区域']=='west'].mean()
data_shengao_west
data_shengao_east = data_test['身高'][data_test['区域']=='east'].mean()
data_shengao_east
data_shengao_north = data_test['身高'][data_test['区域']=='north'].mean()
data_shengao_north
data_shengao_south = data_test['身高'][data_test['区域']=='south'].mean()
data_shengao_south
for i in range(2785):
    if(data_test['区域'].iloc[i]=='west' ):
        if pd.isnull(data_test.at[i,'身高']):
            data_test.at[i,'身高']=data_shengao_west
    if(data_test['区域'].iloc[i]=='east'):
        if pd.isnull(data_test.at[i,'身高']):
            data_test.at[i,'身高']=data_shengao_east
    if(data_test['区域'].iloc[i]=='north'):
        if pd.isnull(data_test.at[i,'身高']):
            data_test.at[i,'身高']=data_shengao_north
    if(data_test['区域'].iloc[i]=='south'):
        if pd.isnull(data_test.at[i,'身高']):
            data_test.at[i,'身高']=data_shengao_south
data_test['身高'][data_test['性别']=='F'].mean()
data_test['身高'][data_test['性别']=='M'].mean()
data_weight_west = data_test['体重'][data_test['区域']=='west'].mean()
data_weight_west
data_weight_east = data_test['体重'][data_test['区域']=='east'].mean()
data_weight_east
data_weight_north = data_test['体重'][data_test['区域']=='north'].mean()
data_weight_north
data_weight_south = data_test['体重'][data_test['区域']=='south'].mean()
data_weight_south

for i in range(2785):
    if(data_test['区域'].iloc[i]=='west' ):
        if pd.isnull(data_test.at[i,'体重']):
            data_test.at[i,'体重']=data_weight_west
    if(data_test['区域'].iloc[i]=='east'):
        if pd.isnull(data_test.at[i,'体重']):
            data_test.at[i,'体重']=data_weight_east
    if(data_test['区域'].iloc[i]=='north'):
        if pd.isnull(data_test.at[i,'体重']):
            data_test.at[i,'体重']=data_weight_north
    if(data_test['区域'].iloc[i]=='south'):
        if pd.isnull(data_test.at[i,'体重']):
            data_test.at[i,'体重']=data_weight_south

for i in range(2785):
    if(data_test['性别'].iloc[i]=='F' ):
        if pd.isnull(data_test.at[i,'体重指数']):
            data_test.at[i,'体重指数'] = (data_test.at[i,'身高']-80)*0.7
    if(data_test['性别'].iloc[i]=='M' ):
        if pd.isnull(data_test.at[i,'体重指数']):
            data_test.at[i,'体重指数'] = (data_test.at[i,'身高']-70)*0.6
for i in range(2785):    
    if pd.isnull(data_test.at[i,'肥胖腰围']):
        if(data_test['体重指数'].iloc[i] > 30 ):
            data_test.at[i,'肥胖腰围']=1.0
        else:
            data_test.at[i,'肥胖腰围']=0.0
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD','家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳','糖尿病','教育','未婚','收入','肥胖腰围','性别','区域','护理来源','视力不佳','饮酒','高血压']

# keep the raw 区域 strings for the test set as well
region_raw_test = data_test['区域'].astype(str).copy()
for i in tqdm(str_columns):  # tqdm shows a progress bar
    lbl = LabelEncoder()
    data_test[i] = lbl.fit_transform(data_test[i].astype(str))
data_test['高血压'] = data_test['高血压'].fillna(data_test['高血压'].mode()[0])
# same heuristic ALF fills as for the training set
for i in range(2785):
    if pd.isnull(data_test.at[i, 'ALF']):
        if data_test['高血压'].iloc[i] == 0.0:
            data_test.at[i, 'ALF'] = 0.0
for i in range(2785):
    if pd.isnull(data_test.at[i, 'ALF']):
        if region_raw_test.iloc[i] == 'west':  # 区域 is already encoded here too
            data_test.at[i, 'ALF'] = 0.0
data_test.info()
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD','家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳','糖尿病','教育','未婚','收入','肥胖腰围','性别','区域','护理来源','视力不佳','饮酒','高血压','肝炎']
for i in str_columns:
    lbl = LabelEncoder()
    data_train[i] = lbl.fit_transform(data_train[i].astype(str))
# fill any remaining gaps: mode for categorical/flag columns, mean for numeric columns
for s in str_columns:
    data_train[s].fillna(data_train[s].mode()[0], inplace=True)
for n in num_columns:
    data_train[n].fillna(data_train[n].mean(), inplace=True)
for n in zero_to_one_columns:
    data_train[n].fillna(data_train[n].mode()[0], inplace=True)
data_train.info()
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD','家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳','糖尿病','教育','未婚','收入','肥胖腰围','性别','区域','护理来源','视力不佳','饮酒','高血压']
for i in str_columns:
    lbl = LabelEncoder()
    data_test[i] = lbl.fit_transform(data_test[i].astype(str))
# same gap filling for the test set
for s in str_columns:
    data_test[s].fillna(data_test[s].mode()[0], inplace=True)
for n in num_columns:
    data_test[n].fillna(data_test[n].mean(), inplace=True)
for n in zero_to_one_columns:
    data_test[n].fillna(data_test[n].mode()[0], inplace=True)
data_test.info()

from sklearn.ensemble import RandomForestRegressor
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','体育活动']
zero_to_one_columns = ['血脂异常','PVD','家庭高血压', '家族糖尿病']
str_columns = ['家族肝炎', '慢性疲劳','糖尿病','肥胖腰围','性别','区域','饮酒','高血压']

n = ['ALF'] + num_columns + zero_to_one_columns + str_columns
print(n)
### Use a RandomForestRegressor to fill in the missing ALF values
def set_missing_alf(df):

    # take the selected feature columns to feed into a random forest regressor
    alf_df = df[n]

    # split into rows with known ALF and rows with missing ALF
    known_alf = alf_df[alf_df.ALF.notnull()].values
    unknown_alf = alf_df[alf_df.ALF.isnull()].values

    # y is the ALF target
    y = known_alf[:, 0]
    # X holds the remaining feature values
    X = known_alf[:, 1:]

    # fit a RandomForestRegressor on the rows where ALF is known
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict ALF for the rows where it is missing
    predicted_alf = rfr.predict(unknown_alf[:, 1:])

    # fill the original missing values with the predictions
    df.loc[df['ALF'].isnull(), 'ALF'] = predicted_alf

    return df

data_train = set_missing_alf(data_train)
data_test = set_missing_alf(data_test)

import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

%matplotlib inline
warnings.filterwarnings('ignore')
# pd.set_option('display.max_rows',None)
# pd.set_option('display.max_columns',None)


submission  =  pd.read_csv(path+'submission_example.csv') 
data_train.head()

# Label-encode the categorical columns; fit each encoder on the combined
# train + test values so both sets share the same mapping
for i in str_columns:
    lbl = LabelEncoder()
    lbl.fit(pd.concat([data_train[i], data_test[i]]).astype(str))
    data_train[i] = lbl.transform(data_train[i].astype(str))
    data_test[i] = lbl.transform(data_test[i].astype(str))
# Scale the numeric columns to [0, 1]; fit the scaler on the training set
# and apply the same transform to the test set
scaler = MinMaxScaler()
data_train[num_columns] = scaler.fit_transform(data_train[num_columns])
data_test[num_columns] = scaler.transform(data_test[num_columns])


columns = [i for i in data_train.columns if i not in ['肝炎','ID']]

train_x,train_y = data_train[columns].values,data_train['肝炎'].values
test_x  = data_test[columns].values
submission['hepatitis'] = 0


# 6-fold stratified cross-validation; the test probabilities from each fold are averaged
kfold = StratifiedKFold(n_splits=6, shuffle=False)
model = CatBoostClassifier(
    iterations=600,       # tunable
    learning_rate=0.04,   # tunable
    loss_function='Logloss'
    )
for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]
    model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)
    submission['hepatitis'] += model.predict_proba(test_x)[:, 1] / 6
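
The post stops after the fold loop; a minimal sketch of writing out the averaged probabilities (the output filename below is an assumption, not from the original post):

# Save the averaged fold predictions in the submission format
# ('submission.csv' is an assumed file name)
submission.to_csv('submission.csv', index=False)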

I'm getting lazy, so there are no detailed comments this time. If anything is unclear, ping me in the comments (not that anyone necessarily reads this).