一、加载数据集:读取train与test,合并为all_data,然后删除train以释放内存(test保留,后面生成提交文件时需要其中的PassengerId)
二、手动筛选数据集特征并填充缺失值:Embarked用'S'填充,Age和Fare用均值填充;然后通过循环,对selected_features中类型为object的每一列,先把np.nan替换为字符串'0',再用LabelEncoder把该列的取值编码为整数标签(比如2,7,1,8等)
三、利用布尔索引得到X_train、X_test和y:Survived不为NaN的行作为训练集,Survived为NaN的行作为测试集,以此来区分
四、利用随机森林或XGB进行分类(由chose_mode选择),同时用五折交叉验证对模型进行评估,预测结果存放在submission/rfc_submission.csv或submission/xgbc_submission.csv中
代码部分
#_*_ coding:utf-8 _*_
# @Time : 2019/11/15 19:11
# @Author : xm_ai
# @Email : 793790994@qq.com
# @File : train_v1.py
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
from tqdm import tqdm
import os, sys
import numpy as np
import gc
# Load the two CSV files from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Stack train and test into a single frame with a fresh 0..n-1 index so the
# preprocessing further down is applied to both at once.
all_data = pd.concat([train, test], ignore_index=True)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train.info()
test.info()
# Only `train` is released. `test` is kept because its PassengerId column is
# read again when the submission file is built.
del train
gc.collect()
# Hand-picked feature columns used for modelling.
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Name']

# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on `all_data[col]` acts on an intermediate object
# and, with pandas Copy-on-Write, leaves `all_data` itself unchanged.
all_data['Embarked'] = all_data['Embarked'].fillna('S')
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].mean())
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].mean())

# Selected features that are stored with object dtype.
cat_col = [i for i in all_data.select_dtypes(object).columns if i in selected_features]
print(cat_col)
for col in tqdm(cat_col):
    # Replace the remaining NaN with the string '0' before encoding, so the
    # encoder is not handed a mix of float NaN and str values.
    all_data[col] = all_data[col].replace(np.nan, '0')
    # A fresh encoder per column maps each distinct value to an integer label.
    lbl = LabelEncoder()
    all_data[col] = lbl.fit_transform(all_data[col])
# Rows with a non-null 'Survived' value form the training set; the remaining
# rows form the prediction set (presumably the rows that came from test.csv --
# confirm that test.csv has no 'Survived' column).
tr_index = all_data['Survived'].notnull()
# Select columns with the ordered `selected_features` list instead of
# list(set(...)): set iteration order of strings changes between interpreter
# runs, which made the feature-matrix column order non-reproducible.
X_train = all_data.loc[tr_index, selected_features].reset_index(drop=True)
y = all_data.loc[tr_index, 'Survived'].reset_index(drop=True).astype(int)
X_test = all_data.loc[~tr_index, selected_features].reset_index(drop=True)
X_train.info()
rfc = RandomForestClassifier()
xgbc = XGBClassifier()
# 'rfc' selects the random forest; any other value selects XGBoost.
chose_mode = 'xgbc'
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before a submission file is written.
os.makedirs('submission', exist_ok=True)
if chose_mode == 'rfc':
    # --- Random forest ---
    # Report the mean 5-fold cross-validation score instead of discarding it.
    rfc_cv_score = cross_val_score(rfc, X_train, y, cv=5).mean()
    print('rfc 5-fold CV mean score:', rfc_cv_score)
    # Refit on the full training set, then predict the unlabeled rows.
    rfc.fit(X_train, y)
    rfc_y_predict = rfc.predict(X_test)
    rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
    rfc_submission.to_csv('submission/rfc_submission.csv', index=False)
else:
    # --- XGBoost ---
    # Report the mean 5-fold cross-validation score instead of discarding it.
    xgbc_cv_score = cross_val_score(xgbc, X_train, y, cv=5).mean()
    print('xgbc 5-fold CV mean score:', xgbc_cv_score)
    # Refit on the full training set, then predict the unlabeled rows.
    xgbc.fit(X_train, y)
    xgbc_y_predict = xgbc.predict(X_test)
    xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
    xgbc_submission.to_csv('submission/xgbc_submission.csv', index=False)
运行结果