Predicting Titanic Survival with the Random Forest Algorithm
1. Import the required third-party libraries
import sys
import pandas as pd  # data analysis
import numpy as np
import sklearn  # machine learning library
import random
import time
from sklearn import ensemble  # the random forest algorithm lives in the ensemble module
# preprocessing
from sklearn.preprocessing import LabelEncoder  # categorical encoding
from sklearn import feature_selection  # feature selection
from sklearn import model_selection  # model selection
from sklearn import metrics  # model evaluation
# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # built on matplotlib, more convenient plotting
# display plots inline in the notebook (a trailing comment on the magic line itself can raise a UsageError)
%matplotlib inline
2. Load the source datasets
data_raw = pd.read_csv('train.csv')
data_val = pd.read_csv('test.csv')
# show a sample of the data
data_raw.head()  # shows the first five rows by default
# inspect each column's dtype and non-null count
data_raw.info()
# convert column names to lowercase
data_raw.columns = data_raw.columns.str.lower()
data_val.columns = data_val.columns.str.lower()
data_raw.head()
# plot the distribution of the target label
sns.countplot(x=data_raw['survived'])
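To put a number on what the count plot shows, the overall survival rate is just the mean of the 0/1 label; a one-line sketch:
# fraction of passengers in the training set who survived (survived is 0/1)
print(data_raw['survived'].mean())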
3. Data cleaning
# put both datasets in one list so they can be cleaned in a single pass
data_all = [data_raw,data_val]
# count missing values in the training set
data_raw.isnull().sum()
data_val.isnull().sum()  # count missing values in the test set
# summary statistics for the training set, including non-numeric columns
data_raw.describe(include='all')
(1) Fill in missing values
# clean both datasets (training set + test set)
for dataset in data_all:
    # fill in missing values
    dataset['age'] = dataset['age'].fillna(dataset['age'].median())  # fill missing ages with the median age
    dataset['fare'] = dataset['fare'].fillna(dataset['fare'].median())  # fill missing fares with the median fare
    dataset['embarked'] = dataset['embarked'].fillna(dataset['embarked'].mode()[0])  # fill with the most frequent port of embarkation
(2) Drop unhelpful columns
# drop columns that will not be used as features
drop_columns = ['cabin', 'passengerid', 'ticket']
data_raw.drop(drop_columns, axis=1, inplace=True)  # axis=1 drops columns; inplace=True modifies the DataFrame in place
data_val.drop(drop_columns,axis=1,inplace=True)
data_raw.isnull().sum()  # training set
data_val.isnull().sum()
4. Feature construction
for dataset in data_all:
    # build new fields
    # (1) family_size: sibsp + parch + 1 (the passenger themselves)
    dataset['family_size'] = dataset['sibsp'] + dataset['parch'] + 1
    # (2) single: 1 = travelling alone, 0 = travelling with family
    dataset['single'] = 1
    dataset.loc[dataset['family_size'] > 1, 'single'] = 0  # family_size > 1 means not travelling alone
    # (3) title, extracted from the name: "Braund, Mr. Owen Harris" -> "Mr"
    dataset['title'] = dataset['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    # equivalent: dataset['name'].apply(lambda x: x.split(', ')[1].split('.')[0])
    # (4) fare_bin: four quantile-based fare bins (roughly the same number of passengers per bin)
    dataset['fare_bin'] = pd.qcut(dataset['fare'], 4)
    # (5) age_bin: five equal-width age bins (bin counts may differ)
    dataset['age_bin'] = pd.cut(dataset['age'].astype(int), 5)
dataset.head()
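The comments above rely on the difference between pd.qcut (equal-frequency bins, used for fare) and pd.cut (equal-width bins, used for age), which is easy to confuse; a minimal standalone sketch:
values = pd.Series([1, 2, 3, 4, 100])
print(pd.qcut(values, 2).value_counts())  # bins split at the median, so each bin holds about half the values
print(pd.cut(values, 2).value_counts())   # bins split the value range in half, so counts can be very uneven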
# count passengers per title
data_raw['title'].value_counts()
title_names = (data_raw['title'].value_counts() < 10)  # True for titles that occur fewer than 10 times, False otherwise
title_names
# title: merge all titles that occur fewer than 10 times into a single category, 'other'
# (note: this is applied to the training set only, which matters when the test set is encoded later)
data_raw['title'] = data_raw['title'].apply(lambda x: 'other' if title_names[x] else x)
# recount
data_raw['title'].value_counts()
# survival rate for each title category
data_raw['survived'].groupby(data_raw['title']).mean()
**Encode the features into new fields using scikit-learn's LabelEncoder()**
label = LabelEncoder()  # instantiate the encoder
for dataset in data_all:
    # (1) new field: sex_code
    dataset['sex_code'] = label.fit_transform(dataset['sex'])
    # (2) new field: embarked_code
    dataset['embarked_code'] = label.fit_transform(dataset['embarked'])
    # (3) new field: title_code
    dataset['title_code'] = label.fit_transform(dataset['title'])
    # (4) new field: age_bin_code
    dataset['age_bin_code'] = label.fit_transform(dataset['age_bin'])
    # (5) new field: fare_bin_code
    dataset['fare_bin_code'] = label.fit_transform(dataset['fare_bin'])
data_raw.head()
# list the column names
data_raw.columns.tolist()
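For reference, LabelEncoder maps each distinct category to an integer, assigned in sorted order; a minimal standalone sketch:
le = LabelEncoder()
print(le.fit_transform(['male', 'female', 'female', 'male']))  # [1 0 0 1]
print(le.classes_)  # ['female' 'male']: categories are numbered in sorted order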
5. Feature selection
Target = ['survived']  # the label
# option 1: raw categorical fields (one-hot encoded below)
data_columns_one = ['sex','pclass','embarked','title','sibsp','parch','age','fare','family_size','single']
columns_one = Target + data_columns_one
# option 2: label-encoded fields
data_columns_two = ['sex_code','pclass','embarked_code','title_code','sibsp','parch','age','fare']
columns_two = Target + data_columns_two
# option 3: label-encoded fields with binned age and fare
data_columns_three = ['sex_code','pclass','embarked_code','title_code','family_size','age_bin_code','fare_bin_code']
columns_three = Target + data_columns_three
One-hot encode the option-1 fields with pandas' get_dummies()
data_one_dummy = pd.get_dummies(data_raw[data_columns_one])
data_one_dummy_list = data_one_dummy.columns.tolist()
len(data_one_dummy_list)  # output: 17 (7 numeric columns pass through; sex, embarked and title expand to 2 + 3 + 5 dummy columns)
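For readers new to one-hot encoding, a minimal standalone sketch of what get_dummies does to a single categorical column:
toy = pd.DataFrame({'embarked': ['S', 'C', 'Q', 'S']})
print(pd.get_dummies(toy))  # one 0/1 indicator column per category: embarked_C, embarked_Q, embarked_S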
6. Split into training and test sets
# option 1
X_train_one, X_test_one, y_train_one, y_test_one = model_selection.train_test_split(data_one_dummy[data_one_dummy_list],
                                                                                    data_raw[Target],
                                                                                    random_state=0)
X_train_one.shape
# output: (668, 17)
X_test_one.shape
# output: (223, 17)
# option 2
X_train_two, X_test_two, y_train_two, y_test_two = model_selection.train_test_split(data_raw[data_columns_two],
                                                                                    data_raw[Target],
                                                                                    random_state=0)
X_train_two.shape
# output: (668, 8)
X_test_two.shape
# output: (223, 8)
# option 3
X_train_three, X_test_three, y_train_three, y_test_three = model_selection.train_test_split(data_raw[data_columns_three],
                                                                                            data_raw[Target],
                                                                                            random_state=0)
X_train_three.shape
# output: (668, 7)
X_test_three.shape
# output: (223, 7)
7. Random forest implementation
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_features='sqrt',  # consider sqrt(n_features) candidates per split (the old 'auto' alias was removed in scikit-learn 1.3)
                            random_state=1,
                            n_jobs=-1)  # use all CPU cores for training
# grid search: 2 * 3 * 5 * 5 = 150 parameter combinations
param_grid = {
    'criterion': ['gini', 'entropy'],           # split quality measure: Gini impurity or entropy
    'min_samples_leaf': [1, 5, 10],             # minimum number of samples at a leaf node
    'min_samples_split': [2, 4, 10, 12, 16],    # minimum number of samples required to split an internal node
    'n_estimators': [50, 100, 400, 700, 1000]   # number of trees in the forest
}
gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',  # select parameters by accuracy
                  cv=3,                # 3-fold cross-validation
                  n_jobs=-1)
# fit the grid search on the option-1 features
gs = gs.fit(X_train_one, y_train_one.values.ravel())  # ravel() flattens the (n, 1) label DataFrame into the 1-D array sklearn expects
print(gs.best_score_)
# output: 0.8323839534601868
print(gs.best_params_)
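Since GridSearchCV refits the best parameter combination on the full training data by default (refit=True), the tuned model is also available directly as gs.best_estimator_; retyping the parameters, as done with rf2 below, is an equivalent but more explicit alternative. A minimal sketch:
best_rf = gs.best_estimator_  # already refit on the training split with the best parameters
print(best_rf.score(X_test_one, y_test_one.values.ravel()))  # accuracy on the hold-out split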
# rebuild the classifier with the best parameters reported by the grid search
rf2 = RandomForestClassifier(criterion='entropy',
                             min_samples_leaf=5,
                             min_samples_split=12,
                             n_estimators=50,
                             n_jobs=-1,
                             random_state=1)
rf2.fit(X_train_one, y_train_one.values.ravel())
X_train_one.head()
# rank features by importance; use all training columns so the names line up with rf2.feature_importances_
pd.concat((pd.DataFrame(X_train_one.columns, columns=['Variable']),
           pd.DataFrame(rf2.feature_importances_, columns=['importance'])),
          axis=1).sort_values(by='importance', ascending=False)
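Since seaborn is already imported, the same ranking is easy to visualize; a minimal sketch, with fi as a hypothetical name for the importance DataFrame built above:
fi = pd.concat((pd.DataFrame(X_train_one.columns, columns=['Variable']),
                pd.DataFrame(rf2.feature_importances_, columns=['importance'])),
               axis=1).sort_values(by='importance', ascending=False)
sns.barplot(x='importance', y='Variable', data=fi)  # horizontal bars, most important feature on top
plt.show()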
Predict on the hold-out test split
# predict on the hold-out test split
pred = rf2.predict(X_test_one)
pred_df = pd.DataFrame(pred, columns=['survived'])
pred_df.head()
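The metrics module imported at the top can score these hold-out predictions; a minimal sketch:
# accuracy of rf2 on the 223-row hold-out split
print(metrics.accuracy_score(y_test_one, pred))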
8. Predict on the final test.csv
data_val_dummy = pd.get_dummies(data_val[data_columns_one])
data_val_dummy.head()
data_val_dummy_list = data_val_dummy.columns.tolist()
data_val_dummy_list
# The validation dummies do not match the training columns exactly: rare titles were
# merged into 'other' only in the training set, so data_val_dummy contains columns
# such as title_Dona that rf2 never saw, and lacks title_other. reindex() aligns the
# validation set to the 17 training columns, filling missing ones with 0 and dropping
# the extras, so the features line up with what the model was trained on.
data_val_dummy = data_val_dummy.reindex(columns=data_one_dummy_list, fill_value=0)
pred_val = rf2.predict(data_val_dummy)
pred_val_df = pd.DataFrame(pred_val, columns=['survived'])
pred_val_df.head(10)
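To turn these predictions into a Kaggle-style submission, the passenger ids (dropped during cleaning) must be re-read from test.csv; a minimal sketch, assuming the file is still in the working directory:
passenger_ids = pd.read_csv('test.csv')['PassengerId']  # the ids were dropped from data_val earlier
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': pred_val})
submission.to_csv('submission.csv', index=False)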