记录一下：Titanic-classification (sklearn.ensemble.RandomForestClassifier)

myqijin

已于 2023-03-31 09:50:02 修改

阅读量131

点赞数

文章标签： sklearn python 机器学习

于 2023-03-28 17:17:17 首次发布

本文链接：https://blog.csdn.net/myqijin/article/details/129820337

版权

泰坦尼克号幸存者预测数据集是机器学习课程中的常见入门案例。数据集可以从 Kaggle 下载（需要先注册账号，注册过程中需要替换验证码）。

本文使用了 scikit-learn 中的 RandomForest + Pipeline + grid（超参数搜索）：

RandomForest：算法

Pipeline：方便进行数据预处理，训练集所用的预处理流程可以原样用于处理测试集

grid：通过超参数搜索（随机搜索、网格搜索）寻找最佳超参数

#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics

# Load the data sets.
# NOTE(review): gender_submission.csv looks like Kaggle's sample submission
# rather than ground-truth labels — confirm before trusting the test metrics.
fn_train = r'train.csv'
fn_test = r'test.csv'
fn_test_result = r'gender_submission.csv'

df = pd.read_csv(fn_train)
test_feature_df, test_result_df = (
    pd.read_csv(path) for path in (fn_test, fn_test_result)
)
# No explicit key is given, so the merge joins on the columns that the two
# frames have in common.
test_df = test_feature_df.merge(test_result_df)

# Inspect the training data.
df.info()
# The sampled rows are not assigned or printed here; the call is kept as in
# the original notebook-style cell.
df.sample(10)
pd.set_option('display.max_rows', None)

# Count how often each age occurs, keeping missing values as their own entry,
# and show the counts as a bar chart.
nums = df.Age.value_counts(dropna=False)
fig, ax = plt.subplots()
ax.bar(x=nums.index, height=nums.values)
ax.grid()
plt.show()

# Build the feature frames and the label arrays for training and testing.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
feature_df = df[feature_cols].copy()
feature_test_df = test_df[feature_cols].copy()

# Flatten the single-column label frames into 1-D arrays.
label_arr = df[['Survived']].to_numpy().ravel()
label_test_arr = test_df[['Survived']].to_numpy().ravel()
label_test_arr.shape

def fill_null(df):
    """Return a copy of ``df`` with missing ``Age`` and ``Embarked`` filled.

    Missing ages are replaced by the mean age of the frame that is passed in,
    and missing embarkation ports by that frame's most frequent port. The
    statistics are recomputed on every call, so they always describe the
    frame being transformed.

    The input frame is left untouched; earlier versions filled it in place,
    which silently modified the caller's data.

    Args:
        df: DataFrame containing at least the ``Age`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the two columns filled.
    """
    filled = df.copy()

    # Fill missing ages with the mean age.
    filled['Age'] = filled['Age'].fillna(filled['Age'].mean())

    # Fill missing ports with the mode. mode() is empty when the column has
    # no non-null value at all; in that case there is nothing to fill with.
    embarked_modes = filled['Embarked'].mode()
    if not embarked_modes.empty:
        filled['Embarked'] = filled['Embarked'].fillna(embarked_modes.iloc[0])

    return filled

def encode_features(df):
    """Return a copy of ``df`` with integer-encoded sex and port columns.

    ``Sex`` is encoded into ``Sex_ft`` and ``Embarked`` into ``Embarked_ft``
    with a ``LabelEncoder``. A fresh encoder is fitted on every call, so the
    integer codes depend on the categories present in the frame passed in.

    The input frame is left untouched; earlier versions added the new columns
    to the caller's frame in place.

    Args:
        df: DataFrame containing at least the ``Sex`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the ``Sex_ft`` and ``Embarked_ft`` columns added.
    """
    encoded = df.copy()
    column_pairs = (('Sex', 'Sex_ft'), ('Embarked', 'Embarked_ft'))
    for source_col, target_col in column_pairs:
        encoded[target_col] = LabelEncoder().fit_transform(encoded[source_col])
    return encoded

def select_features(df, cos=None):
    """Fill, encode and return only the model's feature columns.

    The frame is first passed through ``fill_null`` and ``encode_features``
    and then reduced to the requested columns.

    Args:
        df: DataFrame with the raw passenger columns.
        cos: Column names to keep. Defaults to ``['Pclass', 'Sex_ft', 'Age',
            'SibSp', 'Parch', 'Embarked_ft']``. ``None`` is used as the
            default marker so that no mutable list is shared between calls.

    Returns:
        A copy of the selected columns.
    """
    if cos is None:
        cos = ['Pclass', 'Sex_ft', 'Age', 'SibSp', 'Parch', 'Embarked_ft']
    prepared = encode_features(fill_null(df))
    return prepared[cos].copy()

# Feature preprocessing: every step wraps one plain function in a
# FunctionTransformer so that it can be chained inside a Pipeline.
# select_features itself calls fill_null and encode_features again, so the
# first two steps are repeated inside the last one.
preprocessing = Pipeline(
    steps=[
        ('fill_null', FunctionTransformer(fill_null)),
        ('encode_features', FunctionTransformer(encode_features)),
        ('select_features', FunctionTransformer(select_features)),
    ]
)

# Full model pipeline: preprocessing, then scaling, then the random forest.
# The step name 'classifier' is the prefix used by the hyper-parameter keys.
clf = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier()),
    ]
)

# Candidate hyper-parameter values for the coarse search.
# Evenly spaced values are truncated to integers.
n_estimators_range = np.linspace(start=10, stop=500, num=10).astype(int).tolist()
max_features_range = ['sqrt', 'log2']
# None is added as a further candidate next to the explicit depths.
max_depth_range = np.linspace(10, 100, num=10).astype(int).tolist() + [None]
min_samples_split_range = [2, 5, 10]
min_samples_leaf_range = [1, 2, 4, 8]

# Keys use the '<step name>__<parameter>' form to address the 'classifier'
# step of the pipeline.
random_forest_hp_range = dict(
    classifier__n_estimators=n_estimators_range,
    classifier__max_features=max_features_range,
    classifier__max_depth=max_depth_range,
    classifier__min_samples_split=min_samples_split_range,
    classifier__min_samples_leaf=min_samples_leaf_range,
)
random_forest_hp_range

# Coarse search: try 50 randomly drawn parameter combinations, each scored
# with 3-fold cross-validation.
model = RandomizedSearchCV(
    clf,
    random_forest_hp_range,
    n_iter=50,
    cv=3,
    verbose=1,
)

# Fit the search on the training data.
model.fit(feature_df, label_arr)

# Best parameter combination found and its cross-validation score.
model.best_params_
model.best_score_

# Refine the hyper-parameters with an exhaustive grid search over a narrow
# range. The earlier version used RandomizedSearchCV with n_iter=10 here,
# which only tried 10 of the combinations and so was not a grid search.
# The grid has 11 * 3 * 3 * 3 = 297 combinations; with cv=3 that is 891 fits
# plus the final refit, so this step takes noticeably longer.
random_forest_hp_range_2 = {
    'classifier__n_estimators': list(range(400, 620, 20)),
    'classifier__max_depth': [70, 80, 90],
    'classifier__min_samples_split': [4, 5, 6],
    'classifier__min_samples_leaf': [6, 8, 10],
}
random_forest_hp_range_2
model_2 = GridSearchCV(
    estimator=clf,
    param_grid=random_forest_hp_range_2,
    cv=3,
    verbose=1,
)
model_2.fit(feature_df, label_arr)

# Best parameter combination found and its cross-validation score.
model_2.best_params_
model_2.best_score_

# Take the best pipeline found by the refined search.
best_rf_model = model_2.best_estimator_

# Predict on the prepared test features. feature_test_df is built from the
# same merged frame (test_df) as label_test_arr, so predictions and labels
# are row-aligned; the earlier version predicted on the raw test_feature_df.
label_test_pred = best_rf_model.predict(feature_test_df)

# Model evaluation on the test set.
# Confusion matrix
metrics.confusion_matrix(label_test_arr, label_test_pred)

# Accuracy
metrics.accuracy_score(label_test_arr, label_test_pred)

# Precision
metrics.precision_score(label_test_arr, label_test_pred)

# Recall
metrics.recall_score(label_test_arr, label_test_pred)

# F1
metrics.f1_score(label_test_arr, label_test_pred)

# AUC and the ROC curve are ranking metrics and need a continuous score, not
# the hard 0/1 predictions used before (those reduce the curve to a single
# operating point). Use the predicted probability of the second class.
# NOTE(review): assumes the labels are 0/1 so that column 1 is the
# "survived" class — confirm against best_rf_model.classes_.
label_test_score = best_rf_model.predict_proba(feature_test_df)[:, 1]

# AUC
metrics.roc_auc_score(label_test_arr, label_test_score)

# ROC curve
metrics.RocCurveDisplay.from_predictions(
    label_test_arr,
    label_test_score,
    color='red',
)
plt.plot([0, 1], [0, 1], 'k--', label='chance level (AUC = 0.5)')
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline wraps fill_null, encode_features and
# select_features through FunctionTransformer; presumably these functions
# must be importable wherever the file is loaded again — confirm.
import joblib

joblib.dump(best_rf_model, 'trfModel.joblib')