泰坦尼克号幸存者预测数据集是机器学习课程中的常用入门数据集,可以从Kaggle下载(需要先注册账号,注册过程中需要替换验证码)。
使用了scikit-learn中的RandomForest+Pipeline+grid:
RandomForest:算法
Pipeline:方便进行数据预处理,处理完训练集的过程可用于处理测试集
grid:超参数搜索方法(scikit-learn的RandomizedSearchCV/GridSearchCV),用于寻找最佳超参数
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
# Load the training set, the test features and the reference test labels.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample
# submission rather than ground truth -- confirm before trusting test metrics.
fn_train = r'train.csv'
fn_test = r'test.csv'
fn_test_result = r'gender_submission.csv'
df, test_feature_df, test_result_df = (
    pd.read_csv(csv_path) for csv_path in (fn_train, fn_test, fn_test_result)
)
# Attach the reference labels to the test features; with no explicit key,
# pandas joins on the columns the two frames have in common.
test_df = test_feature_df.merge(test_result_df)

# Quick look at the data. The bare expressions in this section only display
# a value when run in an interactive session.
df.info()
df.sample(10)
pd.set_option('display.max_rows', None)

# Bar chart of how often each age value occurs (missing ages are counted too).
nums = df['Age'].value_counts(dropna=False)
fig, ax = plt.subplots()
ax.bar(nums.index, nums.values)
ax.grid()
plt.show()

# Build the feature frames and the 1-D label arrays for training and testing.
feature_df = df.loc[
    :, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
].copy()
feature_test_df = test_df.loc[:, list(feature_df.columns)].copy()
label_arr = df['Survived'].to_numpy()
label_test_arr = test_df['Survived'].to_numpy()
label_test_arr.shape
def fill_null(df):
    """Return a copy of *df* with missing ``Age`` and ``Embarked`` filled.

    ``Age`` gaps are replaced with the column mean and ``Embarked`` gaps with
    the most frequent value. Both statistics are computed from the frame
    that is passed in, so different frames may be filled with different
    values.

    The input frame is left untouched: this function is applied to slices
    produced during cross-validation, and writing into those would either
    modify the caller's data or trigger pandas chained-assignment warnings.

    Args:
        df: DataFrame containing at least the ``Age`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the same columns as *df*.
    """
    filled = df.copy()

    # Fill missing ages with the mean age.
    filled['Age'] = filled['Age'].fillna(filled['Age'].mean())

    # Fill missing embarkation ports with the mode. ``mode()`` is empty when
    # every value is missing; in that case there is nothing to fill with.
    embarked_modes = filled['Embarked'].mode()
    if not embarked_modes.empty:
        filled['Embarked'] = filled['Embarked'].fillna(embarked_modes.iloc[0])

    return filled
def encode_features(df, sex_categories=('female', 'male'),
                    embarked_categories=('C', 'Q', 'S')):
    """Return a copy of *df* with integer-coded ``Sex_ft`` / ``Embarked_ft``.

    Each label is mapped to its position in the corresponding category
    tuple. Using a fixed vocabulary (instead of refitting an encoder on
    whatever frame is passed in) guarantees that the same label always gets
    the same code, even when a frame -- e.g. a cross-validation fold or the
    test set -- happens to lack one of the categories. The defaults are in
    alphabetical order, which is the order a freshly fitted ``LabelEncoder``
    would assign when every category is present.

    Args:
        df: DataFrame containing the ``Sex`` and ``Embarked`` columns.
        sex_categories: Allowed ``Sex`` labels, in code order.
        embarked_categories: Allowed ``Embarked`` labels, in code order.

    Returns:
        A new DataFrame with all columns of *df* plus the integer columns
        ``Sex_ft`` and ``Embarked_ft``. The input frame is not modified.

    Raises:
        ValueError: If a column holds a value (including a missing value)
            that is not in its category tuple.
    """
    encoded = df.copy()
    vocabularies = (('Sex', sex_categories), ('Embarked', embarked_categories))
    for column, categories in vocabularies:
        code_by_label = {label: code for code, label in enumerate(categories)}
        codes = encoded[column].map(code_by_label)
        # ``map`` yields NaN for anything outside the vocabulary.
        unknown = codes.isna()
        if unknown.any():
            unexpected = encoded.loc[unknown, column].unique().tolist()
            raise ValueError(
                f"unexpected values in column {column!r}: {unexpected!r}; "
                f"expected one of {list(categories)!r}"
            )
        encoded[column + '_ft'] = codes.astype('int64')
    return encoded
def select_features(df, cos=None):
    """Impute, encode and return only the requested feature columns.

    The frame is passed through ``fill_null`` and ``encode_features`` and the
    columns named in *cos* are then extracted.

    Args:
        df: Raw feature DataFrame accepted by ``fill_null`` and
            ``encode_features``.
        cos: Column names to keep. Defaults to ``['Pclass', 'Sex_ft', 'Age',
            'SibSp', 'Parch', 'Embarked_ft']``. A ``None`` sentinel is used
            instead of a list literal so the default is not a shared mutable
            object.

    Returns:
        A new DataFrame holding only the selected columns.
    """
    if cos is None:
        cos = ['Pclass', 'Sex_ft', 'Age', 'SibSp', 'Parch', 'Embarked_ft']
    prepared = encode_features(fill_null(df))
    return prepared[cos].copy()
# Feature preprocessing.
# select_features already runs fill_null and encode_features on its input,
# so it is the only step needed here; listing those two as separate steps
# in front of it would just repeat the same work on every fit/predict.
preprocessing = Pipeline([
    ('select_features', FunctionTransformer(select_features)),
])

# Full model pipeline: preprocessing -> scaling -> random forest.
clf = Pipeline([
    ('preprocessing', preprocessing),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier()),
])
# Candidate hyperparameter values for the coarse randomized search.
n_estimators_range = np.linspace(start=10, stop=500, num=10).astype(int).tolist()
max_features_range = ['sqrt', 'log2']
# Depths 10..100 plus None (no depth limit).
max_depth_range = [*np.linspace(10, 100, num=10).astype(int).tolist(), None]
min_samples_split_range = [2, 5, 10]
min_samples_leaf_range = [1, 2, 4, 8]

# The "classifier__" prefix routes each parameter to the pipeline's
# 'classifier' step.
random_forest_hp_range = dict(
    classifier__n_estimators=n_estimators_range,
    classifier__max_features=max_features_range,
    classifier__max_depth=max_depth_range,
    classifier__min_samples_split=min_samples_split_range,
    classifier__min_samples_leaf=min_samples_leaf_range,
)
random_forest_hp_range

# Randomized search: 50 sampled parameter settings, 3-fold cross-validation.
model = RandomizedSearchCV(
    clf,
    random_forest_hp_range,
    n_iter=50,
    cv=3,
    verbose=1,
)

# Run the search on the training data.
model.fit(feature_df, label_arr)

# Best parameter setting and its mean cross-validated score (only displayed
# when run in an interactive session).
model.best_params_
model.best_score_
# Refine the hyperparameters with an exhaustive grid search.
# This stage is meant to be a grid search, so GridSearchCV is used to
# evaluate every combination below instead of sampling only a handful of
# them at random.
# NOTE(review): the grid values are hard-coded, presumably picked around the
# best result of an earlier randomized-search run -- confirm they still match.
# NOTE(review): the grid has 11 * 3 * 3 * 3 = 297 combinations, each fitted
# 3 times; expect this to take noticeably longer than a 10-sample search.
random_forest_hp_range_2 = {
    'classifier__n_estimators': list(range(400, 620, 20)),
    'classifier__max_depth': [70, 80, 90],
    'classifier__min_samples_split': [4, 5, 6],
    'classifier__min_samples_leaf': [6, 8, 10],
}
random_forest_hp_range_2

model_2 = GridSearchCV(estimator=clf,
                       param_grid=random_forest_hp_range_2,
                       cv=3,
                       verbose=1)
model_2.fit(feature_df, label_arr)

# Best parameter setting and its mean cross-validated score (only displayed
# when run in an interactive session).
model_2.best_params_
model_2.best_score_
# Take the best pipeline found by the refinement search.
best_rf_model = model_2.best_estimator_

# Predict on the test set. feature_test_df is used because it is cut from
# the same merged frame (test_df) as label_test_arr, so rows and labels are
# guaranteed to line up.
label_test_pred = best_rf_model.predict(feature_test_df)
# Scores for the ROC analysis: column 1 of predict_proba is the probability
# of the second entry in classes_ (label 1 for 0/1 labels). ROC/AUC need
# continuous scores; hard 0/1 predictions collapse the curve to one point.
label_test_score = best_rf_model.predict_proba(feature_test_df)[:, 1]

# Model evaluation. Values are printed so they are visible when the file is
# run as a script.
# Confusion matrix
print('confusion matrix:')
print(metrics.confusion_matrix(label_test_arr, label_test_pred))
# Accuracy
print('accuracy:', metrics.accuracy_score(label_test_arr, label_test_pred))
# Precision
print('precision:', metrics.precision_score(label_test_arr, label_test_pred))
# Recall
print('recall:', metrics.recall_score(label_test_arr, label_test_pred))
# F1
print('f1:', metrics.f1_score(label_test_arr, label_test_pred))
# AUC (computed from probability scores, not class predictions)
print('roc auc:', metrics.roc_auc_score(label_test_arr, label_test_score))

# ROC curve
metrics.RocCurveDisplay.from_predictions(
    label_test_arr,
    label_test_score,
    color='red',
)
plt.plot([0, 1], [0, 1], 'k--', label='chance level (AUC = 0.5)')
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
# Persist the tuned pipeline to disk so it can be reloaded later.
import joblib

joblib.dump(best_rf_model, 'trfModel.joblib')