Notes from a course assignment, along with the problems I ran into and what I learned in the process.
Below is my application of an MLP classifier to Parkinson's disease diagnosis and the issues I hit while training it. Oddly, after tuning with GridSearchCV the model's AUC came out slightly worse than the untuned model's. The full workflow follows.
Evaluating the untuned MLP
First, do a quick evaluation of the MLP with default hyperparameters to get a baseline for its performance, so we can compare it against the tuned model later.
Import the packages we need:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
Data preprocessing
Split the Parkinson's data into a training set and a test set with random_state=42. The seed itself is arbitrary as long as it matches the one set in the model definition later; given this dataset's small sample size, 42 turned out to be the most suitable of the values I tried.
data = pd.read_csv('parkinsons.data', header=0, sep=',')
X = data.drop(["name", "status"], axis=1).values
y = data["status"].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Define the MLP classifier and evaluate the model
The steps here follow the same routine as for any other model.
clf = MLPClassifier(random_state=42, max_iter=1000)
clf.fit(X_train, y_train.ravel())
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
print("多层感知器分类器:")
print("MLPClassifier模型精度:{:.4f}".format(np.mean(y_pred == y_test)))
print("混淆矩阵:\n", confusion)
print(classification_report(y_test, y_pred, target_names=None))
Evaluation results
Running the code above produces the output shown below: the accuracy is only mediocre, while the other metrics are acceptable.
Hyperparameter tuning with GridSearchCV
Below are the parameters I originally defined for the grid, but training over this full grid is slow.
mlp_clf_parameters = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'max_iter': [100, 200, 300, 400],
    'verbose': [True]
}
This grid already amounts to 2 × 3 × 3 × 3 × 3 × 4 = 648 parameter combinations (3240 fits with 5-fold CV), so I first pin down the activation function and solver separately:
activations = ['logistic', 'tanh', 'relu']
solvers = ['adam', 'sgd', 'lbfgs']
for activation in activations:
    for solver in solvers:
        clf = MLPClassifier(activation=activation, solver=solver, random_state=42, max_iter=5000)
        clf.fit(X_train, y_train.ravel())
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Activation: {activation}, Solver: {solver}, Test Accuracy: {accuracy:.6f}")
Results
The scores come out quite similar across settings, so I standardized the Parkinson's data and re-ran the comparison (the re-run itself is sketched right after the scaling code), which produced the figure below.
# Standardize the features with StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)                # full matrix, used later for cross-validation
X_train_scaled = scaler.fit_transform(X_train)    # fit on the training split only
X_test_scaled = scaler.transform(X_test)          # reuse the training statistics on the test split
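The re-run the figure refers to is just the activation/solver loop from above on the standardized features; my reconstruction of that step looks like this:
for activation in activations:
    for solver in solvers:
        clf = MLPClassifier(activation=activation, solver=solver, random_state=42, max_iter=5000)
        clf.fit(X_train_scaled, y_train.ravel())   # scaled inputs this time
        y_pred = clf.predict(X_test_scaled)
        print(f"Activation: {activation}, Solver: {solver}, Test Accuracy: {accuracy_score(y_test, y_pred):.6f}")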
With that, the activation function and solver I use from here on are fixed as relu & adam.
Then I ran GridSearchCV over the reduced grid below (the actual call is shown right after the grid), and this time it finished quickly.
mlp_clf_parameters = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    # 'verbose': [True],
    'max_iter': [1000]
}
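The grid is passed to GridSearchCV the same way as in the full listing at the end of this post; for reference, the call is:
mlp_clf = MLPClassifier(random_state=42, activation='relu', solver='adam')
grid_search = GridSearchCV(estimator=mlp_clf, param_grid=mlp_clf_parameters, cv=5, scoring='roc_auc')
grid_search.fit(X_train_scaled, y_train.ravel())
print("Best Parameters:", grid_search.best_params_)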
For MLP classifiers, it helps to normalize or standardize the data in advance; it makes the subsequent optimization much easier. Another recurring annoyance is the iteration budget: the MLP very easily hits max_iter and warns that it has not converged. So far I have always fixed this by hand (raising max_iter), and I do not know whether there is a cleaner way; a possible workaround is sketched below.
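One option (not what I used above, just a sketch) is to let scikit-learn stop on its own via early_stopping, or to catch the ConvergenceWarning and retry with a larger iteration budget:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Option 1: stop when the validation score stops improving
clf = MLPClassifier(random_state=42, max_iter=1000, early_stopping=True, n_iter_no_change=20)

# Option 2: detect the convergence warning and retry with a doubled budget
max_iter = 500
while True:
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", ConvergenceWarning)
        clf = MLPClassifier(random_state=42, max_iter=max_iter)
        clf.fit(X_train_scaled, y_train.ravel())
    if not any(issubclass(w.category, ConvergenceWarning) for w in caught):
        break
    max_iter *= 2   # double the budget and try again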
Plot the ROC curve and compute the AUC
# Probability of the positive class for each model
y_pred_prob = clf.predict_proba(X_test_scaled)[:, 1]
y_pred_prob_best = best_mlp.predict_proba(X_test_scaled)[:, 1]
# Compute fpr & tpr
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=y_pred_prob)
fpr_best, tpr_best, threshold_best = roc_curve(y_true=y_test, y_score=y_pred_prob_best)
# Compute the AUC
roc_auc = auc(fpr, tpr)
roc_auc_best = auc(fpr_best, tpr_best)
# Plot
plt.plot(fpr_best, tpr_best, color='green', linewidth=2,
label='Best_MLP ROC curve (area = {:.2f})'.format(roc_auc_best))
plt.plot(fpr, tpr, color='pink', linewidth=2, label='MLP ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')  # diagonal reference line
# Axis limits and title
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()
Results
My open question
First, the figure above took many runs to get: without changing the data split or the random seed, I had to re-run repeatedly before the Best_MLP ROC curve and AUC finally came out better than the untuned MLP's.
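My best guess at this run-to-run variation (an assumption on my part, not something I verified exhaustively): when the tuned model is rebuilt as MLPClassifier(**best_params), best_params only contains the grid keys, so the random_state (and the explicit activation/solver) set on the base estimator are lost and every run starts from a different random weight initialization. Two ways to pin the result down:
# Option 1: reuse the estimator that GridSearchCV has already refit on the training data
best_mlp = grid_search.best_estimator_
# Option 2: rebuild it, but restore the arguments that are not part of the grid
best_mlp = MLPClassifier(**best_params, activation='relu', solver='adam', random_state=42)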
Comparing 5-fold cross-validation & classification_report
Untuned MLP
Best MLP
Comparing the two, the tuned model trains more stably than the untuned one. Many of the other numbers are very similar, so I would guess the main benefit of the tuned MLP here is simply more stable training (one way to quantify this is sketched below), emmm....
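To make "more stable" concrete, one option (my own addition, reusing the objects from the full listing below) is to compare the spread of the five cross-validation AUC scores:
cv_default = cross_val_score(MLPClassifier(random_state=42, max_iter=1000),
                             X_scaled, y.ravel(), cv=5, scoring='roc_auc')
cv_best = cross_val_score(best_mlp, X_scaled, y.ravel(), cv=5, scoring='roc_auc')
print("default MLP: mean AUC = {:.4f}, std = {:.4f}".format(cv_default.mean(), cv_default.std()))
print("best MLP:    mean AUC = {:.4f}, std = {:.4f}".format(cv_best.mean(), cv_best.std()))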
Full code below:
I did not load the Parkinson's data through a library; I downloaded the .data file myself, but it can also be loaded directly from the dataset repository.
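If you do not want to download the file by hand, loading it straight from the UCI repository should also work (an untested sketch; it assumes the usual UCI mirror URL for the Parkinsons dataset is still live):
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
data = pd.read_csv(url, header=0, sep=',')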
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
# Read the data
data = pd.read_csv('parkinsons.data', header=0, sep=',')
X = data.drop(["name", "status"], axis=1).values
y = data["status"].values.reshape(-1, 1)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# MLP classifier with default hyperparameters
clf = MLPClassifier(random_state=42, max_iter=1000)
cv_scores = cross_val_score(clf, X_scaled, y.ravel(), cv=5, scoring='roc_auc')
# Report the cross-validation scores (scoring='roc_auc', so these are AUC values, not accuracies)
print("5-fold cross-validation AUC:", cv_scores)
print("Mean AUC:", np.mean(cv_scores))
clf.fit(X_train_scaled, y_train.ravel())
y_pred = clf.predict(X_test_scaled)
# Model evaluation on the held-out test set
confusion = confusion_matrix(y_test, y_pred)
print("MLP classifier:")
print("Confusion matrix:\n", confusion)
print(classification_report(y_test, y_pred, target_names=None))
# # 1. Try different activation functions and solvers
# activations = ['logistic', 'tanh', 'relu']
# solvers = ['adam', 'sgd']
# for activation in activations:
#     for solver in solvers:
#         clf = MLPClassifier(activation=activation, solver=solver, random_state=42, max_iter=2000, verbose=0)
#         clf.fit(X_train_scaled, y_train.ravel())
#         # Loss value recorded at each iteration
#         loss_curve = clf.loss_curve_
#         # Find the iteration at which the loss first reaches its minimum
#         convergence_iteration = next(i for i, loss in enumerate(loss_curve) if loss <= min(loss_curve[1:]))
#         # Evaluate on the test set
#         y_pred = clf.predict(X_test_scaled)
#         accuracy = accuracy_score(y_test, y_pred)
#         print(f"Activation: {activation}, Solver: {solver}, Test Accuracy: {accuracy:.6f}, Convergence Iteration: {convergence_iteration}")
# Define the MLP classifier for the grid search
mlp_clf = MLPClassifier(random_state=42, activation='relu', solver='adam')
# Parameter grid to tune
mlp_clf_parameters = {
    'hidden_layer_sizes': [(100, 100)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    # 'verbose': [True],
    'max_iter': [1000]
}
# Run the parameter search with GridSearchCV
grid_search = GridSearchCV(estimator=mlp_clf, param_grid=mlp_clf_parameters, cv=5, scoring='roc_auc')
grid_search.fit(X_train_scaled, y_train.ravel())
# Best parameters found
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# Train and evaluate a model with the best parameters
# best_params only holds the grid keys, so restore the fixed activation/solver and the random seed;
# without random_state the result changes on every run
best_mlp = MLPClassifier(**best_params, activation='relu', solver='adam', random_state=42)
cv_scores = cross_val_score(best_mlp, X_scaled, y.ravel(), cv=5, scoring='roc_auc')
# Report the cross-validation AUC scores
print("5-fold cross-validation AUC:", cv_scores)
print("Mean AUC:", np.mean(cv_scores))
best_mlp.fit(X_train_scaled, y_train.ravel())
# Evaluate on the test set
y_pred = best_mlp.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy with Best Parameters:", accuracy)
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", conf_matrix)
print(classification_report(y_test, y_pred, target_names=None))
# Probability of the positive class for each model
y_pred_prob = clf.predict_proba(X_test_scaled)[:, 1]
y_pred_prob_best = best_mlp.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=y_pred_prob)
fpr_best, tpr_best, threshold_best = roc_curve(y_true=y_test, y_score=y_pred_prob_best)
roc_auc = auc(fpr, tpr)
roc_auc_best = auc(fpr_best, tpr_best)
plt.plot(fpr_best, tpr_best, color='green', linewidth=2,
label='Best_MLP ROC curve (area = {:.2f})'.format(roc_auc_best))
plt.plot(fpr, tpr, color='pink', linewidth=2, label='MLP ROC curve (area = {:.2f})'.format(roc_auc))  # untuned MLP
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()