三、评估模型的性能并调参
模型评估、调参的流程:
- 用管道简化工作流
- 使用k折交叉验证评估模型性能
- 使用学习和验证曲线调试算法
- 通过网格搜索进行超参数调优
- 比较不同的性能评估指标
**出处:**https://zhuanlan.zhihu.com/p/140040705
1. 用管道简化工作流
对数据进行标准化,PCA降维,最后拟合模型和预测
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Read the raw data file; it has no header row, so columns are numbered.
data_path = 'wdbc.data'
data = pd.read_csv(data_path, header=None)
# Display the last two rows as a quick look at the layout.
data.tail(2)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.3514 | 0.152 | ... | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.265 | 0.4087 | 0.12400 |
568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.0000 | 0.000 | ... | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.000 | 0.2871 | 0.07039 |
2 rows × 32 columns
# Preprocessing: separate the features from the class label and encode the
# label as integers.
from sklearn.preprocessing import LabelEncoder

# Column 1 holds the text label; columns 2 onward are the features
# (column 0 is not used).
labels_raw = data.iloc[:, 1].values
X = data.iloc[:, 2:].values

le = LabelEncoder()
y = le.fit_transform(labels_raw)
# Show the learned mapping: 'M' -> 1, 'B' -> 0.
le.transform(['M', 'B'])
array([1, 0], dtype=int64)
# Split the data into a training set and a 20% test set (fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# Wrap the whole workflow in a single pipeline built with make_pipeline:
# standardisation -> PCA (2 components) -> logistic regression.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipeline_steps = (
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)
pipe_lr1 = make_pipeline(*pipeline_steps)

# Fit on the training split, then predict and score on the held-out split.
pipe_lr1.fit(X_train, y_train)
y_pred1 = pipe_lr1.predict(X_test)
test_accuracy = pipe_lr1.score(X_test, y_test)
print("Accuracy:%.3f" % test_accuracy)
Accuracy:0.947
2. 使用k折交叉验证评估模型性能
# 10-fold cross-validation of the pipeline on the training set.
from sklearn.model_selection import cross_val_score

scores1 = cross_val_score(
    estimator=pipe_lr1,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)
print("CV accuracy scores:%s" %scores1)

# Summarise the per-fold accuracies by their mean and standard deviation.
cv_mean, cv_std = scores1.mean(), scores1.std()
print("CV accuracy:%.3f + %.3f" % (cv_mean, cv_std))
CV accuracy scores:[0.91304348 0.97826087 0.97826087 0.91304348 0.93478261 0.97777778
0.93333333 0.95555556 0.97777778 0.95555556]
CV accuracy:0.952 + 0.025
# Stratified k-fold cross-validation, run by hand so that each fold's class
# distribution and accuracy can be inspected.
from sklearn.model_selection import StratifiedKFold

# random_state is deliberately omitted: without shuffle=True it has no effect,
# and scikit-learn >= 0.24 raises an error for that combination.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores2 = []
for k, (train, test) in enumerate(kfold):
    # `train` / `test` are index arrays into the training set for this fold.
    pipe_lr1.fit(X_train[train], y_train[train])
    score = pipe_lr1.score(X_train[test], y_train[test])
    scores2.append(score)
    # np.bincount counts how often each non-negative integer occurs, i.e. the
    # number of samples per class in this fold's training part.
    print('Fold:%2d, Class dist:%s,Acc:%.3f' % (k + 1, np.bincount(y_train[train]), score))
Fold: 1, Class dist:[256 153],Acc:0.913
Fold: 2, Class dist:[256 153],Acc:0.978
Fold: 3, Class dist:[256 153],Acc:0.978
Fold: 4, Class dist:[256 153],Acc:0.913
Fold: 5, Class dist:[256 153],Acc:0.935
Fold: 6, Class dist:[257 153],Acc:0.978
Fold: 7, Class dist:[257 153],Acc:0.933
Fold: 8, Class dist:[257 153],Acc:0.956
Fold: 9, Class dist:[257 153],Acc:0.978
Fold:10, Class dist:[257 153],Acc:0.956
D:\C_Anaconda\lib\site-packages\sklearn\model_selection\_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
warnings.warn(
3. 使用学习和验证曲线调试算法
如果模型过于复杂,自由度或参数过多,就会有过拟合的风险(高方差);如果模型过于简单,则会有欠拟合的风险(高偏差)。
学习曲线:指在参数值确定的情况下,训练集和测试集的得分情况的对比
# Learning curve: diagnose bias and variance by scoring the model on
# increasingly large portions of the training data.
from sklearn.model_selection import learning_curve

# Model under test: standardisation followed by logistic regression.
# (Original author's note: penalty='l2' assumes Gaussian-distributed
# model parameters.)
pipe_lr3 = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1, penalty='l2'),
)

# Evaluate ten training-set fractions from 10% to 100% with 10-fold CV.
size_fractions = np.linspace(0.1, 1, 10)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr3,
    X=X_train,
    y=y_train,
    train_sizes=size_fractions,
    cv=10,
    n_jobs=1,
)

# Aggregate along axis 1 to get one mean/std per training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot each curve with a shaded band of +/- one standard deviation.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_mean, train_std, 'blue', 'training accuracy'),
    (test_mean, test_std, 'red', 'validation accuracy'),
):
    plt.plot(train_sizes, curve_mean, color=curve_color, marker='o', markersize=5, label=curve_label)
    plt.fill_between(train_sizes, curve_mean + curve_std, curve_mean - curve_std, alpha=0.15, color=curve_color)

plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.02])
plt.show()
验证曲线:展示某个参数在取不同值时候,训练集与测试集得分情况的对比
# Validation curve: look for under- and over-fitting by varying a single
# hyper-parameter (here the logistic-regression C) and comparing scores.
from sklearn.model_selection import validation_curve

# max_iter is raised well above the default so the lbfgs solver is not
# stopped by the iteration cap.
pipe_lr4 = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1, penalty='l2', solver="lbfgs", max_iter=100000),
)

param_range = [0.001, 0.01, 0.1, 1.0 ,10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr4,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10,
    n_jobs=1,
)

# Aggregate along axis 1 to get one mean/std per value of C.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot each curve with a shaded band of +/- one standard deviation.
for curve_mean, curve_std, curve_color, curve_label in (
    (train_mean, train_std, 'blue', 'training accuracy'),
    (test_mean, test_std, 'red', 'validation accuracy'),
):
    plt.plot(param_range, curve_mean, color=curve_color, marker='o', markersize=5, label=curve_label)
    plt.fill_between(param_range, curve_mean + curve_std, curve_mean - curve_std, alpha=0.15, color=curve_color)

# C spans several orders of magnitude, so use a logarithmic x axis.
plt.xscale('log')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.02])
plt.show()
4. 通过网格搜索进行超参数调优
如果只是一个参数调整,使用验证曲线手动调整。随着需调整的超参数增加的时候,如何自动去调整?【参数可以通过优化算法进行优化,如逻辑回归的系数;超参数是不能用优化模型进行优化的,如正则化的系数。】
# Exhaustive hyper-parameter search with GridSearchCV, timed end to end.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time

start_time = time.time()

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over both C and gamma.
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=1,
).fit(X_train, y_train)

end_time = time.time()
elapsed = end_time - start_time
print("网格搜索经历时间:%.3f S" % elapsed)
print(gs.best_score_)
print(gs.best_params_)
网格搜索经历时间:7.367 S
0.9781159420289856
{'svc__C': 0.1, 'svc__kernel': 'linear'}
# Randomized hyper-parameter search with RandomizedSearchCV, timed end to end.
# NOTE: no random_state is passed to the search, so the sampled candidates
# (and therefore the reported result) can vary between runs.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

start_time = time.time()

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Same two sub-grids as the exhaustive search: linear kernel (C only) and
# RBF kernel (C and gamma).
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

gs = RandomizedSearchCV(
    estimator=pipe_svc,
    param_distributions=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

end_time = time.time()
elapsed = end_time - start_time
print("网格搜索经历时间:%.3f S" % elapsed)
print(gs.best_score_)
print(gs.best_params_)
网格搜索经历时间:4.594 S
0.9781159420289856
{'svc__kernel': 'linear', 'svc__C': 0.1}
# Nested cross-validation: an inner 2-fold grid search picks the
# hyper-parameters, an outer 5-fold loop scores the tuned estimator.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import time

start_time = time.time()

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# Inner loop (not fitted here; cross_val_score fits it per outer fold).
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
)
# Outer loop.
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)

end_time = time.time()
elapsed = end_time - start_time
print("嵌套交叉验证:%.3f S" % elapsed)
print('CV accuracy:%.3f+%.3f' % (scores.mean(), scores.std()))
嵌套交叉验证:1.095 S
CV accuracy:0.965+0.025
5. 比较不同的性能评估指标
- 误差率ERR
$$ERR = \frac{FP+FN}{FP+FN+TP+TN}$$
- 准确率ACC
$$ACC = \frac{TP+TN}{FP+FN+TP+TN}$$
- 假阳率FPR
$$FPR = \frac{FP}{N} = \frac{FP}{FP+TN}$$
- 真阳率TPR
$$TPR = \frac{TP}{P} = \frac{TP}{FN+TP}$$
- 精度PRE
$$PRE = \frac{TP}{TP+FP}$$
- 召回率REC
$$REC = TPR = \frac{TP}{P} = \frac{TP}{FN+TP}$$
- F1-score
$$F1\text{-}score = 2\frac{PRE \times REC}{PRE+REC}$$
# Plot the confusion matrix of the SVC pipeline on the test set.
from sklearn.metrics import confusion_matrix

# Fit on the training split and predict the held-out split.
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write every count into its cell: row i is the true label, column j the
# predicted label (matching the axis labels set below).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
# Compute precision, recall and F1 for the test-set predictions.
from sklearn.metrics import precision_score, recall_score, f1_score

print('Precision:%.3f' % precision_score(y_true=y_test, y_pred=y_pred))
# Label fixed: the separator was a stray double quote instead of a colon.
print('recall_score:%.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('f1_score:%.3f' % f1_score(y_true=y_test, y_pred=y_pred))
Precision:0.976
recall_score"0.952
f1_score:0.964
# Use a custom metric with GridSearchCV: F1 score with class 0 treated as
# the positive label.
from sklearn.metrics import make_scorer,f1_score

scorer = make_scorer(f1_score, pos_label=0)
search_options = dict(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring=scorer,
    cv=10,
)
gs = GridSearchCV(**search_options).fit(X_train, y_train)

# Report the best cross-validated score, then the parameters that gave it.
for search_result in (gs.best_score_, gs.best_params_):
    print(search_result)
0.98287253786131
{'svc__C': 0.1, 'svc__kernel': 'linear'}
# Plot the ROC curve of the tuned SVC on the test set.
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import make_scorer,f1_score

scorer = make_scorer(f1_score, pos_label=0)
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring=scorer, cv=10)
# Despite its name, y_pred holds continuous decision-function scores here
# (not class labels); roc_curve thresholds them.
y_pred = gs.fit(X_train, y_train).decision_function(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

lw = 2
# Create the figure once. The earlier extra bare plt.figure() call only
# produced an additional empty figure.
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area=%0.2f)' % roc_auc)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
<Figure size 432x288 with 0 Axes>
Z. 补充知识
Z1. 类别数据处理-LabelEncoder和OneHotEncoder
- LabelEncoder:将一列文本数据转成数值
- OneHotEncoder:将一列文本数据转成一列或多列只有0和1的数据
Z2. train_test_split
X_train,X_test, y_train, y_test =sklearn.model_selection.train_test_split(train_data,train_target,test_size=0.4, random_state=0,stratify=train_target)
- train_data:所要划分的样本特征集
- train_target:所要划分的样本结果
- test_size:测试集样本占比(0到1之间的小数);如果是整数的话就是测试集样本的数量
- random_state:随机数的种子。填入任意固定的整数(包括0)时,其他参数一样的情况下每次得到的划分结果都是一样的;不填(默认None)时,每次划分都会不一样。
- stratify:保持split前类的分布。传入哪个数组(如X或y),就按照该数组中各类别的比例分配;通常传入样本标签(如上例中的train_target)
案例
**出处:**https://blog.csdn.net/jasonzhoujx/article/details/81905923
# Load the LFW faces dataset, keeping only people with at least 60 images.
from sklearn.datasets import fetch_lfw_people

faces = fetch_lfw_people(min_faces_per_person=60)
# Print the retained person names, then the shape of the image array.
for dataset_summary in (faces.target_names, faces.images.shape):
    print(dataset_summary)
['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
'Gerhard Schroeder' 'Hugo Chavez' 'Junichiro Koizumi' 'Tony Blair']
(1348, 62, 47)
# Plot the first 15 faces in a 3x5 grid, each labelled with the person's name.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

fig, ax = plt.subplots(3, 5)
# Adjust the subplot layout.
fig.subplots_adjust(left=0.0625, right=1.2, wspace=1)
# ax.flat is a 1-D iterator over the 2-D array of axes.
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')  # cmap selects the colour scheme
    axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])
# Build the classifier: PCA preprocessing feeding an RBF-kernel SVC.
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Classifier with balanced class weights.
svc = SVC(kernel='rbf', class_weight='balanced')
# Reduce each image to 150 whitened principal components (fixed seed).
pca = PCA(
    n_components=150,
    whiten=True,
    random_state=42,
)
model = make_pipeline(pca, svc)
# Split the faces into training and test sets (default split size,
# fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    faces.data,
    faces.target,
    random_state=42,
)
# Cross-validated grid search for the best parameters, varying C (original
# note: the slack/soft-margin parameter) and gamma (original note: controls
# the size of the RBF kernel).
from sklearn.model_selection import GridSearchCV

param_grid = {
    'svc__C': [1, 5, 10, 50],
    'svc__gamma': [0.00001, 0.0005, 0.001, 0.005],
}
grid = GridSearchCV(model, param_grid)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print(best_params)
{'svc__C': 10, 'svc__gamma': 0.001}
# Predict the test set with the best estimator found by the grid search.
# `model` is rebound to that estimator before it is used for prediction.
model = grid.best_estimator_
y_fit = model.predict(X_test)
# Compare predictions with the ground truth on the first 24 test images.
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    # Each test sample is a flattened image; restore it to 62x47 for display.
    axi.imshow(X_test[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    # Label with the predicted person's last name token; red marks a
    # prediction that differs from the true label.
    axi.set_ylabel(faces.target_names[y_fit[i]].split()[-1],
                   color='black' if y_fit[i] == y_test[i] else 'red')
fig.suptitle('Predicted Names; Incorrect Labels in Red', size=14)
Text(0.5, 0.98, 'Predicted Names; Incorect Labels in Red')
# Print the classification report with per-label statistics.
from sklearn.metrics import classification_report

report_text = classification_report(
    y_test,
    y_fit,
    target_names=faces.target_names,
)
print(report_text)
precision recall f1-score support
Ariel Sharon 0.65 0.73 0.69 15
Colin Powell 0.80 0.87 0.83 68
Donald Rumsfeld 0.74 0.84 0.79 31
George W Bush 0.92 0.83 0.88 126
Gerhard Schroeder 0.86 0.83 0.84 23
Hugo Chavez 0.93 0.70 0.80 20
Junichiro Koizumi 0.92 1.00 0.96 12
Tony Blair 0.85 0.95 0.90 42
accuracy 0.85 337
macro avg 0.83 0.84 0.84 337
weighted avg 0.86 0.85 0.85 337
结果解读
- support:当前行的类别在测试数据中的样本总量
- precision:精度=正确预测为正的个数(TP)/被预测为正的个数(TP+FP)。针对预测结果,表示预测为正的样本中有多少是真正的正样本。
- recall:召回率= TP/(TP+FN)。针对原来的样本,表示样本中的正例有多少被预测正确
- f1-score:$2\frac{precision \times recall}{precision+recall}$。precision和recall都高时F1也会高,F1为1时是最佳,0是最差的
- accuracy:所有数据下的指标值,即预测正确的样本数/样本总数。假设全部数据 5 个样本中有 3 个预测正确,所以 accuracy(旧版 scikit-learn 的报告中显示为 micro avg)为 3/5=0.6
- macro avg:每个类别评估指标未加权的平均值,比如精确率的 macro avg,(0.50+0.00+1.00)/3=0.5
- weighted avg:加权平均,样本量作为权重
# Draw the confusion matrix as a heatmap to see which labels the classifier
# tends to confuse.
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_test, y_fit)
# The matrix is transposed before plotting, so the x axis carries the true
# label and the y axis the predicted label.
heatmap_options = dict(
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=faces.target_names,
    yticklabels=faces.target_names,
)
sns.heatmap(mat.T, **heatmap_options)
plt.xlabel('true label')
plt.ylabel('predicted label')
Text(89.18, 0.5, 'predicted label')