【集成学习（上）】My_Task06_掌握分类问题的评估及超参数调优笔记

本文链接：https://blog.csdn.net/jcjic/article/details/115606883

文章目录

评估模型的性能并调参:
- 当类别为两类时,可以绘制混淆矩阵与ROC曲线

评估模型的性能并调参:

更详细的可以查看大佬的知乎：https://zhuanlan.zhihu.com/p/140040705

# Load the iris dataset that ships with scikit-learn and unpack it into the
# feature matrix X and the class-label vector y.
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hyperparameter tuning, approach 1: exhaustive grid search with GridSearchCV().
import time

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()

# make_pipeline wraps every step into a single estimator (one workflow):
# feature standardization followed by a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Two sub-grids: the linear kernel is searched over C only, the RBF kernel
# over every (C, gamma) combination. The 'svc__' prefix routes each parameter
# to the pipeline step that make_pipeline named 'svc'.
param_grid = [
    {'svc__C': param_range, 'svc__kernel': ['linear']},
    {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']},
]

# 10-fold cross-validated accuracy for every candidate, using all CPU cores.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
gs = gs.fit(X, y)
end_time = time.perf_counter()

print("网格搜索经历时间: %.3f S" % (end_time - start_time))
print(gs.best_score_)
print(gs.best_params_)

网格搜索经历时间: 2.644 S
0.9800000000000001
{'svc__C': 1.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}

# Hyperparameter tuning, approach 2: randomized search with RandomizedSearchCV().
import time

from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()

# Same workflow as the grid search: standardization followed by an SVM.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Keep the linear and RBF kernels in separate dictionaries: gamma is only
# listed for the RBF kernel, so no candidate is spent on a (linear, gamma)
# pair, which a single merged dictionary would generate.
param_grid = [
    {'svc__C': param_range, 'svc__kernel': ['linear']},
    {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']},
]

# Only a random subset of the candidates is evaluated (n_iter is left at its
# default). random_state pins which candidates are drawn so that the reported
# best score and parameters are reproducible from run to run.
gs = RandomizedSearchCV(
    estimator=pipe_svc,
    param_distributions=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    random_state=1,
)
gs = gs.fit(X, y)
end_time = time.perf_counter()

print("随机网格搜索经历时间：%.3f S" % (end_time - start_time))
print(gs.best_score_)
print(gs.best_params_)

随机网格搜索经历时间：0.468 S
0.9800000000000001
{'svc__kernel': 'rbf', 'svc__gamma': 0.1, 'svc__C': 1.0}

当类别为两类时,可以绘制混淆矩阵与ROC曲线

# Confusion matrix, step 1: load the data.
# Breast-cancer dataset: 569 samples of malignant and benign tumour cells,
# where 'M' marks a malignant sample and 'B' a benign one.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
    header=None,
)

# Basic preprocessing: column 1 holds the diagnosis label, columns 2 onward
# are used as features (column 0 is not used).
X = df.iloc[:, 2:].values

# Encode the string labels as integers.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)

# Show the resulting mapping (in a notebook this displays array([1, 0]),
# i.e. 'M' -> 1 and 'B' -> 0).
le.transform(['M', 'B'])

array([1, 0])

# Split the data 80/20 into training and test sets, fit the SVM pipeline and
# plot the confusion matrix of its test-set predictions.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# NOTE: the notebook version of this cell ran the IPython magic
# `%matplotlib inline` here. It is not Python syntax and raises a SyntaxError
# when the code is run as a plain script, so it is left out; run it manually
# in a notebook if figures are not rendered inline.

# stratify=y keeps the class proportions of y in both parts of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
# matshow draws the matrix as a colored grid: rows are placed on the y axis
# (true label) and columns on the x axis (predicted label).
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count in the centre of its cell.
for (i, j), count in np.ndenumerate(confmat):
    ax.text(x=j, y=i, s=count, va='center', ha='center')
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')
plt.show()

（图：测试集上的混淆矩阵，横轴为预测标签，纵轴为真实标签）

# Plot the ROC curve of a grid-searched SVM pipeline on the test set.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, f1_score, make_scorer, roc_curve
from sklearn.model_selection import GridSearchCV

# Model selection uses the F1 score computed with class 0 as the positive class.
scorer = make_scorer(f1_score, pos_label=0)
# pipe_svc and param_grid are the pipeline and parameter grid defined earlier
# in this file.
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring=scorer, cv=10)

# An ROC curve is traced by sweeping a threshold over continuous scores, so
# take decision_function() outputs rather than the hard labels from predict().
y_pred = gs.fit(X_train, y_train).decision_function(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred)  # false / true positive rates
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2
# Create exactly one figure. An additional bare plt.figure() call before this
# one would open a second, empty figure ("Figure ... with 0 Axes").
plt.figure(figsize=(7, 5))
# False positive rate on the x axis, true positive rate on the y axis.
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.show()

<Figure size 432x288 with 0 Axes>

（图：测试集上的 ROC 曲线，横轴为假阳率，纵轴为真阳率，图例中给出 AUC 值）

实践
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_lfw_people.html
案例的内容是对图像进行识别并分类。
参考资料：
https://blog.csdn.net/cwlseu/article/details/52356665
https://blog.csdn.net/jasonzhoujx/article/details/81905923

参考链接
参考 : GitHub开源集成学习(上) Datawhale