-
一、模型评估
-
1.留出法(hold-out)
-
1.1原理
-
1.2示例
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Hold-out evaluation: reserve 30% of the iris samples as a test set.
iris = datasets.load_iris()
features, labels = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0)
-
1.3分层采样示例
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Stratified hold-out: the class proportions of iris.target are
# preserved in both the train and the test partition.
iris = datasets.load_iris()
features, labels = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0, stratify=labels)
-
2.交叉验证法(cross validation)
-
2.1原理
-
2.2交叉验证示例(KFold)
from sklearn.model_selection import KFold
import numpy as np

# K-fold cross-validation demo: 9 random samples split into 3 folds.
X = np.random.rand(9, 4)
y = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1])
# shuffle=True is required for random_state to have an effect.
folder = KFold(n_splits=3, random_state=0, shuffle=True)
for train_index, test_index in folder.split(X, y):
    # BUG FIX: the loop body had lost its indentation and would not run;
    # it is restored here.
    print("Train Index:", train_index)
    print("Test Index:", test_index)
    print("X_train:", X[train_index])
    print("X_test:", X[test_index])
    print("")
-
2.3分层交叉验证(StratifiedKFold)
from sklearn.model_selection import KFold,StratifiedKFold
import numpy as np

# Stratified K-fold demo: each fold preserves the class ratio of y.
X = np.random.rand(8, 4)
y = np.array([1, 1, 0, 0, 1, 1, 0, 0])
# BUG FIX: the original passed random_state=0 together with shuffle=False,
# which raises ValueError in scikit-learn >= 0.24 (random_state only has an
# effect when shuffle=True). shuffle=False is already deterministic, so the
# random_state argument is simply dropped.
stratified_folder = StratifiedKFold(n_splits=4, shuffle=False)
for train_index, test_index in stratified_folder.split(X, y):
    print("Stratified Train Index:", train_index)
    print("Stratified Test Index:", test_index)
    # The label distribution per fold is more balanced than with plain KFold.
    print("Stratified y_train:", y[train_index])
    print("Stratified y_test:", y[test_index])
    print("")
-
2.4留一法(Leave-One-Out)
from sklearn.model_selection import LeaveOneOut
import numpy as np

# Leave-one-out: every sample serves as the test set exactly once.
X = np.array([[1, 2, 3, 4],
              [11, 12, 13, 14],
              [21, 22, 23, 24],
              [31, 32, 33, 34]])
y = np.array([1, 1, 0, 0])
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    # BUG FIX: corrected the typo "Levave" -> "Leave" in the printed labels
    # and restored the lost loop indentation.
    print("Leave-One-Out Train Index:", train_index)
    print("Leave-One-Out Test Index:", test_index)
    print("Leave-One-Out X_train:", X[train_index])
    print("Leave-One-Out X_test:", X[test_index])
-
2.5cross_val_score
cross_val_score()函数是一个便利函数,它将指定的学习器(estimator)运行在指定的数据集上,通过k折交叉验证评估其性能,并返回每一折上的得分。
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits
from sklearn.svm import LinearSVC

# Score a linear SVM on the digits data with 10-fold cross-validation;
# cross_val_score returns one accuracy value per fold.
digits = load_digits()
scores = cross_val_score(LinearSVC(), digits.data, digits.target, cv=10)
print("Cross Val Score is:", scores)
-
3.自助法(bootstrapping)
-
3.1原理
-
3.2示例
import numpy as np
import pandas as pd
import random

# Bootstrap sampling: draw 10 rows with replacement as the training set;
# the rows that were never drawn (out-of-bag) become the test set.
data = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
data['y'] = [random.choice([0, 1]) for _ in range(10)]
train = data.sample(frac=1.0, replace=True)      # sampling with replacement
oob_index = data.index.difference(train.index)   # rows never sampled
test = data.loc[oob_index].copy()
-
二、性能度量
-
1.错误率与准确率
from sklearn.metrics import accuracy_score

# Accuracy on a tiny hand-made binary example.
y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [0, 0, 1, 1, 0, 1, 1, 1, 0, 0]
# normalize=True -> fraction of correctly classified samples
print('Accuracy Score(normalize=True):', accuracy_score(y_true, y_pred, normalize=True))
# normalize=False -> raw count of correctly classified samples
print('Accuracy Score(normalize=False):', accuracy_score(y_true, y_pred, normalize=False))
-
2.查准率(precision)与查全率(召回率、recall)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Precision and recall on the same hand-made predictions.
y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [1, 1, 0, 0, 0, 1, 1, 0, 0, 0]
# precision = TP / (TP + FP)
print('Precision Score:', precision_score(y_true, y_pred))
# recall = TP / (TP + FN)
print('Recall Score:', recall_score(y_true, y_pred))
-
3.P-R曲线
from sklearn.metrics import precision_recall_curve
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
import numpy as np

# Draw one P-R curve per class for a one-vs-rest linear SVM on iris.
iris = load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])  # one-hot labels
n_classes = y.shape[1]

# Add noise features so the problem is not trivially separable.
np.random.seed(0)
n_samples, n_features = X.shape
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

# Train the model. BUG FIX: the original fit the classifier twice
# (clf.fit(...) followed by clf.fit(...).decision_function(...)),
# retraining every underlying SVM for nothing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=0))
clf.fit(X_train, y_train)
y_score = clf.decision_function(X_test)

# Plot the P-R curve of each class on one set of axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
precision = {}
recall = {}
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
    ax.plot(recall[i], precision[i], label='target=%s' % i)
ax.set_xlabel("Recall Score")
ax.set_ylabel("Precision Score")
ax.set_title("P-R")
ax.legend(loc='best')
ax.set_xlim(0, 1.1)
ax.set_ylim(0, 1.1)
ax.grid()
-
4.F1与Fβ度量
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score

# F1 and F-beta on a hand-made binary example. Small beta weights
# precision more heavily; large beta weights recall more heavily.
y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
print("F1 score:", f1_score(y_true, y_pred))
# BUG FIX: the first label claimed beta=0.01 while the call actually used
# beta=0.001; the printed label now matches the argument.
print("Fbeta Score(beta=0.001):", fbeta_score(y_true, y_pred, beta=0.001))
print("Fbeta Score(beta=1):", fbeta_score(y_true, y_pred, beta=1))
print("Fbeta Score(beta=100):", fbeta_score(y_true, y_pred, beta=100))
-
5.ROC与AUC
from sklearn.metrics import roc_curve,auc
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
import numpy as np

# One ROC curve (with its AUC) per class for a one-vs-rest SVM on iris.
iris = load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])  # one-hot labels
n_classes = y.shape[1]

# Add noise features so the problem is not trivially separable.
np.random.seed(0)
n_samples, n_features = X.shape
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Train the model. BUG FIX: the original fit the classifier twice
# (clf.fit(...) followed by clf.fit(...).decision_function(...)).
clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=0))
clf.fit(X_train, y_train)
y_score = clf.decision_function(X_test)

# Compute and plot the per-class ROC curves.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    ax.plot(fpr[i], tpr[i], label="target=%s,auc=%s" % (i, roc_auc[i]))
ax.plot([0, 1], [0, 1], 'k--')  # diagonal = random-guess baseline
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC")
ax.legend(loc="best")
ax.set_xlim(0, 1.1)
ax.set_ylim(0, 1.1)
ax.grid()
-
6.sklearn中其他分类指标的实现
-
6.1classification_report
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary in one text table.
y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
report = classification_report(y_true, y_pred, target_names=["class_0", "class_1"])
print("Classification Report:\n", report)
-
6.2confusion_matrix(混淆矩阵)
from sklearn.metrics import confusion_matrix

# Confusion matrix; labels fixes the row/column order to [0, 1].
y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
print("Confusion Matrix:\n", matrix)
-
7.sklearn中回归问题的性能度量
-
7.1平均绝对误差
from sklearn.metrics import mean_absolute_error

# MAE: mean of |y_true - y_pred| over all samples.
y_true = [1, 0.3, 0.5, 0, 1.3, 0, 3.5, 0.6, 0, 0.7]
y_pred = [1, 0, 0.9, 1.1, 1.2, 1.3, 0, 0, 0, 0]
mae = mean_absolute_error(y_true, y_pred)
print("Mean Absolute Error:", mae)
-
7.2均方误差
from sklearn.metrics import mean_squared_error

# MSE: mean of (y_true - y_pred)^2 over all samples.
y_true = [1, 0.3, 0.5, 0, 1.3, 0, 3.5, 0.6, 0, 0.7]
y_pred = [1, 0, 0.9, 1.1, 1.2, 1.3, 0, 0, 0, 0]
mse = mean_squared_error(y_true, y_pred)
print("Mean Square Error:", mse)
-
三、绘制验证曲线与学习曲线
-
1.验证曲线
绘制模型在某一超参数的不同取值下,经交叉验证得到的训练集与验证集性能曲线。
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import LinearSVC
from sklearn.model_selection import validation_curve

# Validation curve: 10-fold CV accuracy of LinearSVC as a function of
# the single hyper-parameter C.
digits = load_digits()
X, y = digits.data, digits.target

param_name = "C"                  # hyper-parameter being swept
param_range = np.logspace(-2, 2)  # 50 values spanning 1e-2 .. 1e2
# scoring selects the performance metric (accuracy, recall, ...).
train_scores, test_scores = validation_curve(
    LinearSVC(), X, y,
    param_name=param_name, param_range=param_range,
    cv=10, scoring="accuracy")

# Mean and std of the 10 fold scores for every value of C.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot each mean curve with a +/- one-std band on a log-x axis.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for mean, std, label, color in (
        (train_scores_mean, train_scores_std, "Training Accuracy", "r"),
        (test_scores_mean, test_scores_std, "Testing Accuracy", "g")):
    ax.semilogx(param_range, mean, label=label, color=color)
    ax.fill_between(param_range, mean - std, mean + std, alpha=0.2, color=color)
ax.set_title("Validation Curve with LinearSVC")
ax.set_xlabel("C")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.1)
ax.legend(loc='best')
-
2.学习曲线
以样本数为横坐标得到的交叉验证后的性能曲线。
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import LinearSVC
from sklearn.model_selection import learning_curve

# Learning curve: 10-fold CV accuracy as a function of training-set size.
digits = load_digits()
X, y = digits.data, digits.target

# Fractions of the training data to use (10% .. 100%).
train_sizes = np.linspace(0.1, 1.0, endpoint=True, dtype='float')
abs_trains_sizes, train_scores, test_scores = learning_curve(
    LinearSVC(), X, y, cv=10, scoring="accuracy", train_sizes=train_sizes)

# Mean and std of the 10 fold scores for every sample count.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot each mean curve with a +/- one-std band.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
for mean, std, label, color in (
        (train_scores_mean, train_scores_std, "Training Accuracy", "r"),
        (test_scores_mean, test_scores_std, "Testing Accuracy", "g")):
    ax.plot(abs_trains_sizes, mean, label=label, color=color)
    ax.fill_between(abs_trains_sizes, mean - std, mean + std, alpha=0.2, color=color)
ax.set_title("Learning Curve with LinearSVC")
ax.set_xlabel("Sample Nums")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.1)
ax.legend(loc='best')
-
四、调参
-
1.暴力搜索
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Exhaustive grid search over LogisticRegression hyper-parameters.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0,
    stratify=digits.target)

# Two sub-grids: liblinear supports l1/l2 penalties, lbfgs only l2.
tuned_parameters = [{'penalty': ['l1', 'l2'],
                     'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
                     'solver': ['liblinear'],
                     'multi_class': ['ovr']},
                    {'penalty': ['l2'],
                     'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
                     'solver': ['lbfgs'],
                     'multi_class': ['ovr', 'multinomial']}
                    ]
clf = GridSearchCV(LogisticRegression(tol=1e-6), param_grid=tuned_parameters, cv=10)
clf.fit(X_train, y_train)
print("Best parameters set found:", clf.best_params_)
print("Grid scores:")
# BUG FIX: GridSearchCV.grid_scores_ was removed in scikit-learn 0.20;
# the equivalent information lives in cv_results_.
results = clf.cv_results_
for mean, std, params in zip(results['mean_test_score'],
                             results['std_test_score'],
                             results['params']):
    print("\t%0.3f (+/-%0.03f) for %s" % (mean, std * 2, params))
print("Optimized Score:", clf.score(X_test, y_test))
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
-
2.随机搜索
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# BUG FIX: `import scipy` does not import the scipy.stats submodule, so
# scipy.stats.expon below could fail with AttributeError.
import scipy.stats

# Randomized hyper-parameter search: C is drawn from an exponential
# distribution instead of a fixed grid.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0,
    stratify=digits.target)

tuned_parameters = {'C': scipy.stats.expon(scale=100),
                    'multi_class': ['ovr', 'multinomial']}
clf = RandomizedSearchCV(LogisticRegression(penalty='l2', solver='lbfgs', tol=1e-6),
                         param_distributions=tuned_parameters,
                         cv=10,
                         scoring="accuracy",
                         n_iter=100)
clf.fit(X_train, y_train)
print("Best parameters set found:", clf.best_params_)
print("Randomized Grid scores:")
# BUG FIX: RandomizedSearchCV.grid_scores_ was removed in scikit-learn 0.20;
# the equivalent information lives in cv_results_.
results = clf.cv_results_
for mean, std, params in zip(results['mean_test_score'],
                             results['std_test_score'],
                             results['params']):
    print("\t%0.3f (+/-%0.03f) for %s" % (mean, std * 2, params))
print("Optimized Score:", clf.score(X_test, y_test))
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
参考文献:
[1].周志华. 《机器学习》
[2].https://blog.csdn.net/b876144622/article/details/80009867
[3].https://en.wikipedia.org/wiki/Receiver_operating_characteristic