# -*- coding: utf-8 -*-
"""
Created on 2018-01-18

@author: Jason.F
@summary: Diagnose over-fitting and under-fitting.
    Learning curve: relates the number of training samples to the score.
    Validation curve: relates a hyper-parameter value to the score.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


def _mean_and_std(scores):
    """Return the mean and standard deviation of *scores* along axis 1."""
    return np.mean(scores, axis=1), np.std(scores, axis=1)


def _plot_band(x_values, mean, std, color, label, **line_style):
    """Draw *mean* against *x_values* with a shaded mean +/- std band.

    Extra keyword arguments (e.g. ``marker``, ``linestyle``) are forwarded
    to ``plt.plot`` unchanged.
    """
    plt.plot(x_values, mean, color=color, markersize=5, label=label,
             **line_style)
    plt.fill_between(x_values, mean + std, mean - std, alpha=0.15,
                     color=color)


def _show_accuracy_figure(x_label, log_x=False):
    """Apply the shared grid/labels/legend/limits and display the figure.

    Set *log_x* to switch the x axis to a logarithmic scale before the
    axis labels are applied.
    """
    plt.grid()
    if log_x:
        plt.xscale('log')
    plt.xlabel(x_label)
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.8, 1.0])
    plt.show()


# Load the data (the file has no header row; the label is read from
# column 1 and the features from column 2 onward).
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)  # encode the class labels as integers
print(le.transform(['M', 'B']))

# Split into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1)

# Chain standardization and model training in one pipeline.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(random_state=1, penalty='l2')),
])

# Case 1: learning curve.
# train_sizes controls the absolute or relative number of samples used to
# generate each point of the learning curve.
# NOTE: the scores labelled "test" below are cross-validation scores computed
# on X_train (cv=10); the held-out X_test is not used by these curves.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1)

# Aggregate the scores.
train_mean, train_std = _mean_and_std(train_scores)
test_mean, test_std = _mean_and_std(test_scores)

# Plot the result.
_plot_band(train_sizes, train_mean, train_std, 'blue', 'training accuracy',
           marker='o')
_plot_band(train_sizes, test_mean, test_std, 'green', 'test accuracy',
           linestyle='--', marker='s')
_show_accuracy_figure('Number of training samples')

# Case 2: validation curve.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# 10-fold validation of the regularization parameter C of the 'clf' step.
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='clf__C',
    param_range=param_range,
    cv=10)

# Aggregate the scores.
train_mean, train_std = _mean_and_std(train_scores)
test_mean, test_std = _mean_and_std(test_scores)

_plot_band(param_range, train_mean, train_std, 'blue', 'training accuracy',
           marker='o')
_plot_band(param_range, test_mean, test_std, 'green', 'test accuracy',
           linestyle='--', marker='s')
_show_accuracy_figure('Parameter C', log_x=True)
学习曲线,调试模型
最新推荐文章于 2023-10-26 20:55:41 发布