# 偏差-方差分解，学习和验证曲线评估模型

## 偏差-方差分解

https://blog.csdn.net/simple_the_best/article/details/71167786

$bias^{2}(x)=(\bar{f}(x)-y)^{2}$

$var(x)=E_{D}[(f(x;D)-\bar{f}(x))^{2}]$

$$
\begin{aligned}
& E_{D}[f(\bm{x};D)\bar{f}(\bm{x})-f(\bm{x};D)y_{D}-\bar{f}^{2}(\bm{x})+\bar{f}(\bm{x})y_{D}] \\
={}& E_{D}[f(\bm{x};D)\bar{f}(\bm{x})-\bar{f}^{2}(\bm{x})]+E_{D}[\bar{f}(\bm{x})y_{D}-f(\bm{x};D)y_{D}] \\
={}& \bar{f}(\bm{x})E_{D}[f(\bm{x};D)]-\bar{f}^{2}(\bm{x})+\bar{f}(\bm{x})E_{D}[y_{D}]-E_{D}[f(\bm{x};D)y_{D}] \\
={}& \bar{f}^{2}(\bm{x})-\bar{f}^{2}(\bm{x})+\bar{f}(\bm{x})E_{D}[y_{D}]-\bar{f}(\bm{x})E_{D}[y_{D}]=0
\end{aligned}
$$

其中用到 $\bar{f}(\bm{x})=E_{D}[f(\bm{x};D)]$，并假设噪声与 $f$ 相互独立，因此 $E_{D}[f(\bm{x};D)y_{D}]=\bar{f}(\bm{x})E_{D}[y_{D}]$。

高偏差（欠拟合）：$J_{train}(\theta)$ 较大，且 $J_{train}(\theta) \approx J_{CV}(\theta)$

高方差（过拟合）：$J_{train}(\theta)$ 较小，且 $J_{train}(\theta) \ll J_{CV}(\theta)$

#学习曲线

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Wisconsin breast-cancer data file (wdbc.data).
# The file is read without a header row, so pandas labels the columns
# 0, 1, 2, ...:
#   column label 1   -> class label as a string ('M' or 'B')
#   column labels 2+ -> the 30 features (the 3rd through 32nd columns)
# NOTE(review): the first half of this URL was missing from the file and has
# been reconstructed as the UCI repository location -- confirm it still
# resolves, or point it at a local copy of wdbc.data.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases'
    + '/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
# Convert the class labels from strings (M or B) to integers (0, 1).
y = le.fit_transform(y)
# Hold out 20% of the samples as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# A Pipeline takes a sequence of (name, step) tuples as input: the first value
# of each tuple is a string that names the step, the second is the step object.
# Here the features are standardised before being passed to the classifier.
scaler = StandardScaler()
classifier = LogisticRegression(penalty='l2', random_state=0)
pipe_lr = Pipeline(steps=[('scl', scaler), ('clf', classifier)])
# learning_curve uses stratified k-fold cross-validation by default.
# Score the pipeline on ten training-set sizes, from 10% to 100% of the
# training data, with 10 folds each.
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
)

# Aggregate along axis 1 so there is one mean / standard deviation per
# training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)

# Training accuracy: mean curve plus a shaded band of +/- one standard
# deviation.
plt.plot(
    train_sizes, train_mean,
    color='blue', marker='o', markersize=5,
    label='training accuracy',
)
plt.fill_between(
    train_sizes,
    train_mean - train_std,
    train_mean + train_std,
    alpha=0.15, color='blue',
)

# Validation accuracy: same layout, dashed so the two curves stay
# distinguishable.
plt.plot(
    train_sizes, valid_mean,
    color='green', marker='o', markersize=5,
    linestyle='--', label='validation accuracy',
)
plt.fill_between(
    train_sizes,
    valid_mean - valid_std,
    valid_mean + valid_std,
    alpha=0.15, color='green',
)

# Axes decoration, then render the figure.
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim(0.8, 1.01)
plt.show()


#验证曲线

import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Wisconsin breast-cancer data file (wdbc.data).
# The file is read without a header row, so pandas labels the columns
# 0, 1, 2, ...:
#   column label 1   -> class label as a string ('M' or 'B')
#   column labels 2+ -> the 30 features (the 3rd through 32nd columns)
# NOTE(review): the first half of this URL was missing from the file and has
# been reconstructed as the UCI repository location -- confirm it still
# resolves, or point it at a local copy of wdbc.data.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases'
    + '/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
# Convert the class labels from strings (M or B) to integers (0, 1).
y = le.fit_transform(y)
# Hold out 20% of the samples as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# A Pipeline takes a sequence of (name, step) tuples as input: the first value
# of each tuple is a string that names the step, the second is the step object.
# The step names matter below: 'clf__C' addresses parameter C of step 'clf'.
scaler = StandardScaler()
classifier = LogisticRegression(penalty='l2', random_state=0)
pipe_lr = Pipeline(steps=[('scl', scaler), ('clf', classifier)])

# Values of the logistic-regression parameter C to sweep, spaced by factors
# of ten.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, valid_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    # Valid parameter names can be listed with estimator.get_params().keys().
    param_name='clf__C',
    param_range=param_range,
    cv=10,
)

# Aggregate along axis 1 so there is one mean / standard deviation per
# value of C.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)

# Training accuracy: mean curve plus a shaded band of +/- one standard
# deviation.
plt.plot(
    param_range, train_mean,
    color='blue', marker='o', markersize=5,
    label='training accuracy',
)
plt.fill_between(
    param_range,
    train_mean - train_std,
    train_mean + train_std,
    alpha=0.15, color='blue',
)

# Validation accuracy, drawn the same way in green.
plt.plot(
    param_range, valid_mean,
    color='green', marker='o', markersize=5,
    label='validation accuracy',
)
plt.fill_between(
    param_range,
    valid_mean - valid_std,
    valid_mean + valid_std,
    alpha=0.15, color='green',
)

# The swept values span several orders of magnitude, hence the log x-axis.
plt.grid()
plt.xscale('log')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.ylim(0.8, 1.0)
plt.show()



• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120