Ridge regression
Principle:
Why use ridge regression?
(1) Ordinary least squares (OLS) is unbiased, but on ill-conditioned data its variance blows up, so it is a poor fit there.
(2) Under strong multicollinearity, X'X is singular (or nearly singular), so the normal equations cannot be solved reliably.
(3) Ridge regression is simply least squares with an added L2-norm penalty on the coefficients.
(4) Adding alpha to the diagonal of X^T X makes the matrix far less likely to be singular; see the sketch below.
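To make point (4) concrete, here is a minimal NumPy sketch of the closed-form solution w = (X^T X + alpha * I)^(-1) X^T y. The function name ridge_closed_form and the toy data are illustrative, not part of scikit-learn:

import numpy as np

def ridge_closed_form(X, y, alpha):
    # Solve (X^T X + alpha * I) w = X^T y rather than forming an explicit inverse.
    n_features = X.shape[1]
    A = X.T @ X + alpha * np.eye(n_features)   # alpha on the diagonal, as in (4)
    return np.linalg.solve(A, X.T @ y)

# Two perfectly collinear columns make X^T X singular, so plain least squares
# has no unique solution, yet the ridge system is still solvable.
rng = np.random.RandomState(1)
x = rng.randn(20)
X = np.column_stack([x, x])
y = 3 * x + 0.1 * rng.randn(20)
print(ridge_closed_form(X, y, alpha=1.0))     # finite, stable coefficients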
Usage:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
clf = Ridge()
X, y, w = make_regression(n_samples=10, n_features=10, coef=True,
                          random_state=1, bias=3.5)
coefs = []
errors = []
alphas = np.logspace(-6, 6, 200)
# Train the model with different regularisation strengths
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(X, y)
    coefs.append(clf.coef_)
    errors.append(mean_squared_error(clf.coef_, w))
# Display results
plt.figure(figsize=(20, 6))
plt.subplot(121)
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.subplot(122)
ax = plt.gca()
ax.plot(alphas, errors)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('error')
plt.title('Coefficient error as a function of the regularization')
plt.axis('tight')
plt.show()
Result:
For this dataset, the weights stay stable while alpha is small, then become erratic as alpha grows, and finally shrink toward zero. Meanwhile the coefficient error starts out low, then rises, and eventually levels off.
Source code:
Location: sklearn/linear_model/ridge.py
def fit(self, X, y, sample_weight=None):
    if self.solver in ('sag', 'saga'):
        _dtype = np.float64
    else:
        # all other solvers work at both float precision levels
        _dtype = [np.float64, np.float32]

    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype,
                     multi_output=True, y_numeric=True)

    if ((sample_weight is not None) and
            np.atleast_1d(sample_weight).ndim > 1):
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, self.fit_intercept, self.normalize, self.copy_X,
        sample_weight=sample_weight)

    # temporary fix for fitting the intercept with sparse data using 'sag'
    if sparse.issparse(X) and self.fit_intercept:
        self.coef_, self.n_iter_, self.intercept_ = ridge_regression(
            X, y, alpha=self.alpha, sample_weight=sample_weight,
            max_iter=self.max_iter, tol=self.tol, solver=self.solver,
            random_state=self.random_state, return_n_iter=True,
            return_intercept=True)
        self.intercept_ += y_offset
    else:
        self.coef_, self.n_iter_ = ridge_regression(
            X, y, alpha=self.alpha, sample_weight=sample_weight,
            max_iter=self.max_iter, tol=self.tol, solver=self.solver,
            random_state=self.random_state, return_n_iter=True,
            return_intercept=False)
        self._set_intercept(X_offset, y_offset, X_scale)

    return self
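As a quick illustration of the sparse branch above: fitting a Ridge model on sparse input with fit_intercept=True still recovers the intercept, and in the version shown here that request is routed through the 'sag' path. A small sketch with toy data (exact warnings and solver routing vary across scikit-learn versions):

import numpy as np
from scipy import sparse
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_dense = rng.randn(50, 5)
y = X_dense @ np.arange(1.0, 6.0) + 2.0    # true intercept is 2.0
X_sparse = sparse.csr_matrix(X_dense)

model = Ridge(alpha=1.0, fit_intercept=True).fit(X_sparse, y)
print(model.intercept_)                     # close to 2.0
print(model.coef_)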
As the listing shows, the main logic lives in the ridge_regression function, whose source follows:
def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
                     return_n_iter=False, return_intercept=False):
    # solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}
    # The solver parameter offers several computational routines:
    # "auto" picks a solver automatically based on the type of data.
    # "svd" uses a singular value decomposition of X and is more stable than
    # "cholesky" when X is singular.
    # "cholesky" uses the standard scipy.linalg.solve routine on dot(X.T, X).
    # "sparse_cg" uses the conjugate gradient solver scipy.sparse.linalg.cg,
    # which outperforms "cholesky" on large-scale data.
    # "lsqr" is the fastest option, but is unavailable in old scipy versions;
    # it, too, is iterative.
    # "sag" (Stochastic Average Gradient) is likewise iterative and is fast
    # when both n_samples and n_features are large.
    # "saga" is an unbiased variant of "sag" with similar scalability.
    if return_intercept and sparse.issparse(X) and solver != 'sag':
        if solver != 'auto':
            warnings.warn("In Ridge, only 'sag' solver can currently fit the "
                          "intercept when X is sparse. Solver has been "
                          "automatically changed into 'sag'.")
        solver = 'sag'

    _dtype = [np.float64, np.float32]

    # SAG needs X and y columns to be C-contiguous and np.float64
    if solver in ['sag', 'saga']:
        X = check_array(X, accept_sparse=['csr'],
                        dtype=np.float64, order='C')
        y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=_dtype)
        y = check_array(y, dtype=X.dtype, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if solver == 'auto':
        # cholesky if it's a dense array and cg in any other case
        if not sparse.issparse(X) or has_sw:
            solver = 'cholesky'
        else:
            solver = 'sparse_cg'

    elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'):
        warnings.warn("""lsqr not available on this machine, falling back
                      to sparse_cg.""")
        solver = 'sparse_cg'

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if solver not in ['sag', 'saga']:
            # SAG supports sample_weight directly. For other solvers,
            # we implement sample_weight via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

    # There should be either 1 or n_targets penalties
    alpha = np.asarray(alpha, dtype=X.dtype).ravel()
    if alpha.size not in [1, n_targets]:
        raise ValueError("Number of targets and number of penalties "
                         "do not correspond: %d != %d"
                         % (alpha.size, n_targets))

    if alpha.size == 1 and n_targets > 1:
        alpha = np.repeat(alpha, n_targets)

    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga'):
        raise ValueError('Solver %s not understood' % solver)

    n_iter = None
    if solver == 'sparse_cg':
        coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose)

    elif solver == 'lsqr':
        coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol)

    elif solver == 'cholesky':
        if n_features > n_samples:
            K = safe_sparse_dot(X, X.T, dense_output=True)
            try:
                dual_coef = _solve_cholesky_kernel(K, y, alpha)
                coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = 'svd'
        else:
            try:
                coef = _solve_cholesky(X, y, alpha)
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = 'svd'

    elif solver in ['sag', 'saga']:
        # precompute max_squared_sum for all targets
        max_squared_sum = row_norms(X, squared=True).max()

        coef = np.empty((y.shape[1], n_features))
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        intercept = np.zeros((y.shape[1], ))
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
            init = {'coef': np.zeros((n_features + int(return_intercept), 1))}
            coef_, n_iter_, _ = sag_solver(
                X, target.ravel(), sample_weight, 'squared', alpha_i, 0,
                max_iter, tol, verbose, random_state, False, max_squared_sum,
                init,
                is_saga=solver == 'saga')
            if return_intercept:
                coef[i] = coef_[:-1]
                intercept[i] = coef_[-1]
            else:
                coef[i] = coef_
            n_iter[i] = n_iter_

        if intercept.shape[0] == 1:
            intercept = intercept[0]
        coef = np.asarray(coef)

    if solver == 'svd':
        if sparse.issparse(X):
            raise TypeError('SVD solver does not support sparse'
                            ' inputs currently')
        coef = _solve_svd(X, y, alpha)

    if ravel:
        # When y was passed as a 1d-array, we flatten the coefficients.
        coef = coef.ravel()

    if return_n_iter and return_intercept:
        return coef, n_iter, intercept
    elif return_intercept:
        return coef, intercept
    elif return_n_iter:
        return coef, n_iter
    else:
        return coef
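To see the solver branches in action, the helper can be called directly; on a small, dense, well-conditioned problem every solver should agree up to tolerance. A sketch, assuming ridge_regression is importable from sklearn.linear_model as in recent releases:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ridge_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=0)

# Each call exercises a different branch of the dispatch code above.
for solver in ['cholesky', 'svd', 'lsqr', 'sparse_cg', 'sag', 'saga']:
    coef = ridge_regression(X, y, alpha=1.0, solver=solver, random_state=0)
    print(solver, np.round(coef[:3], 4))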