1.线性回归
-
(1) 什么是线性回归?
答: 线性回归(Linear regression)是利用称为线性回归方程的最小二乘函数,对一个或多个自变量和因变量之间的关系进行建模的一种回归分析。 -
(2) 线性回归解决的是什么问题?
答:解决的是回归问题,例如房价的预测,是把数据x输入线性方程y = a*x+b中,得到一个预测值y1. -
(3)怎么求线性回归方程?
答:求线性回归方程y =a * x + b,其实就是求参数a,b。可以通过最小二乘法等方法求解较好的参数a,b。使得回归方程的拟合性比较好。
2.最小二乘法
- 最小二乘法是一种求解线性方程参数的方法。对于 y = a*x + b,可以由下面的公式求出 a, b:
a = Σ(x_i − x̄)(y_i − ȳ) / Σ(x_i − x̄)², b = ȳ − a·x̄
其中 x̄、ȳ 分别表示所有 x 的平均值和所有 y 的平均值。
下面是一个小例子:
import numpy as np
import matplotlib.pyplot as plt

# Five sample points to fit.
x = np.array([1, 2, 3, 4, 5], dtype=float)
y = np.array([1, 2, 3, 2, 3], dtype=float)

# Means of x and y (the x-bar / y-bar of the least-squares formula).
x_mean = np.mean(x)
y_mean = np.mean(y)

# Least squares: a = sum((xi - x_mean) * (yi - y_mean)) / sum((xi - x_mean)^2)
num_sum = 0.0  # numerator
d = 0.0        # denominator
for i in range(len(x)):
    # Bug fix: the original incremented an undefined name `num` here,
    # raising NameError; the accumulator is `num_sum`.
    num_sum += (x[i] - x_mean) * (y[i] - y_mean)
    d += (x[i] - x_mean) ** 2
a = num_sum / d
b = y_mean - a * x_mean

# Show the points and the fitted line y = a*x + b.
plt.scatter(x, y)
plt.plot(x, a * x + b, color='r')
plt.axis([0, 6, 0, 6])
plt.show()
这就是运用最小二乘法来求参数a,b的
3.简单线性回归(Linear Regression)的实现
- 关于最小二乘法的求解,我们运用了两种思想,一种是像上面一样,一个一个数据相乘然后累加,另一种是采用向量的方法,用矩阵相乘的方法。因为一般数据都是有多个特征的,如果一个一个计算,效率明显会慢一点。
import numpy as np
class SimpleLinearRegression1:
    """Simple (univariate) linear regression, model y = a * x + b.

    The parameters a and b are found with the least-squares closed form,
    accumulating the numerator and denominator one sample at a time.
    """

    def __init__(self):
        # Learned slope and intercept; None until fit() has run.
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate a and b from the 1-D arrays x_train / y_train.

        Returns self so calls can be chained.
        """
        assert x_train.ndim == 1
        assert len(x_train) == len(y_train)
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Least squares, accumulated element by element.
        num = 0.0  # numerator:   sum of (xi - x_mean)(yi - y_mean)
        d = 0.0    # denominator: sum of (xi - x_mean)^2
        for xi, yi in zip(x_train, y_train):
            num += (xi - x_mean) * (yi - y_mean)
            d += (xi - x_mean) ** 2
        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Return a prediction for every entry of the 1-D array x_predict."""
        assert x_predict.ndim == 1
        assert self.a_ is not None and self.b_ is not None
        return np.array([self._predict(value) for value in x_predict])

    def _predict(self, x_single):
        """Prediction for a single scalar input."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return "SimpleLinearRegression1"
class SimpleLinearRegression2:
    """Simple (univariate) linear regression, model y = a * x + b.

    Same contract as SimpleLinearRegression1, but the least-squares
    numerator and denominator are computed with vectorized dot products.
    """

    def __init__(self):
        self.a_ = None  # slope, set by fit()
        self.b_ = None  # intercept, set by fit()

    def fit(self, x_train, y_train):
        """Estimate a and b from the 1-D arrays x_train / y_train; returns self."""
        assert x_train.ndim == 1
        assert len(x_train) == len(y_train)
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Centered copies; the two dot products are the least-squares
        # numerator and denominator.
        dx = x_train - x_mean
        dy = y_train - y_mean
        self.a_ = dx.dot(dy) / dx.dot(dx)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Return a prediction for every entry of the 1-D array x_predict."""
        assert x_predict.ndim == 1
        assert self.a_ is not None and self.b_ is not None
        # Vectorized form of mapping _predict over the input.
        return self.a_ * x_predict + self.b_

    def _predict(self, x_single):
        """Prediction for a single scalar input."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return "SimpleLinearRegression2"
# Demo: fit the simple regressor on five points and predict for x = 6.
x_predict = np.array([6.0])
x = np.array([1, 2, 3, 4, 5], dtype=float)
y = np.array([1, 2, 3, 2, 3], dtype=float)
reg = SimpleLinearRegression1()
reg.fit(x, y)
# Bug fix: x_predict was defined but never used — actually run the prediction.
print(reg.predict(x_predict))
4.多元线性回归
- 前面我们提到的是一元线性回归 y = a*x + b,也就是只有一个特征。接下来我们要讲多元线性回归,其实与一元线性回归差不多,只是计算稍微复杂一点,我们也是通过最小二乘法来实现的。多元线性方程:y = a1*x1 + a2*x2 + a3*x3 + … + an*xn + b
要求的参数变多了,有 (b, a1, a2, a3, …, an)。
把要求的参数看成一个向量 theta = (b, a1, a2, …, an),并把每个样本扩充为 x_b = (1, x1, x2, …, xn),这样就可以通过向量的计算求出所有参数。 - 由正规方程可以求得:
theta = np.linalg.inv(x_b.T.dot(x_b)).dot(x_b.T).dot(y_train)
代码如下:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
"""自定义线性回归算法"""
class LinearRegression1():
    """Multivariate linear regression.

    Model: y = b + a1*x1 + ... + an*xn. The full parameter vector
    (b, a1, ..., an) is stored in self._theta; it can be fitted either
    with the closed-form normal equation (fit_normal) or with batch
    gradient descent (fit_gd).
    """

    def __init__(self):
        self.coef_ = None          # coefficients (a1..an) after fitting
        self.interception_ = None  # intercept b (spelling kept for existing callers)
        self._theta = None         # full parameter vector (b, a1..an)

    def fit_normal(self, x_train, y_train):
        """Fit with the normal equation theta = (X_b^T X_b)^-1 X_b^T y.

        x_train is a 2-D feature matrix, y_train the target vector;
        returns self.
        """
        assert x_train.shape[0] == y_train.shape[0]
        # Prepend a column of ones so the intercept becomes theta[0].
        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        self._theta = np.linalg.inv(x_b.T.dot(x_b)).dot(x_b.T).dot(y_train)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit with batch gradient descent.

        eta is the learning rate, n_iters the maximum iteration count;
        returns self.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            # MSE loss; report +inf when the parameters have diverged.
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except Exception:  # narrowed from a bare `except:`
                return float('inf')

        def dJ(theta, X_b, y):
            # Gradient of the MSE loss with respect to theta.
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                # Stop early once the loss change is negligible.
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        # Bug fix: the original defined gradient_descent but never called it,
        # so fit_gd returned without fitting anything (no _theta / coef_ /
        # interception_ were ever set).
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, x_predict):
        """Return predictions for the 2-D feature matrix x_predict."""
        assert self.interception_ is not None and self.coef_ is not None
        assert x_predict.shape[1] == len(self.coef_)
        x_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return x_b.dot(self._theta)

    def score(self, x_test, y_test):
        """R^2 score: 1 - MSE(y_test, y_pred) / Var(y_test)."""
        y_predict = self.predict(x_test)
        # Computed directly with numpy; the original imported sklearn's
        # mean_squared_error just for this one expression.
        mse = np.mean((y_test - y_predict) ** 2)
        return 1 - mse / np.var(y_test)

    def __repr__(self):
        return "LinearRegression"
if __name__ == "__main__":
    # NOTE(review): datasets.load_boston was removed in scikit-learn 1.2,
    # so this demo needs an older sklearn — confirm/port before running.
    boston = datasets.load_boston()
    x = boston.data    # all features of the dataset
    y = boston.target
    # The targets in this dataset are capped, so drop the capped rows
    # (y >= 50) to reduce the error they would introduce.
    x = x[y < 50.0]
    y = y[y < 50]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=666)

    # Our own implementation, fitted with the normal equation.
    reg = LinearRegression1()
    reg.fit_normal(x_train, y_train)
    print(reg.coef_, reg.interception_)
    r = reg.score(x_test, y_test)
    print("自定义的准确率:", r)

    # scikit-learn's LinearRegression on the same split, for comparison.
    line_reg = LinearRegression()
    line_reg.fit(x_train, y_train)
    r2 = line_reg.score(x_test, y_test)
    print("sklearn的准确率:", r2)

    # kNN regression with a grid search over its hyper-parameters.
    from sklearn.neighbors import KNeighborsRegressor
    param_grid = [
        {'weights': ['uniform'],
         'n_neighbors': list(range(1, 11))},
        {'weights': ['distance'],
         'n_neighbors': list(range(1, 11)),
         'p': list(range(1, 6))},
    ]
    knn_reg = KNeighborsRegressor()
    grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
    grid_search.fit(x_train, y_train)
    print(grid_search.best_params_)
    print("正确率:", grid_search.best_estimator_.score(x_test, y_test))