原理
前提假设
给定一组由输入x和输出y构成的数据集 $D = \{(x_1,y_1),(x_2,y_2),(x_3,y_3),\dots,(x_m,y_m)\}$,其中 $x_i=(x_{i1};x_{i2};x_{i3};\dots;x_{id})$,$y_i\in\mathbb{R}$。
原理阐述
线性回归是通过训练学习得到一个线性模型y来最大限度地根据输入x拟合输出y。即 $\widehat{y} = wx_{i} + b$,确定参数w和b使得拟合值 $\widehat{y}$ 和真实值y的均方误差尽可能小。
代码
基于numpy的代码实现
import numpy as np
def linear_loss(X, y, w, b):
    """Evaluate the linear model, its mean-squared-error loss and the gradients.

    Args:
        X: Feature matrix; rows are samples, columns are features.
        y: Target values, one per row of ``X``. A flat vector holding the
            same number of elements as the predictions is reshaped to match
            them.
        w: Weight array multiplied as ``np.dot(X, w)`` (a ``(dims, 1)``
            column when created by ``initialize_params``).
        b: Bias added to every prediction.

    Returns:
        Tuple ``(y_hat, loss, dw, db)``: the predictions ``np.dot(X, w) + b``,
        the mean of the squared residuals, and the derivatives of that loss
        with respect to ``w`` and ``b``.
    """
    num_train = X.shape[0]  ## number of samples
    y_hat = np.dot(X, w) + b  ## linear model y^ = Xw + b

    ## A flat target subtracted from a column of predictions would broadcast
    ## into an (n, n) matrix, so align the target's shape first.
    y = np.asarray(y)
    if y.shape != y_hat.shape and y.size == y_hat.size:
        y = y.reshape(y_hat.shape)

    residual = y_hat - y
    loss = np.sum(residual ** 2) / num_train
    ## d/dw and d/db of mean(residual**2); the factor 2 comes from the square.
    dw = 2 * np.dot(X.T, residual) / num_train
    db = 2 * np.sum(residual) / num_train
    return y_hat, loss, dw, db
def initialize_params(dims):
    """Build the starting parameters of the linear model.

    Args:
        dims: Number of features, i.e. the number of rows of the weight
            column.

    Returns:
        Tuple ``(w, b)``: ``w`` is a zero-filled array of shape
        ``(dims, 1)`` and ``b`` is the integer ``0``.
    """
    return np.zeros(shape=(dims, 1)), 0
def linear_train(X, y, learning_rate=0.01, epochs=20000):
    """Fit the linear model with batch gradient descent.

    Args:
        X: Feature matrix; ``X.shape[1]`` sets the number of weights.
        y: Target values passed unchanged to ``linear_loss``.
        learning_rate: Step size applied to each gradient.
        epochs: Number of gradient-descent updates to perform.

    Returns:
        Tuple ``(loss_every, params, grads)``: the loss evaluated at the
        start of every epoch, the dict ``{'w': ..., 'b': ...}`` holding the
        final parameters, and the dict ``{'dw': ..., 'db': ...}`` holding the
        gradients from the last evaluation (zeros when ``epochs`` is less
        than 1).
    """
    loss_every = []  ## loss recorded once per epoch
    w, b = initialize_params(X.shape[1])
    ## Bound up front so ``grads`` can be built even if the loop never runs.
    dw, db = np.zeros_like(w), 0.0

    ## 1-based and inclusive: exactly ``epochs`` updates are performed, so the
    ## last epoch is not dropped and the default epochs=20000 reaches the
    ## progress report below.
    for i in range(1, epochs + 1):
        _, loss, dw, db = linear_loss(X, y, w, b)
        w += -learning_rate * dw
        b += -learning_rate * db
        loss_every.append(loss)
        if i % 20000 == 0:  ## report progress every 20000 epochs
            print("epoch %d loss %f" % (i, loss))

    params = {'w': w, 'b': b}
    grads = {'dw': dw, 'db': db}
    return loss_every, params, grads
##调用数据集 并分为训练集和测试集
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target
## Shuffle features and targets together using a fixed seed
X, y = shuffle(data, target, random_state=13)
## Index separating the first 80% of the rows (truncated to an integer)
offset = int(X.shape[0] * 0.8)
X_train, X_test = X[:offset], X[offset:]
## Targets are reshaped into column vectors
y_train = y[:offset].reshape((-1, 1))
y_test = y[offset:].reshape((-1, 1))
print("X_train's shape:", X_train.shape)
print("X_test's shape:", X_test.shape)
## Train the model
loss_every, params, grads = linear_train(X_train, y_train, 0.01, 200000)
print(params)
##模型预测
def predict(X, params):
    """Apply a linear model to the rows of ``X``.

    Args:
        X: Feature matrix.
        params: Dict holding the weights under ``'w'`` and the bias under
            ``'b'``.

    Returns:
        The predictions ``np.dot(X, w) + b``.
    """
    weights, bias = params['w'], params['b']
    return np.dot(X, weights) + bias
## Predict the targets of the test split with the trained parameters
y_pred = predict(X_test,params)
def r2_score(y_test, y_pred):
    """Return the coefficient of determination R² of the predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted values. If the shape differs from ``y_test`` but
            the element count matches, it is reshaped to ``y_test``'s shape.

    Returns:
        ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of squared
        residuals and ``ss_tot`` the sum of squared deviations of ``y_test``
        from its mean. When ``y_test`` is constant the ratio is undefined:
        1.0 is returned for a perfect prediction and 0.0 otherwise. Empty
        input yields NaN.
    """
    y_test = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_test.size == 0:
        return float('nan')

    ## A column vector minus a flat vector would broadcast to an (n, n)
    ## matrix and silently corrupt the score, so align the shapes first.
    if y_pred.shape != y_test.shape and y_pred.size == y_test.size:
        y_pred = y_pred.reshape(y_test.shape)

    y_avg = np.mean(y_test)
    ss_tot = np.sum((y_test - y_avg) ** 2)
    ss_res = np.sum((y_test - y_pred) ** 2)
    if ss_tot == 0:
        ## Constant targets: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)
## Print the R² score of the test-set predictions
print(r2_score(y_test,y_pred))
基于sklearn的代码实现
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle
## Load the dataset
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target
## Shuffle features and targets together using a fixed seed
X, y = shuffle(data, target, random_state=13)
## Index separating the first 80% of the rows (truncated to an integer)
offset = int(X.shape[0] * 0.8)
X_train, X_test = X[:offset], X[offset:]
## Targets are reshaped into column vectors
y_train = y[:offset].reshape((-1, 1))
y_test = y[offset:].reshape((-1, 1))
## Define the model and fit it on the training split
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
## Predict on the test split
predict = model.predict(X_test)
## Evaluate the predictions
print("均方误差:%.2f" % mean_squared_error(y_test, predict))
print("R²:%.2f" % r2_score(y_test, predict))