文章目录
上次简单介绍了线性回归的原理以及公式推导,今天用 Python 代码简单实现一下单变量线性回归:
# -*- coding:utf-8 -*-
import numpy as np
from matplotlib import pylab as pl
class LRArithmetic:
    """Single-variable linear regression ``y = w * x + b``.

    The parameters come from the closed-form least-squares solution::

        w = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)
        b = mean(y) - w * mean(x)

    ``fit_arithmetic`` evaluates the sums with an explicit Python loop and
    ``fit_vector`` with vectorized NumPy expressions; both produce the same
    ``w`` and ``b``.
    """

    def __init__(self, x, y):
        """
        Store the training data and reset the model parameters.

        :param x: sequence of sample inputs
        :param y: sequence of sample targets, same length as ``x``
        :raises ValueError: if ``x`` and ``y`` have different lengths
        """
        self.x = np.array(x)
        self.y = np.array(y)
        # Check data
        if len(x) != len(y):
            raise ValueError('There is a problem with the input data!!!')
        # Initialize the parameters of the unary equation: y = wx + b
        self.w = 0
        self.b = 0
        self.is_fit = False

    def _require_samples(self):
        """Raise ``ValueError`` when there is no training data to fit."""
        if self.x.size == 0:
            raise ValueError('Cannot fit: the training data is empty.')

    def _store_solution(self, numerator, denominator, x_mean, y_mean):
        """Turn the two least-squares sums into ``w``/``b`` and mark fitted."""
        # A zero denominator means every x equals the mean (constant x), so
        # the slope is undefined; fail loudly instead of producing NaN/inf.
        if not np.any(denominator):
            raise ValueError(
                'Cannot fit: all x values are identical, the slope is undefined.')
        self.w = numerator / denominator
        self.b = y_mean - self.w * x_mean
        self.is_fit = True

    def fit_arithmetic(self):
        """
        Calculate the parameters of w and b by arithmetic (loop) method

        :return: None; the result is stored in ``self.w`` and ``self.b``
        :raises ValueError: if the data is empty or all x values are equal
        """
        self._require_samples()
        # Calculate the mean of x and y
        x_mean = np.mean(self.x)
        y_mean = np.mean(self.y)
        # Accumulate the numerator and denominator sample by sample
        numerator = 0
        denominator = 0
        for x_i, y_i in zip(self.x, self.y):
            x_dev = x_i - x_mean
            numerator += x_dev * (y_i - y_mean)
            denominator += np.square(x_dev)
        self._store_solution(numerator, denominator, x_mean, y_mean)

    def fit_vector(self):
        """
        Calculate the parameters of w and b by vector method

        :return: None; the result is stored in ``self.w`` and ``self.b``
        :raises ValueError: if the data is empty or all x values are equal
        """
        self._require_samples()
        # Calculate the mean of x and y
        x_mean = np.mean(self.x)
        y_mean = np.mean(self.y)
        # Calculate the numerator and denominator over the whole arrays
        x_dev = self.x - x_mean
        numerator = np.sum(x_dev * (self.y - y_mean))
        denominator = np.sum(np.square(x_dev))
        self._store_solution(numerator, denominator, x_mean, y_mean)

    def predict(self, x):
        """
        Predict targets for new inputs using the fitted line.

        :param x: scalar or sequence of inputs
        :return: ``np.array(x) * w + b``
        :raises RuntimeError: if no fit method has been called successfully
        """
        if not self.is_fit:
            raise RuntimeError('Please call the fit method first!!!')
        return np.array(x) * self.w + self.b

    # Backward-compatible alias: the method was originally published under
    # the misspelled name ``predit``; existing callers keep working.
    predit = predict
if __name__ == '__main__':
    # Demo: fit the model on a tiny hand-made sample, predict a few new
    # points, then plot the fitted line with both point sets.

    # train
    x = np.array([1, 3, 2, 1, 3])
    y = np.array([14, 24, 18, 17, 27])
    LR = LRArithmetic(x, y)
    LR.fit_vector()

    # predict
    x_test = np.array([2, 3, 5, 6, 10])
    y_test = LR.predit(x_test)
    print(y_test)

    # Fitted line sampled from 0 up to the largest test input
    xx = np.linspace(0, max(x_test))
    yy = LR.w * xx + LR.b
    pl.plot(xx, yy, 'k-', label='fitted line')
    # ``cmap`` is intentionally not passed: it only takes effect together
    # with a ``c`` array of values, which this demo never supplied, so each
    # scatter simply uses the next default colour.
    pl.scatter(x, y, label='training data')
    pl.scatter(x_test, y_test, label='predictions')
    pl.legend()
    pl.show()
运行效果如下图:
注: 蓝色的是训练数据,橙色的是预测的结果
使用 fit_vector 向量计算方法进行训练,对比 fit_arithmetic 循环方式:数据量少时看不出太大的时间差,不过向量方式在性能上肯定远比循环要快。推荐使用 numpy 和 pandas 提供的函数来进行计算,毕竟它们做过优化。