多元线性回归
In [1]:
import numpy as np
import matplotlib.pyplot as plt
- 显示中文
In [2]:
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False
- 加载数据集
In [3]:
def loadDataSet(filename):
X = []
Y = []
with open(filename, 'rb') as f:
for idx, line in enumerate(f):
line = line.decode('utf-8').strip()
if not line:
continue
eles = line.split()
if idx == 0:
numFea = len(eles)
eles = list(map(float, eles))
X.append(eles[:-1])
Y.append([eles[-1]])
return np.array(X), np.array(Y)
- 假设函数
数学表达式转化成代码:
In [4]:
def h(theta, X):
return np.dot(X, theta)
- 代价函数
转化成代码:
In [14]:
def J(theta, X, Y):
m = len(X)
return np.sum(np.dot((h(theta, X) - Y).T, (h(theta, X) - Y)) / (2 * m))
- 多元线性回归的梯度函数就变成:
- 批量梯度下降求出收敛到最小值的θ
In [33]:
def bgd(alpha, maxloop, epsilon, X, Y):
m, n = X.shape
theta = np.zeros((n, 1)) # 因为有n个特征系数,一元线性回归的时候是2
count = 0
converged = False
error = np.inf
errors = [J(theta, X, Y), ] # 记录所有的代价函数输出值用来画图
thetas = {}
for i in range(n):
thetas[i] = [theta[i, 0], ]
while count <= maxloop:
if(converged):
break
count = count + 1
# 所有的梯度一起算
for j in range(n):
deriv = np.dot(X[:,j].T, (h(theta, X) - Y)).sum() / m
thetas[j].append(theta[j, 0] - alpha*deriv)
for j in range(n):
theta[j,0] = thetas[j][-1]
error = J(theta, X, Y)
errors.append(error)
if(abs(errors[-1] - errors[-2]) < epsilon):
converged = True
return theta, errors, thetas
- 特征处理---均值标准化
In [29]:
def standarize(X):
"""
均值标准化
Args: X 样本集
Returns: 归一化处理后的样本集
"""
m, n = X.shape
for j in range(n):
features = X[:, j]
meanVal = features.mean(axis = 0)
std = features.std(axis = 0)
if std != 0:
X[:, j] = (features - meanVal) / std
else:
X[:, j] = 0
return X
In [30]:
ori_X, Y = loadDataSet('./data/houses.txt') # 房屋数据
print(ori_X.shape)
print(Y.shape)
(47, 2)
(47, 1)
In [31]:
m, n = ori_X.shape
X = standarize(ori_X.copy())
X = np.concatenate((np.ones((m, 1)), X), axis = 1)
In [32]:
alpha = 1
maxloop = 5000
epsilon = 0.000001
result = bgd(alpha, maxloop, epsilon, X, Y)
theta, errors, thetas = result
In [35]:
print('theta:')
print(theta)
print('')
# 预测价格
normalizedSize = (70 - ori_X[:, 0].mean(0)) / ori_X[:, 0].std(0)
normalizedBr = (2 - ori_X[:, 1].mean(0)) / ori_X[:, 1].std(0)
predicateX = np.matrix([[1, normalizedSize, normalizedBr]])
price = h(theta, predicateX)
print('70㎡两居估价: ¥%.4f万元'%price)
theta:
[[275.55319149]
[125.4886707 ]
[ 3.7666334 ]]
70㎡两居估价: ¥235.9863万元
In [36]:
%matplotlib inline
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm # 彩色映射函数库 color map
import matplotlib.ticker as mtick # 设置坐标轴
In [37]:
# 打印拟合平面
fittFig = plt.figure(figsize=(16, 12))
title = 'bgd: rate={}, maxloop={}, epsilon={} \n'.format(alpha,maxloop,epsilon)
ax = fittFig.gca(projection='3d')
xx = np.linspace(0,200,25)
yy = np.linspace(0,5,25)
zz = np.zeros((25,25))
for i in range(25):
for j in range(25):
normalizedSize = (xx[i]-ori_X[:,0].mean(0))/ori_X[:,0].std(0)
normalizedSize = (xx[i]-ori_X[:,0].mean(0))/ori_X[:,0].std(0)
x = np.matrix([[1,normalizedSize, normalizedBr]])
zz[i,j] = h(theta, x)
xx, yy = np.meshgrid(xx,yy)
ax.zaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e')) # 设置主标签的文本格式
ax.plot_surface(xx, yy, zz, rstride=1, cstride=1, cmap=cm.rainbow, alpha=0.1, antialiased=True) # antialiased光滑曲线
xs = ori_X[:, 0].flatten() # .flatten() 返回一个一位数组
ys = ori_X[:, 1].flatten()
zs = Y[:, 0].flatten()
ax.scatter(xs, ys, zs, c='b', marker='o')
ax.set_xlabel(u'面积')
ax.set_ylabel(u'卧室数')
ax.set_zlabel(u'估价')
plt.show()
In [38]:
%matplotlib inline
errorsFig = plt.figure()
ax = errorsFig.add_subplot(111)
ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e'))
ax.plot(range(len(errors)), errors)
ax.set_xlabel(u'迭代次数')
ax.set_ylabel(u'代价函数')
Out[38]:
Text(0,0.5,'代价函数')