Linear regression with multiple variables
In this part, you will implement linear regression with multiple variables to
predict the prices of houses. Suppose you are selling your house and you
want to know what a good market price would be. One way to do this is to
first collect information on recent houses sold and make a model of housing
prices.
First, take a look at the given data:
path = "data/ex1data2.txt"
data = pd.read_csv(path, header=None, names=["Size", "Bedrooms", "Price"])
ax = plt.figure(figsize=(8, 6)).add_subplot(111, projection='3d')
x = data.Size
y = data.Bedrooms
z = data.Price
ax.scatter(x, y, z, color="red")
plt.show()
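Before modeling, it can also help to check the scale of each column; a quick look with pandas (a minimal sketch, not part of the original write-up) already shows why feature scaling will matter later:

print(data.head())       # first few rows
print(data.describe())   # ranges differ by orders of magnitude: Size in the thousands, Bedrooms in single digits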
For two variables, we can assume the hypothesis takes the form
Y = a*x1 + b*x2 + c
The loss function is the mean squared error J(a, b, c) = 1/(2m) * Σ(a*x1 + b*x2 + c − y)², where m is the number of training examples.
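Written out, the partial derivatives that the gradient-descent code below accumulates are the standard least-squares gradients (a reference sketch in LaTeX notation; the superscript (i), which indexes training examples, is introduced here):

\frac{\partial J}{\partial a} = \frac{1}{m}\sum_{i=1}^{m}\left(a x_1^{(i)} + b x_2^{(i)} + c - y^{(i)}\right) x_1^{(i)}

\frac{\partial J}{\partial b} = \frac{1}{m}\sum_{i=1}^{m}\left(a x_1^{(i)} + b x_2^{(i)} + c - y^{(i)}\right) x_2^{(i)}

\frac{\partial J}{\partial c} = \frac{1}{m}\sum_{i=1}^{m}\left(a x_1^{(i)} + b x_2^{(i)} + c - y^{(i)}\right)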
Since gradient descent is used to solve the problem, the data should first be normalized (mean normalization):
def mean_normalization(temp):
    mean_temp = []
    s = temp.max() - temp.min()      # range of the feature
    avg = temp.mean()                # mean of the feature
    for i in temp:
        i = (i - avg) / s            # scale each value to roughly [-0.5, 0.5]
        mean_temp.append(i)
    return mean_temp
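The same normalization can also be written as a single vectorized expression in pandas; a minimal alternative sketch (the name mean_normalization_vec is introduced here):

def mean_normalization_vec(temp):
    # equivalent to the loop above, but returns a pandas Series
    return (temp - temp.mean()) / (temp.max() - temp.min())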
The data after normalization:
The loss (cost) curve during training is shown below.
After comparing several learning rates, a step size of 0.3 was chosen.
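One way such a comparison could be done (a sketch only: the helper cost_curve is hypothetical, and it reuses squared_error and the global ress list defined in the full code below) is to run a fixed number of iterations for each candidate alpha and plot the resulting cost curves.

def cost_curve(alpha, iters=50):
    a = b = c = 0
    costs = []
    for _ in range(iters):
        d_a, d_b, d_c = squared_error(a, b, c)   # also appends the current cost to ress
        costs.append(ress[-1])
        a, b, c = a - alpha * d_a, b - alpha * d_b, c - alpha * d_c
    return costs

for alpha in (0.01, 0.1, 0.3, 1.0):
    plt.plot(cost_curve(alpha), label="alpha = %s" % alpha)
plt.xlabel("iteration")
plt.ylabel("cost")
plt.legend()
plt.show()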
Full code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
'''
The first column is the size of the house (in square feet), the
second column is the number of bedrooms, and the third column is the price
of the house.
'''
ress = []   # cost recorded at each gradient step
def mean_normalization(temp):
    mean_temp = []
    s = temp.max() - temp.min()
    avg = temp.mean()
    for i in temp:
        i = (i - avg) / s
        mean_temp.append(i)
    return mean_temp
# Mean squared error cost and its gradients
# Y = a*x1 + b*x2 + c
def squared_error(a, b, c):
    res = 0
    d_a = 0
    d_b = 0
    d_c = 0
    for i in range(len(data)):
        Size = x[i]
        Bedrooms = y[i]
        Price = z[i]
        res += pow(a * Size + b * Bedrooms + c - Price, 2)
        d_a += (a * Size + b * Bedrooms + c - Price) * Size
        d_b += (a * Size + b * Bedrooms + c - Price) * Bedrooms
        d_c += (a * Size + b * Bedrooms + c - Price)
    res /= 2 * len(data)            # J = 1/(2m) * sum of squared errors
    ress.append(res)
    d_a *= 1 / len(data)
    d_b *= 1 / len(data)
    d_c *= 1 / len(data)
    print("a: ", a, "b: ", b, "c: ", c)
    print("cost: ", res, "d_a: ", d_a, "d_b:", d_b, "d_c:", d_c)
    return d_a, d_b, d_c
# Gradient descent
def gradient_descent(a, b, c, alpha):
    d_a, d_b, d_c = squared_error(a, b, c)
    # iterate until every gradient component is numerically zero
    while abs(d_a) > 1e-8 or abs(d_b) > 1e-8 or abs(d_c) > 1e-8:
        temp_a = a - alpha * d_a
        temp_b = b - alpha * d_b
        temp_c = c - alpha * d_c
        a = temp_a
        b = temp_b
        c = temp_c
        d_a, d_b, d_c = squared_error(a, b, c)
    return a, b, c
path = "data/ex1data2.txt"
data = pd.read_csv(path, header=None, names=["Size", "Bedrooms", "Price"])
ax = plt.figure(figsize=(8, 6)).add_subplot(121, projection='3d')
x = mean_normalization(data.Size)
y = mean_normalization(data.Bedrooms)
z = mean_normalization(data.Price)
ax.scatter(x, y, z, color="red")
a, b, c = gradient_descent(0, 0, 0, 0.3)
print(a, b, c)
p_x = p_y = np.arange(start=-0.6, stop=0.6, step=0.001)   # grid over the normalized feature range
X, Y = np.meshgrid(p_x, p_y)
Z = X * a + Y * b + c                                      # fitted plane in normalized coordinates
ax.plot_surface(X, Y, Z)
plt.show()
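As an aside, the same training loop can be written in vectorized NumPy, which avoids the per-row Python loop; a sketch under the assumption that the normalized lists x, y, z from above are reused (theta, X_mat, y_vec and n_iters are names introduced here, not from the original):

X_mat = np.column_stack([x, y, np.ones(len(data))])   # columns: Size, Bedrooms, bias
y_vec = np.asarray(z)
theta = np.zeros(3)                                   # corresponds to (a, b, c)
alpha, n_iters = 0.3, 1000
m = len(y_vec)
for _ in range(n_iters):
    err = X_mat @ theta - y_vec
    theta -= alpha * (X_mat.T @ err) / m              # same gradient step, computed in one shot
print(theta)                                          # should be close to the a, b, c found above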
The prediction result is shown below:
the red points are the true values, the blue surface is the prediction.
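Because a, b and c were fitted on normalized features, predicting the price of a concrete house means normalizing its features with the same statistics and mapping the normalized prediction back to the original price scale. A sketch, using a hypothetical 1650 square-foot, 3-bedroom house as the example (not from the original):

size_s, size_avg = data.Size.max() - data.Size.min(), data.Size.mean()
bed_s, bed_avg = data.Bedrooms.max() - data.Bedrooms.min(), data.Bedrooms.mean()
price_s, price_avg = data.Price.max() - data.Price.min(), data.Price.mean()

size_n = (1650 - size_avg) / size_s          # normalize the inputs
bed_n = (3 - bed_avg) / bed_s
price_n = a * size_n + b * bed_n + c         # prediction in normalized space
print("predicted price:", price_n * price_s + price_avg)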
Normal equation
The problem can also be solved with the normal equation; note that the normal equation does not require normalizing the data.
data.insert(0, 'C', 1)                           # add a column of ones for the intercept
data = data.to_numpy()
X = data[:, 0:3]                                 # [1, Size, Bedrooms]
Y = data[:, 3]                                   # Price
Z = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y)    # theta = (X^T X)^{-1} X^T y
print(Z)
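Explicitly inverting X^T X works here, but NumPy's least-squares solver or pseudo-inverse is the more numerically robust route; a sketch that should give the same coefficients up to floating-point error:

Z_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)   # solves min ||X @ theta - Y||^2
Z_pinv = np.linalg.pinv(X).dot(Y)
print(Z_lstsq, Z_pinv)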
The result of the computation:
# Visualize the fit: scatter the raw data and draw the fitted plane (reuses data, Y and Z from above)
ax = plt.figure(figsize=(8, 6)).add_subplot(111, projection='3d')
ax.scatter(data[:, 1], data[:, 2], Y, color="red")        # raw Size, Bedrooms, Price
p_x = np.arange(start=1000, stop=4500, step=1)
p_y = np.arange(start=0, stop=8, step=0.01)
x, y = np.meshgrid(p_x, p_y)
z = x * Z[1] + y * Z[2] + Z[0]                            # Price = Z[0] + Z[1]*Size + Z[2]*Bedrooms
ax.plot_surface(x, y, z)
plt.show()
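Since the normal-equation coefficients are in the original units, a prediction needs no normalization step at all; for the same hypothetical 1650 square-foot, 3-bedroom house used above:

print("predicted price:", Z[0] + Z[1] * 1650 + Z[2] * 3)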