在本部分的练习中,您将实现单变量线性回归,以预测食品卡车的利润。假设您是一家连锁餐厅的首席执行官,正在考虑在不同的城市开设新的分店。该连锁店已经在多个城市拥有食品卡车,并且您掌握了这些城市的利润和人口数据。您希望利用这些数据来帮助选择下一个要扩张的城市。
梯度下降法
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def CostFunction(x, y, theta):
    """Return the halved mean squared error of the linear model ``x.dot(theta)``.

    Args:
        x: Design matrix with one row per sample.
        y: Observed target values, one per row of ``x``.
        theta: Parameter vector of the linear hypothesis.

    Returns:
        ``sum((x.dot(theta) - y) ** 2) / (2 * m)`` where ``m = len(x)``.
    """
    residuals = x.dot(theta) - y
    sample_count = len(x)
    return np.sum(residuals ** 2) / (2 * sample_count)
def GradienDescent(x, y, theta, alpha=0.01, epochs=10000):
    """Fit linear-regression parameters with batch gradient descent.

    Args:
        x: Design matrix with one row per sample.
        y: Target values, one per row of ``x``.
        theta: Initial parameter vector; it is rebound, not modified in place.
        alpha: Learning rate (step size). Defaults to 0.01.
        epochs: Number of full-batch updates to perform. Defaults to 10000.

    Returns:
        A tuple ``(theta, cost, epochs)``: the fitted parameters, the list of
        cost values recorded after every update, and the number of updates.
    """
    sample_count = len(x)  # loop invariant, hoisted out of the update loop
    cost = []
    for _ in range(epochs):
        # Step along the gradient of the halved mean squared error,
        # (x.theta - y) . x / m, scaled by the learning rate.
        step = (x.dot(theta) - y).dot(x) * alpha / sample_count
        theta = theta - step
        cost.append(CostFunction(x, y, theta))
    return theta, cost, epochs
# Load the (population, profit) samples. The context manager closes the file
# as soon as pandas has finished parsing it.
with open('ex1data1.txt', encoding='utf8') as f:
    data = pd.read_csv(f, header=None, names=['population', 'profit'])
x = data['population']
h = data['profit']
plt.scatter(x, h)

# Prepend a column of ones so that theta[0] acts as the intercept term
# (see "Machine Learning (2): Linear Algebra - computation tricks").
x = np.c_[np.ones(x.size), x]
theta = np.ones(x.shape[1])
theta, cost, epcho = GradienDescent(x, h, theta)

# Evaluate the fitted line on a dense grid so it can be drawn over the data.
X = np.arange(4, 25, 0.01)
H = theta[0] + theta[1] * X
print(theta)
plt.title('GradienDescent')
plt.xlabel('population')
plt.ylabel('profit')
plt.plot(X, H, color='red')
plt.show()

# Learning curve: the cost recorded after each iteration.
plt.xlabel('epcho')
plt.ylabel('cost')
plt.plot(range(epcho), cost)
plt.show()
# Result reported by the author: theta=[-3.89578081 1.19303364]
对上面代码中 `x = np.c_[np.ones(x.size), x]` 这一行的注释解释如下。设假设函数(hypothesis)为 $h(x)=0.5+40x$,并且
$$x=\left[ \begin{matrix} 5 \\ 12 \\ 13 \end{matrix} \right]$$
$$\theta=\left[ \begin{matrix} 0.5 \\ 40 \end{matrix} \right]$$
通过那步变换,可以将x变为
$$x=\left[ \begin{matrix} 1 & 5 \\ 1 & 12 \\ 1 & 13 \end{matrix} \right]$$
从而
$$h(x)=x\theta=\left[ \begin{matrix} 1 \times 0.5 + 5 \times 40 \\ 1 \times 0.5 + 12 \times 40 \\ 1 \times 0.5 + 13 \times 40 \end{matrix} \right]$$
使运算简便
结果如下:
正规方程法
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def LoadData(path='ex1data1.txt'):
    """Read the population/profit data set and build the design matrix.

    Args:
        path: Location of the header-less, two-column CSV file. Defaults to
            'ex1data1.txt', the file the original script always read.

    Returns:
        A tuple ``(t, x, h)``: the 'population' column, the design matrix
        made of a column of ones followed by ``t``, and the 'profit' column.
    """
    # The context manager guarantees the file handle is closed after parsing.
    with open(path, encoding='utf8') as f:
        data = pd.read_csv(f, header=None, names=['population', 'profit'])
    t = data['population']
    h = data['profit']
    # The leading column of ones lets theta[0] act as the intercept.
    x = np.c_[np.ones(t.size), t]
    return t, x, h
def NormalEquation(x, h):
    """Solve least-squares linear regression in closed form.

    Evaluates ``theta = pinv(x^T x) x^T h``.

    Args:
        x: Design matrix with one row per sample.
        h: Target values, one per row of ``x``.

    Returns:
        The parameter vector ``theta`` minimising ``||x.dot(theta) - h||``.
    """
    gram = x.T.dot(x)
    # The pseudo-inverse equals the ordinary inverse when x^T x is invertible,
    # and still yields the minimum-norm solution when it is singular (for
    # example with duplicated or linearly dependent feature columns), where
    # np.linalg.inv would raise LinAlgError.
    return np.linalg.pinv(gram).dot(x.T).dot(h)
def main():
    """Fit the data with the normal equation, print theta and plot the fit."""
    population, design, profit = LoadData()
    theta = NormalEquation(design, profit)

    # Dense grid on which the fitted line is evaluated for plotting.
    grid = np.arange(4, 25, 0.01)
    fitted = theta[0] + theta[1] * grid
    print(theta)

    plt.title('Normal Equation')
    plt.xlabel('population')
    plt.ylabel('profit')
    plt.scatter(population, profit)
    plt.plot(grid, fitted, color='red')
    plt.show()


if __name__ == '__main__':
    main()
# Result reported by the author: theta=[-3.89578088 1.19303364]
结果如下:
sklearn检验
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
# Load the (population, profit) samples. The context manager closes the file
# as soon as pandas has finished parsing it.
with open('ex1data1.txt', encoding='utf8') as f:
    data = pd.read_csv(f, header=None, names=['population', 'profit'])

# Reshape both columns into (n_samples, 1) column vectors before fitting.
x = np.array(data['population']).reshape(-1, 1)
y = np.array(data['profit']).reshape(-1, 1)
plt.scatter(x, y)

model = linear_model.LinearRegression()
model.fit(x, y)

# Predict on a dense grid so the fitted line can be drawn over the data.
X = np.arange(4, 25, 0.01).reshape(-1, 1)
H = model.predict(X)
plt.plot(X, H, color='red')
plt.show()
print(model.coef_)
print(model.intercept_)
结果如下:
斜率:1.19303364
截距:-3.89578088