多元线性回归初学
导入库
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing as fch
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
开始操作
# Yearly observations for 2009-2019 (11 data points).
years = np.arange(2009, 2020)
sales = np.array([
    0.5, 9.36, 52, 191, 352, 571, 912, 1207, 1682.69, 2135, 2684,
])

# Scatter the raw data to see the shape of the trend before fitting.
fig, ax = plt.subplots(figsize=(13, 8), dpi=80)
ax.scatter(years, sales, c='red', marker='v')
plt.show()
优化一下
# Re-base the years so that 2009 -> 1, ..., 2019 -> 11. The estimator expects
# a 2-D feature matrix, hence the reshape into a single column.
x = (years - 2008).reshape(-1, 1)
y = sales

# Ordinary least-squares straight-line fit (fit() returns the estimator).
model = LR()
model.fit(x, y)

# Overlay the fitted line on the observations.
slope, intercept = model.coef_[0], model.intercept_
fig, ax = plt.subplots(figsize=(13, 8), dpi=80)
ax.scatter(years - 2008, sales, c='red', marker='v')
ax.plot(x, slope * x + intercept)
plt.show()
# Add a squared term so that a linear model can fit a parabola.
# Column order is [x**2, x].
x_new = np.hstack((x ** 2, x))
x_new
array([[ 1, 1],
[ 4, 2],
[ 9, 3],
[ 16, 4],
[ 25, 5],
[ 36, 6],
[ 49, 7],
[ 64, 8],
[ 81, 9],
[100, 10],
[121, 11]])
# Refit on the two-column matrix x_new; the two coefficients are the weights
# of its first and second columns respectively.
model = LR()
model.fit(x_new, y)
w1, w2 = model.coef_

# Draw the fitted second-degree curve over the observations.
fitted_curve = w1 * x ** 2 + w2 * x + model.intercept_
fig, ax = plt.subplots(figsize=(13, 8), dpi=80)
ax.scatter(years - 2008, sales, c='red', marker='v')
ax.plot(x, fitted_curve)
plt.show()
预测
def f_sales(x, *, _w1=w1, _w2=w2, _b=model.intercept_):
    """Evaluate the fitted quadratic ``w1*x**2 + w2*x + intercept`` at *x*.

    The coefficients are captured as keyword-only defaults when the function
    is defined. Rebinding the globals ``model``, ``w1`` or ``w2`` afterwards
    (``model`` is reused for another regression later in this script)
    therefore does not change what this function returns.

    Args:
        x: Re-based year (``year - 2008``); a scalar or a NumPy array.

    Returns:
        The value of the fitted quadratic at *x*.
    """
    return _w1 * x ** 2 + _w2 * x + _b


f_sales(12)
3280.062242424243
案例
# Fetch the California housing data set through scikit-learn.
housevalue = fch()

# Feature matrix as a DataFrame; columns are still positional at this point.
x = pd.DataFrame(data=housevalue.data)
x

# Regression target and its range.
y = housevalue.target
y.max()
y.min()

# Attach the feature names as column labels.
x.columns = housevalue.feature_names

# Hold out 30% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=420,
)
x_test
恢复索引
# Reset the index: the split frames keep their original (shuffled) row
# labels, so give both feature frames a clean 0..n-1 index.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

# Standardize using statistics learned from the TRAINING split only, and apply
# the same transform to the test split. Scaling the test set with its own
# mean/std would put the two splits on different scales and leak test-set
# information into the evaluation.
# NOTE(review): outputs recorded with the previous per-split scaling will
# differ slightly from what this produces.
train_mean = x_train.mean()
train_std = x_train.std()
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

# Fit on the standardized training features and predict the held-out rows.
model = LR().fit(x_train, y_train)
y_hat = model.predict(x_test)
y_hat
array([1.49203117, 0.45498477, 2.25315388, ..., 2.11133918, 1.7670662 ,
0.72741136])
# Fitted regression coefficients (one per feature column of x_train).
model.coef_
array([ 8.34359117e-01, 1.28396759e-01, -2.71761090e-01, 3.07246826e-01,
5.87780105e-04, -4.09691866e-02, -8.81180273e-01, -8.52996280e-01])
# Pair each feature name with its fitted coefficient.
list(zip(x_train.columns, model.coef_))
[('MedInc', 0.8343591168009582),
('HouseAge', 0.12839675861589372),
('AveRooms', -0.27176109015428723),
('AveBedrms', 0.3072468258262482),
('Population', 0.0005877801053252033),
('AveOccup', -0.040969186598430574),
('Latitude', -0.8811802730160843),
('Longitude', -0.8529962804954484)]
'''
'MedInc':该街区住户的收入中位数
'HouseAge':该街区房屋使用年代的中位数
'AveRooms':该街区平均的房间数目
'AveBedrms':该街区平均的卧室数目
'Population':该街区人口
'AveOccup':该街区平均入住率
'Latitude':该街区的纬度
'Longitude':该街区的经度
'''
如何评判
from sklearn.metrics import mean_squared_error as MSE

# Mean squared error on the held-out set. Arguments are passed in the
# documented (y_true, y_pred) order, consistent with the other metric calls
# in this script. MSE is symmetric, so the value itself is unchanged.
MSE(y_test, y_hat)
0.5274256678664516
# Mean of the held-out targets; gives a scale reference for the error metric.
y_test.mean()
2.0819292877906976
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the held-out set.
r2_score(y_true=y_test, y_pred=y_hat)
0.606956866619088
# R^2 again, this time through the estimator's own score() method.
r2 = model.score(X=x_test, y=y_test)
r2
0.606956866619088
# Plot the sorted true targets against the sorted predictions. The two curves
# are sorted independently, so this compares the value distributions rather
# than per-sample errors.
fig, ax = plt.subplots(figsize=(13, 8), dpi=80)
real_sorted = sorted(y_test)
predicted_sorted = sorted(y_hat)
ax.plot(range(len(y_test)), real_sorted, c="red", label="real")
ax.plot(range(len(y_hat)), predicted_sorted, c="blue", label="predict")
ax.legend()
plt.show()