import pandas as pd
# Load the housing data set and take a first look at it.
df = pd.read_csv("housing.csv")
print(df.head())
print(df.shape)
# Drop "longitude" and "ocean_proximity". Author's note: the ocean-proximity
# values are roughly equivalent to the longitude values, so they are removed.
# `columns=` is used because the positional `axis` argument of
# DataFrame.drop is no longer accepted by pandas >= 2.0.
df = df.drop(columns=["longitude", "ocean_proximity"])
df.head()  # only displayed in an interactive session (e.g. a notebook)
# Remove every row that contains a missing value; the original run reported
# a resulting shape of (20433, 8).
df.dropna(inplace=True)
# Data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
# Separate the feature matrix from the target column.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# no longer accepts.
x = np.array(df.drop(columns=["median_house_value"]))
y = np.array(df["median_house_value"])
# Hold out 20% of the rows for testing (80% train / 20% test) *before*
# scaling, so the scaler never sees the test rows.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
# Fit the standardization on the training rows only, then apply the same
# transform to the test rows; fitting on the full data set would leak
# test-set statistics into training.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
# Keep `x` standardized as in the original script, using the train-fitted scaler.
x = scaler.transform(x)
print(train_x.shape, test_x.shape)  # original run: (16346, 7) (4087, 7)
print(train_y.shape, test_y.shape)  # original run: (16346,) (4087,)
# Helper that trains a model and evaluates it on both splits.
def train_test(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training data and score it on both splits.

    Args:
        model: Object exposing ``fit(x, y)`` and ``score(x, y)``. It is
            fitted in place.
        train_x: Training features.
        train_y: Training targets.
        test_x: Held-out features.
        test_y: Held-out targets.

    Returns:
        A tuple ``(R_train, R_test)`` holding ``model.score`` evaluated on
        the training split and on the test split respectively (for
        scikit-learn regressors this score is the R^2 coefficient).
    """
    model.fit(train_x, train_y)
    R_train = model.score(train_x, train_y)
    R_test = model.score(test_x, test_y)
    return R_train, R_test
# Model training
from sklearn.linear_model import LinearRegression,LassoCV,RidgeCV
import matplotlib.pyplot as plt
# Candidate linear models to compare: ordinary least squares, Lasso and
# Ridge (the latter two pick their regularization strength by
# cross-validation).
models = [LinearRegression(), LassoCV(alphas=[0.1, 1.0, 10.0]), RidgeCV()]

# End points of the red y = x reference line (perfect predictions fall on
# it). Two points draw the same straight line as the original 1..699999
# range, and the dedicated name avoids overwriting the module-level
# feature/target arrays `x` and `y`.
diag = [1, 699999]

for i, model in enumerate(models):
    # Fit the model and report its score on the training and test splits.
    R_train, R_test = train_test(model, train_x, train_y, test_x, test_y)
    print(model)
    print("model_train_R^2:", R_train)
    print("model_test_R^2:", R_test)

    # Scatter predicted vs. actual test targets, one stacked subplot per
    # model; the row count follows the number of models.
    pre_y = model.predict(test_x)
    plt.subplot(len(models), 1, i + 1)
    plt.scatter(pre_y, test_y)
    plt.xlabel("pre_x")
    plt.ylabel("real_x")
    plt.title("pre_x VS real_x")
    plt.grid()  # draw grid lines
    plt.plot(diag, diag, 'r')

# Keep the titles and axis labels of the stacked subplots from overlapping.
plt.tight_layout()
plt.show()
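# Note: this is the author's first attempt at predicting prices with machine-learning models (only linear regressors are used here, not deep learning), so the accuracy is not high. Feedback and suggestions are welcome. Thank you.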