思路在这:
【房价预测】BP神经网络回归的现实应用-上海市二手房价格影响因素分析——思路剖析和结果分享
前言:
不提供数据,不提供爬虫,协助调代码正常运行和安装geoplot环境
100 RMB一次(因为真的很麻烦);其他定制需求看复杂程度收恰饭钱。
讲解实现思路和各模块的协调和作用(建议看上面放的【思路分析】链接)
预测效果:
大部分的差价都在百位左右,少数差价在千位以上和十位以下:
模型损失:
代码:
jupyter-notebook 全代码的【下载链接】(有详细注释,需要关注后才能下载,持续吸粉中):BP神经网络实现-上海市二手房价格影响因素分析
import pandas as pd
import os
HOUNG_PATH = 'F:/2020Work/final'  # NOTE(review): likely a typo for HOUSING_PATH; name kept since later cells use it


def load_housing_data(housing_path=HOUNG_PATH, filename="housing.csv", encoding='gbk'):
    """Load the housing CSV from *housing_path* into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing the CSV file.
    filename : str
        CSV file name (generalized; the original hard-coded "housing.csv").
    encoding : str
        File encoding (generalized; the original hard-coded 'gbk' for the
        Chinese-encoded source data).
    """
    csv_path = os.path.join(housing_path, filename)
    return pd.read_csv(csv_path, encoding=encoding)
# A CSV file stores tabular data as comma-separated columns.
## Load the housing data once and inspect its columns/dtypes.
housing = load_housing_data()
# BUG FIX: the original called load_housing_data() a second time, re-reading
# the same file from disk; an in-memory copy is equivalent and cheaper.
housing_train = housing.copy()
housing.info()
import matplotlib.pyplot as plt

# Use a CJK-capable font and keep the minus sign renderable under it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Histogram of every numeric column, to eyeball each distribution.
housing.hist(bins=50, figsize=(25, 15))
plt.savefig("图1-房屋各类数据分布.png")
plt.show()
# Pin the colour scale to the observed unit-price range so the colour bar
# is comparable across runs.
vnorm = plt.Normalize(vmin=15000.60047, vmax=70000.0412)

# Geographic scatter: position = lon/lat, marker size = favourites count,
# colour = price per square metre.
housing.plot(
    kind="scatter",
    x="经度",
    y="纬度",
    alpha=0.1,
    s=housing["收藏人数"],
    label="收藏人数",
    c="单位价格/平方米",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
    norm=vnorm,
)
plt.legend()
plt.savefig("图2-地理位置与价格和收藏人数的关系.png", bbox_inches='tight', pad_inches=0, dpi=300)
plt.show()
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
from shapely.geometry import Point  # turn lon/lat pairs into Point geometries

# Load the Shanghai district boundaries as a GeoDataFrame.
geojson_path = os.path.join(HOUNG_PATH, "shanghai.json")
shanghai_boroughts = gpd.read_file(geojson_path)

# Build a GeoSeries of house locations.
# NOTE(review): row 0 is deliberately skipped here — presumably a stray
# header/placeholder row in the data; confirm against the CSV.
longitude = housing["经度"][1:len(housing["经度"])]
latitude = housing["纬度"][1:len(housing["纬度"])]  # renamed: this is latitude, not altitude
xy = [Point(p) for p in zip(longitude, latitude)]
# BUG FIX: original used `gp.GeoSeries`, but the module is imported as `gpd`
# (`gp` is undefined and raised NameError).
pts = gpd.GeoSeries(xy)
import csv

# Convert the raw CSV into a GeoJSON FeatureCollection with one Point
# feature per house.
# Column indices used below: 0=name, 9=unit price, 10=favourites,
# 15=longitude, 16=latitude.
csv_path = os.path.join(HOUNG_PATH, "housing.csv")
features = []
# BUG FIX: open with the same 'gbk' encoding used for pd.read_csv elsewhere,
# and close the reader via a context manager (the original leaked the handle).
with open(csv_path, encoding='gbk') as csv_file:
    for row in csv.reader(csv_file):
        if row[15] == "经度":  # header row — skip
            continue
        features.append(
            '{"type":"Feature","properties":{"price":%s,"loved":%s,"name":"%s"},'
            '"geometry":{"type":"Point","coordinates":[%s]}}'
            % (row[9], row[10], row[0], row[15] + "," + row[16])
        )
# join() also fixes the empty-input edge case: the original `geojson[:-1]`
# would have chopped the opening '[' when no rows were present.
geojson = '{"type":"FeatureCollection","features":[' + ",".join(features) + ']}'
# BUG FIX: the output file was never closed; use a context manager.
with open(HOUNG_PATH + "/housing.geojson", 'w', encoding='utf-8') as links_file:
    links_file.write(geojson)
import mapclassify as mc

# Read the per-house GeoJSON back and attach mean prices to each district.
point_path = os.path.join(HOUNG_PATH, "housing.geojson")
print(point_path)
house_df = gpd.read_file(point_path)  # one Point feature per house
house_df.head()

# Mean unit price per district name (index = name).
geo_price = house_df.groupby('name')[['price']].mean()
geo_price.columns = ["mean"]

# Left-join the means onto the district polygons (merge matches the 'name'
# index level of geo_price against the 'name' column of the polygons), then
# switch the active geometry to each district's centroid for point plotting.
geo_merge = pd.merge(shanghai_boroughts, geo_price, on="name", how="left")
geo_merge['center'] = geo_merge['geometry'].centroid
geo_merge.set_geometry("center", inplace=True)
# Start the feature table from a fresh copy and drop columns that carry no
# predictive signal (page number, URL, publish time, exact place name).
housing_train = housing.copy()
# One drop() call instead of four chained copies of the whole frame.
housing_train = housing_train.drop(columns=["页码", "访问网址", "发布时间", "具体地名"])
housing_labels = housing_train["单位价格/平方米"].copy()
# NOTE(review): the label column "单位价格/平方米" is NOT removed from the
# feature table here — if it later reaches the model input it leaks the
# target into the features. Confirm downstream handling.
housing_train
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
## Medians only exist for numeric columns, so work on a copy without the
## text/categorical columns (and without the redundant total-price column).
housing_num = housing_train.drop(columns=[
    "房屋朝向", "房型", "大致区域", "房屋装修",
    "建筑类型", "楼层信息", "总价格/", "楼层类型",
])
housing_num.head(5)

## fit() learns each column's median; those medians replace NaNs below.
imputer.fit(housing_num)
# (removed a stray `SimpleImputer(...)` expression whose result was
#  discarded — it had no effect, and its missing_values='NaN' argument is
#  invalid for SimpleImputer, which expects np.nan)
imputer.statistics_
## Sanity check: statistics_ should equal the column medians.
housing_num.median().values
X = imputer.transform(housing_num)
# Put the imputed array back into a DataFrame, keeping the ORIGINAL index so
# the label-based .loc lookup below stays aligned with `housing`.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
housing_tr.loc[sample_incomplete_rows.index.values]
from sklearn.preprocessing import LabelEncoder

# Exploratory encoding of the categorical columns (the real pipeline below
# uses OneHotEncoder directly).
encoder = LabelEncoder()
housing_direction = housing["房屋朝向"]
housing_type = housing["房型"]
housing_area = housing["大致区域"]
housing_decoration = housing["房屋装修"]
housing_struct = housing["建筑类型"]
housing_floor = housing["楼层信息"]

# Step 1: map each orientation string to an integer code.
housing_direction_encoded = encoder.fit_transform(housing_direction)
housing_direction_encoded
print(encoder.classes_)

from sklearn.preprocessing import OneHotEncoder

# Step 2: one-hot the integer codes (two-step form).
encoder = OneHotEncoder()
housing_direction_1hot = encoder.fit_transform(housing_direction_encoded.reshape(-1, 1))
housing_direction_1hot.toarray()

from sklearn.preprocessing import LabelBinarizer

# Same result in a single step with LabelBinarizer.
encoder = LabelBinarizer(sparse_output=True)
housing_direction_1hot = encoder.fit_transform(housing_direction)
housing_direction_1hot
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
# 栏序号
# 房屋面积 修建时间 总价格/ 收藏人数 近地铁 未满5年 满五年免税 经度 纬度
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append two derived columns to the numeric feature matrix.

    Expected column order (see the column-index comment above): index 1 is
    the build year, 4 the near-subway flag, 5 the under-5-years flag, 6 the
    tax-free flag. Appends:

    * house age  = current_year - build year
    * popularity = tax-free flag + near-subway flag
    """

    def __init__(self, current_year=2020):  # generalized: reference year was hard-coded to 2020
        self.current_year = current_year

    def fit(self, X, y=None):
        return self  # stateless transformer — nothing to learn

    def transform(self, X, y=None):
        house_age, house_subway, house_five, house_taxfree = 1, 4, 5, 6
        # (removed a debug `print(X[0])` left over from development — it
        #  spammed the console on every pipeline call)
        house_class = X[:, house_taxfree] + X[:, house_subway]
        house_year = self.current_year - X[:, house_age]
        return np.c_[X, house_year, house_class]
# Run the adder outside the pipeline once, to inspect the augmented table.
attr_adder = CombinedAttributesAdder()
augmented = attr_adder.transform(housing_num.values)
housing_extra_attribs = pd.DataFrame(
    augmented,
    columns=list(housing_num.columns) + ["房屋年龄", "受欢迎因素"],
)
housing_extra_attribs.head()
from sklearn.base import TransformerMixin  # gives fit_transform for free


class MyLabelBinarizer(TransformerMixin):
    """Pipeline-friendly wrapper around LabelBinarizer.

    sklearn pipelines call ``fit(X, y)``; ``LabelBinarizer.fit`` only takes
    one argument, so this wrapper accepts and ignores the extra ``y``.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        # BUG FIX: the original `def fit(self, x)` raised TypeError when a
        # Pipeline passed the target as a second positional argument.
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: median imputation -> derived attributes -> scaling.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),   # fill missing values
    ('attribs_adder', CombinedAttributesAdder()),    # add age / popularity columns
    ('std_scaler', StandardScaler()),                # standardize numeric features
])
# (removed a leftover debug `print(housing_num)` that dumped the whole frame)
housing_num_tr = num_pipeline.fit_transform(housing_num)
from sklearn.base import BaseEstimator, TransformerMixin


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the given columns from a DataFrame, returning a numpy array.

    Needed because this sklearn version's pipeline steps operate on arrays,
    not DataFrames.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        return X[self.attribute_names].values
from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)
cat_attribs = ["大致区域", "房型", "建筑类型", "房屋装修", "楼层信息", "楼层类型"]

# Numeric branch: select columns -> impute -> derive attributes -> scale.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select columns -> one-hot encode.
# ROBUSTNESS FIX: handle_unknown="ignore" keeps transform() from raising on
# categories that appear only in the test split (they encode as all-zeros).
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown="ignore")),
])

# Concatenate both branches into one feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
## 神经网络训练:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.datasets import boston_housing
from keras.layers import Dense, Dropout
from keras.utils import multi_gpu_model
from keras import regularizers # 正则化
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
# Split the cleaned table, then fit the preprocessing on the TRAINING split only.
housing_tr = housing_train.copy()
train_set, test_set = train_test_split(housing_tr, test_size=0.2, random_state=24)
print(test_set.head())
print(train_set.head())
x_train = pd.DataFrame(full_pipeline.fit_transform(train_set))
y_train = pd.DataFrame(train_set["单位价格/平方米"])
# BUG FIX: the original called fit_transform on the test set too, which
# re-fits the scaler/encoder on test data (leakage) and can yield a
# different number of one-hot columns than the training matrix. The test
# split must only be transformed with the already-fitted pipeline.
x_valid = pd.DataFrame(full_pipeline.transform(test_set))
y_valid = pd.DataFrame(test_set["单位价格/平方米"])
y_valid_view = y_valid.copy()
print(x_train, y_train, x_valid, y_valid)
# Single-CPU or GPU version; Keras picks the GPU automatically if one exists.
# Architecture: Dense(64, relu) -> Dropout(0.1) -> Dense(64, relu, regularized) -> Dense(1, linear)
model = Sequential() # initialize the model -- required before adding layers
model.add(Dense(units = 64, # layer width (number of output units)
activation='relu', # activation function
input_shape=(x_train.shape[1],) # input size = number of feature columns
)
)
model.add(Dropout(0.1)) # probability of dropping a unit's connections during training
model.add(Dense(units = 64,
kernel_regularizer=regularizers.l2(0.05), # L2 penalty on the layer weights
activity_regularizer=regularizers.l1(0.05), # L1 penalty on the layer output
activation='relu', # activation function
bias_regularizer=regularizers.l1_l2(0.05) # combined L1/L2 penalty on the bias vector
)
)
model.add(Dense(units = 1,
activation='linear' # linear activation -- the usual choice for a regression output layer
)
)
print(model.summary()) # print the layer-by-layer network structure
model.compile(loss='mse', # mean-squared-error loss
optimizer='adam', # optimizer
)
history = model.fit(x_train, y_train,
epochs=1000, # number of training epochs
batch_size=512, # mini-batch size for each gradient-descent step
verbose=2, # log verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
validation_data = (x_valid, y_valid) # validation set evaluated each epoch
)
import matplotlib.pyplot as plt


def _plot_loss(train_curve, val_curve, title):
    # Shared helper: draw train/validation loss with common axis labels.
    plt.plot(train_curve)
    plt.plot(val_curve)
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


# Full training & validation loss curves.
_plot_loss(history.history['loss'], history.history['val_loss'], 'Model loss')
# Zoomed view: skip the first 200 noisy epochs and sample every 10th point.
_plot_loss(history.history['loss'][200::10],
           history.history['val_loss'][200::10],
           'Model loss 局部')
from keras.utils import plot_model
from keras.models import load_model

# Persist the trained network as an HDF5 file.
model.save('model_MLP_128.h5')
# Visualize the architecture (requires pydot: pip install pydot).
plot_model(model, to_file='model_MLP.png', show_shapes=True)
# Round-trip: reload the model we just saved.
model = load_model('model_MLP_128.h5')