# ===== Import the required libraries =====
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # fast CSV reading plus high-performance, easy-to-use data structures and analysis tools
from sklearn.metrics import mean_squared_error  # mean squared error (MSE)
from sklearn.metrics import mean_absolute_error  # mean absolute error (MAE)
from sklearn.metrics import r2_score  # R squared
from sklearn.utils import shuffle  # shuffles a sequence and returns a new, reordered copy
import tensorflow._api.v2.compat.v1 as tf
from tensorflow.keras import layers, models  # needed by build_model() below
tf.disable_v2_behavior()
# ===== Read the dataset =====
data = pd.read_csv('kc_train.csv', header=None)
# ===== Dataset split — method 1 (manual hold-out split) =====
# 分割训练集与测试集-训练集占70%,测试集占30%
train_data = data.sample(frac=0.7,random_state=0,axis=0)
test_data = data[~data.index.isin(train_data.index)]
#划分标签值
train_data.columns = ["销售日期", "销售价格", "卧室数", "浴室数", "房屋面积", "停车面积", "楼层数", "房屋评分",
"建筑面积", "地下室面积", "建筑年份", "修复年份", "纬度", "经度"]
test_data.columns = ["销售日期", "销售价格", "卧室数", "浴室数", "房屋面积", "停车面积", "楼层数", "房屋评分",
"建筑面积", "地下室面积", "建筑年份", "修复年份", "纬度", "经度"]
target = train_data['销售价格']
target1 = test_data['销售价格']
housing = train_data.drop(columns=['销售价格'])
housing1 = test_data.drop(columns=['销售价格'])
# ===== Dataset split — method 2 (k-fold cross-validation, usually better) =====
#划分标签值
train_data.columns = ["销售日期", "销售价格", "卧室数", "浴室数", "房屋面积", "停车面积", "楼层数", "房屋评分",
"建筑面积", "地下室面积", "建筑年份", "修复年份", "纬度", "经度"]
test_data.columns = ["销售日期", "销售价格", "卧室数", "浴室数", "房屋面积", "停车面积", "楼层数", "房屋评分",
"建筑面积", "地下室面积", "建筑年份", "修复年份", "纬度", "经度"]
train_targets = train_data['销售价格']
test_targets = test_data['销售价格']
train_data = train_data.drop(columns=['销售价格'])
test_data = test_data.drop(columns=['销售价格'])
#把训练数据分成两组,一组用于训练网络,一组用于校验训练的结果
import numpy as np
k = 4
num_val_samples = len(train_data) // k #整数除法
num_epochs = 10
all_scores = []
for i in range(k):
print('processing fold #', i)
#依次把k分数据中的每一份作为校验数据集
val_data = train_data[i * num_val_samples : (i+1) * num_val_samples]
val_targets = train_targets[i* num_val_samples : (i+1) * num_val_samples]
#把剩下的k-1分数据作为训练数据,如果第i分数据作为校验数据,那么把前i-1份和第i份之后的数据连起来
partial_train_data = np.concatenate([train_data[: i * num_val_samples],
train_data[(i+1) * num_val_samples:]], axis = 0)
partial_train_targets = np.concatenate([train_targets[: i * num_val_samples],
train_targets[(i+1) * num_val_samples: ]],
axis = 0)
print("建立模型")
model = build_model()
#把分割好的训练数据和校验数据输入网络
history =model.fit(partial_train_data, partial_train_targets, epochs = num_epochs,
batch_size = 1, verbose = 0)
print("正在拟合模型")
val_mse, val_mae = model.evaluate(val_data, val_targets, verbose = 0)
all_scores.append(val_mae)
mae_history = history.history['val_mae']
all_mae_histories.append(mae_history)
model.summary()
print(all_scores)
# ===== Model definition =====
def build_model():
'''
由于后面我们需要反复构造同一种结构的网络,所以我们把网络的构造代码放在一个函数中,
后面只要直接调用该函数就可以将网络迅速初始化
'''
model = models.Sequential()
#输入层神经元个数为13个
model.add(layers.Dense(13, activation='relu', input_shape=(train_data.shape[1],)))
#shape[1]表示为train_data的列的长度
#激励函数选择relu
#隐藏层神经元个数为64个
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))
#输出层神经元为1个
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
#优化器采用rmsprop,就是误差计算方法。
return model
# ===== Compute the cross-validation error =====
# 得到200个平均值,这些平均值将用来衡量200个epoch训练中,模型精准度的变化
average_mae_history = [
np.mean([x[i] for x in all_mae_histories]) for i in range(num_epochs)
]
# Next, plot the per-epoch validation error averaged over the folds.
import matplotlib.pyplot as plt
plt.plot(range(1, len(average_mae_history) + 1), average_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()
# ===== Compare predictions with targets (optional) =====
# # Plot predictions vs. targets for the first 100 test samples.
# # NOTE(review): `preds` must be computed first, e.g. preds = model.predict(housing1);
# # the original used an undefined name `plot` — corrected to `plt` below.
# plt.figure(figsize=(10, 7))  # figure size
# num = 100
# x = np.arange(1, num + 1)  # compare the first 100 points
# plt.plot(x, target1[:num], label='target')  # ground-truth values
# plt.plot(x, preds[:num], label='preds')  # predicted values
# plt.legend(loc='upper right')  # legend position
# plt.show()
# ===== Retrain with tuned hyper-parameters =====
# 调整各项参数,然后再把模型重新训练一遍
model = build_model()
model.fit(train_data, train_targets, epochs = 30, batch_size = 16, verbose = 0)
#batch_size为分批次训练,提升效率
test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)
print(test_mae_score)