# Boston housing prices. The dataset ships with Keras and is downloaded on first use.
from keras.datasets import boston_housing
import numpy as np
from keras import models
from keras import layers
# Training set has 404 samples, test set 102; each sample has 13 features.
(train_data,train_targets),(test_data,test_targets)=boston_housing.load_data()
# Standardize each feature: subtract its mean, then divide by its standard
# deviation. axis=0 computes the statistic per column, i.e. per feature.
# NOTE: the test set is normalized with statistics computed on the TRAINING
# data only, so no information from the test set leaks into preprocessing.
mean=train_data.mean(axis=0)
train_data-=mean
std=train_data.std(axis=0)
train_data/=std
test_data-=mean
test_data/=std
def build_model(input_dim=None):
    """Build and compile a small MLP for scalar regression.

    Args:
        input_dim: number of input features. Defaults to
            ``train_data.shape[1]`` (13 for Boston housing), which
            preserves the original behavior while removing the hidden
            hard-wired dependency on the module-level ``train_data``.

    Returns:
        A compiled ``keras.models.Sequential`` model.
    """
    if input_dim is None:
        input_dim = train_data.shape[1]
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dense(64, activation='relu'))
    # No activation on the output layer: a regression output must not be
    # range-limited (e.g. a sigmoid would squash predictions into (0, 1)).
    model.add(layers.Dense(1))
    # MSE is the standard regression loss; MAE is tracked as a metric
    # because it is easier to interpret (same unit as the target).
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])
    return model
# K-fold cross-validation: when the training set is too small to spare a
# fixed validation split, partition it into K equal parts and validate on
# each part in turn. K is typically 4 or 5.
k=4
# // is integer (floor) division; / would yield a float.
each_samples=len(train_data)//k
num_epochs=100
all_scores=[]
for i in range(k):
    print('processing fold #',i)
    lo, hi = i * each_samples, (i + 1) * each_samples
    # Fold i is held out for validation ...
    val_data = train_data[lo:hi]
    val_targets = train_targets[lo:hi]
    # ... and the remaining folds are concatenated into the partial
    # training set.
    part_train_data = np.concatenate([train_data[:lo], train_data[hi:]], axis=0)
    part_train_targets = np.concatenate([train_targets[:lo], train_targets[hi:]], axis=0)
    model = build_model()
    # verbose: 0 = silent, 1 = progress bar, 2 = one log line per epoch.
    model.fit(part_train_data, part_train_targets,
              epochs=num_epochs, batch_size=1, verbose=0)
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)
# Mean MAE over the K folds (about 2.40 in the original run).
print(np.mean(all_scores))
# Re-run with epochs=500 to record per-epoch validation MAE. In practice
# overfitting starts around epoch 80, and this run is slow (batch_size=1).
num_epochs=500
all_mae_history=[]
for i in range(k):
    print('processing fold #',i)
    val_data=train_data[i*each_samples:(i+1)*each_samples]
    val_targets=train_targets[i*each_samples:(i+1)*each_samples]
    # Concatenate the remaining folds into the partial training set.
    part_train_data=np.concatenate([train_data[:i*each_samples],train_data[(i+1)*each_samples:]],axis=0)
    part_train_targets=np.concatenate([train_targets[:i*each_samples],train_targets[(i+1)*each_samples:]],axis=0)
    model=build_model()
    # verbose: 0 = silent, 1 = progress bar, 2 = one log line per epoch.
    history=model.fit(part_train_data,part_train_targets,epochs=num_epochs,batch_size=1,validation_data=(val_data,val_targets),verbose=0)
    # BUG FIX: the metric key depends on the Keras version — old Keras used
    # 'val_mean_absolute_error', while Keras 2.3+ / tf.keras uses 'val_mae'
    # (the model was compiled with metrics=['mae']). Hard-coding the old key
    # raises KeyError on current Keras; use whichever key is present.
    val_mae_key='val_mae' if 'val_mae' in history.history else 'val_mean_absolute_error'
    mae_history=history.history[val_mae_key]
    all_mae_history.append(mae_history)
# Evaluate the model from the LAST fold on the held-out test set.
test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)
# Observed range 2.46-2.92, mean about 2.75.
print('test_mae_score:',test_mae_score)
# Average the per-epoch validation MAE across the K folds.
avg_mae_history=[np.mean([x[i] for x in all_mae_history]) for i in range(num_epochs)]
# print(avg_mae_history)
# Train one final model on ALL training data, using the epoch count the
# cross-validation curve suggested (~80, just before overfitting sets in).
num_epochs = 80
model = build_model()
model.fit(train_data, train_targets,
          epochs=num_epochs, batch_size=16, verbose=0)
mse, mae = model.evaluate(test_data, test_targets)
# Original run: mse ~= 15.69, mae ~= 2.55.
print(mse, mae)
# The resulting validation-MAE curve is shown in the accompanying figure.
# Because the dataset is very small, K-fold cross-validation was used here —
# a very useful technique when little data is available.