Keras-8 Predicting house prices: a regression example

Predicting house prices: a regression example

0. 探索数据

from keras.datasets import boston_housing
from keras import layers
from keras import models
from keras import callbacks

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Using TensorFlow backend.

导入数据

(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

print('Training data shape:{}, training prices shape:{}'.format(x_train.shape, y_train.shape))
print('Test data shape:{}, test prices shape:{}'.format(x_test.shape, y_test.shape))
Training data shape:(404, 13), training prices shape:(404,)
Test data shape:(102, 13), test prices shape:(102,)

显示部分数据

features = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATION', 'B     ', 'LSTAT', 'MEDV']
for f in features:
    print('{:<7}'.format(f), end='\t')
print('')

for features_val, price in zip(x_train[:20], y_train[:20]):
    for val in features_val:
        print('{:<7}'.format(val), end='\t')
    print(price)
房价和犯罪率的关系
  • 通过结果我们可以看到,犯罪率和房价有很大的关系,犯罪率低的区域房价明显要高,最贵的那些房子所在区域都是犯罪率低的区域。
crime = x_train[:, 0]
plt.scatter(crime, y_train)
plt.xlabel('Crime')
plt.ylabel('Price')
plt.show()

png

是否靠河
  • 从结果上看,靠河的房价普遍高于不靠河的房价,房价最高的房子基本靠河
  • 不靠河也有价格高的房子,但是被视作离群值。
chas = x_train[:, 3]
price_chas = y_train[chas == 1]
price_no_chas = y_train[chas == 0]
plt.boxplot([price_no_chas, price_chas], labels=['0', '1'])
plt.show()

png

来个汇总
  • 可以看到,有些特征能够很好的反映房价,例如犯罪率(CRIM),LSTAT, B, RM等等
  • 有些则难以从中判断房价,例如RAD, TAX
features_without_price = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE','DIS', 'RAD', 'TAX', 'PTRATION', 'B     ', 'LSTAT']

fig, axes = plt.subplots(4,4, sharey=True, figsize=(15,15))
for i, f in enumerate(features_without_price, 0):
    plot_rows = i // 4
    plot_cols = i % 4
    if f == 'CHAS':
        chas = x_train[:, 3]
        price_chas = y_train[chas == 1]
        price_no_chas = y_train[chas == 0]
        axes[plot_rows, plot_cols].boxplot([price_no_chas, price_chas], labels=['0', '1'])
    else:
        f_val = x_train[:, i]
        axes[plot_rows, plot_cols].scatter(f_val, y_train)
        axes[plot_rows, plot_cols].set_xlabel(f)
        axes[plot_rows, plot_cols].set_ylabel('Price')
plt.show()

png

1. 定义问题

  • 输入数据: xRn x ∈ R n , n是特征个数
  • 希望的输出:房价
  • 回归问题

2. 衡量指标

  • 回归问题,用MAE(Mean Absolute Error; 平均绝对误差)

3. 验证策略

  • 训练样本很少,因此k-fold validation是很好的选择

4. 准备数据

# 数据归一化,注意'CHAS'的值 0 或者 1,无需归一化
chas_train = x_train[:, 3]
chas_test = x_test[:, 3]

mean = np.mean(x_train, axis=0)
x_train -= mean
std = np.std(x_train, axis=0)
x_train /= std

x_test -= mean
x_test /= std

x_train[:, 3] = chas_train
x_test[:, 3] = chas_test

5. 简单模型

  • 这个问题的base line 是什么?不太清楚,大概取个平均值对付一下
  • 6.59左右,说明如果取房价的平均值来预测房价,大概误差是6500美元左右,差距挺大的
mean_value = np.mean(y_train)
baseline_value = np.mean(np.abs(y_test - mean_value))
print('Base line value:{}'.format(baseline_value))
Base line value:6.533042127742185
建立一个简单模型
def build_base_model():
    model = models.Sequential()

    model.add(layers.Dense(32, activation='relu', input_shape=(13,)))
    model.add(layers.Dense(32, activation='relu'))

    model.add(layers.Dense(1, activation=None))
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])

    return model
# K-fold validation
k = 4
num_val_samples = len(x_train) // k
all_score = []
for i in range(k):
    x_val = x_train[i*num_val_samples : (i+1)*num_val_samples]
    y_val = y_train[i*num_val_samples : (i+1)*num_val_samples]

    partial_x_train = np.concatenate([x_train[:i*num_val_samples],
                                      x_train[(i+1)*num_val_samples:]], axis=0)
    partial_y_train = np.concatenate([y_train[:i*num_val_samples],
                                      y_train[(i+1)*num_val_samples:]], axis=0)

    model = build_base_model()
    model.fit(partial_x_train, partial_y_train, epochs=10, batch_size=1, verbose=0)
    val_loss, val_mae = model.evaluate(x_val, y_val)
    print('MAE:{}'.format(val_mae))
    all_score.append(val_mae)

print('Mean Score:{}'.format(np.mean(all_score)))
101/101 [==============================] - 0s 255us/step
MAE:4.367843151092529
101/101 [==============================] - 0s 355us/step
MAE:4.813835068504409
101/101 [==============================] - 0s 640us/step
MAE:4.371486975414919
101/101 [==============================] - 0s 631us/step
MAE:6.346890519161035
Mean Score:4.975013928543223

6. 调整参数

def build_model():
    model = models.Sequential()

    model.add(layers.Dense(64, activation='relu', input_shape=(13,)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    #model.add(layers.Dropout(0.2))

    #model.add(layers.BatchNormalization())    
    model.add(layers.Dense(1, activation=None))
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])

    return model
# 平滑曲线
def smooth_curve(points, factor=0.9):
    smoothed_points = []
    smoothed_points.append(points[0])

    for point in points[1:]:
        previous = smoothed_points[-1]
        smoothed_points.append( previous*factor + point * (1-factor) )

    return smoothed_points
# K-fold validation
k = 4
num_val_samples = len(x_train) // k
all_mae_histories = []
num_epochs = 200
batch_size = 2
for i in range(k):
    x_val = x_train[i*num_val_samples : (i+1)*num_val_samples]
    y_val = y_train[i*num_val_samples : (i+1)*num_val_samples]

    partial_x_train = np.concatenate([x_train[:i*num_val_samples],
                                      x_train[(i+1)*num_val_samples:]], axis=0)
    partial_y_train = np.concatenate([y_train[:i*num_val_samples],
                                      y_train[(i+1)*num_val_samples:]], axis=0)

    model = build_model()
    history = model.fit(partial_x_train, partial_y_train, epochs=num_epochs, batch_size=batch_size, verbose=0, validation_data=(x_val, y_val))
    mae_history = history.history['val_mean_absolute_error']
    all_mae_histories.append(mae_history)
    print('{}-fold\t MAE:{}'.format(i+1, np.mean(mae_history)))

test_mae_loss, test_mae_score = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test MAE score:{}'.format(test_mae_score))

# plot the validation scores

average_mae_history = np.mean(all_mae_histories, axis=0)
smoothed_points = smooth_curve(average_mae_history[10:])

layer_shapes = []
for layer in model.layers:
    layer_shapes.append(str(layer.output_shape[-1]))

fig_title = '-'.join(layer_shapes)
fig_title += str("_E{}_b{}".format(num_epochs, batch_size))
plt.plot(range(1, len(smoothed_points)+1), smoothed_points)
plt.title(fig_title)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.savefig(fig_title + '.png')
plt.show()
1-fold   MAE:2.1139200915676533
2-fold   MAE:2.3213165556794344
3-fold   MAE:2.4103674963205166
4-fold   MAE:2.397814279806496
102/102 [==============================] - 0s 113us/step
Test MAE score:2.7559990508883607

png

# 做最终的训练
final_epochs = 160
model = build_model()
model.fit(x_train, y_train, epochs=final_epochs, batch_size=batch_size, verbose=0)
test_mae_loss, test_mae_score = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test MAE score:{}'.format(test_mae_score))
102/102 [==============================] - 1s 7ms/step
Test MAE score:2.3922921349020565
# 输出一些预测结果
y_preds = model.predict(x_test)

range_x = range(1, len(y_preds)+1)
plt.figure(figsize=(15,9))
plt.plot(range_x, y_preds, '--o', alpha=0.5, label='Predictions')
plt.plot(range_x, y_test, '-o', alpha=0.8, label='Real')
plt.legend()
plt.show()

png

7. 总结

  • 加了Dropout(0.2)效果不好,平均3.5左右
  • 两层网络 64->32->1 效果还行,在2.5左右
  • 三层网络 64-32-32-1 效果还行,2.5左右,并且训练的曲线一直在下降,所以epochs从100增加到200。从结果上看,大概从175个epoch开始过拟合
    • 这里写图片描述
  • 加了BatchNormalization 效果变差了
    • 这里写图片描述
  • 四层网络 64-32-32-16-1也还行吧
    • 64-32-32-16-1
  • 最后的结果表面我们的模型预测还是相当准确的,误差在2300美元左右
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值