我们这里先按照原文要求把输入特征线性映射到-1到1区间
import pandas as pd

# Show up to 10 columns when DataFrames are printed (used by describe() below).
pd.set_option('display.max_columns', 10)
def linear_scale(array):
    """Linearly map *array* into the range [-1, 1].

    The column minimum maps to -1 and the maximum to +1.

    Note: if every value is identical, ``scale`` is 0 and the division
    fails / produces inf — callers must not pass constant columns.
    """
    # Renamed from `min`/`max` to avoid shadowing the builtins.
    lo = array.min()
    hi = array.max()
    scale = (hi - lo) / 2
    return ((array - lo) / scale) - 1
def normalizer_linear_scale(df):
    """Rescale every column of *df* to [-1, 1] via ``linear_scale``.

    Mutates *df* in place (each column is overwritten) and returns it,
    so the return value aliases the argument.
    """
    # Iterate columns directly instead of indexing via range(len(...)).
    for col in df.columns:
        df[col] = linear_scale(df[col])
    return df
# Load the raw housing data, rescale every feature column to [-1, 1],
# restore the untouched target column, and persist the result for the
# later visualization / training steps.
# (read_csv already returns a fresh DataFrame; the original `.copy()` on
# it was redundant and has been dropped.)
df = pd.read_csv('california_housing_train.csv')
df_targets = df['median_house_value'].copy()
get_linear = normalizer_linear_scale(df)
# Overwrite the scaled target with the original values: only the input
# features should be normalized.
get_linear['median_house_value'] = df_targets
print(get_linear.describe())
get_linear.to_csv('normalizer_linear_scale.csv')
我们先来进行可视化。
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Load the rescaled data and, for every column, draw a 30-bin density
# histogram, the fitted normal pdf (red dashed) and the halved bin
# heights at the bin centers (green dashed).
df = pd.read_csv('normalizer_linear_scale.csv', index_col=0)
plt.figure(1, figsize=(20, 6))
for i, col in enumerate(df.columns):
    plt.subplot(3, 4, i + 1)
    # density=True (the documented boolean form of the original density=1)
    # normalizes bin heights so the pdf overlay is comparable.
    counts, bins, _ = plt.hist(df[col], 30, density=True, facecolor='blue', alpha=0.5)
    y = stats.norm.pdf(bins, df[col].mean(), df[col].std())
    plt.plot(bins, y, 'r--')
    plt.plot((bins[:-1] + bins[1:]) / 2, counts / 2, 'g--')
    plt.title('%s' % col)
plt.subplots_adjust(left=0.15, wspace=0.5, hspace=0.5)
plt.show()
可视化结果与原文一致,下一步我们构建神经网络。
import tensorflow as tf
from tensorflow.data import Dataset
class _dnn():
    """Helpers for a small TF1-style feed-forward regression network.

    Provides a Dataset input pipeline (`my_fn_input`), feature stacking
    (`get_features`), fully connected layer construction with an L2
    weight penalty (`add_layer`), an RMSE loss (`_loss`) and a training
    op factory (`train_step`).
    """

    def my_fn_input(self, features, targets, batch_size=1, num_epochs=1, shuffle=True):
        """Build a batched, repeated, optionally shuffled input pipeline.

        Returns a (features, labels) tensor pair from a one-shot iterator.
        NOTE: num_epochs * dataset size must cover batch_size * train
        steps, or the iterator raises OutOfRangeError during training.
        """
        features = {key: value for key, value in features.items()}
        ds = Dataset.from_tensor_slices((features, targets))
        ds = ds.batch(batch_size).repeat(num_epochs)
        if shuffle:
            # BUG FIX: Dataset.shuffle returns a new dataset; the original
            # code discarded the result, so the data was never shuffled.
            # NOTE(review): shuffling after batch() shuffles whole batches,
            # not individual examples — consider moving it before batch().
            ds = ds.shuffle(12000)
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels

    def get_features(self, xs_dict):
        """Stack a dict of 1-D feature tensors into one (batch, n_features) tensor."""
        # (typo `feautres` fixed; concat takes the list directly)
        columns = [tf.expand_dims(tensor, -1) for tensor in xs_dict.values()]
        return tf.concat(columns, -1)

    def add_layer(self, inputs, input_size, output_size, activation_function=None, regularization=0.001):
        """Create a fully connected layer and register its L2 weight penalty.

        The penalty (lambda = `regularization`) is appended to the
        'losses' collection so `_loss` can pick it up when
        regularizer=True. Returns (weights, biases, outputs).
        """
        weights = tf.Variable(tf.random_normal([input_size, output_size], stddev=.1))
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularization)(weights))
        biases = tf.Variable(tf.zeros([output_size]) + 0.1)
        wx_b = tf.matmul(inputs, weights) + biases
        outputs = wx_b if activation_function is None else activation_function(wx_b)
        return weights, biases, outputs

    def _loss(self, pred, ys, regularizer=False):
        """RMSE between predictions and targets.

        With regularizer=True, adds every term accumulated in the
        'losses' collection (the L2 penalties from `add_layer`).
        """
        rmse = tf.sqrt(tf.reduce_mean(tf.square(pred - ys)))
        if regularizer:
            return rmse + tf.add_n(tf.get_collection('losses'))
        return rmse

    def train_step(self, learning_rate, loss):
        """Return an Adam training op minimizing `loss`."""
        train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        # Gradient-descent alternative from the original article:
        # train = tf.train.GradientDescentOptimizer(learning_rate)
        # train = tf.contrib.estimator.clip_gradients_by_norm(train, 5.0).minimize(loss)
        return train
这里直接调用的adam,关于GradientDescent的方法文中已经给出并且comment掉了。
import numpy as np
from improving import _dnn
import pandas as pd
import tensorflow as tf
# Build and train a 9-10-10-1 regression network on the rescaled housing
# data, fed through the Dataset pipeline from the _dnn helper class
# (TF1-style graph + session code).
dnn = _dnn()
df = pd.read_csv('normalizer_linear_scale.csv', index_col=0)
# NOTE(review): reindexing with a random permutation and then calling
# sort_index() restores the original row order, so the shuffle is
# effectively undone — confirm this is intended.
df = df.reindex(np.random.permutation(df.index))
df = df.sort_index()
df_features = df[['longitude', 'latitude',
'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
'households', 'median_income', 'rooms_per_person']]
df_targets = df['median_house_value']
# First 12000 rows for training, last 5000 for validation (float32 for TF).
train_features = df_features.head(12000).astype('float32')
train_targets = df_targets.head(12000).astype('float32')
validation_features = df_features.tail(5000).astype('float32')
validation_targets = df_targets.tail(5000).astype('float32')
# Training pipeline: 12000 samples * 30 epochs / batch 100 = 3600 batches,
# enough to cover the 3000 training steps below.
xs, ys = dnn.my_fn_input(train_features, train_targets, batch_size=100, num_epochs=30, shuffle=True)
# Validation pipeline: one full-set batch per evaluation; 10 epochs for
# the 10 evaluations in the loop below.
vx, vy = dnn.my_fn_input(validation_features, validation_targets, batch_size=5000, num_epochs=10, shuffle=False)
# Stack the per-column tensors into (batch, 9) matrices.
xs = dnn.get_features(xs)
vx = dnn.get_features(vx)
# Three fully connected layers, all linear (default activation is None).
w1, b1, l1 = dnn.add_layer(xs, 9, 10)
w2, b2, l2 = dnn.add_layer(l1, 10, 10)
w3, b3, pred = dnn.add_layer(l2, 10, 1)
loss = dnn._loss(pred, ys, regularizer=False)
train_step = dnn.train_step(0.1, loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Validation forward pass reuses the trained weights; like the training
# layers above, it applies no activation function.
_vl1 = tf.matmul(vx, w1) + b1
_vl2 = tf.matmul(_vl1, w2) + b2
_vpred = tf.matmul(_vl2, w3) + b3
_vloss = dnn._loss(_vpred, vy, regularizer=False)
for i in range(3000):
    sess.run(train_step)
    if i % 300 == 0:
        # Each evaluation pulls one 5000-row batch from the validation pipeline.
        print('validation loss:', sess.run(_vloss))
训练结果与原文不一致,原文在rmse70左右徘徊,该网络的rmse在115左右。lr,weights分布,正则化率均更改过,均在115左右徘徊。如果我们这里batch_size过大的话,则会让运行速度非常非常慢,但是因为我们总体样本非常小,这里我们直接用placeholder来处理。
import tensorflow as tf
import numpy as np
import pandas as pd
from improving import _dnn
# Same 9-10-10-1 network as the previous script, but fed through
# placeholders with the full training set each step instead of the
# Dataset pipeline.
df = pd.read_csv('normalizer_linear_scale.csv', index_col=0)
# NOTE(review): the random reindex followed by sort_index() restores the
# original row order (see note in the previous script) — confirm intent.
df = df.reindex(np.random.permutation(df.index))
df = df.sort_index()
df_features = df[['longitude', 'latitude',
'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
'households', 'median_income', 'rooms_per_person']]
df_targets = df['median_house_value']
train_features = df_features.head(12000).astype('float32')
train_targets = df_targets.head(12000).astype('float32')
validation_features = df_features.tail(5000).astype('float32')
validation_targets = df_targets.tail(5000).astype('float32')
# Commented-out numpy emulation of the Dataset epoch/batch mechanics,
# kept for reference (discussed in the surrounding text).
# def get_num_epochs(matrix, epochs):
#     i = matrix.copy()
#     for k in range(epochs):
#         matrix = np.concatenate([matrix, i])
#     return matrix
#
# batch = 0
# def get_batch(matrix, batch_size):
#     global batch
#     outputs = matrix[batch:batch+batch_size, :]
#     batch += batch_size
#     return outputs
dnn = _dnn()
# np.mat(...).T turns the 1-D target Series into a column vector.
# NOTE(review): np.mat is deprecated in modern NumPy; np.asarray with an
# explicit reshape would be the current idiom.
xs = np.mat(train_features)
ys = np.mat(train_targets).T
vx = np.mat(validation_features)
vy = np.mat(validation_targets).T
# xk_epoch = get_num_epochs(xk, 2000)
# xs = get_batch(xk_epoch, 8000)
# yk_epoch = get_num_epochs(yk, 2000)
# ys = get_batch(yk_epoch, 8000)
x_input = tf.placeholder(tf.float32, [None, 9])
y_input = tf.placeholder(tf.float32, [None, 1])
# Unlike the previous script, the first hidden layer uses tanh here; the
# remaining layers stay linear.
w1, b1, l1 = dnn.add_layer(x_input, 9, 10, activation_function=tf.nn.tanh)
w2, b2, l2 = dnn.add_layer(l1, 10, 10)
w3, b3, pred = dnn.add_layer(l2, 10, 1)
loss = dnn._loss(pred, y_input)
train = dnn.train_step(0.01, loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Full-batch training: every step feeds all 12000 training rows.
for i in range(2000):
    sess.run(train, feed_dict={x_input: xs, y_input: ys})
    if i % 50 == 0:
        # Report validation RMSE on the held-out 5000 rows.
        print(sess.run(loss, feed_dict={x_input: vx, y_input: vy}))
comment掉的地方是我用numpy来模仿dataset的写法,在batchsize过小的情况下依旧收敛到115左右。使用所有数据大概率收敛到60~70左右。将lr设置到0.1的情况下部分时候收敛到115。可能因为我们的数据集过小,导致每次取出的batchsize不能代表整个数据集,参数改了很多次包括去掉l2正则化(在batchsize等于整个数据集的情况下,我没添加l2正则化)。这里给出一个loss运行结果。
梯度在rmse为115左右(验证集)并没有再收敛(本以为附近有局部最优解)。但是突然梯度开始下降,降到61左右。现在看起来还是感觉有点奇怪,如果有人能解释下并提出优化方案那就感激不尽了。原文后2个方案是尝试不同的train方法,这里不再提,只需在dnn结构中更改下tf.train的算法就行。包括只使用经度和纬度也是一样的。毕竟框架已经建立好了。