首先,良好的特征集是必要的前提,我们这里可以看相关系数。
# Load the housing data and print the Pearson correlation matrix of all
# column pairs (r = 1.0 perfect positive, -1.0 perfect negative, 0.0 none).
import pandas as pd

pd.set_option('display.max_columns', None)
housing = pd.read_csv('california_housing_train.csv')
print(housing.corr())
这里的 r 系数如果为 1.0 代表完全正相关,-1.0 代表完全负相关,0.0 代表不相关。
这里的r系数称为 皮尔逊相关系数 大家可以自行学习一下。
如果只用1、2个特征我们能得到最好的模型是什么?这里书中给了一个2个特征的例子。
用的median_income和latitude作为特征。这里给出代码(未作分箱)。
# Load the California-housing data, add a derived feature, rescale the
# target, and split rows into training (first 12000) / validation (last 5000).
import numpy as np
import pandas as pd
from tensorflow.data import Dataset
import tensorflow as tf

df = pd.read_csv('california_housing_train.csv')
# Randomize row order so the head/tail split below is a random split.
# BUG FIX: the original called df.sort_index() right after this, which
# restored the original order and made the permutation a no-op.
df = df.reindex(np.random.permutation(df.index))
df['rooms_per_person'] = df['total_rooms'] / df['population']
# Express house values in units of $1000 to keep the target small.
df['median_house_value'] /= 1000
training_examples = df.head(12000).astype('float32')
training_targets = df['median_house_value'].head(12000).astype('float32')
validation_examples = df.tail(5000).astype('float32')
validation_targets = df['median_house_value'].tail(5000).astype('float32')
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a (features, labels) tensor pair that streams batches of data.

    Args:
        features: DataFrame (or dict-like) of feature columns.
        targets: target values aligned with `features`.
        batch_size: rows per batch.
        shuffle: whether to shuffle the stream (note: applied AFTER
            batching here, so whole batches are shuffled, not rows).
        num_epochs: passes over the data; None repeats indefinitely.

    Returns:
        (feature-tensor dict, label tensor) from a one-shot iterator.
    """
    # Turn each column into a NumPy array keyed by its column name.
    feature_arrays = {name: np.array(col) for name, col in dict(features).items()}
    dataset = Dataset.from_tensor_slices((feature_arrays, targets))
    dataset = dataset.batch(batch_size).repeat(num_epochs)
    if shuffle:
        dataset = dataset.shuffle(10000)
    return dataset.make_one_shot_iterator().get_next()
def add_layer(inputs, input_size, output_size, activation_function=None):
    """Create one fully-connected layer.

    Args:
        inputs: 2-D input tensor of shape (batch, input_size).
        input_size: number of input units.
        output_size: number of output units.
        activation_function: optional elementwise activation; identity
            when None.

    Returns:
        (weights, biases, outputs) — the layer's variables and output tensor.
    """
    weights = tf.Variable(tf.random_normal([input_size, output_size]))
    # Small positive bias init.
    biases = tf.Variable(tf.zeros(output_size) + 0.1)
    pre_activation = tf.matmul(inputs, weights) + biases
    if activation_function is None:
        outputs = pre_activation
    else:
        outputs = activation_function(pre_activation)
    return weights, biases, outputs
def loss(pred, ys):
    """Return the root-mean-squared error between `pred` and `ys`."""
    squared_error = tf.square(pred - ys)
    return tf.sqrt(tf.reduce_mean(squared_error))
def train(learning_rate, loss):
    """Return an Adam training op that minimizes `loss`."""
    # Local renamed from `train` so it no longer shadows this function.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)
# Build the training graph: two input features (median_income, latitude),
# one tanh hidden layer of 10 units, one linear output unit.
xs, ys = my_input_fn(training_examples, training_targets, batch_size=200, shuffle=False)
xs1 = tf.expand_dims(xs['median_income'], -1)
xs2 = tf.expand_dims(xs['latitude'], -1)
xs_input = tf.concat([xs1, xs2], 1)
w1, b1, l1 = add_layer(xs_input, 2, 10, activation_function=tf.nn.tanh)
w2, b2, l2 = add_layer(l1, 10, 1)
_loss = loss(l2, ys)
_train = train(0.01, _loss)

# Build the validation graph ONCE, outside the loop.
# BUG FIX: the original created fresh tf.matmul/tf.nn.tanh/tf.sqrt nodes on
# every 50th iteration, growing the TF1 graph (and slowing session.run)
# without bound for the lifetime of the process.
_inputs1 = validation_examples['median_income'][:, np.newaxis]
_inputs2 = validation_examples['latitude'][:, np.newaxis]
_inputs = np.concatenate([_inputs1, _inputs2], axis=1)
validation_y = validation_targets[:, np.newaxis]
_l1 = tf.nn.tanh(tf.matmul(_inputs, w1) + b1)
_pred = tf.matmul(_l1, w2) + b2
v_loss = tf.sqrt(tf.reduce_mean(tf.square(_pred - validation_y)))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(5000):
    sess.run(_train)
    if i % 50 == 0:
        # Report validation RMSE every 50 steps.
        print(sess.run(v_loss))
当然我们也可以选择作分箱。这个只需要用df.apply()来设计一个lambda函数就可完成。
# One-hot "bin" the latitude column into one-degree buckets [32,33) … [43,44).
import pandas as pd

pd.set_option('display.max_columns', None)
df = pd.read_csv('california_housing_train.csv')

# Pairs (32,33), (33,34), ..., (43,44).
# FIX: the original built this zip twice — LATITUDE_RANGES was created and
# never used, then an identical `_zip` was built for the loop. Build it once.
LATITUDE_RANGES = zip(range(32, 44), range(33, 45))

training_examples = pd.DataFrame()
training_examples['median_income'] = df['median_income']
for low, high in LATITUDE_RANGES:
    # 1.0 when the row's latitude falls in [low, high), else 0.0.
    training_examples['latitude_%d_to_%d' % (low, high)] = df['latitude'].apply(
        lambda x: 1.0 if low <= x < high else 0.0)
print(training_examples)
这里给出书中的例子,我们先zip一个元组。这里 training_examples要新建一个dataframe或者 df.copy() 不要直接training_examples= df,否则会出现这种情况。
# WARNING: intentionally WRONG example. It demonstrates the aliasing pitfall:
# `training_examples = df` binds a second name to the SAME DataFrame object
# (no copy is made), so every column added below lands on df as well.
import pandas as pd
pd.set_option('display.max_columns', None)
df = pd.read_csv('california_housing_train.csv')
LATITUDE_RANGES = zip(range(32, 44), range(33, 45))
# This is the mistake: plain assignment aliases df instead of copying it.
training_examples = df
_zip = zip(range(32, 44), range(33, 45))
for r in _zip:
training_examples['latitude_%d_to_%d' % r] = df['latitude'].apply(lambda x: 1.0 if
x >= r[0] and x <r[1] else 0.0)
# Both prints show the new latitude_* columns — df was mutated too.
# Use pd.DataFrame() or df.copy() instead of direct assignment to avoid this.
print(training_examples.columns)
print(df.columns)
我们在给 training_examples 创建 columns 的同时,也给 df 创建了同样的列——因为 training_examples = df 只是给同一个 DataFrame 对象起了一个别名,并没有复制数据。请避免这种情况发生。
分箱后的latitude我们也可以试着放进神经网络看看loss如何,这里不再赘述,请自行尝试一下。