使用tensorflow estimator训练一个预测房价的线性回归模型
学习谷歌的机器学习快速入门课程时,有一个作业是利用线性回归预测房价。数据集有8个特征,这里仅使用其中一个(total_rooms)作为练习,因此不要指望仅靠单个特征就能训练出一个好的模型。
数据集的链接
初次尝试
代码:
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
# Quieten TensorFlow's native logging via its environment variable
# (presumably '2' hides INFO and WARNING output -- confirm against the TF docs).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Python-side TensorFlow logging threshold.
# Available levels: DEBUG, INFO, WARN, ERROR, FATAL.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep printed DataFrames compact: display limits of 10 rows and 9 columns,
# with floats rendered to one decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 9)
pd.set_option('display.float_format', '{:.1f}'.format)
# Load the dataset from a local copy of the CSV file. The same file can also
# be read directly from
# "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv".
california_housing_dataframe = pd.read_csv("california_housing_train.csv", sep=',')

# Randomise the row order.
row_order = np.random.permutation(california_housing_dataframe.index)
california_housing_dataframe = california_housing_dataframe.reindex(row_order)

# Rescale median_house_value so that it is expressed in thousands.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)

# Optional sanity checks (uncomment to inspect the data and its statistics):
# print(california_housing_dataframe.head())
# print(california_housing_dataframe.describe())
# Build the model.
# 1. Define the feature data and the feature columns.
# A list selector keeps the result a one-column DataFrame; selecting with a
# bare column name would return a Series instead.
my_feature = california_housing_dataframe.loc[:, ['total_rooms']]
# Declare 'total_rooms' as a numeric feature column.
# TODO (carried over from the original author; the intended follow-up is not stated).
feature_columns = [tf.feature_column.numeric_column(key='total_rooms')]

# 2. Define the target (label) column.
targets = california_housing_dataframe.loc[:, 'median_house_value']
# 3. Configure the linear regressor.
# Gradient descent with a learning rate of 1e-7, wrapped so that gradients
# are clipped by norm with a limit of 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=1e-7),
    5.0,
)
linear_regressor = tf.estimator.LinearRegressor(
    optimizer=my_optimizer,
    feature_columns=feature_columns,
)
# 4.定义输入函数
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a tf.data pipeline and return the next (features, labels) batch.

    :param features: mapping-like object of feature name -> values (for
        example a pandas DataFrame); each value is converted to a numpy array
    :param targets: labels, sliced in step with the features
    :param batch_size: number of examples per batch
    :param shuffle: whether to shuffle the examples
    :param num_epochs: passed to ``Dataset.repeat``; per the tf.data
        documentation ``None`` repeats indefinitely
    :return: tuple ``(features, labels)`` of tensors for the next batch
    """
    # Convert every feature column to a numpy array keyed by feature name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    ds = Dataset.from_tensor_slices((features, targets))

    # Shuffle individual examples *before* batching and repeating. Shuffling
    # after batch()/repeat() would only reorder whole batches and would mix
    # batches from different epochs.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    ds = ds.batch(batch_size).repeat(num_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
# 5. Train for 100 steps, using the input function's defaults
# (batch_size=1, shuffle=True, num_epochs=None).
_ = linear_regressor.train(
    input_fn=lambda: my_input_fn(my_feature, targets),
    steps=100,
)


# 6. Run the trained model over the training data.
def prediction_input_fn():
    """Feed the training examples once (num_epochs=1) without shuffling."""
    return my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)


predictions = linear_regressor.predict(input_fn=prediction_input_fn)
# Each yielded item is indexed by 'predictions'; keep its first element.
predictions = np.array([result['predictions'][0] for result in predictions])
# print(predictions)
# 7. Measure the error on the training set.
# Range of the target column, used to put the error into perspective.
house_values = california_housing_dataframe['median_house_value']
min_house_value = house_values.min()
max_house_value = house_values.max()
max_min_difference = max_house_value - min_house_value

# Mean squared error between targets and predictions, and its square root.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)

# print('Mean squared error(on train set): %.3f' % mean_squared_error)
print('Root mean squared error(on train set): %.3f' % root_mean_squared_error)
print('Max. median house value(on train set): %.3f' % max_house_value)
print('Min. median house value(on train set): %.3f' % min_house_value)
print('Difference between Min. and Max.(on train set): %.3f' % max_min_difference)
# Recorded output of one run:
# Root mean squared error(on train set): 237.417
# Max. median house value(on train set): 500.001
# Min. median house value(on train set): 14.999
# Difference between Min. and Max.(on train set): 485.002
# The error is large relative to the range of the target values, so the
# predictions are compared against the targets next (calibration data).
# Put predictions and targets side by side to compare their distributions.
# `predictions` is positional (element i belongs to the i-th row of the
# shuffled data), whereas `targets` still carries the shuffled DataFrame
# index. Assigning the `targets` Series directly would align on index labels
# and pair each prediction with a different row's target, so the target
# values are taken by position instead.
calibration_data = pd.DataFrame()
calibration_data['prediction'] = pd.Series(predictions)
calibration_data['targets'] = pd.Series(np.asarray(targets))
print(calibration_data.describe())
# Recorded output of one run:
#        prediction  targets
# count     17000.0  17000.0
# mean          0.1    207.3
# std           0.1    116.0
# min           0.0     15.0
# 25%           0.1    119.4
# 50%           0.1    180.4
# 75%           0.2    265.0
# max           1.9    500.0
# 可视化
sample = california_housing_dataframe.sample(n=