import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import math
from IPython import display
from matplotlib import cm, gridspec
from sklearn import metrics
from tensorflow. python. data import Dataset
# pandas rendering: cap displayed rows at 10 and format floats with one decimal.
pd.options.display.float_format = '{:.1f}'.format
pd.options.display.max_rows = 10

# Restrict TensorFlow logging to the ERROR verbosity level.
tf.logging.set_verbosity(tf.logging.ERROR)

print('section1 finished.')
section1 finished.
# Read the comma-separated California housing training set from its hosted URL.
california_housing_dataframe = pd.read_csv(
    "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)
print('section2 finished.')
section2 finished.
# Preview just the label column and the population column.
california_housing_dataframe[["median_house_value", 'population']]
median_house_value population 0 66900.0 1015.0 1 80100.0 1129.0 2 85700.0 333.0 3 73400.0 515.0 4 65500.0 624.0 ... ... ... 16995 111400.0 907.0 16996 79000.0 1194.0 16997 103600.0 1244.0 16998 85800.0 1298.0 16999 94600.0 806.0
17000 rows × 2 columns
# Reindex the frame by a random permutation of its own index.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index)
)

# Rescale the label column by a factor of 1/1000.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)

california_housing_dataframe
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value 3958 -114.3 34.2 15.0 5612.0 1283.0 1015.0 472.0 1.5 66.9 6515 -114.5 34.4 19.0 7650.0 1901.0 1129.0 463.0 1.8 80.1 11198 -114.6 33.7 17.0 720.0 174.0 333.0 117.0 1.7 85.7 3501 -114.6 33.6 14.0 1501.0 337.0 515.0 226.0 3.2 73.4 8838 -114.6 33.6 20.0 1454.0 326.0 624.0 262.0 1.9 65.5 ... ... ... ... ... ... ... ... ... ... 2805 -124.3 40.6 52.0 2217.0 394.0 907.0 369.0 2.4 111.4 11369 -124.3 40.7 36.0 2349.0 528.0 1194.0 465.0 2.5 79.0 11180 -124.3 41.8 17.0 2677.0 531.0 1244.0 456.0 3.0 103.6 1002 -124.3 41.8 19.0 2672.0 552.0 1298.0 478.0 2.0 85.8 15287 -124.3 40.5 52.0 1820.0 300.0 806.0 270.0 3.0 94.6
17000 rows × 9 columns
# Summary statistics for every column of the frame.
california_housing_dataframe.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value count 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 17000.0 mean -119.6 35.6 28.6 2643.7 539.4 1429.6 501.2 3.9 207.3 std 2.0 2.1 12.6 2179.9 421.5 1147.9 384.5 1.9 116.0 min -124.3 32.5 1.0 2.0 1.0 3.0 1.0 0.5 15.0 25% -121.8 33.9 18.0 1462.0 297.0 790.0 282.0 2.6 119.4 50% -118.5 34.2 29.0 2127.0 434.0 1167.0 409.0 3.5 180.4 75% -118.0 37.7 37.0 3151.2 648.2 1721.0 605.2 4.8 265.0 max -114.3 42.0 52.0 37937.0 6445.0 35682.0 6082.0 15.0 500.0
# Input feature: a one-column DataFrame (double brackets keep it a frame).
my_feature = california_housing_dataframe[["total_rooms"]]

# Matching numeric feature-column description handed to the estimator.
feature_column = [
    tf.feature_column.numeric_column('total_rooms'),
]
[_NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
# Label column to predict, followed by its summary statistics.
targets = california_housing_dataframe['median_house_value']
targets.describe()
count 17000.0
mean 207.3
std 116.0
min 15.0
25% 119.4
50% 180.4
75% 265.0
max 500.0
Name: median_house_value, dtype: float64
# Gradient-descent optimizer wrapped so gradients are clipped by norm at 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=0.0000001),
    5.0,
)

# Linear regression estimator over the single numeric feature column.
linear_regressor = tf.estimator.LinearRegressor(
    optimizer=my_optimizer,
    feature_columns=feature_column,
)

# Show the feature frame as a {column name: Series} mapping.
dict(my_feature)
{'total_rooms': 3958 5612.0
6515 7650.0
11198 720.0
3501 1501.0
8838 1454.0
...
2805 2217.0
11369 2349.0
11180 2677.0
1002 2672.0
15287 1820.0
Name: total_rooms, Length: 17000, dtype: float64}
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the next (features, labels) batch from pandas data.

    Args:
        features: pandas DataFrame holding the input features.
        targets: pandas DataFrame holding the values to predict.
        batch_size: int, size of the batches the data is split into.
        shuffle: bool, whether the data is drawn in random order.
        num_epochs: int, number of times the data is repeated; None repeats
            indefinitely.

    Returns:
        Tuple of (features, labels) used for the next data batch.
    """
    # Convert every column into a NumPy array keyed by its column name.
    feature_arrays = {}
    for column_name, column_values in dict(features).items():
        feature_arrays[column_name] = np.array(column_values)

    # Slice into examples, group them into batches, then repeat.
    pipeline = Dataset.from_tensor_slices((feature_arrays, targets))
    pipeline = pipeline.batch(batch_size)
    pipeline = pipeline.repeat(num_epochs)

    # NOTE: the shuffle is applied after batching, so it operates on the
    # already-batched elements with a buffer of 1000.
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size=1000)

    iterator = pipeline.make_one_shot_iterator()
    next_features, next_labels = iterator.get_next()
    return next_features, next_labels
# Train for 100 steps using the input function's default batching/shuffling.
_ = linear_regressor.train(
    input_fn=lambda: my_input_fn(my_feature, targets),
    steps=100,
)

# Single unshuffled pass over the same data for prediction.
prediction_input_fn = lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)
predictions = np.array([
    item['predictions'][0]
    for item in linear_regressor.predict(input_fn=prediction_input_fn)
])

# Error of the predictions against the training labels.
mean_squared_error = metrics.mean_squared_error(predictions, targets)
root_mean_squared_error = math.sqrt(mean_squared_error)

print("Mean Squared Error (on training data): %0.3f" % mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)
Mean Squared Error (on training data): 27664.247
Root Mean Squared Error (on training data): 166.326
# Range of the label column, printed next to the RMSE for comparison.
max_house_value = california_housing_dataframe["median_house_value"].max()
min_house_value = california_housing_dataframe["median_house_value"].min()
min_max_difference = max_house_value - min_house_value

print("Min median house value is %0.3f" % min_house_value)
print("Max median house value is %0.3f" % max_house_value)
print("Difference between min and max house value is %0.3f" % min_max_difference)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)
Min median house value is 14.999
Max median house value is 500.001
Difference between min and max house value is 485.002
Root Mean Squared Error (on training data): 166.326
# Pair every prediction with the target of the same example.
# Assigning pandas Series as columns aligns them on index labels; a Series
# built from `predictions` is labelled 0..n-1, whereas `targets` keeps its own
# row labels, so label alignment can pair a prediction with another row's
# target. Raw arrays are matched purely by position instead.
calibration_data = pd.DataFrame({
    'predictions': np.asarray(predictions),
    'targets': np.asarray(targets),
})
calibration_data.describe()
predictions targets count 17000.0 17000.0 mean 129.1 207.3 std 106.5 116.0 min 0.1 15.0 25% 71.4 119.4 50% 103.9 180.4 75% 153.9 265.0 max 1852.7 500.0
# Draw 300 random rows to keep the scatter plot readable.
sample = california_housing_dataframe.sample(n=300)

# The fitted line is drawn between the smallest and largest sampled x value.
x_0, x_1 = sample['total_rooms'].min(), sample['total_rooms'].max()

# Learned parameters read back from the trained estimator.
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]

# Model output at both ends of the line.
y_0, y_1 = weight * x_0 + bias, weight * x_1 + bias

plt.plot([x_0, x_1], [y_0, y_1], c='r')
plt.xlabel('total_rooms')
plt.ylabel('median_house_value')
plt.scatter(sample['total_rooms'], sample['median_house_value'])
plt.show()
from mpl_toolkits. mplot3d import Axes3D
def train_model(learning_rate, steps, batch_size,
                input_feature=("population", "total_rooms")):
    """Train a linear regressor in periods and plot its progress.

    Training is split into 5 periods. After each period the RMSE on the
    training data is printed and the model's predictions for a 300-row sample
    are added to a 3D scatter plot (first two features on x/y, label on z).
    A second subplot shows RMSE per period, and summary statistics of the
    final predictions versus the targets are printed at the end.

    Args:
        learning_rate: float, learning rate of the gradient-descent optimizer.
        steps: int, total number of training steps, divided evenly across
            the periods.
        batch_size: int, batch size used for training.
        input_feature: sequence of at least two column names taken from
            california_housing_dataframe; the first two are used as the
            plot's x and y axes.

    Raises:
        ValueError: if input_feature is a single string or names fewer than
            two columns.
    """
    if isinstance(input_feature, str) or len(input_feature) < 2:
        raise ValueError(
            "input_feature must be a sequence of at least two column names")

    periods = 5
    # Floor division keeps the per-period step count an integer; never go
    # below one step so very small `steps` values still train.
    steps_per_period = max(1, int(steps) // periods)

    # Copy into a list: DataFrame column selection expects a list of names,
    # and the caller's sequence (or the default) is never shared or mutated.
    my_feature = list(input_feature)
    my_feature_data = california_housing_dataframe[my_feature]
    my_label = 'median_house_value'
    targets = california_housing_dataframe[[my_label]]

    feature_columns = [tf.feature_column.numeric_column(k) for k in my_feature]

    train_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size)
    prediction_input_fn = lambda: my_input_fn(
        my_feature_data, targets, num_epochs=1, shuffle=False)

    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer)

    # Left subplot: 3D scatter of a data sample plus per-period predictions.
    plt.figure(figsize=(15, 6))
    ax = plt.subplot(1, 2, 1, projection='3d')
    plt.title('Learned Line by Period')
    ax.set_zlabel(my_label)
    plt.ylabel(my_feature[1])
    plt.xlabel(my_feature[0])
    n = 300
    sample = california_housing_dataframe.sample(n)
    sample_x = sample[my_feature[0]].values
    sample_y = sample[my_feature[1]].values
    ax.scatter(sample_x, sample_y, sample[my_label])
    # Colormaps take floats in [0, 1]; spreading the periods over that range
    # gives every period its own colour.
    colors = [cm.coolwarm(x) for x in np.linspace(0, 1, periods)]

    print("Train model...")
    print('RMSE (on training data):')
    root_mean_squared_errors = []
    for period in range(periods):
        linear_regressor.train(
            input_fn=train_input_fn,
            steps=steps_per_period)

        predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])

        root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(targets, predictions))
        print("period %02d : %0.2f" % (period, root_mean_squared_error))
        root_mean_squared_errors.append(root_mean_squared_error)

        # Evaluate the current linear model on the whole sample at once,
        # using every feature's weight, and plot it with a single call.
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
        sample_predictions = bias + sum(
            linear_regressor.get_variable_value(
                'linear/linear_model/%s/weights' % name)[0] * sample[name].values
            for name in my_feature)
        ax.scatter(sample_x, sample_y, sample_predictions, color=colors[period])
    print('Model training finished.')

    # Right subplot: RMSE after each period.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title('Root Mean Squared Error vs. Periods')
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Build from raw arrays so predictions and targets are matched by
    # position instead of being aligned on differing index labels.
    calibration_data = pd.DataFrame({
        'predictions': predictions,
        'targets': targets[my_label].values,
    })
    # Print the statistics; a bare expression inside a function shows nothing.
    print(calibration_data.describe())

    print('Final RMSE (on training data): %0.2f' % root_mean_squared_error)
# Run the periodic training with the default pair of input features.
train_model(learning_rate=0.0001, steps=100, batch_size=1)
Train model...
RMSE (on training data):
period 00 : 214.40
period 01 : 198.69
period 02 : 183.96
period 03 : 176.80
period 04 : 173.32
Model training finished.
Final RMSE (on training data): 173.32
# 3D scatter of a 300-row sample: two features on x/y, the label on z.
sample = california_housing_dataframe.sample(300)
my_feature = ['total_rooms', 'population']
my_label = 'median_house_value'
fig = plt.figure(figsize=(15, 6))
# Create the 3D axes through the figure so they are registered with it;
# constructing Axes3D(fig) directly does not attach the axes on recent
# matplotlib releases, which leaves the figure empty. This also matches the
# projection='3d' usage in train_model.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(sample[my_feature[0]], sample[my_feature[1]], sample[my_label])
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1c263d734e0>
"""
总结:
使用tensorflow进行线性回归的步骤:
1.加载数据集,分析数据主要特征,最好能够分析特征之间的相关性
(violin图查看各个变量的分布,表格构建相关性)
相关效果见:https://blog.csdn.net/u010099080/article/details/72824899?ref=myread
计算方法见:https://blog.csdn.net/weixin_37272286/article/details/80079673
2.数据随机排序,防止病态排序结果
3.定义特征和标签
4.配置LinearRegressor (包括optimizer,梯度裁剪上界,学习率)
位于tf.estimator.LinearRegressor
5.定义输入函数 (构建一个迭代器,将数据拆分成多批数据,按指定周期向LinearRegressor输入训练数据)
输入参数包括特征、标签、批尺寸、随机性、循环周期
返回特征、标签
6.训练LR模型并查看训练效果
训练效果主要通过均方根误差(RMSE)和min_max_difference的差距体现
获取weight和bias的位置:linear/linear_model/%s/weights %my_features
linear/linear_model/bias_weights
7.调整模型超参(在此时是学习率learning_rate),寻求更好的拟合和泛化效果
"""