Tensorflow线性回归官方示例+中文注释

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import math
from IPython import display
from matplotlib import cm, gridspec
from sklearn import metrics
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
# TensorFlow has five logging levels; in ascending severity they are
# DEBUG, INFO, WARN, ERROR and FATAL.  Setting the verbosity to one level
# makes TensorFlow emit messages at that level and every more severe one,
# i.e. with ERROR only ERROR and FATAL messages are printed.

pd.options.display.max_rows = 10
# Show at most 10 rows when a DataFrame is displayed in a notebook cell.
pd.options.display.float_format = '{:.1f}'.format
# Display pandas floats with one decimal place.
print('section1 finished.')
section1 finished.
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
# Download the California housing CSV and read it with pandas.read_csv,
# producing a DataFrame.
print('section2 finished.')
section2 finished.
# Peek at two columns (notebook display only; the result is not stored).
california_housing_dataframe[["median_house_value",'population']]
median_house_valuepopulation
066900.01015.0
180100.01129.0
285700.0333.0
373400.0515.0
465500.0624.0
.........
16995111400.0907.0
1699679000.01194.0
16997103600.01244.0
1699885800.01298.0
1699994600.0806.0

17000 rows × 2 columns

california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))
# Shuffle the rows so training never sees a pathological ordering.
# Two functions are involved:
# 1. np.random.permutation: given an array, returns a random permutation.
# 2. DataFrame.reindex: given an array, reorders the rows to that index.
california_housing_dataframe["median_house_value"] /= 1000.0
# Rescale the label to thousands of dollars.  DataFrame columns are
# accessed dict-style, by name.
california_housing_dataframe

longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
3958-114.334.215.05612.01283.01015.0472.01.566.9
6515-114.534.419.07650.01901.01129.0463.01.880.1
11198-114.633.717.0720.0174.0333.0117.01.785.7
3501-114.633.614.01501.0337.0515.0226.03.273.4
8838-114.633.620.01454.0326.0624.0262.01.965.5
..............................
2805-124.340.652.02217.0394.0907.0369.02.4111.4
11369-124.340.736.02349.0528.01194.0465.02.579.0
11180-124.341.817.02677.0531.01244.0456.03.0103.6
1002-124.341.819.02672.0552.01298.0478.02.085.8
15287-124.340.552.01820.0300.0806.0270.03.094.6

17000 rows × 9 columns

california_housing_dataframe.describe()
# DataFrame.describe() reports per-column count, mean, standard deviation,
# minimum, quartiles and maximum.
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
count17000.017000.017000.017000.017000.017000.017000.017000.017000.0
mean-119.635.628.62643.7539.41429.6501.23.9207.3
std2.02.112.62179.9421.51147.9384.51.9116.0
min-124.332.51.02.01.03.01.00.515.0
25%-121.833.918.01462.0297.0790.0282.02.6119.4
50%-118.534.229.02127.0434.01167.0409.03.5180.4
75%-118.037.737.03151.2648.21721.0605.24.8265.0
max-114.342.052.037937.06445.035682.06082.015.0500.0
# The prediction target (label) is median_house_value and the input
# feature is total_rooms; the model is TensorFlow Estimator's
# LinearRegressor.  Features commonly come in two kinds: categorical
# (text) data and numeric data (like total_rooms here).  TensorFlow
# describes a feature's data type with a "feature column" structure;
# numeric_column marks a feature as numeric.

my_feature = california_housing_dataframe[["total_rooms"]]
# To use several features, replace "total_rooms" with a list such as
# ["feature1", "feature2", ...].
feature_column = [tf.feature_column.numeric_column('total_rooms')]
# feature_column is only a numeric_column descriptor; no data is bound yet.
# my_feature
# feature_column
[_NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
# Define the target (label): median_house_value.
targets = california_housing_dataframe['median_house_value']
targets.describe()
count   17000.0
mean      207.3
std       116.0
min        15.0
25%       119.4
50%       180.4
75%       265.0
max       500.0
Name: median_house_value, dtype: float64
# Configure the LinearRegressor.
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001)
# Use (stochastic) gradient descent; learning_rate controls the step size.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# For safety, wrap the optimizer with clip_gradients_by_norm.  Gradient
# clipping keeps gradient magnitudes bounded during training; unbounded
# gradients would make gradient descent fail.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_column,
    optimizer=my_optimizer
)
dict(my_feature)  # notebook inspection: maps column name -> pandas Series
{'total_rooms': 3958    5612.0
 6515    7650.0
 11198    720.0
 3501    1501.0
 8838    1454.0
          ...  
 2805    2217.0
 11369   2349.0
 11180   2677.0
 1002    2672.0
 15287   1820.0
 Name: total_rooms, Length: 17000, dtype: float64}
# 定义输入函数,告诉tensorflow如何预处理数据,以及如何在训练期间批处理、随机处理和重复数据
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Feed features and targets to the estimator as a batched tf Dataset.

    Args:
        features: pandas DataFrame of input features.
        targets: pandas DataFrame/Series of labels to predict.
        batch_size: int, number of examples per batch.
        shuffle: bool, whether to draw examples in random order.
        num_epochs: int or None; None repeats the data indefinitely.

    Returns:
        A (features, labels) tuple for the next batch of data.
    """
    # Convert the DataFrame into a dict of numpy arrays keyed by column
    # name (here the only key is total_rooms).
    feature_arrays = {name: np.array(column)
                      for name, column in dict(features).items()}

    # Build the dataset, then apply batching and (optional) repetition.
    ds = Dataset.from_tensor_slices((feature_arrays, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)

    if shuffle:
        # Maintain a 1000-element shuffle buffer: each draw is sampled
        # from the buffer and the consumed slot is refilled from the stream.
        ds = ds.shuffle(buffer_size=1000)

    # Hand back the tensors for the next batch.
    next_features, next_labels = ds.make_one_shot_iterator().get_next()
    return next_features, next_labels


# Train the model with linear_regressor.  my_input_fn is wrapped in a
# lambda so arguments (features, targets) can be supplied at call time.

_ = linear_regressor.train(
    input_fn = lambda:my_input_fn(my_feature, targets),
    steps = 100
)
# Train for 100 steps.
# Next: predict on the training data to see how well the model fits it.

prediction_input_fn =lambda: my_input_fn(my_feature, targets, num_epochs=1,shuffle=False)
# Input function for prediction: a single ordered pass over the data.

predictions = linear_regressor.predict(input_fn=prediction_input_fn)
# Run the trained regressor over the training set.

predictions = np.array([item['predictions'][0] for item in predictions])
# Collect the predictions into a numpy array so errors are easy to compute.

mean_squared_error = metrics.mean_squared_error(predictions, targets)
# Mean squared error (MSE) on the training data.

root_mean_squared_error = math.sqrt(mean_squared_error)
# Root mean squared error (RMSE), in the same units as the label.

print("Mean Squared Error (on training data): %0.3f"%mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f"%root_mean_squared_error)

Mean Squared Error (on training data): 27664.247
Root Mean Squared Error (on training data): 166.326
# Judge the RMSE by comparing it against the spread of the label values.
min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value
print("Min median house value is %0.3f"%min_house_value)
print("Max median house value is %0.3f"%max_house_value)
print("Difference between min and max house value is %0.3f"%min_max_difference)
print("Root Mean Squared Error (on training data): %0.3f"%root_mean_squared_error)

Min median house value is 14.999
Max median house value is 500.001
Difference between min and max house value is 485.002
Root Mean Squared Error (on training data): 166.326
# Put predictions and targets side by side to compare their summary stats.
calibration_data = pd.DataFrame()
calibration_data['predictions'] = pd.Series(predictions)
calibration_data['targets'] = pd.Series(targets)
calibration_data.describe()
predictionstargets
count17000.017000.0
mean129.1207.3
std106.5116.0
min0.115.0
25%71.4119.4
50%103.9180.4
75%153.9265.0
max1852.7500.0
# Plot the learned regression line over a 300-example random sample.
sample = california_housing_dataframe.sample(n=300)
x_0 = sample['total_rooms'].min()
x_1 = sample['total_rooms'].max()

# Read the learned weight and bias out of the estimator's variables.
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

# Evaluate y = weight*x + bias at the two endpoints of the line.
y_0 = weight*x_0+bias
y_1 = weight*x_1+bias

plt.plot([x_0,x_1],[y_0,y_1],c='r')
plt.ylabel('median_house_value')
plt.xlabel('total_rooms')

plt.scatter(sample['total_rooms'],sample['median_house_value'])
plt.show()

在这里插入图片描述

from mpl_toolkits.mplot3d import Axes3D
def train_model(learning_rate, steps, batch_size, input_feature=None):
    """Train a LinearRegressor on two features and plot training progress.

    Args:
        learning_rate: float, SGD learning rate.
        steps: int, total number of training steps across all periods.
        batch_size: int, examples per batch.
        input_feature: list of two column names from
            california_housing_dataframe to use as input features.
            Defaults to ["population", "total_rooms"].
    """
    # Avoid a mutable default argument; behavior is unchanged for callers.
    if input_feature is None:
        input_feature = ["population", "total_rooms"]

    # Split training into equal periods so RMSE can be reported as it falls.
    periods = 5
    # steps / periods would be a float under Python 3, but
    # estimator.train(steps=...) expects an integer, so floor-divide.
    steps_per_period = steps // periods

    my_feature = input_feature
    my_feature_data = california_housing_dataframe[my_feature]
    my_label = 'median_house_value'
    targets = california_housing_dataframe[[my_label]]

    # One numeric feature column per input feature.
    feature_columns = [tf.feature_column.numeric_column(k) for k in my_feature]

    # Input functions: shuffled batches for training, one ordered pass
    # over the data for prediction.
    train_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size)
    prediction_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)

    # SGD with gradient clipping so the gradient norm stays bounded.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer)

    # Left panel: 3-D scatter of a sample plus the model's predictions
    # after each period, colour-coded by period.
    plt.figure(figsize=(15, 6))
    ax = plt.subplot(1, 2, 1, projection='3d')
    plt.title('Learned Line by Period')
    ax.set_zlabel(my_label)
    plt.ylabel(my_feature[1])  # e.g. total_rooms
    plt.xlabel(my_feature[0])  # e.g. population
    n = 300  # number of sample points to plot
    sample = california_housing_dataframe.sample(n)
    ax.scatter(sample[my_feature[0]], sample[my_feature[1]], sample[my_label])
    colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

    print("Train model...")
    print('RMSE (on training data):')
    root_mean_squared_errors = []
    for period in range(periods):
        # Train for one period, then measure RMSE on the training data.
        linear_regressor.train(
            input_fn=train_input_fn,
            steps=steps_per_period)
        predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])

        root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(predictions, targets))
        print("period %02d : %0.2f" % (period, root_mean_squared_error))
        root_mean_squared_errors.append(root_mean_squared_error)

        # Read the learned parameters: y = w0*x0 + w1*x1 + b.
        weight0 = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature[0])[0]
        weight1 = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature[1])[0]
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

        # Plot this period's predicted values at the sampled points.
        for i in range(n):
            y_extents = weight0 * sample[my_feature[0]].values[i] + weight1 * sample[my_feature[1]].values[i] + bias
            ax.scatter(sample[my_feature[0]].values[i], sample[my_feature[1]].values[i], y_extents, color=colors[period])
    print('Model training finished.')

    # Right panel: RMSE per period.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title('Root Mean Squared Error vs. Periods')
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Side-by-side summary of predictions vs. targets.
    calibration_data = pd.DataFrame()
    calibration_data['predictions'] = pd.Series(predictions)
    # targets is a one-column DataFrame here; take the column so pd.Series
    # receives 1-D data (pd.Series(DataFrame) raises in modern pandas).
    calibration_data['targets'] = pd.Series(targets[my_label])
    calibration_data.describe()

    print('Final RMSE (on training data): %0.2f' % root_mean_squared_error)
# Train for 100 total steps with a small learning rate, one example/batch.
train_model(learning_rate=0.0001,
            steps=100,
            batch_size=1
            )

Train model...
RMSE (on training data):
period 00 : 214.40
period 01 : 198.69
period 02 : 183.96
period 03 : 176.80
period 04 : 173.32
Model training finished.
Final RMSE (on training data): 173.32

在这里插入图片描述

# 3-D scatter of a fresh 300-row sample: total_rooms and population
# against the label median_house_value.
sample = california_housing_dataframe.sample(300)
my_feature = ['total_rooms','population']
my_label = 'median_house_value'
fig = plt.figure(figsize=(15,6))
ax = Axes3D(fig)
# sample[my_label].values
ax.scatter(sample[my_feature[0]],sample[my_feature[1]],sample[my_label])
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1c263d734e0>

在这里插入图片描述

"""
总结:
使用tensorflow进行线性回归的步骤:
1.加载数据集,分析数据主要特征,最好能够分析特征之间的相关性
    (violin图查看各个变量的分布,表格构建相关性)
    相关效果见:https://blog.csdn.net/u010099080/article/details/72824899?ref=myread
    计算方法见:https://blog.csdn.net/weixin_37272286/article/details/80079673
2.数据随机排序,防止病态排序结果
3.定义特征和标签
4.配置LinearRegressor (包括optimizer,梯度裁剪上界,学习率)
    位于tf.estimator.LinearRegressor
5.定义输入函数 (构建一个迭代器,将数据拆分成多批数据,按指定周期向LinearRegressor输入训练数据)
    输入参数包括特征、标签、批尺寸、随机性、循环周期
    返回特征、标签
6.训练LR模型并查看训练效果
    训练效果主要通过均方根误差(RMSE)和max_min_difference的差距体现
    获取weight和bias的位置:linear/linear_model/%s/weights %my_features
                            linear/linear_model/bias_weights
7.调整模型超参(在此时是学习率learning_rate),寻求更好的拟合和泛化效果
"""
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
TensorFlow是一个开源的机器学习框架,用于构建和训练各种机器学习模型。线性回归是一种常见的机器学习算法,用于建立一个线性模型来预测连续型的输出变量。 在TensorFlow中,可以使用以下步骤来实现线性回归: 1. 导入所需的库和模块: ```python import tensorflow as tf import numpy as np ``` 2. 准备数据: ```python # 定义输入特征和标签 x_train = np.array([...]) # 输入特征 y_train = np.array([...]) # 标签 ``` 3. 定义模型结构: ```python # 定义模型参数 W = tf.Variable(tf.random.normal([num_features, 1]), name='weight') b = tf.Variable(tf.zeros(1), name='bias') # 定义线性回归模型 def linear_regression(x): return tf.matmul(x, W) + b ``` 4. 定义损失函数: ```python # 定义均方误差损失函数 def mean_square(y_pred, y_true): return tf.reduce_mean(tf.square(y_pred - y_true)) ``` 5. 定义优化器: ```python # 定义梯度下降优化器 optimizer = tf.optimizers.SGD(learning_rate) ``` 6. 训练模型: ```python # 定义训练函数 def train_step(x, y): with tf.GradientTape() as tape: y_pred = linear_regression(x) loss = mean_square(y_pred, y) gradients = tape.gradient(loss, [W, b]) optimizer.apply_gradients(zip(gradients, [W, b])) # 迭代训练 for epoch in range(num_epochs): train_step(x_train, y_train) ``` 7. 使用模型进行预测: ```python # 使用训练好的模型进行预测 y_pred = linear_regression(x_test) ``` 这是一个简单的TensorFlow线性回归的实现示例。你可以根据自己的需求和数据进行相应的调整和扩展。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值