机器学习的训练数据(Training Dataset)、测试数据(Testing Dataset)和验证数据(Validation Dataset)

三者的意义
- 训练数据:用来训练模型、拟合模型参数的数据
- 验证数据:用来在训练过程中评估模型表现,进行超参数调优和模型选择
- 测试数据:在模型选择完成后,对最终模型做一次独立评估,确认其泛化能力

一般步骤:先用训练数据训练模型,再用验证数据调整超参数、选择模型,最后用测试数据对最终模型做一次独立评估。

注意:测试数据集和验证数据集的数据一定不能用来训练,否则评估结果会过于乐观,掩盖过拟合现象

代码:

import math
import os

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.data import Dataset

# Verbose TF logging so Estimator training progress is printed to the console.
tf.logging.set_verbosity(tf.logging.INFO)
# Truncate pandas DataFrame display to 10 rows (relevant in notebook output).
pd.options.display.max_rows = 10

# Load the California-housing training data, caching a local copy on first
# download so subsequent runs work offline.
if os.path.exists('data.csv'):
    california_housing_dataframe = pd.read_csv('data.csv', sep=',')
else:
    california_housing_dataframe = pd.read_csv(
        "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
    # BUG FIX: write without the row index. The original `to_csv('data.csv')`
    # saved the index, so re-reading the cache produced an extra
    # 'Unnamed: 0' column that a fresh download does not have.
    california_housing_dataframe.to_csv('data.csv', index=False)


def preprocess_features(california_housing_dataframe):
    """Prepare the input feature DataFrame.

    Selects the eight raw input features and adds one synthetic feature,
    ``rooms_per_person``.

    BUG FIX: the original computed ``selected_features`` but then copied the
    *full* dataframe, which leaked the target column ``median_house_value``
    into the feature set (target leakage when the columns are later turned
    into feature columns).

    :param california_housing_dataframe: raw California-housing DataFrame
    :return: DataFrame containing only the selected features plus
        ``rooms_per_person``
    """
    selected_features = california_housing_dataframe[
        [
            'latitude',
            'longitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income'
        ]
    ]
    processed_features = selected_features.copy()
    # Synthetic feature: rooms per person.
    processed_features['rooms_per_person'] = (
            california_housing_dataframe['total_rooms'] /
            california_housing_dataframe['population']
    )
    return processed_features


def preprocess_targets(california_housing_dataframe):
    """Build the target DataFrame.

    :param california_housing_dataframe: raw California-housing DataFrame
    :return: DataFrame with a single column ``median_house_value``, scaled
        to thousands of dollars
    """
    scaled_value = california_housing_dataframe['median_house_value'] / 1000.0
    return pd.DataFrame({'median_house_value': scaled_value})


# Training split: features from the first 12,000 rows.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_examples.describe()  # summary stats; only displayed in a notebook

# Training split: targets for the same 12,000 rows.
training_targets = preprocess_targets(california_housing_dataframe.head(12000))
training_targets.describe()

# Validation split: the last 5,000 rows.
# NOTE(review): head/tail splitting assumes the CSV rows are in random order;
# if the file is sorted (e.g. geographically) the splits are biased — confirm.
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_examples.describe()

validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))
validation_targets.describe()

# Scatter plot of the validation split: longitude vs. latitude, colored by
# median house value normalized by the training-set maximum.
plt.figure(figsize=(13, 8))
ax = plt.subplot(1, 2, 1)
ax.set_title("Validation Data")
ax.set_ylim([32, 43])
ax.set_autoscalex_on(False)
ax.set_xlim([-126, -112])
plt.scatter(validation_examples['longitude'],
            validation_examples['latitude'],
            cmap="coolwarm",
            c=validation_targets['median_house_value'] / training_targets['median_house_value'].max()
            )
plt.plot()


def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Input function feeding batches of (features, label) to the Estimator.

    :param features: DataFrame of feature columns
    :param targets: Series/DataFrame of target values
    :param batch_size: mini-batch size
    :param shuffle: whether to shuffle the data
    :param num_epochs: number of passes over the data; None repeats forever
    :return: a (features_dict, labels) tensor pair for one batch
    """
    ds = Dataset.from_tensor_slices((dict(features), targets))
    # BUG FIX: the original never used `num_epochs` (no `.repeat`), so every
    # call yielded exactly one pass regardless of the argument.
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        # BUG FIX: `Dataset.shuffle` returns a new dataset; the original
        # discarded the result, so shuffling was silently a no-op.
        ds = ds.shuffle(buffer_size=10000)
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels


def construct_feature_columns(input_features):
    """Construct TF numeric feature columns, one per input feature name.

    :param input_features: iterable of feature names (e.g. DataFrame columns)
    :return: set of ``tf.feature_column.numeric_column`` objects
    """
    # Idiom fix: set comprehension instead of set([...list comprehension...]).
    return {tf.feature_column.numeric_column(key=my_feature)
            for my_feature in input_features}


def train_model(learning_rate,
                steps,
                batch_size,
                training_examples,
                training_targets,
                validation_examples,
                validation_targets):
    """Train a linear regressor, tracking RMSE on both splits each period.

    :param learning_rate: SGD learning rate
    :param steps: total number of training steps (split across 10 periods)
    :param batch_size: mini-batch size
    :param training_examples: feature DataFrame for training
    :param training_targets: target DataFrame for training
    :param validation_examples: feature DataFrame for validation
    :param validation_targets: target DataFrame for validation
    :return: the trained ``tf.estimator.LinearRegressor``
    """
    periods = 10
    steps_per_period = steps // periods

    # Gradient clipping keeps small-batch SGD from diverging.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=construct_feature_columns(training_examples),
        optimizer=my_optimizer
    )
    print("Training model.....")
    training_rmse = []
    validation_rmse = []
    for period in range(0, periods):
        # Train for one period's worth of steps.
        linear_regressor.train(
            input_fn=lambda: my_input_fn(
                training_examples,
                training_targets['median_house_value'],
                batch_size=batch_size
            ),
            steps=steps_per_period
        )

        # Predict on the training split (single pass, no shuffling).
        training_predictions = linear_regressor.predict(
            input_fn=lambda: my_input_fn(
                training_examples,
                training_targets['median_house_value'],
                num_epochs=1,
                shuffle=False
            )
        )
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])

        # Predict on the validation split.
        validation_predictions = linear_regressor.predict(
            input_fn=lambda: my_input_fn(
                validation_examples,
                validation_targets['median_house_value'],
                num_epochs=1,
                shuffle=False
            )
        )
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])

        tmp_training_rmse = math.sqrt(
            metrics.mean_squared_error(training_predictions, training_targets)
        )
        tmp_validation_rmse = math.sqrt(
            metrics.mean_squared_error(validation_predictions, validation_targets)
        )
        print("period %02d: %0.2f" % (period, tmp_training_rmse))
        training_rmse.append(tmp_training_rmse)
        validation_rmse.append(tmp_validation_rmse)
    print("Model training finished")

    # Plot the RMSE learning curves.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    # BUG FIX: matplotlib's keyword is `label=`, not `labels=` — the original
    # raised "Unknown property labels" and produced no legend entries.
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()

    return linear_regressor


# Train the model on the training split, monitoring validation RMSE each
# period. Hyperparameters follow the crash-course exercise defaults.
linear_regressor = train_model(
    learning_rate=0.00003,
    steps=500,
    batch_size=1,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets
)

# Evaluate the trained model on the held-out TEST set.
# BUG FIX: the original re-read 'data.csv' — the TRAINING data — so the
# reported "test" RMSE was not an independent estimate. Load the separate
# california_housing_test split instead, cached locally after first download.
if os.path.exists('test_data.csv'):
    california_housing_test_data = pd.read_csv('test_data.csv', sep=',')
else:
    california_housing_test_data = pd.read_csv(
        "https://storage.googleapis.com/mledu-datasets/california_housing_test.csv", sep=",")
    california_housing_test_data.to_csv('test_data.csv', index=False)

test_examples = preprocess_features(california_housing_test_data)
test_targets = preprocess_targets(california_housing_test_data)

# Single un-shuffled pass over the test set for prediction.
predict_test_input_fn = lambda: my_input_fn(
    test_examples,
    test_targets['median_house_value'],
    num_epochs=1,
    shuffle=False
)

test_predictions = linear_regressor.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])

# Final root-mean-squared error on the independent test split.
RMSE = math.sqrt(
    metrics.mean_squared_error(test_predictions, test_targets)
)

print("Final RMSE (on test data): %0.2f" % RMSE)

  • 4
    点赞
  • 32
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
这是一个基于Python的示例代码,使用了TensorFlow和Keras库。假设你已经有了一些用于训练和测试神经网络的数据集,并且已经建立了一个适当的神经网络模型。 首先,导入所需的库和模块: ```python import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers from tensorflow.keras.layers.experimental import preprocessing import numpy as np import pandas as pd import matplotlib.pyplot as plt ``` 接下来,加载和准备数据集。假设你已经有了一个CSV文件,其中包含用于训练和测试神经网络的数据。使用Pandas库将数据加载到一个DataFrame中: ```python dataframe = pd.read_csv("data.csv") ``` 然后,将DataFrame中的数据分成训练集和测试集: ```python train_dataset = dataframe.sample(frac=0.8, random_state=0) test_dataset = dataframe.drop(train_dataset.index) ``` 接下来,将数据集转换为张量(Tensor)格式: ```python train_features = train_dataset.copy() test_features = test_dataset.copy() train_labels = train_features.pop('label') test_labels = test_features.pop('label') train_features = np.array(train_features) test_features = np.array(test_features) train_labels = np.array(train_labels) test_labels = np.array(test_labels) ``` 然后,对数据进行标准化处理: ```python normalizer = preprocessing.Normalization() normalizer.adapt(train_features) train_features = normalizer(train_features) test_features = normalizer(test_features) ``` 接下来,建立神经网络模型: ```python model = keras.Sequential([ layers.Dense(64, activation='relu', input_shape=[len(train_features[0])]), layers.Dense(64, activation='relu'), layers.Dense(1) ]) model.compile( optimizer=tf.keras.optimizers.Adam(0.001), loss='mse', metrics=['mae', 'mse'] ) ``` 然后,训练神经网络模型: ```python history = model.fit( train_features, train_labels, validation_split=0.2, verbose=0, epochs=100) ``` 最后,使用Matplotlib库绘制训练验证的损失和精度曲线: ```python plt.plot(history.history['mse'], label='mse') plt.plot(history.history['val_mse'], label='val_mse') plt.xlabel('Epoch') plt.ylabel('mse') plt.legend() plt.show() ``` 以上是一个简单的示例代码,可以帮助你开始分析神经网络的性能。当然,具体实现还需要根据你的数据集和任务需求进行调整和优化。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值