"""
训练代码
"""
# 导入基本的库
import os
import paddle
import numpy as np
import matplotlib.pyplot as plt
"""
UCIHousing数据集共506行,每行14列。前13列用来描述房屋的各种信息,最后一列为该类房屋价格中位数。
PaddlePaddle提供了读取uci_housing数据集的接口:paddle.text.datasets.UCIHousing
PaddlePaddle中使用`paddle.io.DataLoader`来进行数据的加载操作,通过参数batch_size控制批次大小,shuffle控制是否打乱顺序
"""
# Number of samples per mini-batch, shared by the training and validation loaders.
BATCH_SIZE = 10

# UCI housing dataset: training split and test split.
train_dataset = paddle.text.datasets.UCIHousing(mode='train')
valid_dataset = paddle.text.UCIHousing(mode='test')

# Training loader: shuffled batches; a trailing batch smaller than BATCH_SIZE
# is dropped (drop_last=True).
train_loader = paddle.io.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
# Validation loader: shuffled batches, the trailing partial batch is kept.
valid_loader = paddle.io.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# A single linear layer (13 inputs -> 1 output) optimised with plain SGD.
net = paddle.nn.Linear(13, 1)
optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=net.parameters())

# Bookkeeping for the loss curve: x-axis positions and the matching loss values.
# NOTE(review): `iter` shadows the builtin of the same name; it is kept because
# the training loop below updates this exact variable.
iters, train_costs = [], []
iter = 0
def draw_train_process(iters, train_costs):
    """Plot the training cost curve and display it.

    Args:
        iters: X-axis values passed to ``plt.plot``.
        train_costs: Y-axis values passed to ``plt.plot``.
    """
    axis_font_size = 14
    plt.title("training cost", fontsize=24)
    plt.xlabel("iter", fontsize=axis_font_size)
    plt.ylabel("cost", fontsize=axis_font_size)
    plt.plot(iters, train_costs, color='blue', label='training cost')
    plt.grid()
    plt.show()
EPOCH_NUM = 20

# Train for EPOCH_NUM passes. In every pass each training batch is fed through
# `net`, the mean squared error is computed and back-propagated, and the
# optimizer updates the parameters. The validation set is then run through the
# network and the loss of its last batch is printed.
# enumerate() yields (index, item) pairs, so each batch comes with its index.
# NOTE(review): the loop nesting was reconstructed from a source whose
# indentation was lost -- confirm against the original layout.
for pass_id in range(EPOCH_NUM):
    # ---- training ----
    for batch_id, data in enumerate(train_loader()):
        inputs = paddle.to_tensor(data[0])
        labels = paddle.to_tensor(data[1])
        out = net(inputs)
        train_loss = paddle.mean(paddle.nn.functional.square_error_cost(out, labels))
        train_loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        # float() accepts any single-element tensor, whereas `.numpy()[0]`
        # raises IndexError when the mean is returned as a 0-D tensor.
        train_loss_value = float(train_loss)
        # Report the loss every 40 batches.
        if batch_id % 40 == 0:
            print("Pass:%d, Cost:%0.5f" % (pass_id, train_loss_value))
        # `iter` counts the samples seen so far and is the x-axis of the loss curve.
        iter = iter + BATCH_SIZE
        iters.append(iter)
        train_costs.append(train_loss_value)

    # ---- validation ----
    # `test_loss` is overwritten on every batch, so the value printed below is
    # the loss of the last validation batch (0 if the loader yields nothing).
    test_loss = 0
    for batch_id, data in enumerate(valid_loader()):
        inputs = paddle.to_tensor(data[0])
        labels = paddle.to_tensor(data[1])
        out = net(inputs)
        test_loss = paddle.mean(paddle.nn.functional.square_error_cost(out, labels))
    print('Test:%d, Cost:%0.5f' % (pass_id, float(test_loss)))

# Persist the trained parameters, then show the training loss curve.
paddle.save(net.state_dict(), 'fit_a_line.pdparams')
draw_train_process(iters, train_costs)
"""
测试代码
"""
import matplotlib.pyplot as plt
import numpy as np
import paddle
# Accumulators filled by the inference loop: predicted values and the
# corresponding ground-truth values, in matching order.
infer_results, groud_truths = [], []
def draw_infer_result(groud_truths, infer_results):
    """Scatter-plot predictions against ground truth and display the figure.

    A reference line y = x over the integers 1..19 is drawn first, then the
    (ground truth, prediction) pairs are plotted as green points.

    Args:
        groud_truths: X coordinates of the scatter points.
        infer_results: Y coordinates of the scatter points.
    """
    plt.title('Boston', fontsize=24)
    # Diagonal reference line: a perfect prediction would fall on it.
    diagonal = np.arange(1, 20)
    plt.plot(diagonal, diagonal)
    axis_font_size = 14
    plt.xlabel('ground truth', fontsize=axis_font_size)
    plt.ylabel('infer result', fontsize=axis_font_size)
    plt.scatter(groud_truths, infer_results, color='green', label='training cost')
    plt.grid()
    plt.show()
# Load the test split and wrap it in a shuffled loader with batches of up to 200.
valid_dataset = paddle.text.UCIHousing(mode='test')
infer_loader = paddle.io.DataLoader(valid_dataset, batch_size=200, shuffle=True)

# Rebuild the same 13 -> 1 linear layer and restore the saved parameters.
infer_net = paddle.nn.Linear(13, 1)
param = paddle.load('fit_a_line.pdparams')
infer_net.set_dict(param)

# Run inference on the first batch produced by the loader.
data = next(infer_loader())
inputs = paddle.to_tensor(data[0])
results = infer_net(inputs)

# Print every prediction next to its ground truth and collect both for plotting.
for idx, item in enumerate(zip(results, data[1])):
    predicted, actual = item
    print("Index:%d, Infer Result: %.2f, Ground Truth: %.2f" % (idx, predicted, actual))
    infer_results.append(predicted.numpy()[0])
    groud_truths.append(actual.numpy()[0])

draw_infer_result(groud_truths, infer_results)
# 1. 梯度下降法
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
class Network(object):
    """Linear regression model trained with full-batch gradient descent.

    The prediction is ``z = x @ w + b`` where ``w`` has shape
    ``(num_of_weights, 1)`` and ``b`` is a scalar.
    """

    def __init__(self, num_of_weights):
        """Create the model with reproducible random weights and a zero bias.

        Args:
            num_of_weights: Number of input features (number of rows of ``w``).
        """
        # A fixed seed makes the random initial weights identical on every run.
        np.random.seed(0)
        self.w = np.random.randn(num_of_weights, 1)
        self.b = 0.

    def load_data(self, datafile='../data/home/housing.data', ratio=0.8):
        """Load, normalise and split the housing data.

        The file is read as space-separated numbers and reshaped to 14 columns
        (13 features followed by the target, MEDV). Every column is min-max
        scaled with statistics taken from the training rows only, so no
        information from the test rows leaks into the scaling.

        Args:
            datafile: Path of the text data file.
            ratio: Fraction of rows (taken from the top) used for training.

        Returns:
            A ``(training_data, test_data)`` tuple of 2-D arrays; the two
            splits do not overlap.
        """
        feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
        feature_num = len(feature_names)
        # -1 lets NumPy infer the number of rows from the number of columns.
        data = np.fromfile(datafile, sep=' ').reshape(-1, feature_num)
        offset = int(data.shape[0] * ratio)
        training_data = data[:offset]
        # axis=0 reduces over rows, giving one statistic per column.
        minimums = training_data.min(axis=0)
        spans = training_data.max(axis=0) - minimums
        # A column that is constant over the training rows has a zero span;
        # dividing by 1 instead maps it to 0 rather than producing NaN/inf.
        spans[spans == 0] = 1.0
        data = (data - minimums) / spans
        return data[:offset], data[offset:]

    def forward(self, x):
        """Return the predictions ``x @ w + b`` for the sample matrix ``x``."""
        return np.dot(x, self.w) + self.b

    def loss(self, z, y):
        """Return the mean squared error between predictions ``z`` and targets ``y``."""
        error = z - y
        num_samples = error.shape[0]
        # Sum of squared errors divided by the sample count exactly once.
        return np.sum(error * error) / num_samples

    def gradient(self, x, y):
        """Return the gradients of the loss with respect to ``w`` and ``b``.

        Args:
            x: Sample matrix, one row per sample.
            y: Target column, one row per sample.

        Returns:
            ``(gradient_w, gradient_b)`` where ``gradient_w`` is a column
            vector with one row per feature and ``gradient_b`` is a scalar.
        """
        residual = self.forward(x) - y
        # Average the per-sample gradients, then restore the column shape of w.
        gradient_w = np.mean(residual * x, axis=0)[:, np.newaxis]
        gradient_b = np.mean(residual)
        return gradient_w, gradient_b

    def updata(self, gradient_w, gradient_b, eta=0.01):
        """Apply one gradient-descent step to this instance's parameters.

        The method name (sic) is kept as-is so existing callers keep working.

        Args:
            gradient_w: Gradient with respect to ``w``.
            gradient_b: Gradient with respect to ``b``.
            eta: Learning rate.
        """
        # Update this instance rather than whatever a global `net` refers to.
        self.w = self.w - eta * gradient_w
        self.b = self.b - eta * gradient_b

    def train(self, x, y, iterations=100, eta=0.01):
        """Run ``iterations`` full-batch gradient-descent steps.

        Args:
            x: Sample matrix, one row per sample.
            y: Target column, one row per sample.
            iterations: Number of update steps.
            eta: Learning rate.

        Returns:
            List with the loss measured before each update step.
        """
        losses = []
        for i in range(iterations):
            current_loss = self.loss(self.forward(x), y)
            gradient_w, gradient_b = self.gradient(x, y)
            self.updata(gradient_w, gradient_b, eta)
            losses.append(current_loss)
            # Progress report on every tenth step.
            if (i + 1) % 10 == 0:
                print('iter {}, loss {}'.format(i, current_loss))
        return losses
# Build a 13-feature model and load the normalised data splits.
net = Network(13)
train_data, test_data = net.load_data()

# The last column is the target; every column before it is a feature.
x, y = train_data[:, :-1], train_data[:, -1:]

# Train with full-batch gradient descent.
num_iterations = 1000
losses = net.train(x, y, iterations=num_iterations, eta=0.01)

# Plot how the loss evolves over the iterations.
plot_x = np.arange(num_iterations)
plot_y = np.array(losses)
plt.plot(plot_x, plot_y)
plt.show()
# 2. 随机梯度下降法
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
class Network(object):
    """Linear regression model trained with mini-batch stochastic gradient descent.

    The prediction is ``z = x @ w + b`` where ``w`` has shape
    ``(num_of_weights, 1)`` and ``b`` is a scalar.
    """

    def __init__(self, num_of_weights):
        """Create the model with random normal weights and a zero bias.

        Args:
            num_of_weights: Number of input features (number of rows of ``w``).
        """
        # No seed is set here, so the initial weights differ between runs
        # unless the caller seeds NumPy's global generator.
        self.w = np.random.randn(num_of_weights, 1)
        self.b = 0.

    def load_data(self, datafile='../data/home/housing.data', ratio=0.8):
        """Load, normalise and split the housing data.

        The file is read as space-separated numbers and reshaped to 14 columns
        (13 features followed by the target, MEDV). Every column is min-max
        scaled with statistics taken from the training rows only, so no
        information from the test rows leaks into the scaling.

        Args:
            datafile: Path of the text data file.
            ratio: Fraction of rows (taken from the top) used for training.

        Returns:
            A ``(training_data, test_data)`` tuple of 2-D arrays; the two
            splits do not overlap.
        """
        feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
        feature_num = len(feature_names)
        # -1 lets NumPy infer the number of rows from the number of columns.
        data = np.fromfile(datafile, sep=' ').reshape(-1, feature_num)
        offset = int(data.shape[0] * ratio)
        training_data = data[:offset]
        # axis=0 reduces over rows, giving one statistic per column.
        minimums = training_data.min(axis=0)
        spans = training_data.max(axis=0) - minimums
        # A column that is constant over the training rows has a zero span;
        # dividing by 1 instead maps it to 0 rather than producing NaN/inf.
        spans[spans == 0] = 1.0
        data = (data - minimums) / spans
        return data[:offset], data[offset:]

    def forward(self, x):
        """Return the predictions ``x @ w + b`` for the sample matrix ``x``."""
        return np.dot(x, self.w) + self.b

    def loss(self, z, y):
        """Return the mean squared error between predictions ``z`` and targets ``y``."""
        error = z - y
        num_samples = error.shape[0]
        return np.sum(error * error) / num_samples

    def gradient(self, x, y):
        """Return the gradients of the loss with respect to ``w`` and ``b``.

        Args:
            x: Sample matrix, one row per sample.
            y: Target column, one row per sample.

        Returns:
            ``(gradient_w, gradient_b)`` where ``gradient_w`` is a column
            vector with one row per feature and ``gradient_b`` is a scalar.
        """
        residual = self.forward(x) - y
        sample_count = x.shape[0]
        # Average the per-sample gradients, then restore the column shape of w.
        gradient_w = 1. / sample_count * np.sum(residual * x, axis=0)
        gradient_w = gradient_w[:, np.newaxis]
        gradient_b = 1. / sample_count * np.sum(residual)
        return gradient_w, gradient_b

    def updata(self, gradient_w, gradient_b, eta=0.01):
        """Apply one gradient-descent step to this instance's parameters.

        The method name (sic) is kept as-is so existing callers keep working.

        Args:
            gradient_w: Gradient with respect to ``w``.
            gradient_b: Gradient with respect to ``b``.
            eta: Learning rate.
        """
        # Update this instance rather than whatever a global `net` refers to.
        self.w = self.w - eta * gradient_w
        self.b = self.b - eta * gradient_b

    def train(self, training_data, num_epochs, batch_size=10, eta=0.01):
        """Train with mini-batch SGD and return the per-batch losses.

        Note that ``training_data`` is shuffled in place at the start of every
        epoch, so the caller's array ends up with its rows reordered.

        Args:
            training_data: 2-D array; the last column is the target and the
                preceding columns are the features.
            num_epochs: Number of passes over ``training_data``.
            batch_size: Number of rows per mini-batch (the last batch of an
                epoch may be smaller).
            eta: Learning rate.

        Returns:
            List with the loss of every mini-batch, measured before its update.
        """
        n = len(training_data)
        losses = []
        for epoch_id in range(num_epochs):
            # Reshuffle so every epoch sees differently composed mini-batches.
            np.random.shuffle(training_data)
            mini_batches = [training_data[k:k + batch_size] for k in range(0, n, batch_size)]
            for iter_id, mini_batch in enumerate(mini_batches):
                x = mini_batch[:, :-1]
                y = mini_batch[:, -1:]
                loss = self.loss(self.forward(x), y)
                gradient_w, gradient_b = self.gradient(x, y)
                self.updata(gradient_w, gradient_b, eta)
                losses.append(loss)
                print('Epoch {:3d} / iter {:3d}, loss = {:.4f}'.format(epoch_id, iter_id, loss))
        return losses
# Build a 13-feature model and load the normalised data splits.
net = Network(13)
train_data, test_data = net.load_data()

# Train with mini-batch SGD: 50 epochs, 100 rows per batch, learning rate 0.1.
losses = net.train(train_data, num_epochs=50, batch_size=100, eta=0.1)

# Plot the loss of every mini-batch in the order it was processed.
plot_y = np.array(losses)
plot_x = np.arange(len(losses))
plt.plot(plot_x, plot_y)
plt.show()
# 3. 通过PaddlePaddle重写
#加载飞桨、Numpy和相关类库
import paddle # 飞桨的主库
# from paddle.nn import Linear # 组网相关的API,例如 Linear 、卷积 Conv2D 、 循环神经网络 LSTM 、损失函数 CrossEntropyLoss 、 激活函数 ReLU 等。
# Linear:神经网络的全连接层函数,即包含所有输入权重相加的基本神经元结构。
# 在房价预测任务中,使用只有一层的神经网络(全连接层)来实现线性回归模型。
import paddle.nn.functional as F
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph import Linear
import numpy as np
import os
import random
def load_data(datafile='../data/home/housing.data', ratio=0.8):
    """Load, normalise and split the housing data.

    The file is read as space-separated numbers and reshaped to 14 columns
    (13 features followed by the target, MEDV). Every column is min-max scaled
    with statistics taken from the training rows only, so no information from
    the test rows leaks into the scaling.

    Args:
        datafile: Path of the text data file.
        ratio: Fraction of rows (taken from the top) used for training.

    Returns:
        A ``(training_data, test_data)`` tuple of 2-D arrays; the two splits
        do not overlap.
    """
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    feature_num = len(feature_names)
    # -1 lets NumPy infer the number of rows from the number of columns.
    data = np.fromfile(datafile, sep=' ').reshape(-1, feature_num)
    offset = int(data.shape[0] * ratio)
    training_data = data[:offset]
    # axis=0 reduces over rows, giving one statistic per column.
    minimums = training_data.min(axis=0)
    spans = training_data.max(axis=0) - minimums
    # A column that is constant over the training rows has a zero span;
    # dividing by 1 instead maps it to 0 rather than producing NaN/inf.
    spans[spans == 0] = 1.0
    data = (data - minimums) / spans
    return data[:offset], data[offset:]
class Regressor(fluid.dygraph.Layer):
    """Linear-regression model made of a single fully connected layer."""

    def __init__(self):
        """Initialise the parent layer and create the fully connected layer."""
        super().__init__()
        # One fully connected layer: 13 input features -> 1 output, no activation.
        # NOTE(review): `Linear(input_dim=..., output_dim=..., act=...)` is the
        # legacy fluid signature, while the rest of this script uses the
        # paddle 2.x API -- confirm the targeted Paddle version still ships it.
        self.fc = Linear(input_dim=13, output_dim=1, act=None)

    def forward(self, inputs):
        """Return the output of the fully connected layer for ``inputs``."""
        return self.fc(inputs)
# NOTE(review): the nesting of the two guard blocks below was reconstructed
# from a source whose indentation was lost -- confirm against the original.
with fluid.dygraph.guard():
    # Instantiate the linear-regression model and switch it to training mode.
    model = Regressor()
    model.train()
    # Load the normalised training / test splits.
    training_data, test_data = load_data()
    # Plain stochastic gradient descent with a learning rate of 0.01.
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

with dygraph.guard(fluid.CPUPlace()):
    EPOCH_NUM = 100  # number of passes over the training data
    BATCH_SIZE = 10  # rows per mini-batch

    for epoch_id in range(EPOCH_NUM):
        # Shuffle in place so every epoch sees differently composed batches.
        np.random.shuffle(training_data)
        # Cut the shuffled data into consecutive slices of BATCH_SIZE rows.
        batch_starts = range(0, len(training_data), BATCH_SIZE)
        mini_batches = [training_data[k:k + BATCH_SIZE] for k in batch_starts]

        for iter_id, mini_batch in enumerate(mini_batches):
            # Features are all columns but the last; the last column is the price.
            x = np.array(mini_batch[:, :-1]).astype('float32')
            y = np.array(mini_batch[:, -1:]).astype('float32')
            # Convert the NumPy arrays into Paddle tensors.
            house_features = paddle.to_tensor(x)
            prices = paddle.to_tensor(y)

            # Forward pass followed by the mean squared error of the batch.
            predicts = model(house_features)
            loss = F.square_error_cost(predicts, label=prices)
            avg_loss = paddle.mean(loss)
            if iter_id % 20 == 0:
                print("epoch: {}, iter: {}, loss is: {}".format(epoch_id, iter_id, avg_loss.numpy()))

            # Backward pass, parameter update, then reset the gradients.
            avg_loss.backward()
            opt.minimize(avg_loss)
            model.clear_gradients()

    # Persist the trained parameters.
    paddle.save(model.state_dict(), 'LR_model.pdparams')
    print("模型保存成功,模型参数保存在LR_model.pdparams中")