工程结构:
training.py:
数据预处理部分显得非常笨拙……请大家指教。
import numpy as np
import pandas as pd
from network import RNN
from dataset import MyDataset
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.utils.data.dataloader as DataLoader
import matplotlib.pyplot as plt
# ---- Prepare data ----
# Read the raw CSV and keep only the PM2.5 rows (測項 == "PM2.5").
# pd.read_csv already returns a DataFrame; no extra wrapper needed.
data = pd.read_csv("train.csv")
pmData = data.query('測項=="PM2.5"')
# Drop the non-numeric columns (date, station, measured item).
trainData = np.array(pmData.drop(['日期', '測站', '測項'], axis=1))
trainData = np.delete(trainData, 0, 0)  # drop the first row
trainData = np.delete(trainData, 0, 1)  # drop the first column
trainData = trainData.flatten()
(n,) = trainData.shape
print(n)
# Sliding windows: each sample is 9 consecutive hourly readings and its
# label is the reading of the 10th hour.  Building the matrix in one shot
# replaces the original O(n^2) np.append-per-row loop.
cycleData = np.stack([trainData[i:i + 9] for i in range(n - 9)])
cycleLabel = trainData[9:].reshape((-1, 1))
cycleData = cycleData.astype(np.float32)
cycleLabel = cycleLabel.astype(np.float32)
print(cycleData.shape)  # (n-9, 9) — [5488, 9] in the author's run
print(cycleLabel.shape)  # (n-9, 1) — [5488, 1] in the author's run
# ---- CUDA setting ----
# assert is stripped under `python -O`; raise explicitly instead.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA device is required for training")
cuda_device = torch.device("cuda")  # device object representing GPU
# ---- Training setting ----
errors = []
k_folds = 8  # 8-fold cross validation
epochs = 30
lr = [0.001, 0.001, 0.004, 0.008, 0.01, 0.04, 0.08, 0.1]  # learning rate per fold
batch_size = 7
# `reduce=True` is deprecated; reduction='mean' is the modern equivalent.
criterion = nn.MSELoss(reduction='mean')
criterion.cuda()
# Initial hidden state shape (num_layers, batch, hidden_size);
# 32 matches the hidden size in the network definition.
h_state = torch.zeros([1, batch_size, 32])
kf = KFold(n_splits=k_folds)
fold = 0
for train_index, test_index in kf.split(cycleData):
    # To save time, only the first fold is actually trained.
    if fold != 0:
        break
    dataset = MyDataset(cycleData[train_index], cycleLabel[train_index])
    # drop_last keeps every batch at exactly batch_size, matching the fixed
    # batch dimension of h_state (an undersized final batch would make the
    # RNN reject the hidden state).
    dataloader = DataLoader.DataLoader(dataset, batch_size=batch_size,
                                       shuffle=True, drop_last=True)
    net = RNN()
    net = net.float()
    net.cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr[fold])
    # per-epoch average training loss
    err_list = []
    print("\ntraining...")
    for _ in range(epochs):
        epoch_loss = 0.0
        num_batches = 0
        for x, lbs in dataloader:
            outputs, h_state = net(x.float().cuda(), h_state.cuda())
            # keep only the prediction at the last time step: (batch, 1)
            outputs = outputs[:, -1, :]
            # detach so gradients do not flow across batches
            h_state = h_state.detach()
            lbs = lbs.squeeze(1).cuda()
            loss = criterion(outputs, lbs)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Mean-reduced MSE over a (batch, 1) tensor already equals
            # sum((outputs - lbs)**2) / batch_size, so reuse loss directly
            # instead of recomputing it.
            epoch_loss += loss.item()
            num_batches += 1
        # Average over the actual number of batches.  (The original divided
        # by the last enumerate index, an off-by-one.)
        err_list.append(epoch_loss / num_batches)
    print(err_list)
    errors.append(err_list)
    # paint the learning curve
    plt.figure(fold)
    x = range(len(errors[-1]))
    y = errors[-1]
    plt.plot(x, y)
    plt.title("k=" + str(fold) + ", lr=" + str(lr[fold]))
    plt.show()
    fold += 1
    # NOTE: fold was just incremented, so fold 0's model is saved as
    # 'RNN_1.pkl'; testing.py loads that exact name — keep this ordering.
    torch.save(net, 'RNN_' + str(fold) + '.pkl')
network.py:
一个网上到处都是的简单结构
import torch.nn as nn
class RNN(nn.Module):
    """Single-layer vanilla RNN (hidden size 32) with a linear read-out.

    forward expects a batch-first input of shape (batch, seq_len, 1) and a
    hidden state of shape (1, batch, 32); it returns a per-time-step
    prediction of shape (batch, seq_len, 1) together with the new hidden
    state.
    """

    def __init__(self):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size=1, hidden_size=32,
                          num_layers=1, batch_first=True)
        self.out = nn.Linear(32, 1)

    def forward(self, x, h):
        hidden_seq, next_h = self.rnn(x, h)
        return self.out(hidden_seq), next_h
dataset.py:
简单粗暴的dataset,好像使用的必要性不大,但是习惯了这种……
from torch.utils.data import Dataset
import torch
class MyDataset(Dataset):
    """Wrap a pair of numpy arrays (samples, labels) as a torch Dataset.

    A trailing feature axis is appended to both arrays, so a (N, 9) sample
    matrix yields per-item tensors of shape (9, 1) and a (N, 1) label
    matrix yields labels of shape (1, 1).
    """

    def __init__(self, a, b):
        # unsqueeze(-1) adds the feature dimension the RNN expects
        self.file = torch.from_numpy(a).unsqueeze(-1)
        self.label = torch.from_numpy(b).unsqueeze(-1)

    def __getitem__(self, index):
        return self.file[index], self.label[index]

    def __len__(self):
        return self.label.shape[0]
testing.py:
把training.py改了改……其实数据处理都是重复的
import numpy as np
import pandas as pd
from network import RNN
from dataset import MyDataset
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.utils.data.dataloader as DataLoader
import matplotlib.pyplot as plt
# ---- Prepare data (duplicated from training.py) ----
# pd.read_csv already returns a DataFrame; keep only the PM2.5 rows.
data = pd.read_csv("train.csv")
pmData = data.query('測項=="PM2.5"')
# Drop the non-numeric columns (date, station, measured item).
trainData = np.array(pmData.drop(['日期', '測站', '測項'], axis=1))
trainData = np.delete(trainData, 0, 0)  # drop the first row
trainData = np.delete(trainData, 0, 1)  # drop the first column
trainData = trainData.flatten()
(n,) = trainData.shape
print(n)
# Sliding windows of 9 hours with the 10th hour as the label, built in one
# vectorized step instead of the original O(n^2) np.append loop.
cycleData = np.stack([trainData[i:i + 9] for i in range(n - 9)])
cycleLabel = trainData[9:].reshape((-1, 1))
cycleData = cycleData.astype(np.float32)
cycleLabel = cycleLabel.astype(np.float32)
print(cycleData.shape)
print(cycleLabel.shape)
# ---- CUDA setting ----
# assert is stripped under `python -O`; raise explicitly instead.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA device is required for evaluation")
cuda_device = torch.device("cuda")  # device object representing GPU
# ---- Testing setting ----
errors = []
k_folds = 8
epochs = 30
lr = [0.001, 0.001, 0.004, 0.008, 0.01, 0.04, 0.08, 0.1]
batch_size = 1  # NOTE: must be 1 here (evaluation streams one window at a time)
# `reduce=True` is deprecated; reduction='mean' is the modern equivalent.
criterion = nn.MSELoss(reduction='mean')
criterion.cuda()
h_state = torch.zeros([1, batch_size, 32])
# use the k_folds constant instead of a second hard-coded 8
kf = KFold(n_splits=k_folds)
fold = 0
for train_index, test_index in kf.split(cycleData):
    # Only the first fold's model is actually evaluated.
    if fold != 0:
        break
    testset = MyDataset(cycleData[test_index], cycleLabel[test_index])
    testloader = DataLoader.DataLoader(testset)  # batch_size defaults to 1
    labels = cycleLabel[test_index].flatten().tolist()
    # SECURITY NOTE: torch.load unpickles arbitrary objects and can execute
    # code — only load checkpoint files you created yourself.
    net = torch.load('RNN_1.pkl')
    net.eval()
    predict = []
    with torch.no_grad():
        for x, _ in testloader:
            outputs, h_state = net(x.float().cuda(), h_state.cuda())
            outputs = outputs[:, -1, :]  # prediction at the last time step
            h_state = h_state.detach()
            # (the original computed a per-item loss here and discarded it;
            # that dead computation has been removed)
            predict.append(outputs.squeeze(1).item())
    plt.figure(fold)
    x = range(len(predict))
    plt.plot(x, labels, color='orange', label='label')
    plt.plot(x, predict, color='cyan', label='predict')
    plt.legend()
    plt.title("model RNN_1")
    plt.show()
    fold += 1
结果:
总结:
1、个人认为,按此方法处理数据以后,其实并没有使用RNN的必要,普通NN将九个输入视为九个特征完全可以达到差不多的效果。但显然RNN参数更少。时间有限,此模型没有调参,但基于样本量,完全可以使RNN结构更复杂一点。
2、代码学习借鉴了网络上各种博客,感谢!同时欢迎纠错和提问。
3、模型资源已上传,审核通过后会附上链接。