1. Introduction
Goal: regression prediction. Use the first 93 feature columns provided by the dataset to predict the 94th (target) column. The dataset consists of train.csv and test.csv; test.csv does not include the target column, so prediction accuracy must be evaluated by submitting to the ML2021Spring-hw1 competition on Kaggle.
Dataset link: https://pan.baidu.com/s/1LOnPkn-0lKQE8Lth-93gQQ
2. Implementation
2.1 Importing the libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
import csv
import matplotlib.pyplot as plt
import json
2.2 Defining the dataset class
Three methods need to be defined: __init__, __getitem__, and __len__.
class MyData(Dataset):
    def __init__(self, path, mode='train', target_only=False):
        self.mode = mode
        # Read the csv file, dropping the header row and the id column
        with open(path, 'r') as f:
            data = list(csv.reader(f))
        data = np.array(data[1:])[:, 1:].astype(float)
        # feats selects the feature columns; by default all 93 features are used
        if target_only:
            feats = list(range(93))
        else:
            # Placeholder for now; Section 3.1 replaces this with a feature subset
            feats = list(range(93))
        # For test mode only self.data is needed (test.csv has no target column);
        # otherwise split the data into train and valid sets with random_split
        if mode == 'test':
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            train_len = int(0.8 * len(data))
            train_dataset, valid_dataset = random_split(
                data, [train_len, len(data) - train_len],
                generator=torch.Generator().manual_seed(5200319))
            train_dataset, valid_dataset = np.array(train_dataset), np.array(valid_dataset)
            if mode == "train":
                self.data = torch.FloatTensor(train_dataset[:, feats])
                self.target = torch.FloatTensor(train_dataset[:, -1])
            else:
                self.data = torch.FloatTensor(valid_dataset[:, feats])
                self.target = torch.FloatTensor(valid_dataset[:, -1])
        # Normalize the continuous features (columns 40 onward; the first 40 are
        # one-hot state indicators) to speed up convergence
        self.data[:, 40:] = (self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
            / self.data[:, 40:].std(dim=0, keepdim=True)

    # Index into the dataset; overrides Dataset.__getitem__
    def __getitem__(self, item):
        if self.mode == "test":
            return self.data[item]
        else:
            return self.data[item], self.target[item]

    # Return the number of samples in the dataset
    def __len__(self):
        return len(self.data)
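As a quick sanity check (a hypothetical smoke test, assuming the file layout used in the main script below), the class can be exercised directly:

# Hypothetical smoke test; uses the same path as the main script below
ds = MyData("./data/train.csv", mode='train', target_only=True)
x, y = ds[0]
print(len(ds), x.shape, y.shape)  # expect x of shape torch.Size([93]) and a scalar target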
2.3 Defining the data loading function
'''
drop_last=False keeps the final incomplete batch; num_workers sets the number of
worker processes used for loading. pin_memory=True places each batch in page-locked
(pinned) host memory, which cannot be swapped out to disk, so the CPU can copy it
to the GPU over PCIe directly, without an extra staging step.
'''
def prep_dataloader(path, mode, batch_size, num_workers=8, target_only=False):
    dataset = MyData(path, mode, target_only=target_only)
    dataloader = DataLoader(dataset, batch_size, shuffle=(mode == 'train'),
                            drop_last=False, num_workers=num_workers, pin_memory=True)
    return dataloader
2.4 Defining the train function
def train(train_set, valid_set, model, epoch_num, loss_function, optimizer, device, save_path):
    model.to(device)
    # Record the training and validation losses per epoch
    dict_trainParam = {'train_loss': [], 'valid_loss': []}
    # Early stopping setup
    min_loss = 1000.0
    early_stop_cnt = 0
    early_stop = 30
    epochs_used = 0
    # Training loop
    for epoch in range(1, epoch_num + 1):
        epochs_used += 1
        model.train()  # switch to training mode
        train_loss = 0.0
        train_num = 0
        for x, y in train_set:
            optimizer.zero_grad()  # clear accumulated gradients
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = loss_function(pred, y)  # compute the loss
            loss.backward()  # backpropagation via the chain rule
            optimizer.step()  # update the parameters
            train_loss += loss.detach().cpu().item() * len(x)  # batch loss weighted by batch size
            train_num += len(x)
        train_loss /= train_num
        dict_trainParam['train_loss'].append(train_loss)
        model.eval()  # switch to evaluation mode for validation
        valid_loss = 0.0
        valid_num = 0
        for x, y in valid_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                valid_loss += loss_function(pred, y).detach().cpu().item() * len(x)
                valid_num += len(x)
        valid_loss /= valid_num
        dict_trainParam['valid_loss'].append(valid_loss)
        print("epoch:{}, train loss:{}, valid loss:{}".format(epoch, train_loss, valid_loss))
        # Save a checkpoint whenever the validation loss improves;
        # otherwise count towards early stopping
        if valid_loss < min_loss:
            min_loss = valid_loss
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                  .format(epoch, min_loss))
            torch.save(model.state_dict(), save_path)
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1
            if early_stop_cnt > early_stop:
                break
    print('Finished training after {} epochs'.format(epochs_used))
    return min_loss, dict_trainParam
2.5 Defining the test function
def test(test_set, model, device, model_path):
    model.to(device)
    # Load the saved checkpoint
    model.load_state_dict(torch.load(model_path))
    model.eval()
    preds = []
    for x in test_set:
        x = x.to(device)
        # Disable gradient computation during inference
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds
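One gap to flag: the network class myNet instantiated in the main script below never appears in the post. A minimal sketch of what it could look like, assuming a small fully connected regressor (the hidden width of 64 is an arbitrary guess); the final squeeze makes the output shape (batch,) match the targets returned by MyData:

class myNet(nn.Module):
    # Hypothetical reconstruction: the original post does not show this class
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # (batch, 1) -> (batch,) so MSELoss sees shapes matching the targets
        return self.net(x).squeeze(1)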
2.6 Defining the main function
if __name__ == "__main__":
    # Load the datasets
    train_set = prep_dataloader("./data/train.csv", 'train', 8, target_only=True)
    valid_set = prep_dataloader("./data/train.csv", 'valid', 8, target_only=True)
    test_set = prep_dataloader("./data/test.csv", 'test', 8, target_only=True)
    # Pick the compute device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = myNet(93)
    criterion = nn.MSELoss(reduction='mean')
    # optim = torch.optim.Adam(model.parameters(), 0.001, (0.9, 0.99), 1e-6, 5e-4)
    optim = torch.optim.SGD(model.parameters(), 0.001, momentum=0.8, weight_decay=5e-4)
    save_path = "./data/net.pth"
    # select decides whether to train (1) or test (anything else)
    select = 0
    if select == 1:
        train_min_loss, train_param_dict = train(train_set, valid_set, model, 3000,
                                                 criterion, optim, device, save_path)
        print(train_min_loss)
        trainLossList = train_param_dict["train_loss"]
        validLossList = train_param_dict["valid_loss"]
        epochList = [i for i in range(1, len(trainLossList) + 1)]
        with open("./data/trainData.json", 'w') as f:
            json.dump(train_param_dict, f)
        plt.figure()
        plt.plot(epochList, trainLossList, color='red', label='train')
        plt.legend()
        plt.figure()
        plt.plot(epochList, validLossList, color='b', label='valid')
        plt.legend()
        plt.show()
    else:
        pred = test(test_set, model, device, save_path)
        with open("./data/predict.csv", 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["id", "tested_positive"])
            for i, p in enumerate(pred):
                writer.writerow([i, p])
2.7 Training and testing results
Training results: the left figure shows the training loss and the right figure the validation loss.
3. Model Improvements
3.1 Feature correlation analysis
The correlation between each feature and the target is analyzed with pandas, using the Pearson correlation coefficient: it measures the degree of linear correlation between two variables X and Y and takes values between -1 and 1, where 1 indicates perfect positive correlation, -1 perfect negative correlation, and 0 no linear correlation. Pearson correlation assumes the data are approximately normally distributed and linearly related.
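For reference, the coefficient is the covariance of X and Y normalized by both standard deviations:

r_{XY} = \frac{\operatorname{cov}(X, Y)}{\sigma_X \sigma_Y}
       = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}
              {\sqrt{\sum_{i=1}^{n} (x_i - \bar{x})^2}\,\sqrt{\sum_{i=1}^{n} (y_i - \bar{y})^2}}

The analysis code and its output indices follow.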
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("./data/train.csv")
# Feature columns in their original csv order: drop the leading id column and the
# trailing target column. Preserving the order matters, because the indices printed
# below are used directly as feats in the dataset class (df.columns.difference()
# would sort the names alphabetically and misalign the indices).
target_col = df.columns[-1]
features = df.columns[1:-1]
# Correlation of each feature column with the target column
correlation_matrix = df[features].corrwith(df[target_col]).to_frame('Correlation')
# Record the indices of the columns whose correlation coefficient exceeds 0.1
list_of_lists = correlation_matrix.values.tolist()
list_corr = []
for idx, value in enumerate(list_of_lists):
    if value[0] > 0.1:
        list_corr.append(idx)
print(list_corr)
print(len(list_corr))
# Plot the sorted correlation values
correlation_matrix.sort_values(by='Correlation', ascending=False).plot(
    kind='bar', figsize=(10, 6), legend=False)
plt.title('Correlation of Features with Result')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.xticks(rotation=60, ha="right")  # rotate the x labels for readability
plt.tight_layout()
plt.show()
Retrain using only the features whose correlation exceeds 0.1. This requires a change to the dataset class from Section 2.2:
if target_only:
    feats = list(range(93))
else:
    feats = [9, 10, 20, 21, 23, 26, 28, 29, 31, 34, 35, 40, 41, 42, 43, 44, 45,
             52, 53, 54, 55, 56, 57, 61, 62, 63, 76, 77, 87, 88, 89, 90, 91, 92]
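Two follow-up adjustments are needed for this to work end to end, sketched below (hypothetical changes, not shown in the original post): the dataloaders must be built with target_only=False so the subset is actually used, and the model's input dimension must drop from 93 to the 34 selected features. Note also that with only 34 columns, the normalization slice self.data[:, 40:] in __init__ becomes an empty slice and silently does nothing, so it should be adapted to the new column layout.

# Hypothetical adjustments to the main script for the 34-feature subset
train_set = prep_dataloader("./data/train.csv", 'train', 8, target_only=False)
valid_set = prep_dataloader("./data/train.csv", 'valid', 8, target_only=False)
test_set = prep_dataloader("./data/test.csv", 'test', 8, target_only=False)
model = myNet(34)  # input dimension now matches len(feats)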
3.2 Model improvements
The number of layers and neurons in the network can be increased or decreased to better fit the dataset, as sketched below; choosing a suitable loss function, optimizer, and learning rate is just as important.
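As a concrete illustration (a hypothetical variant, not the configuration used above), the depth and widths can be made constructor arguments so different architectures can be tried without rewriting the class:

class DeepNet(nn.Module):
    # Hypothetical variant: hidden layer sizes are constructor arguments, so the
    # depth and width can be tuned to the dataset without rewriting the class
    def __init__(self, input_dim, hidden_dims=(128, 64)):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers += [nn.Linear(prev, h), nn.ReLU()]
            prev = h
        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(1)

# Swapping the optimizer is equally mechanical, e.g. the commented-out Adam line
# from the main script:
# optim = torch.optim.Adam(model.parameters(), 0.001, (0.9, 0.99), 1e-6, 5e-4)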
That wraps up this workhorse grad student's first pass at the assignment, la la la!!! Keep going, keep going!!!