# UP:B站-刘二大人
# 原视频链接:08.加载数据集_哔哩哔哩_bilibili
# Kaggle数据集下载地址:Titanic - Machine Learning from Disaster | Kaggle
import torch
from torch.utils.data import Dataset # Dataset是一个抽象类,不可以直接实例化,需要自己定义子类去继承Dataset
from torch.utils.data import DataLoader # DataLoader是一个可以进行实例化的类
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class TitanicDataset(Dataset):
    """Map-style dataset built from a Titanic CSV file.

    The columns ``Pclass``, ``Sex``, ``SibSp``, ``Parch`` and ``Fare`` are
    used as features and ``Survived`` as the label. Non-numeric feature
    columns are one-hot encoded with ``pd.get_dummies``.

    Args:
        filepath: Path of the CSV file to load; it must contain the feature
            columns listed above and a ``Survived`` column.
    """

    def __init__(self, filepath):
        xy = pd.read_csv(filepath)
        # Number of rows in the file, i.e. the number of samples N.
        self.len = xy.shape[0]
        feature = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']
        # Numeric columns are kept as they are; non-numeric columns are
        # replaced by one indicator column per category (``Sex`` becomes
        # ``Sex_female`` / ``Sex_male``).
        encoded = pd.get_dummies(xy[feature])
        # Convert with an explicit float dtype. Recent pandas versions emit
        # bool indicator columns, and a frame mixing bool with int/float
        # columns converts to an object array, which torch.from_numpy
        # rejects. float64 is what the implicit conversion produced when the
        # indicators were integer columns.
        self.x_data = torch.from_numpy(np.array(encoded, dtype=np.float64))
        self.y_data = torch.from_numpy(np.array(xy['Survived']))

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples read from the CSV file."""
        return self.len
# Load the training CSV once and wrap it in a loader that yields shuffled
# mini-batches of 32 samples.
dataset = TitanicDataset("Titanic_train.csv")
train_loader = DataLoader(
    dataset=dataset,
    shuffle=True,
    batch_size=32,
)
class Model(torch.nn.Module):
    """Two-layer fully connected network with sigmoid activations.

    Maps a 6-dimensional input to 3 hidden units and then to a single
    output, applying a sigmoid after each linear layer.
    """

    def __init__(self):
        """Create the two linear layers and the activation modules."""
        super().__init__()
        # 6 inputs: the five selected features, with the sex feature taking
        # two one-hot columns after get_dummies.
        self.linear1 = torch.nn.Linear(6, 3)
        self.linear2 = torch.nn.Linear(3, 1)
        # Registered on the module but not used by forward().
        self.active = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Run the forward pass and return the sigmoid output of layer two."""
        hidden = self.linear1(x)
        hidden = self.sigmoid(hidden)
        out = self.linear2(hidden)
        return self.sigmoid(out)
# Build the network.
model = Model()

# Loss: binary cross-entropy, averaged over the elements of each batch.
criterion = torch.nn.BCELoss(reduction='mean')

# Optimizer: plain SGD over every model parameter.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

# Per-epoch history, filled by the training loop and used for plotting.
loss_list, epoch_list = [], []
# Training loop
if __name__ == '__main__':  # guard kept from the original (noted there as avoiding an error on Windows)
    # Number of mini-batches per epoch; used to average the summed loss.
    num_batches = len(train_loader)
    for epoch in range(500):  # iterate over training epochs
        loss_sum = 0.0
        for inputs, labels in train_loader:  # iterate over mini-batches
            # Cast both tensors to float32 before they reach the model and
            # the loss (the label column is an integer column).
            inputs = inputs.float()
            labels = labels.float()
            # The model returns one column per sample, e.g. [[p1], [p2], ...].
            # The labels are one-dimensional, so drop the trailing dimension
            # of the prediction to give both tensors the same shape.
            y_pred = model(inputs).squeeze(-1)
            loss = criterion(y_pred, labels)
            loss_sum += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        epoch_list.append(epoch)
        # criterion already averages within a batch, so the epoch mean is the
        # sum of batch losses divided by the number of batches (not by the
        # batch size).
        loss_list.append(loss_sum / num_batches)
        # Reports the loss of the last mini-batch of the epoch.
        print("epoch:", epoch, "loss:", loss.item())
# Prediction on the test file
test_data = pd.read_csv("Titanic_test.csv")
test_realresult = pd.read_csv("Titanic_gender_submission.csv")
feature = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
# One-hot encode the non-numeric columns, then convert with an explicit float
# dtype: a frame mixing bool indicator columns with int/float columns would
# otherwise become an object array, which torch.from_numpy rejects.
# NOTE(review): the encoded width depends on the categories present in this
# file; the model expects 6 input columns -- confirm both sexes occur here.
x_test = torch.from_numpy(
    np.array(pd.get_dummies(test_data[feature]), dtype=np.float64)
)
# Call the module itself rather than .forward(), and skip autograd
# bookkeeping since no gradients are needed for prediction.
with torch.no_grad():
    y_test = model(x_test.float())
# Threshold each probability at 0.5 to get a 0/1 prediction.
# NOTE(review): a missing feature value propagates as NaN, and NaN > 0.5 is
# False, so such a row is predicted as 0 -- confirm whether the test file has
# gaps and whether they should be imputed.
y_test_pred = [1 if prob > 0.5 else 0 for prob in y_test.squeeze(-1).tolist()]
# Submission page: https://www.kaggle.com/competitions/titanic/submissions
# gender_submission is the sample submission file provided by the site. It is
# used here only as a demonstration: treating it as if it were the correct
# answer shows how the prediction accuracy would be computed.
outputs = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'pred-Survived': y_test_pred,
    'real-Survived': test_realresult.Survived,
})
outputs.to_csv('predict_titantic.csv', index=False)  # index=False: do not write the index column
# A bare outputs.head() shows nothing when run as a script; print it.
print(outputs.head())
# Accuracy: the fraction of rows where the predicted label equals the
# reference label.
a = y_test_pred
b = test_realresult.Survived
# Pair the two sequences by position. This replaces the manual index counters
# and does not depend on the Series having a default 0..N-1 index.
equal_column = sum(1 for predicted, actual in zip(a, b) if predicted == actual)
print("acc=", equal_column/len(a))
# Plot the recorded loss value against the epoch index.
axes = plt.gca()
axes.plot(epoch_list, loss_list)
axes.set_xlabel("epoch")
axes.set_ylabel("loss")
plt.show()