# UP:B站-刘二大人
# 原视频链接:09.多分类问题_哔哩哔哩_bilibili
# 数据集下载地址:Otto Group Product Classification Challenge | Kaggle
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import pandas as pd
import matplotlib.pyplot as plt
# The string labels in the target column must be converted to integer labels first.
def target_to_number(labels):
    """Map class-name strings to zero-based integer class indices.

    Args:
        labels: Iterable of class names, each one of "Class_1" ... "Class_9".

    Returns:
        list[int]: One index per input label, where "Class_1" -> 0 and
        "Class_9" -> 8.

    Raises:
        ValueError: If a label is not one of the nine known class names.
    """
    target_character = ["Class_1", "Class_2", "Class_3", "Class_4", "Class_5", "Class_6", "Class_7", "Class_8", "Class_9"]
    # Build the name -> index table once instead of scanning the list for every label.
    index_of = {name: position for position, name in enumerate(target_character)}
    try:
        return [index_of[label] for label in labels]
    except KeyError as err:
        # list.index() raised ValueError for unknown labels; keep that exception type.
        raise ValueError("unknown class label: %r" % (err.args[0],)) from None
class TrainDataset(Dataset):
    """Training samples read from a CSV file, exposed as (features, label) pairs.

    Features are every column except the first and the last; the label comes
    from the "target" column, converted to an integer by target_to_number().
    """

    def __init__(self, filepath):
        """Read *filepath* with pandas and cache features, labels and row count."""
        frame = pd.read_csv(filepath)
        # Skip the first column (presumably the sample id) and the last column
        # (presumably the target); astype(float) produces float64 values.
        feature_block = np.array(frame)[:, 1:-1]
        self.x_data = torch.from_numpy(feature_block.astype(float))
        # Labels stay a plain Python list of ints.
        self.y_data = target_to_number(frame["target"])
        self.len = len(frame)

    def __getitem__(self, index):
        """Return the (feature tensor, integer label) pair stored at *index*."""
        sample = self.x_data[index]
        label = self.y_data[index]
        return sample, label

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.len
# Load the training CSV once and serve it in shuffled mini-batches of 64 samples.
dataset = TrainDataset("Otto Group Product Classification Challenge_train.csv")
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)
class Model(torch.nn.Module):
    """Fully connected classifier: 93 inputs -> 64 -> 32 -> 16 -> 9 outputs."""

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(93, 64)
        self.l2 = torch.nn.Linear(64, 32)
        self.l3 = torch.nn.Linear(32, 16)
        self.l4 = torch.nn.Linear(16, 9)  # final layer: one output per class (9 classes)
        self.active = torch.nn.ReLU()

    def forward(self, x):
        """Apply the three ReLU-activated hidden layers, then the output layer.

        The output of the last layer is returned as-is, with no activation
        applied to it.
        """
        for hidden in (self.l1, self.l2, self.l3):
            x = self.active(hidden(x))
        return self.l4(x)
# Network, loss function and optimiser used by train() and test().
model = Model()
# Cross-entropy loss between the model outputs and the integer class targets.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
# Per-batch loss values appended by train(); plotted once training finishes.
loss_list = list()
def train(epoch):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Every batch loss is appended to the module-level ``loss_list``. After
    every 300th batch the running mean loss of the current epoch is printed.

    Args:
        epoch: Zero-based epoch index; only used for the progress printout.
    """
    loss_sum = 0.0
    for i, (inputs, targets) in enumerate(train_loader):
        # Cast the features to float32 before feeding them to the model.
        inputs = inputs.float()
        y_pred = model(inputs)
        loss = criterion(y_pred, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        loss_list.append(batch_loss)
        loss_sum += batch_loss
        if i % 300 == 299:
            # loss_sum covers batches 0..i, i.e. i + 1 batches, so divide by
            # i + 1 (the original divided by i, overstating the mean).
            print("[%d %5d] loss: %.3f" % (epoch + 1, i + 1, loss_sum / (i + 1)))
if __name__ == '__main__':
    # Train for 50 epochs; train() appends every batch loss to loss_list.
    num_epochs = 50
    for epoch in range(num_epochs):
        train(epoch)

    # Plot the recorded batch losses against their position in loss_list.
    steps = range(len(loss_list))
    plt.plot(steps, loss_list)
    plt.xlabel('step')
    plt.ylabel('loss')
    plt.show()
def test():
    """Predict a class for every test-set row and write the result to CSV.

    Reads "Otto Group Product Classification Challenge_test.csv", feeds every
    column except the first to the module-level ``model``, takes the
    highest-scoring class per row and writes a one-hot table with columns
    ``id, Class_1 ... Class_9`` to "otto-group-product_predictions.csv".
    """
    labels = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
    test_data = pd.read_csv("Otto Group Product Classification Challenge_test.csv")

    # The original stored these two tensors as attributes on the function
    # object; that is preserved in case other code reads them after the call.
    test.x_data = torch.from_numpy(np.array(test_data)[:, 1:].astype(np.float32))
    # Inference only: skip autograd bookkeeping for the whole test set.
    with torch.no_grad():
        # One row of 9 raw scores per sample (no softmax is applied, so these
        # are scores rather than probabilities).
        test.y_data = model(test.x_data)

    # Index of the highest score in each row, i.e. the predicted class 0-8.
    _, predicted = torch.max(test.y_data, dim=1)
    class_idx = predicted.numpy()

    # Build the one-hot table with a fixed nine columns. Deriving the columns
    # from the predictions (pd.get_dummies) drops any class that is never
    # predicted, which then breaks the column assignment.
    one_hot = np.zeros((class_idx.shape[0], len(labels)), dtype=np.uint8)
    one_hot[np.arange(class_idx.shape[0]), class_idx] = 1
    out = pd.DataFrame(one_hot, columns=labels)

    # Put the sample ids in front as the first column.
    out.insert(0, 'id', test_data['id'])
    out.to_csv('otto-group-product_predictions.csv', index=False)  # index=False: do not write the row index
# Generate predictions for the test set and write them to the output CSV.
test()