一、目的
对kaggle上的Otto Group Product Classification Challenge项目进行多分类,数据集中每个个体提供93个特征,分9类。
二、编程
数据集官网下载即可,https://www.kaggle.com/c/otto-group-product-classification-challenge/data,还可以看到训练集与测试集的细节以及需要提交怎样的预测文件。
使用pytorch框架,本次搭建4层神经网络,隐藏层使用relu激活函数;输出层直接输出logits,softmax由CrossEntropyLoss在内部完成。
下面是本次编程使用的模块:
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
本次编程同titanic项目不同的是,对于数据的处理较为简单:训练集将target类别名转化为整数id存储,方便计算交叉熵损失;测试集的预测结果则转为one-hot格式存储,方便生成提交文件。
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
# 将类别标签转换为id标签,方便后面计算交叉熵损失
# Map class-name labels (e.g. 'Class_3') to their integer indices so the
# targets can be fed straight into CrossEntropyLoss.
def lables2id(lables):
    target_lables = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
    # list.index keeps the original contract: ValueError on an unknown label.
    return [target_lables.index(lable) for lable in lables]
# 建立数据集类
# Dataset over the Otto training CSV: features as a float tensor, targets
# as integer class ids (via lables2id) for CrossEntropyLoss.
class MyDataset(Dataset):
    def __init__(self, filepath):
        frame = pd.read_csv(filepath)
        self.len = frame.shape[0]
        # Drop the leading 'id' column and the trailing 'target' column,
        # keeping only the 93 feature columns.
        features = np.array(frame)[:, 1:-1].astype(float)
        self.data_x = torch.tensor(features)
        # Integer class indices for each row's target label.
        self.data_y = lables2id(frame['target'])

    def __getitem__(self, index):
        return self.data_x[index], self.data_y[index]

    def __len__(self):
        return self.len
dataset = MyDataset('./dataset/kaggle/train.csv')
# Build the data loader: reshuffle every epoch, 64 samples per mini-batch,
# load in the main process (num_workers=0).
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64, num_workers=0)
# 构建网络模型
# 4-layer fully connected network for the Otto 9-class problem:
# 93 input features -> 64 -> 32 -> 16 -> 9 logits.
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = torch.nn.Linear(93, 64)
        self.l2 = torch.nn.Linear(64, 32)
        self.l3 = torch.nn.Linear(32, 16)
        self.l4 = torch.nn.Linear(16, 9)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return raw logits; CrossEntropyLoss applies log-softmax itself."""
        x = self.relu(self.l1(x))
        x = self.relu(self.l2(x))
        x = self.relu(self.l3(x))
        return self.l4(x)

    def predict(self, x):
        """Return hard one-hot predictions as a DataFrame with 9 columns.

        x: float tensor of shape (n_samples, 93).
        """
        with torch.no_grad():
            # Reuse forward instead of duplicating the layers: the previous
            # version applied an extra ReLU to the output layer, which
            # zeroed negative logits and produced argmax ties.
            logits = self.forward(x)
            # Index of the highest-scoring class per row.
            _, predicted = torch.max(logits, dim=1)
            # Force all 9 categories so the one-hot frame always has exactly
            # 9 columns, even when some class is never predicted (otherwise
            # assigning the Class_1..Class_9 column names downstream fails).
            y = pd.get_dummies(pd.Categorical(predicted.numpy(), categories=list(range(9))))
            return y
model = Net()
# CrossEntropyLoss combines log-softmax and negative log-likelihood, so the
# model's forward must return raw logits (no softmax layer).
criterion = torch.nn.CrossEntropyLoss()
# Plain SGD with momentum 0.5 and learning rate 0.01.
optimizer = torch.optim.SGD(model.parameters(), momentum=0.5, lr=0.01)
# 训练
# Run one full pass over train_loader, printing the mean loss of every
# 300 mini-batches.
def train(epoch):
    running_loss = 0.
    for batch_idx, (inputs, target) in enumerate(train_loader):
        # Features are stored as float64; the linear layers expect float32.
        inputs = inputs.float()
        optimizer.zero_grad()
        # Forward pass, loss, backward pass, parameter update.
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report and reset the running average every 300 batches.
        if (batch_idx + 1) % 300 == 0:
            print('[%d %5d] loss: %.3f' % (epoch+1, batch_idx+1, running_loss/300))
            running_loss = 0.
if __name__ == "__main__":
    # Train for 10 epochs.
    for epoch in range(10):
        train(epoch)

    def predict_sive():
        """Predict on the test set and write the Kaggle submission CSV."""
        test_data = pd.read_csv("./dataset/kaggle/test.csv")
        # Drop the leading 'id' column; the remaining 93 columns are features.
        test_inputs = torch.tensor(np.array(test_data)[:, 1:].astype(float))
        out = model.predict(test_inputs.float())
        print(out.shape)
        lables = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        # Guard against classes that were never predicted: pd.get_dummies can
        # return fewer than 9 columns, which would make the column-name
        # assignment below raise a length-mismatch error.
        out = out.reindex(columns=list(range(9)), fill_value=0)
        out.columns = lables
        # Prepend the id column required by the submission format.
        out.insert(0, 'id', test_data['id'])
        # out is already a DataFrame; no extra pd.DataFrame copy needed.
        out.to_csv("my_predict.csv", index=False)

    predict_sive()
损失下降如下:
[1 300] loss: 1.714
[1 600] loss: 1.040
[1 900] loss: 0.891
[2 300] loss: 0.769
[2 600] loss: 0.737
[2 900] loss: 0.716
[3 300] loss: 0.691
[3 600] loss: 0.688
[3 900] loss: 0.677
[4 300] loss: 0.650
[4 600] loss: 0.659
[4 900] loss: 0.648
[5 300] loss: 0.628
[5 600] loss: 0.627
[5 900] loss: 0.629
[6 300] loss: 0.611
[6 600] loss: 0.601
[6 900] loss: 0.604
[7 300] loss: 0.585
[7 600] loss: 0.601
[7 900] loss: 0.584
[8 300] loss: 0.579
[8 600] loss: 0.579
[8 900] loss: 0.579
[9 300] loss: 0.573
[9 600] loss: 0.549
[9 900] loss: 0.567
[10 300] loss: 0.546
[10 600] loss: 0.561
[10 900] loss: 0.557