1. Dataset
import os
import pickle
import re

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
class DailyDialogueDataset(Dataset):
    """One split (train/test/valid) of the pickled DailyDialog corpus.

    The pickle at `path` holds an 8-tuple of dicts/lists keyed by
    conversation id: speakers, tokenized input sequences, per-conversation
    max sequence length, act labels, emotion labels, and the three id lists.
    """

    def __init__(self, split, path):
        # Use a context manager so the pickle file handle is closed
        # (the original `pickle.load(open(path, 'rb'))` leaked it).
        with open(path, 'rb') as f:
            (self.Speakers, self.InputSequence, self.InputMaxSequenceLength,
             self.ActLabels, self.EmotionLabels,
             self.trainId, self.testId, self.validId) = pickle.load(f)
        if split == 'train':
            self.keys = list(self.trainId)
        elif split == 'test':
            self.keys = list(self.testId)
        elif split == 'valid':
            self.keys = list(self.validId)
        else:
            # Fail loudly: the original silently left self.keys undefined.
            raise ValueError(
                f"unknown split: {split!r} (expected 'train', 'test' or 'valid')")
        self.len = len(self.keys)

    def __getitem__(self, index):
        """Return one conversation as a 7-tuple of tensors/metadata."""
        conv = self.keys[index]
        return (torch.LongTensor(self.InputSequence[conv]),      # [num_seq, seq_len]
                # One-hot speaker per utterance; Speakers[conv] is a '0'/'1' string.
                torch.FloatTensor([[1, 0] if x == '0' else [0, 1]
                                   for x in self.Speakers[conv]]),  # [num_seq, num_party]
                torch.FloatTensor([1] * len(self.ActLabels[conv])),  # utterance mask [num_seq]
                torch.LongTensor(self.ActLabels[conv]),          # [num_seq]
                torch.LongTensor(self.EmotionLabels[conv]),      # [num_seq]
                self.InputMaxSequenceLength[conv],               # scalar
                conv)                                            # conversation id

    def __len__(self):
        return self.len
# Load the validation split of DailyDialog from the pickled archive.
path = './data/dailydialog/daily_dialogue.pkl'
split = 'valid'
data = DailyDialogueDataset(split=split, path=path)
# data[0][0].shape -> (num_seq, seq_len) of the first conversation
2. Building the DataLoader
class DailyDialoguePadCollate(object):
    """Collate function that pads variable-length dialogues into one batch.

    Batch columns (per DailyDialogueDataset.__getitem__):
      0 InputSequence [num_seq, seq_len], 1 Speakers [num_seq, num_party],
      2 umask [num_seq], 3 ActLabels [num_seq], 4 EmotionLabels [num_seq],
      5 max sequence length (int), 6 conversation id.
    """

    def __init__(self, dim=0):
        self.dim = dim  # dimension (num_seq) along which dialogues are padded

    def __call__(self, batch):
        # DataFrame groups the list of sample tuples into columns.
        dat = pd.DataFrame(batch)
        out = []
        for i in dat:
            col = dat[i].tolist()
            if i == 0:
                # [batch, max_num_seq, seq_len] -> [max_num_seq, batch, seq_len]
                out.append(self.pad_collate(col).transpose(1, 0).contiguous())
            elif i == 1:
                out.append(pad_sequence(col))        # [max_num_seq, batch, num_party]
            elif i < 5:
                out.append(pad_sequence(col, True))  # batch-first: [batch, max_num_seq]
            else:
                out.append(col)                      # plain python lists, untouched
        return out

    def pad_collate(self, batch):
        """Pad every tensor in `batch` to the max size along self.dim, then stack."""
        max_len = max(x.shape[self.dim] for x in batch)
        padded = [self.pad_tensor(x, pad=max_len, dim=self.dim) for x in batch]
        return torch.stack(padded, dim=0)

    def pad_tensor(self, vec, pad, dim):
        """Zero-pad `vec` along `dim` up to length `pad`."""
        pad_size = list(vec.shape)
        pad_size[dim] = pad - vec.size(dim)
        # new_zeros keeps vec's dtype/device; the original forced LongTensor,
        # which only worked because column 0 is always a LongTensor.
        return torch.cat([vec, vec.new_zeros(pad_size)], dim=dim)
# Build the DataLoader with the custom padding collate function.
# (DataLoader comes from the file's top-level torch.utils.data import.)
dataloader = DataLoader(dataset=data, batch_size=28,
                        collate_fn=DailyDialoguePadCollate(0))
# Materialize the first batch once; the original rebuilt list(dataloader)
# for every single print, iterating the whole dataset five times.
first_batch = next(iter(dataloader))
print('InputSequence:', first_batch[0].shape)
print('Speakers:', first_batch[1].shape)
print('umask:', first_batch[2].shape)
print('ActLabels:', first_batch[3].shape)
print('EmotionLabels:', first_batch[4].shape)
Output:
InputSequence: torch.Size([15, 28, 250])
Speakers: torch.Size([15, 28, 2])
umask: torch.Size([28, 15])
ActLabels: torch.Size([28, 15])
EmotionLabels: torch.Size([28, 15])
3. A small collate_fn demo
# padding example
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from torch.utils.data import DataLoader,Dataset
batch_size = 8          # kept for parity with the original demo (not used below)
embedding_dim = 100
max_len1 = 30
max_len2 = 40
max_len3 = 20
max_len4 = 9
# Four variable-length sequences; to batch them together they must be
# padded to a common length along dim 0.
data1 = torch.randn([max_len1, embedding_dim])
data2 = torch.randn([max_len2, embedding_dim])
data3 = torch.randn([max_len3, embedding_dim])
data4 = torch.randn([max_len4, embedding_dim])
# A plain list holds the ragged tensors; np.array over sequences of unequal
# length raises ValueError on NumPy >= 1.24, and the downstream Dataset only
# needs len() and indexing anyway.
datas = [data1, data2, data3, data4]
# Build the dataset
class TestDataset(Dataset):
    """Trivial dataset wrapping an indexable collection of samples."""

    def __init__(self, data):
        super(TestDataset, self).__init__()
        # Bug fix: the original assigned the module-level global `datas`
        # (and read it again in __getitem__), silently ignoring `data`.
        self.datas = data

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, i):
        return self.datas[i]
class TestPadCollate(object):
    """Collate function that zero-pads each sample along `dim` to the batch max."""

    def __init__(self, dim=0):
        self.dim = dim  # dimension to pad

    def __call__(self, batches):
        # Longest sample in the batch along the padding dimension.
        max_len = max(batch.shape[self.dim] for batch in batches)
        # Pad every sample up to max_len, then stack into one tensor.
        batches = [self.pad_tensor(batch, self.dim, max_len) for batch in batches]
        return torch.stack(batches, dim=self.dim)

    def pad_tensor(self, vec, dim, pad):
        """Zero-pad `vec` along `dim` so its size there becomes `pad`.

        vec: tensor to pad; dim: dimension to pad; pad: target size.
        """
        pad_size = list(vec.shape)
        pad_size[dim] = pad - pad_size[dim]
        # new_zeros preserves vec's dtype and device; the original cast the
        # padding to LongTensor and relied on torch.cat's type promotion
        # to get a float result for float inputs.
        return torch.cat([vec, vec.new_zeros(pad_size)], dim=dim)
# Wire the demo dataset and the padding collate function together.
testDataset = TestDataset(datas)
dataloader = DataLoader(dataset=testDataset, batch_size=2,
                        collate_fn=TestPadCollate(0))
# Iterate the loader directly; the original wrapped it in a throwaway list().
print('shape:', [batch.shape for batch in dataloader])
Output:
shape: [torch.Size([2, 40, 100]), torch.Size([2, 20, 100])]