小黑维度逐行分析与调试:dataloader.py

1.dataset

import os
import pickle
import re

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class DailyDialogueDataset(Dataset):
    """Conversation-level dataset over a pre-processed DailyDialog pickle.

    The pickle holds eight objects: per-conversation speaker strings,
    token-id input sequences, max sequence lengths, act labels, emotion
    labels, and the train/test/valid conversation-id partitions.
    """

    def __init__(self, split, path):
        """
        split: one of 'train' | 'test' | 'valid' — selects the id partition.
        path:  path to the pickle produced by the preprocessing step.

        Raises ValueError for an unknown split (the original silently left
        self.keys undefined and crashed later with an opaque AttributeError).
        """
        # NOTE(review): pickle.load is only safe on trusted files — do not
        # point this at externally supplied data.
        with open(path, 'rb') as f:  # context manager: the original leaked the handle
            (self.Speakers, self.InputSequence, self.InputMaxSequenceLength,
             self.ActLabels, self.EmotionLabels,
             self.trainId, self.testId, self.validId) = pickle.load(f)
        partitions = {'train': self.trainId,
                      'test': self.testId,
                      'valid': self.validId}
        if split not in partitions:
            raise ValueError(
                f"split must be 'train', 'test' or 'valid', got {split!r}")
        self.keys = list(partitions[split])
        self.len = len(self.keys)

    def __getitem__(self, index):
        """Return one conversation as a 7-tuple of tensors/metadata."""
        conv = self.keys[index]
        return (torch.LongTensor(self.InputSequence[conv]),           # [num_seq, seq_len]
                torch.FloatTensor([[1, 0] if x == '0' else [0, 1]
                                   for x in self.Speakers[conv]]),    # [num_seq, 2] one-hot speaker
                torch.FloatTensor([1] * len(self.ActLabels[conv])),   # [num_seq] utterance mask (all real)
                torch.LongTensor(self.ActLabels[conv]),               # [num_seq]
                torch.LongTensor(self.EmotionLabels[conv]),           # [num_seq]
                self.InputMaxSequenceLength[conv],                    # scalar: longest utterance
                conv)                                                 # conversation id

    def __len__(self):
        return self.len
# Smoke test: build the validation split from the pre-processed pickle.
# NOTE(review): relative path — assumes the working directory is the project root.
path = './data/dailydialog/daily_dialogue.pkl'
split = 'valid'
data = DailyDialogueDataset(split,path)
# data[0][0].shape -> [num_seq, seq_len] of the first validation conversation

2.dataloader构建

class DailyDialoguePadCollate(object):
    """collate_fn that pads a batch of variable-length conversations.

    Each sample is the 7-tuple produced by DailyDialogueDataset.__getitem__.
    With batch size B, longest conversation N and (uniform) utterance
    length L, the returned list holds:
      [0] input ids        -> [N, B, L]  (padded along num_seq, then transposed)
      [1] speaker one-hots -> [N, B, 2]  (pad_sequence, time-major)
      [2] utterance mask   -> [B, N]     (batch-first; 0 marks padding rows)
      [3] act labels       -> [B, N]
      [4] emotion labels   -> [B, N]
      [5] max utterance lengths -> list of length B
      [6] conversation ids      -> list of length B
    """

    def __init__(self, dim=0):
        # dim: axis padded for field 0 — the num_seq axis of [num_seq, seq_len]
        self.dim = dim

    def __call__(self, batch):
        # Transpose the batch: 7 per-sample fields -> 7 per-field tuples of
        # length B.  (The original routed this through a pandas DataFrame and
        # a nested-ternary comprehension; zip does the same transposition.)
        seqs, speakers, umask, acts, emotions, max_lens, convs = zip(*batch)
        return [
            self.pad_collate(seqs).transpose(1, 0).contiguous(),   # [N, B, L]
            pad_sequence(list(speakers)),                          # [N, B, 2]
            pad_sequence(list(umask), True),                       # [B, N]
            pad_sequence(list(acts), True),                        # [B, N]
            pad_sequence(list(emotions), True),                    # [B, N]
            list(max_lens),
            list(convs),
        ]

    def pad_collate(self, batch):
        """Pad every tensor in `batch` to the longest size along self.dim,
        then stack into one tensor with a new leading batch axis."""
        max_len = max(x.shape[self.dim] for x in batch)
        padded = [self.pad_tensor(x, pad=max_len, dim=self.dim) for x in batch]
        return torch.stack(padded, dim=0)

    def pad_tensor(self, vec, pad, dim):
        """Zero-pad `vec` to size `pad` along `dim`, preserving its dtype.

        The original hard-coded LongTensor zeros, which only worked because
        this path is applied solely to the Long input-id field; keeping
        vec.dtype makes the helper safe for any tensor dtype.
        """
        pad_size = list(vec.shape)
        pad_size[dim] = pad - vec.size(dim)
        return torch.cat([vec, torch.zeros(*pad_size, dtype=vec.dtype)], dim=dim)
# Build a DataLoader over the valid split using the custom collate_fn.
# NOTE(review): as this file is ordered, DataLoader is only imported in
# section 3 (further down) — this line raises NameError unless the import
# is moved above it.
dataloader = DataLoader(dataset = data,batch_size = 28,collate_fn=DailyDialoguePadCollate(0))
# NOTE(review): each list(dataloader) call below re-iterates and re-collates
# every batch — materialize it once (batches = list(dataloader)) instead.
print('InputSequence:',list(dataloader)[0][0].shape)
print('Speakers:',list(dataloader)[0][1].shape)
print('umask:',list(dataloader)[0][2].shape)
print('ActLabels:',list(dataloader)[0][3].shape)
print('EmotionLabels:',list(dataloader)[0][4].shape)

输出:

InputSequence: torch.Size([15, 28, 250])
Speakers: torch.Size([15, 28, 2])
umask: torch.Size([28, 15])
ActLabels: torch.Size([28, 15])
EmotionLabels: torch.Size([28, 15])

3.collate_fn的小demo

# pad举例
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from torch.utils.data import DataLoader,Dataset
batch_size = 8
embedding_dim = 100
max_len1 = 30
max_len2 = 40
max_len3 = 20
max_len4 = 9
# Pack data1..data4 into batches; each sample has a different length along
# dim 0, so every batch must be padded to its own max length.
data1 = torch.randn([max_len1, embedding_dim])
data2 = torch.randn([max_len2, embedding_dim])
data3 = torch.randn([max_len3, embedding_dim])
data4 = torch.randn([max_len4, embedding_dim])
# dtype=object is required for a ragged collection of tensors: NumPy >= 1.24
# refuses to build an object array implicitly and raises instead.
datas = np.array([data1, data2, data3, data4], dtype=object)

# 构建dataset
class TestDataset(Dataset):
    """Thin Dataset wrapper over an in-memory sequence of samples."""

    def __init__(self, data):
        super(TestDataset, self).__init__()
        # Bug fix: the original assigned and indexed the *global* `datas`,
        # silently ignoring the `data` argument — bind the argument instead.
        self.datas = data

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, i):
        return self.datas[i]
class TestPadCollate(object):
    """collate_fn that zero-pads each sample to the batch max along one axis.

    Given B tensors whose sizes differ only along `dim`, returns a single
    tensor of shape [B, ...] with every sample padded to the longest.
    """

    def __init__(self, dim=0):
        self.dim = dim  # axis along which samples have ragged lengths

    def __call__(self, batches):
        target = max(sample.shape[self.dim] for sample in batches)
        padded = [self.pad_tensor(sample, self.dim, target) for sample in batches]
        # Stack along a NEW leading batch axis.  The original stacked along
        # self.dim, which only meant "batch axis" by coincidence of dim == 0.
        return torch.stack(padded, dim=0)

    def pad_tensor(self, vec, dim, pad):
        """Zero-pad `vec` up to size `pad` along `dim`, preserving dtype.

        The original concatenated LongTensor zeros onto float data and
        leaned on implicit type promotion; matching vec.dtype avoids both
        the promotion and any dtype surprise for integer inputs.
        """
        pad_size = list(vec.shape)
        pad_size[dim] = pad - pad_size[dim]
        return torch.cat([vec, torch.zeros(pad_size, dtype=vec.dtype)], dim=dim)
# Sanity check: batch_size=2 groups (data1, data2) then (data3, data4);
# each batch is padded to its own max length (40 and 20 respectively),
# giving shapes [2, 40, 100] and [2, 20, 100].
testDataset = TestDataset(datas)
dataloader = DataLoader(dataset=testDataset,batch_size = 2,collate_fn=TestPadCollate(0))
print('shape:',[data.shape for data in list(dataloader)])

输出:

shape: [torch.Size([2, 40, 100]), torch.Size([2, 20, 100])]

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值