PaddlePaddle: training with local data

Method 1: load local files through a built-in dataset format (each example is returned as a dict)

from paddlenlp.datasets import load_dataset

# COTE_DP / COTE_BD / COTE_MFW / COTE_ENLARGE are 0/1 switches (set earlier) that pick the data source.
# Passing data_files reuses the built-in 'dp' reader to parse the local tsv files.
if COTE_DP == 1:
    train_ds, test_ds = load_dataset("cote", "dp", splits=["train", "test"])
if COTE_BD == 1:
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/COTE_BD/train.tsv"})
    test_ds = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_BD/test.tsv"})
if COTE_MFW == 1:
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/COTE_MFW/train.tsv"})
    test_ds = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_MFW/test.tsv"})
if COTE_ENLARGE == 1:
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/train_enlarge_cote.tsv"})
    # COTE_BD
    test_ds_cote_bd = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_BD/test.tsv"})
    # COTE_MFW
    test_ds_cote_mfw = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_MFW/test.tsv"})
    # COTE_DP
    test_ds_cote_dp = load_dataset("cote", "dp", splits="test")
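A quick sanity check after loading (the field names in each example come from the built-in 'dp' reader, so print one item before writing any preprocessing code):

# Each loaded dataset is a MapDataset whose items are dicts produced by the 'dp' reader.
print(len(train_ds))
print(train_ds[0])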

Method 2: a custom Dataset plus DataLoader

from paddle.io import Dataset

class BaseDataset(Dataset):
    def __init__(self, data, is_test=False):
        self._data = data
        self._is_test = is_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        # Each raw sample is a tab-separated line; the last column is the text,
        # and the second-to-last column is the qid (test) or the integer label (train).
        example = {}
        samples = self._data[idx].split('\t')
        if self._is_test:
            qid = samples[-2]
            label = ''
            text = samples[-1]
        else:
            qid = ''
            label = int(samples[-2])
            text = samples[-1]

        example['text'] = text
        example['label'] = label
        example['qid'] = qid
        return example

def open_func(file_path):
    # Read a tsv file, skip the header line, and keep only lines with at least two columns.
    samples = []
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f.readlines()[1:]:
            if len(line.strip().split('\t')) >= 2:
                samples.append(line.strip())
    return samples
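
A minimal usage sketch (the path below is a placeholder; BaseDataset expects the second-to-last tab-separated column to be an integer label for training data):

from paddlenlp.datasets import MapDataset

train_samples = open_func('Datasets/train.tsv')       # placeholder path
train_ds = MapDataset(BaseDataset(train_samples))     # wrap so PaddleNLP utilities accept it
print(train_ds[0])                                    # e.g. {'text': '...', 'label': 1, 'qid': ''}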
# Define the dataset (text-pair classification)
from paddlenlp.datasets import MapDataset
from paddle.io import Dataset, DataLoader, Subset
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]

# Keeps token_type_ids alongside input_ids (text-pair input)
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test
    
    def __len__(self):
        return len(self._data)
    
    def __getitem__(self, idx):
        samples = self._data[idx].split('\t')
        # Columns (from the right): text_b, text_a, label.
        label = samples[-3]
        text_b = samples[-1]
        text_a = samples[-2]
        label = int(label)
        encoder_out = self._tokenizer.encode(text_a, text_b, max_seq_len=self._max_len)
        text = encoder_out['input_ids']
        token_type = encoder_out['token_type_ids']
        if self._for_test:
            return np.array(text, dtype='int64'), np.array(token_type, dtype='int64')
        else:
            return np.array(text, dtype='int64'), np.array(token_type, dtype='int64'), np.array(label, dtype='int64')

# Note: the pad_val arguments read a module-level `tokenizer`, so create it before building the loaders.
def batchify_fn(for_test=False):
    if for_test:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=tokenizer.pad_token_type_id)): [data for data in fn(samples)]
    else:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
                                        Stack()): [data for data in fn(samples)]


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0):
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    if not for_test:
        shuffle = True
        # Simple 5-fold split by index: fold k is held out as the dev set.
        train_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 5 != k])
        dev_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 5 == k])
        train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        dev_loader = DataLoader(dataset=dev_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        return train_loader, dev_loader
    else:
        shuffle = False
        test_loader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        return test_loader
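
A sketch of wiring these pieces together, assuming a tsv with label, text_a and text_b columns (the model name and file path are placeholders; any PaddleNLP tokenizer exposing pad_token_id and pad_token_type_id should work with batchify_fn):

from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')   # batchify_fn reads this module-level name
data = open_func('Datasets/train.tsv')                    # placeholder path
train_loader, dev_loader = get_data_loader(data, tokenizer, batch_size=32, max_len=256, for_test=False, k=0)
for input_ids, token_type_ids, labels in train_loader:
    print(input_ids.shape, token_type_ids.shape, labels.shape)
    break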


# # Build subsets by index
# train_ds = Subset(dataset=baseset, indices=[i for i in range(len(baseset)) if i % 5 != 1])
# dev_ds = Subset(dataset=baseset, indices=[i for i in range(len(baseset)) if i % 5 == 1])
# Define the dataset (opinion target extraction with BIO tags).
# Note: this section redefines MyDataset, batchify_fn and get_data_loader for the extraction task.
from paddle.io import Subset, Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = {'B': 0, 'I': 1, 'O': 2}
index2label = {0: 'B', 1: 'I', 2: 'O'}

# Character-level BIO tagging; only input_ids are returned (single-text input, no token_type_ids)
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test
    
    def __len__(self):
        return len(self._data)
    
    def __getitem__(self, idx):
        samples = self._data[idx].split('\t')
        # Columns (from the right): text, opinion target (the "label" string to be tagged).
        label = samples[-2]
        text = samples[-1]
        if self._for_test:
            origin_enc = self._tokenizer.encode(text, max_seq_len=self._max_len)['input_ids']
            return np.array(origin_enc, dtype='int64')
        else:
            
            # Not every character maps to exactly one token, so use a simple workaround:
            # encode the target (label) on its own, encode the text pieces around each
            # occurrence of the target, then splice everything back together.
            texts = text.split(label)
            label_enc = self._tokenizer.encode(label)['input_ids']
            cls_enc = label_enc[0]
            sep_enc = label_enc[-1]
            label_enc = label_enc[1:-1]
            
            # Splice the pieces back together and build the BIO tag sequence
            origin_enc = []
            label_ids = []
            for index, text in enumerate(texts):
                text_enc = self._tokenizer.encode(text)['input_ids']
                text_enc = text_enc[1:-1]
                origin_enc += text_enc
                label_ids += [label_list['O']] * len(text_enc)
                if index != len(texts) - 1:
                    origin_enc += label_enc
                    label_ids += [label_list['B']] + [label_list['I']] * (len(label_enc) - 1)

            origin_enc = [cls_enc] + origin_enc + [sep_enc]
            label_ids = [label_list['O']] + label_ids + [label_list['O']]
            
            # Truncate to max_len while keeping the final [SEP] token
            if len(origin_enc) > self._max_len:
                origin_enc = origin_enc[:self._max_len-1] + origin_enc[-1:]
                label_ids = label_ids[:self._max_len-1] + label_ids[-1:]
            return np.array(origin_enc, dtype='int64'), np.array(label_ids, dtype='int64')


def batchify_fn(for_test=False):
    if for_test:
        return lambda samples, fn=Pad(axis=0, pad_val=tokenizer.pad_token_id): np.vstack([data for data in fn(samples)])
    else:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=label_list['O'])): [data for data in fn(samples)]


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0):
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    if not for_test:
        shuffle = True
        # 10-fold split by index: fold k (about 10% of the data) is held out as the dev set.
        train_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 10 != k])
        dev_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 10 == k])
        train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        dev_loader = DataLoader(dataset=dev_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        return train_loader, dev_loader
    else:
        shuffle = False
        test_loader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
        return test_loader
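
A corresponding sketch for the extraction task (again with placeholder names; index2label is only shown to indicate how predictions would be decoded):

from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
data = open_func('Datasets/COTE_BD/train.tsv')
train_loader, dev_loader = get_data_loader(data, tokenizer, batch_size=16, max_len=256, for_test=False, k=0)
for input_ids, label_ids in train_loader:
    print(input_ids.shape, label_ids.shape)   # both [batch, seq_len], padded to the longest sample in the batch
    break

# At inference time, take the argmax over the token-classification logits and map the
# indices back to BIO tags, e.g. tags = [index2label[i] for i in pred_ids].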