Method 1: load local files through the built-in dataset format (each example comes back as a dict)
from paddlenlp.datasets import load_dataset

# The COTE_* flags are toggles (set earlier) that select which dataset to load.
if COTE_DP == 1:
    train_ds, test_ds = load_dataset("cote", "dp", splits=["train", "test"])
if COTE_BD == 1:
    # Reuse the "cote"/"dp" reader for local files; COTE-BD shares the same TSV format.
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/COTE_BD/train.tsv"})
    test_ds = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_BD/test.tsv"})
if COTE_MFW == 1:
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/COTE_MFW/train.tsv"})
    test_ds = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_MFW/test.tsv"})
if COTE_ENLARGE == 1:
    # Train on the merged (enlarged) file, then evaluate each test set separately.
    train_ds = load_dataset("cote", "dp", data_files={"train": "Datasets/train_enlarge_cote.tsv"})
    # COTE_BD
    test_ds_cote_bd = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_BD/test.tsv"})
    # COTE_MFW
    test_ds_cote_mfw = load_dataset("cote", "dp", data_files={"test": "Datasets/COTE_MFW/test.tsv"})
    # COTE_DP
    test_ds_cote_dp = load_dataset("cote", "dp", splits="test")
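# Whichever branch runs, each split behaves like a list of dicts. A hedged sanity
# check (the exact field names depend on the cote reader and PaddleNLP version):
print(len(train_ds))
print(train_ds[0])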
Method 2: build the datasets by hand with paddle.io.Dataset
from paddle.io import Dataset

class BaseDataset(Dataset):
    def __init__(self, data, is_test=False):
        self._data = data
        self._is_test = is_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        example = {}
        samples = self._data[idx].split('\t')
        if self._is_test:
            # Test lines carry a query id instead of a label.
            qid = samples[-2]
            label = ''
            text = samples[-1]
        else:
            qid = ''
            label = int(samples[-2])
            text = samples[-1]
        example['text'] = text
        example['label'] = label
        example['qid'] = qid
        return example
def open_func(file_path):
    # Read a TSV file, skip the header line, and keep only lines with at least two fields.
    samples = []
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f.readlines()[1:]:
            if len(line.strip().split('\t')) >= 2:
                samples.append(line.strip())
    return samples
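# A minimal usage sketch for the two helpers above; the file path below is an
# assumption, not a path the original fixes.
from paddlenlp.datasets import MapDataset

train_lines = open_func('Datasets/train.tsv')
train_base = BaseDataset(train_lines, is_test=False)
train_mapds = MapDataset(train_base)  # MapDataset wraps any map-style dataset
print(train_mapds[0])                 # {'text': ..., 'label': ..., 'qid': ''}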
# Define the dataset (text-pair classification version)
from paddlenlp.datasets import MapDataset
from paddle.io import Dataset, DataLoader, Subset
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]
# token_type_ids are returned alongside input_ids
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        samples = self._data[idx].split('\t')
        text_a = samples[-2]
        text_b = samples[-1]
        encoder_out = self._tokenizer.encode(text_a, text_b, max_seq_len=self._max_len)
        text = encoder_out['input_ids']
        token_type = encoder_out['token_type_ids']
        if self._for_test:
            return np.array(text, dtype='int64'), np.array(token_type, dtype='int64')
        else:
            # Only train/dev lines carry a numeric label (third-to-last field),
            # so parse it here rather than unconditionally.
            label = int(samples[-3])
            return np.array(text, dtype='int64'), np.array(token_type, dtype='int64'), np.array(label, dtype='int64')
def batchify_fn(for_test=False):
    # Note: relies on a `tokenizer` object defined in the enclosing scope.
    if for_test:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=tokenizer.pad_token_type_id)): [data for data in fn(samples)]
    else:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
                                        Stack()): [data for data in fn(samples)]
def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0):
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    if not for_test:
        # 5-fold split: fold k is held out for validation, the rest is used for training.
        train_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 5 != k])
        dev_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 5 == k])
        train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=True)
        # The held-out fold does not need shuffling.
        dev_loader = DataLoader(dataset=dev_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=False)
        return train_loader, dev_loader
    else:
        test_loader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=False)
        return test_loader
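# Minimal usage sketch for the classification pipeline; 'ernie-1.0' and the data
# path are assumptions, not choices fixed by the original code.
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
data = open_func('Datasets/train.tsv')
train_loader, dev_loader = get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0)
input_ids, token_type_ids, labels = next(iter(train_loader))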
# Build train/dev subsets by index (kept for reference):
# train_ds = Subset(dataset=baseset, indices=[i for i in range(len(baseset)) if i % 5 != 1])
# dev_ds = Subset(dataset=baseset, indices=[i for i in range(len(baseset)) if i % 5 == 1])
# Define the dataset (BIO sequence-labeling version for COTE)
from paddle.io import Subset, Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = {'B': 0, 'I': 1, 'O': 2}
index2label = {0: 'B', 1: 'I', 2: 'O'}
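# To make the tagging scheme concrete: the first token of the opinion-target span
# gets B, the rest of the span gets I, everything else (including [CLS]/[SEP])
# gets O. A made-up illustration, not a line from the dataset:
#   text  = "我觉得点评网很好用", label = "点评网"
#   tokens: [CLS] 我 觉 得 点 评 网 很 好 用 [SEP]
#   tags:     O   O  O  O  B  I  I  O  O  O   O   -> ids [2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2]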
# Sequence-labeling dataset: aligns BIO tag ids with the subword tokens
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        samples = self._data[idx].split('\t')
        label = samples[-2]
        text = samples[-1]
        if self._for_test:
            origin_enc = self._tokenizer.encode(text, max_seq_len=self._max_len)['input_ids']
            return np.array(origin_enc, dtype='int64')
        else:
            # Characters do not map one-to-one onto tokens, so use a simple trick:
            # encode the label span first, then encode the text segments around it,
            # and splice everything back together so the BIO tags stay aligned.
            segments = text.split(label)
            label_enc = self._tokenizer.encode(label)['input_ids']
            cls_enc = label_enc[0]
            sep_enc = label_enc[-1]
            label_enc = label_enc[1:-1]
            # Splice the plain segments and the label spans together.
            origin_enc = []
            label_ids = []
            for index, segment in enumerate(segments):
                segment_enc = self._tokenizer.encode(segment)['input_ids'][1:-1]
                origin_enc += segment_enc
                label_ids += [label_list['O']] * len(segment_enc)
                if index != len(segments) - 1:
                    origin_enc += label_enc
                    label_ids += [label_list['B']] + [label_list['I']] * (len(label_enc) - 1)
            origin_enc = [cls_enc] + origin_enc + [sep_enc]
            label_ids = [label_list['O']] + label_ids + [label_list['O']]
            # Truncate while keeping the final [SEP] token (and its 'O' tag).
            if len(origin_enc) > self._max_len:
                origin_enc = origin_enc[:self._max_len-1] + origin_enc[-1:]
                label_ids = label_ids[:self._max_len-1] + label_ids[-1:]
            return np.array(origin_enc, dtype='int64'), np.array(label_ids, dtype='int64')
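# Hedged sketch: round-trip one training example to verify the alignment trick.
# 'ernie-1.0' and the file path are assumptions.
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
ds = MyDataset(open_func('Datasets/COTE_DP/train.tsv'), tokenizer, max_len=512, for_test=False)
input_ids, label_ids = ds[0]
print(tokenizer.convert_ids_to_tokens(input_ids.tolist()))
print([index2label[i] for i in label_ids.tolist()])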
def batchify_fn(for_test=False):
    if for_test:
        # Pad already returns the whole batch as one padded array.
        return lambda samples, fn=Pad(axis=0, pad_val=tokenizer.pad_token_id): fn(samples)
    else:
        # Label sequences are padded with the 'O' tag id.
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=label_list['O'])): [data for data in fn(samples)]
def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0):
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    if not for_test:
        # 10-fold split: fold k is held out for validation, the rest is used for training.
        train_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 10 != k])
        dev_ds = Subset(dataset=dataset, indices=[i for i in range(len(dataset)) if i % 10 == k])
        train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=True)
        # The held-out fold does not need shuffling.
        dev_loader = DataLoader(dataset=dev_ds, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=False)
        return train_loader, dev_loader
    else:
        test_loader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=False)
        return test_loader
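# Minimal usage sketch for the sequence-labeling pipeline; 'ernie-1.0' and the
# file path are assumptions.
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
data = open_func('Datasets/COTE_BD/train.tsv')
train_loader, dev_loader = get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False, k=0)
input_ids, label_ids = next(iter(train_loader))
print(input_ids.shape, label_ids.shape)  # both (batch_size, padded_seq_len)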