How to call the BPE tokenizer:
from tokenizers import Tokenizer, models, trainers

# Placeholder inputs so the snippet runs standalone; replace with real values.
name, item, vocab_size = 'new_data', 'source', 80
training_data = ['C C O . C C ( = O ) O', 'C C N']  # iterable of raw strings
sample = training_data[0]

tokenize_path = '/root/BPE/vocab/' + name + '_' + item + '_bpe_tokenizer.json'
# Create a BPE tokenizer object
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
                              vocab_size=vocab_size)
tokenizer.train_from_iterator(training_data, trainer)
# Save the trained tokenizer
print(tokenize_path)
tokenizer.save(tokenize_path)
# Load the trained tokenizer back and encode one sample
tokenizer = Tokenizer.from_file(tokenize_path)
output = tokenizer.encode(sample)
tokens = output.tokens
print(tokens)
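Note that no pre_tokenizer is attached above, so BPE merges can cross whitespace. If the corpus is space-separated (as the reaction and protein sequences here appear to be), attaching a Whitespace pre-tokenizer keeps merges inside individual tokens; a minimal sketch on a toy corpus:

from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = Whitespace()  # split into whitespace-delimited pieces before learning merges
trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[UNK]"], vocab_size=80)
tokenizer.train_from_iterator(['C C O . C C ( = O ) O', 'C C N'], trainer)  # toy data for illustration
print(tokenizer.encode('C C O').tokens)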
Tokenizing the data with BPE:
from tokenizers import Tokenizer, models, trainers
import pandas as pd
import pickle
def read_file(path):
    temp = pd.read_csv(path)
    rxn_seq = temp['source'].tolist()      # max_len = 3519 ind = 12
    protein_seq = temp['target'].tolist()  # max_len = 4369 ind = 346
    return rxn_seq, protein_seq
def BPE(path, vocab_size=80):
    name = path.split('.')[0].split('/')[-1]
    rxn_seq, protein_seq = read_file(path)
    temp_list = ['source', 'target']
    for item in temp_list:
        tokenize_path = '/root/BPE/vocab/' + name + '_' + item + '_bpe_tokenizer.json'
        # Train one BPE model per column
        if item == 'source':
            training_data = rxn_seq
        else:
            vocab_size = 1000  # larger vocabulary for the protein sequences
            training_data = protein_seq
        # Create a BPE tokenizer object
        tokenizer = Tokenizer(models.BPE())
        trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
                                      vocab_size=vocab_size)
        tokenizer.train_from_iterator(training_data, trainer)
        # Save the trained tokenizer
        print(tokenize_path)
        tokenizer.save(tokenize_path)
    # Note: only the last (target) tokenizer and path are returned
    return tokenizer, tokenize_path
def encode(path, tokenize_path):
    # Only the basename of tokenize_path is used; the vocab files are always
    # looked up under /root/BPE/vocab/
    name_1 = tokenize_path.split('/')[-1]
    name_2 = path.split('.')[0].split('/')[-1]
    rxn_seq, protein_seq = read_file(path)
    temp_list = ['source', 'target']
    for item in temp_list:
        tokenize_path = '/root/BPE/vocab/' + name_1 + '_' + item + '_bpe_tokenizer.json'
        if item == 'source':
            tokenized_source = []
            # Load the trained tokenizer
            tokenizer = Tokenizer.from_file(tokenize_path)
            for sample in rxn_seq:
                output = tokenizer.encode(sample)
                tokens = output.tokens
                tokenized_source.append(tokens)
        if item == 'target':
            tokenized_target = []
            # Load the trained tokenizer
            tokenizer = Tokenizer.from_file(tokenize_path)
            for sample in protein_seq:
                output = tokenizer.encode(sample)
                tokens = output.tokens
                tokenized_target.append(tokens)
    data = list(zip(tokenized_source, tokenized_target))
    # Path of the output file
    file_path = '/root/BPE/tokenized/' + name_2 + '_tokenized.pkl'
    # Save the data to disk with pickle.dump
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)
    return data
############### Test example #####################
# # Run BPE encoding on one sequence (seq must be defined)
# encoded = tokenizer.encode(seq[-2])
# # Print the tokenized result
# print(len(seq[-2]))
# print(len(encoded.tokens))
# # Get the encoded token IDs
# token_ids = encoded.ids
# # Get the encoded token strings
# token_strings = encoded.tokens
# # Print the results
# print("Token IDs:", token_ids)
# print("Token Strings:", token_strings)
if __name__ == '__main__':
    path = '/root/BPE/data/new_data.csv'
    train = '/root/BPE/data/new_train.csv'
    valid = '/root/BPE/data/new_valid.csv'
    test = '/root/BPE/data/new_test.csv'
    # tokenizer, tokenize_path = BPE(path)
    # Only the basename 'new_data' matters here; it selects the vocab files
    # saved by BPE() under /root/BPE/vocab/
    tokenize_path = '/root/BPE/tokenized/new_data'
    data = encode(train, tokenize_path)
    data = encode(valid, tokenize_path)
    data = encode(test, tokenize_path)
    print(data[0:5])
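The pickles store token strings rather than token IDs (output.tokens, not output.ids), because the next step rebuilds its own vocabulary with torchtext and adds <sos>/<eos>/<pad> entries there. A quick sanity check of one saved file, using the paths above:

import pickle
with open('/root/BPE/tokenized/new_train_tokenized.pkl', 'rb') as f:
    data = pickle.load(f)
print(len(data))             # number of (source, target) pairs
src_tokens, tgt_tokens = data[0]
print(src_tokens[:10], tgt_tokens[:10])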
Wrapping the tokenized data with torchtext:
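This step uses the torchtext.legacy namespace, which exists only in torchtext 0.9.x through 0.11.x (it was removed in 0.12), so an older release must be pinned. One example pin (0.11.2 is paired with torch 1.10.2):

pip install torch==1.10.2 torchtext==0.11.2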
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data.distributed import DistributedSampler
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Example, Dataset
import numpy as np
import random, math, time, datetime, os
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import torch.distributed as dist
import socket
import pickle
from embedding import BPE
world_size = torch.cuda.device_count()  # number of GPUs
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
train_path = '/root/BPE/tokenized/new_train_tokenized.pkl'
with open(train_path, 'rb') as file:
    tokenized_train = pickle.load(file)
# train_path = '/root/BPE/data/train.csv'
# tokenized_train = BPE(train_path, Pretrain=True)
valid_path = '/root/BPE/tokenized/new_valid_tokenized.pkl'
with open(valid_path, 'rb') as file:
    tokenized_valid = pickle.load(file)
# valid_path = '/root/BPE/data/valid.csv'
# tokenized_valid = BPE(valid_path, Pretrain=True)
test_path = '/root/BPE/tokenized/new_test_tokenized.pkl'
with open(test_path, 'rb') as file:
    tokenized_test = pickle.load(file)
# test_path = '/root/BPE/data/test.csv'
# tokenized_test = BPE(test_path, Pretrain=True)
max_sequence_length = 500  # every example is padded/truncated to this length (fix_length)
source_field = Field(init_token='<sos>', eos_token='<eos>',
                     lower=False, batch_first=True, fix_length=max_sequence_length)
target_field = Field(init_token='<sos>', eos_token='<eos>',
                     lower=False, batch_first=True, fix_length=max_sequence_length)
fields = [('source', source_field), ('target', target_field)]
# Each pickled example is already a list of BPE tokens, so Field skips its own tokenization
train_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_train]
train_data = Dataset(train_examples, fields)
valid_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_valid]
valid_data = Dataset(valid_examples, fields)
test_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_test]
test_data = Dataset(test_examples, fields)
# Build vocabulary (on the training split only; tokens rarer than min_freq map to <unk>)
source_field.build_vocab(train_data, min_freq=2)
target_field.build_vocab(train_data, min_freq=2)
SRC_PAD_IDX = source_field.vocab.stoi[source_field.pad_token]  # pad index, e.g. for masking the loss
TRG_PAD_IDX = target_field.vocab.stoi[target_field.pad_token]
# Create BucketIterators for train, validation, and test sets
batch_size = 2
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_key=lambda x: len(x.source),  # bucket by source length to minimize padding
    sort_within_batch=True,
    shuffle=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
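A quick way to confirm the batch shapes, assuming the fields above (batch_first=True, fix_length=500, batch_size=2):

batch = next(iter(train_iterator))
print(batch.source.shape)  # torch.Size([2, 500]) -> [batch_size, max_sequence_length]
print(batch.target.shape)
print(SRC_PAD_IDX, TRG_PAD_IDX)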