Using BPE (code)

How to call BPE:

from tokenizers import Tokenizer, models, trainers, processors, pre_tokenizers

# name, item, vocab_size, training_data and sample are defined in the full script below
tokenize_path = '/root/BPE/vocab/' + name + '_' + item + '_bpe_tokenizer.json'

# Create a BPE tokenizer object and train it on an iterable of strings
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"], vocab_size=vocab_size)
tokenizer.train_from_iterator(training_data, trainer)
# Save the trained tokenizer
print(f'{tokenize_path}')
tokenizer.save(tokenize_path)

# Load the trained tokenizer and encode a single sample
tokenizer = Tokenizer.from_file(tokenize_path)
output = tokenizer.encode(sample)
tokens = output.tokens
print(tokens)
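
To make the call pattern above concrete, here is a minimal, self-contained sketch that trains a BPE tokenizer on a few toy strings and encodes one of them. The toy corpus and the small vocab_size are placeholders for illustration only, not the reaction/protein data used in this post:

from tokenizers import Tokenizer, models, trainers

# Toy training corpus (placeholder strings, not the real reaction/protein data)
training_data = ["ABABCABAB", "ABCCCABAB", "CABABABCC"]

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
    vocab_size=30,  # tiny vocabulary, enough for the toy alphabet plus a few merges
)
tokenizer.train_from_iterator(training_data, trainer)

output = tokenizer.encode(training_data[0])
print(output.tokens)  # merged subword tokens
print(output.ids)     # their integer ids in the learned vocabulary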

Tokenizing the data with BPE:

from tokenizers import Tokenizer, models, trainers, processors, pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace
import pandas as pd
import pickle


def read_file(path):
    temp = pd.read_csv(path)
    rxn_seq = temp['source'].tolist()  # max_len = 3519   ind = 12
    protein_seq = temp['target'].tolist()   # max_len = 4369   ind = 346
    return rxn_seq, protein_seq

def BPE(path, vocab_size=80):
    name = path.split('.')[0].split('/')[-1]
    rxn_seq, protein_seq = read_file(path)

    temp_list = ['source', 'target']
    for item in temp_list:

        tokenize_path = '/root/BPE/vocab/' + name + '_' + item + '_bpe_tokenizer.json'

        # Pick the training corpus: reaction sequences for the source side,
        # protein sequences (with a larger vocabulary) for the target side
        if item == 'source':
            training_data = rxn_seq
        else:
            vocab_size = 1000
            training_data = protein_seq

        # Create a BPE tokenizer object and train it
        tokenizer = Tokenizer(models.BPE())
        trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"], vocab_size=vocab_size)
        tokenizer.train_from_iterator(training_data, trainer)
        # Save the trained tokenizer
        print(f'{tokenize_path}')
        tokenizer.save(tokenize_path)
    # Note: the tokenizer and path returned here belong to the last item trained ('target')
    return tokenizer, tokenize_path

def encode(path, tokenize_path):
    # Only the last path component of tokenize_path (the dataset name used when the
    # tokenizers were trained, e.g. 'new_data') is used to locate the saved vocab files
    name_1 = tokenize_path.split('/')[-1]
    name_2 = path.split('.')[0].split('/')[-1]
    rxn_seq, protein_seq = read_file(path)

    temp_list = ['source', 'target']
    for item in temp_list:

        tokenize_path = '/root/BPE/vocab/' + name_1 + '_' + item + '_bpe_tokenizer.json'
        if item == 'source':
            tokenized_source = []
            # Load the trained source tokenizer
            tokenizer = Tokenizer.from_file(tokenize_path)
            for sample in rxn_seq:
                output = tokenizer.encode(sample)
                tokens = output.tokens
                tokenized_source.append(tokens)

        if item == 'target':
            tokenized_target = []
            # Load the trained target tokenizer
            tokenizer = Tokenizer.from_file(tokenize_path)
            for sample in protein_seq:
                output = tokenizer.encode(sample)
                tokens = output.tokens
                tokenized_target.append(tokens)

    data = list(zip(tokenized_source, tokenized_target))

    # Path of the output file
    file_path = '/root/BPE/tokenized/' + name_2 + '_tokenized.pkl'

    # Save the tokenized pairs to disk with pickle.dump
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

    return data
############### Test example #####################
# # Run BPE encoding on one sequence
# encoded = tokenizer.encode(seq[-2])

# # Print the tokenized result
# print(len(seq[-2]))
# print(len(encoded.tokens))

# # Get the encoded token IDs
# token_ids = encoded.ids

# # Get the encoded token strings
# token_strings = encoded.tokens

# # Print the results
# print("Token IDs:", token_ids)
# print("Token Strings:", token_strings)

if __name__ == '__main__':
    path = '/root/BPE/data/new_data.csv'
    train = '/root/BPE/data/new_train.csv'
    valid = '/root/BPE/data/new_valid.csv'
    test = '/root/BPE/data/new_test.csv'
    # tokenizer, tokenize_path = BPE(path)  # train the tokenizers once on the full dataset
    tokenize_path = '/root/BPE/tokenized/new_data'  # dataset-name prefix used to locate the saved tokenizers
    data = encode(train, tokenize_path)
    data = encode(valid, tokenize_path)
    data = encode(test, tokenize_path)
    print(data[0:5])
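
As a quick sanity check, the pickle written by encode() can be loaded back and inspected. This is a minimal sketch assuming the new_train pickle produced above exists at the path shown:

import pickle

# Load the (source_tokens, target_tokens) pairs written by encode()
with open('/root/BPE/tokenized/new_train_tokenized.pkl', 'rb') as file:
    pairs = pickle.load(file)

print(len(pairs))                  # number of (source, target) pairs
source_tokens, target_tokens = pairs[0]
print(source_tokens[:10])          # first BPE tokens of a reaction sequence
print(target_tokens[:10])          # first BPE tokens of the paired protein sequence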

Wrapping the tokenized data with torchtext:

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data.distributed import DistributedSampler
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Example, Dataset
import numpy as np
import random, math, time, datetime, os
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import torch.distributed as dist
import socket
import pickle
from embedding import BPE

world_size = torch.cuda.device_count()  # number of GPUs

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_path = '/root/BPE/tokenized/new_train_tokenized.pkl'
with open(train_path, 'rb') as file:
    tokenized_train = pickle.load(file)
# train_path = '/root/BPE/data/train.csv'
# tokenized_train = BPE(train_path, Pretrain=True)

valid_path = '/root/BPE/tokenized/new_valid_tokenized.pkl'
with open(valid_path, 'rb') as file:
    tokenized_valid = pickle.load(file)
# valid_path = '/root/BPE/data/valid.csv'
# tokenized_valid = BPE(valid_path, Pretrain=True)

test_path = '/root/BPE/tokenized/new_test_tokenized.pkl'
with open(test_path, 'rb') as file:
    tokenized_test = pickle.load(file)
# test_path = '/root/BPE/data/test.csv'
# tokenized_test = BPE(test_path, Pretrain=True)


max_sequence_length = 500
source_field = Field(init_token='<sos>', eos_token='<eos>',
                     lower=False, batch_first=True, fix_length=max_sequence_length)
target_field = Field(init_token='<sos>', eos_token='<eos>',
                     lower=False, batch_first=True, fix_length=max_sequence_length)


fields = [('source', source_field), ('target', target_field)]
train_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_train]
train_data = Dataset(train_examples, fields)
valid_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_valid]
valid_data = Dataset(valid_examples, fields)
test_examples = [Example.fromlist([source_text, target_text], fields) for source_text, target_text in tokenized_test]
test_data = Dataset(test_examples, fields)


# Build vocabulary
source_field.build_vocab(train_data, min_freq=2)
target_field.build_vocab(train_data, min_freq=2)


SRC_PAD_IDX = source_field.vocab.stoi[source_field.pad_token]
TRG_PAD_IDX = target_field.vocab.stoi[target_field.pad_token]

# Create BucketIterators for train, validation, and test sets
batch_size = 2
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_key=lambda x: len(x.source),
    sort_within_batch=True,
    shuffle=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
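
The iterators can then be consumed like any torchtext legacy iterator. The loop below is a hypothetical sketch (not part of the original script) showing the tensor shapes produced with batch_first=True and fix_length=500, using the train_iterator and fields defined above:

for batch in train_iterator:
    src = batch.source  # LongTensor of shape [batch_size, max_sequence_length]
    trg = batch.target  # LongTensor of the same shape for the protein side
    print(src.shape, trg.shape)
    # map the first few token ids of the first sample back to token strings
    print([source_field.vocab.itos[idx.item()] for idx in src[0][:10]])
    break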