bert中的sep_keras_bert的简单使用

最新推荐文章于 2020-12-29 10:24:16 发布

weixin_39832643

最新推荐文章于 2020-12-29 10:24:16 发布

阅读量177

点赞数 1

文章标签： bert中的sep

本文链接：https://blog.csdn.net/weixin_39832643/article/details/112230539

版权

pip install keras_bert

from keras_bert import Tokenizer

# Toy vocabulary: every token is mapped to its integer id (its list position).
token_dict = {
    token: index
    for index, token in enumerate(
        ['[CLS]', '[SEP]', 'un', '##aff', '##able', '[UNK]']
    )
}

# Word-piece splitting example.
tokenizer = Tokenizer(token_dict)
pieces = tokenizer.tokenize('unaffable')
print(pieces)  # ['[CLS]', 'un', '##aff', '##able', '[SEP]']

# encode() yields two parallel lists:
#   indices  -- the vocabulary id of each token
#   segments -- whether each position belongs to the first or second sentence
# Only one sentence ('unaffable') is passed here, so every segment id is 0.
encoded = tokenizer.encode('unaffable')
indices, segments = encoded
print(indices)  # [0, 2, 3, 4, 1]
print(segments)  # [0, 0, 0, 0, 0]

import os
import codecs
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import Tokenizer
import numpy as np



# Directory of the pre-trained model and the three files read from it.
pretrained_path = 'chinese_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, name)
    for name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# Build the vocabulary by reading vocab.txt line by line.
# keras_bert's load_vocabulary() can be used instead; just pass vocab_path:
#   from keras_bert import load_vocabulary
#   token_dict = load_vocabulary(vocab_path)
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        # A token's id is the number of entries stored so far.
        token_dict[line.strip()] = len(token_dict)

# Load the pre-trained model.
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
# ['[CLS]', '语', '言', '模', '型', '[SEP]']

# Encode with max_len=512; the sample output below shows the token ids
# followed by zeros.
encoded = tokenizer.encode(first=text, max_len=512)
indices, segments = encoded
print(indices[:10])
# [101, 6427, 6241, 3563, 1798, 102, 0, 0, 0, 0]
print(segments[:10])
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Wrap each list in an outer list so the model receives a batch of one,
# then keep the prediction for that single sample.
token_batch = np.array([indices])
segment_batch = np.array([segments])
predicts = model.predict([token_batch, segment_batch])[0]

# Show the first five output values for each real (non-padding) token.
for i, token in enumerate(tokens):
    values = predicts[i].tolist()
    print(token, values[:5])

参考:

BERT实战--基于Keras - 那少年和狗 - 博客园（www.cnblogs.com）

keras-bert

weixin_39832643

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
bert中的sep_keras_bert的简单使用

pip install keras_bert  from keras_bert import Tokenizer  # 字典  token_dict = { '[CLS]': 0, '[SEP]': 1, 'un': 2, '##aff': 3, '##able': 4, '[UNK]': 5, }  # 拆分单词实例  tokenizer = Tokeniz...
复制链接

扫一扫