BERT 字向量

最新推荐文章于 2024-08-14 13:33:00 发布

tt163789

最新推荐文章于 2024-08-14 13:33:00 发布

阅读量263

点赞数

分类专栏： nlp 文章标签： bert tensorflow 深度学习

原文链接：https://blog.csdn.net/qq_40210472/article/details/100890120; https://blog.csdn.net/qq_25992377/article/details/105019786

版权

nlp 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

转载自以下三处（依次对应下文的三段代码）：
https://blog.csdn.net/qq_40210472/article/details/100890120
https://blog.csdn.net/qq_25992377/article/details/105019786
https://github.com/Kyubyong/bert-token-embeddings/blob/master/extract.py

import tensorflow as tf
from tensorflow.python import pywrap_tensorflow
import numpy as np
 
# Path (prefix) of the pretrained BERT checkpoint to read.
ckpt_path = 'bert_model.ckpt'

# Open the checkpoint through the public TensorFlow API instead of the
# private tensorflow.python.pywrap_tensorflow module.
reader = tf.train.load_checkpoint(ckpt_path)
param_dict = reader.get_variable_to_shape_map()  # variable name -> shape
# Alternative lookup of just the shape:
# emd = param_dict['bert/embeddings/word_embeddings']
emd = reader.get_tensor('bert/embeddings/word_embeddings')  # value of the named tensor
print(len(emd))
print(emd[:5])

# asarray avoids copying the table when it already is an ndarray.
param = np.asarray(emd)
# np.save appends ".npy" to names that lack it, so the original
# np.save('bert_embed.txt', ...) produced exactly this binary file; the name
# is spelled out to make clear it is NOT a text file (load with np.load).
np.save('bert_embed.txt.npy', param)

# Alternative: print the tensor with TensorFlow's checkpoint inspection tool.
# from tensorflow.python.tools import inspect_checkpoint as chkp
# chkp.print_tensors_in_checkpoint_file(file_name="./bert_model.ckpt",
#                                       tensor_name='bert/embeddings/word_embeddings',
#                                       all_tensors=True,
#                                       all_tensor_names=True)
————————————————
原文链接：https://blog.csdn.net/qq_40210472/article/details/100890120

from tensorflow.python import pywrap_tensorflow
import numpy as np
# Export the BERT word-embedding table as a text file: a "<rows> <dim>" header
# followed by one "<token> <v1> <v2> ..." line per vocabulary entry.

# Defined here so this snippet does not rely on a variable set elsewhere.
ckpt_path = "bert_model.ckpt"
vocab_file = "vocab.txt"

reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
param_dict = reader.get_variable_to_shape_map()
emb = reader.get_tensor("bert/embeddings/word_embeddings")

# Read one token per line. An explicit encoding keeps non-ASCII tokens intact
# regardless of the platform default; only the line terminator is stripped.
with open(vocab_file, encoding="utf-8") as vocab_in:
    vocab = [line.rstrip("\n") for line in vocab_in]

# Fail before creating the output file rather than half-way through writing.
if len(vocab) < emb.shape[0]:
    raise ValueError(
        "vocab file has {} entries but the embedding table has {} rows".format(
            len(vocab), emb.shape[0]))

with open("bert_embedding", "w", encoding="utf-8") as out:
    out.write("{} {}\n".format(emb.shape[0], emb.shape[1]))
    # zip stops after the last embedding row, so extra vocab lines are ignored.
    for token, vector in zip(vocab, emb):
        out.write(token + " " + " ".join(str(i) for i in vector) + "\n")
————————————————
原文链接：https://blog.csdn.net/qq_25992377/article/details/105019786

import importlib
import os
import re
import subprocess
import sys
from multiprocessing import Pool

import numpy as np
import torch
from tqdm import tqdm

# Never summarise long arrays with "..." when converting them to strings.
# Newer NumPy releases reject NaN as a threshold; sys.maxsize is the
# documented way to request the full representation.
np.set_printoptions(threshold=sys.maxsize)

try:
    from pytorch_pretrained_bert import BertTokenizer, BertModel
except ImportError:
    # Install only when the package is missing, using the running interpreter's
    # pip and an argument list (no shell), instead of on every import.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pytorch_pretrained_bert"])
    # Make sure the import system notices the freshly installed package.
    importlib.invalidate_caches()
    from pytorch_pretrained_bert import BertTokenizer, BertModel

def get_embeddings(mname):
    '''Writes the pretrained embeddings of Bert-tokenized tokens or subwords
    of one model to a text file in the current directory.

    The file is named "<mname>.<vocab size>.<dim>d.vec". Its first line is
    "<vocab size> <dim>"; every following line is a token followed by its
    vector components, all separated by single spaces.

    mname: string. model name.

    Raises AssertionError if the vocabulary and the embedding table differ
    in length.
    '''
    print("# Model name:", mname)

    print("# Load pre-trained model tokenizer (vocabulary)")
    tokenizer = BertTokenizer.from_pretrained(mname)

    print("# Construct vocab")
    vocab = list(tokenizer.vocab)

    print("# Load pre-trained model")
    model = BertModel.from_pretrained(mname)

    print("# Load word embeddings")
    emb = model.embeddings.word_embeddings.weight.data.numpy()

    print("# Write")
    # Checked before the file is opened so a mismatch leaves no partial output.
    assert len(vocab)==len(emb), "The number of vocab and embeddings MUST be identical."

    out_path = "{}.{}.{}d.vec".format(mname, len(vocab), emb.shape[-1])
    # Explicit UTF-8: vocab tokens are not limited to ASCII.
    with open(out_path, "w", encoding="utf-8") as fout:
        fout.write("{} {}\n".format(len(vocab), emb.shape[-1]))
        for token, e in zip(vocab, emb):
            # threshold=e.size prevents "..." summarisation of long vectors
            # without depending on the global NumPy print options.
            e = np.array2string(e, max_line_width=np.inf, threshold=e.size)[1:-1]
            # Collapse alignment padding and drop the leading pad space so the
            # token and the first value are separated by exactly one space.
            e = re.sub("[ ]+", " ", e).strip()
            fout.write("{} {}\n".format(token, e))

if __name__ == "__main__":
    # Pretrained models whose embedding tables are exported, one file each.
    mnames = (
              "bert-base-uncased",
              "bert-large-uncased",
              "bert-base-cased",
              "bert-large-cased",
              "bert-base-multilingual-cased",
              "bert-base-multilingual-uncased",
              "bert-base-chinese"
             )

    # Never start more workers than there are models to process; the context
    # manager shuts the pool down once every result has been consumed.
    with Pool(min(16, len(mnames))) as p:
        # A single progress bar with a known total (the original drew two,
        # one of them without a total).
        for _ in tqdm(p.imap(get_embeddings, mnames), total=len(mnames)):
            pass
————————————————
原文链接：https://github.com/Kyubyong/bert-token-embeddings/blob/master/extract.py