转载:
https://blog.csdn.net/qq_40210472/article/details/100890120
https://blog.csdn.net/qq_25992377/article/details/105019786
https://github.com/Kyubyong/bert-token-embeddings/blob/master/extract.py
import tensorflow as tf
from tensorflow.python import pywrap_tensorflow
import numpy as np
# Snippet 1: read the word-embedding tensor out of a BERT TensorFlow
# checkpoint, print a short preview, and save it with NumPy.

# Checkpoint path handed to the reader below (also reused later in this file).
ckpt_path = 'bert_model.ckpt'
reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
param_dict = reader.get_variable_to_shape_map() # reads the shapes (dimensions) of the parameters stored in the ckpt
# Disabled: param_dict holds shapes only (per the line above), so this lookup
# would presumably return the shape rather than the tensor values.
#emd = param_dict['bert/embeddings/word_embeddings']
emd = reader.get_tensor('bert/embeddings/word_embeddings') # fetch the named tensor from the ckpt
# Preview: length of the first axis, then the first five entries.
print(len(emd))
print(emd[:5])
param=np.array(emd)
# NOTE: np.save writes NumPy's binary .npy format and appends ".npy" when the
# file name does not already end with it, so despite the ".txt" name the file
# produced here is "bert_embed.txt.npy" and is not human-readable text.
np.save('bert_embed.txt',param)
# The string literal below is a disabled alternative (never executed) that
# would print the tensors via TensorFlow's inspect_checkpoint tool instead.
'''
from tensorflow.python.tools import inspect_checkpoint as chkp
chkp.print_tensors_in_checkpoint_file(file_name="./bert_model.ckpt",
tensor_name = 'bert/embeddings/word_embeddings',
all_tensors = True,
all_tensor_names=True) #
'''
————————————————
原文链接:https://blog.csdn.net/qq_40210472/article/details/100890120
from tensorflow.python import pywrap_tensorflow
import numpy as np
# Snippet 2: export the BERT word-embedding table as a word2vec-style text
# file: a "<rows> <dims>" header line followed by "<token> <v1> <v2> ..." rows.

# Reuses `ckpt_path`, the checkpoint path assigned earlier in this file.
reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
# Variable-name -> shape map of the checkpoint (kept for inspection; unused below).
param_dict = reader.get_variable_to_shape_map()
# Embedding table; row i is paired with vocabulary entry i when writing.
emb = reader.get_tensor("bert/embeddings/word_embeddings")

vocab_file = "vocab.txt"
# The vocabulary may contain non-ASCII tokens, so read and write explicitly as
# UTF-8 instead of relying on the platform default encoding. Splitting on "\n"
# only (not splitlines) keeps the original one-token-per-line semantics.
with open(vocab_file, encoding="utf-8") as vocab_in:
    vocab = vocab_in.read().split("\n")

# Fail before creating the output file rather than dying with an IndexError
# half-way through and leaving a truncated file behind.
if len(vocab) < emb.shape[0]:
    raise ValueError(
        "vocab file has {} entries but the embedding table has {} rows".format(
            len(vocab), emb.shape[0]
        )
    )

with open("bert_embedding", "w", encoding="utf-8") as out:
    out.write(str(emb.shape[0]) + " " + str(emb.shape[1]) + "\n")
    # zip stops at the last embedding row, so surplus vocab entries (e.g. the
    # empty string produced by a trailing newline) are ignored as before.
    for token, row in zip(vocab, emb):
        out.write(token + " " + " ".join(str(value) for value in row) + "\n")
————————————————
原文链接:https://blog.csdn.net/qq_25992377/article/details/105019786
import sys
import torch
import numpy as np
# Turn off NumPy's "..." summarization so long embedding vectors are printed in
# full. The previous value, np.nan, is rejected by current NumPy releases with
# a ValueError; sys.maxsize is the documented way to request untruncated output.
np.set_printoptions(threshold=sys.maxsize)
from multiprocessing import Pool
import re
from tqdm import tqdm
import os
# NOTE(review): this runs pip every time the module is imported, and through
# whichever `pip` is first on PATH (possibly not this interpreter's). Consider
# installing the dependency ahead of time instead.
os.system("pip install pytorch_pretrained_bert")
from pytorch_pretrained_bert import BertTokenizer, BertModel
def get_embeddings(mname):
    """Write the word-embedding table of a pretrained BERT model to a text file.

    Loads the tokenizer and model named ``mname`` through
    ``pytorch_pretrained_bert``, then writes a file named
    ``"<mname>.<vocab size>.<dim>d.vec"`` in the current directory. The first
    line is ``"<vocab size> <dim>"``; every following line is a token followed
    by its embedding values, all separated by single spaces.

    Args:
        mname: string. Model name passed to ``from_pretrained``.

    Raises:
        ValueError: if the vocabulary size differs from the number of
            embedding rows.
    """
    # Local import keeps the function self-contained; only needed for the
    # print threshold below.
    import sys

    print("# Model name:", mname)

    print("# Load pre-trained model tokenizer (vocabulary)")
    tokenizer = BertTokenizer.from_pretrained(mname)

    print("# Construct vocab")
    vocab = list(tokenizer.vocab)

    print("# Load pre-trained model")
    model = BertModel.from_pretrained(mname)

    print("# Load word embeddings")
    emb = model.embeddings.word_embeddings.weight.data
    emb = emb.numpy()

    # Validate before touching the filesystem so a mismatch does not leave a
    # header-only output file behind (and is not skipped under `python -O`).
    if len(vocab) != len(emb):
        raise ValueError("The number of vocab and embeddings MUST be identical.")

    print("# Write")
    out_name = "{}.{}.{}d.vec".format(mname, len(vocab), emb.shape[-1])
    # Tokens may be non-ASCII (e.g. Chinese / multilingual vocabularies), so
    # the encoding is pinned rather than left to the platform default.
    with open(out_name, "w", encoding="utf-8") as fout:
        fout.write("{} {}\n".format(len(vocab), emb.shape[-1]))
        for token, e in zip(vocab, emb):
            # threshold is passed explicitly so vectors longer than NumPy's
            # default summarization limit are never abbreviated with "...",
            # whatever the global print options are.
            text = np.array2string(
                e, max_line_width=np.inf, threshold=sys.maxsize
            )[1:-1]
            # array2string pads for sign alignment; split/join collapses runs
            # of spaces and drops the leading pad that used to produce a
            # double space after the token.
            text = " ".join(text.split())
            fout.write("{} {}\n".format(token, text))
if __name__ == "__main__":
    # Pretrained BERT models whose embedding tables are exported, one output
    # file per model.
    mnames = (
        "bert-base-uncased",
        "bert-large-uncased",
        "bert-base-cased",
        "bert-large-cased",
        "bert-base-multilingual-cased",
        "bert-base-multilingual-uncased",
        "bert-base-chinese"
    )

    # The context manager tears the pool down even if a worker raises; there
    # is no point starting more workers than there are models (16 stays the
    # upper bound).
    with Pool(min(16, len(mnames))) as p:
        # A single progress bar with a known total; it advances each time
        # imap hands back the next finished model.
        for _ in tqdm(p.imap(get_embeddings, mnames), total=len(mnames)):
            pass
————————————————
原文链接:https://github.com/Kyubyong/bert-token-embeddings/blob/master/extract.py