- Motivation: confirm that a few semantically similar English words really are mapped to nearby positions in space by the word embedding.
- Code:
- get word embedding:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # inference only; disable dropout

def get_contextual_embedding(sentence, target_word):
    # Tokenize the sentence and build the tensors the model expects
    encoded_input = tokenizer(
        sentence,
        return_tensors="pt",
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=512,
    )
    tokens = encoded_input["input_ids"]
    with torch.no_grad():
        outputs = model(**encoded_input)
    embeddings = outputs.last_hidden_state[0]
    # Look up the target word's first sub-token id, then find its position in the sentence
    target_token_id = tokenizer.encode(target_word, add_special_tokens=False)[0]
    target_token_index = (tokens[0] == target_token_id).nonzero(as_tuple=True)[0][0]
    return embeddings[target_token_index].detach().numpy()
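One caveat: get_contextual_embedding matches only the first WordPiece sub-token of the target word, so a word the tokenizer splits into several pieces is represented by one piece only. Below is a minimal sketch of a variant that mean-pools all of the word's sub-token vectors (mean pooling is one common convention, not the only one; the helper name is my own, and it reuses the tokenizer and model defined above):

def get_word_embedding_mean(sentence, target_word):
    # Same encoding as above, but pool over every sub-token of the target word
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state[0]
    ids = encoded["input_ids"][0]
    sub_ids = tokenizer.encode(target_word, add_special_tokens=False)
    # Slide a window over the sequence to find the word's sub-token span
    for start in range(len(ids) - len(sub_ids) + 1):
        if ids[start:start + len(sub_ids)].tolist() == sub_ids:
            return hidden[start:start + len(sub_ids)].mean(dim=0).numpy()
    raise ValueError(f"{target_word!r} not found in {sentence!r}")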
- test:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
word1 = "apple"
word2 = "dog"
word3 = "banana"
vector1 = get_contextual_embedding("I ate an apple", word1)
vector2 = get_contextual_embedding("I saw a dog", word2)
vector3 = get_contextual_embedding("She was holding a banana", word3)
embeddings = np.array([vector1, vector2, vector3])
words = [word1, word2, word3]
# Reduce to 3 dimensions with PCA for plotting
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)
# 3-D visualization with Matplotlib
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(embeddings_3d[:, 0], embeddings_3d[:, 1], embeddings_3d[:, 2], marker='o', s=200, edgecolors='k', c='c')
for i, word in enumerate(words):
    ax.text(embeddings_3d[i, 0], embeddings_3d[i, 1], embeddings_3d[i, 2], word, fontsize=20)
ax.set_title("PCA Visualization of Contextual Word Embeddings in 3D")
ax.set_xlabel("PCA Dimension 1")
ax.set_ylabel("PCA Dimension 2")
ax.set_zlabel("PCA Dimension 3")
plt.show()
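One thing to keep in mind when reading the plot: PCA on only three points can span at most two independent directions, so the third axis carries essentially zero variance. You can confirm this by printing the explained-variance ratios (the third entry should be close to 0):

print(pca.explained_variance_ratio_)  # with only 3 samples, the third component is ~0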
- compute similarity:
import numpy as np
from scipy.spatial.distance import cosine
word1 = "apple"
word2 = "dog"
word3 = "banana"
vector1 = get_contextual_embedding("I ate an apple", word1)
vector2 = get_contextual_embedding("I saw a dog", word2)
vector3 = get_contextual_embedding("She was holding a banana", word3)
embeddings = np.array([vector1, vector2, vector3])
words = [word1, word2, word3]
# Compute and print the cosine similarity between every pair of word embeddings
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        similarity = 1 - cosine(embeddings[i], embeddings[j])
        print(f"Cosine similarity between {words[i]} and {words[j]}: {similarity:.4f}")
- result:
Cosine similarity between apple and dog: 0.6298
Cosine similarity between apple and banana: 0.8324
Cosine similarity between dog and banana: 0.6567
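As hoped, apple and banana (both fruits) are noticeably closer to each other than either is to dog. With more words, it can be handier to compute the whole similarity matrix in one call; a sketch using scikit-learn (already a dependency above for PCA):

from sklearn.metrics.pairwise import cosine_similarity
# sim[i, j] is the cosine similarity between words[i] and words[j]
sim = cosine_similarity(embeddings)
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]}: {sim[i, j]:.4f}")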