import numpy as np
# Toy corpus of sentences for the GloVe example.
corpus = [
    "I like machine learning.",
    "I enjoy deep learning.",
    "Machine learning is fascinating.",
    "Deep learning is interesting."
]

# Build the word->index and index->word mappings over the corpus.
# NOTE: tokens are lowercased but punctuation is kept, so "learning."
# (sentence-final) and "learning" end up as two distinct vocabulary entries.
word_to_idx = {}
idx_to_word = {}
for sentence in corpus:
    words = sentence.lower().split()
    for word in words:
        if word not in word_to_idx:
            # Assign indices in first-seen order.
            idx = len(word_to_idx)
            word_to_idx[word] = idx
            idx_to_word[idx] = word
# Build the co-occurrence matrix: cooc_matrix[i, j] counts how often word j
# appears within `window_size` tokens of word i in the same sentence.
vocab_size = len(word_to_idx)
cooc_matrix = np.zeros((vocab_size, vocab_size))
window_size = 2
for sentence in corpus:
    words = sentence.lower().split()
    for target_idx, target_word in enumerate(words):
        target_id = word_to_idx[target_word]
        # Clamp the context window to the sentence boundaries.
        start_idx = max(0, target_idx - window_size)
        end_idx = min(len(words), target_idx + window_size + 1)
        # Count every word inside the window except the target itself.
        for context_idx in range(start_idx, end_idx):
            if context_idx != target_idx:
                context_word = words[context_idx]
                context_id = word_to_idx[context_word]
                cooc_matrix[target_id, context_id] += 1
# Train GloVe-style embeddings with plain SGD on log co-occurrence counts.
embedding_size = 50
learning_rate = 0.01
num_epochs = 100

# Parameter initialization: target vectors W, context vectors U, and a
# single bias vector b shared by the target and context roles.
# NOTE(review): the full GloVe objective also weights each squared error
# by f(X_ij); this simplified example weights all observed pairs equally.
W = np.random.rand(vocab_size, embedding_size)
U = np.random.rand(vocab_size, embedding_size)
b = np.random.rand(vocab_size)

for epoch in range(num_epochs):
    for target_id in range(vocab_size):
        for context_id in range(vocab_size):
            cooc_count = cooc_matrix[target_id, context_id]
            # Only observed pairs contribute (log(0) is undefined anyway).
            if cooc_count > 0:
                # Prediction error: w_i . u_j + b_i + b_j - log(X_ij)
                diff = np.dot(W[target_id], U[context_id]) + b[target_id] + b[context_id] - np.log(cooc_count)
                # Compute BOTH gradients from the pre-update parameters.
                # The original code updated W[target_id] first and then used
                # the already-updated value in the U[context_id] gradient,
                # which is not the gradient of the loss at the current point.
                grad_w = diff * U[context_id]
                grad_u = diff * W[target_id]
                W[target_id] -= learning_rate * grad_w
                U[context_id] -= learning_rate * grad_u
                b[target_id] -= learning_rate * diff
                b[context_id] -= learning_rate * diff
# Print the learned target-side vector representation of every word.
for word, idx in word_to_idx.items():
    print(f"{word}: {W[idx]}")
# A small worked example of GloVe (Global Vectors for Word Representation).
# First published 2023-08-15 13:58:54.