Word Segmentation
Word2vec
Word2vec comes in two flavors: skip-gram and CBOW (continuous bag of words). Skip-gram predicts the surrounding window words from the center word, while CBOW predicts the center word from its window.
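To make the difference concrete, here is a minimal sketch (toy sentence, window size 1, not part of the original code) of the training pairs each variant generates:

# Minimal sketch: training pairs for skip-gram vs. CBOW with a window of 1.
sentence = "the quick brown fox".split()
window = 1
for i, center in enumerate(sentence):
    context = sentence[max(0, i - window):i] + sentence[i + 1:i + 1 + window]
    print("skip-gram:", [(center, w) for w in context])  # center word -> each context word
    print("CBOW:     ", (context, center))               # all context words -> center word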
Data Preprocessing
import os
from collections import Counter

import numpy as np

K = 10                # number of negative samples drawn per positive word
C = 3                 # number of context words on each side of the center word
NUM_EPOCHS = 2
VOCAB_SIZE = 30000
BATCH_SIZE = 128
LEARNING_RATE = 0.2
EMBEDDING_SIZE = 100  # dimensionality of the word vectors
LOG_FILE = "word_embedding.log"

with open("text8.train.txt", "r") as file:
    text = file.read()
text = [w for w in text.lower().split()]
vocab = dict(Counter(text).most_common(VOCAB_SIZE - 1))
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
print(np.sum(list(vocab.values())), vocab["<unk>"])
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)  # smooth the unigram distribution, as in the word2vec paper
word_freqs = word_freqs / np.sum(word_freqs)
Here K is the number of negative samples to draw, C is the positive-sample window (C words on each side of the center word), and EMBEDDING_SIZE is the dimensionality of the word vectors. The code above is the data-preparation path for English, where splitting on whitespace is enough to tokenize. Counter then counts each word's frequency, idx_to_word and word_to_idx map between words and their indices, and word_freqs is the distribution used for negative sampling: passed to torch.multinomial, words with higher frequency are more likely to be drawn.
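A minimal sketch of that sampling step (toy weights, not the real word_freqs):

import torch

# Toy weights for three "words"; torch.multinomial draws indices with
# probability proportional to the weights, with replacement.
weights = torch.tensor([0.7, 0.2, 0.1])
samples = torch.multinomial(weights, 1000, replacement=True)
print(torch.bincount(samples))  # index 0 is drawn roughly 7x as often as index 2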
import thulac

print(os.getcwd())
lines = []
with open("1.txt", "r", encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Clean the raw lines
data = []
for line in lines:
    line = line.strip()
    if line != "":
        data.append(line)

# Word segmentation
with open("p_input.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(data))
thu1 = thulac.thulac(seg_only=True)
thu1.cut_f("p_input.txt", "p_output.txt")

# Punctuation to strip
stopwords = '''~!@#$%^&*()_+`1234567890-={}[]::";'<>,.?/|\、·!()¥“”‘’《》,。?/—-【】….'''
stopwords_set = set([i for i in stopwords])
stopwords_set.add("br")  # add this stray token to the set as well so it gets removed too

with open("p_output.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Remove punctuation and collapse the extra spaces this leaves behind
data = []
for line in lines:
    for s in stopwords_set:
        line = line.strip().replace(s, "")
    line = line.replace("  ", " ").replace("  ", " ")
    if line != "" and line != " ":
        data.append(line)

# Save the cleaned corpus as a single line of space-separated tokens
with open("all.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(data))

all_text = ""
with open("all.txt", "r", encoding="utf-8") as f:
    all_text = f.readline()

# 90% / 5% / 5% split into train / dev / test
all_len = len(all_text)
train_text = all_text[:int(all_len * 0.9)]
dev_text = all_text[int(all_len * 0.9):int(all_len * 0.95)]
test_text = all_text[int(all_len * 0.95):]
with open("dev.txt", "w", encoding="utf-8") as f:
    f.write(dev_text)
with open("test.txt", "w", encoding="utf-8") as f:
    f.write(test_text)
with open("train.txt", "w", encoding="utf-8") as f:
    f.write(train_text)

text = ""
with open("train.txt", "r", encoding="utf-8") as f:
    text = f.read()
text = text.lower().split()  # split into a list of tokens
vocab = dict(Counter(text).most_common(VOCAB_SIZE - 1))  # word -> count for the most frequent words
vocab['<UNK>'] = len(text) - np.sum(list(vocab.values()))
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)
word_freqs = word_freqs / np.sum(word_freqs)
Building word vectors for Chinese takes a bit more work. First, make sure you know the file's encoding (GBK, UTF-8, etc.); next, use a segmentation tool to split sentences into words (thulac is used here); then strip common Chinese punctuation. The remaining steps are the same as for English.
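For reference, a minimal sketch of segmenting a single sentence in memory, assuming thulac's cut(text, text=True) interface (the example sentence and output format are illustrative):

import thulac

# seg_only=True: word segmentation only, no part-of-speech tags
thu = thulac.thulac(seg_only=True)
print(thu.cut("今天天气不错", text=True))  # text=True returns a space-separated string of tokens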
Skip-gram
import torch
import torch.utils.data as tud

class Dataset(tud.Dataset):  # subclass torch.utils.data.Dataset
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(Dataset, self).__init__()
        # encode every token as its index; unknown words map to the last index (<UNK>)
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE - 1) for t in text]
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        ''' Returns the following for training:
        - the center word
        - the (positive) words around it
        - K randomly sampled words per positive word as negative samples
        '''
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]  # wrap around at the corpus boundaries
        pos_words = self.text_encoded[pos_indices]
        # replacement=True: sample with replacement
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], replacement=True)
        return center_word, pos_words, neg_words
This follows the same pattern as building a dataset in CV: each item consists of integer tensors. The label is the center word's id, the positive samples are the ids of the words inside its window, and the negative samples are word ids drawn according to the (smoothed) word frequencies.
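A minimal usage sketch (variable names follow the code above; shuffling is an assumption):

# Wrap the dataset in a DataLoader to iterate batches of
# (center_word, pos_words, neg_words) during training.
dataset = Dataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)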
import torch.nn as nn
import torch.nn.functional as F

class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size  # 30000
        self.embed_size = embed_size  # 100
        # two embedding matrices of the same shape: nn.Embedding(30000, 100)
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        # weight initialization
        initrange = 0.5 / self.embed_size
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        self.out_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: center words, [batch_size]
        pos_labels: words around the center word, [batch_size, (C * 2)]
        neg_labels: negative samples for the center word, [batch_size, (C * 2 * K)]
        return: loss, [batch_size]
        '''
        batch_size = input_labels.size(0)
        input_embedding = self.in_embed(input_labels)  # B * embed_size
        pos_embedding = self.out_embed(pos_labels)     # B * (2*C) * embed_size
        neg_embedding = self.out_embed(neg_labels)     # B * (2*C*K) * embed_size
        # torch.bmm() is a batched matrix product: (b,n,m) x (b,m,p) -> (b,n,p)
        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze(2)   # B * (2*C)
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze(2)  # B * (2*C*K)
        # the loss below is the negative-sampling objective from the paper
        log_pos = F.logsigmoid(log_pos).sum(1)  # batch_size
        log_neg = F.logsigmoid(log_neg).sum(1)  # batch_size
        loss = log_pos + log_neg  # maximize the sum of positive and negative log-likelihoods
        return -loss

    # training learns two matrices, self.in_embed and self.out_embed;
    # the authors consider the input matrix to be the better word vectors
    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()
F.logsigmoid computes log sigma(x) = log(1 / (1 + e^(-x))): it is monotonically increasing, always negative, and approaches 0 as x grows.
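Per center word c with positive context word o and K negatives w_k, the forward pass above computes the negative-sampling objective log sigma(u_o . v_c) + sum_k log sigma(-u_{w_k} . v_c), summed over the 2C positive words, and returns its negation as the loss. A minimal training-loop sketch on top of the Dataset and DataLoader defined above (the choice of plain SGD as the optimizer is an assumption):

model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
for epoch in range(NUM_EPOCHS):
    for input_labels, pos_labels, neg_labels in dataloader:
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()  # average the per-example losses
        loss.backward()
        optimizer.step()
embedding_weights = model.input_embeddings()  # learned word vectors, shape (VOCAB_SIZE, EMBEDDING_SIZE)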
CBOW
For CBOW, only the dataset needs to be rearranged: the roles of label, positive sample and negative sample change, so the window around the center word becomes the input and the center word becomes the label (see the sketch below). In the model, take care to keep the dimensions passed to torch.bmm and logsigmoid aligned.
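A minimal sketch of that rearrangement (a hypothetical CBOWDataset reusing the Dataset class above, with the corresponding forward-pass shapes shown as comments; none of this is the original code):

class CBOWDataset(Dataset):
    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]                 # now the prediction target
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        context_words = self.text_encoded[pos_indices]       # now the model input
        neg_words = torch.multinomial(self.word_freqs, K, replacement=True)
        return context_words, center_word, neg_words

# Inside the model, one way to keep the dimensions aligned:
#   context_embedding = self.in_embed(context_words).mean(dim=1)              # B * embed_size
#   log_pos = torch.bmm(self.out_embed(center_word.unsqueeze(1)),
#                       context_embedding.unsqueeze(2)).squeeze()             # B
#   log_neg = torch.bmm(self.out_embed(neg_words),
#                       -context_embedding.unsqueeze(2)).squeeze(2)           # B * K
#   loss = -(F.logsigmoid(log_pos) + F.logsigmoid(log_neg).sum(1))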
GloVe
For the theory behind GloVe, see the original paper.
from collections import Counter, defaultdict

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class GloVeModel(nn.Module):
    """Implement the GloVe model with PyTorch."""

    def __init__(self, embedding_size, context_size, vocab_size, min_occurrance=1, x_max=100, alpha=3 / 4):
        super(GloVeModel, self).__init__()

        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError(
                "'context_size' should be an int or a tuple of two ints")
        self.vocab_size = vocab_size
        self.alpha = alpha
        self.min_occurrance = min_occurrance
        self.x_max = x_max

        # focal (center) and context embeddings plus per-word biases
        self._focal_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._context_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._focal_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        self._context_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        self._glove_dataset = None

        for params in self.parameters():
            init.uniform_(params, a=-1, b=1)
    def fit(self, corpus):
        """Build the word list and co-occurrence matrix from the corpus.
        Args:
            corpus (list): list of token-id sequences (regions)
        Raises:
            ValueError: raised when no co-occurrences are counted
        """
        left_size, right_size = self.left_context, self.right_context
        vocab_size, min_occurrance = self.vocab_size, self.min_occurrance

        # get the co-occurrence count matrix
        word_counts = Counter()
        cooccurence_counts = defaultdict(float)
        for region in corpus:
            word_counts.update(region)
            for left_context, word, right_context in _context_windows(region, left_size, right_size):
                for i, context_word in enumerate(left_context[::-1]):
                    # add (1 / distance from the focal word) for this pair
                    cooccurence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(right_context):
                    cooccurence_counts[(word, context_word)] += 1 / (i + 1)
        if len(cooccurence_counts) == 0:
            raise ValueError(
                "No cooccurrences in corpus. Did you try to reuse a generator?")

        # keep only words that occur often enough
        tokens = set(word for word, count in
                     word_counts.most_common(vocab_size) if count >= min_occurrance)
        coocurrence_matrix = [(words[0], words[1], count)
                              for words, count in cooccurence_counts.items()
                              if words[0] in tokens and words[1] in tokens]
        self._glove_dataset = GloVeDataSet(coocurrence_matrix)
    def train(self, num_epoch, device, batch_size=512, learning_rate=0.05, loop_interval=10):
        """Train the GloVe model.
        Args:
            num_epoch (int): number of epochs
            device (str): cpu or gpu
            batch_size (int, optional): Defaults to 512.
            learning_rate (float, optional): Defaults to 0.05. learning rate for the Adam optimizer
            loop_interval (int, optional): Defaults to 10. number of steps between average-loss printouts
        Raises:
            NotFitToCorpusError: raised if the model has not been fit to a corpus
        """
        if self._glove_dataset is None:
            raise NotFitToCorpusError(
                "Please fit model with corpus before training")

        # basic training setup
        self.to(device)  # keep the parameters on the same device as the batches
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        glove_dataloader = DataLoader(self._glove_dataset, batch_size)
        total_loss = 0

        for epoch in range(num_epoch):
            for idx, batch in enumerate(glove_dataloader):
                optimizer.zero_grad()

                i_s, j_s, counts = batch
                i_s = i_s.to(device)
                j_s = j_s.to(device)
                counts = counts.to(device)
                loss = self._loss(i_s, j_s, counts)

                total_loss += loss.item()
                if idx % loop_interval == 0:
                    avg_loss = total_loss / loop_interval
                    print("epoch: {}, current step: {}, average loss: {}".format(
                        epoch, idx, avg_loss))
                    total_loss = 0

                loss.backward()
                optimizer.step()

        print("finish glove vector training")
    def get_coocurrance_matrix(self):
        """Return the co-occurrence matrix for saving.
        Returns:
            list: list of items (word_idx1, word_idx2, cooccurrence count)
        """
        return self._glove_dataset._coocurrence_matrix

    def embedding_for_tensor(self, tokens):
        if not torch.is_tensor(tokens):
            raise ValueError("the tokens must be pytorch tensor object")
        return self._focal_embeddings(tokens) + self._context_embeddings(tokens)
    def _loss(self, focal_input, context_input, coocurrence_count):
        x_max, alpha = self.x_max, self.alpha

        focal_embed = self._focal_embeddings(focal_input)
        context_embed = self._context_embeddings(context_input)
        focal_bias = self._focal_biases(focal_input).squeeze(1)      # squeeze so each example gets a scalar bias
        context_bias = self._context_biases(context_input).squeeze(1)

        # count weighting factor f(X_ij), capped at 1
        weight_factor = torch.pow(coocurrence_count / x_max, alpha)
        weight_factor[weight_factor > 1] = 1

        embedding_products = torch.sum(focal_embed * context_embed, dim=1)
        log_cooccurrences = torch.log(coocurrence_count)

        # (w_i . w_j + b_i + b_j - log X_ij) ** 2
        distance_expr = (embedding_products + focal_bias +
                         context_bias - log_cooccurrences) ** 2

        single_losses = weight_factor * distance_expr
        mean_loss = torch.mean(single_losses)
        return mean_loss
class GloVeDataSet(Dataset):
    def __init__(self, coocurrence_matrix):
        self._coocurrence_matrix = coocurrence_matrix

    def __getitem__(self, index):
        return self._coocurrence_matrix[index]

    def __len__(self):
        return len(self._coocurrence_matrix)

class NotTrainedError(Exception):
    pass

class NotFitToCorpusError(Exception):
    pass
def _context_windows(region, left_size, right_size):
    """Generate a (left_context, word, right_context) tuple for each position in a region.
    Args:
        region (list): a tokenized sentence
        left_size (int): left window size
        right_size (int): right window size
    """
    for i, word in enumerate(region):
        start_index = i - left_size
        end_index = i + right_size
        left_context = _window(region, start_index, i - 1)
        right_context = _window(region, i + 1, end_index)
        yield (left_context, word, right_context)

def _window(region, start_index, end_index):
    """Return the tokens of region from `start_index` to `end_index` (inclusive).
    If `start_index` is negative or `end_index` runs past the last word in the
    region, the window is simply truncated at the region boundary.
    Args:
        region (list): the tokenized sentence to take the window from
        start_index (int): index of the first token of the window
        end_index (int): index of the last token of the window
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):
                             min(end_index, last_index) + 1]
    return selected_tokens
The co-occurrence matrix is built in fit: it calls _context_windows to obtain each center word's left and right context windows, and cooccurence_counts accumulates, for every word pair, a weight of one over the distance between the two words.
The _loss function implements GloVe's loss.
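For reference, the loss that _loss computes is GloVe's weighted least-squares objective J = sum_ij f(X_ij) (w_i . w~_j + b_i + b~_j - log X_ij)^2, with the weighting f(x) = (x / x_max)^alpha for x < x_max and 1 otherwise (alpha = 3/4, x_max = 100 by default). A minimal usage sketch (the corpus variable is a hypothetical list of token-id sequences, and the epoch count is arbitrary):

# corpus: one id sequence per sentence, e.g. [[0, 5, 2, 7], [3, 1, 9], ...]
model = GloVeModel(embedding_size=100, context_size=3, vocab_size=VOCAB_SIZE)
model.fit(corpus)                        # build the co-occurrence matrix
model.train(num_epoch=10, device="cpu")  # minimize the weighted least-squares loss
vectors = model.embedding_for_tensor(torch.arange(VOCAB_SIZE))  # focal + context embeddings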