After reading many articles explaining word2vec, I made a few changes and simplifications to the code myself and gained a deeper understanding. The small "text8" dataset (follow the link to download it) is used to train a skip-gram model; the point is mainly to understand the idea and the workflow.
Step 1: Load the data
The dataset is small enough to load directly into memory, and since the English text is already separated by spaces there is no need for word segmentation.
import tensorflow as tf
import numpy as np
import collections
import math
import os
import random
from six.moves import xrange
filename = 'text8'
vocabulary_size=50000
# Step 1: load the data
def read_data(filename):
    with open(filename) as file:
        words = file.read().split()  # split() with no argument splits on whitespace
    return words
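A quick optional sanity check (not part of the training flow): assuming the unzipped text8 file sits in the working directory, you can confirm that the corpus loads as one flat list of tokens.
# Optional check: size of the corpus and the first few tokens
words = read_data(filename)
print(len(words))   # text8 contains roughly 17 million tokens
print(words[:5])    # e.g. ['anarchism', 'originated', 'as', 'a', 'term']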
Step 2: Build the vocabulary
Function build_vocabulary(words, vocabulary_size)
Parameters:
- words: the full sequence of words loaded from text8
- vocabulary_size: a hyperparameter (50000 in this program) giving the size of the vocabulary (the finished vocabulary keeps the 50000 most frequent words)
Return values:
- count: word frequencies of all words, truncated to the 50000 most frequent. E.g.: [['UNK', -1], ('the', 1061396), ('of', 593677), ('and', 416629), ...]
- dictionary: a dict whose key-value pairs are word:id; the smaller the id, the more frequent the word. E.g. {'UNK': 0, 'of': 2, 'one': 4, 'the': 1, 'and': 3, ...}
- reverse_dictionary: dictionary with keys and values swapped, giving id:word. E.g.: {0: 'UNK', 1: 'the', 2: 'of', 3: 'and', ...}
- data: the input sequence words with every word replaced by its id, i.e. a sequence of ids. (A small usage example follows the function below.)
# Step 2: build the vocabulary (keep the vocabulary_size most frequent words; everything else maps to UNK)
def build_vocabulary(words, vocabulary_size):
    # extend appends the elements of a list one by one
    # Counter counts how often each word occurs
    # most_common(N) returns only the top-N list
    # c = Counter('abracadabra')
    # c.most_common()  ==> [('a', 5), ('r', 2), ('b', 2), ('c', 1), ('d', 1)]
    # c.most_common(3) ==> [('a', 5), ('r', 2), ('b', 2)]
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # print(count)
    # build the dictionary mapping word -> id
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)  # e.g. {'UNK': 0, 'the': 1, 'of': 2, 'and': 3, ...}; smaller ids mean higher frequency
    # replace every word in the corpus with its id and count the words that are not in the dictionary
    unk_count = 0
    data = list()
    for word in words:
        if word in dictionary:
            index = dictionary[word]  # index is the id of this word
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count  # record how many words were mapped to UNK
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # swap keys and values to get id -> word
    return data, count, dictionary, reverse_dictionary
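To make the four return values concrete, here is a toy call; the sentence and the vocabulary size of 4 are made up purely for illustration.
# Toy example (hypothetical input), just to show the shape of the return values
toy_words = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'cat', 'sat']
toy_data, toy_count, toy_dict, toy_rev = build_vocabulary(toy_words, 4)
print(toy_count)  # [['UNK', 2], ('the', 3), ('cat', 2), ('sat', 2)]
print(toy_dict)   # {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
print(toy_data)   # [1, 2, 3, 0, 1, 0, 1, 2, 3]  ('on' and 'mat' fall outside the vocabulary and become UNK -> 0)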
Step 3: Generate batches of training samples
# Step 3: generate training batches for the skip-gram model
batch_size = 128
embedding_size = 128  # dimension of the embedding vectors
skip_window = 1       # how many words to consider to the left and to the right
num_skips = 2         # how many distinct context words to sample from each window as output words
data_index = 0
num_sampled = 64      # number of negative samples for the NCE loss
valid_size = 16       # random set of 16 words to evaluate similarity on
valid_window = 100    # only pick validation samples from the head of the distribution (most frequent words)
# i.e. draw 16 integers from [0, 100) without replacement
valid_example = np.random.choice(valid_window, valid_size, replace=False)
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # one training window: [skip_window words on the left, target, skip_window words on the right]
    buffer = collections.deque(maxlen=span)  # a double-ended queue of fixed length span
    # fill the buffer with the first span ids
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window             # the center word sits at index skip_window in the buffer
        target_to_avoid = [skip_window]  # indices that must not be picked as context
        # loop num_skips times: one center word is paired with num_skips context words
        for j in range(num_skips):
            while target in target_to_avoid:
                # may pick a word before or after the center word
                target = random.randint(0, span - 1)
            target_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # input: the center word
            label[i * num_skips + j, 0] = buffer[target]    # label: one of its context words
        buffer.append(data[data_index])  # the buffer has fixed length, so appending pushes the oldest id out
        data_index = (data_index + 1) % len(data)
    data_index = (data_index + len(data) - span) % len(data)  # step back so words at the batch boundary are not skipped
    return batch, label
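To see what the batches look like in words rather than ids, a small check like the one below can help. It assumes data and reverse_dictionary have already been produced by build_vocabulary (as at the start of Step 4), and the (center -> context) pairs shown in the comment are only an example of the kind of output.
# Optional check: print one small batch as (center word -> context word) pairs
demo_batch, demo_label = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for idx in range(8):
    print(reverse_dictionary[demo_batch[idx]], '->', reverse_dictionary[demo_label[idx, 0]])
# With skip_window = 1 each center word appears twice, once with its left and once with its right neighbor,
# e.g. originated -> anarchism, originated -> as, as -> originated, as -> a, ...
data_index = 0  # reset the global cursor so training starts from the beginning of the corpus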
Step 4: Build the graph
words = read_data(filename)
data, count, dictionary, reverse_dictionary = build_vocabulary(words, vocabulary_size)
num_steps = 50000
# Step 4: build the graph
graph = tf.Graph()
with graph.as_default():
    # define the inputs
    train_input = tf.placeholder(dtype=tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])
    valid_data = tf.constant(valid_example, dtype=tf.int32)
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # parameters for the NCE (negative sampling) loss
    nce_weights = tf.Variable(tf.truncated_normal(shape=[vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros(shape=[vocabulary_size]))
    # look up the vectors to train: not every word is updated in each step, only the vectors of the words in this batch
    embed = tf.nn.embedding_lookup(embeddings, train_input)
    # define the loss function and the optimizer
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # cosine similarity between words
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embedding = embeddings / norm
    # pick some frequent words to test cosine similarity on
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_data)
    similarity = tf.matmul(valid_embedding, normalized_embedding, transpose_b=True)
    # initializer for all variables
    init = tf.global_variables_initializer()
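Why this matmul gives cosine similarity: after dividing each embedding row by its L2 norm, the dot product of two rows is exactly the cosine of the angle between the original vectors. A small standalone numpy illustration (the vectors are made up):
# Standalone numpy illustration of the similarity computation above (made-up vectors)
emb = np.array([[3.0, 4.0], [6.0, 8.0], [4.0, -3.0]])          # three toy "embeddings"
normed = emb / np.sqrt((emb ** 2).sum(axis=1, keepdims=True))  # same as embeddings / norm in the graph
print(normed @ normed.T)
# row 0 vs row 1 -> 1.0  (same direction, perfectly similar)
# row 0 vs row 2 -> 0.0  (orthogonal vectors)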
Step 5: Start the session
# Step 5: start the session
with tf.Session(graph=graph) as sess:
    # initialize all variables
    sess.run(init)
    # fetch a batch of training data and labels, then run one optimization step
    for step in xrange(num_steps):
        batch_input, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        sess.run(optimizer, feed_dict={train_input: batch_input, train_labels: batch_labels})
        # print some words with high cosine similarity (not needed for training and slow, so only do it occasionally)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_example[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
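If you want to keep the learned vectors after training, you can evaluate the normalized embedding matrix inside the same with tf.Session(...) block, after the training loop ends. A minimal sketch (the file name is just an example):
    # Inside the session, after the training loop: pull the learned vectors out as a numpy array
    final_embeddings = normalized_embedding.eval()      # shape (vocabulary_size, embedding_size)
    np.save('text8_embeddings.npy', final_embeddings)   # example file name; any path works
    # later, the vector of a word can be looked up with e.g. final_embeddings[dictionary['king']]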