Implementing word2vec in TensorFlow, with annotations

This post walks through a TensorFlow (1.x) skip-gram word2vec implementation for a Chinese corpus segmented with jieba, with annotated code throughout.

Reference 1: https://www.jianshu.com/p/556d735a7f97

Reference 2: http://blog.csdn.net/wangyangzhizhou/article/details/77530479

Reference 3: http://blog.csdn.net/NNNNNNNNNNNNY/article/details/70177509

Reference 4: https://www.jianshu.com/p/fab82fa53e16

......

# -*- coding: utf-8 -*-

import collections
import math
import random
import codecs
import numpy as np
import tensorflow as tf
import jieba

"""read samples"""
def read_data(filename):
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
        seg_list = jieba.cut(data, cut_all=False)
        text = tf.compat.as_str("/".join(seg_list)).split('/')
    return text
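#==============================================================================
#    """Usage sketch (not part of the original post): assuming a UTF-8 corpus
#    file sample.txt exists, read_data returns the corpus as one flat list of
#    jieba tokens."""
#    words = read_data("sample.txt")
#    print(len(words), words[:10])
#==============================================================================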

"""建立数据集"""
"""words表示要读取的分词,n_words为读取的词汇量大小"""
def build_dataset(words,n_words):
    count=[['UNK',-1]]
    count.extend(collections.Counter(words).most_common(n_words-1))
    dictionary=dict()
    """给每一个高频词标上序号"""
    for word,_ in count:
        dictionary[word]=len(dictionary)
    #print(dictionary)  # uncomment to inspect the full vocabulary of frequent words
    # data will hold the whole corpus as word indices
    data=list()
    unk_count=0
    for word in words:
        if word in dictionary:
            index=dictionary[word]
        else:
            index=0
            unk_count+=1
        """data中index对应dictionary中的词的序号"""
        data.append(index)
    """不在dictionary中的词(低频词)赋值给count[0][1]"""
    count[0][1]=unk_count 
    """dictionary中的数据存储为[...,'上起':33812,...],而reversed中的数据存储正好反过来[...,33812:'上起',...]"""
    reversed_dictionary=dict(zip(dictionary.values(),dictionary.keys()))
    """返回data,count,dictionary,reversed_dictionary,分别表示:所有词及下标,高频词及词频,高频词及下标,压缩词典"""
    return data,count,dictionary,reversed_dictionary
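#==============================================================================
#    """Toy example (not from the original post) of what build_dataset returns;
#    the exact ordering of ties inside count may vary."""
#    toy_words = ['我', '爱', '我', '的', '家']
#    toy_data, toy_count, toy_dict, toy_rev = build_dataset(toy_words, 3)
#    # toy_data  ~ [1, 2, 1, 0, 0]      (low-frequency words collapse to index 0, 'UNK')
#    # toy_count ~ [['UNK', 2], ('我', 2), ('爱', 1)]
#    # toy_dict  ~ {'UNK': 0, '我': 1, '爱': 2}
#    # toy_rev   ~ {0: 'UNK', 1: '我', 2: '爱'}
#==============================================================================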

"""generate batch from all samples,skip-gram模型batch生成训练数据"""
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    """assert声明断言语句,如果后面为真则继续执行,如果不真,则抛出异常AssertionError"""
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    """创建batch,batch是一行batch_size列,里边是随机数,类型为int32"""
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    """创建batch对应的label,并且label是一列batch_size行,里边是随机数,类型为int32"""
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    """左右各取skip_window个词"""
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    """依次取span个词"""
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        """目标值为中间那个值"""
        target = skip_window
        targets_to_avoid = [skip_window]
        """从目标次左右取num_skips个值"""
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        if data_index == len(data):
            """Wrapped past the end of the corpus: refill the deque with the first span words."""
            buffer.extend(data[:span])
            data_index = span
        else:
            """deque丢弃掉最前面的,后面加入新值data[data_index]"""
            buffer.append(data[data_index])
            data_index += 1   
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
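#==============================================================================
#    """Illustrative sketch (not from the original post) of the skip-gram pairing
#    that generate_batch implements: every centre word is paired with words drawn
#    from the window of skip_window tokens on either side."""
#    def naive_skipgram_pairs(tokens, skip_window=1):
#        pairs = []
#        for i in range(skip_window, len(tokens) - skip_window):
#            for j in range(i - skip_window, i + skip_window + 1):
#                if j != i:
#                    pairs.append((tokens[i], tokens[j]))
#        return pairs
#    # naive_skipgram_pairs(['A', 'B', 'C', 'D'])
#    # -> [('B', 'A'), ('B', 'C'), ('C', 'B'), ('C', 'D')]
#==============================================================================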

"""降维画图"""   
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        """绘制散点图"""
        plt.scatter(x, y)
        """添加图的注释,xytext设置注释内容显示的起始位置"""
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)
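#==============================================================================
#    """Usage sketch with hypothetical data (not from the original post): the
#    function expects an (n, 2) array of 2-D coordinates plus n matching labels,
#    and writes the scatter plot to tsne.png by default; matplotlib.pyplot must
#    already be imported as plt."""
#    demo_coords = np.array([[0.0, 0.0], [1.0, 2.0]])
#    plot_with_labels(demo_coords, ['词A', '词B'], filename='demo.png')
#==============================================================================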
   
if __name__=='__main__':
    """sample.txt为分词语料"""
    filename = "sample.txt"
    vocabulary=read_data(filename)

    vocabulary_size=50000
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,vocabulary_size)

    """删除变量vocabulary,而不是数据"""
    del vocabulary

    data_index = 0
#==============================================================================
#    """测试generate_batch函数"""
#    batch,labels=generate_batch(batch_size=8,num_skips=2,skip_window=1)
#    for i in range(8):
#        print(batch[i],reverse_dictionary[batch[i]],'->',labels[i,0],reverse_dictionary[labels[i,0]])
#==============================================================================
    
    """构造神经网络"""
    batch_size=128
    """embedding表示词向量维度,skip_window表示左右窗口大小,num_skips表示每个窗口取几个词"""
    embedding_size=128
    skip_window=1
    num_skips=2
    
    valid_size = 16        # Random set of words to evaluate similarity on.
    valid_window = 100     # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    num_sampled = 64      # Number of negative examples to sample.
    
    graph = tf.Graph()
    with graph.as_default():
        """placeholder用来放置网络使用过程的数据"""
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        """TensorFlow指定特定CPU进行计算,默认为#0"""
        with tf.device('/cpu:0'):
            """词向量,词典大小*词向量维数"""
            embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            """根据train_inputs查找embedding""" 
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            """构造网络"""
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        """定义lost function,"""
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))
        """定义优化方法"""
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        """norm化,每一行平方求和再开方"""
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        """找到评估的几个词向量"""
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        """相似度矩阵,得到每个待评估的词和所有词的相似度"""
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        # Add variable initializer. 
        init = tf.global_variables_initializer()
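#==============================================================================
#    """Why the matmul above yields cosine similarity (explanatory sketch, not
#    from the original post): once each row of embeddings is divided by its L2
#    norm, the dot product of two rows equals cos(theta) between the vectors."""
#    a = np.array([3.0, 4.0]); b = np.array([4.0, 3.0])
#    a_n, b_n = a / np.linalg.norm(a), b / np.linalg.norm(b)
#    print(np.dot(a_n, b_n))   # 0.96, the cosine similarity of a and b
#==============================================================================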
    
    num_steps=40001
    
#==============================================================================
    """创建会话,开始训练"""   
    with tf.Session(graph=graph) as session:
        """初始化所有变量"""
        init.run()
        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
            """运行依次迭代,指定loss函数,训练方法,初始数据""" 
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val
     
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
            # Note that this is expensive (~20% slowdown if computed every 500 steps) 
            if step % 10000 == 0:
                """计算similarity,结果是[评估个数*词数]"""
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    #print(len(reverse_dictionary))
                    top_k = 8
                    """每个词的top_k个最相似词"""  
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        if nearest[k]<len(reverse_dictionary):
                            close_word=reverse_dictionary[nearest[k]]
                            log_str='%s %s,'%(log_str,close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
#==============================================================================

#==============================================================================
    try:
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt
          
        # Make matplotlib render Chinese characters correctly
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        """降维可视化"""
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
        plot_only = 300
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in range(plot_only)]
        plot_with_labels(low_dim_embs, labels)
         
    except ImportError:
        print('Please install sklearn, matplotlib, and scipy to show embeddings.')
#==============================================================================