TensorFlow Word2Vec Implementation: Generating Word Vectors and Visualizing Them with Dimensionality Reduction

The code below comes from the book *TensorFlow实战* and the open-source TensorFlow implementation on GitHub. It is concise and very readable, and is a useful reference for anyone studying NLP, TensorFlow, or Python programming.

import zipfile
import collections
import tensorflow as tf
import numpy as np
import math
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


data_index = 0
vocabulary_size = 50000
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # read the first file in the archive and split it into a list of words
    return data

def build_dataset(words):
    count = [["UNK",-1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))  # keep the 50,000 most frequent words
    dictionary = dict()
    for word,_ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary=dict(zip(dictionary.values(),dictionary.keys()))
    return data,count,dictionary,reverse_dictionary

words = read_data("text8.zip")
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words
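# --- Optional sanity check (illustrative addition, not part of the original listing) ---
# Print the five most frequent tokens and the first ten words of the encoded corpus
# to confirm that build_dataset produced sensible results.
print("Most common words (+UNK):", count[:5])
print("Sample data:", data[:10], [reverse_dictionary[i] for i in data[:10]])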

# Generate one batch of batch_size training samples.
# Each target word yields num_skips (target, context) pairs, with context words
# drawn from a window of skip_window words on either side of the target.
# A deque of length span acts as a sliding window that advances by one word
# once a target word has been fully processed.
def generate_batch(batch_size,num_skips,skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size),dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1),dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0,span-1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j]=buffer[skip_window]
            labels[i * num_skips + j,0]=buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
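# --- Optional demo (illustrative addition, not part of the original listing) ---
# Draw one tiny batch and print each (target -> context) pair as words to see
# what the sliding window actually samples.
demo_batch, demo_labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(reverse_dictionary[demo_batch[i]], "->", reverse_dictionary[demo_labels[i, 0]])
data_index = 0  # reset the global cursor so training starts from the beginning of the corpus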

batch_size=128
embedding_size=128  # dimensionality of the word vectors
skip_window=1  # number of context words considered on each side of the target
num_skips=2  # number of (target, context) samples generated per target word

valid_size=16  # number of words used for validation
valid_window=100  # validation words are drawn from the 100 most frequent words
valid_examples=np.random.choice(valid_window,valid_size,replace=False)  # pick 16 distinct random ids in [0, 100) as the validation set
num_sampled=64  # number of negative (noise) words sampled for the NCE loss


graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32,shape=[batch_size])
    train_labels = tf.placeholder(tf.int32,shape=[batch_size,1])
    valid_dataset = tf.constant(valid_examples,dtype=tf.int32)
    
    with tf.device("/cpu:0"):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))  # initialize embeddings uniformly in [-1, 1]
        embed = tf.nn.embedding_lookup(embeddings,train_inputs)
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],
                                                     stddev=1.0/math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
        # NCE loss: for each (target, context) pair, num_sampled noise words are drawn
        # and the model is trained to score the true context word above the noise words
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,  
                                            biases=nce_biases,
                                            labels=train_labels,
                                            inputs=embed,
                                            num_sampled=num_sampled,
                                            num_classes=vocabulary_size))
        
        # minimize the loss with SGD (learning rate 1.0), raising the probability of true context words and lowering that of noise words
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
        # L2 norm of each embedding vector
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        
        # normalize the embeddings to unit length
        normalized_embeddings = embeddings / norm
        
        # look up the normalized embeddings of the validation words
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        
        # cosine similarity between each validation word and every word in the vocabulary
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        init = tf.global_variables_initializer()
    
num_steps = 100001  # maximum number of training steps (100,000 iterations)
with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000  # average loss over the last 2,000 steps
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8   # number of nearest neighbours to report for each validation word
                # negate so an ascending argsort yields the most similar words first; index 0 is the word itself, so skip it
                nearest = (-sim[i, :]).argsort()[1:top_k+1]  
                log_str = "Nearest to %s:" % valid_word
                
                # collect the top_k words closest to this validation word
                for k in range(top_k):  
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
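
# --- Optional post-training query (illustrative addition, not part of the original listing) ---
# final_embeddings is a plain NumPy matrix of unit-length rows, so cosine similarity
# is just a dot product; "three" below is an arbitrary example query word.
def nearest_words(word, top_k=8):
    vec = final_embeddings[dictionary[word]]
    sims = np.dot(final_embeddings, vec)
    nearest = (-sims).argsort()[1:top_k + 1]  # index 0 is the word itself, skip it
    return [reverse_dictionary[i] for i in nearest]

print(nearest_words("three"))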

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))
    plt.rcParams["font.sans-serif"] = ["Arial"]
    plt.rcParams["axes.unicode_minus"] = False
    font = {"size":15}
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y,s=60)
        plt.text(x,y,label,font)
    plt.savefig(filename)
    
# Reduce the embedding matrix to two dimensions with t-SNE.
# perplexity=30 roughly means each point is compared against its ~30 nearest neighbours.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 200   # visualize only the 200 most frequent words
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
Sample training output:

Initialized
Average loss at step  0 :  299.19677734375
Nearest to that: act, has, also, up, early, is, destroy, self,
Nearest to united: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to only: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to between: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to in: also, in, destroy, it, radicals, abuse, first, up,
Nearest to if: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to can: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to had: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to first: been, and, whilst, including, is, french, abuse, in,
Nearest to seven: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to is: also, label, been, that, any, UNK, act, first,
Nearest to into: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to has: against, abuse, act, first, destroy, taken, is, diggers,
Nearest to i: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to the: of, used, a, to, revolution, as, term, radicals,
Nearest to world: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Average loss at step  2000 :  119.57253617954254
Average loss at step  4000 :  65.94876452064514
Average loss at step  6000 :  38.677242650270465
Average loss at step  8000 :  27.363487136363982
Average loss at step  10000 :  20.721028483152388
Nearest to that: but, which, when, it, because, often, however, if,
Nearest to united: military, public, british, city, german, union, through, among,
Nearest to only: but, however, so, very, about, both, him, film,
Nearest to between: including, under, through, against, before, german, british, national,
Nearest to in: during, from, after, under, against, since, at, like,
Nearest to if: where, became, although, then, because, however, when, before,
Nearest to can: could, would, will, may, if, so, although, made,
Nearest to had: has, when, have, if, became, where, were, because,
Nearest to first: name, over, film, both, during, under, human, great,
Nearest to seven: six, eight, five, c, four, three, over, years,
Nearest to is: are, was, if, has, where, longer, but, when,
Nearest to into: like, became, including, through, without, major, under, large,
Nearest to has: had, have, if, when, became, where, although, because,
Nearest to i: so, t, film, even, still, government, then, very,
Nearest to the: its, this, several, each, out, any, all, many,
Nearest to world: government, state, name, film, down, great, human, modern,
Average loss at step  12000 :  17.01681400346756
Average loss at step  14000 :  14.1600362906456
Average loss at step  16000 :  11.939597959756851
Average loss at step  18000 :  11.16051618885994
Average loss at step  20000 :  10.807074008107186
Nearest to that: but, which, however, when, because, then, if, what,
Nearest to united: following, military, former, army, christian, southern, jewish, various,
Nearest to only: even, then, man, very, so, still, use, being,
Nearest to between: around, under, including, against, within, through, upon, among,
Nearest to in: during, under, among, through, like, at, between, against,
Nearest to if: though, when, where, before, became, although, since, however,
Nearest to can: should, will, could, must, would, may, might, though,
Nearest to had: has, have, having, if, longer, though, became, since,
Nearest to first: last, second, following, great, same, over, name, right,
Nearest to seven: eight, six, four, x, p, five, o, nine,
Nearest to is: was, longer, though, became, if, are, although, while,
Nearest to into: through, without, against, including, like, within, under, upon,
Nearest to has: had, having, though, since, became, have, longer, if,
Nearest to i: ii, t, v, n, g, king, thus, god,
Nearest to the: its, each, another, our, government, any, several, part,
Nearest to world: ii, government, great, book, public, law, earth, country,
Average loss at step  22000 :  10.05287909913063
Average loss at step  24000 :  9.349426787853242
Average loss at step  26000 :  9.21153033208847
Average loss at step  28000 :  8.465280988335609
Average loss at step  30000 :  8.530597537875176
Nearest to that: however, which, what, because, but, thus, then, if,
Nearest to united: southern, throughout, former, physical, jewish, army, legal, following,
Nearest to only: just, still, man, god, even, being, then, usually,
Nearest to between: within, around, against, under, including, among, through, using,
Nearest to in: under, during, among, against, through, including, between, within,
Nearest to if: though, when, before, although, however, because, became, where,
Nearest to can: will, could, must, should, would, may, cannot, might,
Nearest to had: has, having, have, never, longer, became, since, took,
Nearest to first: last, following, second, original, current, name, school, great,
Nearest to seven: six, eight, five, nine, four, zero, p, three,
Nearest to is: was, became, longer, does, being, though, means, while,
Nearest to into: through, under, against, without, upon, including, within, like,
Nearest to has: had, having, have, became, since, though, never, longer,
Nearest to i: t, g, ii, you, n, v, god, r,
Nearest to the: our, its, east, each, every, empire, another, various,
Nearest to world: government, school, ii, great, battle, law, country, character,
Average loss at step  32000 :  8.168832635045051
Average loss at step  34000 :  8.242317155838013
Average loss at step  36000 :  7.890609099388122
Average loss at step  38000 :  7.404720583796501
Average loss at step  40000 :  7.517167312383652
Nearest to that: which, what, however, then, because, but, thus, how,
Nearest to united: independent, southern, throughout, arab, former, christian, legal, eastern,
Nearest to only: just, god, then, thus, man, even, another, itself,
Nearest to between: within, against, around, towards, throughout, among, under, including,
Nearest to in: under, during, until, since, within, around, throughout, through,
Nearest to if: though, when, before, then, without, because, thus, within,
Nearest to can: will, could, must, would, should, might, cannot, may,
Nearest to had: has, having, have, never, since, became, longer, wrote,
Nearest to first: last, second, original, next, following, current, final, third,
Nearest to seven: six, five, eight, four, nine, zero, three, flight,
Nearest to is: became, was, being, does, means, longer, remains, includes,
Nearest to into: through, against, upon, without, within, under, throughout, towards,
Nearest to has: had, have, having, never, since, became, through, without,
Nearest to i: t, you, ii, we, g, n, god, then,
Nearest to the: our, each, another, every, my, service, whose, level,
Nearest to world: battle, country, ii, school, character, post, book, god,
Average loss at step  42000 :  7.3395823111534115
Average loss at step  44000 :  7.206046066641807
Average loss at step  46000 :  7.342387794137001
Average loss at step  48000 :  6.962286033987999
Average loss at step  50000 :  7.074249260544777
Nearest to that: which, what, however, thus, but, then, because, how,
Nearest to united: northern, southern, independent, arab, federal, indian, across, royal,
Nearest to only: just, either, usually, thus, now, play, god, still,
Nearest to between: within, against, around, across, under, towards, among, throughout,
Nearest to in: during, under, within, until, throughout, around, against, near,
Nearest to if: when, though, before, thus, while, re, although, above,
Nearest to can: could, must, will, should, might, would, cannot, may,
Nearest to had: has, having, have, never, ever, won, yet, once,
Nearest to first: last, second, third, next, final, original, current, field,
Nearest to seven: eight, six, four, uk, five, zero, nine, three,
Nearest to is: was, became, remains, does, being, includes, becomes, means,
Nearest to into: through, under, against, within, upon, back, off, across,
Nearest to has: had, having, have, never, since, includes, does, yet,
Nearest to i: you, t, we, ii, g, n, david, v,
Nearest to the: our, your, each, base, my, whose, australia, society,
Nearest to world: country, battle, post, game, character, ii, largest, team,
Average loss at step  52000 :  7.1185582935810086
Average loss at step  54000 :  7.046169123888015
Average loss at step  56000 :  6.70773850774765
Average loss at step  58000 :  6.723229277849198
Average loss at step  60000 :  6.534247406482697
Nearest to that: which, what, however, this, thus, then, how, it,
Nearest to united: independent, arab, federal, southern, indian, across, northern, royal,
Nearest to only: just, either, always, thus, usually, actually, true, around,
Nearest to between: against, within, around, among, across, throughout, under, towards,
Nearest to in: within, during, throughout, until, under, around, near, towards,
Nearest to if: when, though, before, thus, because, above, re, whether,
Nearest to can: must, could, might, cannot, should, will, would, may,
Nearest to had: has, having, have, already, yet, received, never, ever,
Nearest to first: last, second, next, third, final, original, design, book,
Nearest to seven: eight, six, five, nine, four, three, deaths, zero,
Nearest to is: remains, does, becomes, was, considered, includes, contains, became,
Nearest to into: through, under, against, back, within, across, down, off,
Nearest to has: had, having, have, already, yet, under, does, includes,
Nearest to i: you, ii, t, we, iii, g, r, god,
Nearest to the: our, your, space, whose, mass, full, fire, class,
Nearest to world: battle, post, country, pre, england, company, ii, earth,
Average loss at step  62000 :  6.462325292706489
Average loss at step  64000 :  6.341322261810303
Average loss at step  66000 :  6.696628123760223
Average loss at step  68000 :  6.478848235368728
Average loss at step  70000 :  6.287410691618919
Nearest to that: which, what, however, thus, how, legal, then, itself,
Nearest to united: arab, member, independent, federal, canadian, across, royal, southern,
Nearest to only: either, just, thus, play, run, here, mostly, always,
Nearest to between: within, around, against, across, among, with, via, towards,
Nearest to in: within, during, throughout, until, under, despite, near, along,
Nearest to if: though, when, although, because, thus, without, before, did,
Nearest to can: could, must, might, cannot, should, may, will, would,
Nearest to had: has, having, have, already, ever, yet, received, never,
Nearest to first: last, second, next, third, final, original, current, full,
Nearest to seven: six, eight, nine, four, five, three, zero, bit,
Nearest to is: remains, was, becomes, does, contains, makes, became, includes,
Nearest to into: through, back, against, down, across, off, under, towards,
Nearest to has: had, having, have, yet, already, received, contains, includes,
Nearest to i: ii, you, t, g, h, iii, we, god,
Nearest to the: our, your, single, whose, full, mass, code, my,
Nearest to world: post, battle, pre, country, ii, philosophy, england, office,
Average loss at step  72000 :  6.348856044650078
Average loss at step  74000 :  6.357206651926041
Average loss at step  76000 :  6.585670741438865
Average loss at step  78000 :  6.379361570596695
Average loss at step  80000 :  6.503398251891136
Nearest to that: which, however, what, thus, itself, then, how, nor,
Nearest to united: arab, member, independent, royal, federal, canadian, across, southern,
Nearest to only: thus, just, either, true, therefore, actually, play, always,
Nearest to between: within, around, across, against, behind, among, with, via,
Nearest to in: within, during, throughout, until, around, under, near, across,
Nearest to if: when, though, whether, did, thus, because, before, re,
Nearest to can: could, might, must, cannot, should, would, will, may,
Nearest to had: has, having, have, already, yet, recently, received, ever,
Nearest to first: last, second, next, third, final, full, original, single,
Nearest to seven: six, eight, five, four, nine, three, zero, car,
Nearest to is: remains, becomes, was, does, contains, makes, being, includes,
Nearest to into: through, back, across, down, off, towards, within, under,
Nearest to has: had, having, have, already, yet, contains, becomes, since,
Nearest to i: ii, you, iii, we, david, v, g, t,
Nearest to the: our, your, fire, flight, sex, mass, israel, running,
Nearest to world: country, battle, pre, post, philosophy, season, era, company,
Average loss at step  82000 :  6.230700287342072
Average loss at step  84000 :  6.302620532751083
Average loss at step  86000 :  7.993966495275497
Average loss at step  88000 :  5.879935008049011
Average loss at step  90000 :  6.398025431632996
Nearest to that: which, what, however, thus, itself, nor, actually, how,
Nearest to united: arab, member, southern, royal, independent, federal, indian, constitution,
Nearest to only: thus, just, either, therefore, run, actually, true, play,
Nearest to between: within, around, across, with, against, via, behind, toward,
Nearest to in: within, during, throughout, until, near, around, under, across,
Nearest to if: when, though, becomes, because, although, containing, did, despite,
Nearest to can: must, could, cannot, might, should, may, will, would,
Nearest to had: has, having, have, yet, already, recently, finally, ever,
Nearest to first: last, second, next, final, third, movie, original, past,
Nearest to seven: six, eight, five, four, nine, three, movie, zero,
Nearest to is: remains, becomes, contains, was, makes, does, includes, appears,
Nearest to into: through, back, down, across, off, towards, via, around,
Nearest to has: had, having, have, contains, yet, already, includes, previously,
Nearest to i: you, ii, g, iii, we, r, god, david,
Nearest to the: our, your, whose, mass, fire, sex, single, zone,
Nearest to world: pre, country, battle, post, philosophy, england, cold, era,
Average loss at step  92000 :  6.097939577579498
Average loss at step  94000 :  5.983247451663018
Average loss at step  96000 :  6.013742367982864
Average loss at step  98000 :  6.240200996875763
Average loss at step  100000 :  5.976171571850776
Nearest to that: what, which, however, why, itself, thus, actually, nor,
Nearest to united: arab, member, southern, royal, independent, federal, constitution, nation,
Nearest to only: just, thus, either, actually, run, therefore, another, mostly,
Nearest to between: within, across, around, with, containing, behind, against, toward,
Nearest to in: within, during, throughout, across, until, near, towards, around,
Nearest to if: when, though, containing, because, therefore, perhaps, becomes, without,
Nearest to can: could, must, cannot, might, may, should, will, would,
Nearest to had: has, having, have, recently, finally, yet, already, ever,
Nearest to first: last, next, second, final, third, magazine, past, title,
Nearest to seven: six, eight, four, five, nine, three, iv, zero,
Nearest to is: becomes, was, remains, contains, makes, does, appears, includes,
Nearest to into: through, across, back, towards, down, via, off, toward,
Nearest to has: had, having, have, previously, already, recently, yet, finally,
Nearest to i: you, ii, we, iii, me, god, v, g,
Nearest to the: your, our, whose, my, records, business, israel, sex,
Nearest to world: pre, country, philosophy, battle, cold, era, novel, season,

[Figure: t-SNE visualization of the 200 most frequent word embeddings (tsne.png)]
References:
sklearn.manifold.TSNE official documentation
TensorFlow official documentation

