以下代码来源于《TensorFlow实战》,来自GitHub上的TensorFlow开源实现,代码非常简洁,可读性高,对于研究NLP、TensorFlow、Python编程等有很大帮助。
import zipfile
import collections
import tensorflow as tf
import numpy as np
import math
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Cursor into the encoded corpus; generate_batch reads from this position and
# advances it (modulo the corpus length), so successive batches continue where
# the previous one stopped.
data_index = 0
# Vocabulary size: the "UNK" entry plus the (vocabulary_size - 1) most frequent words.
vocabulary_size = 50000
def read_data(filename):
    """Read the first member of a zip archive and split it into tokens.

    Args:
        filename: Path of the zip archive to open.

    Returns:
        A list of strings obtained by splitting the decoded contents of the
        archive's first member on whitespace.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as f:
        # Only the first member is read; any further members are ignored.
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
def build_dataset(words):
    """Encode a token list as integer ids over a frequency-limited vocabulary.

    Id 0 is reserved for "UNK"; the (vocabulary_size - 1) most frequent tokens
    receive ids 1, 2, ... in descending frequency order. Every other token is
    encoded as 0.

    Args:
        words: Sequence of string tokens.

    Returns:
        A tuple (data, count, dictionary, reverse_dictionary):
        data: list of ids, one per input token.
        count: list whose first entry is ["UNK", number_of_out_of_vocabulary_tokens]
            followed by the (word, frequency) pairs from Counter.most_common.
        dictionary: mapping word -> id.
        reverse_dictionary: mapping id -> word.
    """
    count = [["UNK", -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    dictionary = {}
    for word, _ in count:
        # setdefault keeps ids unique even if the literal token "UNK" is itself
        # among the most common words: a plain assignment would overwrite id 0
        # and then hand the same id to the next word.
        dictionary.setdefault(word, len(dictionary))

    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Out-of-vocabulary token: map to the UNK id and count it.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary
# Load the corpus tokens from the zip archive (path is relative to the working directory).
words = read_data("text8.zip")
# Encode the tokens as integer ids and build the word<->id lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)
# The raw token list is not referenced again below; drop the name so it can be reclaimed.
del words
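# Build one batch of batch_size (center word, context word) training pairs.
# Each center word contributes num_skips pairs whose context words are drawn at random
# from the skip_window words on either side of it (with skip_window=1 and num_skips=2
# these are exactly the previous word and the next word).
# A fixed-length deque holds the current window; appending the next corpus word slides
# the window forward by one position after each center word has been processed.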
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one batch of skip-gram (center id, context id) pairs.

    Reads from the module-level ``data`` list starting at the module-level
    ``data_index`` cursor and advances that cursor (wrapping around at the end
    of ``data``), so consecutive calls walk through the corpus.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of num_skips.
        num_skips: Number of pairs generated for each center word; must not
            exceed 2 * skip_window.
        skip_window: Number of words considered on each side of the center word.

    Returns:
        A tuple (batch, labels): ``batch`` is an int32 array of shape
        (batch_size,) holding center-word ids, ``labels`` is an int32 array of
        shape (batch_size, 1) holding the matching context-word ids.

    Raises:
        AssertionError: If the constraints on batch_size / num_skips are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | center | skip_window ]
    corpus_len = len(data)

    # Fill the sliding window; maxlen makes later appends evict the oldest word.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % corpus_len

    for i in range(batch_size // num_skips):
        target = skip_window  # start on the center so the first draw is forced
        targets_to_avoid = [skip_window]  # never pair the center with itself
        for j in range(num_skips):
            # Draw window positions until one not yet used for this center is found.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window forward by one word for the next center.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % corpus_len
    return batch, labels
batch_size=128
embedding_size=128 # dimensionality of each word vector
skip_window=1 # number of context words considered on each side of the center word
num_skips=2 # number of (center, context) samples generated per center word
valid_size=16 # number of words used for the nearest-neighbour check
valid_window=100 # validation ids are drawn from the 100 lowest ids (ids are assigned in frequency order)
valid_examples=np.random.choice(valid_window,valid_size,replace=False) # 16 distinct random ids in [0, 100) used as the validation set
num_sampled=64 # number of negative (noise) words sampled for the NCE loss
# Build the skip-gram training graph: embedding lookup, NCE loss, SGD update,
# and a cosine-similarity op for inspecting nearest neighbours.
graph = tf.Graph()
with graph.as_default():
    # One batch of center-word ids and the matching context-word ids.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    with tf.device("/cpu:0"):
        # Embeddings start uniformly distributed in [-1, 1). The bounds must
        # differ: with minval == maxval every row starts as the same vector.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Mean NCE loss over the batch, using num_sampled sampled noise words.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    # Plain gradient descent with learning rate 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Row-wise L2 norm of the embeddings, kept as a column for broadcasting.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    # Unit-length embeddings, so a dot product equals cosine similarity.
    normalized_embeddings = embeddings / norm
    # Normalized vectors of the validation words.
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    # Similarity between every validation word and every vocabulary word.
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()
num_steps = 100001  # training iterations (steps 0 .. 100000 inclusive)
with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")
    average_loss = 0
    top_k = 8  # number of nearest neighbours reported per validation word
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # One optimisation step; also fetch the loss value for reporting.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                # Mean loss over the last 2000 steps (at step 0 the raw loss is shown).
                average_loss /= 2000
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Negate so argsort orders by descending similarity; position 0
                # is skipped and the following top_k ids are kept.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    # Evaluate while the session is still open.
    final_embeddings = normalized_embeddings.eval()
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points with a text label at each point and save the figure.

    Args:
        low_dim_embs: Array whose row i holds the (x, y) coordinates for labels[i].
        labels: Sequence of label strings; must not be longer than the number of rows.
        filename: Path of the image file to write.

    Raises:
        AssertionError: If there are more labels than rows in low_dim_embs.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))
    plt.rcParams["font.sans-serif"] = ["Arial"]
    plt.rcParams["axes.unicode_minus"] = False
    font = {"size": 15}
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y, s=60)
        plt.text(x, y, label, fontdict=font)
    plt.savefig(filename)
# Reduce the word-vector matrix to two dimensions with t-SNE for plotting.
# perplexity=30 is t-SNE's neighbourhood-size setting (roughly the effective number of neighbours per point).
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 200 # only the first 200 vocabulary ids are plotted (ids follow frequency order, so these are the most frequent words)
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
Initialized
Average loss at step 0 : 299.19677734375
Nearest to that: act, has, also, up, early, is, destroy, self,
Nearest to united: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to only: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to between: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to in: also, in, destroy, it, radicals, abuse, first, up,
Nearest to if: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to can: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to had: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to first: been, and, whilst, including, is, french, abuse, in,
Nearest to seven: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to is: also, label, been, that, any, UNK, act, first,
Nearest to into: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to has: against, abuse, act, first, destroy, taken, is, diggers,
Nearest to i: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Nearest to the: of, used, a, to, revolution, as, term, radicals,
Nearest to world: adad, bataan, bawdy, apologia, salvo, emptive, detonating, plasmids,
Average loss at step 2000 : 119.57253617954254
Average loss at step 4000 : 65.94876452064514
Average loss at step 6000 : 38.677242650270465
Average loss at step 8000 : 27.363487136363982
Average loss at step 10000 : 20.721028483152388
Nearest to that: but, which, when, it, because, often, however, if,
Nearest to united: military, public, british, city, german, union, through, among,
Nearest to only: but, however, so, very, about, both, him, film,
Nearest to between: including, under, through, against, before, german, british, national,
Nearest to in: during, from, after, under, against, since, at, like,
Nearest to if: where, became, although, then, because, however, when, before,
Nearest to can: could, would, will, may, if, so, although, made,
Nearest to had: has, when, have, if, became, where, were, because,
Nearest to first: name, over, film, both, during, under, human, great,
Nearest to seven: six, eight, five, c, four, three, over, years,
Nearest to is: are, was, if, has, where, longer, but, when,
Nearest to into: like, became, including, through, without, major, under, large,
Nearest to has: had, have, if, when, became, where, although, because,
Nearest to i: so, t, film, even, still, government, then, very,
Nearest to the: its, this, several, each, out, any, all, many,
Nearest to world: government, state, name, film, down, great, human, modern,
Average loss at step 12000 : 17.01681400346756
Average loss at step 14000 : 14.1600362906456
Average loss at step 16000 : 11.939597959756851
Average loss at step 18000 : 11.16051618885994
Average loss at step 20000 : 10.807074008107186
Nearest to that: but, which, however, when, because, then, if, what,
Nearest to united: following, military, former, army, christian, southern, jewish, various,
Nearest to only: even, then, man, very, so, still, use, being,
Nearest to between: around, under, including, against, within, through, upon, among,
Nearest to in: during, under, among, through, like, at, between, against,
Nearest to if: though, when, where, before, became, although, since, however,
Nearest to can: should, will, could, must, would, may, might, though,
Nearest to had: has, have, having, if, longer, though, became, since,
Nearest to first: last, second, following, great, same, over, name, right,
Nearest to seven: eight, six, four, x, p, five, o, nine,
Nearest to is: was, longer, though, became, if, are, although, while,
Nearest to into: through, without, against, including, like, within, under, upon,
Nearest to has: had, having, though, since, became, have, longer, if,
Nearest to i: ii, t, v, n, g, king, thus, god,
Nearest to the: its, each, another, our, government, any, several, part,
Nearest to world: ii, government, great, book, public, law, earth, country,
Average loss at step 22000 : 10.05287909913063
Average loss at step 24000 : 9.349426787853242
Average loss at step 26000 : 9.21153033208847
Average loss at step 28000 : 8.465280988335609
Average loss at step 30000 : 8.530597537875176
Nearest to that: however, which, what, because, but, thus, then, if,
Nearest to united: southern, throughout, former, physical, jewish, army, legal, following,
Nearest to only: just, still, man, god, even, being, then, usually,
Nearest to between: within, around, against, under, including, among, through, using,
Nearest to in: under, during, among, against, through, including, between, within,
Nearest to if: though, when, before, although, however, because, became, where,
Nearest to can: will, could, must, should, would, may, cannot, might,
Nearest to had: has, having, have, never, longer, became, since, took,
Nearest to first: last, following, second, original, current, name, school, great,
Nearest to seven: six, eight, five, nine, four, zero, p, three,
Nearest to is: was, became, longer, does, being, though, means, while,
Nearest to into: through, under, against, without, upon, including, within, like,
Nearest to has: had, having, have, became, since, though, never, longer,
Nearest to i: t, g, ii, you, n, v, god, r,
Nearest to the: our, its, east, each, every, empire, another, various,
Nearest to world: government, school, ii, great, battle, law, country, character,
Average loss at step 32000 : 8.168832635045051
Average loss at step 34000 : 8.242317155838013
Average loss at step 36000 : 7.890609099388122
Average loss at step 38000 : 7.404720583796501
Average loss at step 40000 : 7.517167312383652
Nearest to that: which, what, however, then, because, but, thus, how,
Nearest to united: independent, southern, throughout, arab, former, christian, legal, eastern,
Nearest to only: just, god, then, thus, man, even, another, itself,
Nearest to between: within, against, around, towards, throughout, among, under, including,
Nearest to in: under, during, until, since, within, around, throughout, through,
Nearest to if: though, when, before, then, without, because, thus, within,
Nearest to can: will, could, must, would, should, might, cannot, may,
Nearest to had: has, having, have, never, since, became, longer, wrote,
Nearest to first: last, second, original, next, following, current, final, third,
Nearest to seven: six, five, eight, four, nine, zero, three, flight,
Nearest to is: became, was, being, does, means, longer, remains, includes,
Nearest to into: through, against, upon, without, within, under, throughout, towards,
Nearest to has: had, have, having, never, since, became, through, without,
Nearest to i: t, you, ii, we, g, n, god, then,
Nearest to the: our, each, another, every, my, service, whose, level,
Nearest to world: battle, country, ii, school, character, post, book, god,
Average loss at step 42000 : 7.3395823111534115
Average loss at step 44000 : 7.206046066641807
Average loss at step 46000 : 7.342387794137001
Average loss at step 48000 : 6.962286033987999
Average loss at step 50000 : 7.074249260544777
Nearest to that: which, what, however, thus, but, then, because, how,
Nearest to united: northern, southern, independent, arab, federal, indian, across, royal,
Nearest to only: just, either, usually, thus, now, play, god, still,
Nearest to between: within, against, around, across, under, towards, among, throughout,
Nearest to in: during, under, within, until, throughout, around, against, near,
Nearest to if: when, though, before, thus, while, re, although, above,
Nearest to can: could, must, will, should, might, would, cannot, may,
Nearest to had: has, having, have, never, ever, won, yet, once,
Nearest to first: last, second, third, next, final, original, current, field,
Nearest to seven: eight, six, four, uk, five, zero, nine, three,
Nearest to is: was, became, remains, does, being, includes, becomes, means,
Nearest to into: through, under, against, within, upon, back, off, across,
Nearest to has: had, having, have, never, since, includes, does, yet,
Nearest to i: you, t, we, ii, g, n, david, v,
Nearest to the: our, your, each, base, my, whose, australia, society,
Nearest to world: country, battle, post, game, character, ii, largest, team,
Average loss at step 52000 : 7.1185582935810086
Average loss at step 54000 : 7.046169123888015
Average loss at step 56000 : 6.70773850774765
Average loss at step 58000 : 6.723229277849198
Average loss at step 60000 : 6.534247406482697
Nearest to that: which, what, however, this, thus, then, how, it,
Nearest to united: independent, arab, federal, southern, indian, across, northern, royal,
Nearest to only: just, either, always, thus, usually, actually, true, around,
Nearest to between: against, within, around, among, across, throughout, under, towards,
Nearest to in: within, during, throughout, until, under, around, near, towards,
Nearest to if: when, though, before, thus, because, above, re, whether,
Nearest to can: must, could, might, cannot, should, will, would, may,
Nearest to had: has, having, have, already, yet, received, never, ever,
Nearest to first: last, second, next, third, final, original, design, book,
Nearest to seven: eight, six, five, nine, four, three, deaths, zero,
Nearest to is: remains, does, becomes, was, considered, includes, contains, became,
Nearest to into: through, under, against, back, within, across, down, off,
Nearest to has: had, having, have, already, yet, under, does, includes,
Nearest to i: you, ii, t, we, iii, g, r, god,
Nearest to the: our, your, space, whose, mass, full, fire, class,
Nearest to world: battle, post, country, pre, england, company, ii, earth,
Average loss at step 62000 : 6.462325292706489
Average loss at step 64000 : 6.341322261810303
Average loss at step 66000 : 6.696628123760223
Average loss at step 68000 : 6.478848235368728
Average loss at step 70000 : 6.287410691618919
Nearest to that: which, what, however, thus, how, legal, then, itself,
Nearest to united: arab, member, independent, federal, canadian, across, royal, southern,
Nearest to only: either, just, thus, play, run, here, mostly, always,
Nearest to between: within, around, against, across, among, with, via, towards,
Nearest to in: within, during, throughout, until, under, despite, near, along,
Nearest to if: though, when, although, because, thus, without, before, did,
Nearest to can: could, must, might, cannot, should, may, will, would,
Nearest to had: has, having, have, already, ever, yet, received, never,
Nearest to first: last, second, next, third, final, original, current, full,
Nearest to seven: six, eight, nine, four, five, three, zero, bit,
Nearest to is: remains, was, becomes, does, contains, makes, became, includes,
Nearest to into: through, back, against, down, across, off, under, towards,
Nearest to has: had, having, have, yet, already, received, contains, includes,
Nearest to i: ii, you, t, g, h, iii, we, god,
Nearest to the: our, your, single, whose, full, mass, code, my,
Nearest to world: post, battle, pre, country, ii, philosophy, england, office,
Average loss at step 72000 : 6.348856044650078
Average loss at step 74000 : 6.357206651926041
Average loss at step 76000 : 6.585670741438865
Average loss at step 78000 : 6.379361570596695
Average loss at step 80000 : 6.503398251891136
Nearest to that: which, however, what, thus, itself, then, how, nor,
Nearest to united: arab, member, independent, royal, federal, canadian, across, southern,
Nearest to only: thus, just, either, true, therefore, actually, play, always,
Nearest to between: within, around, across, against, behind, among, with, via,
Nearest to in: within, during, throughout, until, around, under, near, across,
Nearest to if: when, though, whether, did, thus, because, before, re,
Nearest to can: could, might, must, cannot, should, would, will, may,
Nearest to had: has, having, have, already, yet, recently, received, ever,
Nearest to first: last, second, next, third, final, full, original, single,
Nearest to seven: six, eight, five, four, nine, three, zero, car,
Nearest to is: remains, becomes, was, does, contains, makes, being, includes,
Nearest to into: through, back, across, down, off, towards, within, under,
Nearest to has: had, having, have, already, yet, contains, becomes, since,
Nearest to i: ii, you, iii, we, david, v, g, t,
Nearest to the: our, your, fire, flight, sex, mass, israel, running,
Nearest to world: country, battle, pre, post, philosophy, season, era, company,
Average loss at step 82000 : 6.230700287342072
Average loss at step 84000 : 6.302620532751083
Average loss at step 86000 : 7.993966495275497
Average loss at step 88000 : 5.879935008049011
Average loss at step 90000 : 6.398025431632996
Nearest to that: which, what, however, thus, itself, nor, actually, how,
Nearest to united: arab, member, southern, royal, independent, federal, indian, constitution,
Nearest to only: thus, just, either, therefore, run, actually, true, play,
Nearest to between: within, around, across, with, against, via, behind, toward,
Nearest to in: within, during, throughout, until, near, around, under, across,
Nearest to if: when, though, becomes, because, although, containing, did, despite,
Nearest to can: must, could, cannot, might, should, may, will, would,
Nearest to had: has, having, have, yet, already, recently, finally, ever,
Nearest to first: last, second, next, final, third, movie, original, past,
Nearest to seven: six, eight, five, four, nine, three, movie, zero,
Nearest to is: remains, becomes, contains, was, makes, does, includes, appears,
Nearest to into: through, back, down, across, off, towards, via, around,
Nearest to has: had, having, have, contains, yet, already, includes, previously,
Nearest to i: you, ii, g, iii, we, r, god, david,
Nearest to the: our, your, whose, mass, fire, sex, single, zone,
Nearest to world: pre, country, battle, post, philosophy, england, cold, era,
Average loss at step 92000 : 6.097939577579498
Average loss at step 94000 : 5.983247451663018
Average loss at step 96000 : 6.013742367982864
Average loss at step 98000 : 6.240200996875763
Average loss at step 100000 : 5.976171571850776
Nearest to that: what, which, however, why, itself, thus, actually, nor,
Nearest to united: arab, member, southern, royal, independent, federal, constitution, nation,
Nearest to only: just, thus, either, actually, run, therefore, another, mostly,
Nearest to between: within, across, around, with, containing, behind, against, toward,
Nearest to in: within, during, throughout, across, until, near, towards, around,
Nearest to if: when, though, containing, because, therefore, perhaps, becomes, without,
Nearest to can: could, must, cannot, might, may, should, will, would,
Nearest to had: has, having, have, recently, finally, yet, already, ever,
Nearest to first: last, next, second, final, third, magazine, past, title,
Nearest to seven: six, eight, four, five, nine, three, iv, zero,
Nearest to is: becomes, was, remains, contains, makes, does, appears, includes,
Nearest to into: through, across, back, towards, down, via, off, toward,
Nearest to has: had, having, have, previously, already, recently, yet, finally,
Nearest to i: you, ii, we, iii, me, god, v, g,
Nearest to the: your, our, whose, my, records, business, israel, sex,
Nearest to world: pre, country, philosophy, battle, cold, era, novel, season,