"""Basic word2vec implementation through tensorflow"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from glob import glob
import collections
import math
import os
import sys
import time
import argparse
import random
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf
# Directory of the launched script; used so the default log directory sits
# next to the script regardless of the current working directory.
current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'output'),
    help='The log directory for TensorBoard summaries.')
# parse_known_args tolerates unrecognised command-line flags; they are kept
# in `unparsed` instead of aborting the run.
FLAGS, unparsed = parser.parse_known_args()

# Create the directory for TensorBoard summaries if it does not exist yet.
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(FLAGS.log_dir, exist_ok=True)

# Corpus files to train on. glob() returns matches in arbitrary order, so the
# list is sorted to keep word ids and training order reproducible across runs.
# NOTE(review): the pattern looks like a redacted placeholder -- confirm the
# real corpus location before running.
files_list = sorted(glob("*************"))
def read_data(path):
    """Return all whitespace-separated tokens of the text file at `path`.

    Tokens are returned as a flat list in file order; blank lines contribute
    nothing.
    """
    with open(path, 'r') as handle:
        return [token for row in handle for token in row.split()]
# Step 1: build the word -> id dictionary and its inverse.
# Id 0 is reserved for the unknown-word token 'UNK'; every other word gets the
# next free id in order of first appearance across the corpus files.
dictionary = {'UNK': 0}
# Distinct corpus words are collected in a set so the whole corpus is never
# held in memory at once.
unique_words = set()
for corpus_path in files_list:
    vocabulary = read_data(corpus_path)
    for word in vocabulary:
        if word not in dictionary:
            dictionary[word] = len(dictionary)
    unique_words.update(vocabulary)
    del vocabulary  # release this file's tokens before reading the next one
vocab_uni = list(unique_words)
del unique_words
reverse_dictionary = {index: word for word, index in dictionary.items()}
print("size of vocab is:", len(vocab_uni))

# Step 2: number of rows in the embedding table.
# Derived from the dictionary itself so that every id in [0, vocabulary_size)
# has an entry in reverse_dictionary, even when the corpus contains the
# literal token 'UNK' (len(vocab_uni) + 1 would then be one too large).
vocabulary_size = len(dictionary)
# Build the dataset: `words` is the list of all tokens, `n_words` is the
# intended dictionary size.
def build_dataset(words, n_words, dictionary):
    """Process raw inputs into a dataset.

    Args:
        words: iterable of tokens.
        n_words: accepted for interface compatibility; not used by the body.
        dictionary: mapping from token to integer id.

    Returns:
        A pair ``(data, count)``. ``data`` holds the id of every token in
        order, with 0 substituted for tokens missing from `dictionary`.
        ``count`` is ``[['UNK', k]]`` where ``k`` is the number of ids in
        ``data`` equal to 0.
    """
    # Tokens absent from the dictionary fall back to id 0.
    data = [dictionary.get(token, 0) for token in words]
    # Record how many entries ended up with id 0.
    count = [['UNK', data.count(0)]]
    return data, count
# Step 3: Function to generate a training batch for the skip-gram model.

# Cursor state used by generate_batch. Defined here so the function can be
# called before any other code has initialised it; callers reset both values
# when they switch to a new `data` list.
data_index = 0   # position in `data` where the next window starts
has_next = True  # set to False once the cursor has wrapped past the end


def generate_batch(data, batch_size, num_skips, skip_window):
    """Return one batch of (centre word, context word) training pairs.

    A window is a centre word followed by the next `skip_window` words; only
    words *after* the centre are used as context. For each window,
    `num_skips` distinct context positions are drawn at random and one pair is
    emitted per draw. The window then slides forward by one word.

    The function reads and updates the module-level `data_index` cursor and
    sets the module-level `has_next` flag to False when the cursor wraps
    around to the start of `data`.

    Args:
        data: list of integer word ids.
        batch_size: number of pairs to produce; must be a multiple of
            `num_skips`.
        num_skips: pairs generated per window; at most `skip_window`.
        skip_window: number of following words considered as context.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding centre ids and context ids respectively.

    Raises:
        AssertionError: if `batch_size` is not a multiple of `num_skips`, or
            `num_skips` exceeds ``2 * skip_window``.
        ValueError: if `num_skips` exceeds `skip_window`, or `data` is shorter
            than one window.
    """
    global data_index
    global has_next
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # Only `skip_window` context positions exist in a forward-only window, so
    # reject impossible requests up front instead of failing inside
    # random.sample after the cursor has already been moved.
    if num_skips > skip_window:
        raise ValueError(
            'num_skips (%d) cannot exceed skip_window (%d)'
            % (num_skips, skip_window))
    span = skip_window + 1  # [ centre word, skip_window context words ]
    if len(data) < span:
        raise ValueError(
            'data has %d items but one window needs %d'
            % (len(data), span))

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Sliding window; the deque drops its oldest item on every append.
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        # Not enough words left for a full window: flag the end of the pass
        # and restart from the head so the window is always completely filled.
        has_next = False
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span  # cursor now points just past the window

    # Offsets of the context words relative to the window start.
    context_words = list(range(1, span))
    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[0]  # centre word
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            # Reached the end of the data: refill from the head and report
            # that a full pass has been completed.
            buffer.extend(data[0:span])
            data_index = span
            has_next = False
        else:
            buffer.append(data[data_index])
            data_index += 1  # slide the window forward by one word
    # Backtrack a little bit to avoid skipping words in the end of a batch;
    # this also keeps the cursor inside [0, len(data)) for the next call.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
# Training hyper-parameters.
batch_size = 1024     # (centre, context) pairs fed per training step
embedding_size = 128  # dimension of each embedding vector
skip_window = 5       # context words per window; generate_batch only looks
                      # at words that follow the centre word
num_skips = 5         # pairs generated from each centre word
num_sampled = 10      # negative examples sampled for the NCE loss

# Validation words used for the nearest-neighbour printout: `valid_size`
# distinct ids drawn at random from the first `valid_window` ids
# (the window was originally 100).
valid_size = 16
valid_window = 20
valid_examples = np.random.choice(
    valid_window, size=valid_size, replace=False)
print("valid examples", valid_examples)
# Step 4: Build the skip-gram model graph.
graph = tf.Graph()
with graph.as_default():
    # Placeholders for one training batch plus the fixed validation ids.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # The embedding table and the NCE parameters are pinned to GPU 0.
    with tf.device('/gpu:0'):
        with tf.name_scope('embeddings'):
            # One row per word id, initialised uniformly in [-1, 1).
            initial_embeddings = tf.random_uniform(
                [vocabulary_size, embedding_size], -1.0, 1.0)
            embeddings = tf.Variable(initial_embeddings)
            # Rows of the table selected by the batch's centre-word ids.
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Output-side weights for the NCE loss.
        with tf.name_scope('weights'):
            initial_weights = tf.truncated_normal(
                [vocabulary_size, embedding_size],
                stddev=1.0 / math.sqrt(embedding_size))
            nce_weights = tf.Variable(initial_weights)

        # Output-side biases for the NCE loss, starting at zero.
        with tf.name_scope('biases'):
            initial_biases = tf.zeros([vocabulary_size])
            nce_biases = tf.Variable(initial_biases)

    # Mean NCE loss over the batch.
    with tf.name_scope('loss'):
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size)
        loss = tf.reduce_mean(per_example_loss)

    # Expose the loss to TensorBoard as a scalar summary.
    tf.summary.scalar('loss', loss)

    # Plain SGD with a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        sgd = tf.train.GradientDescentOptimizer(1.0)
        optimizer = sgd.minimize(loss)

    # Scale every embedding row to unit L2 norm.
    squared = tf.square(embeddings)
    row_energy = tf.reduce_sum(squared, 1, keepdims=True)
    norm = tf.sqrt(row_energy)
    normalized_embeddings = embeddings / norm

    # Normalised embeddings of the validation ids.
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    # Dot product of each validation embedding with every normalised
    # embedding; result has one row per validation id.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()
# Step 5: Begin training.
num_steps = 10000
# Optimisation steps run on each corpus file. The global `step` counter keeps
# increasing across files so summaries stay on a single time axis.
# NOTE(review): previously a single global bound of 3000 steps meant only the
# first file was ever trained on; later files were read and encoded, then
# skipped. generate_batch also maintains `has_next`, which could be used for
# "one pass per file" instead -- confirm which schedule is wanted.
steps_per_file = 3000
# Reporting cadences, in global steps.
loss_report_interval = 10000
similarity_report_interval = 50000

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
    try:
        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        step = 0
        for path in files_list:
            print("path is:", path)
            vocabulary = read_data(path)
            # Restart generate_batch's cursor for the new file.
            data_index = 0
            has_next = True
            data, count = build_dataset(
                vocabulary, vocabulary_size, dictionary)
            del vocabulary  # Hint to reduce memory.

            for _ in xrange(steps_per_file):
                step += 1
                # Generate one batch of training data.
                batch_inputs, batch_labels = generate_batch(
                    data, batch_size, num_skips, skip_window)
                feed_dict = {train_inputs: batch_inputs,
                             train_labels: batch_labels}

                # Define metadata variable.
                run_metadata = tf.RunMetadata()
                # One optimisation step; also fetch the merged summaries and
                # the batch loss.
                _, summary, loss_val = session.run(
                    [optimizer, merged, loss],
                    feed_dict=feed_dict,
                    run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Attach the run metadata once, at global step num_steps - 1.
                if step == (num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % loss_report_interval == 0:
                    # average_loss holds the sum over the steps since the last
                    # report, so divide by the interval, not the global step.
                    average_loss /= loss_report_interval
                    print('Average loss at step ', step, ': ', average_loss)
                    average_loss = 0

                # Note that this is expensive (~20% slowdown if computed every
                # 500 steps)
                if step % similarity_report_interval == 0:
                    # One row per validation word, one column per vocabulary
                    # word.
                    sim = similarity.eval()
                    for i in xrange(valid_size):
                        valid_word = reverse_dictionary[valid_examples[i]]
                        # Larger dot products of unit vectors mean more
                        # similar words, so sort on the negated scores. Entry
                        # 0 of the ordering is skipped and the next top_k ids
                        # are kept.
                        top_k = 8
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log_str = 'Nearest to %s:' % valid_word
                        # Iterate over what is actually available so that a
                        # vocabulary with fewer than top_k + 1 words does not
                        # index past the end of `nearest`.
                        for neighbour_id in nearest:
                            close_word = reverse_dictionary[neighbour_id]
                            log_str = '%s %s,' % (log_str, close_word)
                        print(log_str)
        final_embeddings = normalized_embeddings.eval()
    finally:
        # Flush pending summaries to disk and release the event file.
        writer.close()
# Step 6: Write the word vectors to a text file.
# The first line is "<vocabulary_size> <embedding_size>"; every following line
# is a word followed by its space-separated vector components.
with open('word2vec_karate.txt', "w", encoding="UTF-8") as fW2V:
    fW2V.write(str(vocabulary_size) + ' ' + str(embedding_size) + '\n')
    for i, row in enumerate(final_embeddings):
        sWord = reverse_dictionary[i]
        # Build the line in one join instead of growing a string
        # component by component.
        sVector = ''.join(' ' + str(value) for value in row)
        fW2V.write(sWord + sVector + '\n')
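# Source article title: "Implementing basic word2vec with TensorFlow".
# Scraped page footer: "Latest recommended article published on 2024-04-12 18:22:03".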