Implementing basic word2vec with TensorFlow

"""Basic word2vec implementation through tensorflow"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from glob import glob
import collections
import math
import os
import sys
import time
import argparse
import random

import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf


current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'output'),
    help='The log directory for TensorBoard summaries.')
FLAGS, unparsed = parser.parse_known_args()

# Create the directory for TensorBoard variables if it does not exist.
if not os.path.exists(FLAGS.log_dir):
  os.makedirs(FLAGS.log_dir)


# Glob pattern for the input corpus files.
files_list = glob("*************")


def read_data(path):
  """Read a whitespace-tokenized text file into a list of words."""
  vocabulary = []
  with open(path, 'r') as f:
    for line in f:
      vocabulary += line.split()
  return vocabulary

# Step 1: build the dictionary (word -> id) and reverse_dictionary (id -> word).
vocab_uni = []
dictionary = {}
dictionary['UNK'] = 0
for file_path in files_list:
  vocabulary = read_data(file_path)
  for word in vocabulary:
    if word not in dictionary:
      dictionary[word] = len(dictionary)

  vocab_uni += vocabulary
  del vocabulary
  vocab_uni = list(set(vocab_uni))  # Keep only unique words.
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

print("size of vocab is:", len(vocab_uni))

# Step 2: set the vocabulary size (all unique words plus the UNK token).
vocabulary_size = len(vocab_uni) + 1

# Build the dataset for one file: words is that file's list of words,
# n_words is the intended dictionary size (kept for API compatibility, unused here).
def build_dataset(words, n_words, dictionary):
  """Process raw inputs into a list of word ids plus the UNK count."""
  # Out-of-dictionary words map to UNK; its count starts at -1 as a placeholder.
  count = [['UNK', -1]]
  data = list()
  unk_count = 0
  for word in words:
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  # Record the number of UNK tokens.
  count[0][1] = unk_count

  return data, count
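

# Optional toy check (hypothetical words, not from the real corpus): build_dataset
# maps each word to its id and counts out-of-dictionary words as UNK.
toy_dictionary = {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
toy_data, toy_count = build_dataset(['the', 'cat', 'sat', 'purred'], 4, toy_dictionary)
print(toy_data, toy_count)  # Expected: [1, 2, 3, 0] [['UNK', 1]]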



# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(data, batch_size, num_skips, skip_window):
  global data_index
  global has_next
  assert batch_size % num_skips == 0
  # Forward-only window: at most skip_window context words per center word.
  assert num_skips <= skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = skip_window + 1
  # A double-ended queue acts as a sliding buffer of size span:
  # buffer[0] is the current word, the remaining entries are its forward context.
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    # If the index would run past the end of the data, restart from the head.
    has_next = False
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span  # Advance the index past the words just buffered.
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != 0]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[0]  # The current (center) word.
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
      has_next = False
    elif data_index > len(data):
      has_next = False
    else:
      buffer.append(data[data_index])
      data_index += 1  # Move the current word's index forward by one.
  # Backtrack a little to avoid skipping words at the end of a batch, and to keep
  # the next call's initial span-sized read inside the data.
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
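
# Optional sanity check (a minimal sketch with made-up toy ids, not part of the
# original run): exercise generate_batch once and inspect the (center, context)
# pairs it produces from the forward window.
data_index = 0
has_next = True
toy_seq = list(range(20))
toy_batch, toy_labels = generate_batch(toy_seq, batch_size=8, num_skips=2,
                                       skip_window=2)
print("toy batch:", toy_batch)            # Center word ids, each repeated num_skips times.
print("toy labels:", toy_labels.ravel())  # Word ids drawn from the words that follow each center.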


batch_size = 1024
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 5  # Size of the forward context window.
num_skips = 5  # How many times to reuse an input to generate a label.
num_sampled = 10  # Number of negative examples to sample.

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 20  # Only pick validation samples among the first valid_window ids (originally 100).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
print("valid examples", valid_examples)

# Step 4: Build the skip-gram model graph.
graph = tf.Graph()

with graph.as_default():

  # Input data.
  with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables placed on the GPU.
  with tf.device('/gpu:0'):
    # Look up embeddings for inputs.
    with tf.name_scope('embeddings'):
      embeddings = tf.Variable(
          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
      embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss.
    with tf.name_scope('weights'):
      # Initialize the NCE weights.
      nce_weights = tf.Variable(
          tf.truncated_normal(
              [vocabulary_size, embedding_size],
              stddev=1.0 / math.sqrt(embedding_size)))
    # Initialize the NCE biases.
    with tf.name_scope('biases'):
      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.

  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))
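    # tf.nn.nce_loss samples num_sampled negative classes per batch and trains the
    # model to separate the true context word from those negatives, which
    # approximates a full softmax over all vocabulary_size classes.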

  # Add the loss value as a scalar to summary.
  tf.summary.scalar('loss', loss)

  # Construct the SGD optimizer using a learning rate of 1.0.
  with tf.name_scope('optimizer'):
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Normalize the embeddings to unit length.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
  normalized_embeddings = embeddings / norm
  # Look up the normalized embeddings of the validation-set ids.
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  # Cosine similarity between the validation words and every word in the vocabulary.
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Merge all summaries.
  merged = tf.summary.merge_all()

  # Add variable initializer.
  init = tf.global_variables_initializer()

  # Create a saver.
  saver = tf.train.Saver()

# Step 5: Begin training.
num_steps = 3000
with tf.Session(graph=graph) as session:
  # Open a writer to write summaries.
  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')
  average_loss = 0
  step = 0
  for path in files_list:
    print("path is:", path)
    vocabulary = read_data(path)
    data_index = 0
    has_next = True
    data, count = build_dataset(
        vocabulary, vocabulary_size, dictionary)
    del vocabulary  # Hint to reduce memory.
    # Note: step is cumulative across files, so at most num_steps batches run in total.
    while step < num_steps:
      step += 1
      # Generate one batch of training data.
      batch_inputs, batch_labels = generate_batch(data, batch_size, num_skips,
                                                  skip_window)

      feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

      # Define metadata variable.
      run_metadata = tf.RunMetadata()

      _, summary, loss_val = session.run(
          [optimizer, merged, loss],
          feed_dict=feed_dict,
          run_metadata=run_metadata)
      average_loss += loss_val

      # Add returned summaries to writer in each step.
      writer.add_summary(summary, step)
      # Add metadata to visualize the graph for the last run.
      if step == (num_steps - 1):
        writer.add_run_metadata(run_metadata, 'step%d' % step)
      # Report the average loss every 2000 steps.
      if step % 2000 == 0:
        if step > 0:
          average_loss /= 2000
        # The average loss is an estimate of the loss over the last 2000 batches.
        print('Average loss at step ', step, ': ', average_loss)
        average_loss = 0

      # Note that this is expensive (~20% slowdown if computed every 500 steps).
      if step % 2000 == 0:
        # Periodically evaluate the similarity between the validation words and
        # the whole embedding matrix; sim[i, :] is the similarity of validation
        # word i to every word in the dictionary.
        sim = similarity.eval()
        for i in xrange(valid_size):
          valid_word = reverse_dictionary[valid_examples[i]]
          # For unit-length vectors a larger dot product means higher cosine
          # similarity; take the 8 nearest words (skipping the word itself).
          top_k = 8
          nearest = (-sim[i, :]).argsort()[1:top_k + 1]
          log_str = 'Nearest to %s:' % valid_word
          for k in xrange(top_k):
            # Map each id back to its word.
            close_word = reverse_dictionary[nearest[k]]
            log_str = '%s %s,' % (log_str, close_word)
          print(log_str)
    final_embeddings = normalized_embeddings.eval()


# Step 6: Write the word vectors to a file in word2vec text format.
with open('word2vec_karate.txt', "w", encoding="UTF-8") as fW2V:
  fW2V.write(str(vocabulary_size) + ' ' + str(embedding_size) + '\n')
  for i in xrange(final_embeddings.shape[0]):
    sWord = reverse_dictionary[i]
    sVector = ''
    for j in xrange(final_embeddings.shape[1]):
      sVector = sVector + ' ' + str(final_embeddings[i, j])
    fW2V.write(sWord + sVector + '\n')
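
# Optional check (a minimal sketch, assuming the gensim package is installed):
# the file above uses the standard word2vec text format, so it can be loaded
# back with gensim and queried for nearest neighbours.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('word2vec_karate.txt', binary=False)
probe_word = reverse_dictionary[1]  # Any word from the vocabulary.
print(probe_word, vectors.most_similar(probe_word, topn=5))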