TensorFlow implementation of word2vec (code only)

# -*- coding: utf-8 -*-
"""
Created on Thu Jan  3 15:34:23 2019

@author: WX
"""

import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib.request
import tensorflow as tf
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

url = "http://mattmahoney.net/dc/"

# Download the dataset if it is not already present, then verify its size.
def maybe_download(filename,expected_bytes):
    if not os.path.exists(filename):
        filename,_ = urllib.request.urlretrieve(url+filename,filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print("Found and Verified",filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to Verify '+filename+" . Can you get to it with a browser?")
    return filename

filename = maybe_download('text8.zip',31344016)

# Read the data and decompress it
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data Size ',len(words))

vocabulary_size =50000
def build_dataset(words):
    count = [['UNK',-1]]
    #keep the 50,000 most frequent words
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    dictionary = dict()
    #store the counted words in a dictionary; the current dictionary length neatly serves as each word's id
    for word,_ in count:
        dictionary[word] = len(dictionary)
    #data stores the encoded text: every word is replaced by its index in the vocabulary
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    # position 0 always holds the count of unknown (UNK) words
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))
    return data,count,dictionary,reverse_dictionary
data,count,dictionary,reverse_dictionary = build_dataset(words)

del words
print('Most common words (+UNK) ',count[:5])
print('Sample data',data[:10]," ",[reverse_dictionary[i] for i in data[:10]])
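To make the encoding step easier to follow, here is a minimal, self-contained sketch of what build_dataset does, using a made-up toy corpus rather than the text8 data: the most frequent words get the smallest ids, and anything outside the vocabulary collapses to UNK (id 0).

# A minimal sketch of the encoding idea, on a hypothetical toy corpus.
import collections

toy_words = ['the','cat','sat','on','the','mat','the','cat']
toy_vocab_size = 4  # keep only UNK plus the 3 most common words
toy_count = [['UNK', -1]]
toy_count.extend(collections.Counter(toy_words).most_common(toy_vocab_size-1))
toy_dict = {word: i for i, (word, _) in enumerate(toy_count)}
toy_data = [toy_dict.get(w, 0) for w in toy_words]
print(toy_dict)   # e.g. {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3} (ties may order differently)
print(toy_data)   # the corpus rewritten as vocabulary ids; out-of-vocabulary words become 0 (UNK)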

# Next, generate the training samples for the word vectors
data_index = 0
#batch_size is the size of each batch, skip_window is the farthest distance a word can be
#associated with (set to 1), and num_skips is the number of samples generated per word
def generate_batch(batch_size,num_skips,skip_window):
    global data_index
    #batch_size must be an integer multiple of num_skips (so a batch always holds all samples of a given word)
    assert batch_size % num_skips == 0
    # num_skips cannot exceed twice skip_window, because the sliding window only offers that many context positions
    assert num_skips <= 2*skip_window
    batch = np.ndarray(shape=(batch_size),dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1),dtype=np.int32)
    #span is the number of words used when generating samples for one word,
    #i.e. the target word itself plus the words before and after it
    #(the farthest reach in both directions plus the word itself);
    #span also controls how many input words are consumed per sample-generation step
    span = 2*skip_window+1
    #create a double-ended queue with maximum length span;
    #buffer holds the target word and its context words
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
    n = batch_size//num_skips
    #walk through the text and generate all samples for this batch
    for i in range(n):
        # target holds the position of the current target word within the buffer
        target = skip_window
        # used to keep the target word separate from its context words
        targets_to_avoid = [skip_window]
        #generate the num_skips samples for this target word
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0,span-1)# random index within the window (0..span-1)
            # remember positions already used so they are not sampled again
            targets_to_avoid.append(target)
            # build the sample: target word (batch) -> context word (labels)
            batch[i*num_skips+j] = buffer[skip_window]
            labels[i*num_skips+j,0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
    return batch,labels

batch,labels = generate_batch(batch_size=8,num_skips=2,skip_window=1)
for i in range(8):
    print(batch[i],reverse_dictionary[batch[i]],'->',labels[i,0],reverse_dictionary[labels[i,0]])
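With skip_window=1 and num_skips=2, each center word contributes two (center, context) pairs, so a batch of 8 covers 4 consecutive center words. A small sanity-check sketch, reusing the batch just generated, makes this grouping explicit:

# Sanity check: every group of num_skips consecutive entries shares the same center word.
for i in range(0, 8, 2):
    assert batch[i] == batch[i+1], "entries from the same center word should match"
    print("center:", reverse_dictionary[batch[i]],
          "contexts:", [reverse_dictionary[labels[i+j,0]] for j in range(2)])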
    
batch_size = 128
# dimensionality of the dense word vectors
embedding_size = 128
skip_window = 1
num_skips = 2

# number of validation words to sample
valid_size = 16
# validation words are drawn only from the 100 most frequent words
valid_window = 100
# use np.random.choice to sample without replacement from the top 100
valid_examples = np.random.choice(valid_window,valid_size,replace = False)
# number of negative samples used during training
num_sampled = 64

graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32,shape=[batch_size])
    train_labels = tf.placeholder(tf.int32,shape=[batch_size,1])
    valid_dataset = tf.constant(valid_examples,dtype=tf.int32)
    
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
        embed = tf.nn.embedding_lookup(embeddings,train_inputs)
        
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1.0/math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs = embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keep_dims=True))
    normalized_embeddings = embeddings/norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
    similarity = tf.matmul(valid_embeddings,normalized_embeddings,transpose_b=True)
    init = tf.global_variables_initializer()
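Because the embedding rows are L2-normalized, the matmul above is simply the cosine similarity between each validation word and every word in the vocabulary. A quick NumPy sketch of the same computation (on hypothetical arrays, outside the graph):

# Cosine-similarity sketch with NumPy, mirroring what the graph computes.
# emb is a hypothetical (vocab, dim) embedding matrix; ids are hypothetical validation ids.
def cosine_similarity_rows(emb, ids):
    normed = emb / np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
    return normed[ids] @ normed.T   # shape: (len(ids), vocab)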
    
num_steps = 100001
    
with tf.Session(graph=graph) as session:
    init.run()
    print("initialized")
    
    average_loss = 0
    for step in range(num_steps):
        batch_inputs,batch_labels = generate_batch(batch_size,num_skips,skip_window)
        feed_dict = {train_inputs:batch_inputs,train_labels:batch_labels}
        _,loss_val = session.run([optimizer,loss],feed_dict=feed_dict)
        average_loss +=loss_val
        if step%2000 == 0:
            if step>0:
                average_loss /= 2000
            print("Average loss at step ",step,":",average_loss)
            average_loss = 0
        if step%10000==0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i,:]).argsort()[1:top_k+1]
                log_str = "Nearest to %s: " % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s" %(log_str,close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
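If you want to reuse the trained vectors later without re-running training, saving the final matrix and the id-to-word mapping is enough. A minimal sketch, where the file names are only examples:

# Minimal sketch for persisting the result; 'embeddings.npy' and 'vocab.txt' are example names.
np.save('embeddings.npy', final_embeddings)
with open('vocab.txt', 'w') as f:
    for i in range(vocabulary_size):
        f.write(reverse_dictionary[i] + '\n')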
    
def plot_with_labels(low_dim_embs,labels,filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels),"More labels than embeddings"
    plt.figure(figsize=(18,18))
    for i,label in enumerate(labels):
        x,y = low_dim_embs[i,:]
        plt.scatter(x,y)
        plt.annotate(label,xy=(x,y),
                   xytext=(5,2),
                   textcoords='offset points',
                   ha='right',
                   va='bottom')
    plt.savefig(filename)
    
tsne = TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
plot_only = 100
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)
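Beyond the t-SNE picture, the normalized embeddings can also be queried directly for the nearest neighbours of any word in the vocabulary. A small sketch, where the query word 'france' is only an example:

# Nearest-neighbour query sketch using the trained, normalized embeddings.
def nearest_words(query, k=8):
    idx = dictionary[query]                           # raises KeyError if the word is out of vocabulary
    sims = final_embeddings @ final_embeddings[idx]   # cosine similarity, since rows are unit length
    best = (-sims).argsort()[1:k+1]                   # skip position 0, which is the word itself
    return [reverse_dictionary[i] for i in best]

print(nearest_words('france'))  # 'france' is just an example query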

 
