Implementing Word2Vec in TensorFlow

This code shows how to build a Word2Vec (skip-gram) model with TensorFlow. It first reads a stop-word list and cleans the text data, then builds the vocabulary. Next it defines and trains the network, using a stochastic gradient descent optimizer and a noise-contrastive estimation (NCE) loss. Finally it evaluates the model by printing the most similar words for a small validation set.
# coding: utf-8

# In[151]:


import collections
import math
import os
import os.path as path
import random  # used by generate_batch below
import re
import codecs
import pickle as pkl
from pprint import pprint

import jieba
import numpy as np
import tensorflow as tf
from pymongo import MongoClient


# In[3]:



# Step 1: read the stop-word list
stop_words = []
with open('stop_words.txt', encoding='utf-8') as f:
    for line in f:
        stop_words.append(line.strip())  # strip the trailing newline
stop_words = set(stop_words)
print('Finished loading stop words: {n} entries'.format(n=len(stop_words)))


# In[216]:


def Re(k):
    """Strip newlines, non-breaking spaces, carriage returns and tabs."""
    k = re.sub(r'\n', '', k)
    k = re.sub(r'\xa0', '', k)
    k = re.sub(r'\r', '', k)
    # k = re.sub(r'\s', '', k)  # would also remove the spaces used as token separators
    k = re.sub(r'\t', '', k)
    return k
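
A quick check of the cleaner on a made-up string (hypothetical input, not taken from the corpus):

# In[ ]:


print(Re('今天\t天气\xa0不错\r\n'))  # -> '今天天气不错'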


# In[217]:


text = codecs.open('standata.txt', encoding='utf-8')  # assuming the corpus is UTF-8
content = text.readlines()
text.close()
data = []
for coni in content:
    coni = re.sub(r'__label__.*', '', coni)  # drop the trailing label
    coni = Re(coni)
    for li in coni.split(' '):
        data.append(li)
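
The `__label__` suffix that gets stripped here suggests standata.txt is in fastText supervised format: each line is a list of space-separated tokens followed by a label such as `__label__1` (hypothetical example). Only the token part is kept for training the embeddings.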


# In[219]:


# Drop stop words from the token stream
raw_word_list = []
for datai in data:
    if datai not in stop_words:
        raw_word_list.append(datai)


# In[220]:


len(raw_word_list)


# In[221]:


word_count = collections.Counter(raw_word_list)

# Count the words that occur more than once; this is apparently where the
# hardcoded vocabulary_size below comes from
g = [i for i in word_count.values() if i == 1]
len(word_count) - len(g)
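
Rather than hardcoding 16266 below, the same number can be derived from the counter; a small sketch, assuming that expression is indeed where the value came from:

# In[ ]:


# Words seen more than once; singletons will be mapped to UNK
vocabulary_size = sum(1 for c in word_count.values() if c > 1)
print(vocabulary_size)  # expected to print 16266 for this corpus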


# In[222]:


vocabulary_size = 16266
def build_dataset(words):
    # 'UNK' takes index 0, so keep only the vocabulary_size - 1 most frequent
    # real words; everything else maps to UNK. (The original used
    # most_common(vocabulary_size), which makes the dictionary one entry larger
    # than the embedding matrix and can crash the embedding lookup.)
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary


# In[223]:


data,count,dictionary,reverse_dictionary=build_dataset(raw_word_list)
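
build_dataset returns the corpus encoded as word ids (data), the per-word frequencies (count), and the two mappings between words and ids. A few quick checks:

# In[ ]:


print(dictionary['UNK'])      # 0: UNK is inserted first, so it always gets id 0
print(reverse_dictionary[0])  # 'UNK'
print(len(dictionary))        # equals vocabulary_size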


# In[224]:


dictionary


# In[59]:


# Scratch cell: deque(maxlen=...) is the sliding window used in generate_batch
# below, and // is floor division
collections.deque(maxlen=3)
b = 8
n = 2
print(b / 2.5)   # 3.2
print(b // 2.5)  # 3.0


# In[108]:


data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Generate a skip-gram training batch of (center word -> context word) pairs."""
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [skip_window context] + center + [skip_window context]
    buffer = collections.deque(maxlen=span)  # sliding window over the corpus
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # the center word is never its own label
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # sample a context position from the window, without replacement
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])  # slide the window one word forward
        data_index = (data_index + 1) % len(data)
    return batch, labels
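
With skip_window=1 and num_skips=2, every center word yields one pair per neighbor: the window [w1, w2, w3] produces (w2 -> w1) and (w2 -> w3), and the deque then slides one word to the right.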
        



# In[96]:


# Scratch cell: manually step through the window-priming part of
# generate_batch to inspect the buffer
batch_size = 8
num_skips = 3
skip_window = 1
data_index = 0
# assert batch_size % num_skips == 0   # would fail here: 8 % 3 != 0
# assert num_skips <= 2 * skip_window  # would fail here: 3 > 2
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1
buffer = collections.deque(maxlen=span)
for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)


# In[225]:


batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


# In[138]:


batch_size = 128
embedding_size = 128    # dimension of the embedding vectors
skip_window = 1         # words considered on each side of the center word
num_skips = 2           # (center, context) pairs drawn per window
valid_size = 16         # random set of word ids to evaluate similarity on
valid_window = 100      # draw validation words from the most frequent ids
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64        # negative samples for the NCE loss


# In[226]:


vocabulary_size = 16266  # must match the dictionary built above


# In[227]:


graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # Input embeddings, initialized uniformly in [-1, 1)
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    # Output weights and biases for the NCE loss
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                  stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Noise-contrastive estimation: draw num_sampled negative classes per batch
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # L2-normalize the embeddings so the matmul below computes cosine similarity
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated
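
Because every row of normalized_embeddings has unit L2 norm, the final matmul computes the cosine similarity between each validation word and the whole vocabulary; the training loop below sorts these scores to print nearest neighbours.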
    


# In[228]:


num_steps = 100001
with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000  # average over the last 2000 steps
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
        if step % 10000 == 0:
            # Print the 8 nearest words for each validation word
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip the word itself
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
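
The notebook imports pickle as pkl but never uses it; a minimal sketch of persisting the trained vectors and the id/word mappings for later use (the filename embeddings.pkl is made up):

# In[ ]:


with open('embeddings.pkl', 'wb') as f:
    pkl.dump({'embeddings': final_embeddings,
              'dictionary': dictionary,
              'reverse_dictionary': reverse_dictionary}, f)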
        


# In[134]:


# Scratch cell: check the context-sampling logic used in generate_batch
for _ in range(6):
    print(random.randint(0, span - 1))
target = 1
target_to_avoid = [1]
while target in target_to_avoid:
    target = random.randint(0, span - 1)
print(target)          # a window position other than 1
print(target_to_avoid)

