Word2Vec Skip-gram Explained

This post is compiled in part from other articles.

In Skip-gram, each word x is paired with the words in a small window around it, i.e. the two words to its left and the two words to its right; when the window runs past the start or end of the sentence, no padding is added.
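As a tiny made-up illustration (not from the original article), these are the (center word, context word) pairs a two-word window produces; note that the window is simply clipped at the edges of the sentence:

# Toy example: build (center, context) pairs with a window of 2 words on each side.
sentence = ["我", "喜欢", "看", "武侠", "小说"]
pairs = []
for i, center in enumerate(sentence):
    for j in range(max(0, i - 2), min(len(sentence), i + 3)):
        if j != i:
            pairs.append((center, sentence[j]))
print(pairs[:4])  # [('我', '喜欢'), ('我', '看'), ('喜欢', '我'), ('喜欢', '看')]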

From these (center word, context word) pairs, training can proceed. So what exactly does training learn? The answer is the vectors themselves: the goal is to pull the vectors of words with similar meanings closer together and push the vectors of unrelated words farther apart.

How is the training done?


The training process is the process of updating these encodings, and these encodings are exactly the vector assigned to each word.
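To make "updating the encodings" concrete, here is a minimal numpy sketch of one skip-gram negative-sampling step (my own illustration with made-up sizes, not the article's code; the TensorFlow script below achieves the same effect through tf.nn.nce_loss). The center word's vector is pulled toward the vector of a real context word and pushed away from that of a randomly sampled "noise" word:

import numpy as np

rng = np.random.RandomState(0)
V, D, lr = 1000, 100, 0.025          # vocabulary size, embedding dimension, learning rate (all illustrative)
W_in = rng.uniform(-1, 1, (V, D))    # input vectors: one row per word, these are the embeddings we keep
W_out = np.zeros((V, D))             # output vectors, used only during training


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sgns_step(center, context, negative):
    """One skip-gram negative-sampling update for a single (center, context) pair."""
    v = W_in[center]
    grad_v = np.zeros(D)
    for target, label in ((context, 1.0), (negative, 0.0)):  # 1 = real context word, 0 = noise word
        u = W_out[target]
        g = sigmoid(np.dot(v, u)) - label                    # gradient of the logistic loss
        grad_v += g * u
        W_out[target] = u - lr * g * v
    W_in[center] = v - lr * grad_v


sgns_step(center=3, context=17, negative=int(rng.randint(V)))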

Code walkthrough:

# -*- coding:utf-8 -*-
import matplotlib
matplotlib.use("WebAgg")
import numpy as np
import pandas as pd
import collections
import jieba
import re
import tensorflow as tf
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

"""https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html"""


def preprocess(text, freq=5):
    text = re.sub("[^\u4E00-\u9FA5 \n]", "", text)  # keep only Chinese characters, spaces and newlines
    words = text.split(" ")
    words = list(filter(None, words))  # drop empty tokens
    word_counts = collections.Counter(words)  # count the occurrences of each word
    trimmed_words = [word for word in words if word_counts[word] > freq]  # drop words that appear freq (5) times or fewer
    return trimmed_words


# with open(u'凡人修仙传.txt', encoding='utf-8') as f:
#     readlines = f.readlines()
# text_cut = []
# for text in readlines:
#     if len(text) > 3:
#         text_cut.append(" ".join(jieba.cut(text)).strip())
# text_cut = " ".join(text_cut)
# with open('text_cut.txt', 'w+', encoding='utf-8') as f:  # save the jieba segmentation result so we do not have to re-segment on every run
#     f.write(text_cut)
text_cut = open('text_cut.txt', 'r', encoding='utf-8').read()
words = preprocess(text_cut)

vocab = collections.Counter(words)
vocab2int = {w: c for c, w in enumerate(vocab)}
int2vocab = {c: w for c, w in enumerate(vocab)}
print('total words:%d' % len(words))
print('unique words: %d' % len(vocab))
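# Added note: vocab2int assigns every unique word an integer id and int2vocab is the reverse
# lookup; the network only ever sees these ids, never the word strings themselves.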

t = 1e-5
threshold = 0.8
total_count = len(words)
int_words = [vocab2int[w] for w in words]
word_freqs = {w: c / total_count for w, c in collections.Counter(int_words).items()}
print(max(word_freqs.values()))  # the most frequent word accounts for roughly 6% of the corpus
pro_drop = {w: 1 - np.sqrt(t / f) for w, f in word_freqs.items()}
train_words = [w for w in int_words if pro_drop[w] < threshold]
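# Added explanation (not in the original script): this follows the subsampling idea from
# Mikolov et al., where a word w with corpus frequency f(w) gets drop probability
# P(w) = 1 - sqrt(t / f(w)). The filtering here is deterministic: every occurrence of any
# word whose drop probability reaches the 0.8 threshold (i.e. f(w) > t / 0.04 = 2.5e-4)
# is removed from train_words.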


def get_targets(words, index, window_size=5):
    target_window = np.random.randint(1, window_size + 1)  # random window size: words closer to the input word are used more often
    start_point = index - target_window if (index - target_window) > 0 else 0
    end_point = index + target_window
    targets = set(words[start_point:index] + words[index + 1:end_point + 1])
    return list(targets)


def make_windows(words, window_size=5):
    """Build the training set: pair every centre word with each word inside its window."""
    x, y = [], []
    for i in tqdm(range(len(words))):
        start = i - window_size if (i - window_size) > 0 else 0
        end = i + window_size if (i + window_size) < len(words) - 1 else len(words) - 1
        while start <= end:
            if start == i:
                start += 1
                continue
            else:
                x.append(words[i])
                y.append(words[start])
                start += 1
    return x, y
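# Toy example (added for illustration): make_windows([10, 11, 12], window_size=1) returns
# x = [10, 11, 11, 12] and y = [11, 10, 12, 11], i.e. one (center, context) training pair
# for every neighbour inside the window.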


# def get_batches(words, batch_size, window_size=5):
#     n_batches = len(words) // batch_size
#     x, y = make_windows(words, window_size)
#     for i in tqdm(range(0, n_batches * batch_size, batch_size)):
#         yield x[i:i + batch_size], y[i:i + batch_size]


batch_size = 512
vocab_size = len(int2vocab)
embedding_size = 100
num_negative_samples = 64
# with tf.Graph().as_default():
inputs = tf.placeholder(tf.int32, shape=[batch_size], name='inputs')
outputs = tf.placeholder(tf.int32, shape=[batch_size, 1], name='outputs')
embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1., 1.))  # uniform initialisation: every value in [-1, 1] is equally likely
'''If inputs = [1, 3, 5], look up rows 1, 3 and 5 of the embedding matrix and return them as one matrix.'''
embeded = tf.nn.embedding_lookup(embedding, inputs)
# weights initialised from a truncated normal distribution
nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=1. / np.sqrt(embedding_size)))
nce_bias = tf.Variable(tf.zeros(vocab_size))
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_bias, labels=outputs,
                   inputs=embeded, num_sampled=num_negative_samples, num_classes=vocab_size))  # built-in NCE loss with negative sampling
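# Added note: tf.nn.nce_loss samples num_negative_samples noise words per batch and trains a
# binary classifier to separate the true (input, label) pairs from the sampled noise, so the
# full softmax over all vocab_size words never has to be computed.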
optimizer = tf.train.AdamOptimizer().minimize(loss)

norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), axis=1, keep_dims=True))
normalized_embeddings = embedding / norm  # normalise every word vector to unit length

valid = tf.placeholder(tf.int32, shape=[None], name='valid')
valid_embedding = tf.nn.embedding_lookup(normalized_embeddings, valid)  # look up the vectors of the validation words
'''Compute cosine similarity and use it to find the words closest to each validation word.'''
similarity = tf.matmul(valid_embedding, normalized_embeddings, transpose_b=True)
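# Added note: because every row of normalized_embeddings has unit length, this matrix product
# is exactly the cosine similarity between each validation word and every word in the
# vocabulary; larger values mean semantically closer words.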

sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
offset = 0
losses = []
valid_size = 11
# valid_examples = np.random.choice(100, valid_size, replace=False)
valid_examples = [vocab2int['韩某'], vocab2int['元婴'], vocab2int['宝物'], vocab2int['修士'],
                  vocab2int['前辈'], vocab2int['突然'], vocab2int['少女'], vocab2int['举动'],
                  vocab2int['不是'], vocab2int['前面'], vocab2int['野兽']]
# batches = get_batches(train_words, batch_size)
X_train, Y_train = make_windows(train_words)
input_x = np.squeeze(np.array(X_train))
output_y = np.expand_dims(np.squeeze(np.array(Y_train)), -1)
start = time.time()
num_steps = 1000000
for i in range(1, num_steps + 1):
    if offset + batch_size > input_x.shape[0]:
        offset = (offset + batch_size) % output_y.shape[0]
    X_batch = input_x[offset:offset + batch_size]
    Y_batch = output_y[offset:offset + batch_size, :]
    feed = {inputs: X_batch, outputs: Y_batch}
    _, _loss = sess.run([optimizer, loss], feed_dict=feed)
    losses.append(_loss)
    if i % 100 == 0:
        end = time.time()
        print("Epoch: %d/1000000" % i, "time: %ss" % str(end - start)[:5], "Average Loss: %.2f" % np.mean(losses))
        losses = []
    if i % 5000 == 0:
        sim = sess.run(similarity, feed_dict={valid: valid_examples})
        for v in range(valid_size):
            valid_word = int2vocab[valid_examples[v]]
            top_k = 8
            nearest = (-sim[v, :]).argsort()[1:top_k + 1]  # index 0 is the word itself, so skip it
            s = 'Nearest to [%s]:' % valid_word
            for k in range(top_k):
                s += ' ' + int2vocab[nearest[k]]
            print(s)
    offset += batch_size

save_path = saver.save(sess, "cftest28.ckpt")
embed_mat = sess.run(normalized_embeddings)

viz_words = 100
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
fig, ax = plt.subplots(figsize=(14, 14))
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use a Chinese-capable font so the word labels render correctly
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int2vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
plt.show()

Results:

Training results after 5,000 steps

Training results after 30,000 steps

Training results after 60,000 steps

t-SNE visualization after dimensionality reduction
