Word2Vec Skip-gram Explained

This post is compiled in part from other articles.

In Skip-gram, each word x is paired with the words in a small window around it, i.e. the two words to its left and the two words to its right; when the window runs past the start or end of the sentence, no padding is added.
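As a tiny made-up illustration (not from the original article), these are the (center word, context word) pairs a two-word window produces; note that the window is simply clipped at the edges of the sentence:

# Toy example: build (center, context) pairs with a window of 2 words on each side.
sentence = ["我", "喜欢", "看", "武侠", "小说"]
pairs = []
for i, center in enumerate(sentence):
    for j in range(max(0, i - 2), min(len(sentence), i + 3)):
        if j != i:
            pairs.append((center, sentence[j]))
print(pairs[:4])  # [('我', '喜欢'), ('我', '看'), ('喜欢', '我'), ('喜欢', '看')]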

From these (center word, context word) pairs, training can proceed. So what exactly does training learn? The answer is the vectors themselves: the goal is to pull the vectors of words with similar meanings closer together and push the vectors of unrelated words farther apart.

How is the training done?


The training process is the process of updating these encodings, and these encodings are exactly the vector assigned to each word.
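To make "updating the encodings" concrete, here is a minimal numpy sketch of one skip-gram negative-sampling step (my own illustration with made-up sizes, not the article's code; the TensorFlow script below achieves the same effect through tf.nn.nce_loss). The center word's vector is pulled toward the vector of a real context word and pushed away from that of a randomly sampled "noise" word:

import numpy as np

rng = np.random.RandomState(0)
V, D, lr = 1000, 100, 0.025          # vocabulary size, embedding dimension, learning rate (all illustrative)
W_in = rng.uniform(-1, 1, (V, D))    # input vectors: one row per word, these are the embeddings we keep
W_out = np.zeros((V, D))             # output vectors, used only during training


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sgns_step(center, context, negative):
    """One skip-gram negative-sampling update for a single (center, context) pair."""
    v = W_in[center]
    grad_v = np.zeros(D)
    for target, label in ((context, 1.0), (negative, 0.0)):  # 1 = real context word, 0 = noise word
        u = W_out[target]
        g = sigmoid(np.dot(v, u)) - label                    # gradient of the logistic loss
        grad_v += g * u
        W_out[target] = u - lr * g * v
    W_in[center] = v - lr * grad_v


sgns_step(center=3, context=17, negative=int(rng.randint(V)))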

Code walkthrough:

# -*- coding:utf-8 -*-
import matplotlib
matplotlib.use("WebAgg")
import numpy as np
import pandas as pd
import collections
import jieba
import re
import tensorflow as tf
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

"""https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html"""


def preprocess(text, freq=5):
    text = re.sub("[^\u4E00-\u9FA5 \n]", "", text)  # keep only Chinese characters, spaces and newlines
    words = text.split(" ")
    words = list(filter(None, words))  # drop empty tokens
    word_counts = collections.Counter(words)  # count the occurrences of each word
    trimmed_words = [word for word in words if word_counts[word] > freq]  # drop words that appear freq (5) times or fewer
    return trimmed_words


# with open(u'凡人修仙传.txt', encoding='utf-8') as f:
#     readlines = f.readlines()
# text_cut = []
# for text in readlines:
#     if len(text) > 3:
#         text_cut.append(" ".join(jieba.cut(text)).strip())
# text_cut = " ".join(text_cut)
# with open('text_cut.txt', 'w+', encoding='utf-8') as f:  # save the jieba segmentation result so we do not have to re-segment on every run
#     f.write(text_cut)
text_cut = open('text_cut.txt', 'r', encoding='utf-8').read()
words = preprocess(text_cut)

vocab = collections.Counter(words)
vocab2int = {w: c for c, w in enumerate(vocab)}
int2vocab = {c: w for c, w in enumerate(vocab)}
print('total words:%d' % len(words))
print('unique words: %d' % len(vocab))
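# Added note: vocab2int assigns every unique word an integer id and int2vocab is the reverse
# lookup; the network only ever sees these ids, never the word strings themselves.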

t = 1e-5
threshold = 0.8
total_count = len(words)
int_words = [vocab2int[w] for w in words]
word_freqs = {w: c / total_count for w, c in collections.Counter(int_words).items()}
print(max(word_freqs.values()))  # the most frequent word accounts for roughly 6% of the corpus
pro_drop = {w: 1 - np.sqrt(t / f) for w, f in word_freqs.items()}
train_words = [w for w in int_words if pro_drop[w] < threshold]
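# Added explanation (not in the original script): this follows the subsampling idea from
# Mikolov et al., where a word w with corpus frequency f(w) gets drop probability
# P(w) = 1 - sqrt(t / f(w)). The filtering here is deterministic: every occurrence of any
# word whose drop probability reaches the 0.8 threshold (i.e. f(w) > t / 0.04 = 2.5e-4)
# is removed from train_words.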


def get_targets(words, index, window_size=5):
    target_window = np.random.randint(1, window_size + 1)  # random window size: words closer to the input word are used more often
    start_point = index - target_window if (index - target_window) > 0 else 0
    end_point = index + target_window
    targets = set(words[start_point:index] + words[index + 1:end_point + 1])
    return list(targets)


def make_windows(words, window_size=5):
    """Build the training set: pair every centre word with each word inside its window."""
    x, y = [], []
    for i in tqdm(range(len(words))):
        start = i - window_size if (i - window_size) > 0 else 0
        end = i + window_size if (i + window_size) < len(words) - 1 else len(words) - 1
        while start <= end:
            if start == i:
                start += 1
                continue
            else:
                x.append(words[i])
                y.append(words[start])
                start += 1
    return x, y
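# Toy example (added for illustration): make_windows([10, 11, 12], window_size=1) returns
# x = [10, 11, 11, 12] and y = [11, 10, 12, 11], i.e. one (center, context) training pair
# for every neighbour inside the window.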


# def get_batches(words, batch_size, window_size=5):
#     n_batches = len(words) // batch_size
#     x, y = make_windows(words, window_size)
#     for i in tqdm(range(0, n_batches * batch_size, batch_size)):
#         yield x[i:i + batch_size], y[i:i + batch_size]


batch_size = 512
vocab_size = len(int2vocab)
embedding_size = 100
num_negative_samples = 64
# with tf.Graph().as_default():
inputs = tf.placeholder(tf.int32, shape=[batch_size], name='inputs')
outputs = tf.placeholder(tf.int32, shape=[batch_size, 1], name='outputs')
embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1., 1.))  # uniform initialisation: every value in [-1, 1] is equally likely
'''If inputs = [1, 3, 5], look up rows 1, 3 and 5 of the embedding matrix and return them as one matrix.'''
embeded = tf.nn.embedding_lookup(embedding, inputs)
# weights initialised from a truncated normal distribution
nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=1. / np.sqrt(embedding_size)))
nce_bias = tf.Variable(tf.zeros(vocab_size))
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_bias, labels=outputs,
                   inputs=embeded, num_sampled=num_negative_samples, num_classes=vocab_size))  # built-in NCE loss with negative sampling
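# Added note: tf.nn.nce_loss samples num_negative_samples noise words per batch and trains a
# binary classifier to separate the true (input, label) pairs from the sampled noise, so the
# full softmax over all vocab_size words never has to be computed.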
optimizer = tf.train.AdamOptimizer().minimize(loss)

norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), axis=1, keep_dims=True))
normalized_embeddings = embedding / norm  # normalise every word vector to unit length

valid = tf.placeholder(tf.int32, shape=[None], name='valid')
valid_embedding = tf.nn.embedding_lookup(normalized_embeddings, valid)  # look up the vectors of the validation words
'''Compute cosine similarity and use it to find the words closest to each validation word.'''
similarity = tf.matmul(valid_embedding, normalized_embeddings, transpose_b=True)
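# Added note: because every row of normalized_embeddings has unit length, this matrix product
# is exactly the cosine similarity between each validation word and every word in the
# vocabulary; larger values mean semantically closer words.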

sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
offset = 0
losses = []
valid_size = 11
# valid_examples = np.random.choice(100, valid_size, replace=False)
valid_examples = [vocab2int['韩某'], vocab2int['元婴'], vocab2int['宝物'], vocab2int['修士'],
                  vocab2int['前辈'], vocab2int['突然'], vocab2int['少女'], vocab2int['举动'],
                  vocab2int['不是'], vocab2int['前面'], vocab2int['野兽']]
# batches = get_batches(train_words, batch_size)
X_train, Y_train = make_windows(train_words)
input_x = np.squeeze(np.array(X_train))
output_y = np.expand_dims(np.squeeze(np.array(Y_train)), -1)
start = time.time()
num_steps = 1000000
for i in range(1, num_steps + 1):
    if offset + batch_size > input_x.shape[0]:
        offset = (offset + batch_size) % output_y.shape[0]
    X_batch = input_x[offset:offset + batch_size]
    Y_batch = output_y[offset:offset + batch_size, :]
    feed = {inputs: X_batch, outputs: Y_batch}
    _, _loss = sess.run([optimizer, loss], feed_dict=feed)
    losses.append(_loss)
    if i % 100 == 0:
        end = time.time()
        print("Epoch: %d/1000000" % i, "time: %ss" % str(end - start)[:5], "Average Loss: %.2f" % np.mean(losses))
        losses = []
    if i % 5000 == 0:
        sim = sess.run(similarity, feed_dict={valid: valid_examples})
        for v in range(valid_size):
            valid_word = int2vocab[valid_examples[v]]
            top_k = 8
            nearest = (-sim[v, :]).argsort()[1:top_k + 1]  # index 0 is the word itself, so skip it
            s = 'Nearest to [%s]:' % valid_word
            for k in range(top_k):
                s += ' ' + int2vocab[nearest[k]]
            print(s)
    offset += batch_size

save_path = saver.save(sess, "cftest28.ckpt")
embed_mat = sess.run(normalized_embeddings)

viz_words = 100
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
fig, ax = plt.subplots(figsize=(14, 14))
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use a Chinese-capable font so the word labels render correctly
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int2vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
plt.show()

Results:

Training results after 5,000 steps

Training results after 30,000 steps

Training results after 60,000 steps

t-SNE visualization after dimensionality reduction
