# tf_w2v_sg_demo.py
# -*- coding: utf-8 -*-
import time
import numpy as np
import tensorflow as tf
import random
from collections import Counter
# 2. Load data
# Read the whole pre-tokenized corpus (space-separated tokens) into one string.
with open('data/Javasplittedwords', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
# 3. Preprocessing
# 3.1 Drop rare words: keep only tokens occurring more than 50 times.
# NOTE(review): split(' ') leaves newline characters attached to tokens if the
# corpus contains line breaks — confirm the data file is purely space-separated.
words = text.split(' ')
words_count = Counter(words)
words = [w for w in words if words_count[w] > 50]
# 3.2 Build word <-> index lookup tables.
# Enumerate the vocabulary in sorted order so the word->int mapping is
# deterministic across runs: iterating a raw set is subject to string hash
# randomization, which would silently invalidate any embeddings/checkpoints
# saved by a previous run.
vocab = set(words)
vocab_to_int = {w: i for i, w in enumerate(sorted(vocab))}
# Derive the inverse table from the forward one so the two can never disagree.
int_to_vocab = {i: w for w, i in vocab_to_int.items()}
print("total words: {}".format(len(words)))
print("unique words: {}".format(len(vocab)))
# 3.3 Encode the corpus as a list of integer word ids.
int_words = [vocab_to_int[w] for w in words]
# 4. Subsampling
# Downsample very frequent, stop-word-like tokens (e.g. "the", "of", "for").
# Removing them speeds up training and reduces noise in the learned vectors.
t = 1e-5          # subsampling threshold from the word2vec formulation
threshold = 0.9   # drop-probability cutoff (not used in this chunk; presumably applied later)
# Occurrence count of every word id in the encoded corpus.
int_word_counts = Counter(int_words)
total_count = len(int_words)
# Relative frequency of each word id.
word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
# Drop probability per word id: P(drop) = 1 - sqrt(t / freq).
prob_drop = {w: 1 - np.sqrt(t / freq) for w, freq in word_freqs.items()}
<