Lesson 10: Implementing word2vec in TensorFlow

A simplified version that drops the t-SNE and vector-visualization clutter, so we can look directly at the word2vec training process.

Preprocessing the text

That is, convert each word into a numeric index, and filter out low-frequency words, replacing them uniformly with UKN (a toy example follows the code below).

import collections
import tensorflow as tf


def pre_process(input_file_path, vocabulary_size, output_file_path=None):
    """
    Pre-process the input data; the result can optionally be written to output_file_path.
    :param input_file_path: path of the input file
    :param vocabulary_size: number of words to keep (including the UKN token)
    :param output_file_path: path of the output file
    :return: (data, most_common_words, most_common_dict, reverse_most_common_dict)
    """

    # Maps each word to its index; key: word, value: index
    most_common_dict = dict()

    # The word sequence with every word replaced by its index,
    # i.e. the numeric equivalent of the original word sequence
    data = list()

    # The most frequent words, stored as (word, count) tuples
    most_common_words = list()

    with open(input_file_path) as input_file:
        for line in input_file:
            words = line.strip().split(' ')
            most_common_words = collections.Counter(words).most_common(vocabulary_size - 1)

            # Reserve index 0 for the unknown token; a list is used so its count can be updated later
            most_common_words.insert(0, ['UKN', 0])

            tf.logging.info(str(len(most_common_words)))
            tf.logging.info(str(most_common_words[:5]))
            tf.logging.info(str(most_common_words[4998:]))

            # Build the most_common_words dictionary, using the current dictionary length as the value:
            # each new word grows the dictionary by 1, so this effectively numbers the words.
            # An explicit auto-incrementing counter would work just as well.

            for common_word, _ in most_common_words:
                most_common_dict[common_word] = len(most_common_dict)

            # Convert each word to its index and count how many fall into UKN
            for word in words:
                if word in most_common_dict:
                    index = most_common_dict[word]
                else:
                    # This word is an UKN
                    index = 0
                    most_common_words[0][1] += 1
                data.append(index)

    # Invert most_common_dict so that key: index; value: word
    reverse_most_common_dict = dict(zip(most_common_dict.values(),
                                        most_common_dict.keys()))

    tf.logging.info('data: ' + str(data[:5]))
    tf.logging.info('most_common_words: ' + str(most_common_words[:5]))
    tf.logging.info('most_common_dict: ' + str(list(most_common_dict.items())[:5]))
    tf.logging.info('reverse_most_common_dict: ' + str(list(reverse_most_common_dict.items())[:5]))
    return data, most_common_words, most_common_dict, reverse_most_common_dict


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    pre_process('./input/text8.txt', 5000, './input/pre_process_result.txt')
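
As a quick check of the index mapping described above, here is a minimal sketch of what pre_process returns for a toy corpus (the file name is hypothetical, and the exact ids depend on word frequencies):

# A minimal sketch, assuming a hypothetical toy corpus file
with open('./input/toy.txt', 'w') as f:
    f.write('the cat sat on the mat the cat')

data, most_common_words, word2idx, idx2word = pre_process('./input/toy.txt', 3)

# 'the' and 'cat' are frequent enough to get their own ids; everything else collapses to UKN (id 0):
# data              -> [1, 2, 0, 0, 1, 0, 1, 2]
# most_common_words -> [['UKN', 3], ('the', 3), ('cat', 2)]
# word2idx          -> {'UKN': 0, 'the': 1, 'cat': 2}
# idx2word          -> {0: 'UKN', 1: 'the', 2: 'cat'}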

Generating the skip-gram training set

import collections
import random

import numpy as np


class Word2VecInput(object):
    """
    Produces skip-gram training batches
    """

    def __init__(self, batch_size, num_skips, skip_window, seq_data_index):
        """
        Initialization for Skip-Gram
        :param batch_size: batch size
        :param num_skips: number of samples generated per center word
        :param skip_window: how far the window reaches on each side of the center word
        :param seq_data_index: the input word sequence, with words already converted to indices
        """

        # Index used to walk through seq_data_index
        self._data_index = 0

        # Each word produces num_skips samples, so batch_size must hold
        # a whole number of words' worth of samples
        assert batch_size % num_skips == 0

        # skip_window is the reach from the center word to each side,
        # so num_skips can be at most 2 * skip_window
        assert num_skips <= 2 * skip_window

        self._batch_size = batch_size
        self._num_skips = num_skips
        self._skip_window = skip_window

        self._seq_data_index = seq_data_index

    def read_data(self):
        batch = np.ndarray(shape=[self._batch_size], dtype=np.int32)
        labels = np.ndarray(shape=[self._batch_size, 1], dtype=np.int32)

        # Total span of words each center word draws its skip-gram samples from
        span = 2 * self._skip_window + 1

        # A deque that holds the current window of words
        span_buffer = collections.deque(maxlen=span)

        for _ in range(span):
            # Fill the window with the next span words
            span_buffer.append(self._seq_data_index[self._data_index])
            # When _data_index runs past the end of the word sequence, wrap around
            # so that batches can be produced indefinitely
            self._data_index = (self._data_index + 1) % len(self._seq_data_index)

        # Produce one batch
        for i in range(self._batch_size // self._num_skips):

            target = self._skip_window  # the target is the center word, so its index equals skip_window

            # The center word itself must never be emitted as a context sample
            targets_to_avoid = {target}

            for j in range(self._num_skips):

                # Randomly pick a word that is neither the center word nor already sampled
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)

                targets_to_avoid.add(target)

                # The input of the sample is the center word, i.e. index _skip_window in the window
                batch[i * self._num_skips + j] = span_buffer[self._skip_window]
                # The label is the randomly chosen context word
                labels[i * self._num_skips + j][0] = span_buffer[target]

            # After finishing one center word, slide the window one word forward;
            # the deque's maxlen=span automatically drops the oldest word
            span_buffer.append(self._seq_data_index[self._data_index])
            self._data_index = (self._data_index + 1) % len(self._seq_data_index)

        return batch, labels
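
To see what the reader produces, here is a small sketch with toy values (the context word chosen within each window is random, so the labels shown are only one possible outcome):

# A minimal sketch: 8 samples per batch, 2 samples per center word, window of 1 on each side
toy_data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
reader = Word2VecInput(batch_size=8, num_skips=2, skip_window=1, seq_data_index=toy_data)

batch, labels = reader.read_data()

# batch repeats each center word num_skips times; labels holds one of its neighbours, e.g.
# batch  -> [2, 2, 3, 3, 4, 4, 5, 5]
# labels -> [[1], [3], [2], [4], [3], [5], [6], [4]]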

Building the word2vec model

NCE loss is used directly.

import logging
import math

import tensorflow as tf

import common


class Word2VecInference(object):

    def __init__(self, batch_size, embedding_size, num_sampled):
        """
        :param batch_size: batch size
        :param embedding_size: dimension of the word vectors
        :param num_sampled: number of negative samples
        """

        super(Word2VecInference, self).__init__()
        self._batch_size = batch_size
        self._embedding_size = embedding_size
        self._num_sampled = num_sampled

        self._loss = None
        self._train_input_placeholder = None
        self._train_labels_placeholder = None
        self._normalized_embeddings = None

    @property
    def train_input_placeholder(self):
        return self._train_input_placeholder

    @property
    def train_labels_placeholder(self):
        return self._train_labels_placeholder

    @property
    def loss(self):
        return self._loss

    def inference(self):

        train_input = tf.placeholder(tf.int32, shape=[self._batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[self._batch_size, 1])

        with tf.device('/cpu:0'):

            # Embedding matrix for the whole vocabulary; after initialization its shape is
            # (50000, 128), i.e. every word is represented by a 128-dimensional vector
            embedding = tf.Variable(
                tf.random_uniform([common.VOCABULARY_SIZE, self._embedding_size],
                                  -1.0, 1.0)
            )

            logging.info('embedding shape: ' + str(embedding.shape))

            # The training input has to be looked up in the embedding matrix.
            # train_input has shape (128,) (128 being the batch size); looking each index up
            # in embedding returns its own 128-dimensional vector, so the resulting embed
            # has shape (batch_size=128, dim=128).
            # This is also why the samples produced by the input pipeline have shape
            # (batch_size,) while the labels have shape (batch_size, 1): the samples still go
            # through the lookup train_input:(batch_size) => embed:(batch_size, 128),
            # after which the shapes of embed and the labels line up.
            # It also means that getting the vector of any single word is just an
            # embedding_lookup call away.
            embed = tf.nn.embedding_lookup(embedding, train_input)

            logging.info('embed shape: ' + str(embed.shape))
            # nce_weights holds one weight per embedding dimension for every word in the vocabulary
            nce_weights = tf.Variable(
                tf.truncated_normal([common.VOCABULARY_SIZE, self._embedding_size],
                                    stddev=1.0/math.sqrt(self._embedding_size))
            )

            # One bias per word in the vocabulary
            nce_bias = tf.Variable(tf.zeros([common.VOCABULARY_SIZE]))

            nce_loss = tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_bias,
                labels=train_labels,
                inputs=embed,
                num_sampled=self._num_sampled,
                num_classes=common.VOCABULARY_SIZE
            )

            loss = tf.reduce_mean(nce_loss)
            self._loss = loss
            self._train_input_placeholder = train_input
            self._train_labels_placeholder = train_labels

            # The embeddings actually used downstream are L2-normalized
            embedding_square = tf.square(embedding)
            embedding_square_sum = tf.reduce_sum(embedding_square, 1, keep_dims=True)
            norm = tf.sqrt(embedding_square_sum)
            self._normalized_embeddings = embedding / norm

            logging.info('embedding_square shape: ' + str(embedding_square.shape))
            logging.info('embedding_square_sum shape: ' + str(embedding_square_sum.shape))
            logging.info('norm shape: ' + str(norm.shape))
            return loss
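
The shape bookkeeping in the comments above is easy to verify in isolation; the following small sketch (with made-up toy sizes, independent of the model class) shows the lookup turning a vector of indices into a matrix of embeddings:

# A minimal sketch of the lookup described above, using assumed toy sizes (TF 1.x style)
import numpy as np
import tensorflow as tf

vocab_size, emb_dim = 10, 4
emb = tf.constant(np.arange(vocab_size * emb_dim, dtype=np.float32).reshape(vocab_size, emb_dim))
ids = tf.constant([2, 7, 2], dtype=tf.int32)  # shape (batch_size,)
vecs = tf.nn.embedding_lookup(emb, ids)       # shape (batch_size, emb_dim)

with tf.Session() as sess:
    print(sess.run(vecs).shape)               # (3, 4): one 4-dimensional vector per input index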

Training

class Word2VecTrain(ITrain):

    def train(self):
        seq_data_index, most_common_words, most_common_dict, reverse_most_common_dict = \
            pre_process.pre_process('./input/text8.txt', common.VOCABULARY_SIZE)

        input_data = Word2VecInput(common.BATCH_SIZE,
                                   common.num_skips,
                                   common.skip_window,
                                   seq_data_index)

        inference = Word2VecInference(common.BATCH_SIZE,
                                      common.EMBEDDING_SIZE,
                                      common.NUM_SAMPLED)
        graph = tf.Graph()

        with graph.as_default():
            loss = inference.inference()

            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        with tf.Session(graph=graph) as session:
            session.run(tf.global_variables_initializer())
            session.run(tf.local_variables_initializer())

            total_loss = 0

            for step in range(common.num_steps):
                batch_inputs, batch_labels = input_data.read_data()

                _, loss_val = session.run([optimizer, loss],
                                          feed_dict={inference.train_input_placeholder: batch_inputs,
                                                     inference.train_labels_placeholder: batch_labels})

                total_loss += loss_val

                # Skip step 0 to avoid dividing by zero
                if step > 0 and step % 1000 == 0:
                    tf.logging.info('step: ' + str(step) + ', average loss: ' + str(total_loss / step))
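
The constants come from a shared common module that is not shown here. A plausible sketch, with assumed values (the 50000 and 128 figures appear in the comments above; the rest are typical choices), might look like this:

# common.py -- hypothetical sketch of the shared constants; the exact values are assumptions
VOCABULARY_SIZE = 50000   # vocabulary size (matches the (50000, 128) shape mentioned above)
BATCH_SIZE = 128          # samples per training batch
EMBEDDING_SIZE = 128      # dimension of each word vector
NUM_SAMPLED = 64          # number of negative samples for the NCE loss
num_skips = 2             # samples generated per center word
skip_window = 1           # context words considered on each side of the center word
num_steps = 100001        # total number of training steps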