【tensorflow学习】ptb_reader源码解析

最新推荐文章于 2023-07-06 21:26:06 发布

adrianna_xy

最新推荐文章于 2023-07-06 21:26:06 发布

阅读量1.7k

点赞数 1

分类专栏： tensorflow学习 tensorflow学习 文章标签： 源码 tensorflow ptb-reader

本文链接：https://blog.csdn.net/u012223913/article/details/72829729

版权

tensorflow学习 同时被 2 个专栏收录

15 篇文章 0 订阅

订阅专栏

tensorflow学习

14 篇文章 4 订阅

订阅专栏

# -*- coding: utf-8 -*-
# @Author: adrianna
# @Date:   2017-05-31 10:05:12
# @Last Modified by:   adrianna
# @Last Modified time: 2017-06-01 11:29:26


"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import tensorflow as tf


def _read_words(filename):
    """Read ``filename`` and return its contents as a flat list of tokens.

    Every newline character is replaced by the literal text ``<eos>`` before
    the content is split on whitespace, so the result is a 1-D list of
    strings.

    Note that ``<eos>`` only becomes a token of its own when the newline is
    surrounded by whitespace in the file; otherwise it is fused with the
    neighbouring word.

    Args:
      filename: path of the text file to read.

    Returns:
      A list of whitespace-separated tokens.
    """
    with tf.gfile.GFile(filename, "r") as handle:
        text = handle.read()
    # Mark line ends first, then tokenize on any whitespace.
    return text.replace("\n", "<eos>").split()


def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens in ``filename``.

    Tokens are counted and then ranked by descending count; tokens with the
    same count are ordered by the token string itself. The rank (starting at
    0) becomes the token's id, so ids cover ``[0, number_of_distinct_tokens)``.

    Args:
      filename: path of the text file whose tokens define the vocabulary.

    Returns:
      A dict mapping each distinct token to its integer id. The dict is empty
      when the file contains no tokens.
    """
    # Counter maps each token to its number of occurrences.
    counter = collections.Counter(_read_words(filename))

    # Sort key (-count, word): most frequent first, ties broken by the word.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    # Enumerate the sorted pairs directly instead of unpacking
    # ``zip(*count_pairs)``: the unpacking form raises ValueError when there
    # are no tokens at all, whereas this yields an empty vocabulary.
    return {word: index for index, (word, _) in enumerate(count_pairs)}


def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of ``filename`` into a list of integer ids.

    Tokens that are not present in ``word_to_id`` are skipped, so the result
    can be shorter than the number of tokens in the file.

    Args:
      filename: path of the text file to convert.
      word_to_id: mapping from token to id, as returned by ``_build_vocab``.

    Returns:
      A list with the id of every known token, in file order.
    """
    ids = []
    for token in _read_words(filename):
        if token not in word_to_id:
            # Out-of-vocabulary tokens are dropped rather than mapped.
            continue
        ids.append(word_to_id[token])
    return ids


def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test splits as lists of word ids.

    Expects ``ptb.train.txt``, ``ptb.valid.txt`` and ``ptb.test.txt`` inside
    ``data_path``. The vocabulary is built from the training file only and
    then used to convert all three files to integer ids.

    The PTB dataset comes from Tomas Mikolov's webpage:

    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory containing the three PTB text
        files. The default of None is passed straight to ``os.path.join``.

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary) where the first
      three items are lists of word ids and ``vocabulary`` is the number of
      distinct words in the training file.
    """
    train_path, valid_path, test_path = (
        os.path.join(data_path, file_name)
        for file_name in ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt")
    )

    # Only the training split defines the vocabulary.
    vocab = _build_vocab(train_path)

    train_ids, valid_ids, test_ids = [
        _file_to_word_ids(path, vocab)
        for path in (train_path, valid_path, test_path)
    ]
    return train_ids, valid_ids, test_ids, len(vocab)


def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Build Tensors that step through the raw PTB data in fixed windows.

    ``raw_data`` is converted to an int32 Tensor, truncated to a multiple of
    ``batch_size`` and reshaped to ``[batch_size, batch_len]``. A window index
    is dequeued from ``tf.train.range_input_producer`` (unshuffled), and a
    ``num_steps``-wide slice of every row is taken at that window.

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.
      name: the name of this operation (optional).

    Returns:
      A pair of Tensors (x, y), each with static shape
      [batch_size, num_steps]. ``y`` covers the same rows as ``x`` with its
      column window moved one position to the right.

    Raises:
      tf.errors.InvalidArgumentError: if batch_size or num_steps are too high
        (the graph asserts that epoch_size is positive).
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        # Turn the Python list of ids into an int32 Tensor.
        raw_data = tf.convert_to_tensor(
            raw_data, name="raw_data", dtype=tf.int32)

        # Total number of elements, and how many of them each row receives.
        total_len = tf.size(raw_data)
        row_len = total_len // batch_size

        # Drop the tail that does not fill a complete row, then lay the ids
        # out as batch_size rows.
        usable = raw_data[0: batch_size * row_len]
        batched = tf.reshape(usable, [batch_size, row_len])

        # One column is held back because y looks one position past x.
        epoch_size = (row_len - 1) // num_steps
        positive_check = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([positive_check]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # Index of the current window, produced in order 0..epoch_size-1.
        step = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()

        # begin/end are the corner coordinates of the rectangle to cut out.
        x_begin = [0, step * num_steps]
        x_end = [batch_size, (step + 1) * num_steps]
        x = tf.strided_slice(batched, x_begin, x_end)
        x.set_shape([batch_size, num_steps])

        # Same rectangle, moved one column to the right.
        y_begin = [0, step * num_steps + 1]
        y_end = [batch_size, (step + 1) * num_steps + 1]
        y = tf.strided_slice(batched, y_begin, y_end)
        y.set_shape([batch_size, num_steps])

        return x, y