# -*- coding: utf-8 -*-
# @Author: adrianna
# @Date: 2017-05-31 10:05:12
# @Last Modified by: adrianna
# @Last Modified time: 2017-06-01 11:29:26
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
def _read_words(filename):
with tf.gfile.GFile(filename, "r") as f:
return f.read().replace("\n", "<eos>").split() #返回1-d list
def _build_vocab(filename):
data = _read_words(filename)
counter = collections.Counter(data) #返回<word,count> dict
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) # 按照count降序排序,返回<word,count> list
words, _ = list(zip(*count_pairs)) #zip返回两个tuple:一个是word的tuple,另一个是count的tuple
word_to_id = dict(zip(words, range(len(words)))) #按照count降序顺序将words编号[0,type_of_words),返回<word,id> dict
return word_to_id
def _file_to_word_ids(filename, word_to_id):
data = _read_words(filename)
return [word_to_id[word] for word in data if word in word_to_id]
def ptb_raw_data(data_path=None):
"""Load PTB raw data from data directory "data_path".
Reads PTB text files, converts strings to integer ids,
and performs mini-batching of the inputs.
The PTB dataset comes from Tomas Mikolov's webpage:
http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Args:
data_path: string path to the directory where simple-examples.tgz has
been extracted.
Returns:
tuple (train_data, valid_data, test_data, vocabulary)
where each of the data objects can be passed to PTBIterator.
"""
train_path = os.path.join(data_path, "ptb.train.txt")
valid_path = os.path.join(data_path, "ptb.valid.txt")
test_path = os.path.join(data_path, "ptb.test.txt")
word_to_id = _build_vocab(train_path)
train_data = _file_to_word_ids(train_path, word_to_id)
valid_data = _file_to_word_ids(valid_path, word_to_id)
test_data = _file_to_word_ids(test_path, word_to_id)
vocabulary = len(word_to_id)
return train_data, valid_data, test_data, vocabulary
def ptb_producer(raw_data, batch_size, num_steps, name=None):
"""Iterate on the raw PTB data.
This chunks up raw_data into batches of examples and returns Tensors that
are drawn from these batches.
Args:
raw_data: one of the raw data outputs from ptb_raw_data.
batch_size: int, the batch size.
num_steps: int, the number of unrolls.
name: the name of this operation (optional).
Returns:
A pair of Tensors, each shaped [batch_size, num_steps]. The second element
of the tuple is the same data time-shifted to the right by one.
Raises:
tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
"""
with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
raw_data = tf.convert_to_tensor(
raw_data, name="raw_data", dtype=tf.int32) #converts Python objects of various types to `Tensor`objects.
data_len = tf.size(raw_data) #num_of_elements
batch_len = data_len // batch_size
data = tf.reshape(raw_data[0: batch_size * batch_len],
[batch_size, batch_len]) # reshape data to batch size
epoch_size = (batch_len - 1) // num_steps
assertion = tf.assert_positive(
epoch_size,
message="epoch_size == 0, decrease batch_size or num_steps")
with tf.control_dependencies([assertion]):
epoch_size = tf.identity(epoch_size, name="epoch_size")
i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
x = tf.strided_slice(data, [0, i * num_steps],
[batch_size, (i + 1) * num_steps]) #begin,end坐标,取对角线内的data
x.set_shape([batch_size, num_steps])
y = tf.strided_slice(data, [0, i * num_steps + 1],
[batch_size, (i + 1) * num_steps + 1])
y.set_shape([batch_size, num_steps])
return x, y
【tensorflow学习】ptb_reader源码解析
最新推荐文章于 2023-07-06 21:26:06 发布