【TensorFlow】TensorFlow实战Google深度学习框架第2版参考代码（09-自然语言处理）_郑泽宇、顾思宇《tensorflow实战google深度学习框架》第2版下载-CSDN博客

本文链接：https://blog.csdn.net/hero_myself/article/details/109779637

本文整理了《TensorFlow实战Google深度学习框架第2版》教材中的样例代码。由于TensorFlow版本差异、运行过程中的修改、录入代码时的失误以及教材样例自带的错误等原因，这些代码可能与教材不一致，也可能无法成功运行，仅供参考。

第9章自然语言处理

9.2 按照词频顺序为每个词汇分配一个编号，然后将词汇表保存到一个独立的vocab文件中

#!/usr/bin/python
# -*- coding: utf-8 -*-

import codecs
import collections
from operator import itemgetter

RAW_DATA = "/path/to/ptb.train.txt"  # training-set text whose words are counted
VOCAB_OUTPUT = "/path/to/ptb.vocab"  # vocabulary file written by this script

# Tally how often each whitespace-separated token occurs in the corpus.
counter = collections.Counter()
with codecs.open(RAW_DATA, "r", "utf-8") as f:
    for line in f:
        counter.update(line.strip().split())

# Most frequent words first; words with equal counts keep first-seen order.
sorted_word_to_cnt = counter.most_common()

# The end-of-sentence marker "<eos>" is inserted at every line break later on,
# so it is placed at the head of the vocabulary here.
sorted_words = ["<eos>"] + [word for word, _count in sorted_word_to_cnt]

# For the machine-translation data of section 9.3.2, "<unk>" and the
# start-of-sentence marker "<sos>" are needed in addition to "<eos>", and
# low-frequency words are dropped from the vocabulary:
# sorted_words = ["<unk>", "<sos>", "<eos>"] + sorted_words
# if len(sorted_words) > 10000:
#     sorted_words = sorted_words[:10000]

# Write one word per line (and echo it), so the line index doubles as word id.
with codecs.open(VOCAB_OUTPUT, "w", "utf-8") as file_output:
    for word in sorted_words:
        file_output.write(word + "\n")
        print(word)

9.2 将训练文件、测试文件等都根据词汇文件转化为单词编号，每个单词的编号就是它在词汇文件中的行号

#!/usr/bin/python
# -*- coding: utf-8 -*-

import codecs

RAW_DATA = "/path/to/ptb.train.txt"  # raw text file to convert
VOCAB = "/path/to/ptb.vocab"  # vocabulary file read by this script, one word per line
OUTPUT_DATA = "/path/to/ptb.train"  # output file with every word replaced by its id

# Load the vocabulary and map each word to its zero-based position in the file.
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
    vocab = [entry.strip() for entry in f_vocab.readlines()]
word_to_id = {word: index for index, word in enumerate(vocab)}


def get_id(word):
    """Return the id of ``word``, or the id of "<unk>" for unknown words.

    Words missing from ``word_to_id`` (low-frequency words dropped from the
    vocabulary) are mapped to the "<unk>" entry; a ``KeyError`` is raised if
    that entry is needed but absent from the vocabulary.
    """
    if word not in word_to_id:
        return word_to_id["<unk>"]
    return word_to_id[word]


# Convert the raw text line by line. The context managers guarantee that both
# files are closed (and the output flushed) even if a lookup or write fails
# part-way through, which the manual close() calls did not.
with codecs.open(RAW_DATA, "r", "utf-8") as fin, \
        codecs.open(OUTPUT_DATA, 'w', 'utf-8') as fout:
    for line in fin:
        # Split the line into words and append the "<eos>" end-of-sentence marker.
        words = line.strip().split() + ["<eos>"]
        # Replace every word with its vocabulary id, separated by spaces.
        out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
        fout.write(out_line)

9.2 从文本文件中读取数据，并将数据整理成batch

#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np

TRAIN_DATA = "/path/to/ptb.train"  # training data file (word ids)
TRAIN_BATCH_SIZE = 20  # batch_size passed to make_batches
TRAIN_NUM_STEP = 35  # num_step passed to make_batches


def read_data(file_path):
    """Read a file of whitespace-separated word ids into a flat list of ints.

    Line breaks are treated like any other whitespace, so the whole document
    becomes one continuous sequence of ids.
    """
    with open(file_path, "r") as fin:
        tokens = fin.read().split()
    return list(map(int, tokens))


def make_batches(id_list, batch_size, num_step):
    """Cut a flat sequence of word ids into (data, label) batches.

    Args:
        id_list: sequence of integer word ids for the whole corpus.
        batch_size: number of rows in every batch.
        num_step: number of consecutive ids per row in every batch.

    Returns:
        A list of ``(data, label)`` tuples, one per batch. Both members are
        arrays of shape ``[batch_size, num_step]``; ``label`` holds the ids one
        position to the right of ``data``, i.e. the next word to predict at
        each step. The list is empty when ``id_list`` is too short to fill a
        single batch.
    """
    # Each batch holds batch_size * num_step ids. One id is held back so the
    # label sequence, shifted right by one, never runs past the end.
    num_batches = (len(id_list) - 1) // (batch_size * num_step)
    if num_batches <= 0:
        # np.split cannot cut an array into zero sections (and an empty input
        # yields a negative count), so report "no batches" instead of crashing.
        return []

    total_ids = num_batches * batch_size * num_step
    row_length = num_batches * num_step

    # Lay the ids out as a [batch_size, num_batches * num_step] matrix, then
    # slice it along the second axis into num_batches equal pieces.
    data = np.reshape(np.array(id_list[:total_ids]), [batch_size, row_length])
    data_batches = np.split(data, num_batches, axis=1)

    # Same layout for the labels, with every position shifted right by one.
    label = np.reshape(np.array(id_list[1:total_ids + 1]),
                       [batch_size, row_length])
    label_batches = np.split(label, num_batches, axis=1)

    # Pair up each data matrix with its label matrix.
    return list(zip(data_batches, label_batches))


def main():
    """Read the training ids and arrange them into batches."""
    # The three values are separate positional arguments; wrapping them in an
    # extra pair of parentheses would pass a single tuple and raise TypeError.
    train_batches = make_batches(read_data(TRAIN_DATA),
                                 TRAIN_BATCH_SIZE,
                                 TRAIN_NUM_STEP)


if __name__ == '__main__':
    main()

9.2 一个完整的训练程序，它使用一个双层LSTM作为循环神经网络的主体，并共享Softmax层和词向量层的参数

#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf

# The data files below must already be preprocessed into word ids.
TRAIN_DATA = "/path/to/ptb.train"  # path to the training data
EVAL_DATA = "/path/to/ptb.valid"  # path to the validation data
TEST_DATA = "/path/to/ptb.test"  # path to the test data

HIDDEN_SIZE = 300  # size of the hidden layer
NUM_LAYERS = 2  # number of LSTM layers in the deep recurrent network
VOCAB_SIZE = 10000  # vocabulary size
TRAIN_BATCH_SIZE = 20  # batch size for the training data
TRAIN_NUM_STEP = 35  # truncation length for the training data

EVAL_BATCH_SIZE = 1  # batch size for the evaluation/test data
EVAL_NUM_STEP = 1  # truncation length for the evaluation/test data
NUM_EPOCH = 5  # number of passes over the training data
LSTM_KEEP_PROB = 0.9  # probability that an LSTM node is kept (not dropped out)
EMBEDDING_KEEP_PROB = 0.9  # probability that a word embedding is kept (not dropped out)
MAX_GRAD_NORM = 5  # upper bound on the gradient norm, used to limit exploding gradients
SHARE_EMB_AND_SOFTMAX = True  # share parameters between the softmax and embedding layers


# 通过一个PTBModel类来描述模型,这样方便维护循环神经网络中的状态
class PTBModel(object):
    def __init__(self, is_training, batch_size, num_steps):
        # 记录使用的batch大小和截断长度
        self.batch_size = batch_size
        self.num_steps = num_steps

        # 定义每一步的输入和预期输出,两者的维度都是[batch_size, num_steps]
        self.input_data = tf.placeholder(tf.int32, [batch_s