Loading text with tf.data
Code
"""
Created on 2020/11/20 15:31
@Author: CY
@email: 5844104706@qq.com
"""
import tensorflow as tf
import tensorflow_datasets as tfds
import os

DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download the three English translations of the Iliad into the Keras cache directory.
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)

parent_dir = os.path.dirname(text_dir)
print("Data directory:", parent_dir)
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

labeled_data_sets = []
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

# Concatenate the three labeled datasets and shuffle them once
# (reshuffle_each_iteration=False keeps the train/test split below stable).
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

for ex in all_labeled_data.take(5):
    print(ex)
print("将文本编码成数字")
tokenizer = tfds.deprecated.text.Tokenizer()
vocabulary_set = set()
# Build the vocabulary from every token that appears in the corpus.
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)
vocab_size = len(vocabulary_set)
print('Vocabulary size:', vocab_size)
print("#样本编码")
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)
example_text = next(iter(all_labeled_data))[0].numpy()
print('原文:',example_text)
encoded_example = encoder.encode(example_text)
print('编码:',encoded_example)
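# (Not part of the original run: TokenTextEncoder also provides decode(), so a
#  hypothetical round-trip check such as
#      print('Decoded:', encoder.decode(encoded_example))
#  should recover the tokens, minus any punctuation dropped by the tokenizer.)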
print("在数据集上运行编码器(通过将编码器打包到 tf.py_function 并且传参至数据集的 map 方法的方式来运行")
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

def encode_map_fn(text, label):
    # tf.py_function runs the Python encoder eagerly; it loses the static shape
    # information, so the shapes are restored manually afterwards.
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int64, tf.int64))
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

all_encoded_data = all_labeled_data.map(encode_map_fn)
print("#将数据集分割为测试集和训练集且进行分支")
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)
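# (Note, not in the original script: padded_batch pads each sequence in a batch
#  with 0 up to the length of the longest sequence in that batch, which is why
#  vocab_size is incremented by one further down -- id 0 is reserved for padding.)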
sample_text, sample_labels = next(iter(test_data))
print(sample_text[0], sample_labels[0])
vocab_size += 1
print("建模:")
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
print("#训练模型")
model.fit(train_data, epochs=3, validation_data=test_data)
eval_loss, eval_acc = model.evaluate(test_data)
print('\nEval loss: {}, Eval accuracy: {}'.format(eval_loss, eval_acc))
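As a follow-up (not part of the original script, so its output does not appear in the execution results below), the trained model and the encoder can be combined to classify a new line. This is a minimal sketch assuming the script above has already run, so model, encoder, and FILE_NAMES are still in scope; the sample line is reused from the data shown earlier to avoid out-of-vocabulary tokens:

import numpy as np

sample_line = 'The son of Phylacus; these two in arms'  # a line from the sample output above
encoded = encoder.encode(sample_line)                   # list of token ids, same mapping as the training data
probs = model.predict(tf.constant([encoded], dtype=tf.int64))  # shape (1, 3)
print('Predicted translation:', FILE_NAMES[int(np.argmax(probs[0]))])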
Execution results
Data directory: C:\Users\Administrator\.keras\datasets
(<tf.Tensor: shape=(), dtype=string, numpy=b"May bury him, and to his mem'ry raise">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'The reins attaching to the chariot-rail,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'And brisk in fight Oresbius; rich was he,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'The son of Phylacus; these two in arms'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Patroclus: whom I never can forget,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
Encode the text as integers
Vocabulary size: 17178
# Encode a sample line
Original: b"May bury him, and to his mem'ry raise"
Encoded: [1837, 2380, 16078, 12233, 2267, 7486, 7893, 17003, 1487]
Run the encoder over the dataset (wrap it in tf.py_function and pass it to the dataset's map method)
# Split the dataset into training and test sets and batch them
Build the model:
# Train the model
Epoch 1/3
697/697 [==============================] - 21s 31ms/step - loss: 0.5098 - accuracy: 0.7559 - val_loss: 0.3877 - val_accuracy: 0.8248
Epoch 2/3
697/697 [==============================] - 19s 28ms/step - loss: 0.2940 - accuracy: 0.8698 - val_loss: 0.3612 - val_accuracy: 0.8364
Epoch 3/3
697/697 [==============================] - 18s 26ms/step - loss: 0.2186 - accuracy: 0.9045 - val_loss: 0.3686 - val_accuracy: 0.8418
79/79 [==============================] - 1s 17ms/step - loss: 0.3686 - accuracy: 0.8418
Eval loss: 0.36862486600875854, Eval accuracy: 0.8417999744415283