Chapter 6: Deep Learning for Text and Sequences

1.3 From raw text to word embeddings


The complete code for this section is shown below.

# -*- coding: utf-8 -*-
"""
 1.3 From raw text to word embeddings
"""
# 1. Download the raw IMDB text data (process the labels of the raw IMDB data)
import os

imdb_dir = 'F:/datasets/aclImdb/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []          # review labels
texts = []           # review texts

for label_type in ['neg', 'pos']:           # two label types: negative and positive
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), 'r', encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':        # assign the label from the directory name
                labels.append(0)           # negative review -> 0
            else:
                labels.append(1)           # positive review -> 1
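
# (Supplementary sketch, not part of the original post.) A quick sanity check of the
# corpus just loaded: with the full aclImdb/train split this should report 25000
# reviews, 12500 per class, assuming imdb_dir points at the complete dataset.
print('Loaded %d reviews (%d positive, %d negative)'
      % (len(texts), sum(labels), len(labels) - sum(labels)))
print('First review (first 200 chars):', texts[0][:200])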


# In[]:
# 2. Tokenize the text of the raw IMDB data (with the Keras Tokenizer). Pre-trained word embeddings are especially useful when very little training data is available.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                     # cut reviews off after 100 words
training_samples = 200           # train on 200 samples
validation_samples = 10000       # validate on 10000 samples
max_words = 10000                # only consider the 10000 most common words in the dataset

tokenizer = Tokenizer(num_words=max_words)          # create a tokenizer configured to only take the max_words=10000 most common words into account
tokenizer.fit_on_texts(texts)                       # build the word index

sequences = tokenizer.texts_to_sequences(texts)     # turn each review into a list of integer word indices
# print(sequences)
word_index = tokenizer.word_index                     # recover the word index that was computed
print('Found %s unique tokens.' % len(word_index))    # number of distinct tokens found


data = pad_sequences(sequences, maxlen=maxlen)        # pad/truncate every sequence to maxlen

labels = np.asarray(labels)              # labels was a plain Python list
print('Shape of data tensor:', data.shape)           # 2D tensor: (25000, 100)
print('Shape of label tensor:', labels.shape)        # (25000,)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)                # shuffle the samples (the files were read in order: all negative, then all positive)
data = data[indices]
labels = labels[indices]

x_train = data[: training_samples]        # split the data into training and validation sets
y_train = labels[: training_samples]
x_val = data[training_samples : training_samples + validation_samples]
y_val = labels[training_samples : training_samples + validation_samples]
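
# (Supplementary sketch, not part of the original post.) Two quick checks on the split.
# Note that pad_sequences truncates from the *front* by default, so each review keeps
# its last maxlen words, as the toy example below shows.
print('x_train:', x_train.shape, ' x_val:', x_val.shape)
print('Positive fraction in the 200 training samples:', y_train.mean())
print(pad_sequences([[1, 2, 3], [4, 5, 6, 7, 8]], maxlen=4))    # [[0 1 2 3], [5 6 7 8]]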


# In[]:
# 3. Preprocess the word embeddings (parse a .txt file and build an index that maps each word (a string) to its vector representation (a numeric vector))
# 3.1. Parse the GloVe word-embedding file
glove_dir = 'F:/datasets/glove.6B'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 'r', encoding='UTF-8')
for line in f:
    values = line.split()     # split each line on whitespace; in a real application you would also strip punctuation and special characters
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
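
# (Supplementary sketch, not part of the original post.) A small sanity check on the
# parsed vectors: related words should score a noticeably higher cosine similarity
# than unrelated ones. The word pairs are arbitrary examples.
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print('good vs great:', cosine_similarity(embeddings_index['good'], embeddings_index['great']))
print('good vs table:', cosine_similarity(embeddings_index['good'], embeddings_index['table']))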


# 3.2. Prepare the GloVe word-embedding matrix
embedding_dim = 100   # dimensionality of the GloVe vectors

embedding_matrix = np.zeros((max_words, embedding_dim))    # build a 2D embedding matrix of shape (10000, 100) that can be loaded into an Embedding layer
for word, i in word_index.items():                    # dict.items() returns the (word, index) pairs of the dictionary
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    # words missing from embeddings_index get an all-zero embedding vector
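
# (Supplementary sketch, not part of the original post.) Coverage check: rows of
# embedding_matrix that stayed all-zero belong to words (or the reserved index 0)
# with no pre-trained GloVe vector.
covered = int(np.count_nonzero(np.abs(embedding_matrix).sum(axis=1)))
print('Words with a GloVe vector: %d / %d' % (covered, max_words))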


# In[]:
# 4. Define the model
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))    # weights: (10000, 100); output: (samples, 100, 100)
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
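
# (Supplementary sketch, not part of the original post.) The parameter counts that
# model.summary() should report, worked out by hand:
#   Embedding: 10000 * 100                  = 1,000,000
#   Flatten:   outputs 100 * 100 = 10,000 values, no parameters
#   Dense(32): 10000 * 32 + 32              =   320,032
#   Dense(1):  32 * 1 + 1                   =        33
#   Total trainable parameters              = 1,320,065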


# 5. Load the GloVe embeddings into the model (copy the pre-trained word embeddings into the Embedding layer)  # set the weights manually
model.layers[0].set_weights([embedding_matrix])       # embedding_matrix is a 2D matrix
model.layers[0].trainable = False          # freeze the Embedding layer (large gradient updates from the randomly initialised layers above would destroy the features it already encodes)
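
# (Supplementary sketch, not part of the original post.) An equivalent construction is
# to hand the pre-trained matrix and the frozen flag straight to the Embedding layer;
# shown commented out so the original flow above stays untouched.
# model.add(Embedding(max_words, embedding_dim, input_length=maxlen,
#                     weights=[embedding_matrix], trainable=False))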


# 6. Train and evaluate the model
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])

history = model.fit(x_train, y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_data = (x_val, y_val))

model.save_weights('pre_trained_glove_model.h5')             # save only the weights; the same architecture must be rebuilt before load_weights()


# In[]:
# 7. Plot the results (acc, val_acc, loss, val_loss)
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
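
# (Supplementary sketch, not part of the original post.) The plotting code above is
# repeated verbatim for the second model below; a small helper like this would avoid
# the duplication. It is defined here but not used by the original script.
def plot_history(history):
    acc, val_acc = history.history['acc'], history.history['val_acc']
    loss, val_loss = history.history['loss'], history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, acc, 'r', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss, 'r', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()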

# In[]:
# ************** Modelling
# 8. Train the same model without the pre-trained word embeddings
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))


import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


# ************* Testing
# 9. Tokenize the test-set data
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), 'r', encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)
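
# (Supplementary sketch, not part of the original post.) Shape check on the test data
# built above: 25000 reviews, each padded/truncated to maxlen tokens.
print('x_test:', x_test.shape, ' y_test:', y_test.shape)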

# 10. Evaluate the model on the test set
model.load_weights('pre_trained_glove_model.h5')
loss, accuracy = model.evaluate(x_test, y_test)
print('loss=', loss)
print('accuracy=', accuracy)
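
# (Supplementary sketch, not part of the original post.) End-to-end inference on a raw
# review string, reusing the tokenizer and the reloaded GloVe model from above; the
# review text is made up for illustration.
sample = ["This movie was surprisingly good, I enjoyed every minute of it"]
sample_seq = pad_sequences(tokenizer.texts_to_sequences(sample), maxlen=maxlen)
print('P(positive) =', float(model.predict(sample_seq)[0][0]))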


Output of running the code: not shown.
