Keras Series: Using Pretrained Word Embeddings

Author's note: study notes on "Deep Learning with Python", written to build my own familiarity and understanding.

This post follows the previous one in the series, "Keras Series: Text Vectorization".

When the dataset is small, it is hard to learn a good enough Embedding layer from scratch. Loading a well-established, publicly released database of pretrained word embeddings is an effective way around the data shortage. Without further ado, the example follows; it builds on the book's code with improvements and added comments, so it should be friendly to beginners.

IMDB dataset: http://mng.bz/0tIo
GloVe embedding files: https://nlp.stanford.edu/projects/glove (download the precomputed embeddings trained on 2014 English Wikipedia)
Full code: https://github.com/Clarksh/keras_learning/blob/master/6.1.3-glove-using.py
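
After unzipping glove.6B.zip, each line of glove.6B.100d.txt is a word followed by its 100-dimensional vector as whitespace-separated floats; the numbers below are illustrative, not copied from the file:

the 0.418 0.24968 -0.41242 ... (100 values in total)

This is exactly the format that load_glove() below parses line by line.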

import os
import pickle
from time import time

import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

maxlen = 100                # truncate each review after 100 words
training_samples = 200      # train on only 200 samples
validation_samples = 10000  # validate on 10,000 samples
max_words = 10000           # consider only the top 10,000 most common words in the dataset

"""把文件数据放入数组中"""
def dataProcess():
    imdb_dir = 'data/aclImdb'  # base path; everything below lives under it
    # process the training set
    train_dir = os.path.join(imdb_dir, 'train')  # append the subdirectory
    train_labels = []
    train_texts = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):  # every file name in the directory
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), 'r', encoding='utf8')
                train_texts.append(f.read())
                f.close()
                if label_type == 'neg':
                    train_labels.append(0)
                else:
                    train_labels.append(1)
    # process the test set
    test_dir = os.path.join(imdb_dir, 'test')
    test_labels = []
    test_texts = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(test_dir, label_type)
        for fname in sorted(os.listdir(dir_name)):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname),'r',encoding='utf8')
                test_texts.append(f.read())
                f.close()
                if label_type == 'neg':
                    test_labels.append(0)
                else:
                    test_labels.append(1)

    # tokenize the data, then split it into training and validation sets
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)  # build the word index

    sequences = tokenizer.texts_to_sequences(train_texts)  # texts -> lists of integer indices
    word_index = tokenizer.word_index  # the word-to-index dictionary
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen)
    train_labels = np.asarray(train_labels)  # convert the list to a NumPy array
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', train_labels.shape)
    indices = np.arange(data.shape[0])  # review order: 0, 1, 2, 3, ...
    np.random.shuffle(indices)  # shuffle, e.g. to 3, 1, 2, 0 (the files are ordered neg then pos)
    data = data[indices]
    train_labels = train_labels[indices]
    x_train = data[:training_samples]
    y_train = train_labels[:training_samples]
    x_val = data[training_samples: training_samples + validation_samples]
    y_val = train_labels[training_samples: training_samples + validation_samples]

    # the test set must be vectorized with the same tokenizer
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    x_test = pad_sequences(test_sequences, maxlen=maxlen)
    y_test = np.asarray(test_labels)

    return x_train,y_train,x_val,y_val,x_test,y_test,word_index
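
# A quick orientation note (my addition): the aclImdb train split contains
# 25,000 labeled reviews, so dataProcess() returns tensors shaped
#   x_train: (200, 100), x_val: (10000, 100), x_test: (25000, 100),
# where each row is one review padded/truncated to maxlen=100 word indices.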


embedding_dim = 100  # embedding dimensionality; must match the glove.6B.100d file
"""Parse the pretrained GloVe file into an embedding matrix that can be loaded into the Embedding layer"""
def load_glove(word_index):  # load the pretrained GloVe vectors
    embedding_file = 'data/glove.6B'
    embeddings_index = {}  # dictionary mapping word -> vector
    f = open(os.path.join(embedding_file, 'glove.6B.100d.txt'), 'r', encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    """Build the matrix to load into the Embedding layer; its shape is (max_words, embedding_dim)"""
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():  # words and their tokenizer indices
        if i >= max_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:  # words not found in GloVe stay all-zero
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
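
# Optional coverage check (my addition, not in the book; run after dataProcess()):
# rows of the matrix left all-zero correspond to words with no GloVe vector
# (row 0 is the padding index and is always zero).
# matrix = load_glove(word_index)
# hits = int(np.count_nonzero(np.any(matrix != 0, axis=1)))
# print('%d / %d of the top words have a pretrained vector' % (hits, max_words))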

if __name__ == '__main__':
    t0 = time()
    """处理数据文件"""
    x_train, y_train, x_val, y_val,x_test,y_test, word_index = dataProcess()

    """处理glove词嵌入文件"""
    embedding_matrix=load_glove(word_index)
    #可以把得到的嵌入矩阵保存起来,方便后面fine-tune"""
    # #保存
    # with open('model/glove_embedding_matrix', 'wb') as fp:
    #     pickle.dump(embedding_matrix, fp)
    # # 读取
    # with open('model/glove_embedding_matrix', 'rb') as fp:
    #     embedding_matrix = pickle.load(fp)

    """构建神经网络"""
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(Flatten())  # flatten the 3D embedding output into a 2D tensor
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    """将预训练的词嵌入矩阵放到embedding层,并冻结起来,训练神经网络时该层不会再变化"""
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = False
    """训练和评估"""
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    history = model.fit(x_train, y_train,
                        epochs=10,
                        batch_size=32,
                        validation_data=(x_val, y_val))
    model.save_weights('pre_trained_glove_model.h5')  # save the trained weights

    """绘制结果"""
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

    """ 在测试集上评估模型"""
    model.load_weights('pre_trained_glove_model.h5')
    model.evaluate(x_test, y_test)

    print('Done in %.3fs...' % (time() - t0))
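
As a side note, with the standalone Keras versions the book uses, the same frozen pretrained layer can be created in one step by passing the matrix to the Embedding constructor instead of calling set_weights afterwards (a minimal sketch, assuming embedding_matrix was built by load_glove as above):

model.add(Embedding(max_words, embedding_dim, input_length=maxlen,
                    weights=[embedding_matrix], trainable=False))

To fine-tune the embeddings later, set model.layers[0].trainable = True, recompile the model, and continue training, typically with a lower learning rate.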

 
