NLP - an example of loading a pretrained embedding

Sentiment analysis on text (a binary classification model built with Keras, using pretrained GloVe embeddings).

# -*- coding: utf-8 -*-
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
import matplotlib.pyplot as plt


labels = []
texts = []

maxlen = 100
training_samples = 20000
validation_samples = 4000
max_words = 10000
embedding_index = {}
embedding_dim = 300

predict_texts = ['I love you and you are so beautiful, you are good, you are good, you are good',
                 'I hate you and you are bad, you are bad, you are bad',
                 'you are good']


def get_datasource():
    imdb_dir = 'D:/DL/keras/aclImdb'
    train_dir = os.path.join(imdb_dir, 'train')
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), 'r', encoding='UTF-8')
                texts.append(f.read())
                f.close()
                if label_type == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)

    print('texts', texts[0:3])
    print('len(texts)', len(texts))
    return texts, labels


def text_split(training_samples, validation_samples, texts, labels):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    # convert each string into a list of integer word indices
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=maxlen)

    labels = np.asarray(labels)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    print('data[0]: ', len(data[0]), '\n', data[0])

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    x_train = data[:training_samples]
    y_train = labels[:training_samples]
    x_val = data[training_samples: training_samples + validation_samples]
    y_val = labels[training_samples: training_samples + validation_samples]

    return x_train, y_train, x_val, y_val, word_index, tokenizer


def prepare_Glove(embedding_index, word_index, max_words, embedding_dim):
    glove_dir = 'D:/DL/keras/glove.6B'
    f = open(os.path.join(glove_dir, 'glove.6B.300d.txt'), 'r', encoding='UTF-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embedding_index))

    # word_index maps the corpus words to integer indices; embedding_index holds the pretrained GloVe vectors
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


def predict_samples_vec(samples, tokenizer):
    sequence = tokenizer.texts_to_sequences(samples)
    pad_seq = pad_sequences(sequence, maxlen=maxlen)
    print('pad_seq', pad_seq)
    return pad_seq


def my_model(max_words, embedding_dim, maxlen):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    # flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen * embedding_dim)
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    return model


def train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val):
    model = my_model(max_words, embedding_dim, maxlen)
    model.layers[0].set_weights([embedding_matrix])  # load the pretrained GloVe embedding
    model.layers[0].trainable = False

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    save_model_file = 'pre_trained_glove_model.h5'

    if not os.path.exists(save_model_file):

        history = model.fit(x_train, y_train, epochs=20, batch_size=128, validation_data=(x_val, y_val))
        model.save_weights(save_model_file)

        acc = history.history['acc']
        val_acc = history.history['val_acc']
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        epochs = range(1, len(acc) + 1)
        plt.plot(epochs, acc, 'bo', label='Training acc')
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()
        plt.plot(epochs, loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()
        plt.show()


def predict():
    model = my_model(max_words, embedding_dim, maxlen)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

    save_model_file = 'pre_trained_glove_model.h5'
    model.load_weights(save_model_file)
    print('model has restored...')
    pred_test = model.predict(predict_texts)
    for i in pred_test:
        if i[0] < 0.3:
            print('this is a negative comment')
        else:
            print('this is a positive comment')


if __name__ == "__main__":
    text, label = get_datasource()
    x_train, y_train, x_val, y_val, word_index, tokenizer = text_split(training_samples, validation_samples, text, label)
    # embedding_matrix = prepare_Glove(embedding_index, word_index, max_words, embedding_dim)

    predict_texts = predict_samples_vec(predict_texts, tokenizer)
    # train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val)
    predict()





Two functions deserve a closer look: prepare_Glove, which parses the GloVe file into embedding_index and builds an embedding_matrix whose rows are aligned with the tokenizer's word_index, and train_model, which copies that matrix into the first layer of the model and freezes it before compiling.
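A quick way to verify that the matrix was filled correctly is the following minimal sketch; it assumes prepare_Glove has actually been run (it is commented out in __main__ above), so that word_index, embedding_index and embedding_matrix are all available:

word = 'good'  # any word that occurs in both the IMDB corpus and GloVe
i = word_index[word]
if i < max_words:
    # the row for this word should be exactly its GloVe vector
    assert np.allclose(embedding_matrix[i], embedding_index[word])
# row 0 (reserved for padding) and words missing from GloVe stay all-zero
print('all-zero rows:', int((~embedding_matrix.any(axis=1)).sum()), 'of', max_words)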

The Embedding layer needs to be frozen (its trainable attribute set to False), for the same reason as with pretrained convnet features, which you are already familiar with: when one part of a model is pretrained (the Embedding layer here) and another part is randomly initialized (the classifier), the pretrained part should not be updated during training, to avoid losing the information it already stores.
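An equivalent way to wire this up (a minimal sketch, not the exact code used above) is to hand the pretrained weights to the Embedding constructor and freeze the layer up front, so the separate set_weights call is no longer needed:

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# sketch: assumes embedding_matrix has shape (max_words, embedding_dim),
# as returned by prepare_Glove above
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen,
                    weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

With the layer frozen before compiling, model.summary() reports the 10,000 × 300 = 3,000,000 embedding weights as non-trainable parameters, so rmsprop only updates the Dense layers.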