Keras Getting-Started Series: Day 6

Using pre-trained word embeddings on the IMDB dataset: read the raw reviews, tokenize them, load GloVe vectors into a frozen Embedding layer, and train a small sentiment classifier.

import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense
imdb_dir = r'E:\aclImdb\aclImdb'
train_dir = os.path.join(imdb_dir,'train')
labels = []
texts = []
for label_type in ['neg','pos']: # label the reviews: neg -> 0, pos -> 1
    dir_name = os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name): # os.listdir returns the file names under the directory as a list
        if fname.endswith('.txt'):
            f = open(os.path.join(dir_name,fname),'r',encoding='utf8')
            texts.append(f.read()) # read the whole review file as one string
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
print(texts[0])
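# Optional check (not in the original script): train/neg and train/pos each
# hold 12,500 reviews in aclImdb, so both lists should have 25,000 entries.
print(len(texts), len(labels))  # expected: 25000 25000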
maxlen = 100 # cut reviews off after 100 words (words, not characters)
training_samples = 200 # train on only 200 samples
validation_samples = 10000 # validate on 10,000 samples
max_word = 10000 # consider only the 10,000 most common words in the dataset
tokenizer = Tokenizer(num_words=max_word) # create a tokenizer restricted to the top 10,000 words
tokenizer.fit_on_texts(texts) # build the word index
sequences = tokenizer.texts_to_sequences(texts) # turn each review into a list of integer word indices
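# For illustration only (indices depend on corpus word frequencies, so these
# values are hypothetical): "this movie was great" -> [11, 17, 13, 84]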
word_index = tokenizer.word_index # dict mapping every word to its integer index
data = pad_sequences(sequences,maxlen=maxlen) # pad with zeros / truncate so every sequence has length maxlen
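# Note: pad_sequences pads and truncates at the start of each sequence by
# default (padding='pre', truncating='pre'), so it keeps each review's last 100 words.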
labels = np.asarray(labels)
indices = np.arange(data.shape[0]) # one index per sample
np.random.shuffle(indices) # shuffle: the reviews were read in neg-then-pos order, so we must mix them before splitting
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples+validation_samples]
y_val = labels[training_samples:training_samples+validation_samples]
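# Optional shape check (not in the original): with maxlen=100 the padded data
# tensor should be (num_reviews, 100).
print('data shape:', data.shape)
print('labels shape:', labels.shape)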
glove_dir = r'D:\glove'
embeddings_index = {}
f = open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embeddings_index[word] = coefs # dict mapping each word to its 100-d GloVe vector
f.close()
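# Each line of glove.6B.100d.txt is a token followed by 100 floats, e.g.
# (abridged): "the 0.418 0.24968 -0.41242 ..."
print('Found %s word vectors.' % len(embeddings_index))  # glove.6B has ~400,000 tokens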
embedding_dim = 100
embeddings_matrix = np.zeros((max_word,embedding_dim))
for word,i in word_index.items(): # word_index is a dict of {word: index}
    if i < max_word:
        embedding_vec = embeddings_index.get(word) # look up the word's GloVe vector
        if embedding_vec is not None: # only embed words that GloVe actually contains
            embeddings_matrix[i] = embedding_vec
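# Rows left as all zeros: index 0 (reserved for padding) and any word with no
# GloVe entry; since the layer will be frozen, those rows stay zero during training.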
model = Sequential()
model.add(Embedding(max_word,embedding_dim,input_length=maxlen))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
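# Optional: inspect the architecture; Flatten turns the (100, 100) embedding
# output into a 10,000-dimensional vector feeding the Dense layers.
model.summary()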
model.layers[0].set_weights([embeddings_matrix]) # load the pre-trained GloVe weights into the Embedding layer
model.layers[0].trainable = False # freeze the Embedding layer so training cannot destroy the pre-trained vectors
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history = model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))
model.save_weights('pre_trained_glove_model.h5')
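With only 200 training samples the model overfits quickly, which is easiest to see by plotting the training history. A minimal sketch, assuming matplotlib is installed (with metrics=['acc'], this Keras version stores the curves under the history keys 'acc' and 'val_acc'):

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()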

 
