# Use pre-trained GloVe word embeddings for IMDB sentiment classification
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense
# --- Load the raw IMDB training reviews and their sentiment labels ---
imdb_dir = r'E:\aclImdb\aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 for a negative review, 1 for a positive review
texts = []   # one raw review string per sample

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns every entry in the directory; only .txt files are reviews.
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # `with` guarantees the handle is closed even if read() raises
            # (the original open()/close() pair leaked the handle on error).
            with open(os.path.join(dir_name, fname), 'r', encoding='utf8') as f:
                texts.append(f.read())  # whole review as a single string
            labels.append(0 if label_type == 'neg' else 1)

print(texts[0])
# --- Tokenize the texts and carve out small train / validation splits ---
maxlen = 100                # truncate / pad every review to 100 tokens
training_samples = 200      # deliberately tiny training set
validation_samples = 10000  # size of the held-out validation set
max_word = 10000            # vocabulary cap: keep the 10,000 most common words

tokenizer = Tokenizer(num_words=max_word)        # tokenizer limited to the top words
tokenizer.fit_on_texts(texts)                    # build the word -> index vocabulary
sequences = tokenizer.texts_to_sequences(texts)  # each review -> list of int indices
word_index = tokenizer.word_index                # full word -> index mapping

# Pad with zeros (or truncate) so every sequence is exactly `maxlen` long.
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

# Shuffle samples and labels together using one shared random permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

split = training_samples
x_train, y_train = data[:split], labels[:split]
x_val = data[split:split + validation_samples]
y_val = labels[split:split + validation_samples]
# --- Parse the pre-trained GloVe file into a word -> vector dictionary ---
glove_dir = r'D:\glove'
embeddings_index = {}
# `with` closes the file even on error (the original never closed it at all).
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        # The remaining tokens on the line are the embedding coefficients
        # (100 floats for the glove.6B.100d file).
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# --- Build the Embedding-layer weight matrix from the GloVe vectors ---
embedding_dim = 100
embeddings_matrix = np.zeros((max_word, embedding_dim))
for word, idx in word_index.items():  # word_index maps word -> integer index
    if idx >= max_word:
        continue  # index falls outside the vocabulary cap; skip it
    vector = embeddings_index.get(word)
    # Words absent from GloVe keep their all-zero row in the matrix.
    if vector is not None:
        embeddings_matrix[idx] = vector
# --- Classifier: frozen GloVe embedding -> Flatten -> small dense head ---
model = Sequential([
    Embedding(max_word, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Load the pre-trained GloVe matrix into the embedding layer and freeze it,
# so training only updates the dense layers on top.
model.layers[0].set_weights([embeddings_matrix])
model.layers[0].trainable = False
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10, batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')