import os

import numpy as np
from keras.preprocessing. sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
# Hyper-parameters for preparing the review data.

# Reviews are cut off after this many words.
maxlen = 100

# Number of samples used for training.
training_samples = 200

# Number of samples used for validation.
validation_samples = 10000

# Only the most frequent `max_words` words in the dataset are considered.
max_words = 10000
# Tokenize the raw review texts and turn them into fixed-length integer
# sequences. `texts` and `labels` are expected to be defined earlier in the
# script (one label per text).
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Mapping from word to integer index built while fitting the tokenizer
# (the original author noted a length of 88582 on their run).
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

# Bring every sequence to the same length so they stack into one 2-D array.
data = pad_sequences(sequences, maxlen=maxlen)

# Convert the labels to an array so they can be indexed the same way as `data`.
labels = np.asarray(labels)
print("Shape of data tensor:", data.shape)
print("Shape of label tensor:", labels.shape)
# Split the data into a training and a validation set. The samples are
# ordered (all negatives first, then all positives), so shuffle them first.
indices = np.arange(len(data))
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

# The first `training_samples` rows form the training set
# (expected shapes: (200, 100) and (200,) with the settings in this script).
x_train, y_train = data[:training_samples], labels[:training_samples]

# The following `validation_samples` rows form the validation set
# (expected shapes: (10000, 100) and (10000,) with the settings in this script).
val_end = training_samples + validation_samples
x_val = data[training_samples:val_end]
y_val = labels[training_samples:val_end]
# 3. Download the GloVe word vectors and read them in
# Parse the GloVe text file into a dict mapping each word to its vector.
glove_dir = "./"
embeddings_index = {}

# The encoding is passed explicitly so that reading does not depend on the
# platform's default encoding; `with` guarantees the file is closed even if
# parsing a line fails.
with open(os.path.join(glove_dir, "glove.6B.50d.txt"), encoding="utf-8") as f:
    for line in f:
        # Each line is parsed as: <word> <coef_1> <coef_2> ... (whitespace separated)
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

print("found %s word vectors." % len(embeddings_index))
# 4. Read the word vectors into the embedding matrix
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i.
# The loaded file is glove.6B.50d.txt, i.e. 50-dimensional vectors (the
# original author notes it covers 400,000 words).
embedding_dim = 50

# Shape (max_words, embedding_dim); starts as all zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    # Indices at or beyond max_words have no row in the matrix.
    if i < max_words:
        embedding_vector = embeddings_index.get(word)  # shape (embedding_dim,) or None
        if embedding_vector is not None:
            # Words not found in the embedding index keep their all-zero row.
            embedding_matrix[i] = embedding_vector
# Train the model and store its weights. `model` is defined outside this
# excerpt.
# NOTE(review): confirm that `embedding_matrix` has been loaded into the
# model's embedding layer before this point; that step is not visible here.
model.compile(
    optimizer="rmsprop",
    # Binary classification; a multiclass problem would use
    # categorical_crossentropy instead.
    loss="binary_crossentropy",
    metrics=["acc"],
)

# Fit on the training split and evaluate on the validation split each epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

# Save the trained weights to disk.
model.save_weights("pre_trained_glove_model.h5")