import glob
import os
import numpy as np
from keras.callbacks import TensorBoard
from keras.layers import Dense,Dropout,SimpleRNN, Flatten
from keras.models import Sequential
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# Load pretrained 300-dim Google News word2vec embeddings, restricted to the
# 20,000 most frequent words to bound memory use.
# NOTE(review): this runs at import time and needs the .bin.gz file in the
# working directory — slow first load; consider lazy-loading.
word_vectors = KeyedVectors.load_word2vec_format('googlenews-vectors-negative300.bin.gz', limit=20000, binary=True)
def pre_process_data(filepath):
    """Load the IMDB review corpus rooted at *filepath*.

    Expects two sub-directories, ``pos`` and ``neg``, each containing
    ``*.txt`` review files (the aclImdb layout).

    Args:
        filepath: Root directory of the split (e.g. ``'./aclImdb/train'``).

    Returns:
        A shuffled list of ``(label, text)`` tuples, where label is 1 for
        positive reviews and 0 for negative ones.
    """
    dataset = []
    # One pass per class instead of two copy-pasted loops: 1 = pos, 0 = neg.
    for label, subdir in ((1, 'pos'), (0, 'neg')):
        for filename in glob.glob(os.path.join(filepath, subdir, '*.txt')):
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))
    # Interleave classes so a later sequential train/test split stays balanced.
    shuffle(dataset)
    return dataset
def token_and_vectorizer(dataset):
    """Tokenize each review and map its tokens to word2vec embeddings.

    Args:
        dataset: Iterable of ``(label, text)`` tuples.

    Returns:
        One list of embedding vectors per sample, in input order. Tokens
        absent from ``word_vectors``'s vocabulary are silently dropped, so a
        sample's vector list may be shorter than its token count (or empty).
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        text = sample[1]
        sample_vecs = []
        for token in tokenizer.tokenize(text):
            # EAFP: out-of-vocabulary tokens raise KeyError and are skipped.
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data
def collect_expected(dataset):
    """Return the label of every sample in *dataset*, preserving order.

    Args:
        dataset: Iterable of ``(label, text)`` tuples as produced by
            ``pre_process_data``.

    Returns:
        List of labels (1 = positive, 0 = negative), one per sample.
    """
    # Idiomatic comprehension replaces the manual append loop; also drops the
    # misspelled local name 'excepted' (should have been 'expected').
    return [sample[0] for sample in dataset]
# Load the raw reviews and turn each into a sequence of word vectors.
dataset = pre_process_data('./aclImdb/train')
vectorized_data = token_and_vectorizer(dataset)
# NOTE(review): 'excepted' looks like a typo for 'expected' (the labels).
excepted = collect_expected(dataset)
# 80/20 sequential train/test split; safe because pre_process_data shuffled.
split_point = int(len(vectorized_data)*0.8)
x_train = vectorized_data[:split_point]
x_test = vectorized_data[split_point:]
y_train = excepted[:split_point]
y_test = excepted[split_point:]
# Hyperparameters
maxlen = 40  # every review is truncated/padded to exactly 40 tokens
batch_size = 2
embedding_dims = 300  # must match the word2vec vector size loaded above
epoch = 2
# Truncate to maxlen tokens and right-pad short samples with 300-dim zero
# vectors (for len(smp) >= maxlen the padding term multiplies out to []).
x_train = [smp[:maxlen]+[[0.]*embedding_dims]*(maxlen-len(smp)) for smp in x_train]
x_test = [smp[:maxlen]+[[0.]*embedding_dims]*(maxlen-len(smp)) for smp in x_test]
# Stack into (num_samples, maxlen, embedding_dims) arrays for Keras.
x_train = np.reshape(x_train,(len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
# Build the model: one SimpleRNN layer over the embedded sequence, flattened
# and fed to a single sigmoid unit for binary sentiment classification.
num_neurous = 50  # NOTE(review): presumably meant 'num_neurons'
model = Sequential()
# return_sequences=True keeps the per-timestep outputs so Flatten sees
# maxlen * num_neurous features.
model.add(SimpleRNN(num_neurous, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', loss='binary_crossentropy',metrics=['accuracy'])
model.summary()
# Log per-batch metrics to ./log for TensorBoard inspection.
tensorboard = TensorBoard(log_dir='log', update_freq='batch')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch, validation_data=(x_test,y_test), callbacks=[tensorboard], verbose=1)
# Persist the architecture (JSON) and the weights (HDF5) as separate files.
model_structure = model.to_json()
with open('simplernn_model.json',"w") as json_file:
    json_file.write(model_structure)
model.save_weights('simplernn_model.h5')
# Sentiment analysis implemented with an RNN (利用RNN实现情感分析)
# Originally published 2024-04-27 11:15:08