"""Train a 1-D CNN sentiment classifier on the IMDB review dataset.

Word embeddings come from the pretrained GoogleNews word2vec vectors
(loaded via gensim); the network itself is built with Keras.
"""
import glob
import os
from random import shuffle

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from keras.callbacks import TensorBoard
from keras.layers import Activation, Dense, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D
from keras.models import Sequential
from keras.preprocessing import sequence
from nltk.tokenize import TreebankWordTokenizer

# Load the first 2000 pretrained 300-d word2vec vectors; `limit` keeps the
# memory footprint small (the full file holds ~3M vectors).
word_vectors = KeyedVectors.load_word2vec_format(
    'googlenews-vectors-negative300.bin.gz', binary=True, limit=2000)
def pre_process_data(filepath):
    """Load the IMDB review files and return shuffled (label, text) pairs.

    Parameters
    ----------
    filepath : str
        Directory containing ``pos`` and ``neg`` subdirectories of
        ``*.txt`` review files (one review per file).

    Returns
    -------
    list of tuple
        Shuffled ``(label, text)`` pairs; label is 1 for positive
        reviews and 0 for negative ones.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    positive_label = 1
    negative_label = 0
    datasets = []
    # BUG FIX: the original positive branch iterated `for line in f` and then
    # called f.read() inside the loop — that consumed the first line via the
    # iterator and stored only the remainder of the file, silently truncating
    # every positive sample. Read the whole file once, like the negative branch.
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='utf-8') as f:
            datasets.append((positive_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='utf-8') as f:
            datasets.append((negative_label, f.read()))
    # Shuffle so the later 80/20 slice split mixes both classes.
    shuffle(datasets)
    return datasets
def tokenize_and_vectorize(dataset, vectors=None):
    """Tokenize each sample's text and map tokens to word vectors.

    Parameters
    ----------
    dataset : list of tuple
        ``(label, text)`` pairs as produced by :func:`pre_process_data`.
    vectors : mapping, optional
        Token -> vector lookup (raising ``KeyError`` for unknown tokens).
        Defaults to the module-level pretrained ``word_vectors``.

    Returns
    -------
    list of list
        One entry per sample: the list of vectors for its in-vocabulary
        tokens. Out-of-vocabulary tokens are silently dropped, so sample
        lengths vary (padding happens later).
    """
    if vectors is None:
        vectors = word_vectors
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(vectors[token])
            except KeyError:
                # Token absent from the (limited) embedding vocabulary: skip.
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data
# Target labels
def collected_excepted(dataset):
    """Return the label (first element) of every sample in *dataset*."""
    return [sample[0] for sample in dataset]
# Build the training corpus: load labelled reviews, vectorize the text,
# and collect the target labels.
# NOTE(review): hard-coded machine-specific path — consider making it configurable.
dataset = pre_process_data('E:/Pythonproject/NLP实战/aclImdb/train')
vectorized_data = tokenize_and_vectorize(dataset)
excepetd = collected_excepted(dataset)
# Train/test split: first 80% for training, last 20% for testing
# (the dataset was shuffled in pre_process_data, so the slice is unbiased).
split_point = int(len(vectorized_data)*0.8)
x_train = vectorized_data[:split_point]
y_train = excepetd[:split_point]
x_test = vectorized_data[split_point:]
y_test = excepetd[split_point:]
# Hyperparameters
maxlen = 40            # every review is truncated/padded to 40 tokens
batch_size = 2         # NOTE(review): unusually small batch size — presumably for demo speed; confirm
embedding_dims = 300   # dimensionality of the word2vec embeddings
filters = 250          # number of Conv1D filters
kernel_size = 3        # Conv1D window width, in tokens
hidden_size = 250      # units in the dense hidden layer
epoch = 20             # number of training epochs
# Pad/truncate each sample to exactly maxlen token vectors: long samples are
# cut at maxlen, short ones are right-padded with all-zero vectors.
x_train = [smp[:maxlen]+[[0.]*embedding_dims]*(maxlen-len(smp)) for smp in x_train]
x_test = [smp[:maxlen]+[[0.]*embedding_dims]*(maxlen-len(smp)) for smp in x_test]
# Convert to numpy arrays shaped (samples, maxlen, embedding_dims) for Keras.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
# Build the model: Conv1D over the token-embedding sequence -> global average
# pooling -> dense hidden layer with dropout -> single sigmoid unit for
# binary (positive/negative) sentiment.
model = Sequential()
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1, input_shape=(maxlen, embedding_dims)))
model.add(GlobalAveragePooling1D())
model.add(Dense(hidden_size))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Log per-batch metrics for TensorBoard under ./log.
tensorboard = TensorBoard(log_dir='log', update_freq='batch')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch, validation_data=(x_test, y_test), verbose=1, callbacks=[tensorboard])
# Persist the architecture (JSON) and the learned weights (HDF5) separately.
model_structure = model.to_json()
with open('cnn_model.json',"w") as json_file:
    json_file.write(model_structure)
model.save_weights('cnn_models.h5')
# IMDB sentiment analysis implemented with a CNN
# (blog-post residue: original article last published 2024-03-22)