import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
import glob
import os
from random import shuffle
from keras.callbacks import TensorBoard

# Load the pretrained GoogleNews word2vec vectors; limit=2000 loads only the first
# 2,000 entries in the file to save memory (raise or drop the limit for better coverage)
word_vectors = KeyedVectors.load_word2vec_format('googlenews-vectors-negative300.bin.gz', binary=True, limit=2000)
def pre_process_data(filepath):
    """Read the IMDb train directory and return a shuffled list of (label, text) tuples."""
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    positive_label = 1
    negative_label = 0
    dataset = []
    # Each review is stored as a single .txt file; read the whole file as one sample
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r', encoding='utf-8') as f:
            dataset.append((positive_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r', encoding='utf-8') as f:
            dataset.append((negative_label, f.read()))
    shuffle(dataset)
    return dataset
def tokenize_and_vectorize(dataset):
    """Tokenize each review and map every token to its word2vec vector."""
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # token not in the (limited) word2vec vocabulary, skip it
        vectorized_data.append(sample_vecs)
    return vectorized_data
# Target labels
def collect_expected(dataset):
    """Extract the label from each (label, text) sample."""
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected
dataset = pre_process_data('E:/Pythonproject/NLP实战/aclImdb/train')
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

# Train/test split (80/20)
split_point = int(len(vectorized_data) * 0.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]
# Hyperparameters
maxlen = 40            # truncate/pad every review to 40 tokens
batch_size = 2
embedding_dims = 300   # dimensionality of the word2vec vectors
filters = 250
kernel_size = 3
hidden_size = 250
epoch = 20
# Pad or truncate every sample to exactly maxlen token vectors (zero vectors as padding)
x_train = [smp[:maxlen] + [[0.] * embedding_dims] * (maxlen - len(smp)) for smp in x_train]
x_test = [smp[:maxlen] + [[0.] * embedding_dims] * (maxlen - len(smp)) for smp in x_test]
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
# Build the 1D CNN model
model = Sequential()
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1,
                 input_shape=(maxlen, embedding_dims)))
model.add(GlobalAveragePooling1D())
model.add(Dense(hidden_size))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# TensorBoard callback for training visualization
tensorboard = TensorBoard(log_dir='log', update_freq='batch')

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch,
          validation_data=(x_test, y_test), verbose=1, callbacks=[tensorboard])
# Save the model structure and weights separately
model_structure = model.to_json()
with open('cnn_model.json', 'w') as json_file:
    json_file.write(model_structure)
model.save_weights('cnn_models.h5')
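As a rough illustration of how the saved files could be reused, the sketch below reloads the JSON structure and weights and scores one new review with the same preprocessing defined above; the sample_text string is a made-up example and not part of the original post.

# A minimal sketch (assumption: word_vectors, maxlen, embedding_dims and
# tokenize_and_vectorize from above are still in scope; sample_text is hypothetical)
from keras.models import model_from_json

with open('cnn_model.json', 'r') as json_file:
    loaded_model = model_from_json(json_file.read())
loaded_model.load_weights('cnn_models.h5')

sample_text = "I hated this movie, a complete waste of two hours."
vec = tokenize_and_vectorize([(0, sample_text)])  # dummy label 0, only the text is used
vec = [smp[:maxlen] + [[0.] * embedding_dims] * (maxlen - len(smp)) for smp in vec]
vec = np.reshape(vec, (len(vec), maxlen, embedding_dims))
print(loaded_model.predict(vec))  # predicted probability that the review is positive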
Implementing IMDB Sentiment Analysis with a CNN
This code implements a sentiment-analysis pipeline: it reads the IMDb dataset, tokenizes and vectorizes the reviews, then builds and trains a convolutional neural network. The model consists of a convolutional layer, a global average pooling layer, a fully connected layer with Dropout, and a final sigmoid activation that outputs the prediction probability. Training uses a TensorBoard callback for visualization, and the model structure and weights are saved at the end.