电影评论情感分析 keras

最新推荐文章于 2024-05-08 07:48:34 发布

·清尘·

最新推荐文章于 2024-05-08 07:48:34 发布

阅读量3.8k

点赞数

本文链接：https://blog.csdn.net/u012969412/article/details/71081690

版权

代码中使用的 rt-polarity.pos 文件存储了约 5000 句英文电影好评，rt-polarity.neg 文件存储了约 5000 句英文电影差评（每行一句）。

#-*- coding:utf-8 -*-
import re
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding, LSTM
from keras.utils import np_utils
from keras.datasets import mnist

from tensorflow.contrib import learn
import tensorflow as tf
from sklearn.cross_validation import train_test_split

# Command-line flags naming the two input files: one holds the positive
# reviews, the other the negative reviews.
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polarity.neg", "Data source for the negative data.")
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() is a private method; presumably it forces the
# flags to be parsed at import time -- confirm it exists in the installed
# TensorFlow version.
FLAGS._parse_flags()


def clean_str(string):
	"""Normalize a raw review sentence into lower-case, space-separated tokens.

	Processing order:
	  1. Every character other than ASCII letters, digits and ( ) , ! ? ' `
	     is replaced by a space.
	  2. The contraction suffixes 's, 've, n't, 're, 'd and 'll are split off
	     with a leading space ("isn't" becomes "is n't").
	  3. Each of , ! ( ) ? is padded with one space on both sides.
	  4. Runs of two or more whitespace characters collapse to one space,
	     then the result is stripped and lower-cased.

	Args:
		string: the text to clean.

	Returns:
		The cleaned, lower-cased string.
	"""
	# Anything outside the kept alphabet (including tabs and newlines)
	# becomes a plain space.
	string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

	# The suffixes are literal text, so str.replace is enough; they are
	# applied one after another in this fixed order.
	for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
		string = string.replace(suffix, " " + suffix)

	# Pad punctuation with spaces. The replacement deliberately contains no
	# backslash before the punctuation mark: an escaped parenthesis or
	# question mark in a re.sub replacement is copied through verbatim and
	# would plant a stray backslash in front of the token.
	string = re.sub(r"([,!()?])", r" \1 ", string)

	string = re.sub(r"\s{2,}", " ", string)
	return string.strip().lower()


def load_data(positive_data_file, negative_data_file):
	"""Load the review sentences from both files and build one-hot labels.

	Each file is read as one example per line; every line is stripped and
	then passed through clean_str.

	Args:
		positive_data_file: path of the file holding the positive reviews.
		negative_data_file: path of the file holding the negative reviews.

	Returns:
		A pair (x_text, y). x_text is the list of cleaned sentences, all
		positive ones first, then all negative ones. y is an integer numpy
		array with one row per sentence: [0, 1] for a positive sentence and
		[1, 0] for a negative one.
	"""
	def _read_lines(path):
		# The with-block closes the handle even if reading fails; the
		# original left both files open.
		with open(path, "r") as handle:
			return [line.strip() for line in handle]

	positive_examples = _read_lines(positive_data_file)
	negative_examples = _read_lines(negative_data_file)

	x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

	# Build the labels as one list and force an (n, 2) shape so that an
	# empty input file yields zero rows instead of a concatenate error.
	labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
	y = np.array(labels, dtype=int).reshape(-1, 2)
	return x_text, y


def deal_date():
	"""Convert the corpus into numeric training data plus one-hot targets.

	Loads the sentences named by the command-line flags, encodes them with
	a VocabularyProcessor fitted on those same sentences, and shuffles the
	encoded rows and the labels with one shared permutation.

	Returns:
		A triple (x_shuffled, y_shuffled, vocab_processor): the shuffled
		encoded sentences, the labels in the same order, and the fitted
		vocabulary processor.
	"""
	sentences, labels = load_data(FLAGS.positive_data_file, FLAGS.negative_data_file)

	# The longest sentence, counted in single-space-separated pieces, is
	# handed to the vocabulary processor as its maximum document length.
	longest = max(len(sentence.split(" ")) for sentence in sentences)
	vocab_processor = learn.preprocessing.VocabularyProcessor(longest)
	encoded = np.array(list(vocab_processor.fit_transform(sentences)))

	# Seed the global numpy generator so the shuffle order is repeatable.
	np.random.seed(10)
	order = np.random.permutation(np.arange(len(labels)))
	return encoded[order], labels[order], vocab_processor


def model_1():
	"""Model 1: train a fully-connected deep network on the reviews.

	Architecture: Embedding (10 dimensions per token) -> Flatten ->
	Dense 128 -> Dense 64 -> Dense 16 (each relu, each followed by
	Dropout 0.5) -> Dense 2 with softmax.

	The data from deal_date is split 80/20; the model is fitted on the
	training part for 40 epochs with batch size 64, and the first ten
	predictions are printed next to the first ten held-out labels.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=0)

	sequence_length = train_x.shape[1]

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10, input_length=sequence_length))
	model.add(Flatten())
	model.add(Dense(128, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(64, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=40, verbose=1, shuffle=True)

	pre_y = model.predict(test_x)
	# One pre-formatted argument prints the same text under the Python 2
	# print statement and the Python 3 print function.
	print("%s %s" % (pre_y[:10], test_y[:10]))

def model_2():
	"""Model 2: one convolutional layer followed by two fully-connected layers.

	Architecture: Embedding (10 dimensions per token) -> Reshape to
	(sequence_length, 10, 1) -> Conv2D with 32 filters of size 1x1 (relu) ->
	MaxPooling2D 1x1 -> Dropout 0.5 -> Flatten -> Dense 16 (relu) ->
	Dropout 0.5 -> Dense 2 with softmax.

	The data from deal_date is split 80/20; the model is fitted for 30
	epochs with batch size 64 using rmsprop, and the result of evaluating
	it on the held-out part is printed.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=0)

	sequence_length = train_x.shape[1]

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10, input_length=sequence_length))
	# Convolutional stage: add a trailing channel axis so the embedded
	# sentence can be fed to a 2-D convolution.
	model.add(Reshape((sequence_length, 10, 1), input_shape=(sequence_length, 10)))
	model.add(Conv2D(32, (1, 1), activation='relu', input_shape=(sequence_length, 10, 1)))
	model.add(MaxPooling2D(pool_size=(1, 1)))
	model.add(Dropout(0.5))
	# Fully-connected stage: flatten to one dimension for the dense layers.
	model.add(Flatten())
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=30, verbose=1)
	score = model.evaluate(test_x, test_y, verbose=1)
	# Parenthesised single argument: valid and identical in output on both
	# Python 2 and Python 3.
	print(score)

def model_3():
	"""Model 3: an LSTM-based classifier.

	Architecture: Embedding (10 dimensions per token) -> Reshape to
	(1, sequence_length * 10) -> LSTM 64 (relu, dropout 0.5) ->
	Dense 16 (relu) -> Dropout 0.5 -> Dense 2 with softmax.

	The data from deal_date is split 80/20; the model is fitted for 20
	epochs with batch size 64 using adam, and the result of evaluating it
	on the held-out part is printed.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=0)

	sequence_length = train_x.shape[1]
	flat_width = sequence_length * 10

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10, input_length=sequence_length))
	# NOTE(review): this reshape folds the whole embedded sentence into a
	# single time step, so the LSTM below sees a sequence of length 1.
	# Kept as written; confirm this is the intended design.
	model.add(Reshape((1, flat_width), input_shape=(sequence_length, 10)))
	model.add(LSTM(64, activation='relu', dropout=0.5, input_shape=(1, flat_width)))
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=20, verbose=1)
	score = model.evaluate(test_x, test_y, verbose=1)
	# Parenthesised single argument: valid and identical in output on both
	# Python 2 and Python 3.
	print(score)

if __name__ == '__main__':
	# Script entry point: run the LSTM variant.
	model_3()

·清尘·

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
电影评论情感分析 keras

#-*- coding:utf-8 -*-
import re
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Conv…
复制链接

扫一扫