电影评论情感分析 keras

代码中 rt-polarity.pos 里面存储的是 5000 条英文电影好评,rt-polarity.neg 里面存储的是 5000 条英文电影差评。


#-*- coding:utf-8 -*-
import re
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding, LSTM
from keras.utils import np_utils
from keras.datasets import mnist

from tensorflow.contrib import learn
import tensorflow as tf
from sklearn.cross_validation import train_test_split

# Command-line flags holding the paths of the positive and negative review files.
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polarity.neg", "Data source for the negative data.")
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() is an underscore-prefixed (private) method; presumably it
# forces the flags to be parsed at import time -- confirm it exists in the installed TensorFlow.
FLAGS._parse_flags()


def clean_str(string):
	"""Normalize a raw review sentence into space-separated lowercase tokens.

	Characters outside letters, digits and ``( ) , ! ? ' ``` are replaced by
	spaces, English contraction suffixes are split off the preceding word,
	and the punctuation marks ``, ! ( ) ?`` are padded with spaces so that
	each becomes its own token.

	Args:
		string: the raw sentence.

	Returns:
		The cleaned sentence: lowercase, stripped, with runs of whitespace
		collapsed to a single space.
	"""
	# Keep only the whitelisted characters; everything else becomes a space.
	string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
	# Split contraction suffixes off the word they are attached to. The order
	# is significant because each pass works on the result of the previous one.
	for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
		string = string.replace(suffix, " " + suffix)
	# Surround punctuation with spaces. Plain string replacement is used so
	# that no regex-escape backslash leaks into the emitted tokens.
	for mark in (",", "!", "(", ")", "?"):
		string = string.replace(mark, " " + mark + " ")
	string = re.sub(r"\s{2,}", " ", string)
	return string.strip().lower()


def load_data(positive_data_file, negative_data_file):
	"""Load the review sentences from both files and build one-hot labels.

	Args:
		positive_data_file: path of the file with one positive review per line.
		negative_data_file: path of the file with one negative review per line.

	Returns:
		A tuple ``(x_text, y)``. ``x_text`` is the list of cleaned sentences,
		positive ones first. ``y`` is the concatenated label array with one
		row per sentence: ``[0, 1]`` for positive, ``[1, 0]`` for negative.
	"""
	# Context managers guarantee both files are closed, even if reading fails.
	with open(positive_data_file, "r") as pos_file:
		positive_examples = [line.strip() for line in pos_file]
	with open(negative_data_file, "r") as neg_file:
		negative_examples = [line.strip() for line in neg_file]
	# Clean every sentence, keeping positives ahead of negatives.
	x_text = [clean_str(sent) for sent in positive_examples + negative_examples]
	# Generate labels in the same order as the sentences.
	positive_labels = [[0, 1] for _ in positive_examples]
	negative_labels = [[1, 0] for _ in negative_examples]
	y = np.concatenate([positive_labels, negative_labels], 0)
	return x_text, y


def deal_date():
	"""Convert the reviews into shuffled numeric training data and one-hot targets.

	Returns:
		A tuple ``(x_shuffled, y_shuffled, vocab_processor)``: the encoded
		sentences and their labels, both reordered by the same seeded
		permutation, plus the fitted vocabulary processor.
	"""
	sentences, labels = load_data(FLAGS.positive_data_file, FLAGS.negative_data_file)
	# The largest space-separated token count is handed to the vocabulary
	# processor as its document length.
	longest = max(len(sentence.split(" ")) for sentence in sentences)
	vocab_processor = learn.preprocessing.VocabularyProcessor(longest)
	encoded = np.array(list(vocab_processor.fit_transform(sentences)))
	# Seed the global NumPy generator so the shuffle order is repeatable.
	np.random.seed(10)
	order = np.random.permutation(np.arange(len(labels)))
	return encoded[order], labels[order], vocab_processor


def model_1():
	"""Model 1: an embedding followed by a stack of fully connected layers.

	Loads the shuffled data through deal_date(), holds out 20% of it as a
	test split, trains for 40 epochs, and prints the first ten predictions
	next to the first ten true labels of the test split.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(
		x_shuffled, y_shuffled, test_size=0.2, random_state=0)

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10,
		input_length=train_x.shape[1]))
	model.add(Flatten())
	model.add(Dense(128, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(64, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=40, verbose=1, shuffle=True)
	pre_y = model.predict(test_x)
	# Format into a single string so the call is valid syntax, and prints the
	# same text, under both Python 2 and Python 3.
	print("%s %s" % (pre_y[:10], test_y[:10]))

def model_2():
	"""Model 2: one convolution layer followed by two fully connected layers.

	Loads the shuffled data through deal_date(), holds out 20% of it as a
	test split, trains for 30 epochs, and prints the result of
	model.evaluate() on the test split.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(
		x_shuffled, y_shuffled, test_size=0.2, random_state=0)
	seq_len = train_x.shape[1]

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10,
		input_length=seq_len))
	# One convolution layer: add a trailing channel axis so the embedded
	# sentence can be fed to Conv2D.
	model.add(Reshape((seq_len, 10, 1), input_shape=(seq_len, 10)))
	model.add(Conv2D(32, (1, 1), activation='relu', input_shape=(seq_len, 10, 1)))
	# NOTE(review): the kernel and the pool window are both (1, 1) -- confirm
	# this is intended rather than a wider window.
	model.add(MaxPooling2D(pool_size=(1, 1)))
	model.add(Dropout(0.5))
	# Two fully connected layers; Flatten squashes the feature maps to 1-D first.
	model.add(Flatten())
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=30, verbose=1)
	score = model.evaluate(test_x, test_y, verbose=1)
	# Single-argument call form is valid, and prints the same text, under both
	# Python 2 and Python 3.
	print(score)

def model_3():
	"""Model 3: an embedding fed through an LSTM and two fully connected layers.

	Loads the shuffled data through deal_date(), holds out 20% of it as a
	test split, trains for 20 epochs, and prints the result of
	model.evaluate() on the test split.
	"""
	x_shuffled, y_shuffled, vocab_processor = deal_date()
	train_x, test_x, train_y, test_y = train_test_split(
		x_shuffled, y_shuffled, test_size=0.2, random_state=0)
	seq_len = train_x.shape[1]

	model = Sequential()
	model.add(Embedding(input_dim=len(vocab_processor.vocabulary_), output_dim=10,
		input_length=seq_len))
	# NOTE(review): this Reshape folds the whole embedded sentence into a
	# single timestep, so the LSTM below runs over a sequence of length 1.
	# Confirm this is intended rather than feeding (seq_len, 10) directly.
	model.add(Reshape((1, seq_len * 10), input_shape=(seq_len, 10)))
	model.add(LSTM(64, activation='relu', dropout=0.5, input_shape=(1, seq_len * 10)))
	model.add(Dense(16, activation='relu'))
	model.add(Dropout(0.5))
	model.add(Dense(2, activation='softmax'))

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	model.fit(train_x, train_y, batch_size=64, epochs=20, verbose=1)
	score = model.evaluate(test_x, test_y, verbose=1)
	# Single-argument call form is valid, and prints the same text, under both
	# Python 2 and Python 3.
	print(score)

if __name__ == "__main__":
	# Script entry point: train and evaluate the LSTM variant.
	model_3()





  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值