代码中 rt-polarity.pos 里面存储的是 5000 条英文电影好评,rt-polarity.neg 里面存储的是 5000 条英文电影差评。
#-*- coding:utf-8 -*-
import re
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding, LSTM
from keras.utils import np_utils
from keras.datasets import mnist
from tensorflow.contrib import learn
import tensorflow as tf
from sklearn.cross_validation import train_test_split
# Command-line flags naming the two input corpora. Each DEFINE_string call is
# given the flag name, its default value and a help string, in that order.
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polarity.neg", "Data source for the negative data.")
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags is an underscore-prefixed (private) method;
# confirm it exists in the TensorFlow version this script is run with.
FLAGS._parse_flags()
def clean_str(string):
    """Normalize a raw sentence into lower-case, space-separated tokens.

    Every character that is not a letter, digit, parenthesis, comma,
    exclamation mark, question mark, apostrophe or backtick is replaced by a
    space. The contraction suffixes 's, 've, n't, 're, 'd and 'll are split
    off from the preceding word, and the marks , ! ( ) ? are padded with
    spaces so that they become tokens of their own.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned text: runs of whitespace collapsed to one space, leading
        and trailing whitespace removed, everything lower-cased.
    """
    # The rules are applied one after another, in exactly this order.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        # Replacement text is literal, not a pattern: escaping the bracket or
        # question mark here would emit a stray backslash into the output.
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        # Collapse the padding introduced above.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
def _read_stripped_lines(path):
    """Return the lines of the text file at *path*, each stripped of surrounding whitespace."""
    # The with-block guarantees the file handle is closed once reading is done.
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


def load_data(positive_data_file, negative_data_file):
    """Load the example sentences and build their one-hot labels.

    Args:
        positive_data_file: Path of the file holding one positive example
            per line.
        negative_data_file: Path of the file holding one negative example
            per line.

    Returns:
        A tuple ``(x_text, y)``. ``x_text`` is the list of sentences, positive
        ones first, each passed through ``clean_str``. ``y`` is the NumPy
        array of labels in the same order: ``[0, 1]`` for a positive example
        and ``[1, 0]`` for a negative one.
    """
    positive_examples = _read_stripped_lines(positive_data_file)
    negative_examples = _read_stripped_lines(negative_data_file)

    # Positive sentences come first so that they line up with the labels below.
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return x_text, y
def deal_date():
    """Convert the review sentences into numeric training data and one-hot targets.

    The sentences and labels come from ``load_data`` using the paths stored in
    ``FLAGS.positive_data_file`` and ``FLAGS.negative_data_file``. A
    ``VocabularyProcessor`` sized to the longest sentence (counted in
    space-separated pieces) is fitted on them, and its output is collected
    into one array. Samples and labels are then reordered with one shared
    random permutation, drawn after seeding NumPy's global generator with 10.

    Returns:
        A tuple ``(x_shuffled, y_shuffled, vocab_processor)``.
    """
    sentences, labels = load_data(FLAGS.positive_data_file, FLAGS.negative_data_file)

    longest = max(len(sentence.split(" ")) for sentence in sentences)
    vocab_processor = learn.preprocessing.VocabularyProcessor(longest)
    encoded = np.array(list(vocab_processor.fit_transform(sentences)))

    # Fixed seed so that the shuffle order is the same on every run.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(labels)))
    return encoded[order], labels[order], vocab_processor
def model_1():
    """Model 1: an embedding followed by a stack of fully connected layers.

    The data from ``deal_date`` is split 80/20 into train and test parts. The
    network is Embedding (10-dimensional) -> Flatten -> three ReLU Dense
    layers of 128, 64 and 16 units, each followed by Dropout(0.5) -> a 2-unit
    softmax layer. It is trained for 40 epochs with Adam, then the first ten
    test predictions are printed next to the first ten test targets.
    """
    features, targets, vocab_processor = deal_date()
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    vocab_size = len(vocab_processor.vocabulary_)
    seq_len = train_x.shape[1]

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=10, input_length=seq_len))
    model.add(Flatten())
    # Hidden stack: every Dense layer is followed by the same dropout rate.
    for units in (128, 64, 16):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=64, epochs=40, verbose=1, shuffle=True)

    # No evaluate() call here; a sample of predictions is printed instead.
    pre_y = model.predict(test_x)
    print("%s %s" % (pre_y[:10], test_y[:10]))
def model_2():
    """Model 2: one convolution layer followed by two fully connected layers.

    The data from ``deal_date`` is split 80/20 into train and test parts. The
    embedded sentence is reshaped to carry a trailing channel axis of size 1,
    passed through a 32-filter 1x1 Conv2D, a 1x1 max-pool and dropout, then
    flattened into a 16-unit ReLU Dense layer and a 2-unit softmax layer. It
    is trained for 30 epochs with RMSprop, and the result of
    ``model.evaluate`` on the test part is printed.
    """
    features, targets, vocab_processor = deal_date()
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    vocab_size = len(vocab_processor.vocabulary_)
    seq_len = train_x.shape[1]
    embed_dim = 10

    layers = [
        Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=seq_len),
        # Convolution part: add a channel axis, then convolve and pool.
        Reshape((seq_len, embed_dim, 1), input_shape=(seq_len, embed_dim)),
        Conv2D(32, (1, 1), activation='relu', input_shape=(seq_len, embed_dim, 1)),
        MaxPooling2D(pool_size=(1, 1)),
        Dropout(0.5),
        # Fully connected part: flatten to one axis so Dense can consume it.
        Flatten(),
        Dense(16, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=64, epochs=30, verbose=1)

    score = model.evaluate(test_x, test_y, verbose=1)
    print(score)
def model_3():
    """Model 3: an LSTM-based classifier.

    The data from ``deal_date`` is split 80/20 into train and test parts. The
    embedded sentence is reshaped to ``(1, sequence_length * 10)`` and fed to
    a 64-unit LSTM (ReLU activation, dropout 0.5), followed by a 16-unit ReLU
    Dense layer, dropout and a 2-unit softmax layer. It is trained for 20
    epochs with Adam, and the result of ``model.evaluate`` on the test part is
    printed.
    """
    features, targets, vocab_processor = deal_date()
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    vocab_size = len(vocab_processor.vocabulary_)
    seq_len = train_x.shape[1]
    flat_width = seq_len * 10

    layers = [
        Embedding(input_dim=vocab_size, output_dim=10, input_length=seq_len),
        # NOTE(review): this reshape hands the LSTM a single timestep holding
        # the whole sentence - confirm that is the intended input layout.
        Reshape((1, flat_width), input_shape=(seq_len, 10)),
        LSTM(64, activation='relu', dropout=0.5, input_shape=(1, flat_width)),
        Dense(16, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=64, epochs=20, verbose=1)

    score = model.evaluate(test_x, test_y, verbose=1)
    print(score)
if __name__ == "__main__":
    # Script entry point: only the LSTM variant is run.
    model_3()