# 1. 流程图 (Flowchart)
# 2. 数据准备 (Data preparation)
from keras. datasets import imdb
from keras. preprocessing import sequence
from keras. preprocessing. text import Tokenizer
import re
# Pre-compiled pattern for one markup tag: "<", one or more characters that
# are not ">", then ">".
re_tag = re.compile(r'<[^>]+>')


def rm_tags(text):
    """Return *text* with every substring matched by ``re_tag`` removed."""
    stripped = re_tag.sub('', text)
    return stripped
import os
def read_files(filetype, path="data/aclImdb/"):
    """Load the review files of one dataset split from disk.

    Args:
        filetype: Name of the split sub-directory under *path*; its
            ``pos`` and ``neg`` sub-directories are read.
        path: Root directory of the dataset. Defaults to the location the
            script originally hard-coded.

    Returns:
        A ``(all_labels, all_texts)`` tuple of two equal-length lists.
        Labels are 1 for each file found under ``pos`` and 0 for each file
        found under ``neg``; texts are the tag-stripped file contents in the
        same order (all positives first, then all negatives).
    """
    positive_dir = os.path.join(path, filetype, "pos")
    negative_dir = os.path.join(path, filetype, "neg")

    positive_files = [os.path.join(positive_dir, f)
                      for f in os.listdir(positive_dir)]
    negative_files = [os.path.join(negative_dir, f)
                      for f in os.listdir(negative_dir)]
    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))

    # Derive the labels from the files actually found instead of assuming
    # exactly 12500 per class, so labels and texts always line up.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            # Lines are joined with a single space before tag removal.
            all_texts.append(rm_tags(" ".join(file_input.readlines())))
    return all_labels, all_texts
# Load both splits; each call returns (labels, texts). These names are used
# by the tokenizer below and by the training/evaluation code further down.
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

# Fit the vocabulary on the training text only (capped via num_words=3800).
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

# Convert every review to a sequence of word indices.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

# Bring every sequence to a fixed length of 380 for the model input.
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)
# 3. 建立模型 (Build the model)
from keras. models import Sequential
from keras. layers. core import Dense, Dropout, Activation
from keras. layers. embeddings import Embedding
from keras. layers. recurrent import SimpleRNN
# Layers in order: embedding -> dropout -> simple RNN -> dense hidden layer
# -> dropout -> single sigmoid output unit.
# input_dim=3800 and input_length=380 mirror the tokenizer's num_words and
# the padding maxlen used when the data was prepared.
model = Sequential([
    Embedding(output_dim=32, input_dim=3800, input_length=380),
    Dropout(0.35),
    SimpleRNN(units=16),
    Dense(units=256, activation='relu'),
    Dropout(0.35),
    Dense(units=1, activation='sigmoid'),
])
model.summary()
# 4. 模型训练 (Train the model)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# validation_split holds out the LAST 20% of the arrays as passed in, before
# any shuffling. The labels produced by read_files are all 1s followed by all
# 0s, so without reordering the validation set would contain only negative
# reviews. Shuffle features and labels with one shared permutation first;
# x_train and y_train themselves are left untouched.
import numpy as np
shuffled_order = np.random.permutation(len(x_train))
train_history = model.fit(x_train[shuffled_order],
                          np.asarray(y_train)[shuffled_order],
                          batch_size=100,
                          epochs=10, verbose=1,
                          validation_split=0.2)
import matplotlib. pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot two recorded metric curves from a training run on one chart.

    Args:
        train_history: Object whose ``history`` mapping holds the recorded
            values for each metric key.
        train: Key of the first curve; also used as the y-axis label.
        validation: Key of the second curve.
    """
    curves = train_history.history
    for metric_key in (train, validation):
        plt.plot(curves[metric_key])
    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    # Legend entries follow the plotting order above.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# Older Keras releases record the accuracy metric under 'acc', newer ones
# under 'accuracy'; use whichever key this training run actually produced.
accuracy_key = 'acc' if 'acc' in train_history.history else 'accuracy'
show_train_history(train_history, accuracy_key, 'val_' + accuracy_key)
show_train_history(train_history, 'loss', 'val_loss')
# 5. 模型准确率 (Model accuracy)
# With metrics=['accuracy'] at compile time, evaluate returns
# [loss, accuracy]; index 1 is the test-set accuracy.
scores = model.evaluate(x_test, y_test, verbose=1)
# A bare `scores[1]` expression is silently discarded outside a notebook,
# so print the value explicitly.
print('test accuracy:', scores[1])
# 6. 模型预测 (Model prediction)
# Sigmoid output of the model for every test review.
probility = model.predict(x_test)

# Threshold at 0.5 to obtain 0/1 class labels. This is the rule
# Sequential.predict_classes applied for a single sigmoid output; that
# method has been removed from current Keras, and reusing `probility` also
# avoids a second forward pass.
predict = (probility > 0.5).astype('int32')

# The model ends in Dense(units=1), so `predict` has one column per sample;
# flatten it so each element is a scalar that can be used as a dict key.
predict_classes = predict.reshape(-1)

# Maps a 0/1 label to the text shown to the user.
SentimentDict = {1: '正面的', 0: '负面的'}


def display_test_Sentiment(i):
    """Print test review *i* followed by its true and predicted sentiment."""
    print(test_text[i])
    print('label真实值:', SentimentDict[y_test[i]],
          '预测结果:', SentimentDict[int(predict_classes[i])])


display_test_Sentiment(2)
# 7. 保存模型 (Save the model)
# open() raises FileNotFoundError when the target directory is missing, so
# create it up front (no error if it already exists).
os.makedirs("SaveModel", exist_ok=True)

# The architecture goes to a JSON file, the weights to a separate HDF5 file.
model_json = model.to_json()
with open("SaveModel/Imdb_RNN_model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("SaveModel/Imdb_RNN_model.h5")
print("Saved model to disk")