IMDB情感分析例子(keras)

加载IMDB数据集

X_train[0]=[1,14,22,.....32]    长度为228

X_train=sequence.pad_sequences(X_train,maxlen=500) 

X_train[0]变为[0,0,0,...,1,14,22,...,32]   长度为500

 

import numpy
from keras.datasets import imdb
from matplotlib import pyplot
from keras.preprocessing import sequence

# Explore the IMDB dataset: print shapes, classes, vocabulary size and the
# distribution of review lengths, then show how pad_sequences changes a sample.
(X_train, y_train), (X_test, y_test) = imdb.load_data()
print("Train data:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_train[0])
print("first length:")
print(len(X_train[0]))
print("classes:")
print(numpy.unique(y_train))
print("number of words:")
print(len(numpy.unique(numpy.hstack(X_train))))
print("review length:")
# Build a real list of per-review lengths. Under Python 3 `map` returns a
# one-shot iterator, which numpy.mean/numpy.std and the pyplot calls below
# cannot consume. It must also be computed BEFORE padding, because padding
# makes every review the same length.
result = [len(review) for review in X_train]


# sequence.pad_sequences
# Pad each review to length 500 by prepending zeros (e.g. a 228-word review
# gets zeros added at the front).
X_train = sequence.pad_sequences(X_train, maxlen=500)
print(X_train[0])
print(len(X_train[0]))
print("mean %.2f words(%f)" % (numpy.mean(result), numpy.std(result)))
# Left: box plot of review lengths; right: histogram of review lengths.
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()

 

Word Embeddings

 

imdb.load_data(nb_words=5000,test_split=0.33)

X_train=sequence.pad_sequences(X_train,maxlen=500)

X_test=sequence.pad_sequences(X_test,maxlen=500)

model.add(Embedding(5000,32,input_length=500))

5000词汇量,每个句子500长度,每个词用32维向量表示

 

普通神经网络

 

import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Train a simple fully-connected network on a 10% subset of IMDB.
# Fix the NumPy random seed for reproducibility.
seed = 7
numpy.random.seed(seed)
# Keep only the `top_words` most frequent words in the vocabulary.
top_words = 5000
test_split = 0.33  # NOTE: not passed to load_data in this script
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Use only the first tenth of each split to keep the run short. Floor division
# is required: `/` yields a float under Python 3, which is not a valid slice
# bound.
num_lizi = X_train.shape[0] // 10
num_lizi2 = X_test.shape[0] // 10
X_train = X_train[0:num_lizi]
y_train = y_train[0:num_lizi]
X_test = X_test[0:num_lizi2]
y_test = y_test[0:num_lizi2]

# Pad/truncate every review to exactly `max_words` tokens.
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
model = Sequential()
# Each of the 500 word indices is mapped to a 32-dimensional vector.
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# Sigmoid output: binary_crossentropy expects a probability in [0, 1].
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=1)
scores = model.evaluate(X_test, y_test, verbose=0)
# scores[0] is the loss, scores[1] is the accuracy metric requested in compile().
print("Accuracy: %.2f%%" % (scores[1] * 100))

 

一维CNN处理IMDB问题

 

# CNN for the IMDB problem
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Train a one-dimensional CNN on the full IMDB dataset.
# Fix the NumPy random seed for reproducibility.
seed = 7
numpy.random.seed(seed)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
test_split = 0.33  # NOTE: not passed to load_data in this script
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# pad dataset to a maximum review length in words
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
model = Sequential()
# Each of the 500 word indices is mapped to a 32-dimensional vector.
model.add(Embedding(top_words, 32, input_length=max_words))
# Keras 2 keyword names (filters / kernel_size / padding / pool_size / epochs)
# are used throughout, consistent with the `num_words` argument above; the
# Keras 1 spellings (nb_filter, filter_length, border_mode, pool_length,
# nb_epoch) are deprecated.
model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# Sigmoid output: binary_crossentropy expects a probability in [0, 1].
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=1)
scores = model.evaluate(X_test, y_test, verbose=0)
# scores[0] is the loss, scores[1] is the accuracy metric requested in compile().
print("Accuracy: %.2f%%" % (scores[1]*100))

 

 

 

 

 

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值