一、说明
本代码使用keras对文本文档进行处理,主要包括
1.使用urllib下载数据集
2.使用tarfile解压数据集
3.使用re书写正则表达式,替换文本中的格式符
4.使用Tokenizer去建立字典
5.使用sequence去变换文本长度,短的补0,长的截取
6.使用Embedding层将数字列表转换为向量列表
二、特别注意
1.Keras的SimpleRNN层内部使用的RNN结构暂时不清楚
2.keras嵌入层从数字列表转化为向量列表中间过程暂时不清楚
三、源码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import urllib.request
import tarfile
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
# regular expression
import re
def rm_tags(text):
    """Return *text* with every HTML tag (e.g. ``<br />``) removed."""
    return re.sub(r'<[^>]+>', '', text)
def read_files(filetype, path="./aclImdb/"):
    """Read one split of the extracted aclImdb dataset.

    Parameters
    ----------
    filetype : str
        Split to read, e.g. ``"train"`` or ``"test"``.
    path : str, optional
        Root directory of the extracted dataset (default ``"./aclImdb/"``,
        matching the original hard-coded location).

    Returns
    -------
    tuple[list[int], list[str]]
        ``(all_labels, all_texts)`` — labels (1 = positive, 0 = negative)
        and the tag-stripped first line of each review, positives first.
    """
    positive_path = os.path.join(path, filetype, "pos")
    negative_path = os.path.join(path, filetype, "neg")
    positive_files = [os.path.join(positive_path, f) for f in os.listdir(positive_path)]
    negative_files = [os.path.join(negative_path, f) for f in os.listdir(negative_path)]
    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))
    # Bug fix: the original hard-coded [1]*12500 + [0]*12500, which silently
    # mislabels data whenever the directories do not contain exactly 12500
    # files each; derive the label counts from what was actually found.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            # Each IMDB review is stored on a single line; strip HTML tags.
            # readline() also tolerates an empty file (returns ""), where
            # the original readlines()[0] raised IndexError.
            all_texts.append(rm_tags(file_input.readline()))
    return all_labels, all_texts
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
result=urllib.request.urlretrieve(url,filepath)
print('downloaded:',result)
if not os.path.exists("aclImdb"):
tfile = tarfile.open("aclImdb_v1.tar.gz", 'r:gz')
result=tfile.extractall('.')
y_train, x_train = read_files('train')
y_test, x_test = read_files('test')
token = Tokenizer(num_words=2000)
token.fit_on_texts(x_train)
x_train_seq = token.texts_to_sequences(x_train)
x_test_seq = token.texts_to_sequences(x_test)
x_train_v = sequence.pad_sequences(x_train_seq,maxlen=100)
x_test_v = sequence.pad_sequences(x_test_seq,maxlen=100)
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, SimpleRNN

# ---- Build the model: Embedding -> Flatten -> Dense classifier ----
model = Sequential()
# Map each of the 2000 vocabulary indices to a 32-dim vector; the input is
# a fixed-length sequence of 100 token ids (see pad_sequences above).
model.add(Embedding(input_dim=2000, output_dim=32, input_length=100))
model.add(Flatten())
# model.add(SimpleRNN(units=32))  # recurrent alternative, left disabled
model.add(Dense(units=256, activation='relu'))
# Single sigmoid unit for binary (positive/negative) sentiment output.
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
train_his = model.fit(x_train_v, y_train, batch_size=128, epochs=10, verbose=2, validation_split=0.1)
scores = model.evaluate(x_test_v, y_test, verbose=1)
# Bug fix: the original ended with the bare expression `scores[1]`, which
# only displays a value in a REPL/notebook and is a no-op when the file is
# run as a script; print it so the test accuracy is always visible.
print('test accuracy:', scores[1])