# IMDB sentiment analysis. Three models are combined in this one file, so take care to tell them apart.
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
import urllib.request
import os
import tarfile
from tensorflow.keras.datasets import imdb
# Load the IMDB dataset that ships with Keras into train/test pairs.
# NOTE(review): y_train, y_test and x_train are rebound further down from the
# raw aclImdb text files, so this load may be redundant - confirm before removing.
(x_train, y_train), (x_test, y_test) = imdb.load_data()
import re
# Compiled once at import time instead of on every call.
# Matches "<", then one or more characters that are not ">", then ">".
_TAG_RE = re.compile(r'<[^>]+>')


def rm_tags(text: str) -> str:
    """Return *text* with every HTML-style tag removed.

    A tag is any ``<...>`` span with at least one character between the
    angle brackets (for example ``<br />`` or ``</b>``). Text outside tags is
    left untouched, and an empty ``<>`` is not treated as a tag.

    Args:
        text: Raw text that may contain markup.

    Returns:
        The input with each matched tag replaced by the empty string.
    """
    return _TAG_RE.sub('', text)
import os
def read_files(filetype, path="data/aclImdb/"):
    """Load the labelled reviews of one dataset split from disk.

    Every file in ``<path>/<filetype>/pos`` is read first, followed by every
    file in ``<path>/<filetype>/neg``. Each file's content is passed through
    ``rm_tags`` before being returned.

    Args:
        filetype: Name of the split sub-directory, e.g. ``"train"`` or
            ``"test"``.
        path: Root directory that contains the split sub-directories.
            Defaults to the location the original script hard-coded.

    Returns:
        A ``(all_labels, all_texts)`` tuple of two equally long lists.
        ``all_labels[i]`` is ``1`` when ``all_texts[i]`` came from the
        ``pos`` directory and ``0`` when it came from ``neg``.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    positive_path = os.path.join(path, filetype, "pos")
    negative_path = os.path.join(path, filetype, "neg")

    positive_files = [os.path.join(positive_path, f)
                      for f in os.listdir(positive_path)]
    negative_files = [os.path.join(negative_path, f)
                      for f in os.listdir(negative_path)]
    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))

    # Derive the labels from the real file counts so they always line up with
    # the texts, instead of assuming exactly 12500 files per class.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts.append(rm_tags(file_input.read()))
    return all_labels, all_texts
# Load labels and tag-stripped review texts for each split from disk.
y_train,train_text=read_files("train")
y_test,test_text=read_files("test")
# Build the vocabulary from the training texts only, so the test split does
# not influence it. Per the Keras Tokenizer docs, num_words=2000 limits the
# conversion below to the most frequent words.
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)
# Convert each review into a list of integer word indices using that vocabulary.
x_train_seq=token.texts_to_sequences(train_text)
x_test_seq=token.texts_to_sequences(test_text)
# Bring every training sequence to a fixed length of 100 (maxlen=100) so the
# reviews can be stacked into a single array.
x_train=sequence.pad_sequences(x_train_seq,maxlen=100)
x_test &#