from keras.layers import Dense,BatchNormalization,Bidirectional,CuDNNLSTM,Conv1D,MaxPooling1D,SpatialDropout1D,Dropout,Embedding
import numpy as np
from keras import preprocessing
import jieba
import re
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import Sequential
from keras.preprocessing.text import Tokenizer
def preprocess(file, stop):
    """Load the cnews corpus, segment each article with jieba, drop stop words.

    Parameters
    ----------
    file : str
        Path to a corpus file with one ``<category>\\t<article>`` record per line.
    stop : str
        Path to a stop-word file, one word per line.

    Returns
    -------
    (list[int], list[str])
        Integer labels (see ``categories``) and the cleaned articles.  Each
        article is a whitespace-joined string of the surviving jieba tokens so
        that a whitespace-splitting tokenizer (e.g. keras ``Tokenizer``) can
        recover the individual words.
    """
    categories = {'体育': 0, '财经': 1, '房产': 2, '家居': 3, '教育': 4,
                  '科技': 5, '时尚': 6, '时政': 7, '游戏': 8, '娱乐': 9}
    label = []
    data = []
    # Context managers guarantee both files are closed even on error
    # (the original leaked the handles on any exception).
    with open(stop, 'r', encoding='utf8') as stop_file:
        # A set gives O(1) membership tests; the original scanned a list
        # for every single token.
        stop_words = {line.strip('\r\n') for line in stop_file}
    with open(file, 'r', encoding='utf8') as corpus:
        for line in corpus:
            parts = line.strip('\r\n').split('\t')
            # Skip unknown categories and malformed lines (a category with no
            # '\t' would previously raise IndexError on parts[1]).
            if len(parts) < 2 or parts[0] not in categories:
                continue
            label.append(categories[parts[0]])
            tokens = [w for w in jieba.cut(parts[1]) if w not in stop_words]
            # BUG FIX: the original concatenated tokens with no delimiter (via
            # a '<s>' accumulator), so a whitespace-splitting tokenizer saw
            # each article as one giant "word".  Join with spaces so the words
            # stay separable.
            data.append(' '.join(tokens))
    return label, data
# --- Build the training set --------------------------------------------------
# BUG FIX: the second path is now a raw string like the first; it previously
# only worked because '\B', '\c', '\c' happen not to be escape sequences.
# NOTE(review): the "stop word" argument points at cnews.vocab.txt (the
# vocabulary file) — confirm this is the intended stop-word list.
label, data = preprocess(r'E:\BaiduNetdiskDownload\cnews\cnews.train.txt',
                         r'E:\BaiduNetdiskDownload\cnews\cnews.vocab.txt')

# Fit the vocabulary on the segmented articles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
# BUG FIX: the original called sequences_to_texts(data); mapping raw texts to
# integer word-index sequences (找回索引) requires texts_to_sequences, and
# pad_sequences below expects lists of ints, not strings.
sequences = tokenizer.texts_to_sequences(data)
print(tokenizer.word_index)

# Truncate / left-pad every article to a fixed length of 20 indices.
X_train = preprocessing.sequence.pad_sequences(sequences, maxlen=20)
# NOTE(review): X_test is the same data as X_train here — presumably the real
# held-out split (cnews.test.txt) should be loaded and padded instead; verify.
X_test = preprocessing.sequence.pad_sequences(sequences, maxlen=20)

# One-hot encode the integer labels for categorical cross-entropy.
label = np.asarray(to_categorical(label))

# Shuffle samples and labels with the same random permutation so they stay
# aligned.
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
y_train = label[indices]