# This is a very classic piece of code; I added some comments and model-saving code.
# The corpus and dataset used are publicly available online (searchable by name on Baidu; download links provided below).
from __future__ import print_function
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
# Base directory for all data files (empty string = current working directory).
BASE_DIR = ''
# Path to the pre-trained GloVe embedding files.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
# Path to the 20 Newsgroups text corpus.
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
# Maximum number of tokens kept per document (longer texts are truncated, shorter ones padded).
MAX_SEQUENCE_LENGTH = 1000
# Keep only the MAX_NUM_WORDS most frequent words in the vocabulary.
MAX_NUM_WORDS = 20000
# Dimensionality of the word vectors; must match the GloVe file used (glove.6B.100d.txt → 100).
EMBEDDING_DIM = 100
# Fraction of the data held out for validation.
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector
# Step 1: prepare the GloVe word vectors and their word → vector mapping.
print('Indexing word vectors.')
# Parse each word and its corresponding vector from the GloVe file and store them in a dict.
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
for line in f:
word, coefs