1.2. Using word embeddings
The code below shows how to use an Embedding layer with a simple classifier on the IMDB data.
"""
1.2.使用词嵌入
"""
# 1. Instantiate an Embedding layer
from keras.layers import Embedding
embedding_layer = Embedding(1000, 64) # The Embedding layer takes at least 2 arguments: the number of possible tokens (1000) and the embedding dimensionality (64); its weight matrix is a 2D tensor of shape (1000, 64)
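# A quick shape check (a minimal sketch; it assumes a TF2-style Keras where layers can be
# called eagerly on NumPy arrays -- with older standalone Keras this may not run as-is):
import numpy as np
toy_batch = np.random.randint(0, 1000, size=(32, 10)) # 32 sequences of 10 token indices
print(embedding_layer(toy_batch).shape)               # (32, 10, 64): one 64-d vector per token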
# 2. Load the IMDB data to feed into an Embedding layer
from keras.datasets import imdb
from keras import preprocessing
max_features = 10000 # Number of words to consider as features
maxlen = 20 # Cut the reviews off after 20 words (drawn from the max_features most common words)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) # Load the data as lists of integers (keeping only the 10,000 most common words)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
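# At this point x_train is a 1D array of 25,000 reviews, each a Python list of word indices
# of varying length, and y_train holds the corresponding 25,000 binary (0/1) sentiment labels.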
# Use preprocessing.sequence.pad_sequences() to turn the lists of integers into a 2D integer tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
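# By default pad_sequences left-pads shorter sequences with 0 and keeps only the last maxlen
# tokens of longer ones, so every review becomes a fixed-length row. A toy illustration
# (hypothetical input, not part of the IMDB pipeline):
print(preprocessing.sequence.pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
# [[0 1 2 3]
#  [0 0 4 5]]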
# 3. Use an Embedding layer and a classifier on the IMDB data
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen)) # Learn an 8-dimensional embedding for each of the 10,000 words; turns the 2D integer input (samples, maxlen) into a 3D float tensor (samples, maxlen, 8)
model.add(Flatten()) # Flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen * 8)
model.add(Dense(1, activation='sigmoid')) # Add the classifier on top: a single sigmoid unit outputting one probability per sample
model.summary()
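# Expected from the summary (a sanity check, not printed output): the Embedding layer has
# 10000 * 8 = 80,000 weights and the Dense layer has (20 * 8) * 1 + 1 = 161, so the model
# should report 80,161 trainable parameters in total.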
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2) # validation_split holds out this fraction of the training data as a validation set when no separate validation set is provided
Output of running the code: