#!/usr/bin/env python
# -*- coding:utf-8 -*-
import numpy as np
import os
import sys
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation
# 2.2 Data preprocessing
# This part sets the training parameters and reads in the pretrained GloVe word-vector file.
# -- Read the texts into a list, storing each document as one str, giving a [str]
BASE_DIR = '/home/lich/Workspace/Learning'
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
batch_size = 32
# first, build index mapping words in the embeddings set
# to their embedding vector
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
#Found 400000 word vectors.
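# A quick sanity check (an illustrative addition, not from the original
# article): every GloVe vector should have EMBEDDING_DIM entries, and a
# common word such as 'the' should be present in the index.
assert 'the' in embeddings_index
assert embeddings_index['the'].shape == (EMBEDDING_DIM,)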
# second, prepare text samples and their labels
print('Processing text dataset')
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
path = os.path.join(TEXT_DATA_DIR, name)
if os.path.isdir(path):
label_id = len(labels_index)
labels_index[name] = label_id
for fname in sorted(os.listdir(path)):
if fname.isdigit():
fpath = os.path.join(path, fname)
if sys.version_info < (3,):
f = open(fpath)
else:
f = open(fpath, encoding='latin-1')
                texts.append(f.read())  # read the whole document at once, not line by line
f.close()
labels.append(label_id)
print('Found %s texts.' % len(texts))
#Found 19997 texts.
#### labels_index maps one-to-one onto the 20 categories of 20_newsgroup
labels_index['alt.atheism']
#0
labels_index['comp.sys.ibm.pc.hardware']
#3
labels[:10]
#[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
labels[1000:1010]
#[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels[2000:2010]
#[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
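# To see the whole mapping at a glance, the dictionary can be printed in
# id order (an illustrative helper, not part of the original walkthrough):
for name, idx in sorted(labels_index.items(), key=lambda kv: kv[1]):
    print(idx, name)
# 0 alt.atheism
# ...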
#### 2.3 Tokenize
# Tokenize all texts: fit the Tokenizer, then map every word in each str to its integer index.
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
#Found 214909 unique tokens.
# The code above converted every word into an integer index
word_index['newsgroups']
# 43
sequences[2][:20]
# [43, 127, 357, 44, ...]
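# Decoding a sequence back into words makes the mapping concrete (an
# illustrative check, assuming the usual word_index layout with ids from 1):
index_word = dict((i, w) for w, i in word_index.items())
print(' '.join(index_word[i] for i in sequences[2][:10]))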
### 2.4 Split into training and validation sets
# data is a 2D array: one row per document, each padded/truncated to length 1000
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# labels are converted to one-hot encoding
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# ('Shape of data tensor:', (19997, 1000))
# ('Shape of label tensor:', (19997, 20))
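# pad_sequences pads and truncates at the front by default, so shorter
# documents end up right-aligned behind leading zeros; a toy example (illustrative):
print(pad_sequences([[1, 2, 3]], maxlen=5))
# [[0 0 1 2 3]]
print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5))
# [[2 3 4 5 6]]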
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices) # shuffle features and labels in sync through one shared index permutation
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_train.shape
#(15998, 1000)
y_train.shape
#(15998, 20)
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
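# Sanity check on the split (illustrative): 20% of 19997 samples is 3999,
# and the shuffle should leave every class represented in both subsets.
print(x_val.shape, y_val.shape)
# ((3999, 1000), (3999, 20))
print(y_train.sum(axis=0))  # per-class sample counts in the training set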
#### 2.5 Build the embedding matrix
print('Preparing embedding matrix.')
nb_words = min(MAX_NB_WORDS, len(word_index))  # caps the vocabulary at MAX_NB_WORDS; word_index holds all 214909 tokens
#20000
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
if i > MAX_NB_WORDS:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)
#(20001, 100)
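# How much of the capped vocabulary is actually covered by GloVe? Rows that
# stayed all-zero correspond to out-of-vocabulary words (illustrative check):
covered = np.count_nonzero(np.abs(embedding_matrix).sum(axis=1))
print('%d of %d rows have a pretrained vector' % (covered, nb_words + 1))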
### 2.6 LSTM training
# Note: the training data has shape (N_SAMPLES, MAX_SEQUENCE_LENGTH); the Embedding layer
# replaces each index with a 100-d word vector, producing a 3D tensor (N_SAMPLES, MAX_SEQUENCE_LENGTH, 100)
embedding_layer = Embedding(nb_words + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
dropout=0.2)
print('Build model...')
# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer(sequence_input)
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2)) # try using a GRU instead, for fun
model.add(Dense(len(labels_index), activation='softmax'))
model.layers[0].trainable = False  # freeze the pretrained Embedding layer (model.layers[0])
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
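# model.summary() shows where the parameters live (an illustrative check):
# the frozen embedding holds (nb_words + 1) * EMBEDDING_DIM = 2,000,100 weights,
# while the LSTM contributes 4 * (100 + 100 + 1) * 100 = 80,400 trainable ones.
model.summary()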
print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=5,
validation_data=(x_val, y_val))
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)
print('Validation score:', score)
print('Validation accuracy:', acc)
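# Using the trained model on unseen text (a minimal sketch; new_texts is a
# hypothetical example string, not from the original article):
index_to_label = dict((i, name) for name, i in labels_index.items())
new_texts = ['OpenGL rendering on a cheap graphics card']
new_data = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                         maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(new_data)
print(index_to_label[int(np.argmax(pred[0]))])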
# Author: 鱼er
# Link: http://www.jianshu.com/p/795a5e2cd10c
# Source: Jianshu (简书)