Implementing an LSTM in Keras on the 20_newsgroup News Dataset

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import numpy as np
import os
import sys
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

# 2.2 Data preprocessing
# This part sets the training-related parameters and loads the pretrained GloVe word-vector file.
# -- Read the texts into a list: each document is stored as one str, giving a [str]
BASE_DIR = '/home/lich/Workspace/Learning'
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
batch_size = 32

# first, build index mapping words in the embeddings set
# to their embedding vector

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
#Found 400000 word vectors.
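# Quick sanity check on the loaded vectors (an illustrative extra step, assuming the
# 100d GloVe file loaded above): every entry should be a 100-dimensional float32 array.
print(embeddings_index['the'].shape)
# (100,)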

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                texts.append(f.read())  # read the whole document at once, not line by line
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))
#Found 19997 texts.

#### labels_index maps one-to-one to the 20 categories of 20_newsgroup

labels_index['alt.atheism']
#0
labels_index['comp.sys.ibm.pc.hardware']
#3
labels[:10]
#[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
labels[1000:1010]
#[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
labels[2000:2010]
#[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


#### 2.3 Tokenize
# Fit the Tokenizer on all texts, then convert each str in texts to a sequence of word indices.

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
#Found 214909 unique tokens.

# The code above converts every word into an integer index
word_index['newsgroups']
# 43

sequences[2][:20]
# [43, 127, 357, 44, ...]
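# To see which words those indices stand for, the tokenizer's word_index can be inverted
# (an illustrative sketch; index_word is an ad-hoc helper, not part of the original script):
index_word = {i: w for w, i in word_index.items()}
print([index_word[i] for i in sequences[2][:5]])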


### 2.4 Split into train and validation sets

# each row of data is padded/truncated to length MAX_SEQUENCE_LENGTH (1000)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# labels are converted to one-hot encoding
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# ('Shape of data tensor:', (19997, 1000))
# ('Shape of label tensor:', (19997, 20))

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)  # shuffle features and labels together via one index permutation
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_train.shape
#(15998, 1000)

y_train.shape
#(15998, 20)

x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Preparing embedding matrix.')




#### 2.5 Build the embedding matrix

nb_words = min(MAX_NB_WORDS, len(word_index))  # cap the vocabulary size at MAX_NB_WORDS
#20000
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)
#(20001, 100)
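# Rough coverage check (illustrative): rows that stayed all-zero are words with no match
# in embeddings_index, so this counts how many of the kept words actually got a GloVe vector.
print(np.count_nonzero(np.any(embedding_matrix != 0, axis=1)))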


### 2.6 LSTM training
# Note: the training data has shape (N_SAMPLES, MAX_SEQUENCE_LENGTH); the Embedding layer
# expands it into a 3D tensor of shape (N_SAMPLES, MAX_SEQUENCE_LENGTH, 100), where 100 is the word-vector length.
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            dropout=0.2)

print('Build model...')
# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer()
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(len(labels_index), activation='softmax'))  # 20-way softmax over the newsgroup classes
model.layers[0].trainable = False  # freeze the pretrained GloVe embedding layer
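# Optional sanity check before compiling (illustrative): print the layer stack to confirm
# Embedding -> LSTM -> Dense softmax with the expected output shapes.
model.summary()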

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=5,
          validation_data=(x_val, y_val))
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
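# Predicting on unseen text (an illustrative sketch; the sample sentence and id_to_label
# are ad-hoc helpers, not part of the original script):
new_seq = tokenizer.texts_to_sequences(["the new graphics card driver keeps crashing on boot"])
new_data = pad_sequences(new_seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(new_data)
id_to_label = {v: k for k, v in labels_index.items()}
print(id_to_label[int(np.argmax(pred[0]))])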

Author: 鱼er
Link: http://www.jianshu.com/p/795a5e2cd10c
Source: 简书 (Jianshu)