Keras sentence classification: keras_demo_for_sentence_classification (simplified version)

'''
This script loads pre-trained word embeddings (word2vec embeddings)
into a Keras Embedding layer and uses them to train a text classification model on a custom dataset.
'''

from __future__ import print_function
from collections import defaultdict
import os
import re
import numpy as np
import pandas as pd
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Convolution1D, Dropout, Activation 
from keras.models import Sequential
from keras.models import Model

w2v_file = 'G:/pre_trained word embeddings/word2vec/vectors.bin'
train_data = './cqa_title/traindata/userprofilepythontitle.txt'
test_data = './cqa_title/testdata/TestQuestionsPythonTitle.txt'
EMBEDDING_DIM = 400
MAX_SEQUENCE_LENGTH = 25
NB_FILTER = 128
FILTER_LENGTH = 5


def build_data_cv(clean_string=False):
    """
    Loads data.
    """
    revs = []
    vocab = defaultdict(float)
    with open(train_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                    vocab[word] += 1
            datum  = {"y":n_y-1, 
                      "text": " ".join(words[1:]),                             
                      "num_words": len(words)-1,
                      "split": 0}
            revs.append(datum)
    with open(test_data, "r") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                    vocab[word] += 1
            datum  = {"y":n_y-1, 
                      "text": " ".join(words[1:]),                             
                      "num_words": len(words)-1,
                      "split": 1}
            revs.append(datum)
    return revs, vocab

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip() if TREC else string.strip().lower()

def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)   
            if word in vocab:
               word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')  
            else:
                f.read(binary_len)
    return word_vecs
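
As an alternative to parsing the word2vec binary format by hand, the same vectors could be loaded with gensim. A sketch, assuming gensim is installed (attribute names differ slightly across gensim versions):

from gensim.models import KeyedVectors

def load_bin_vec_gensim(fname, vocab):
    # hypothetical helper, equivalent in spirit to load_bin_vec above
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    return {w: kv[w] for w in vocab if w in kv}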

def add_unknown_words(word_vecs, vocab, min_df=1, k=EMBEDDING_DIM):
    """
    For words that occur in at least min_df documents, create a separate word vector.    
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,k)
    return word_vecs
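
A quick check of the 0.25 heuristic mentioned in the docstring: each component of U(-0.25, 0.25) has variance 0.25**2 / 3 ≈ 0.021, which should be roughly comparable to the spread of the pre-trained vectors (the exact value depends on the word2vec file):

rand_vec = np.random.uniform(-0.25, 0.25, EMBEDDING_DIM)
print(rand_vec.var())   # ≈ 0.02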

def get_W(word_vecs, k=EMBEDDING_DIM):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1, k), dtype='float32')            
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
Using Theano backend.
print ("loading data...")
revs, vocab = build_data_cv()
df = pd.DataFrame(revs)
max_l = df["num_words"].max()
print ("data loaded!")
print ("number of sentences: " + str(len(revs)))
print ("vocab size: " + str(len(vocab)))
print ("max sentence length: " + str(max_l))
loading data...
data loaded!
number of sentences: 252313
vocab size: 40441
max sentence length: 37
print ("loading data...")
revs, vocab = build_data_cv()
loading data...
print (type(revs), len(revs),type(vocab),len(vocab))
<type 'list'> 252313 <type 'collections.defaultdict'> 40441
print (revs[0])
{'y': 0, 'text': 'parsing and modification of sql statements in java', 'split': 0, 'num_words': 8}
Summary 1: revs is the list holding the data (252,313 items); vocab is the vocabulary, a dict with 40,441 words.
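
For reference, build_data_cv expects every line to start with an integer class label followed by the title tokens. A hypothetical line and the datum it produces:

line = "1 parsing and modification of sql statements in java"   # hypothetical input line
words = line.lower().split()
datum = {"y": int(words[0]) - 1,          # labels are shifted to start at 0
         "text": " ".join(words[1:]),
         "num_words": len(words) - 1,
         "split": 0}                      # 0 = train, 1 = test
print(datum)   # same structure as revs[0] above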
print ("loding word2vec vectors...")
w2v = load_bin_vec(w2v_file, vocab)
print ("word2vec loaded!")
print ("num words already in word2vec: " + str(len(w2v)))
w2v = add_unknown_words(w2v, vocab)
W, word_idx_map = get_W(w2v)
print ("dataset creaded!")
loading word2vec vectors...
word2vec loaded!
num words already in word2vec: 28520
dataset created!
type(w2v) #word embeddings
dict
len(w2v)
40441
print (type(W),len(W))
print (W.shape)
<type 'numpy.ndarray'> 40442
(40442L, 400L)
Summary 2: the word vectors start out as the dict w2v and are converted into the np.ndarray W; W has one extra row at the front because the sentences are zero-padded, and that all-zero row corresponds to the padding index.
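
A couple of sanity checks on W (a sketch; concrete values depend on the loaded vectors):

assert np.all(W[0] == 0)                   # row 0 is reserved for the padding index
some_word = next(iter(word_idx_map))
assert np.allclose(W[word_idx_map[some_word]], w2v[some_word])   # row i holds the vector of word i
print(W.shape)                             # (len(vocab) + 1, EMBEDDING_DIM) = (40442, 400)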
texts = df.text
tokenizer = Tokenizer(nb_words=len(vocab))
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print ("Found %s unique tokens." % len(word_index))
data = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)
labels = df.y
labels = to_categorical(np.asarray(labels))
print ("Shape of the data tensor:", data.shape)
print("Shape of the label tensor:", labels.shape)
Found 40435 unique tokens.
Shape of the data tensor: (252313L, 25L)
Shape of the label tensor: (252313L, 2064L)
print (type(data),len(data), type(labels))
<type 'numpy.ndarray'> 252313 <type 'numpy.ndarray'>
print(type(data[0]), len(data[0]), data[0].shape)
print(data[0])
print(data[1])
print(data[2]) # does Keras's pad_sequences pad with zeros at the front? (answered below)
<type 'numpy.ndarray'> 25 (25L,)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0  385    8 2985    6   26  727    2   27]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0   13  234    1 4315    1 5683]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   20    9    5  432  168   78  124  385 1913]
Summary 3: data is the padded text, an np.ndarray with 252,313 rows of length 25; labels holds the corresponding labels with shape (252313, 2064).
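
To answer the question in the comment above: pad_sequences pads (and truncates) at the front by default; pass padding='post' to pad at the end instead. A quick illustration with the pad_sequences already imported above:

print(pad_sequences([[3, 7, 2]], maxlen=5))                   # [[0 0 3 7 2]]  (default padding='pre')
print(pad_sequences([[3, 7, 2]], maxlen=5, padding='post'))   # [[3 7 2 0 0]]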
x_train = data[:248300]
y_train = labels[:248300]
x_test = data[248300:]
y_test = labels[248300:]
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable=True so as to fine-tune the embeddings
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[W],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

Summary 4: vocab is the vocabulary dict; we add 1 for the zero-padding index. EMBEDDING_DIM is the embedding dimension, 400. weights is the embedding matrix of shape (40442, 400). The input length is 25.
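
One caveat worth flagging: the integer ids in data come from tokenizer.word_index, while W was built from word_idx_map, so the two index spaces are not guaranteed to line up. A minimal sketch (using the w2v dict and word_index built above) of a weight matrix tied to the tokenizer's own indices; the aligned layer is a hypothetical alternative and is not used by the model below:

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
for word, i in word_index.items():
    vec = w2v.get(word)
    if vec is not None:
        embedding_matrix[i] = vec          # row i matches the id the tokenizer assigns to this word
aligned_embedding_layer = Embedding(len(word_index) + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=True)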
print ("Training model.")
model = Sequential()
model.add(embedding_layer)
model.add(Convolution1D(nb_filter=NB_FILTER,
                        filter_length=FILTER_LENGTH,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
# use max pooling:
model.add(MaxPooling1D(pool_length=model.output_shape[1]))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# We flatten the output of the conv/pooling block so that we can add a vanilla dense layer:
model.add(Flatten())
# We add a vanilla hidden layer:
model.add(Dense(128, activation='relu'))
# We project onto an output layer with one unit per class and squash it with a softmax:
model.add(Dense(len(labels[0]), activation='softmax'))
model.summary()
Training model.
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
embedding_1 (Embedding)          (None, 25, 400)       16176800    embedding_input_1[0][0]          
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 21, 128)       256128      embedding_1[0][0]                
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 1, 128)        0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 1, 128)        0           maxpooling1d_1[0][0]             
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 1, 128)        0           dropout_1[0][0]                  
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 128)           0           activation_1[0][0]               
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 128)           16512       flatten_1[0][0]                  
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2064)          266256      dense_1[0][0]                    
====================================================================================================
Total params: 16,715,696
Trainable params: 16,715,696
Non-trainable params: 0
____________________________________________________________________________________________________
Summary 5: parameter counts (a quick cross-check follows below).
  • embedding layer: 40442 * 400 = 16176800
  • convolution layer: 128 * 5 * 400 + 128 = 256128
  • dense_1 (hidden fully connected layer): 128 * 128 + 128 = 16512
  • dense_2 (softmax output layer): 128 * 2064 + 2064 = 266256
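
These per-layer counts can also be read off the model directly, e.g. (assuming layer.count_params() is available in this Keras version):

for layer in model.layers:
    print(layer.name, layer.count_params())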

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          nb_epoch=20, batch_size=128)
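
After training, the held-out split can be scored explicitly as well; a minimal sketch (with metrics=['acc'], evaluate returns [loss, accuracy]):

score = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test accuracy:", score)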