[attention] HAN Model Notes: A Keras Implementation

HAN Model

Theory

  • Paper: Hierarchical Attention Networks for Document Classification
    Link: https://pan.baidu.com/s/1qZieu90lKPMwwj2BxoO2MA
    Extraction code: 6pv0

  • Model overview
    (architecture figure omitted: a word-level bidirectional GRU encoder with word attention produces sentence vectors, a sentence-level bidirectional GRU encoder with sentence attention produces the document vector, and a softmax layer classifies it)
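    The attention used at both levels is the one from the paper; written out for the word level (the sentence level repeats the same steps on the sentence vectors s_i with parameters W_s, b_s, u_s):

    u_{it} = \tanh(W_w h_{it} + b_w)
    \alpha_{it} = \frac{\exp(u_{it}^{\top} u_w)}{\sum_{t}\exp(u_{it}^{\top} u_w)}
    s_i = \sum_{t} \alpha_{it} h_{it}

    Here h_{it} are the bidirectional GRU states and u_w is a learned word-level context vector; the document vector produced by the sentence-level attention feeds the softmax classifier.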

Code Reproduction

  • Data download and description
    See https://blog.csdn.net/qq_36047533/article/details/88360833
  • Data preprocessing code
    Uses jieba for word segmentation, Tokenizer / text_to_word_sequence / texts_to_sequences from keras.preprocessing.text, and re for regex cleanup; a toy segmentation example is shown below, followed by the full preprocessing script.
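A minimal look at the segmentation step first (a sketch only; the exact split can vary with the jieba version and its dictionary):

import jieba

# default (precise) mode; the joined result is roughly: 我 来到 北京 清华大学
print(" ".join(jieba.cut("我来到北京清华大学")))

The full preprocessing script follows.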
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import re


########################### Data preprocessing ##################################

#train_data=pd.read_csv('cnews.train.txt',sep='\t',names=['label','content'])# 50000 rows
#test_data=pd.read_csv('cnews.test.txt',sep='\t',names=['label','content'])# 10000 rows
val_data=pd.read_csv('cnews.val.txt',sep='\t',names=['label','content'])# 5000 rows
train_data=val_data

def read_category(y_train):
    """读取分类目录,固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = dict(zip(categories, range(len(categories))))
    label_id = []
    for i in range(len(y_train)):
        label_id.append(cat_to_id[y_train[i]])
    return label_id
    
train_target=train_data['label']  
train_label=read_category(train_target)

#test_target=test_data['label']  
#test_label=read_category(test_target)

'''
from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot

These utilities split on whitespace, so out of the box they only work for English;
Chinese must be segmented with jieba first before they behave correctly.
-- English: split a paragraph into sentences with nltk's tokenize.sent_tokenize(text),
   then split each sentence into words with text_to_word_sequence;
-- English: Tokenizer can be applied to an English paragraph directly;
-- Chinese: segment with jieba, split into sentences with an re pattern (pieces may contain ''),
   then split each sentence into words with text_to_word_sequence (pieces contain no '');
-- Chinese: Tokenizer has to be fitted on the jieba-segmented text.

text1='some thing to eat'
text2='some thing to drink'
texts=[text1,text2]  # list of "sentences"

text_to_word_sequence and one_hot
print(text_to_word_sequence(text1))  # splits on spaces (Chinese is no exception): ['some', 'thing', 'to', 'eat']
print(one_hot(text1,10))  # [7, 9, 3, 4] -- (10 means the generated ids stay below 10)
print(one_hot(text2,10))  # [7, 9, 3, 1]

Tokenizer
--- attributes
document_count  number of documents processed
word_index      dict mapping every word to its id (ids start at 1)
word_counts     dict mapping every word to its count over all documents
word_docs       dict mapping every word to the number of documents it appears in
index_docs      dict mapping every word id to the number of documents it appears in

--- usage
tokenizer = Tokenizer(num_words=None)  # num_words: None or an int; only the most frequent num_words-1 words are kept
tokenizer.fit_on_texts(texts)          # texts is a list with one document per element
print(tokenizer.word_counts)  # [('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]
print(tokenizer.word_index)   # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)    # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)   # {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}

# texts_to_sequences must be called after fit_on_texts(texts)
# num_words affects the results below: texts_to_matrix has num_words columns
print(tokenizer.texts_to_sequences(texts))  # word-id sequences: [[1, 2, 3, 4], [1, 2, 3, 5]]
print(tokenizer.texts_to_matrix(texts))     # bag-of-words matrix (one-hot style):
# [[ 0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
#  [ 0.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.]]
'''
def clean_str(string):
    """
    String cleaning for the dataset: drop stray backslashes, curly quotes,
    and parenthesised asides (both full-width and ASCII parentheses).
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"”", "", string)
    string = re.sub(r"“", "", string)
    string = re.sub(r"‘", "", string)
    string = re.sub(r"’", "", string)
    string = re.sub(r"（.*?）", "", string)  # remove full-width (Chinese) parentheses and their content
    string = re.sub(r"\(.*?\)", "", string)  # remove ASCII parentheses and their content
    return string.strip()  # strip() with no argument removes surrounding whitespace; .lower() would lowercase
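
A quick sanity check of clean_str (a minimal sketch; the sample string is invented for illustration):

sample = '新浪体育讯（记者 报道）“标题”示例'
print(clean_str(sample))  # the full-width parentheses with their content and the curly quotes
                          # are dropped, leaving roughly: 新浪体育讯标题示例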

for idx in range(train_data.shape[0]):
    para = train_data['content'][idx]
    para1 = clean_str(para)
    train_data.at[idx, 'content'] = para1  # .at avoids pandas' chained-assignment warning

import jieba
train_data['texts'] = train_data['content'].apply(lambda x: " ".join(jieba.cut(x)))

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
tokenizer = Tokenizer(num_words=50000)  # num_words: None or an int; words beyond the 50000 most frequent are dropped


tokenizer.fit_on_texts(train_data['texts'])
train_data1=tokenizer.texts_to_sequences(train_data['texts']) 
#train_data1 vs train_data['content']
'''
#test
test=train_data['content'][2]
tmp=train_data1[2]
t2=[]
for i in range(len(tmp)):
    t1=tmp[i]
    t2.append(idx2vocab[t1])
    
#train_data1 is not yet split into sentences
'''  
word_index = tokenizer.word_index
dic=word_index
idx2vocab={idx+1: char for idx, char in enumerate(dic.keys())}

dic['<PAD>'] = 0##list(dic.keys())中<PAD>在最后一个
vocab2idx=dic
idx2vocab[0]='<PAD>'
 
###########################################################################################################
'''
Paragraph encoding, step 1
'''
MAX_SENT_LENGTH = 150
MAX_SENTS = 15
PARA_NUM=train_data.shape[0]
#MAX_NB_WORDS=len(tokenizer.word_index)+1  # index 0 is reserved for padding; tokenizer.word_index.values() starts at 1
MAX_NB_WORDS = 50000

data = np.zeros((PARA_NUM, MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
sent_stop1=dic['。']  # word id of '。' (3 in this run)
sent_stop2=dic['!']  # word id of '!' (35 in this run)
sent_stop3=dic['?']  # word id of '?' (48 in this run)

for idx in range(len(train_data1)):
    para=train_data1[idx]
    print("********** paragraph " + str(idx) + " **********")
    sent_num=0
    sent_length=0
    sent_start=0
    sent_end=0
    for j in range(len(para)):
        print("********** sentence " + str(sent_num) + " **********")
        # too many sentences already: move on to the next paragraph
        if(sent_num>=MAX_SENTS):
            #print('break')
            break
        # end of a sentence; (j==len(para)-1) covers paragraphs without a final stop
        if(para[j]==sent_stop1)|(para[j]==sent_stop2)|(para[j]==sent_stop3)|(j==len(para)-1):
            #print('find end')
            sent_end=j
            sent_length=sent_end-sent_start+1
            # sentence longer than MAX_SENT_LENGTH: skip it and start the next one
            if(sent_length>MAX_SENT_LENGTH):
                #print(sent_length)
                sent_start=j+1
                continue
            else:
                data[idx][sent_num][:sent_length]=para[sent_start:sent_end+1]
                sent_num=sent_num+1
                sent_start=j+1

print('Total %s unique tokens.' % len(word_index))  # word_index already includes '<PAD>'

labels = to_categorical(train_label)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

TEST_SPLIT=0.3
nb_test_samples = int(TEST_SPLIT * data.shape[0])

x_train = data[:-nb_test_samples]
y_train = labels[:-nb_test_samples]
x_test = data[-nb_test_samples:]
y_test = labels[-nb_test_samples:]
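
pad_sequences is imported at the top but never used. For reference, here is a minimal alternative sketch of the same encoding step that splits on sentence enders with re and pads with pad_sequences; unlike the loop above it simply truncates over-long sentences instead of skipping them, and the helper name encode_paragraph is made up:

def encode_paragraph(text, tokenizer, max_sents=MAX_SENTS, max_sent_len=MAX_SENT_LENGTH):
    # split the jieba-segmented paragraph on 。/!/? and drop empty pieces
    sents = [s for s in re.split(r'[。!?！？]', text) if s.strip()][:max_sents]
    seqs = tokenizer.texts_to_sequences(sents)               # one word-id list per sentence
    seqs = pad_sequences(seqs, maxlen=max_sent_len,
                         padding='post', truncating='post')  # (n_sents, max_sent_len)
    out = np.zeros((max_sents, max_sent_len), dtype='int32')
    out[:len(seqs)] = seqs
    return out

#data_alt = np.stack([encode_paragraph(t, tokenizer) for t in train_data['texts']])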

############################################################################################

Models

  • Baseline for comparison: a two-layer LSTM
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation,Dropout, GRU
#from keras.preprocessing import sequence
from keras import metrics
from keras import backend as K
from keras.callbacks import Callback
import matplotlib.pyplot as plt
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping

# Two-layer LSTM model
def DoubleLSTM(train_x,train_y,test_x,test_y):
    # build the model
    HIDDEN_SIZE1=64
    HIDDEN_SIZE2=32
    BATCH_SIZE=64
    EPOCHS = 500
    
    model=Sequential()
    model.add(LSTM(HIDDEN_SIZE1, input_shape=(MAX_SENTS, MAX_SENT_LENGTH), return_sequences=True))  # return the output at every timestep (note: the raw word-id rows are fed in directly, with no Embedding layer)
    model.add(Dropout(0.2))
    model.add(LSTM(HIDDEN_SIZE2,input_shape=(MAX_SENTS, HIDDEN_SIZE1),return_sequences=False))  # return only the last timestep's output
    model.add(Dropout(0.2))
    model.add(Dense(10,kernel_initializer="uniform",activation='softmax'))
    # callbacks and network summary
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3,factor=0.1,mode='min')    
    #early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='min') 
    
    model.summary()
    # compile the model
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    
    # train the model
    #model.fit(train_x,train_y,batch_size=500,nb_epoch=15,verbose=2,validation_data=(valid_x,valid_y))
    hist=model.fit(train_x,train_y,batch_size=BATCH_SIZE,verbose=1, epochs=EPOCHS,validation_split=0.1, callbacks=[reduce_lr])
    
    # evaluate the model
    score=model.evaluate(test_x,test_y,batch_size=64,verbose=2)
    model.save("doublelstm_textclassification.h5")
    
    print('test_loss:',score[0],'- test_acc:',score[1])
    return hist


hist=DoubleLSTM(x_train,y_train,x_test,y_test)

def acc_plotting(hist, str_save):
    a = hist.history  # e.g. ['val_loss', 'val_acc', 'loss', 'acc'] (plus 'lr' when ReduceLROnPlateau is used)
    loss_ = a['loss']
    val_loss_ = a['val_loss']
    acc_ = a['acc']
    val_acc_ = a['val_acc']
    x = np.arange(1, len(loss_) + 1)

    # plt.figure(figsize=(8, 5))
    plt.figure()
    plt.rcParams['font.family'] = ['simhei']  # only needed if CJK characters appear in the labels

    plt.subplot(1, 2, 1)
    plt.title("Loss")
    plt.grid()
    plt.plot(x, loss_, label='train loss', linewidth=3, color='r')
    plt.plot(x, val_loss_, label='validation loss', linewidth=3, color='blue')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.title("Accuracy")
    plt.grid()
    plt.plot(x, acc_, label='train acc', linewidth=3, color='r')
    plt.plot(x, val_acc_, label='validation acc', linewidth=3, color='blue')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()

    plt.savefig(str_save)  # save before show(); calling savefig after show() can write a blank file
    plt.show()

str_save="double_lstm_model.png"
acc_plotting(hist,str_save)
  • HAN model
    Two versions of the attention layer are given here: version 1 subclasses a Keras Layer to define a custom attention layer, and version 2 composes attention from built-in Keras layers.

    The embedding layer uses the pre-trained Tencent word vectors; see https://blog.csdn.net/weixin_42663919/article/details/104878512

def loadEmbedding(embeddingFile, word2id):
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())  # map() casts both to int; vector_size is 200
        initW = np.random.uniform(-0.25,0.25,(len(word2id), vector_size))  # len(word2id) is the size of our own vocabulary (special tokens included); rows are randomly initialised, so words missing from the file need no special handling
        count = 0
        for i in range(vocab_size):
            line = f.readline()
            lists = line.split(' ')  # each line in the Tencent file is: word followed by its vector
            word = lists[0]
            try: word = word.encode('ISO-8859-1').decode('utf8')
            except: pass
            if word in word2id:
                count += 1
                number = map(float, lists[1:])
                number = list(number)
                vector = np.array(number)
                initW[word2id[word]] = vector  # the word keeps its id: row word2id[word] of initW holds its vector
        print(count)
        initW[word2id['<PAD>']]=np.zeros(vector_size)
        return initW

    
####### load the pre-trained word vectors ###########################
#### point this at your own copy of the file ###############

file = r'E:\Tencent_AILab_ChineseEmbedding.txt'
embedding_matrix=loadEmbedding(file, vocab2idx)  # the vocabulary passed in includes '<PAD>'
    
print('Total %s word vectors.' % len(embedding_matrix))


EMBEDDING_DIM = 200

# building the Hierarchical Attention network. Index 0 is the padding slot; since '<PAD>' was
# added to word_index above, len(word_index) already includes it and matches embedding_matrix's row count.
embedding_layer = Embedding(len(word_index),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            mask_zero=True,trainable=False)



from keras.engine.topology import Layer, InputSpec
from keras import initializers

'''
Attention, version 1: a custom Layer subclass.
Drawback: I have not worked out how to handle the mask here; if you use version 1,
remove "mask_zero=True" from embedding_layer.
'''
class AttLayer(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')  # Keras weight initializer
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(AttLayer, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)), name='W')
        self.b = K.variable(self.init((self.attention_dim, )), name='b')
        self.u = K.variable(self.init((self.attention_dim, 1)), name='u')
        self.trainable_weights = [self.W, self.b, self.u]
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x :[batch_size, sel_len, input_dim]
        # size of b :[batch_size, attention_dim]
        # size of W :[input_dim, attention_dim]
        # size of xW:[batch_size, sel_len, attention_dim]
        # uit = tanh(xW+b)
        # self.u:(attention_dim, 1)        
        # size of ait:[batch_size, sel_len]
        # size of output:[batch_size,input_dim]
     
  
            
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)  # K.dot is a true matrix product of two tensors/variables
        ait = K.squeeze(ait, -1)  # drop the size-1 dimension at axis -1

        ait = K.exp(ait)

        # if mask is not None:
        #     # Cast the mask to floatX to avoid float64 upcasting in theano; cast changes the dtype
        #     ait *= K.cast(mask, K.floatx())

        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)  # add a size-1 dimension: [batch_size, sel_len, 1]
        # (K.eval can be used to inspect tensor values when debugging)
        weighted_input = x * ait  # element-wise product, broadcast over the feature axis
        output = K.sum(weighted_input, axis=1)  # [batch_size, sel_len, input_dim] -> [batch_size, input_dim]

        return output

    # def call(self, x):
    #     # size of x :[batch_size, sel_len, input_dim]
    #     # size of b :[batch_size, attention_dim]
    #     # size of W :[input_dim, attention_dim]
    #     # size of xW:[batch_size, sel_len, attention_dim]
    #     # H = tanh(xW+b)--[batch_size, sel_len, attention_dim]
    #     # self.V:(attention_dim, 1)        
    #     # size of score:[batch_size, sel_len,1]
    #     # size of output:[batch_size,input_dim]        
    #     H = K.tanh(K.dot(x, self.W) + self.b)
    #     score = K.softmax(K.dot(H, self.u), axis=1)
    #     outputs = K.sum(score * x, axis=1)
    #    # print("attention output shape:",outputs.shape)
    #     return outputs
    

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
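
A quick shape check of the custom layer on random data (a minimal sketch; the 12 and 64 are arbitrary, and Input/Model/np are already imported above):

x_in = Input(shape=(12, 64))                            # (timesteps, features)
probe = Model(x_in, AttLayer(50)(x_in))                 # attention_dim = 50
print(probe.output_shape)                               # expected: (None, 64), one vector per sequence
print(probe.predict(np.random.rand(2, 12, 64)).shape)   # (2, 64)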

      
def HAN(train_x,train_y,test_x,test_y):
    # build the model
    '''
    Globals used here (defined above):
    MAX_SENT_LENGTH = 150
    MAX_SENTS = 15
    PARA_NUM = train_data.shape[0]
    #MAX_NB_WORDS=len(tokenizer.word_index)+1  # index 0 is reserved; word_index values start at 1
    MAX_NB_WORDS = 50000
    '''
    
    HIDDEN_SIZE1=100
    ATTENTION_DIM=100
    BATCH_SIZE=64
    EPOCHS = 30
    
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)  # output: [batch, MAX_SENT_LENGTH, EMBEDDING_DIM]
    l_lstm = Bidirectional(GRU(HIDDEN_SIZE1, return_sequences=True))(embedded_sequences)  # output: [batch, MAX_SENT_LENGTH, 2*HIDDEN_SIZE1]
    l_att = AttLayer(ATTENTION_DIM)(l_lstm)  # output: [batch, 2*HIDDEN_SIZE1]
    sentEncoder = Model(sentence_input, l_att)

    review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    # mask_ = Masking(mask_value=0)(review_input)
    # apply sentEncoder to every timestep (every sentence) of review_input

    # l_lstm_test = Bidirectional(GRU(HIDDEN_SIZE1, return_sequences=True))(review_input)
    # l_att_test = AttLayer(ATTENTION_DIM)(l_lstm_test)
    # preds = Dense(10, activation='softmax')(l_att_test)
    # model = Model(review_input, preds)

    review_encoder = TimeDistributed(sentEncoder)(review_input)  # outputs s_i: [batch, MAX_SENTS, 2*HIDDEN_SIZE1]
    l_lstm_sent = Bidirectional(GRU(HIDDEN_SIZE1, return_sequences=True))(review_encoder)  # output: [batch, MAX_SENTS, 2*HIDDEN_SIZE1]
    l_att_sent = AttLayer(ATTENTION_DIM)(l_lstm_sent)
    preds = Dense(10, activation='softmax')(l_att_sent)
    model = Model(review_input, preds)

    
    
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
        
    print("model fitting - Hierachical attention network")
    #model.fit(train_x, train_y,batch_size=BATCH_SIZE,verbose=1, epochs=EPOCHS,validation_split=0.1)
    
    indices = np.arange(train_x.shape[0])
    np.random.shuffle(indices)
    train_x = train_x[indices]
    train_y = train_y[indices]

    VALIDATION_SPLIT=0.1
    nb_validation_samples = int(VALIDATION_SPLIT * train_x.shape[0])
    
    train_x1 = train_x[:-nb_validation_samples]
    train_y1 = train_y[:-nb_validation_samples]
    x_val = train_x[-nb_validation_samples:]
    y_val = train_y[-nb_validation_samples:]    
    hist1=model.fit(train_x1, train_y1, validation_data=(x_val, y_val),
          epochs=EPOCHS, batch_size=BATCH_SIZE)
    

    score=model.evaluate(test_x,test_y,batch_size=BATCH_SIZE,verbose=2)
    model.save("HAN_textclassification.h5")
    
    print('test_loss:',score[0],'- test_acc:',score[1])
    return hist1

hist1=HAN(x_train,y_train,x_test,y_test)
str_save="HAN模型.png"
acc_plotting(hist1,str_save)


#def attention_3d_block(hidden_states,attention_dim):


'''
Attention, version 2: composed from built-in Keras layers.
With version 2 there is no need to remove "mask_zero=True" from embedding_layer.
'''
from keras.layers import Dense, Lambda, dot, Activation, concatenate,Reshape, Softmax
from keras import backend as K
# d1 = dot([a1, b1], axes=[1, 1])  -- keras.layers.dot contracts the listed axes of the two tensors
# c1 = K.dot(a1, b1)               -- backend K.dot is the plain matrix product

def attention_3d_block(hidden_states,attention_dim,time_steps):
    """
    # size of x :[batch_size, sel_len, input_dim]
    # size of b :[batch_size, attention_dim]
    # size of W :[input_dim, attention_dim]
    # size of xW:[batch_size, sel_len, attention_dim]   
   
    # uit = tanh(xW+b)  (score_first_part ):[batch_size, sel_len, attention_dim]
    # self.u:(attention_dim, 1) ,ait = K.dot(uit, self.u)      
    # size of ait (attention_weights):[batch_size, sel_len]
   
    # outputs = K.sum(score * inputs, axis=1)
    # size of output:[batch_size,input_dim]
    """
    hidden_size = int(hidden_states.shape[2])
    #time_steps=int(hidden_states.shape[1])
    
    # Inner dense layer:
    #              hidden_states            dot               W              =>      score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, attention_dim) => (batch_size, time_steps, attention_dim)
    # W is the trainable weight matrix of the attention scoring layer (uit = tanh(xW + b), as in the HAN paper)
    
    score_first_part = Dense(attention_dim, use_bias=True,activation='tanh', name='attention_score_vec')(hidden_states)
    #h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    #score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    
    #             score_first_part            dot         u         =>  attention_weights
    # (batch_size, time_steps, attention_dim) dot (attention_dim, 1) => (batch_size, time_steps, 1)
    
    scores=Dense(1,use_bias=False,name='attention_score')(score_first_part)
    attention_weights = Softmax(axis=1)(scores)
    #attention_weights = Reshape((time_steps,))(attention_weights)
    
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps,1) => (batch_size, hidden_size,1)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    context_vector = Reshape((hidden_size,))(context_vector)#(batch_size, hidden_size,1) => (batch_size, hidden_size)
    
    return context_vector
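
The same kind of shape check works for the composed block (a minimal sketch; 15×200 mirrors the sentence-level input of the model below, and Input/Model/np are already imported above):

h_in = Input(shape=(15, 200))                             # (time_steps, 2*HIDDEN_SIZE1)
probe = Model(h_in, attention_3d_block(h_in, 100, 15))
print(probe.output_shape)                                 # expected: (None, 200)
print(probe.predict(np.random.rand(2, 15, 200)).shape)    # (2, 200)

The attention weights themselves can be read out the same way, by building a Model whose output is the Softmax layer's tensor.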
    
def HAN1(train_x,train_y,test_x,test_y):
    # build the model
    '''
    Globals used here (defined above):
    MAX_SENT_LENGTH = 150
    MAX_SENTS = 15
    PARA_NUM = train_data.shape[0]
    #MAX_NB_WORDS=len(tokenizer.word_index)+1  # index 0 is reserved; word_index values start at 1
    MAX_NB_WORDS = 50000
    '''
    
    HIDDEN_SIZE1=100
    ATTENTION_DIM=100
    BATCH_SIZE=64
    EPOCHS = 30
    
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    #mask_ = Masking(mask_value=0)(sentence_input)
    embedded_sequences = embedding_layer(sentence_input)
    l_lstm =  Bidirectional(GRU(HIDDEN_SIZE1, return_sequences=True))(embedded_sequences)
    l_att = attention_3d_block(l_lstm,ATTENTION_DIM,MAX_SENT_LENGTH)
    sentEncoder = Model(sentence_input, l_att)
    
    review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    # apply sentEncoder to every timestep (every sentence) of review_input
    review_encoder = TimeDistributed(sentEncoder)(review_input)  # outputs s_i
    l_lstm_sent =Bidirectional(GRU(HIDDEN_SIZE1, return_sequences=True))(review_encoder)
    l_att_sent = attention_3d_block(l_lstm_sent,ATTENTION_DIM,MAX_SENTS)
    preds = Dense(10, activation='softmax')(l_att_sent)
    model = Model(review_input, preds)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3,factor=0.1,mode='min')    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='min')    
 
    model.summary()
    
    # model.compile(loss='categorical_crossentropy',
    #               optimizer='rmsprop',
    #               metrics=['acc'])
    
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

    print("model fitting - Hierachical attention network")
    #model.fit(train_x, train_y,batch_size=BATCH_SIZE,verbose=1, epochs=EPOCHS,validation_split=0.1)
    
    indices = np.arange(train_x.shape[0])
    np.random.shuffle(indices)
    train_x = train_x[indices]
    train_y = train_y[indices]

    VALIDATION_SPLIT=0.1
    nb_validation_samples = int(VALIDATION_SPLIT * train_x.shape[0])
    
    train_x1 = train_x[:-nb_validation_samples]
    train_y1 = train_y[:-nb_validation_samples]
    x_val = train_x[-nb_validation_samples:]
    y_val = train_y[-nb_validation_samples:]    
    hist1=model.fit(train_x1, train_y1, validation_data=(x_val, y_val),
          epochs=EPOCHS, batch_size=BATCH_SIZE,callbacks=[reduce_lr,early_stopping])
    
    

    score=model.evaluate(test_x,test_y,batch_size=BATCH_SIZE,verbose=2)
    model.save("HAN_textclassification.h5")
    
    print('test_loss:',score[0],'- test_acc:',score[1])
    return hist1

hist1=HAN1(x_train,y_train,x_test,y_test)
str_save="HAN1模型.png"
acc_plotting(hist1,str_save)

Results

  • Two-layer LSTM baseline
    (training-curve figure omitted)
Epoch 254/500
3150/3150 [==============================] - 1s 425us/step - loss: 1.7993 - acc: 0.3768 - val_loss: 2.0158 - val_acc: 0.3114
Epoch 255/500
3150/3150 [==============================] - 1s 427us/step - loss: 1.7895 - acc: 0.3810 - val_loss: 2.0158 - val_acc: 0.3114
Epoch 256/500
3150/3150 [==============================] - 1s 429us/step - loss: 1.7902 - acc: 0.3835 - val_loss: 2.0158 - val_acc: 0.3114

...
Epoch 497/500
3150/3150 [==============================] - 1s 428us/step - loss: 1.7971 - acc: 0.3797 - val_loss: 2.0158 - val_acc: 0.3114
Epoch 498/500
3150/3150 [==============================] - 1s 426us/step - loss: 1.7918 - acc: 0.3752 - val_loss: 2.0158 - val_acc: 0.3114
Epoch 499/500
3150/3150 [==============================] - 1s 428us/step - loss: 1.7962 - acc: 0.3759 - val_loss: 2.0158 - val_acc: 0.3114
Epoch 500/500
3150/3150 [==============================] - 1s 426us/step - loss: 1.7877 - acc: 0.3841 - val_loss: 2.0158 - val_acc: 0.3114
test_loss: 1.9885118150711059 - test_acc: 0.29733333325386047
  • HAN model
    (training-curve figure omitted; judging by the attention_score_vec / attention_score layer names in the summary, the run below uses the second attention implementation)
_______________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_2 (InputLayer)            (None, 15, 150)      0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 15, 200)      16654200    input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 15, 200)      180600      time_distributed_1[0][0]         
__________________________________________________________________________________________________
attention_score_vec (Dense)     (None, 15, 100)      20100       bidirectional_2[0][0]            
__________________________________________________________________________________________________
attention_score (Dense)         (None, 15, 1)        100         attention_score_vec[0][0]        
__________________________________________________________________________________________________
softmax_2 (Softmax)             (None, 15, 1)        0           attention_score[0][0]            
__________________________________________________________________________________________________
context_vector (Dot)            (None, 200, 1)       0           bidirectional_2[0][0]            
                                                                 softmax_2[0][0]                  
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 200)          0           context_vector[0][0]             
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 10)           2010        reshape_2[0][0]                  
==================================================================================================
Total params: 16,857,010
Trainable params: 403,610
Non-trainable params: 16,453,400
__________________________________________________________________________________________________
model fitting - Hierachical attention network
Train on 3150 samples, validate on 350 samples
Epoch 1/30
3150/3150 [==============================] - 296s 94ms/step - loss: 1.1192 - acc: 0.6857 - val_loss: 0.1612 - val_acc: 0.9457
Epoch 2/30
3150/3150 [==============================] - 351s 111ms/step - loss: 0.1314 - acc: 0.9616 - val_loss: 0.1265 - val_acc: 0.9657
Epoch 3/30
3150/3150 [==============================] - 356s 113ms/step - loss: 0.0776 - acc: 0.9756 - val_loss: 0.1040 - val_acc: 0.9714
Epoch 4/30
3150/3150 [==============================] - 377s 120ms/step - loss: 0.0554 - acc: 0.9848 - val_loss: 0.1137 - val_acc: 0.9771
Epoch 5/30
3150/3150 [==============================] - 363s 115ms/step - loss: 0.0383 - acc: 0.9902 - val_loss: 0.1038 - val_acc: 0.9771
Epoch 6/30
3150/3150 [==============================] - 358s 114ms/step - loss: 0.0256 - acc: 0.9937 - val_loss: 0.1109 - val_acc: 0.9771
Epoch 7/30
3150/3150 [==============================] - 358s 114ms/step - loss: 0.0301 - acc: 0.9927 - val_loss: 0.1154 - val_acc: 0.9686
Epoch 8/30
3150/3150 [==============================] - 366s 116ms/step - loss: 0.0098 - acc: 0.9994 - val_loss: 0.1132 - val_acc: 0.9771
Epoch 9/30
3150/3150 [==============================] - 366s 116ms/step - loss: 0.0065 - acc: 0.9997 - val_loss: 0.1089 - val_acc: 0.9771
Epoch 10/30
3150/3150 [==============================] - 352s 112ms/step - loss: 0.0061 - acc: 0.9997 - val_loss: 0.1082 - val_acc: 0.9771
Epoch 11/30
3150/3150 [==============================] - 358s 114ms/step - loss: 0.0059 - acc: 0.9997 - val_loss: 0.1077 - val_acc: 0.9771
Epoch 12/30
3150/3150 [==============================] - 661s 210ms/step - loss: 0.0057 - acc: 0.9997 - val_loss: 0.1077 - val_acc: 0.9771
Epoch 13/30
3150/3150 [==============================] - 835s 265ms/step - loss: 0.0057 - acc: 0.9997 - val_loss: 0.1078 - val_acc: 0.9771
Epoch 14/30
3150/3150 [==============================] - 818s 260ms/step - loss: 0.0057 - acc: 0.9997 - val_loss: 0.1077 - val_acc: 0.9771
Epoch 15/30
3150/3150 [==============================] - 823s 261ms/step - loss: 0.0056 - acc: 0.9997 - val_loss: 0.1077 - val_acc: 0.9771
Epoch 00015: early stopping
test_loss: 0.05889446383342147 - test_acc: 0.9846666666666667