基于Siamese孪生网络的相似问答检索,简单实用

041b86a9f7a990c58f03ac9a0ed71e61.png

完整代码如下。

# -*- coding: utf-8 -*-
"""Train a Siamese MaLSTM network that scores similarity of Chinese question pairs."""
import datetime
import itertools
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from tensorflow import keras

# Reproducibility
np.random.seed(0)
tf.random.set_seed(0)

# Load the question-pair dataset.
# NOTE(review): the scraped original had sep='' (invalid for read_csv); the
# ATEC NLP dump is tab-separated, so '\t' is assumed — confirm against the file.
df = pd.read_csv('./data/atec_nlp.csv', sep='\t', names=['question1', 'question2', 'label'])

# Split into train / test sets.
train_df, test_df, y_train, y_test = train_test_split(
    pd.DataFrame(df, columns=['question1', 'question2']), df['label'],
    test_size=0.2)

# Embed the characters of the dataset:
# load pre-trained Word2Vec vectors, one 128-dim vector per Chinese character.
# sep must be the whitespace regex '\s+' (the scrape stripped the backslash).
df = pd.read_csv('./data/char_vec', sep='\s+', header=None, index_col=0)

# The row index is the character itself.
alphabet = df.index.values  # len is 1575
vocabulary = {ch: i for i, ch in enumerate(alphabet)}

# Add a placeholder id for out-of-vocabulary characters.
# NOTE(review): the original used the empty string '' as the placeholder key
# (possibly a stripped '<unk>'); kept as-is to preserve behavior.
vocabulary[''] = len(vocabulary)

embedding_dim = 128
embeddings = np.random.randn(len(vocabulary), embedding_dim)  # embedding matrix

# Fill the matrix with the pre-trained vectors; the extra placeholder row
# keeps its random initialization.
for i, (name, data) in enumerate(df.iterrows()):
    embeddings[i] = data.to_numpy()

# Convert characters to integer ids (this takes a while).
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
        # Process both sentence columns.
        for question in ['question1', 'question2']:
            # TODO better segmentation of Chinese characters vs. other symbols.
            # question2n -> "question numbers" representation; unknown
            # characters map to the placeholder id.
            question2n = [vocabulary.get(word, vocabulary['']) for word in row[question]]
            # Replace the text cell with its id sequence.
            dataset.at[index, question] = question2n

# Prepare training and validation data.
# Longest sequence over BOTH splits, so test sequences also fit after padding
# (the scraped original measured train_df twice instead of test_df).
max_seq_length = max(train_df['question1'].map(len).max(),
                     train_df['question2'].map(len).max(),
                     test_df['question1'].map(len).max(),
                     test_df['question2'].map(len).max())
print('max_seq_length {}'.format(max_seq_length))

# Carve a validation set out of the training data.
validation_size = 10000
training_size = len(train_df) - validation_size
questions_cols = ['question1', 'question2']
X_train, X_validation, Y_train, Y_validation = train_test_split(train_df, y_train, test_size=validation_size)

# A Siamese network consumes two inputs side by side: left and right.
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

# Pull the raw numpy representation out of pandas.
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero-pad every sequence to the same length. X_test is included so it can be
# fed to the model later — the scraped original left it unpadded.
for dataset, side in itertools.product([X_train, X_validation, X_test], ['left', 'right']):
    dataset[side] = tf.keras.preprocessing.sequence.pad_sequences(dataset[side], maxlen=max_seq_length)

# Sanity checks.
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

# Build the model.
n_hidden = 30
gradient_clipping_norm = 1.25
batch_size = 80
n_epoch = 5  # converges within the first epoch


def exponent_neg_manhattan_distance(left, right):
    """Similarity of two LSTM output vectors: exp(-L1 distance), in (0, 1]."""
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))


# Input (visible) layers.
left_input = keras.Input(shape=(max_seq_length,), dtype='float32')
right_input = keras.Input(shape=(max_seq_length,), dtype='float32')

# Frozen embedding layer initialized from the pre-trained matrix.
embedding_layer = keras.layers.Embedding(len(embeddings), embedding_dim,
                                         embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                                         input_length=max_seq_length, trainable=False)

# Embed both inputs: each character id becomes a 128-dim vector.
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Siamese network: both branches share the same LSTM weights.
shared_lstm = keras.layers.LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Distance between the two sentence vectors, as defined by the MaLSTM model.
malstm_distance = keras.layers.Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                                      output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Assemble the final model.
malstm = keras.models.Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer with gradient norm clipping.
optimizer = keras.optimizers.Adadelta(clipnorm=gradient_clipping_norm)
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Train.
training_start_time = time()
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))
print("Training time finished.{} epochs in {}".format(n_epoch,
                                                      datetime.timedelta(seconds=time() - training_start_time)))

# Plot the training history.
# Accuracy:
plt.plot(malstm_trained.history['accuracy'])
plt.plot(malstm_trained.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Loss:
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值