多模态&多目标学习-vsn+transformer

多模态:结构化数据(表格数据)+文本数据(或图片、音频)进行特征融合

多目标:共享分特征处理部分,然后分别再次全连接激活,输出多个训练目标

本文针对如上两个特点,基于 TensorFlow 2.0,实现了该网络,如下代码供参考。

# -*- coding: utf-8 -*-
"""

@Time  : 2022/7/29 16:21
@Author: Breeze
@File  : vsn+transfomer.py
"""
import pandas as pd
import os
import sys
import json

_WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(_WORK_DIR, '..'))
import time
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer

from focal_loss import sparse_categorical_focal_loss

# Fix the RNG seeds for reproducibility across NumPy and TensorFlow.
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)
print(tf.__version__)

# Plot the model graph; requires the GraphViz package.
from tensorflow.keras.utils import plot_model

# plot_model(NN, to_file='NN.png')

def WEIGHT_COLUMN_NAME_map(x):
    """Binarize the weight column: any value >= 1 maps to 1, everything else to 0."""
    return int(x >= 1)

# Load the tokenizer's added-token vocabulary (token string -> id); used by
# get_default_value below to index the 'last_click' text tokens.
with open('added_tokens.json', encoding='utf-8') as f:
    kvs = json.load(f)


def get_default_value(x):
    """Map a token string to its local vocabulary index.

    Looks up *x* in the module-level ``kvs`` mapping (loaded from
    added_tokens.json) and rebases the id by 28995 so added tokens start
    near 0.  Unknown tokens map to 0 (the padding index).
    """
    # Single lookup instead of the original get()-then-get() double lookup.
    token_id = kvs.get(x)
    if token_id:
        # 28995 is presumably the size of the base vocabulary preceding the
        # added tokens -- TODO confirm against the tokenizer config.
        return token_id - 28995
    return 0


class GatedLinearUnit(tf.keras.layers.Layer):
    """Gated Linear Unit: ``linear(x) * sigmoid_gate(x)``.

    The sigmoid branch acts as an elementwise gate that suppresses parts
    of the linear projection's output.
    """

    def __init__(self, units, **kwargs):
        super(GatedLinearUnit, self).__init__(**kwargs)
        self.units = units
        self.linear = tf.keras.layers.Dense(units)
        self.sigmoid = tf.keras.layers.Dense(units, activation='sigmoid')

    def get_config(self):
        # get_config must return JSON-serializable constructor arguments.
        # The original stored the live sub-layer objects, which breaks
        # model (de)serialization.
        config = super(GatedLinearUnit, self).get_config()
        config["units"] = self.units
        return config

    def call(self, inputs, training=None):
        return self.linear(inputs) * self.sigmoid(inputs)


class GatedResidualNetwork(tf.keras.layers.Layer):
    """Gated Residual Network (as in the Temporal Fusion Transformer).

    ELU dense -> linear dense -> dropout -> GLU, added to a (possibly
    projected) residual of the input, then layer-normalized.
    """

    def __init__(self, units, dropout_rate, **kwargs):
        super(GatedResidualNetwork, self).__init__(**kwargs)
        self.units = units
        self.dropout_rate = dropout_rate
        self.elu_dense = tf.keras.layers.Dense(units, activation='elu')
        self.linear_dense = tf.keras.layers.Dense(units)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.gated_linear_unit = GatedLinearUnit(units)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        # Used only when the input's last dimension differs from `units`.
        self.project = tf.keras.layers.Dense(units)

    def get_config(self):
        # Serialize constructor arguments only; the original stored live
        # sub-layer objects, which are not serializable.
        config = super(GatedResidualNetwork, self).get_config()
        config["units"] = self.units
        config["dropout_rate"] = self.dropout_rate
        return config

    def call(self, inputs, training=None):
        x = self.elu_dense(inputs)
        x = self.linear_dense(x)
        x = self.dropout(x, training=training)
        # Project the residual only when the feature dimension differs.
        # (The original assigned self.project = None in the else branch,
        # mutating layer state inside call() and permanently destroying the
        # projection layer after the first matching-size input.)
        if inputs.shape[-1] != self.units:
            inputs = self.project(inputs)
        x = inputs + self.gated_linear_unit(x)

        x = self.layer_norm(x)
        return x


class VariableSelection(tf.keras.layers.Layer):
    """Variable Selection Network (VSN).

    Embeds each categorical feature and linearly projects each numeric
    feature to a common ``encoding_size``.  A GRN over the concatenated
    features produces softmax selection weights; per-feature GRN outputs
    are combined by those weights, and the fused vector is projected to
    ``project_dim`` with a softmax activation.
    """

    def __init__(self,
                 category_features,
                 numeric_features,
                 encoding_size,
                 dropout_rate,
                 category_feature_vocabulary,
                 project_dim
                 ):
        super(VariableSelection, self).__init__()
        # Keep constructor arguments for get_config().
        self.encoding_size = encoding_size
        self.dropout_rate = dropout_rate
        self.category_feature_vocabulary = category_feature_vocabulary
        self.project_dim = project_dim

        # One embedding layer per categorical feature (shared across the
        # multi-task heads).  input_dim is vocab size + 1 to reserve an
        # out-of-vocabulary / padding index.
        self.category_nums = len(category_features)
        self.embeddings = []
        for feature_name in category_features:
            feature_value = category_feature_vocabulary[feature_name]
            print('构建embedding', feature_name, len(feature_value) + 1, encoding_size)
            self.embeddings.append(
                tf.keras.layers.Embedding(
                    input_dim=len(feature_value) + 1,
                    output_dim=encoding_size)
            )

        # Project each numeric feature to encoding_size with a linear layer.
        self.project_layers = [
            tf.keras.layers.Dense(units=encoding_size) for _ in numeric_features
        ]

        # Categorical features come first; call() relies on this ordering.
        self.features = category_features + numeric_features
        self.category_features = category_features
        self.numeric_features = numeric_features

        num_features = len(category_features) + len(numeric_features)
        # One GRN per feature, applied independently.
        self.grns = [
            GatedResidualNetwork(encoding_size, dropout_rate)
            for _ in range(num_features)
        ]

        # GRN over the concatenation of all features -> selection weights.
        self.grn_concat = GatedResidualNetwork(encoding_size, dropout_rate)
        self.softmax = tf.keras.layers.Dense(units=num_features, activation="softmax")

        # Final projection of the fused output to project_dim.
        # self.transfer_layer = tf.keras.layers.Dense(project_dim, activation="sigmoid")
        self.transfer_layer = tf.keras.layers.Dense(project_dim, activation="softmax")

    def get_config(self):
        # Serialize constructor arguments; the original stored live layer
        # objects, which cannot be serialized.
        config = super(VariableSelection, self).get_config()
        config.update({
            "category_features": self.category_features,
            "numeric_features": self.numeric_features,
            "encoding_size": self.encoding_size,
            "dropout_rate": self.dropout_rate,
            "category_feature_vocabulary": self.category_feature_vocabulary,
            "project_dim": self.project_dim,
        })
        return config

    def call(self, inputs, training=None, mask=None):
        """inputs: [batch, n_features] with categorical columns first."""
        encoded_features = []
        for i, feature_name in enumerate(self.features):
            if feature_name in self.category_features:
                # Categorical column -> embedding lookup.
                encoded_feature = self.embeddings[i](inputs[:, i])
            else:
                # Numeric column -> [batch, 1] -> linear projection.
                encoded_feature = tf.expand_dims(inputs[:, i], -1)
                encoded_feature = self.project_layers[i - self.category_nums](encoded_feature)
            encoded_features.append(encoded_feature)

        v = tf.keras.layers.concatenate(encoded_features)
        # Attention-style fusion over all features.
        v = self.grn_concat(v, training=training)
        # Selection weights with a trailing axis: [batch, num_features, 1].
        v = tf.expand_dims(self.softmax(v), axis=-1)

        x = []
        for idx, feature in enumerate(encoded_features):  # avoid shadowing builtin `input`
            x.append(self.grns[idx](feature, training=training))

        x = tf.stack(x, axis=1)  # [batch, num_features, encoding_size]
        # Weighted sum over features -> [batch, encoding_size].
        outputs = tf.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)
        # Project to project_dim.
        outputs = self.transfer_layer(outputs, training=training)
        return outputs


class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        # Dimensionality of each head's sub-space.
        self.projection_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Scaled dot-product attention; returns (context, weights)."""
        raw_scores = tf.matmul(query, key, transpose_b=True)
        scale = tf.math.sqrt(tf.cast(tf.shape(key)[-1], tf.float32))
        weights = tf.nn.softmax(raw_scores / scale, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed) -> (batch, heads, seq, head_dim)."""
        split = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(split, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs: (batch_size, seq_len, embedding_dim)
        batch_size = tf.shape(inputs)[0]
        # Project and split each of Q/K/V into per-head sub-spaces:
        # (batch_size, num_heads, seq_len, projection_dim).
        q = self.separate_heads(self.query_dense(inputs), batch_size)
        k = self.separate_heads(self.key_dense(inputs), batch_size)
        v = self.separate_heads(self.value_dense(inputs), batch_size)
        context, _ = self.attention(q, k, v)
        # Merge the heads back into a single embedding dimension.
        context = tf.transpose(context, perm=[0, 2, 1, 3])  # (batch, seq, heads, head_dim)
        merged = tf.reshape(context, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)  # (batch_size, seq_len, embed_dim)


class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block.

    Self-attention followed by a position-wise feed-forward network, each
    with dropout, a residual connection and layer normalization.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim), ]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        # `training` previously had no default, so the layer could not be
        # called without explicitly passing it; defaulting to None matches
        # the Keras Layer.call convention and stays backward compatible.
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Transformer input encoding: learned token embeddings plus learned
    position embeddings, summed elementwise."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        # Position embeddings broadcast across the batch dimension.
        return self.token_emb(x) + self.pos_emb(position_ids)


class Trainer(tf.keras.Model):
    """Multi-modal, multi-task model plus its training harness.

    Fuses a VariableSelection network over tabular (categorical + numeric)
    features with a Transformer encoder over the tokenized 'last_click'
    text column, then feeds the concatenated representation into two
    independent dense heads (one softmax output per training target).
    Data loading, the custom train/test steps and the epoch loop all live
    here.
    """

    def __init__(self, pretrain_path,
                 train_data_path,
                 test_data_path,
                 save_dir,
                 labels=[],
                 category_features=[],
                 numeric_features=[],
                 encoding_size=10,
                 dropout_rate=0.5,
                 category_feature_vocabulary=[],
                 project_dim=10,
                 max_len=128
                 ):
        # NOTE(review): mutable default arguments ([]) are shared between
        # calls; harmless here only as long as they are never mutated.
        # `super(YourClass, self).__init__()`. Always start with this line.
        super(Trainer, self).__init__()
        self.max_len = max_len
        self.save_dir = save_dir
        self.embed_dim = 32  # Embedding size for each token
        self.num_heads = 2  # Number of attention heads
        self.ff_dim = 32  # Hidden layer size in feed forward network inside transformer
        self.vocab_size = 400
        self.optimizer = None  # created in train()

        def click_map(x):
            # Convert a comma-separated token string into a (1, max_len)
            # float array of vocabulary ids, padded/truncated to max_len.
            # input_embedding = tf.nn.embedding_lookup(W, input_x)
            words = str(x).split(',')
            words_idx = list(map(lambda word:get_default_value(word),words))

            pad_sequences =tf.keras.preprocessing.sequence.pad_sequences([words_idx],
                                                       maxlen=max_len,
                                                       dtype='float64',
                                                       padding='post',
                                                       truncating='pre',
                                                       value=0.)

            return pad_sequences[0].reshape(1,max_len)

        # Load training data: tabular columns and tokenized text columns are
        # concatenated so that a single tensor feeds call().
        train_data = pd.read_csv(train_data_path)
        self.x_train = train_data[category_features + numeric_features].values
        self.x_train_txt = train_data['last_click'].apply(click_map).values
        # df1 = self.x_train_txt.apply(pd.Series, index=[f'col{i}' for i in range(max_len)])
        self.x_train_txt = np.array([e for e in self.x_train_txt ]).reshape(len(train_data), max_len)
        self.x_train = np.concatenate([self.x_train, self.x_train_txt], axis=1)
        self.y_train = train_data[labels[0]].values
        # Second target is binarized (>= 1 -> 1) via WEIGHT_COLUMN_NAME_map.
        self.y2_train = train_data[labels[1]].apply(WEIGHT_COLUMN_NAME_map).values
        # print(train_data.info())

        test_data = pd.read_csv(test_data_path)
        # NOTE(review): unlike the train path above, `.values` is not taken
        # here; np.concatenate still accepts the DataFrame — confirm intended.
        self.x_test = test_data[category_features + numeric_features]
        self.x_test_txt = test_data['last_click'].apply(click_map).values
        self.x_test_txt = np.array([e for e in self.x_test_txt]).reshape(len(self.x_test_txt), max_len)
        self.x_test = np.concatenate([self.x_test, self.x_test_txt], axis=1)

        self.y_test = test_data[labels[0]].values
        self.y2_test = test_data[labels[1]].apply(WEIGHT_COLUMN_NAME_map).values
        # print(test_data.info())

        del train_data, test_data

        # Per-task loss/AUC metrics, reset per epoch and per evaluation.

        self.train_loss_1 = tf.keras.metrics.Mean(name='train_loss_1')
        self.train_auc_1 = tf.keras.metrics.AUC(name='train_auc_1')
        self.test_loss_1 = tf.keras.metrics.Mean(name='test_loss_1')
        self.test_auc_1 = tf.keras.metrics.AUC(name='test_auc_1')

        self.train_loss_2 = tf.keras.metrics.Mean(name='train_loss_2')
        self.train_auc_2 = tf.keras.metrics.AUC(name='train_auc_2')
        self.test_loss_2 = tf.keras.metrics.Mean(name='test_loss_2')
        self.test_auc_2 = tf.keras.metrics.AUC(name='test_auc_2')
        # Shared feature-extraction module over the tabular features.
        self.model = VariableSelection(category_features,
                                       numeric_features,
                                       encoding_size,
                                       dropout_rate,
                                       category_feature_vocabulary,
                                       project_dim)

        # Token + position embedding for the text branch.
        self.embedding_layer = TokenAndPositionEmbedding(self.max_len, self.vocab_size, embed_dim=self.embed_dim)
        self.transformer_block = TransformerBlock(embed_dim=self.embed_dim, num_heads=self.num_heads, ff_dim=self.ff_dim)
        # Head for target 1.
        # NOTE(review): name='out1' is reused by h14 below (and 'out2' by
        # h21/h24); duplicate layer names can break model serialization.
        self.h11 = tf.keras.layers.Dense(units=16, activation="relu"
                                         , name='out1'
                                         , kernel_regularizer=tf.keras.regularizers.l1(0.01)
                                         , bias_regularizer=tf.keras.regularizers.l1(0.01)
                                         , activity_regularizer=tf.keras.regularizers.l1(0.01)
                                         )
        self.h12 = tf.keras.layers.Dense(8, activation='relu')
        self.h13 = tf.keras.layers.Dense(8, activation='relu')
        # sigmoid pairs with cross_entropy; softmax pairs with
        # sparse_categorical_focal_loss (used in train_step below).
        self.h14 = tf.keras.layers.Dense(project_dim, activation='softmax', name='out1')

        # Head for target 2.
        self.h21 = tf.keras.layers.Dense(units=16, activation="relu"
                                         , name='out2'
                                         , kernel_regularizer=tf.keras.regularizers.l1(0.01)
                                         , bias_regularizer=tf.keras.regularizers.l1(0.01)
                                         , activity_regularizer=tf.keras.regularizers.l1(0.01)
                                         )
        self.h22 = tf.keras.layers.Dense(8, activation='relu')
        self.h23 = tf.keras.layers.Dense(8, activation='relu')
        self.h24 = tf.keras.layers.Dense(project_dim, activation='softmax', name='out2')

        # Post-transformer pooling / hidden layers for the text branch.

        self.tf_l1 = tf.keras.layers.GlobalAveragePooling1D()
        self.tf_d1 = tf.keras.layers.Dropout(0.1)
        self.tf_l2 = tf.keras.layers.Dense(20, activation="relu")
        self.tf_d2 = tf.keras.layers.Dropout(0.1)
        self.tf_l3 = tf.keras.layers.Dense(10, activation="relu")

    def call(self, inputs, training=None, mask=None):
        """Forward pass.

        inputs: [batch, n_tabular + max_len]; the last max_len columns are
        the tokenized text, the remaining leading columns are tabular.
        Returns (out1, out2), one softmax distribution per task.
        """
        click_inputs = inputs[:, -self.max_len:]
        # click_inputs_transpose = tf.transpose(click_inputs)
        embeddings = self.embedding_layer(click_inputs)
        transformer_out = self.transformer_block(embeddings, training)

        transformer_out = self.tf_l1(transformer_out)
        transformer_out = self.tf_d1(transformer_out)
        transformer_out = self.tf_l2(transformer_out)
        transformer_out = self.tf_d2(transformer_out)
        transformer_out = self.tf_l3(transformer_out)

        vsn_out = self.model(inputs[:,:-self.max_len], training, mask)
        # Fuse text and tabular representations.
        out = tf.concat([transformer_out, vsn_out], axis=1)

        out1 = self.h11(out, training=training)
        out1 = self.h12(out1, training=training)
        out1 = self.h13(out1, training=training)
        out1 = self.h14(out1, training=training)

        out2 = self.h21(out, training=training)
        out2 = self.h22(out2, training=training)
        out2 = self.h23(out2, training=training)
        out2 = self.h24(out2, training=training)
        return out1, out2

    # Could be wrapped with @tf.function (autograph) for graph-mode speed.
    # @tf.function
    def train_step(self, x, y1, y2, steps):
        """One optimization step on a batch; returns the summed focal loss.

        NOTE(review): `steps` is unused.  Gradients are computed and applied
        inside the GradientTape context — it works, but conventionally both
        belong outside the `with` block.  pred[:, 1] in the AUC updates
        assumes project_dim == 2.
        """
        # print('train_step:',x, y1, y2)
        with tf.GradientTape() as tape:
            # print('GradientTape:', x, y)
            pred1, pred2 = self(x, training=True)
            # loss = tf.keras.losses.sparse_categorical_crossentropy(y, pred)
            loss1 = sparse_categorical_focal_loss(y1, pred1, gamma=2)
            loss2 = sparse_categorical_focal_loss(y2, pred2, gamma=2)
            # Compute gradients of the combined multi-task loss.
            gradients = tape.gradient(loss1 + loss2, self.trainable_variables,
                                      unconnected_gradients=tf.UnconnectedGradients.ZERO)

            # Update weights
            self.optimizer.apply_gradients(grads_and_vars=zip(gradients, self.trainable_variables))

            # Update metrics (includes the metric that tracks the loss)
            self.train_loss_1.update_state(loss1)
            self.train_auc_1.update_state(y1, pred1[:, 1])

            self.train_loss_2.update_state(loss2)
            self.train_auc_2.update_state(y2, pred2[:, 1])
            return loss1 + loss2

    # @tf.function  # autograph graph-mode acceleration (disabled)
    def test_step(self, x, y1, y2):
        """Evaluate one batch: accumulate loss and AUC metrics for both tasks."""
        pred1, pred2 = self.call(x, training=False)
        # loss = tf.keras.losses.sparse_categorical_crossentropy(y, pred)
        loss_1 = sparse_categorical_focal_loss(y1, pred1, gamma=2)
        loss_2 = sparse_categorical_focal_loss(y2, pred2, gamma=2)

        # transfer predictions
        self.test_auc_1.update_state(y1, pred1[:, 1])
        self.test_loss_1.update_state(loss_1)
        self.test_auc_2.update_state(y2, pred2[:, 1])
        self.test_loss_2.update_state(loss_2)

    # @tf.function
    def train(self,
              epochs,
              batch_size=16,
              lr=2e-5,
              evaluation_steps=100
              ):
        """Run the training loop.

        Logs train metrics every `evaluation_steps // 5` steps, evaluates
        on the full test set every `evaluation_steps` steps, and tracks the
        best task-1 test AUC (the actual model save is commented out).
        """
        print('start train model ...')
        self.optimizer = tf.keras.optimizers.Adam(lr)

        train_data = tf.data.Dataset.from_tensor_slices((self.x_train, (self.y_train, self.y2_train))) \
            .shuffle(len(self.y_train)).batch(batch_size)
        test_data = tf.data.Dataset.from_tensor_slices((self.x_test, (self.y_test, self.y2_test))).batch(batch_size)

        def benchmark(dataset, num_epochs=2):
            # Rough input-pipeline benchmark; unused (see commented calls).
            start_time = time.perf_counter()
            for epoch_num in range(num_epochs):
                for sample in dataset:
                    # Performing a training step
                    time.sleep(0.01)
            print("Execution time:", time.perf_counter() - start_time)
        # benchmark(train_data,20)
        # benchmark(test_data,20)

        best_auc = 0.
        step = 0
        for epoch in range(epochs):
            self.train_loss_1.reset_states()
            self.train_loss_2.reset_states()
            self.train_auc_1.reset_states()
            self.train_auc_2.reset_states()

            start = time.time()
            cnt = 0
            for x, (y1, y2) in train_data:
                cnt += 1
                # print('cnt is:', cnt, x, y)
                self.train_step(x, y1, y2, cnt)
                step += 1
                if step > 0 and step % (evaluation_steps // 5) == 0:
                    print("Epoch {} step {} train Loss1 {}  Loss2 {}".format(epoch + 1, step
                                                                             , self.train_loss_1.result()
                                                                             , self.train_loss_2.result()))
                    print("Epoch {} step {} train auc1 {}   auc2  {}".format(epoch + 1, step
                                                                             , self.train_auc_1.result()
                                                                             , self.train_auc_2.result()))
                # Periodic evaluation on the full test set.
                if step > 0 and step % evaluation_steps == 0:
                    self.test_auc_1.reset_states()
                    self.test_auc_2.reset_states()
                    self.test_loss_1.reset_states()
                    self.test_loss_2.reset_states()
                    for x, (y1, y2) in test_data:
                        self.test_step(x, y1, y2)
                    cur_auc = self.test_auc_1.result()
                    cur_auc2 = self.test_auc_2.result()
                    print("Epoch {} Step {} test loss1 {} loss2 {}".format(epoch + 1, step
                                                                           , self.test_loss_1.result()
                                                                           , self.test_loss_2.result()))
                    print("Epoch {} Step {} test AUC1 {}  AUC2 {}".format(epoch + 1, step, cur_auc, cur_auc2 ))
                    # Best-model tracking uses task-1 AUC only.
                    if cur_auc > best_auc:
                        best_auc = cur_auc
                        print("save model...")
                        # tf.saved_model.save(self, self.save_dir)
                    print("best auc {}".format(best_auc))
            # Elapsed time for this epoch.
            delta_t = time.time() - start
            h = int(delta_t // 3600)
            m = int((delta_t - 3600 * h) // 60)
            s = int(delta_t % 60)
            print("Epoch {} time exhaust: {}h-{}m-{}s".format(epoch + 1, h, m, s))


if __name__ == '__main__':
    # Target feature name (task-1 label column).
    TARGET_FEATURE_NAME = "target_label_2"
    # Weight column name (task-2 label column; binarized by WEIGHT_COLUMN_NAME_map).
    WEIGHT_COLUMN_NAME = "paid_installment_no"

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--pretrained_path', default='./data/pretrained_model/vsn')
    parser.add_argument('--train_data_path', default='./train_data.csv')
    parser.add_argument('--test_data_path', default='./test_data.csv')
    # nargs=2 so command-line use yields a two-element list; without it the
    # CLI produced a single string, silently breaking labels[0]/labels[1].
    parser.add_argument('--label', nargs=2, default=[TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME])

    parser.add_argument('--save_dir', default='./data/model')
    parser.add_argument('--lr', default=0.002, type=float)
    parser.add_argument('--batch_size', default=1024, type=int)
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--evaluation_steps', default=500, type=int)
    parser.add_argument('--num_warmup_steps', default=20, type=int)
    parser.add_argument('--weight_decay_rate', default=0.01, type=float)

    args = parser.parse_args()

    # Parse "feature|{'k': 'v', ...}" lines into feature -> vocabulary lists.
    top_n_obj_cols_a = {}
    with open('./data/obj_feature_value_A_biz.txt', encoding='utf-8') as f:
        for line in f.readlines():
            # maxsplit=1 tolerates '|' characters inside the value dict.
            key, value_dict = line.split("|", 1)
            kv = json.loads(value_dict.replace("'", "\""))
            top_n_obj_cols_a[key] = list(kv.values())
            # print(key,top_n_obj_cols[key])

    from data_util import digt_cols, obj_cols

    category_features = obj_cols
    numeric_features = digt_cols
    encoding_size = 10
    dropout_rate = 0.2
    category_feature_vocabulary = top_n_obj_cols_a
    project_dim = 2  # binary heads; Trainer's AUC metrics read pred[:, 1]
    trainer = Trainer(args.pretrained_path,
                      train_data_path=args.train_data_path,
                      test_data_path=args.test_data_path,
                      save_dir=args.save_dir,
                      labels=args.label,
                      category_features=category_features,
                      numeric_features=numeric_features,
                      encoding_size=encoding_size,
                      dropout_rate=dropout_rate,
                      category_feature_vocabulary=category_feature_vocabulary,
                      project_dim=project_dim
                      )

    trainer.train(epochs=args.epochs, batch_size=args.batch_size, lr=args.lr, evaluation_steps=args.evaluation_steps)
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

mtj66

看心情

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值