《Multimodal Fusion Transformer for Remote Sensing Image Classification》论文代码详解


导入需要的库

import sys
# 导入Python的sys模块,用于与Python解释器进行交互,通常用于操作命令行参数、模块搜索路径等。

sys.path.append("./../")
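# Append "./../" to the module search path so that modules in the parent directory can be imported. Note that the relative path is resolved against the current working directory, not against the location of this script.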

from torch.nn import LayerNorm, Linear, Dropout, Softmax
# 从PyTorch的神经网络模块中导入LayerNorm(层归一化)、Linear(全连接层)、Dropout(随机失活)和Softmax(Softmax激活函数)。

from einops import rearrange, repeat
# 从einops库中导入rearrange和repeat函数,einops是一个用于操作张量的库,rearrange用于重排张量的维度,repeat用于重复张量的元素。

import copy
# 导入Python的copy模块,用于复制对象,包括浅拷贝和深拷贝。

from timm.models.layers import DropPath, trunc_normal_
# 从timm库的models.layers模块中导入DropPath(随机路径丢弃)和trunc_normal_(截断正态分布初始化)。timm是一个用于深度学习的库,提供了许多预训练模型和实用工具。

from pathlib import Path
# 从Python的pathlib模块中导入Path类,用于处理文件路径,提供了面向对象的路径操作方式。

import re
# 导入Python的re模块,用于正则表达式操作,通常用于字符串的匹配和替换。

import torch.backends.cudnn as cudnn
# 导入PyTorch的cudnn模块,用于配置CUDA的cuDNN库,通常用于设置cuDNN的性能优化选项。

import record
# 导入record模块,假设这是一个自定义模块,用于记录某些信息或数据。

import matplotlib.pyplot as plt
# 导入matplotlib库的pyplot模块,用于绘制图表,通常用于数据可视化。

from torchsummary import summary
# 从torchsummary库中导入summary函数,用于打印PyTorch模型的结构和参数信息。

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score
# 从scikit-learn库的metrics模块中导入confusion_matrix(混淆矩阵)、accuracy_score(准确率)、classification_report(分类报告)和cohen_kappa_score(Cohen's Kappa系数),用于评估分类模型的性能。

from operator import truediv
# 从Python的operator模块中导入truediv函数,用于执行精确除法(浮点数除法)。

import math
# 导入Python的math模块,提供了许多数学函数和常量。

from PIL import Image
# 从PIL库中导入Image类,用于处理图像,PIL是Python Imaging Library的缩写。

import time
# 导入Python的time模块,用于处理时间相关的操作,如获取当前时间、延时等。

import torchvision.transforms.functional as TF
# 从torchvision库的transforms模块中导入functional子模块,并命名为TF,提供了许多图像变换的函数。

from torch.nn.parameter import Parameter
# 从PyTorch的nn.parameter模块中导入Parameter类,用于定义可训练的参数。

from sklearn.decomposition import PCA
# 从scikit-learn库的decomposition模块中导入PCA类,用于主成分分析(Principal Component Analysis)。

from scipy.io import loadmat as loadmat
# 从scipy库的io模块中导入loadmat函数,并命名为loadmat,用于加载MATLAB格式的数据文件。

from scipy import io
# 从scipy库中导入io模块,提供了输入输出操作的函数。

import torch.utils.data as dataf
# 从PyTorch的utils.data模块中导入dataf,通常用于创建数据集和数据加载器。

import torch.nn as nn
# 从PyTorch的nn模块中导入nn,提供了构建神经网络的各种层和函数。

import torch
# 导入PyTorch库,提供了张量操作、自动微分、神经网络构建等功能。

import torch.nn.functional as F
# 从PyTorch的nn.functional模块中导入F,提供了许多神经网络的函数,如激活函数、损失函数等。

from torch import einsum
# 从PyTorch库中导入einsum函数,用于爱因斯坦求和约定,可以高效地进行张量操作。

import random
# 导入Python的random模块,用于生成随机数和进行随机操作。

import numpy as np
# 导入NumPy库,提供了科学计算的基础功能,如多维数组操作、线性代数、傅里叶变换等。

import os
# 导入Python的os模块,提供了与操作系统交互的功能,如文件和目录操作。

cudnn.deterministic = True
# 设置cudnn的deterministic参数为True,表示使用确定性的算法,即每次运行结果相同,但可能会牺牲一些性能。

cudnn.benchmark = False
# 设置cudnn的benchmark参数为False,表示不使用自动调优的算法,通常在需要确定性结果时设置为False。

创建模型 - MFT WITH CHANNEL TOKENIZATION

# MFT WITH CHANNEL TOKENIZATION

from torch.nn import LayerNorm, Linear, Dropout, Softmax
# 从PyTorch的神经网络模块中导入LayerNorm(层归一化)、Linear(全连接层)、Dropout(随机失活)和Softmax(Softmax激活函数)。

import copy
# 导入Python的copy模块,用于复制对象,包括浅拷贝和深拷贝。

def INF(B, H, W, device=None):
    """Build a stack of square masks with ``-inf`` on the main diagonal.

    Args:
        B: First factor of the number of masks.
        H: Side length of each square mask.
        W: Second factor of the number of masks.
        device: Device on which to create the tensor. When omitted, CUDA is
            used if it is available (the original behaviour) and the CPU
            otherwise, so the helper no longer crashes on CPU-only machines.

    Returns:
        A tensor of shape ``(B * W, H, H)`` whose diagonal entries are
        ``-inf`` and whose off-diagonal entries are zero.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    diagonal = torch.full((H,), float("inf"), device=device)
    # Negate after building the diagonal matrix so only the diagonal is -inf.
    return -torch.diag(diagonal, 0).unsqueeze(0).repeat(B * W, 1, 1)
# 定义一个INF函数,返回一个形状为(B*W, H, H)的张量,其中对角线元素为负无穷大,用于在注意力机制中避免自注意力。

class HetConv(nn.Module):
    """Heterogeneous convolution: a grouped k x k conv plus a grouped 1 x 1 conv.

    Both branches read the same input and their outputs are summed, so they
    must produce feature maps of the same spatial size.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels of each branch.
        kernel_size: Integer kernel size of the group-wise branch. Odd values
            are expected so that the two branches agree in spatial size.
        stride: Stride shared by both branches.
        padding: Accepted for signature compatibility and ignored. The 1 x 1
            branch uses no padding, so the group-wise branch always uses
            "same" padding of ``kernel_size // 2``.
        bias: ``None`` keeps the ``nn.Conv2d`` default (bias enabled);
            any other value is converted to ``bool`` and applied to both
            branches.
        p: Number of groups of the point-wise (1 x 1) branch.
        g: Number of groups of the group-wise (k x k) branch.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=None, bias=None, p=64, g=64):
        super(HetConv, self).__init__()
        use_bias = True if bias is None else bool(bias)
        # kernel_size // 2 keeps the spatial size for every odd kernel; the
        # former kernel_size // 3 only did so for kernel sizes 1 and 3.
        same_padding = kernel_size // 2
        # Groupwise Convolution
        self.gwc = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, groups=g,
                             padding=same_padding, stride=stride, bias=use_bias)
        # Pointwise Convolution
        self.pwc = nn.Conv2d(in_channels, out_channels, kernel_size=1, groups=p,
                             stride=stride, bias=use_bias)

    def forward(self, x):
        """Return the element-wise sum of the group-wise and point-wise branches."""
        return self.gwc(x) + self.pwc(x)
# 定义一个异构卷积层(HetConv),包含组卷积(Groupwise Convolution)和逐点卷积(Pointwise Convolution),并在前向传播中将两者相加。

class MCrossAttention(nn.Module):
    """Multi-head attention in which only the first token acts as the query.

    The input channels are split into ``num_heads`` slices of
    ``dim // num_heads`` features. Each slice is projected up to ``dim``
    features by ``wq`` / ``wk`` / ``wv``, so the concatenated heads carry
    ``dim * num_heads`` features before ``proj`` maps them back to ``dim``.

    Args:
        dim: Channel width of the incoming tokens.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
        qk_scale: Optional score scale; a falsy value selects
            ``(dim // num_heads) ** -0.5``.
        attn_drop: Accepted but not used by this implementation.
        proj_drop: Dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.1, proj_drop=0.1):
        super().__init__()
        per_head = dim // num_heads
        self.num_heads = num_heads
        self.scale = qk_scale or per_head ** -0.5

        # Each projection lifts one head slice (per_head features) to dim features.
        self.wq = nn.Linear(per_head, dim, bias=qkv_bias)
        self.wk = nn.Linear(per_head, dim, bias=qkv_bias)
        self.wv = nn.Linear(per_head, dim, bias=qkv_bias)
        # The heads are concatenated, hence dim * num_heads input features.
        self.proj = nn.Linear(dim * num_heads, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend from token 0 to all tokens.

        Args:
            x: Tensor of shape ``(B, N, C)``.

        Returns:
            Tensor of shape ``(B, 1, C)`` when ``C`` equals the ``dim`` the
            module was built with.
        """
        batch, _, channels = x.shape
        heads = self.num_heads
        per_head = channels // heads

        def split_heads(tokens):
            # (B, n, C) -> (B, n, heads, C / heads)
            return tokens.reshape(batch, tokens.shape[1], heads, per_head)

        all_tokens = split_heads(x)
        # After the projections the last axis has `dim` features: (B, heads, n, dim).
        query = self.wq(split_heads(x[:, 0:1, ...])).permute(0, 2, 1, 3)
        key = self.wk(all_tokens).permute(0, 2, 1, 3)
        value = self.wv(all_tokens).permute(0, 2, 1, 3)

        # (B, heads, 1, N) attention weights of the first token over all tokens.
        weights = torch.einsum('bhid,bhjd->bhij', query, key) * self.scale
        weights = weights.softmax(dim=-1)

        # (B, heads, 1, dim) -> (B, 1, heads, dim) -> (B, 1, heads * dim)
        fused = torch.einsum('bhij,bhjd->bhid', weights, value).transpose(1, 2)
        fused = fused.reshape(batch, 1, channels * heads)
        return self.proj_drop(self.proj(fused))
# 定义一个多头交叉注意力机制(MCrossAttention),包含查询(q)、键(k)和值(v)的线性变换,以及注意力分数的计算和值的加权求和。

class Mlp(nn.Module):
    """Two-layer feed-forward network: ``dim -> 512 -> dim`` with GELU and dropout.

    Args:
        dim: Size of the input and output feature axis.
    """

    def __init__(self, dim):
        super().__init__()
        self.fc1 = Linear(dim, 512)
        self.fc2 = Linear(512, dim)
        self.act_fn = nn.GELU()
        self.dropout = Dropout(0.1)  # shared by both dropout sites in forward()

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and near-zero (std 1e-6) normal biases."""
        linear_layers = (self.fc1, self.fc2)
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
        for linear in linear_layers:
            nn.init.normal_(linear.bias, std=1e-6)

    def forward(self, x):
        """Apply fc1 -> GELU -> dropout -> fc2 -> dropout along the last axis."""
        hidden = self.dropout(self.act_fn(self.fc1(x)))
        return self.dropout(self.fc2(hidden))
# 定义一个多层感知机(Mlp),包含两个全连接层和一个GELU激活函数,以及权重初始化方法。

class Block(nn.Module):
    """Pre-norm transformer block: cross-attention followed by an MLP.

    Each sub-layer is preceded by a LayerNorm and wrapped in a residual
    connection.

    Args:
        dim: Channel width of the tokens.
    """

    def __init__(self, dim):
        super().__init__()
        self.hidden_size = dim
        self.attention_norm = LayerNorm(dim, eps=1e-6)
        self.ffn_norm = LayerNorm(dim, eps=1e-6)
        self.ffn = Mlp(dim)
        self.attn = MCrossAttention(dim=dim)

    def forward(self, x):
        """Run the attention and feed-forward sub-layers with residuals.

        MCrossAttention returns one token per sample, shape ``(B, 1, dim)``,
        so the first residual addition broadcasts that update over every
        token of ``x``.
        """
        x = x + self.attn(self.attention_norm(x))
        return x + self.ffn(self.ffn_norm(x))
# 定义一个Transformer块(Block),包含多头交叉注意力机制和前馈神经网络(FFN),以及层归一化。

class TransformerEncoder(nn.Module):
    """Stack of two ``Block`` layers followed by a final LayerNorm.

    Args:
        dim: Channel width of the tokens.
        num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path,
        act_layer, norm_layer, has_mlp: Accepted for signature compatibility;
            none of them is read by this implementation.
    """

    def __init__(self, dim, num_heads=8, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.1, attn_drop=0.1,
                 drop_path=0.1, act_layer=nn.GELU, norm_layer=nn.LayerNorm, has_mlp=False):
        super().__init__()
        depth = 2
        self.layer = nn.ModuleList()
        self.encoder_norm = LayerNorm(dim, eps=1e-6)
        # Every Block is built fresh, so each layer owns independent weights.
        self.layer.extend([Block(dim) for _ in range(depth)])

    def forward(self, x):
        """Run all blocks, normalise, and return the first token of each sample."""
        for block in self.layer:
            x = block(x)
        return self.encoder_norm(x)[:, 0]
# 定义一个Transformer编码器(TransformerEncoder),包含多个Transformer块和一个层归一化。

class MFT(nn.Module):
    """Multimodal Fusion Transformer with channel tokenization.

    The hyperspectral (HSI) patch goes through a 3-D convolution and a
    heterogeneous 2-D convolution and is then summarised into 4 tokens. The
    second modality (e.g. LiDAR) goes through a 2-D convolution and is
    summarised into 1 token. The 5 tokens receive learned position
    embeddings, pass through the transformer encoder, and the encoder output
    is classified by a linear layer.

    Changes compared with the previous revision:
      * the embedding width is ``FM * 4`` everywhere (it used to be hard-coded
        to 64 in the tokenizer weights and the second-modality convolution,
        which only matched when ``FM == 16``);
      * the patch side length is derived from the second input instead of
        being read from a module-level ``patchsize`` global.

    Args:
        FM: Base feature-map width; the embedding width is ``FM * 4``.
        NC: Number of HSI bands. Must exceed 8 because the 3-D convolution
            uses a depth kernel of 9 without depth padding.
        NCLidar: Number of channels of the second modality.
        Classes: Number of output classes.
        HSIOnly: Stored on the instance; ``forward`` does not read it.
    """

    def __init__(self, FM, NC, NCLidar, Classes, HSIOnly):
        super(MFT, self).__init__()
        self.HSIOnly = HSIOnly
        embed_dim = FM * 4
        hsi_channels = 8 * (NC - 8)  # 8 filters x (NC - 8) remaining bands after conv5

        self.conv5 = nn.Sequential(
            nn.Conv3d(1, 8, (9, 3, 3), padding=(0, 1, 1), stride=1),
            nn.BatchNorm3d(8),
            nn.ReLU()
        )

        self.conv6 = nn.Sequential(
            HetConv(hsi_channels, embed_dim,
                    p=1,
                    g=embed_dim // 4 if hsi_channels % FM == 0 else embed_dim // 8,
                    ),
            nn.BatchNorm2d(embed_dim),
            nn.ReLU()
        )

        # Kept for compatibility; not read anywhere in this class.
        self.last_BandSize = NC // 2 // 2 // 2

        self.lidarConv = nn.Sequential(
            nn.Conv2d(NCLidar, embed_dim, 3, 1, 1),
            nn.BatchNorm2d(embed_dim),
            nn.GELU()
        )
        self.ca = TransformerEncoder(embed_dim)
        self.out3 = nn.Linear(embed_dim, Classes)
        # 1 second-modality token + 4 HSI tokens.
        self.position_embeddings = nn.Parameter(torch.randn(1, 4 + 1, embed_dim))
        self.dropout = nn.Dropout(0.1)
        torch.nn.init.xavier_uniform_(self.out3.weight)
        torch.nn.init.normal_(self.out3.bias, std=1e-6)

        # Tokenizer weights for the HSI branch (4 tokens).
        self.token_wA = nn.Parameter(torch.empty(1, 4, embed_dim),
                                     requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wA)
        self.token_wV = nn.Parameter(torch.empty(1, embed_dim, embed_dim),
                                     requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wV)

        # Tokenizer weights for the second-modality branch (1 token).
        self.token_wA_L = nn.Parameter(torch.empty(1, 1, embed_dim),
                                       requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wA_L)
        self.token_wV_L = nn.Parameter(torch.empty(1, embed_dim, embed_dim),
                                       requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wV_L)

    def _patch_side(self, x2):
        """Return the side length P of the square patches contained in ``x2``.

        Each sample of ``x2`` holds ``NCLidar * P * P`` values, and the channel
        count is known from the first layer of ``lidarConv``.
        """
        channels = self.lidarConv[0].in_channels
        per_sample = x2[0].numel()
        side = int(round((per_sample // channels) ** 0.5))
        if side * side * channels != per_sample:
            raise ValueError(
                "second input has %d values per sample, which is not "
                "channels (%d) times a square patch" % (per_sample, channels))
        return side

    @staticmethod
    def _tokenize(feats, w_attn, w_value):
        """Summarise per-pixel features into a few tokens.

        Args:
            feats: ``(B, P*P, C)`` per-pixel features.
            w_attn: ``(1, T, C)`` weights, one row per output token.
            w_value: ``(1, C, C)`` value projection.

        Returns:
            ``(B, T, C)`` tokens; each is a softmax-weighted (over pixels)
            sum of the projected pixel features.
        """
        batch = feats.shape[0]
        w_attn = rearrange(w_attn.expand(batch, -1, -1), 'b h w -> b w h')  # (B, C, T)
        scores = torch.einsum('bij,bjk->bik', feats, w_attn)                # (B, P*P, T)
        scores = rearrange(scores, 'b h w -> b w h').softmax(dim=-1)        # (B, T, P*P)
        values = torch.einsum('bij,bjk->bik', feats, w_value.expand(batch, -1, -1))
        return torch.einsum('bij,bjk->bik', scores, values)

    def forward(self, x1, x2):
        """Classify a batch of paired patches.

        Args:
            x1: HSI patches holding ``NC * P * P`` values per sample,
                e.g. shape ``(B, NC, P*P)``.
            x2: Second-modality patches holding ``NCLidar * P * P`` values
                per sample, e.g. shape ``(B, NCLidar, P*P)``.

        Returns:
            ``(B, Classes)`` class scores.
        """
        batch = x1.shape[0]
        side = self._patch_side(x2)

        # HSI branch: (B, 1, NC, P, P) -> conv5 -> (B, 8, NC - 8, P, P)
        hsi = self.conv5(x1.reshape(batch, -1, side, side).unsqueeze(1))
        # Fold filters and bands into channels, then conv6 -> (B, FM*4, P, P).
        hsi = self.conv6(hsi.reshape(batch, -1, side, side))
        hsi = hsi.flatten(2).transpose(-1, -2)  # (B, P*P, FM*4)

        # Second-modality branch: (B, NCLidar, P, P) -> (B, P*P, FM*4)
        aux = self.lidarConv(x2.reshape(x2.shape[0], -1, side, side))
        aux = aux.reshape(aux.shape[0], -1, side ** 2).transpose(-1, -2)

        aux_token = self._tokenize(aux, self.token_wA_L, self.token_wV_L)  # (B, 1, FM*4)
        hsi_tokens = self._tokenize(hsi, self.token_wA, self.token_wV)     # (B, 4, FM*4)

        # The second-modality token comes first; the encoder returns token 0.
        tokens = torch.cat((aux_token, hsi_tokens), dim=1)
        embeddings = self.dropout(tokens + self.position_embeddings)
        encoded = self.ca(embeddings)
        return self.out3(encoded.reshape(encoded.shape[0], -1))
# 定义一个多模态融合Transformer(MFT)模型,包含多个卷积层、Transformer编码器和最终的分类层。

# Mini-batch size.
batchsize = 64

# Side length of the square input patches (11 x 11 pixels).
patchsize = 11

# Build an MFT with FM=16, NC=144, NCLidar=1, Classes=15, HSIOnly=False and move it to the GPU.
model = MFT(16, 144, 1, 15, False).to("cuda")

# Print a layer-by-layer summary for two inputs with per-sample shapes
# (144, 121) and (1, 121); 121 = 11 * 11 flattened patch pixels.
summary(model, [(144, 121), (1, 121)], device='cuda')
# Summary of this cell: set the batch and patch sizes, instantiate MFT on the GPU and print the model summary.

模型训练

DATASETS_WITH_HSI_PARTS = ['Berlin', 'Augsburg']
# Datasets whose HSI test patches are stored in several HSI_Te_Part<k>.mat files.

DATA2_List = ['SAR', 'DSM', 'MS']
# Suffixes naming the second data source; train() looks for them inside the dataset name.

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Expose only physical GPU 1 to CUDA.
# NOTE(review): this variable is only honoured when it is set before CUDA is
# initialised in the process, and earlier code in this file already calls
# .to("cuda") - confirm the setting actually takes effect here.

# All datasets = "Houston","Trento","MUUFL","HoustonMS","AugsburgSAR","AugsburgDSM"
datasetNames = ["Trento"]
# Datasets that train() processes, in order.

patchsize = 11
# Side length of the square input patches.

batchsize = 64
# Mini-batch size of the training DataLoader.

testSizeNumber = 500
# Number of test samples sent through the model per evaluation chunk (not the size of the test set).

EPOCH = 1
# Number of passes over the training set.

BandSize = 1
# Band-size setting. NOTE: train() loops over its own local BandSize, so this module-level value is not read there.

LR = 5e-4
# Learning rate of the Adam optimizer.

FM = 16
# Base feature-map width passed to MFT; the model works with FM * 4 channels.

HSIOnly = False
# Flag passed to MFT indicating whether only the HSI data should be used.

FileName = 'MFT'
# Tag inserted into the names of the saved checkpoint, model and report files.

def AA_andEachClassAccuracy(confusion_matrix):
    """Return per-class accuracy and its mean from a confusion matrix.

    Args:
        confusion_matrix: Square 2-D NumPy array whose rows are summed to get
            the number of samples per class and whose diagonal holds the
            correctly classified counts.

    Returns:
        ``(each_acc, average_acc)``: the per-class accuracies (diagonal
        divided by row sum, with NaN from empty rows replaced by 0) and
        their arithmetic mean.
    """
    correct_per_class = np.diagonal(confusion_matrix)
    samples_per_class = confusion_matrix.sum(axis=1)
    # 0 / 0 for a class without samples yields NaN, which is mapped to 0.
    per_class_acc = np.nan_to_num(correct_per_class / samples_per_class)
    return per_class_acc, per_class_acc.mean()
# 定义一个函数,计算每个类别的准确率和平均准确率。

def reports(xtest, xtest2, ytest, name, model):
    """Evaluate ``model`` on a test split and return its classification metrics.

    The test samples are pushed through the model in chunks of
    ``testSizeNumber`` on the GPU. Inference runs under ``torch.no_grad()``
    so that no autograd graph is kept. The model's train/eval mode is not
    changed here; callers switch to eval mode beforehand.

    Compared with the previous revision, a test set smaller than
    ``testSizeNumber`` no longer fails on an unbound loop variable, and the
    unused per-dataset class-name lookup has been removed.

    Args:
        xtest: First-modality test patches, indexed by sample along dim 0.
        xtest2: Second-modality test patches, aligned with ``xtest``.
        ytest: Ground-truth class indices, one per test sample.
        name: Dataset name. Accepted for call compatibility; not used in the
            computation.
        model: Callable taking the two patch chunks and returning per-class
            scores with the classes on dim 1.

    Returns:
        ``(confusion, oa, each_acc, aa, kappa)``: the confusion matrix, and
        overall accuracy, per-class accuracy, average accuracy and Cohen's
        kappa, all four multiplied by 100.
    """
    total = len(ytest)
    predictions = []
    with torch.no_grad():
        for start in range(0, total, testSizeNumber):
            stop = min(start + testSizeNumber, total)
            scores = model(xtest[start:stop].cuda(), xtest2[start:stop].cuda())
            # Index of the highest score per sample, moved back to the CPU.
            predictions.append(torch.max(scores, 1)[1].cpu())

    if predictions:
        pred_y = torch.cat(predictions).long()
    else:
        pred_y = torch.empty(0, dtype=torch.long)

    oa = accuracy_score(ytest, pred_y)
    confusion = confusion_matrix(ytest, pred_y)
    each_acc, aa = AA_andEachClassAccuracy(confusion)
    kappa = cohen_kappa_score(ytest, pred_y)

    return confusion, oa * 100, each_acc * 100, aa * 100, kappa * 100
# 定义一个函数,生成分类报告,包括混淆矩阵、总体准确率、每个类别的准确率、平均准确率和Kappa系数。

def set_seed(seed):
    """Seed every random number generator this script has access to.

    Seeds Python's ``random`` module, NumPy's global generator, PyTorch's CPU
    generator and the generators of all CUDA devices. The previous revision
    left Python's ``random`` module unseeded.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
# 定义一个函数,设置随机种子以确保结果的可重复性。

def _mft_split_dataset_name(datasetName):
    """Split a dataset name into (first-modality name, second-modality name).

    "Houston", "Trento" and "MUUFL" are paired with "LIDAR". Any other name
    must contain one of the suffixes in ``DATA2_List``; when several match,
    the last match wins.
    """
    if datasetName in ["Houston", "Trento", "MUUFL"]:
        return datasetName, "LIDAR"

    data1Name, data2Name = '', ''
    for dataName in DATA2_List:
        if re.search(dataName, datasetName):
            data1Name = datasetName.replace(dataName, "")
            data2Name = dataName
    if not data2Name:
        raise ValueError("cannot determine the second data source of dataset %r" % (datasetName,))
    return data1Name, data2Name


def _mft_patches_to_tensor(patches):
    """Convert an array with bands on the last of 4 axes to a float32 tensor of shape (N, bands, pixels)."""
    tensor = torch.from_numpy(patches.astype(np.float32)).permute(0, 3, 1, 2)
    return tensor.reshape(tensor.shape[0], tensor.shape[1], -1)


def _mft_labels_to_tensor(labels):
    """Convert stored labels to a flat LongTensor shifted down by one."""
    return (torch.from_numpy(labels) - 1).long().reshape(-1)


def _mft_load_test_hsi(folder, in_parts):
    """Load the HSI test patches, concatenating HSI_Te_Part<k>.mat files when ``in_parts`` is true."""
    if not in_parts:
        return io.loadmat(folder + 'HSI_Te.mat')['Data']

    basePath = folder + 'HSI_Te_Part'
    parts = [io.loadmat(basePath + '1.mat')['Data']]
    index = 2
    # Keep reading consecutive part files until one is missing.
    while Path(basePath + str(index) + '.mat').exists():
        parts.append(io.loadmat(basePath + str(index) + '.mat')['Data'])
        index += 1
    return np.concatenate(parts, axis=0)


def _mft_predict(model, patches1, patches2, count, chunk_size):
    """Return the predicted class index (CPU LongTensor) of the first ``count`` samples.

    The samples are evaluated on the GPU in chunks of ``chunk_size`` under
    ``torch.no_grad()``.
    """
    predictions = []
    with torch.no_grad():
        for start in range(0, count, chunk_size):
            stop = min(start + chunk_size, count)
            scores = model(patches1[start:stop].cuda(), patches2[start:stop].cuda())
            predictions.append(torch.max(scores, 1)[1].cpu())
    if not predictions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(predictions).long()


def train():
    """Train and evaluate MFT on every dataset listed in ``datasetNames``.

    For each dataset the function loads the .mat patch files from
    ``./../<data1Name>11x11/``, trains three models from scratch, evaluates
    on the test split every 50 steps, keeps the checkpoint with the highest
    test accuracy, computes the final metrics with ``reports`` and writes
    them with ``record.record_output``. Checkpoints, whole-model files and
    the report are written into a directory named after the dataset.

    Changes compared with the previous revision:
      * evaluation runs under ``torch.no_grad()`` and iterates chunks with a
        stepped range, so a stale or unbound loop index can no longer
        corrupt the final partial chunk;
      * the second modality is always moved to the GPU (the ``HSIOnly``
        branch used to pass a CPU tensor to a GPU model);
      * the best accuracy starts at ``-inf``, so the first evaluation of
        every run always writes a checkpoint and the later reload never
        picks up a checkpoint from a previous run.
    """
    num_runs = 3
    for BandSize in [1]:
        for datasetName in datasetNames:
            print("----------------------------------Training for ", datasetName, " ---------------------------------------------")
            os.makedirs(datasetName, exist_ok=True)

            data1Name, data2Name = _mft_split_dataset_name(datasetName)
            folder = './../' + data1Name + '11x11/'
            checkpoint_path = datasetName + '/net_params_' + FileName + '.pkl'

            # Training split.
            TrainPatch1 = _mft_patches_to_tensor(io.loadmat(folder + 'HSI_Tr.mat')['Data'])
            TrainPatch2 = _mft_patches_to_tensor(io.loadmat(folder + data2Name + '_Tr.mat')['Data'])
            TrainLabel1 = _mft_labels_to_tensor(io.loadmat(folder + 'TrLabel.mat')['Data'])
            NC = TrainPatch1.shape[1]       # number of HSI bands
            NCLIDAR = TrainPatch2.shape[1]  # number of second-modality bands

            # Test split.
            TestPatch1 = _mft_patches_to_tensor(
                _mft_load_test_hsi(folder, data1Name in DATASETS_WITH_HSI_PARTS))
            TestPatch2 = _mft_patches_to_tensor(io.loadmat(folder + data2Name + '_Te.mat')['Data'])
            TestLabel1 = _mft_labels_to_tensor(io.loadmat(folder + 'TeLabel.mat')['Data'])

            Classes = int(torch.unique(TrainLabel1).numel())
            dataset = dataf.TensorDataset(TrainPatch1, TrainPatch2, TrainLabel1)
            # Berlin is loaded in the main process; every other dataset uses 4 workers.
            num_workers = 0 if data1Name in ['Berlin'] else 4
            train_loader = dataf.DataLoader(dataset, batch_size=batchsize, shuffle=True, num_workers=num_workers)

            print("HSI Train data shape = ", TrainPatch1.shape)
            print(data2Name + " Train data shape = ", TrainPatch2.shape)
            print("Train label shape = ", TrainLabel1.shape)

            print("HSI Test data shape = ", TestPatch1.shape)
            print(data2Name + " Test data shape = ", TestPatch2.shape)
            print("Test label shape = ", TestLabel1.shape)

            print("Number of Classes = ", Classes)
            KAPPA = []
            OA = []
            AA = []
            ELEMENT_ACC = np.zeros((num_runs, Classes))

            # Seeded once, so the runs differ from each other but the whole sequence is repeatable.
            set_seed(42)
            for iterNum in range(num_runs):
                model = MFT(FM, NC, NCLIDAR, Classes, HSIOnly).cuda()
                summary(model, [(NC, patchsize ** 2), (NCLIDAR, patchsize ** 2)])
                optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=5e-3)
                loss_func = nn.CrossEntropyLoss()  # targets are class indices, not one-hot vectors
                scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.9)
                BestAcc = float("-inf")

                torch.cuda.synchronize()
                start = time.time()
                for epoch in range(EPOCH):
                    for step, (b_x1, b_x2, b_y) in enumerate(train_loader):
                        b_x1 = b_x1.cuda()
                        b_x2 = b_x2.cuda()
                        b_y = b_y.cuda()

                        out = model(b_x1, b_x2)
                        loss = loss_func(out, b_y)

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                        if step % 50 == 0:
                            model.eval()
                            pred_y = _mft_predict(model, TestPatch1, TestPatch2,
                                                  len(TestLabel1), testSizeNumber)
                            accuracy = int(torch.sum(pred_y == TestLabel1)) / TestLabel1.size(0)

                            print('Epoch: ', epoch, '| train loss: %.4f' % loss.item(), '| test accuracy: %.4f' % (accuracy * 100))

                            # Keep the parameters of the best model seen so far.
                            if accuracy > BestAcc:
                                BestAcc = accuracy
                                torch.save(model.state_dict(), checkpoint_path)

                            model.train()
                    scheduler.step()
                torch.cuda.synchronize()
                end = time.time()
                print(end - start)

                # Restore the best checkpoint of this run before the final evaluation.
                model.load_state_dict(torch.load(checkpoint_path))

                model.eval()
                _, oa, each_acc, aa, kappa = reports(TestPatch1, TestPatch2, TestLabel1, datasetName, model)
                KAPPA.append(kappa)
                OA.append(oa)
                AA.append(aa)
                ELEMENT_ACC[iterNum, :] = each_acc
                torch.save(model, datasetName + '/best_model_' + FileName + '_BandSize' + str(BandSize) + '_Iter' + str(iterNum) + '.pt')

                print("OA = ", oa)
            print("----------" + datasetName + " Training Finished -----------")
            record.record_output(OA, AA, KAPPA, ELEMENT_ACC, './' + datasetName + '/' + FileName + '_BandSize' + str(BandSize) + '_Report_' + datasetName + '.txt')

train()
# Run the whole pipeline: load the data, build and train the model, and save the results.

MFT WITH PIXEL TOKENIZATION

# 导入PyTorch中必要的模块
from torch.nn import LayerNorm, Linear, Dropout, Softmax
import copy
import torch
import torch.nn as nn
from einops import rearrange

# 定义一个函数,用于创建一个带有负无穷值的对角矩阵
def INF(B, H, W, device=None):
    """Build a stack of square masks with ``-inf`` on the main diagonal.

    Args:
        B: First factor of the number of masks.
        H: Side length of each square mask.
        W: Second factor of the number of masks.
        device: Device on which to create the tensor. When omitted, CUDA is
            used if it is available (the original behaviour) and the CPU
            otherwise, so the helper no longer crashes on CPU-only machines.

    Returns:
        A tensor of shape ``(B * W, H, H)`` whose diagonal entries are
        ``-inf`` and whose off-diagonal entries are zero.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    diagonal = torch.full((H,), float("inf"), device=device)
    # Negate after building the diagonal matrix so only the diagonal is -inf.
    return -torch.diag(diagonal, 0).unsqueeze(0).repeat(B * W, 1, 1)

# 异构卷积(HetConv)模块
class HetConv(nn.Module):
    """Heterogeneous convolution: a grouped k x k conv plus a grouped 1 x 1 conv.

    Both branches read the same input and their outputs are summed, so they
    must produce feature maps of the same spatial size.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels of each branch.
        kernel_size: Integer kernel size of the group-wise branch. Odd values
            are expected so that the two branches agree in spatial size.
        stride: Stride shared by both branches.
        padding: Accepted for signature compatibility and ignored. The 1 x 1
            branch uses no padding, so the group-wise branch always uses
            "same" padding of ``kernel_size // 2``.
        bias: ``None`` keeps the ``nn.Conv2d`` default (bias enabled);
            any other value is converted to ``bool`` and applied to both
            branches.
        p: Number of groups of the point-wise (1 x 1) branch.
        g: Number of groups of the group-wise (k x k) branch.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=None, bias=None, p=64, g=64):
        super(HetConv, self).__init__()
        use_bias = True if bias is None else bool(bias)
        # kernel_size // 2 keeps the spatial size for every odd kernel; the
        # former kernel_size // 3 only did so for kernel sizes 1 and 3.
        same_padding = kernel_size // 2
        # Group-wise convolution
        self.gwc = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, groups=g,
                             padding=same_padding, stride=stride, bias=use_bias)
        # Point-wise convolution
        self.pwc = nn.Conv2d(in_channels, out_channels, kernel_size=1, groups=p,
                             stride=stride, bias=use_bias)

    def forward(self, x):
        """Return the element-wise sum of the group-wise and point-wise branches."""
        return self.gwc(x) + self.pwc(x)

# 交叉注意力(CrossAttention)模块
class CrossAttention(nn.Module):
    """Multi-head attention in which only the first token acts as the query.

    The input channels are split into ``num_heads`` slices of
    ``dim // num_heads`` features. Each slice is projected up to ``dim``
    features by ``wq`` / ``wk`` / ``wv``, so the concatenated heads carry
    ``dim * num_heads`` features before ``proj`` maps them back to ``dim``.

    Args:
        dim: Channel width of the incoming tokens.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
        qk_scale: Optional score scale; a falsy value selects
            ``(dim // num_heads) ** -0.5``.
        attn_drop: Accepted but not used by this implementation.
        proj_drop: Dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.1, proj_drop=0.1):
        super().__init__()
        per_head = dim // num_heads
        self.num_heads = num_heads
        self.scale = qk_scale or per_head ** -0.5

        # Each projection lifts one head slice (per_head features) to dim features.
        self.wq = nn.Linear(per_head, dim, bias=qkv_bias)
        self.wk = nn.Linear(per_head, dim, bias=qkv_bias)
        self.wv = nn.Linear(per_head, dim, bias=qkv_bias)

        # The heads are concatenated, hence dim * num_heads input features.
        self.proj = nn.Linear(dim * num_heads, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend from token 0 to all tokens.

        Args:
            x: Tensor of shape ``(B, N, C)``.

        Returns:
            Tensor of shape ``(B, 1, C)`` when ``C`` equals the ``dim`` the
            module was built with.
        """
        batch, _, channels = x.shape
        heads = self.num_heads
        per_head = channels // heads

        def split_heads(tokens):
            # (B, n, C) -> (B, n, heads, C / heads)
            return tokens.reshape(batch, tokens.shape[1], heads, per_head)

        all_tokens = split_heads(x)
        # After the projections the last axis has `dim` features: (B, heads, n, dim).
        query = self.wq(split_heads(x[:, 0:1, ...])).permute(0, 2, 1, 3)
        key = self.wk(all_tokens).permute(0, 2, 1, 3)
        value = self.wv(all_tokens).permute(0, 2, 1, 3)

        # (B, heads, 1, N) attention weights of the first token over all tokens.
        weights = torch.einsum('bhid,bhjd->bhij', query, key) * self.scale
        weights = weights.softmax(dim=-1)

        # (B, heads, 1, dim) -> (B, 1, heads, dim) -> (B, 1, heads * dim)
        fused = torch.einsum('bhij,bhjd->bhid', weights, value).transpose(1, 2)
        fused = fused.reshape(batch, 1, channels * heads)
        return self.proj_drop(self.proj(fused))

# 多层感知机(MLP)模块
class Mlp(nn.Module):
    """Two-layer feed-forward network: ``dim -> 512 -> dim`` with GELU and dropout.

    Args:
        dim: Size of the input and output feature axis.
    """

    def __init__(self, dim):
        super().__init__()
        self.fc1 = Linear(dim, 512)
        self.fc2 = Linear(512, dim)
        self.act_fn = nn.GELU()
        self.dropout = Dropout(0.1)  # shared by both dropout sites in forward()

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and near-zero (std 1e-6) normal biases."""
        linear_layers = (self.fc1, self.fc2)
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
        for linear in linear_layers:
            nn.init.normal_(linear.bias, std=1e-6)

    def forward(self, x):
        """Apply fc1 -> GELU -> dropout -> fc2 -> dropout along the last axis."""
        hidden = self.dropout(self.act_fn(self.fc1(x)))
        return self.dropout(self.fc2(hidden))

# 块(Block)模块,包含注意力机制和多层感知机
class Block(nn.Module):
    """Pre-norm transformer block: cross-attention followed by an MLP.

    Each sub-layer is preceded by a LayerNorm and wrapped in a residual
    connection.

    Args:
        dim: Channel width of the tokens.
    """

    def __init__(self, dim):
        super().__init__()
        self.hidden_size = dim
        self.attention_norm = LayerNorm(dim, eps=1e-6)
        self.ffn_norm = LayerNorm(dim, eps=1e-6)
        self.ffn = Mlp(dim)
        self.attn = CrossAttention(dim=dim)

    def forward(self, x):
        """Run the attention and feed-forward sub-layers with residuals.

        CrossAttention returns one token per sample, shape ``(B, 1, dim)``,
        so the first residual addition broadcasts that update over every
        token of ``x``.
        """
        x = x + self.attn(self.attention_norm(x))
        return x + self.ffn(self.ffn_norm(x))

# Cross-attention block (CrossAttentionBlock) module
class CrossAttentionBlock(nn.Module):
    """Two stacked ``Block`` layers followed by a LayerNorm.

    ``forward`` returns only index 0 along dimension 1 of the normalised
    result.

    Args:
        dim: Feature size used for every ``Block`` and for the final LayerNorm.
        num_heads, mlp_ratio, qkv_bias, qk_scale, drop, attn_drop, drop_path,
        act_layer, norm_layer, has_mlp: Accepted but not read by this
            implementation.
    """

    def __init__(self, dim, num_heads=8, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.1, attn_drop=0.1,
                 drop_path=0.1, act_layer=nn.GELU, norm_layer=nn.LayerNorm, has_mlp=False):
        super().__init__()
        self.layer = nn.ModuleList()
        self.encoder_norm = LayerNorm(dim, eps=1e-6)
        # Build the two blocks one after another; each is stored as a deep copy
        # of a freshly constructed Block.
        for _ in range(2):
            self.layer.append(copy.deepcopy(Block(dim)))

    def forward(self, x):
        """Pass ``x`` through every block, normalise, and keep position 0."""
        hidden = x
        for block in self.layer:
            hidden = block(hidden)
        return self.encoder_norm(hidden)[:, 0]

# Convolutional neural network (CNN) module
class CNN(nn.Module):
    """Two-input classifier: convolutional stems, learned tokenisers, attention encoder.

    ``x1`` goes through a 3-D convolution and ``HetConv`` and is pooled into
    four tokens; ``x2`` goes through a 2-D convolution and is pooled into one
    token. The five tokens receive position embeddings, are encoded by
    ``CrossAttentionBlock`` and classified by a linear layer.

    Args:
        FM: Base feature count. Every token / embedding width is ``FM * 4``.
        NC: Depth of ``x1`` once reshaped to ``(B, NC, patchsize, patchsize)``.
            Must be at least 9 because ``conv5`` uses a depth-9 kernel
            without depth padding.
        NCLidar: Number of input channels of ``lidarConv`` (the ``x2`` branch).
        Classes: Number of output logits.
        HSIOnly: Stored as ``self.HSIOnly``; not read anywhere in this class.

    Note:
        ``forward`` reads the module-level global ``patchsize``.
    """

    def __init__(self, FM, NC, NCLidar, Classes, HSIOnly):
        super(CNN, self).__init__()
        # Width shared by the conv6 output, both tokenisers, the position
        # embeddings and the classifier input.
        dim = FM * 4

        self.HSIOnly = HSIOnly
        self.conv5 = nn.Sequential(
            nn.Conv3d(1, 8, (9, 3, 3), padding=(0, 1, 1), stride=1),
            nn.BatchNorm3d(8),
            nn.ReLU()
        )

        # conv5 emits 8 feature maps and shrinks the depth from NC to NC - 8;
        # forward() folds both axes into 8 * (NC - 8) channels for conv6.
        self.conv6 = nn.Sequential(
            HetConv(8 * (NC - 8), FM*4, p=1, g=(FM*4)//4 if (8 * (NC - 8))%FM == 0 else (FM*4)//8),
            nn.BatchNorm2d(FM*4),
            nn.ReLU()
        )

        # Kept for backward compatibility; not read inside this class.
        self.last_BandSize = NC//2//2//2

        self.lidarConv = nn.Sequential(
            nn.Conv2d(NCLidar, 1, 3, 1, 1),
            nn.BatchNorm2d(1),
            nn.GELU()
        )

        self.ca = CrossAttentionBlock(dim)
        self.out3 = nn.Linear(dim, Classes)
        # One embedding per token: 4 tokens from x1 plus 1 token from x2.
        self.position_embeddings = nn.Parameter(torch.randn(1, 4 + 1, dim))
        self.dropout = nn.Dropout(0.1)
        torch.nn.init.xavier_uniform_(self.out3.weight)
        torch.nn.init.normal_(self.out3.bias, std=1e-6)

        # Tokeniser for x1. The widths follow `dim` instead of a literal 64,
        # so the model also works when FM != 16 (identical for FM == 16).
        self.token_wA = nn.Parameter(torch.empty(1, 4, dim), requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wA)
        self.token_wV = nn.Parameter(torch.empty(1, dim, dim), requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wV)

        # Tokeniser for x2, whose feature map has a single channel.
        self.token_wA_L = nn.Parameter(torch.empty(1, 1, 1), requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wA_L)
        self.token_wV_L = nn.Parameter(torch.empty(1, 1, dim), requires_grad=True)
        torch.nn.init.xavier_normal_(self.token_wV_L)

    def forward(self, x1, x2):
        """Classify a batch of paired inputs.

        Args:
            x1: Tensor reshapeable to ``(B, NC, patchsize, patchsize)``.
            x2: Tensor reshapeable to ``(B, NCLidar, patchsize, patchsize)``.

        Returns:
            Tensor of shape ``(B, Classes)`` holding the output of the final
            linear layer (no softmax is applied).
        """
        x1 = x1.reshape(x1.shape[0], -1, patchsize, patchsize)
        x1 = x1.unsqueeze(1)  # add the single input channel expected by Conv3d
        x2 = x2.reshape(x2.shape[0], -1, patchsize, patchsize)
        x1 = self.conv5(x1)
        # Fold (feature maps, depth) into one channel axis for the 2-D conv.
        x1 = x1.reshape(x1.shape[0], -1, patchsize, patchsize)

        x1 = self.conv6(x1)
        x2 = self.lidarConv(x2)
        x2 = x2.reshape(x2.shape[0], -1, patchsize**2)
        x2 = x2.transpose(-1, -2)  # (B, pixels, 1)

        # x2 tokeniser: softmax attention over pixels -> one token of width dim.
        wa_L = self.token_wA_L.expand(x1.shape[0], -1, -1)
        A_L = torch.einsum('bij,bjk->bik', x2, wa_L)
        A_L = A_L.transpose(-1, -2)  # (B, 1, pixels)
        A_L = A_L.softmax(dim=-1)

        wv_L = self.token_wV_L.expand(x2.shape[0], -1, -1)
        VV_L = torch.einsum('bij,bjk->bik', x2, wv_L)
        x2 = torch.einsum('bij,bjk->bik', A_L, VV_L)  # (B, 1, dim)

        x1 = x1.flatten(2)
        x1 = x1.transpose(-1, -2)  # (B, pixels, dim)

        # x1 tokeniser: softmax attention over pixels -> four tokens of width dim.
        wa = self.token_wA.expand(x1.shape[0], -1, -1)
        wa = wa.transpose(-1, -2)  # (B, dim, 4)
        A = torch.einsum('bij,bjk->bik', x1, wa)
        A = A.transpose(-1, -2)  # (B, 4, pixels)
        A = A.softmax(dim=-1)

        wv = self.token_wV.expand(x1.shape[0], -1, -1)
        VV = torch.einsum('bij,bjk->bik', x1, wv)
        T = torch.einsum('bij,bjk->bik', A, VV)  # (B, 4, dim)

        x = torch.cat((x2, T), dim=1)  # (B, 5, dim): x2 token first
        embeddings = x + self.position_embeddings
        embeddings = self.dropout(embeddings)

        # Encode the position-embedded tokens. Feeding the raw `x` here would
        # discard the embeddings and dropout computed just above and leave
        # `position_embeddings` without gradient.
        x = self.ca(embeddings)
        x = x.reshape(x.shape[0], -1)
        out3 = self.out3(x)
        return out3

# Batch size and patch size.
# NOTE: `patchsize` is read as a module-level global by CNN.forward.
batchsize = 64
patchsize = 11

# torchsummary creates its dummy inputs on the requested device, so the model
# has to live on that same device; otherwise a CUDA machine raises an
# input/weight device-mismatch error. Fall back to the CPU when CUDA is absent.
_summary_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Create the model instance on the chosen device
model = CNN(16, 144, 1, 15, False).to(_summary_device)

# Print the model summary; each input is (channels, patchsize * patchsize)
summary(model, [(144, patchsize ** 2), (1, patchsize ** 2)], device=_summary_device)
引用[2]:

- 论文名称:Improving Multimodal Named Entity Recognition via Entity Span Detection with Unified Multimodal Transformer
- 论文作者:Jianfei Yu, Jing Jiang, Li Yang, Rui Xia
- 论文地址:2020.acl-main.306 https://www.aclweb.org/anthology/2020.acl-main.306/
- 论文代码:UMT https://github.com/jefferyYu/UMT

根据引用[2]提供的信息,该 multimodal transformer(UMT)的代码可以在 GitHub 上找到,链接为 https://github.com/jefferyYu/UMT。该代码用于实现论文《Improving Multimodal Named Entity Recognition via Entity Span Detection with Unified Multimodal Transformer》中提出的方法。

#### 引用

- [1][3] [[深度学习论文笔记] TransBTS: Multimodal Brain Tumor Segmentation Using Transformer 基于Transformer的...](https://blog.csdn.net/weixin_49627776/article/details/115449591)
- [2] [通过统一多模态Transformer(UMT)的实体扫描检测(ESD)来改进多模态命名实体识别(MNER)](https://blog.csdn.net/qq_43703681/article/details/113748435)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

司南锤

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值