BERT实现文本标签分类任务

安徒生在ACL讲一千零一夜

已于 2023-06-30 10:20:06 修改

阅读量272

点赞数

分类专栏： # NLP大模型　文章标签： bert 分类 人工智能

于 2023-06-29 22:13:29 首次发布

本文链接：https://blog.csdn.net/weixin_63595187/article/details/131463389

版权

NLP大模型专栏收录该内容

11 篇文章 0 订阅

订阅专栏

模型参数配置

导入数据集与分类类别名单

（通过在bert模型中配置）

class Config(object):
    """Hyper-parameters and file locations for the BERT text classifier."""

    def __init__(self, dataset):
        """Derive all paths from ``dataset`` and load the label list and tokenizer.

        Args:
            dataset: Root directory of the dataset. It is joined with the
                ``/data/...`` and ``/saved_dict/...`` suffixes by plain
                string concatenation.
        """
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        # Read the class names inside a context manager so the file handle is
        # closed deterministically; the encoding is pinned so non-ASCII label
        # names do not depend on the platform default.
        with open(dataset + '/data/class.txt', encoding='utf-8') as class_file:
            self.class_list = [line.strip() for line in class_file]  # class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained model checkpoint
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # compute device
        self.require_improvement = 1000  # stop early if no improvement for more than 1000 batches
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 64  # mini-batch size
        self.pad_size = 32  # every sentence is padded/truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.bert_path = './bert_pretrain'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768

Model 1 : 数据预处理

Tokenization


def tokenize(self, text):
    """Split ``text`` into sub-word tokens.

    When ``self.do_basic_tokenize`` is truthy, the text is first split by
    ``self.basic_tokenizer`` and every resulting word is then split again by
    ``self.wordpiece_tokenizer``; the pieces are collected in order.
    Otherwise the whole text is handed to ``self.wordpiece_tokenizer``
    directly.

    This is typically used during preprocessing to turn raw text into the
    token sequence the model consumes.

    Args:
        text: The text to tokenize.

    Returns:
        The list of sub-word tokens.
    """
    if not self.do_basic_tokenize:
        # No basic pass requested: run WordPiece on the raw text.
        return self.wordpiece_tokenizer.tokenize(text)

    return [
        piece
        for word in self.basic_tokenizer.tokenize(text)
        for piece in self.wordpiece_tokenizer.tokenize(word)
    ]

WordPiece Tokenization

class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """Store the vocabulary and tokenization limits.

        Args:
            vocab: Vocabulary used for membership tests (``piece in vocab``).
            unk_token: Token emitted for words that cannot be split.
            max_input_chars_per_word: Words longer than this many characters
                are replaced by ``unk_token`` without attempting a split.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenize a piece of text into its word pieces.

        Uses a greedy longest-match-first search against the vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        result = []
        for word in whitespace_tokenize(text):
            if len(word) > self.max_input_chars_per_word:
                pieces = None
            else:
                pieces = self._split_word(word)

            if pieces is None:
                # Over-long or unsplittable word: emit a single unknown token.
                result.append(self.unk_token)
            else:
                result.extend(pieces)
        return result

    def _split_word(self, word):
        """Greedily split one word into pieces; return None if any part is not in the vocab."""
        pieces = []
        length = len(word)
        start = 0
        while start < length:
            # Try the longest remaining substring first and shrink from the right.
            for end in range(length, start, -1):
                candidate = word[start:end]
                if start > 0:
                    # Non-initial pieces carry the continuation prefix.
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    break
            else:
                # Not even a single character matched at this position.
                return None
            pieces.append(candidate)
            start = end
        return pieces

Model 2 : input

Embedding

class BertEmbeddings(nn.Module):
    """Build input embeddings as the sum of word, position and token-type embeddings."""

    def __init__(self, config):
        """Create the three embedding tables plus layer norm and dropout.

        Args:
            config: Object providing ``vocab_size``, ``hidden_size``,
                ``max_position_embeddings``, ``type_vocab_size`` and
                ``hidden_dropout_prob``.
        """
        super().__init__()
        width = config.hidden_size

        # Token lookup; index 0 is the padding index.
        self.word_embeddings = nn.Embedding(config.vocab_size, width, padding_idx=0)
        # One vector per position in the sequence.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)
        # One vector per token type id.
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, width)

        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids`` and return the normalised, dropped-out sum.

        Args:
            input_ids: Index tensor; dimension 1 is taken as the sequence length.
            token_type_ids: Optional tensor of token type ids; defaults to
                zeros shaped like ``input_ids``.

        Returns:
            The embedding tensor after layer norm and dropout.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Positions 0 .. seq_len - 1, broadcast to the shape of input_ids.
        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)

        summed = (
            self.word_embeddings(input_ids)
            + self.position_embeddings(positions)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(summed))

Model 3 : Masked LM 任务

class BertForMaskedLM(BertPreTrainedModel):
    """BERT encoder followed by a masked-language-modelling head."""

    def __init__(self, config):
        """Build the encoder and the MLM head from ``config``.

        NOTE(review): this class initialises weights via ``init_weights``
        while the other heads in this file use ``init_bert_weights`` —
        confirm which one the base class actually provides.
        """
        super().__init__(config)
        self.bert = BertModel(config)
        # Head that predicts the masked tokens.
        self.cls = BertOnlyMLMHead(config)
        self.apply(self.init_weights)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, masked_lm_labels=None):
        """Run the encoder and the MLM head.

        Args:
            input_ids: Forwarded positionally to the encoder.
            attention_mask: Forwarded to the encoder.
            token_type_ids: Forwarded to the encoder.
            masked_lm_labels: Optional targets; when given, a cross-entropy
                loss over the flattened predictions is computed.

        Returns:
            A tuple ``(masked_lm_loss, prediction_scores)``; the loss is
            ``None`` when no labels are supplied.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The first encoder output is fed to the head.
        prediction_scores = self.cls(encoder_outputs[0])

        if masked_lm_labels is None:
            return None, prediction_scores

        flat_scores = prediction_scores.view(-1, self.config.vocab_size)
        flat_labels = masked_lm_labels.view(-1)
        masked_lm_loss = CrossEntropyLoss()(flat_scores, flat_labels)
        return masked_lm_loss, prediction_scores

Model 4 : NSP任务

class BertForNextSentencePrediction(BertPreTrainedModel):
    """BERT model with a next sentence prediction head.

    This module comprises the BERT model followed by the next sentence
    classification head.

    Params:
        config: a BertConfig class instance with the configuration to build
            a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary.
        `token_type_ids`: an optional torch.LongTensor of shape
            [batch_size, sequence_length] with the token type indices
            selected in [0, 1]. Type 0 corresponds to a `sentence A` token and
            type 1 to a `sentence B` token.
        `attention_mask`: an optional torch.LongTensor of shape
            [batch_size, sequence_length] with indices selected in [0, 1].
            It is the mask typically used for attention when a batch has
            sentences of varying length.
        `next_sentence_label`: an optional torch.LongTensor of shape
            [batch_size] with indices selected in [0, 1].
            0 => next sentence is the continuation,
            1 => next sentence is a random sentence.
            Labels equal to -1 are ignored by the loss.

    Outputs:
        if `next_sentence_label` is not `None`:
            the next sentence classification loss only.
        if `next_sentence_label` is `None`:
            the next sentence classification logits of shape [batch_size, 2].
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)  # BERT encoder
        self.cls = BertOnlyNSPHead(config)  # next-sentence-prediction head
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
        # Only the pooled output is needed; intermediate layers are not requested.
        _, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
        )
        seq_relationship_score = self.cls(pooled_output)

        if next_sentence_label is None:
            # Inference path: return the raw scores.
            return seq_relationship_score

        # Training path: cross-entropy over the two classes, skipping labels of -1.
        criterion = CrossEntropyLoss(ignore_index=-1)
        return criterion(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))

Model 5 : Encoder

# 自注意力机制

class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, config):
        """Create the query/key/value projections and the attention dropout.

        Args:
            config: Object providing ``hidden_size``, ``num_attention_heads``
                and ``attention_probs_dropout_prob``.

        Raises:
            ValueError: If ``hidden_size`` is not a multiple of
                ``num_attention_heads``.
        """
        super().__init__()
        hidden, heads = config.hidden_size, config.num_attention_heads
        if hidden % heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden, heads))

        self.num_attention_heads = heads
        # Width of one head; exact because divisibility was checked above.
        self.attention_head_size = hidden // heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear maps from hidden states to query, key and value vectors.
        self.query = nn.Linear(hidden, self.all_head_size)
        self.key = nn.Linear(hidden, self.all_head_size)
        self.value = nn.Linear(hidden, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis forward.

        The last dimension of ``x`` is viewed as
        ``(num_attention_heads, attention_head_size)`` and axes 1 and 2 are
        then swapped, so a ``[batch, seq, all_head_size]`` input becomes
        ``[batch, heads, seq, head_size]``.
        """
        split = x.view(*x.size()[:-1], self.num_attention_heads, self.attention_head_size)
        return split.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Compute the attention context for ``hidden_states``.

        Args:
            hidden_states: Input tensor fed to the three projections.
            attention_mask: Tensor added to the scaled attention scores
                before the softmax.

        Returns:
            The context tensor with the heads merged back into the last
            dimension (size ``all_head_size``).
        """
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(hidden_states))
        values = self.transpose_for_scores(self.value(hidden_states))

        # Scaled dot-product scores, then the additive mask.
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        scores = scores + attention_mask

        # Normalise over the key axis.
        probs = torch.softmax(scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        probs = self.dropout(probs)

        # Weighted sum of values, then merge the head axes back together.
        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
        return context.view(*context.size()[:-2], self.all_head_size)

  

# 隐状态输出层

class BertSelfOutput(nn.Module):
    """Projection applied after self-attention: dense, dropout, residual add, layer norm.

    NOTE(review): a second class with this same name is defined further down
    in this file; at import time the later definition replaces this one.
    """

    def __init__(self, config):
        """Create the layers from ``config.hidden_size`` and ``config.hidden_dropout_prob``."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)  # keeps the hidden width unchanged
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, add ``input_tensor`` as a residual and normalise."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

  

# BERT的注意力层（注意：下方类名仍为 BertSelfOutput，与上面的同名类重复；按本节标题应为 BertAttention）

class BertSelfOutput(nn.Module):
    """Output projection applied after self-attention.

    Applies a dense layer and dropout to the attention output, adds the
    residual ``input_tensor`` and layer-normalises the sum.

    NOTE(review): this file defines ``BertSelfOutput`` twice and this later
    definition is the one that wins at import time, so its forward pass is
    kept consistent with the earlier definition. ``BertLayer`` refers to a
    ``BertAttention`` class that is not visible in this file — confirm whether
    this block was meant to be that class.
    """

    def __init__(self, config):
        """Create the dense, dropout and layer-norm sub-modules.

        Args:
            config: Object providing ``hidden_size`` and
                ``hidden_dropout_prob``. ``layer_norm_eps`` is used when
                present; otherwise 1e-12 is used, matching the other blocks
                in this file.
        """
        super(BertSelfOutput, self).__init__()
        # Linear map that keeps the hidden width unchanged.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=getattr(config, "layer_norm_eps", 1e-12))

    def forward(self, hidden_states, input_tensor):
        """Project the attention output and merge it with the residual.

        Args:
            hidden_states: Output of the self-attention computation.
            input_tensor: The tensor that was fed into self-attention; it is
                used as the residual branch.

        Returns:
            ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``.
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # The residual must come from the attention *input*; the previous
        # version ignored ``input_tensor`` and added the attention output to
        # a GELU of its own projection instead.
        return self.LayerNorm(hidden_states + input_tensor)
        # 返回隐藏状态作为输出

# 将输入的隐藏状态进行全连接层和激活函数处理，生成中间表示
class BertIntermediate(nn.Module):
    """Feed-forward expansion: a dense layer followed by the configured activation."""

    def __init__(self, config):
        """Create the dense layer and resolve the activation function.

        Args:
            config: Object providing ``hidden_size``, ``intermediate_size``
                and ``hidden_act``. A string ``hidden_act`` is looked up in
                ``ACT2FN``; any other value is used as the activation itself.
        """
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

        act = config.hidden_act
        # The second clause only matters on Python 2, where text may be ``unicode``.
        given_by_name = isinstance(act, str) or (
            sys.version_info[0] == 2 and isinstance(act, unicode))
        self.intermediate_act_fn = ACT2FN[act] if given_by_name else act

    def forward(self, hidden_states):
        """Return ``activation(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))

# 处理输入的隐藏状态并生成最终的隐藏状态输出
class BertOutput(nn.Module):
    """Feed-forward contraction: dense, dropout, residual add, layer norm."""

    def __init__(self, config):
        """Create the layers.

        Args:
            config: Object providing ``intermediate_size``, ``hidden_size``
                and ``hidden_dropout_prob``.
        """
        super().__init__()
        width = config.hidden_size
        # Maps the intermediate width back to the hidden width.
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, add ``input_tensor`` as a residual and normalise."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

# BERT中Layer的结构定义
class BertLayer(nn.Module):
    """One encoder layer: attention, intermediate transform, output transform."""

    def __init__(self, config):
        """Build the three sub-modules from ``config``.

        The base ``nn.Module`` is initialised first, then ``attention``,
        ``intermediate`` and ``output`` are created in that order.
        """
        super().__init__()
        self.attention = BertAttention(config)  # attention over the input
        self.intermediate = BertIntermediate(config)  # transforms the attention output
        self.output = BertOutput(config)  # produces the final layer output

    def forward(self, hidden_states, attention_mask):
        """Run one encoder layer.

        Args:
            hidden_states: Input hidden states.
            attention_mask: Mask passed through to the attention sub-module.

        Returns:
            The layer output produced by ``self.output``.
        """
        attended = self.attention(hidden_states, attention_mask)
        # The attention output also serves as the second argument of the output block.
        return self.output(self.intermediate(attended), attended)

Model 6 : output

池化层

# 用于将所有输入序列的隐藏状态进行汇总得到一个固定长度的向量表示。这样对输入的隐藏状态进行汇总和嵌入操作，获得整个输入序列的特征表示，以便后续的下游任务使用
class BertPooler(nn.Module):
    """Pool a sequence into one vector by transforming its first token."""

    def __init__(self, config):
        """Create a ``hidden_size`` -> ``hidden_size`` dense layer and a tanh activation."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``.

        Args:
            hidden_states: Tensor indexed as ``[:, 0]`` to select the first
                position of every sequence (in BERT this is normally the
                "[CLS]" token used for sequence-level tasks).

        Returns:
            The pooled representation, one vector per sequence.
        """
        return self.activation(self.dense(hidden_states[:, 0]))

Model 7 : 文本分类任务

def __init__(self, config, num_labels):
    """Set up the BERT encoder, dropout and linear classification head.

    NOTE(review): this is written as a method (it takes ``self`` and calls
    ``super(BertForSequenceClassification, self)``) but appears at module
    level here; the enclosing class header is not visible — confirm.

    Args:
        config: Configuration passed to the base class and to ``BertModel``;
            ``hidden_dropout_prob`` and ``hidden_size`` are read from it.
        num_labels: Number of target classes (output width of the classifier).
    """
    super(BertForSequenceClassification, self).__init__(config)
    self.num_labels = num_labels
    self.bert = BertModel(config)  # BERT encoder
    drop_prob = config.hidden_dropout_prob
    self.dropout = nn.Dropout(drop_prob)  # regularisation on the pooled output
    feature_dim = config.hidden_size
    self.classifier = nn.Linear(feature_dim, num_labels)  # classification head
    self.apply(self.init_bert_weights)  # initialise the weights

def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
    """Classify the input and optionally compute the loss.

    Args:
        input_ids: Passed to ``self.bert``.
        token_type_ids: Passed to ``self.bert``.
        attention_mask: Passed to ``self.bert``.
        labels: Optional target class indices.

    Returns:
        The cross-entropy loss when ``labels`` is given, otherwise the
        classifier logits (unnormalised class scores).
    """
    # Only the pooled output is used; the first return value is discarded.
    _, pooled_output = self.bert(
        input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
    logits = self.classifier(self.dropout(pooled_output))

    if labels is None:
        return logits

    criterion = CrossEntropyLoss()
    return criterion(logits.view(-1, self.num_labels), labels.view(-1))