Saving Attention Entropy

import torch
import torch.nn.functional as F
import numpy as np


def scaled_dot_product_attention(query_states, key_states, attn_mask=None):
    """
    Calculates scaled dot-product attention scores.

    Parameters:
        query_states: torch.Tensor of shape (bs, n_heads, q_len, d_h_qk)
        key_states: torch.Tensor of shape (bs, n_heads, kv_len, d_h_qk)
        attn_mask: torch.Tensor of shape (bs, n_heads, q_len, kv_len), optional

    Returns:
        torch.Tensor: Attention scores after softmax of shape (bs, n_heads, q_len, kv_len)
    """
    # Calculate the raw attention scores from query and key
    d_h_qk = query_states.size(-1)  # Dimension of the query/key vectors in each head
    raw_scores = torch.matmul(query_states, key_states.transpose(-2, -1))
    # Scale scores by the inverse square root of the dimensionality
    scaled_scores = raw_scores / (d_h_qk ** 0.5)

    # Apply the additive attention mask if provided (0 for positions to attend to, -inf or a large negative value for masked positions)
    if attn_mask is not None:
        scaled_scores = scaled_scores + attn_mask

    # Apply softmax to get probabilities (dim=-1 to softmax over the key/value dimension kv_len)
    attention_probs = F.softmax(scaled_scores, dim=-1)

    return attention_probs
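
The mask is additive, so a causal mask can be built by putting a large negative value above the diagonal and 0 elsewhere. Below is a minimal usage sketch, not part of the original code; the shapes and the float32 dtype are assumptions for illustration.

# Hypothetical usage sketch: random query/key states plus an additive causal mask.
bs, n_heads, q_len, kv_len, d_h_qk = 2, 4, 8, 8, 16
query_states = torch.randn(bs, n_heads, q_len, d_h_qk)
key_states = torch.randn(bs, n_heads, kv_len, d_h_qk)

# Future positions (above the diagonal) get a large negative value, the rest stay 0.
causal_mask = torch.full((q_len, kv_len), torch.finfo(torch.float32).min)
causal_mask = torch.triu(causal_mask, diagonal=1).expand(bs, n_heads, q_len, kv_len)

attn_probs = scaled_dot_product_attention(query_states, key_states, attn_mask=causal_mask)
print(attn_probs.shape)  # torch.Size([2, 4, 8, 8]); each row along the last dim sums to 1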


def entropy_of_distributions(tensor):
    """
    Calculates the entropy of the probability distribution at each query position in a tensor of shape (bs, n_heads, q_len, kv_len).

    Parameters:
        tensor: torch.Tensor of shape (bs, n_heads, q_len, kv_len) containing probability distributions

    Returns:
        torch.Tensor: Entropy values for each distribution of shape (bs, n_heads, q_len)
    """
    # Ensure the tensor values are treated as probabilities, i.e., >= 0 and sum to 1 across kv_len
    tensor = torch.clamp(tensor, min=0)  # Clamp values to be >= 0 to handle potential numerical issues
    tensor = tensor / tensor.sum(dim=-1, keepdim=True).clamp(min=1e-12)  # Renormalize so each row sums to 1 along kv_len; the clamp guards against division by zero

    # Calculate entropy
    log_tensor = torch.log2(tensor + 1e-6)  # Adding epsilon to avoid log(0)
    entropy = -torch.sum(tensor * log_tensor, dim=-1)

    return entropy
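
As a quick sanity check (a hypothetical example, not from the original code): a uniform distribution over kv_len positions should give roughly log2(kv_len) bits, while a one-hot distribution should give roughly 0 bits.

# Hypothetical sanity check: uniform vs. near-one-hot attention rows.
kv_len = 8
uniform = torch.full((1, 1, 1, kv_len), 1.0 / kv_len)
one_hot = torch.zeros(1, 1, 1, kv_len)
one_hot[..., 0] = 1.0

print(entropy_of_distributions(uniform))  # ~3.0 bits, i.e. log2(8)
print(entropy_of_distributions(one_hot))  # ~0.0 bits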


def save_attention_details(query_states, key_states, causal_mask):
    """
    Computes the attention scores and appends their shape and entropy to a file.

    Parameters:
        query_states: query tensor of shape (bs, n_heads, q_len, d_h_qk)
        key_states: key tensor of shape (bs, n_heads, kv_len, d_h_qk)
        causal_mask: causal mask tensor of shape (bs, n_heads, q_len, kv_len)
    """
    # Compute the attention scores
    attn_score = scaled_dot_product_attention(query_states, key_states, attn_mask=causal_mask)

    # Compute the entropy of the attention distributions (defined above)
    attn_entropy = entropy_of_distributions(attn_score)

    # Open the file in append mode
    with open('/data2/hkzheng/clm/tensor.txt', 'a') as f:
        # Write the shape of attn_score
        attn_score_shape_str = str(attn_score.shape)
        f.write('attn_score shape: ' + attn_score_shape_str + '\n')

        # Take the first batch element of attn_entropy, move it from GPU to CPU, and convert it to a numpy array
        tensor_np = attn_entropy[0].cpu().numpy()

        # Write the shape of the entropy array (first batch element)
        tensor_np_shape_str = str(tensor_np.shape)
        f.write('attn_entropy shape: ' + tensor_np_shape_str + '\n')

        # Write the contents of tensor_np
        np.savetxt(f, tensor_np, fmt='%.3e', delimiter=' ', newline='\n')
        f.write('\n')  # Add a blank line after the array to separate successive writes
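
Putting the pieces together, here is a hedged end-to-end sketch with random inputs (reusing the same causal-mask construction as above). The output path hard-coded in save_attention_details is the author's; change it to a writable location before running.

# Hypothetical end-to-end example with random inputs; the hard-coded output path inside
# save_attention_details may need to be changed to a writable location on your machine.
bs, n_heads, q_len, kv_len, d_h_qk = 1, 2, 4, 4, 8
query_states = torch.randn(bs, n_heads, q_len, d_h_qk)
key_states = torch.randn(bs, n_heads, kv_len, d_h_qk)

causal_mask = torch.triu(
    torch.full((q_len, kv_len), torch.finfo(torch.float32).min), diagonal=1
).expand(bs, n_heads, q_len, kv_len)

save_attention_details(query_states, key_states, causal_mask)
# tensor.txt now contains the attention-score shape, the entropy shape for the first
# batch element, and an (n_heads, q_len) grid of per-query entropies in bits.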
