import torch
import torch.nn.functional as F
import numpy as np


def scaled_dot_product_attention(query_states, key_states, attn_mask=None):
    """
    Calculates scaled dot-product attention probabilities.
    Parameters:
        query_states: torch.Tensor of shape (bs, n_heads, q_len, d_h_qk)
        key_states: torch.Tensor of shape (bs, n_heads, kv_len, d_h_qk)
        attn_mask: torch.Tensor of shape (bs, n_heads, q_len, kv_len), optional
    Returns:
        torch.Tensor: Attention scores after softmax, of shape (bs, n_heads, q_len, kv_len)
    """
    # Compute the raw attention scores from query and key
    d_h_qk = query_states.size(-1)  # Dimension of the query/key vectors in each head
    raw_scores = torch.matmul(query_states, key_states.transpose(-2, -1))
    # Scale scores by the inverse square root of the head dimension
    scaled_scores = raw_scores / (d_h_qk ** 0.5)
    # Apply the attention mask if provided (mask elements are 0 for positions to attend to
    # and -inf, or a large negative value, for masked positions)
    if attn_mask is not None:
        scaled_scores = scaled_scores + attn_mask
    # Softmax over the key/value dimension (kv_len) to get probabilities
    attention_probs = F.softmax(scaled_scores, dim=-1)
    return attention_probs
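
# A minimal usage sketch (the shapes and variable names below are illustrative, not from
# the original post): build random query/key states and an additive causal mask with
# torch.triu, then check that each attention row sums to 1 over kv_len.
bs, n_heads, q_len, kv_len, d_h_qk = 1, 2, 4, 4, 8
q = torch.randn(bs, n_heads, q_len, d_h_qk)
k = torch.randn(bs, n_heads, kv_len, d_h_qk)
causal = torch.triu(torch.full((q_len, kv_len), float('-inf')), diagonal=1)
causal = causal.expand(bs, n_heads, q_len, kv_len)
probs = scaled_dot_product_attention(q, k, attn_mask=causal)
print(probs.shape)        # torch.Size([1, 2, 4, 4])
print(probs.sum(dim=-1))  # each row sums to 1 (up to floating-point error)
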
def entropy_of_distributions(tensor):
    """
    Calculates the entropy of the distribution at each query position in a tensor of shape
    (bs, n_heads, q_len, kv_len).
    Parameters:
        tensor: torch.Tensor of shape (bs, n_heads, q_len, kv_len) containing probability distributions
    Returns:
        torch.Tensor: Entropy values for each distribution, of shape (bs, n_heads, q_len)
    """
    # Ensure the values are valid probabilities, i.e. >= 0 and summing to 1 across kv_len
    tensor = torch.clamp(tensor, min=0)  # Clamp to >= 0 to handle potential numerical issues
    tensor /= tensor.sum(dim=-1, keepdim=True)  # Renormalize along kv_len
    # Calculate entropy in bits
    log_tensor = torch.log2(tensor + 1e-6)  # Add a small epsilon to avoid log(0)
    entropy = -torch.sum(tensor * log_tensor, dim=-1)
    return entropy
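
# A quick sanity check (a sketch, with values chosen for illustration): a uniform
# distribution over kv_len = 8 positions should have entropy log2(8) = 3 bits, while a
# one-hot distribution should have entropy close to 0.
uniform = torch.full((1, 1, 1, 8), 1.0 / 8)   # shape (bs, n_heads, q_len, kv_len)
one_hot = torch.zeros(1, 1, 1, 8)
one_hot[..., 0] = 1.0
print(entropy_of_distributions(uniform))      # ~3.0 bits
print(entropy_of_distributions(one_hot))      # ~0.0 bits
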
def save_attention_details(query_states, key_states, causal_mask):
    """
    Computes the attention scores and appends their shape and entropy to a file.
    Parameters:
        query_states: query tensor of shape (bs, n_heads, q_len, d_h_qk)
        key_states: key tensor of shape (bs, n_heads, kv_len, d_h_qk)
        causal_mask: causal mask tensor of shape (bs, n_heads, q_len, kv_len)
    """
    # Compute the attention scores
    attn_score = scaled_dot_product_attention(query_states, key_states, attn_mask=causal_mask)
    # Compute the entropy of the attention distributions (defined above)
    attn_entropy = entropy_of_distributions(attn_score)
    # Open the file in append mode
    with open('/data2/hkzheng/clm/tensor.txt', 'a') as f:
        # Write the shape of attn_score
        attn_score_shape_str = str(attn_score.shape)
        f.write('attn_score shape: ' + attn_score_shape_str + '\n')
        # Take the first batch element of attn_entropy, detach it from the graph,
        # move it to the CPU, and convert it to a numpy array
        tensor_np = attn_entropy[0].detach().cpu().numpy()
        # Write the shape of the entropy array
        tensor_np_shape_str = str(tensor_np.shape)
        f.write('attn_entropy shape: ' + tensor_np_shape_str + '\n')
        # Write the contents of tensor_np
        np.savetxt(f, tensor_np, fmt='%.3e', delimiter=' ', newline='\n')
        f.write('\n')  # Add a blank line after the array to separate successive writes
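
# Example invocation (a sketch with made-up shapes; q_len == kv_len here, and the
# hard-coded output path inside save_attention_details comes from the original post,
# so adjust it for your environment):
bs, n_heads, q_len, d_h_qk = 1, 4, 16, 64
query = torch.randn(bs, n_heads, q_len, d_h_qk)
key = torch.randn(bs, n_heads, q_len, d_h_qk)
causal_mask = torch.triu(torch.full((q_len, q_len), float('-inf')), diagonal=1)
causal_mask = causal_mask.expand(bs, n_heads, q_len, q_len)
save_attention_details(query, key, causal_mask)
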