论文:Square Attack: a query-efficient black-box adversarial attack via random search
代码:https://github.com/max-andr/square-attack
1 基于分数的黑盒攻击
基于梯度的白盒攻击容易受到梯度混淆(gradient obfuscation)或梯度掩蔽(gradient masking)[1][2]的影响。黑盒攻击不依赖梯度,形式上与PGD不同,但相比白盒攻击通常需要更多的查询次数,性能一般也差一些。基于分数(score-based)的黑盒攻击不访问梯度信息,而是访问分类模型在softmax之前最后一层输出的得分(logits);对抗损失也是根据这些得分计算的,用来衡量对抗效果。
1.2 基本思想
Square Attack是基于随机搜索的,其核心思想是:每次迭代随机采样(random)一个噪声,加到当前对抗样本上,再计算对抗损失,看对抗效果是否提升;如果提升,则保留这一次添加的噪声,否则丢弃这一次添加的噪声。
每次迭代的主要步骤如下(编号对应算法伪代码中的行号):第3步,确定本轮要添加噪声的正方形窗口(边长为h);第4步,从分布P中按该窗口随机采样噪声;第5步,将噪声添加到当前对抗样本中得到x_new;第6步,把x_new代入对抗损失:如果损失值下降(对抗效果变好),则保留本轮添加的噪声,否则丢弃本轮添加的噪声。
1.3 正方形的随机噪声采样
基于笔记: Square Attack - 知乎 (zhihu.com)
1.4 L∞攻击
1.5 Margin Loss
无目标
$L(f(x), y) = f_y(x) - \max_{k \neq y} f_k(x)$
其中y是正确标签,k是任一错误标签,$f_k(x)$是图像x经过模型后得到的得分向量(logits)中第k类的分数,$f_y(x)$是其中正确类别y的分数。该损失越小,说明对抗效果越好;当损失不大于0时,说明某个错误类别的得分已不低于正确类别,攻击成功。
有目标
2. pytorch代码实现
2.1 main函数
def load():
    """Load the clean sample image and return it as a one-image batch.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the preprocessed image tensor with a
        leading batch dimension (``[1, 3, 224, 224]``, values scaled to 0-1 by
        ``ToTensor``) and ``y`` is a NumPy array holding the hard-coded label
        917 (presumably the ground-truth class index of this sample — confirm).
    """
    image_path = "./干净样本.jpg"
    # Preprocessing pipeline applied to the PIL image.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),  # resize to 224x224
        transforms.ToTensor(),  # PIL image -> tensor, pixel values 0-255 -> 0-1
    ])
    # The context manager closes the file handle once the pixels are read.
    # convert("RGB") guarantees three channels, so grayscale / RGBA / CMYK
    # files still produce a [3, 224, 224] tensor.
    with Image.open(image_path) as image:
        x = preprocess(image.convert("RGB"))
    # Add a batch dimension at axis 0 -> [1, 3, 224, 224].
    x = x.unsqueeze(0)
    y = np.array([917])
    return x, y
x_test, y_test = load()
# NOTE(review): y_target, n_cls, model, corr_classified, args, metrics_path and
# square_attack are not defined in this snippet; they are expected to come from
# the surrounding script — confirm before running.
y_target_onehot = utils.dense_to_onehot(y_target, n_cls=n_cls)
n_queries, x_adv = square_attack(model, x_test, y_target_onehot, corr_classified, args.eps, args.n_iter,
                                 args.p, metrics_path, args.targeted, args.loss)
# Report the prediction for the adversarial image. The attack returns a new
# tensor and leaves x_test untouched, so predicting on x_test would only show
# the clean-image class.
print("攻击后图像类别是:{}".format(model.predict(x_adv).argmax(1)))
# Drop the batch dimension, rescale 0-1 -> 0-255 and cast to 8-bit integers.
x_adv = (x_adv.squeeze(0) * 255).byte()
# torch.Tensor -> NumPy array
x_adv = x_adv.numpy()
# CHW -> HWC, the layout PIL expects
x_adv = np.transpose(x_adv, (1, 2, 0))
# Save the adversarial image to disk.
Image.fromarray(x_adv).save("restored_image.jpg")
print("查询次数为:{}".format(n_queries))
2.2 square_attack
def square_attack_linf(model, x, y, corr_classified, eps, n_iters, p_init, metrics_path, targeted, loss_type):
    """Run the L-infinity Square Attack (score-based random search).

    Every iteration picks one random square window per not-yet-fooled image,
    fills it with per-channel values drawn from {-eps, +eps}, queries the model
    once, and keeps the candidate only for the images whose loss decreased.

    NOTE(review): the body mixes NumPy calls with ``torch.sum`` /
    ``torch.from_numpy``, so ``x`` is presumably a ``torch.Tensor`` while the
    losses are NumPy arrays — confirm against the caller and ``model``.

    Args:
        model: object providing ``predict(x)`` (its result is treated as
            logits) and ``loss(y, logits, targeted, loss_type=...)``.
        x: image batch indexed as ``[N, C, H, W]``.
        y: labels, one row per image, indexable with ``corr_classified``.
        corr_classified: index or mask selecting the samples to attack.
        eps: magnitude of the perturbation; every entry of the noise is
            ``-eps`` or ``+eps``.
        n_iters: query budget. One query is spent on the initial stripes, so
            the search loop runs at most ``n_iters - 1`` times.
        p_init: initial value handed to ``p_selection`` to schedule the
            fraction of pixels changed per iteration.
        metrics_path: unused here; periodic saving of ``metrics`` is disabled.
        targeted: forwarded to ``model.loss``.
        loss_type: loss used to accept or reject a candidate. The margin loss
            is always computed as well and decides which images still need
            fooling (``margin > 0``).

    Returns:
        tuple: ``(n_queries, x_best)`` — the per-sample number of model
        queries and the best perturbed images found for the selected samples.
    """
    np.random.seed(0)  # important to leave it here as well
    # Pixel range is [0, 1] when the data is already scaled, else [0, 255].
    min_val, max_val = 0, 1 if x.max() <= 1 else 255
    c, h, w = x.shape[1:]
    n_features = c * h * w
    n_ex_total = x.shape[0]
    # Only the selected (correctly classified) samples are attacked.
    x, y = x[corr_classified], y[corr_classified]

    # [c, 1, w], i.e. vertical stripes work best for untargeted attacks
    init_delta = np.random.choice([-eps, eps], size=[x.shape[0], c, 1, w])
    # Starting point: input plus the initial stripes, clipped to the valid range.
    x_best = np.clip(x + init_delta, min_val, max_val)

    logits = model.predict(x_best)
    loss_min = model.loss(y, logits, targeted, loss_type=loss_type)
    margin_min = model.loss(y, logits, targeted, loss_type='margin_loss')
    n_queries = np.ones(x.shape[0])  # ones because we have already used 1 query

    time_start = time.time()
    metrics = np.zeros([n_iters, 7])
    for i_iter in range(n_iters - 1):
        # Keep working only on images whose margin is still positive.
        idx_to_fool = margin_min > 0
        x_curr, x_best_curr, y_curr = x[idx_to_fool], x_best[idx_to_fool], y[idx_to_fool]
        loss_min_curr, margin_min_curr = loss_min[idx_to_fool], margin_min[idx_to_fool]
        deltas = x_best_curr - x_curr  # perturbation accumulated so far

        p = p_selection(p_init, i_iter, n_iters)
        # The window side depends only on p, so compute it once per iteration.
        s = int(round(np.sqrt(p * n_features / c)))
        # At least a c x 1 x 1 window; strictly smaller than both image sides
        # so the randint ranges below are never empty (also for h != w).
        s = min(max(s, 1), min(h, w) - 1)
        for i_img in range(x_best_curr.shape[0]):
            # Top-left corner of the square window (named "center" upstream).
            center_h = np.random.randint(0, h - s)
            center_w = np.random.randint(0, w - s)
            window = (i_img, slice(None), slice(center_h, center_h + s), slice(center_w, center_w + s))

            x_curr_window = x_curr[window]
            x_best_curr_window = x_best_curr[window]
            # prevent trying out a delta if it doesn't change x_curr (e.g. an overlapping patch)
            while torch.sum(np.abs(np.clip(x_curr_window + deltas[window], min_val, max_val) - x_best_curr_window) < 10**-7) == c*s*s:
                # One random sign per channel, broadcast over the whole window.
                deltas[window] = torch.from_numpy(np.random.choice([-eps, eps], size=[c, 1, 1]))

        # Candidate images with the freshly sampled squares applied.
        x_new = np.clip(x_curr + deltas, min_val, max_val)

        logits = model.predict(x_new)
        loss = model.loss(y_curr, logits, targeted, loss_type=loss_type)
        margin = model.loss(y_curr, logits, targeted, loss_type='margin_loss')

        # Accept the candidate only where the loss strictly decreased.
        idx_improved = loss < loss_min_curr
        loss_min[idx_to_fool] = idx_improved * loss + ~idx_improved * loss_min_curr
        margin_min[idx_to_fool] = idx_improved * margin + ~idx_improved * margin_min_curr
        # Reshape to [N, 1, 1, 1] so the mask broadcasts over the image dims.
        idx_improved = np.reshape(idx_improved, [-1, *[1]*len(x.shape[:-1])])
        idx_improved = torch.from_numpy(idx_improved)
        x_best[idx_to_fool] = idx_improved * x_new + ~idx_improved * x_best_curr
        n_queries[idx_to_fool] += 1

        acc = (margin_min > 0.0).sum() / n_ex_total
        acc_corr = (margin_min > 0.0).mean()
        mean_nq = np.mean(n_queries)
        # Query statistics over already-fooled samples; NaN while there are
        # none (same values as before, without the empty-slice warnings).
        fooled = margin_min <= 0
        if fooled.any():
            mean_nq_ae, median_nq_ae = np.mean(n_queries[fooled]), np.median(n_queries[fooled])
        else:
            mean_nq_ae = median_nq_ae = np.nan
        avg_margin_min = np.mean(margin_min)
        time_total = time.time() - time_start
        print('{}: acc={:.2%} acc_corr={:.2%} avg#q_ae={:.2f} med#q={:.1f}, avg_margin={:.2f} (n_ex={}, eps={:.3f}, {:.2f}s)'.format(
            i_iter + 1, acc, acc_corr, mean_nq_ae, median_nq_ae, avg_margin_min, x.shape[0], eps, time_total
        ))

        # NOTE: metrics is filled but never written to metrics_path here.
        metrics[i_iter] = [acc, acc_corr, mean_nq, mean_nq_ae, median_nq_ae, margin_min.mean(), time_total]
        # Stop early once every sample has been fooled.
        if acc == 0:
            break

    return n_queries, x_best
2.3 margin loss
def loss(self, y, logits, targeted=False, loss_type='margin_loss'):
    """Compute the per-sample attack loss from pre-softmax scores.

    With ``'margin_loss'`` the result is the score of the labelled class minus
    the highest score among all other classes (negated for targeted attacks).
    With ``'cross_entropy'`` it is based on ``-log`` of the softmax
    probability of the labelled class (negated for untargeted attacks).

    Args:
        y: one-hot labels with the same shape as ``logits``. Boolean arrays
            and 0/1 integer or float arrays are both accepted; the labels are
            converted to a boolean mask internally.
        logits: 2-D array of pre-softmax scores, one row per sample.
        targeted: whether ``y`` marks the target class of a targeted attack
            rather than the true class.
        loss_type: ``'margin_loss'`` or ``'cross_entropy'``.

    Returns:
        1-D array with one loss value per sample.

    Raises:
        ValueError: if ``loss_type`` is not one of the supported names.
    """
    # Indexing must use a boolean mask: a 0/1 integer array would be treated
    # as fancy row indices and silently overwrite entire rows.
    mask = np.asarray(y, dtype=bool)
    if loss_type == 'margin_loss':
        # Score of the labelled class, kept as a column for broadcasting.
        preds_correct_class = (logits * mask).sum(1, keepdims=True)
        diff = preds_correct_class - logits  # difference between the correct class and all other classes
        diff[mask] = np.inf  # to exclude zeros coming from f_correct - f_correct
        # Smallest gap = gap to the strongest competing class.
        margin = diff.min(1, keepdims=True)
        loss = margin * -1 if targeted else margin
    elif loss_type == 'cross_entropy':
        probs = utils.softmax(logits)
        loss = -np.log(probs[mask])
        loss = loss * -1 if not targeted else loss
    else:
        raise ValueError('Wrong loss.')
    return loss.flatten()
[1] Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples
[2] Logit pairing methods can fool gradient-based attacks