  • 对抗样本生成模块支持安全工程师快速高效地生成对抗样本,用于攻击AI模型。

  • 对抗样本检测、防御模块支持用户检测过滤对抗样本、增强AI模型对于对抗样本的鲁棒性。

  • 评估模块提供多种指标全面评估对抗样本攻防性能。


本例面向CPU、GPU、Ascend 910 AI处理器,你可以在这里下载完整的样例代码:https://gitee.com/mindspore/docs/tree/r1.0/tutorials/tutorial_code/model_safety

  • mnist_attack_fgsm.py:包含攻击代码。

  • mnist_defense_nad.py:包含防御代码。




  1. import os
  2. import numpy as np
  3. from scipy.special import softmax
  4. from mindspore import dataset as ds
  5. import mindspore.common.dtype as mstype
  6. import mindspore.dataset.vision.c_transforms as CV
  7. import mindspore.dataset.transforms.c_transforms as C
  8. from mindspore.dataset.vision import Inter
  9. import mindspore.nn as nn
  10. from mindspore.nn import SoftmaxCrossEntropyWithLogits
  11. from mindspore.common.initializer import TruncatedNormal
  12. from mindspore import Model
  13. from mindspore import Tensor
  14. from mindspore import context
  15. from mindspore.train.callback import LossMonitor
  16. from mindarmour.adv_robustness.attacks import FastGradientSignMethod
  17. from mindarmour.utils.logger import LogUtil
  18. from mindarmour.adv_robustness.evaluations import AttackEvaluate
  # Run MindSpore in graph mode; this listing hard-codes the "Ascend" target.
  context.set_context(
      mode=context.GRAPH_MODE,
      device_target="Ascend",
  )

  # Tag passed as the first argument of every LOGGER call in this example.
  TAG = 'demo'

  # MindArmour logger instance, set to INFO level.
  LOGGER = LogUtil.get_instance()
  LOGGER.set_level("INFO")



  # generate dataset for train or test
  def generate_mnist_dataset(data_path, batch_size=32, repeat_size=1,
                             num_parallel_workers=1, sparse=True):
      """
      Build the MNIST input pipeline used for training or testing.

      Args:
          data_path: Directory handed to ``ds.MnistDataset``.
          batch_size: Number of samples per batch; an incomplete trailing
              batch is dropped (``drop_remainder=True``).
          repeat_size: Value forwarded to ``repeat`` on the batched dataset.
          num_parallel_workers: Worker count forwarded to every ``map`` call.
          sparse: If true, labels are only cast to int32. Otherwise labels
              are one-hot encoded with ``C.OneHot(10)`` and cast to float32.

      Returns:
          The dataset after the label/image maps, shuffle, batch and repeat.
      """
      # Fixed preprocessing parameters.
      target_size = (32, 32)
      scale, offset = 1.0 / 255.0, 0.0
      shuffle_buffer = 10000

      # Label operations depend on whether one-hot labels are requested.
      if sparse:
          label_ops = [C.TypeCast(mstype.int32)]
      else:
          label_ops = [C.OneHot(10), C.TypeCast(mstype.float32)]

      # Image operations, applied in this order: resize, rescale, HWC2CHW.
      image_ops = [
          CV.Resize(target_size, interpolation=Inter.LINEAR),
          CV.Rescale(scale, offset),
          CV.HWC2CHW(),
      ]

      dataset = ds.MnistDataset(data_path)

      # Label maps run first, then image maps, one map call per operation.
      for column, operations in (("label", label_ops), ("image", image_ops)):
          for operation in operations:
              dataset = dataset.map(operations=operation,
                                    input_columns=column,
                                    num_parallel_workers=num_parallel_workers)

      # Shuffle, batch (dropping the remainder) and repeat.
      return (dataset.shuffle(buffer_size=shuffle_buffer)
              .batch(batch_size, drop_remainder=True)
              .repeat(repeat_size))



  1. 定义LeNet模型网络。

    def weight_variable():
        """Return the ``TruncatedNormal(0.02)`` initializer used by every layer."""
        return TruncatedNormal(0.02)


    def conv(in_channels, out_channels, kernel_size, stride=1, padding=0):
        """
        Create a bias-free ``nn.Conv2d`` layer with ``pad_mode="valid"``.

        The weights are initialized with ``weight_variable()``.
        """
        return nn.Conv2d(
            in_channels, out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding,
            weight_init=weight_variable(), has_bias=False, pad_mode="valid",
        )


    def fc_with_initialize(input_channels, out_channels):
        """
        Create an ``nn.Dense`` layer.

        Both the weight and the bias initializer come from
        ``weight_variable()``.
        """
        return nn.Dense(input_channels, out_channels,
                        weight_variable(), weight_variable())


    class LeNet5(nn.Cell):
        """
        LeNet-5 style classifier: two conv/ReLU/max-pool stages followed by
        three dense layers, producing 10 outputs per sample.

        With 5x5 "valid" convolutions and 2x2 stride-2 pooling, the
        ``16*5*5`` input size of ``fc1`` matches single-channel 32x32 inputs.
        """

        def __init__(self):
            super().__init__()
            # Feature extractor.
            self.conv1 = conv(1, 6, 5)
            self.conv2 = conv(6, 16, 5)
            # Classifier head.
            self.fc1 = fc_with_initialize(16*5*5, 120)
            self.fc2 = fc_with_initialize(120, 84)
            self.fc3 = fc_with_initialize(84, 10)
            # Parameter-free layers reused across stages.
            self.relu = nn.ReLU()
            self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
            self.flatten = nn.Flatten()

        def construct(self, x):
            """Run the forward pass and return the output of ``fc3``."""
            x = self.max_pool2d(self.relu(self.conv1(x)))
            x = self.max_pool2d(self.relu(self.conv2(x)))
            x = self.flatten(x)
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            return self.fc3(x)
  1. 训练LeNet模型。利用上面定义的数据加载函数generate_mnist_dataset载入数据。

    mnist_path = "./MNIST/"
    batch_size = 32

    # train original model (one-hot labels, hence sparse=False everywhere)
    ds_train = generate_mnist_dataset(
        os.path.join(mnist_path, "train"),
        batch_size=batch_size,
        repeat_size=1,
        sparse=False,
    )
    net = LeNet5()
    loss = SoftmaxCrossEntropyWithLogits(sparse=False)
    # NOTE(review): 0.01 and 0.09 are passed positionally to nn.Momentum
    # (presumably learning rate and momentum); 0.09 is an unusually small
    # momentum -- confirm it is not a typo for 0.9.
    opt = nn.Momentum(net.trainable_params(), 0.01, 0.09)
    model = Model(net, loss, opt, metrics=None)
    model.train(
        10,
        ds_train,
        callbacks=[LossMonitor()],
        dataset_sink_mode=False,
    )

    # get test data
    ds_test = generate_mnist_dataset(
        os.path.join(mnist_path, "test"),
        batch_size=batch_size,
        repeat_size=1,
        sparse=False,
    )

    # Collect every test batch as numpy arrays, then join them into one
    # array of inputs and one array of labels.
    inputs, labels = [], []
    for batch in ds_test.create_tuple_iterator():
        images, targets = batch[0].asnumpy(), batch[1].asnumpy()
        inputs.append(images.astype(np.float32))
        labels.append(targets)
    test_inputs = np.concatenate(inputs)
    test_labels = np.concatenate(labels)
  2. 测试模型。

    # prediction accuracy before attack
    # Switch the network to inference mode and run the collected test inputs
    # through it one batch at a time.
    net.set_train(False)
    test_logits = []
    # Number of whole batches; also reused by the later evaluation loops.
    batches = test_inputs.shape[0] // batch_size
    for i in range(batches):
        batch_inputs = test_inputs[i*batch_size : (i + 1)*batch_size]
        logits = net(Tensor(batch_inputs)).asnumpy()
        test_logits.append(logits)
    test_logits = np.concatenate(test_logits)
    # Predicted class vs. labelled class, both taken as the argmax along
    # axis 1; the mean of the matches is the accuracy.
    tmp = np.argmax(test_logits, axis=1) == np.argmax(test_labels, axis=1)
    accuracy = np.mean(tmp)
    LOGGER.info(TAG, 'prediction accuracy before attacking is : %s', accuracy)


    1. prediction accuracy before attacking is : 0.9895833333333334



  # attacking
  # get adv data
  attack = FastGradientSignMethod(net, eps=0.3, loss_fn=loss)
  adv_data = attack.batch_generate(test_inputs, test_labels)

  # get accuracy of adv data on original model
  # The adversarial samples are fed through the network in slices of
  # batch_size and the per-batch logits are joined into one array.
  adv_logits = np.concatenate([
      net(Tensor(adv_data[start:start + batch_size])).asnumpy()
      for start in range(0, batches * batch_size, batch_size)
  ])
  adv_proba = softmax(adv_logits, axis=1)
  true_classes = np.argmax(test_labels, axis=1)
  tmp = np.argmax(adv_proba, axis=1) == true_classes
  accuracy_adv = np.mean(tmp)
  LOGGER.info(TAG, 'prediction accuracy after attacking is : %s', accuracy_adv)

  # The evaluator receives both image arrays transposed with (0, 2, 3, 1).
  attack_evaluate = AttackEvaluate(
      test_inputs.transpose(0, 2, 3, 1),
      test_labels,
      adv_data.transpose(0, 2, 3, 1),
      adv_proba,
  )

  # Each metric is computed immediately before it is logged, in this order.
  metric_report = (
      ('mis-classification rate of adversaries is : %s',
       attack_evaluate.mis_classification_rate),
      ('The average confidence of adversarial class is : %s',
       attack_evaluate.avg_conf_adv_class),
      ('The average confidence of true class is : %s',
       attack_evaluate.avg_conf_true_class),
      ('The average distance (l0, l2, linf) between original '
       'samples and adversarial samples are: %s',
       attack_evaluate.avg_lp_distance),
      ('The average structural similarity between original '
       'samples and adversarial samples are: %s',
       attack_evaluate.avg_ssim),
  )
  for message, compute_metric in metric_report:
      LOGGER.info(TAG, message, compute_metric())


  1. prediction accuracy after attacking is : 0.052083
  2. mis-classification rate of adversaries is : 0.947917
  3. The average confidence of adversarial class is : 0.803375
  4. The average confidence of true class is : 0.042139
  5. The average distance (l0, l2, linf) between original samples and adversarial samples are: (1.698870, 0.465888, 0.300000)
  6. The average structural similarity between original samples and adversarial samples are: 0.332538

对模型进行FGSM无目标攻击后,模型精度由98.9%降到5.2%,误分类率高达95%,成功攻击的对抗样本的预测类别的平均置信度(ACAC)为 0.803375,成功攻击的对抗样本的真实类别的平均置信度(ACTC)为 0.042139,同时给出了生成的对抗样本与原始样本的零范数距离、二范数距离和无穷范数距离,平均每个对抗样本与原始样本间的结构相似性为0.332538,平均每生成一张对抗样本所需时间为0.003125s。







  from mindarmour.adv_robustness.defenses import NaturalAdversarialDefense

  # defense
  # Put the network back into training mode and run the defense with the
  # loss and optimizer that were used for the original training.
  net.set_train()
  nad = NaturalAdversarialDefense(net, loss_fn=loss, optimizer=opt,
                                  bounds=(0.0, 1.0), eps=0.3)
  # NOTE(review): the defense is run on test_inputs/test_labels, the same
  # arrays that are evaluated below, so the accuracies reported here are
  # measured on samples used during the defense step.
  # batch_size is the script-level value (32), the same one used for the
  # slicing loops below.
  nad.batch_defense(test_inputs, test_labels, batch_size=batch_size, epochs=10)

  # get accuracy of test data on defensed model
  net.set_train(False)
  test_logits = []
  for i in range(batches):
      batch_inputs = test_inputs[i*batch_size : (i + 1)*batch_size]
      logits = net(Tensor(batch_inputs)).asnumpy()
      test_logits.append(logits)
  test_logits = np.concatenate(test_logits)
  tmp = np.argmax(test_logits, axis=1) == np.argmax(test_labels, axis=1)
  accuracy = np.mean(tmp)
  LOGGER.info(TAG, 'accuracy of TEST data on defensed model is : %s', accuracy)

  # get accuracy of adv data on defensed model
  # adv_data is not regenerated in this block: the existing adversarial
  # samples are re-scored with the defended network.
  adv_logits = []
  for i in range(batches):
      batch_inputs = adv_data[i*batch_size : (i + 1)*batch_size]
      logits = net(Tensor(batch_inputs)).asnumpy()
      adv_logits.append(logits)
  adv_logits = np.concatenate(adv_logits)
  adv_proba = softmax(adv_logits, axis=1)
  tmp = np.argmax(adv_proba, axis=1) == np.argmax(test_labels, axis=1)
  accuracy_adv = np.mean(tmp)
  attack_evaluate = AttackEvaluate(test_inputs.transpose(0, 2, 3, 1),
                                   test_labels,
                                   adv_data.transpose(0, 2, 3, 1),
                                   adv_proba)
  # accuracy_adv is already the mean over all samples, so it is logged as is.
  LOGGER.info(TAG, 'accuracy of adv data on defensed model is : %s',
              accuracy_adv)
  LOGGER.info(TAG, 'defense mis-classification rate of adversaries is : %s',
              attack_evaluate.mis_classification_rate())
  LOGGER.info(TAG, 'The average confidence of adversarial class is : %s',
              attack_evaluate.avg_conf_adv_class())
  LOGGER.info(TAG, 'The average confidence of true class is : %s',
              attack_evaluate.avg_conf_true_class())


  1. accuracy of TEST data on defensed model is : 0.974259
  2. accuracy of adv data on defensed model is : 0.856370
  3. defense mis-classification rate of adversaries is : 0.143629
  4. The average confidence of adversarial class is : 0.616670
  5. The average confidence of true class is : 0.177374
