Fixes comments and membership scores for the threshold attack.

PiperOrigin-RevId: 555579896
2023-08-10 11:29:52 -07:00 · 2023-08-10 11:29:52 -07:00 · 27069d347d
commit 27069d347d
parent fafa69b65c
7 changed files with 17 additions and 17 deletions
--- a/research/mi_lira_2021/README.md
+++ b/research/mi_lira_2021/README.md
@ -94,15 +94,15 @@ which should give (something like) the following output

 ```
 Attack Ours (online)
-   AUC 0.6675, Accuracy 0.6074, TPR@0.1%FPR of 0.0104
+   AUC 0.6676, Accuracy 0.6077, TPR@0.1%FPR of 0.0169
 Attack Ours (online, fixed variance)
-   AUC 0.6831, Accuracy 0.6140, TPR@0.1%FPR of 0.0541
+   AUC 0.6856, Accuracy 0.6137, TPR@0.1%FPR of 0.0593
 Attack Ours (offline)
-   AUC 0.5465, Accuracy 0.5486, TPR@0.1%FPR of 0.0073
+   AUC 0.5488, Accuracy 0.5500, TPR@0.1%FPR of 0.0130
 Attack Ours (offline, fixed variance)
-   AUC 0.5518, Accuracy 0.5485, TPR@0.1%FPR of 0.0259
+   AUC 0.5549, Accuracy 0.5537, TPR@0.1%FPR of 0.0299
 Attack Global threshold
-   AUC 0.5900, Accuracy 0.6018, TPR@0.1%FPR of 0.0007
+   AUC 0.5921, Accuracy 0.6044, TPR@0.1%FPR of 0.0009
 ```

 where the global threshold attack is the baseline, and our online,
--- a/research/mi_lira_2021/fprtpr.png
+++ b/research/mi_lira_2021/fprtpr.png
--- a/research/mi_lira_2021/inference.py
+++ b/research/mi_lira_2021/inference.py
@ -69,7 +69,7 @@ def main(argv):
                for dy in range(0, 2*shift+1, stride):
                    this_x = aug_pad[:, dx:dx+32, dy:dy+32, :].transpose((0,3,1,2))

-                    logits = model.model(this_x, training=False)
+                    logits = model.model(this_x, training=True)
                    outs.append(logits)

        print(np.array(outs).shape)
--- a/research/mi_poison_2022/README.md
+++ b/research/mi_poison_2022/README.md
@ -88,13 +88,13 @@ which should give (something like) the following output

 ```
 Attack No poison (LiRA)
-   AUC 0.6992, Accuracy 0.6240, TPR@0.1%FPR of 0.0529
+   AUC 0.7025, Accuracy 0.6258, TPR@0.1%FPR of 0.0544
 Attack No poison (Global threshold)
-   AUC 0.6200, Accuracy 0.6167, TPR@0.1%FPR of 0.0011
+   AUC 0.6191, Accuracy 0.6173, TPR@0.1%FPR of 0.0012
 Attack With poison (LiRA)
-   AUC 0.9904, Accuracy 0.9617, TPR@0.1%FPR of 0.3730
+   AUC 0.9943, Accuracy 0.9653, TPR@0.1%FPR of 0.4945
 Attack With poison (Global threshold)
-   AUC 0.9911, Accuracy 0.9580, TPR@0.1%FPR of 0.2130
+   AUC 0.9922, Accuracy 0.9603, TPR@0.1%FPR of 0.3930
 ```

 where the baselines are LiRA and a simple global threshold on the membership
--- a/research/mi_poison_2022/fprtpr.png
+++ b/research/mi_poison_2022/fprtpr.png
--- a/tensorflow_privacy/privacy/privacy_tests/membership_inference_attack/membership_inference_attack.py
+++ b/tensorflow_privacy/privacy/privacy_tests/membership_inference_attack/membership_inference_attack.py
@ -109,8 +109,7 @@ def _run_trained_attack(
          labels[train_indices],
          sample_weight=sample_weights_train,
      )
-      predictions = attacker.predict(features[test_indices])
-      scores[test_indices] = predictions
+      scores[test_indices] = attacker.predict(features[test_indices])
  except ValueError as ve:
    if 'cannot be greater than the number of members in each class.' in str(ve):
      logging.warning('kf.split in _run_trained_attack fails with: %s', str(ve))
@ -200,8 +199,9 @@ def _run_threshold_attack(attack_input: AttackInputData):
      slice_spec=_get_slice_spec(attack_input),
      data_size=DataSize(ntrain=ntrain, ntest=ntest),
      attack_type=AttackType.THRESHOLD_ATTACK,
-      membership_scores_train=attack_input.get_loss_train(),
-      membership_scores_test=attack_input.get_loss_test(),
+      # Negate loss because training examples are expected to have lower loss.
+      membership_scores_train=-attack_input.get_loss_train(),
+      membership_scores_test=-attack_input.get_loss_test(),
      roc_curve=roc_curve,
      epsilon_lower_bound_value=epsilon_lower_bound_value,
  )
--- a/tensorflow_privacy/privacy/privacy_tests/membership_inference_attack/models.py
+++ b/tensorflow_privacy/privacy/privacy_tests/membership_inference_attack/models.py
@ -185,19 +185,19 @@ class TrainedAttacker(object):
    raise NotImplementedError()

  def predict(self, input_features):
-    """Predicts whether input_features belongs to train or test.
+    """Predicts the probability that input_features belongs to train.

    Args:
      input_features : A vector of features with the same semantics as x_train
        passed to train_model.

    Returns:
-      An array of probabilities denoting whether the example belongs to test.
+      An array of probabilities that the examples belong to train.
    """
    if self.model is None:
      raise AssertionError(
          'Model not trained yet. Please call train_model first.')
-    return self.model.predict_proba(input_features)[:, 1]
+    return self.model.predict_proba(input_features)[:, 1]  # Train has label 1


 class LogisticRegressionAttacker(TrainedAttacker):