Adds per-example membership scores to trained attackers.

PiperOrigin-RevId: 431615160
Author: Shuang Song, 2022-02-28 23:51:55 -08:00
Committed by: A. Unique TensorFlower
Parent: a33afde0c1
Commit: 767788e9cf
5 changed files with 232 additions and 134 deletions


@@ -23,6 +23,7 @@ from absl import app
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import scipy.stats
 from sklearn import metrics
 import tensorflow as tf
 from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack import data_structures
@@ -69,69 +70,69 @@ def generate_features_and_labels(samples_per_cluster=250, scale=0.1):
   return (features, labels)
 
-# Hint: Play with "noise_scale" for different levels of overlap between
-# the generated clusters. More noise makes the classification harder.
-noise_scale = 2
-training_features, training_labels = generate_features_and_labels(
-    samples_per_cluster=250, scale=noise_scale)
-test_features, test_labels = generate_features_and_labels(
-    samples_per_cluster=250, scale=noise_scale)
+def get_models(num_clusters):
+  """Get the two models we will be using."""
+  # Hint: play with the number of layers to achieve different level of
+  # over-fitting and observe its effects on membership inference performance.
+  three_layer_model = tf.keras.Sequential([
+      tf.keras.layers.Dense(300, activation="relu"),
+      tf.keras.layers.Dense(300, activation="relu"),
+      tf.keras.layers.Dense(300, activation="relu"),
+      tf.keras.layers.Dense(num_clusters, activation="relu"),
+      tf.keras.layers.Softmax()
+  ])
+  three_layer_model.compile(
+      optimizer="adam",
+      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+      metrics=["accuracy"])
-num_clusters = int(round(np.max(training_labels))) + 1
-# Hint: play with the number of layers to achieve different level of
-# over-fitting and observe its effects on membership inference performance.
-three_layer_model = tf.keras.Sequential([
-    tf.keras.layers.Dense(300, activation="relu"),
-    tf.keras.layers.Dense(300, activation="relu"),
-    tf.keras.layers.Dense(300, activation="relu"),
-    tf.keras.layers.Dense(num_clusters, activation="relu"),
-    tf.keras.layers.Softmax()
-])
-three_layer_model.compile(
-    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
-two_layer_model = tf.keras.Sequential([
-    tf.keras.layers.Dense(300, activation="relu"),
-    tf.keras.layers.Dense(300, activation="relu"),
-    tf.keras.layers.Dense(num_clusters, activation="relu"),
-    tf.keras.layers.Softmax()
-])
-two_layer_model.compile(
-    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
-def crossentropy(true_labels, predictions):
-  return tf.keras.backend.eval(
-      tf.keras.metrics.binary_crossentropy(
-          tf.keras.backend.variable(
-              tf.keras.utils.to_categorical(true_labels, num_clusters)),
-          tf.keras.backend.variable(predictions)))
+  two_layer_model = tf.keras.Sequential([
+      tf.keras.layers.Dense(300, activation="relu"),
+      tf.keras.layers.Dense(300, activation="relu"),
+      tf.keras.layers.Dense(num_clusters, activation="relu"),
+      tf.keras.layers.Softmax()
+  ])
+  two_layer_model.compile(
+      optimizer="adam",
+      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+      metrics=["accuracy"])
+  return three_layer_model, two_layer_model
 
 
 def main(unused_argv):
-  epoch_results = data_structures.AttackResultsCollection([])
+  # Hint: Play with "noise_scale" for different levels of overlap between
+  # the generated clusters. More noise makes the classification harder.
+  noise_scale = 2
+  training_features, training_labels = generate_features_and_labels(
+      samples_per_cluster=250, scale=noise_scale)
+  test_features, test_labels = generate_features_and_labels(
+      samples_per_cluster=250, scale=noise_scale)
-  num_epochs = 2
+  num_clusters = int(round(np.max(training_labels))) + 1
+  three_layer_model, two_layer_model = get_models(num_clusters)
   models = {
-      "two layer model": two_layer_model,
-      "three layer model": three_layer_model,
+      "two_layer_model": two_layer_model,
+      "three_layer_model": three_layer_model,
   }
-  for model_name in models:
-    # Incrementally train the model and store privacy metrics every num_epochs.
-    for i in range(1, 6):
-      models[model_name].fit(
+  num_epochs_per_round = 20
+  epoch_results = data_structures.AttackResultsCollection([])
+  for model_name, model in models.items():
+    print(f"Train {model_name}.")
+    # Incrementally train the model and store privacy metrics
+    # every num_epochs_per_round.
+    for i in range(5):
+      model.fit(
           training_features,
-          tf.keras.utils.to_categorical(training_labels, num_clusters),
-          validation_data=(test_features,
-                           tf.keras.utils.to_categorical(
-                               test_labels, num_clusters)),
+          training_labels,
+          validation_data=(test_features, test_labels),
           batch_size=64,
-          epochs=num_epochs,
+          epochs=num_epochs_per_round,
           shuffle=True)
-      training_pred = models[model_name].predict(training_features)
-      test_pred = models[model_name].predict(test_features)
+      training_pred = model.predict(training_features)
+      test_pred = model.predict(test_features)
 
       # Add metadata to generate a privacy report.
       privacy_report_metadata = data_structures.PrivacyReportMetadata(
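
The compile change in this hunk swaps one-hot targets plus "categorical_crossentropy" for integer targets plus SparseCategoricalCrossentropy, which is why the to_categorical calls disappear from model.fit. A minimal standalone sketch (not part of the commit; the array values are invented) showing the two losses agree:

import numpy as np
import tensorflow as tf

labels = np.array([0, 2, 1])  # integer class ids, as the sparse loss expects
probs = np.array([[.8, .1, .1],
                  [.2, .2, .6],
                  [.3, .5, .2]])

sparse = tf.keras.losses.SparseCategoricalCrossentropy()(labels, probs)
dense = tf.keras.losses.CategoricalCrossentropy()(
    tf.keras.utils.to_categorical(labels, 3), probs)
print(float(sparse), float(dense))  # both ~0.4757
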
@@ -139,7 +140,7 @@ def main(unused_argv):
               training_labels, np.argmax(training_pred, axis=1)),
           accuracy_test=metrics.accuracy_score(test_labels,
                                                np.argmax(test_pred, axis=1)),
-          epoch_num=num_epochs * i,
+          epoch_num=num_epochs_per_round * (i + 1),
           model_variant_label=model_name)
 
       attack_results = mia.run_attacks(
@@ -147,9 +148,7 @@ def main(unused_argv):
               labels_train=training_labels,
               labels_test=test_labels,
               probs_train=training_pred,
-              probs_test=test_pred,
-              loss_train=crossentropy(training_labels, training_pred),
-              loss_test=crossentropy(test_labels, test_pred)),
+              probs_test=test_pred),
           data_structures.SlicingSpec(entire_dataset=True, by_class=True),
           attack_types=(data_structures.AttackType.THRESHOLD_ATTACK,
                         data_structures.AttackType.LOGISTIC_REGRESSION),
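
With the crossentropy() helper removed, run_attacks no longer receives explicit losses. AttackInputData derives per-example losses from labels and probabilities when loss_train/loss_test are absent; a rough sketch of that fallback (a simplified assumption, not the library's exact code):

import numpy as np

def per_example_log_loss(labels, probs, small_value=1e-8):
  # Cross-entropy of the true class for each example: -log P(true class).
  return -np.log(np.maximum(probs[np.arange(labels.size), labels], small_value))

probs = np.array([[0.9, 0.1], [0.4, 0.6]])
print(per_example_log_loss(np.array([0, 0]), probs))  # [0.105 0.916]
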
@@ -216,6 +215,39 @@ def main(unused_argv):
   # For saving a figure into a file:
   # plotting.save_plot(figure, <file_path>)
 
+  # Let's look at the per-example membership scores. We'll look at how the
+  # scores from threshold and logistic regression attackers correlate.
+  # We take the MIA result of the final three layer model
+  sample_model = epoch_results.attack_results_list[-1]
+  print("We will look at the membership scores of",
+        sample_model.privacy_report_metadata.model_variant_label, "at epoch",
+        sample_model.privacy_report_metadata.epoch_num)
+  sample_results = sample_model.single_attack_results
+
+  # The first two entries of sample_results are from the threshold and
+  # logistic regression attackers on the whole dataset.
+  print("Correlation between the scores of the following two attackers:", "\n ",
+        sample_results[0].slice_spec, sample_results[0].attack_type, "\n ",
+        sample_results[1].slice_spec, sample_results[1].attack_type)
+  threshold_results = np.concatenate(  # scores by threshold attacker
+      (sample_results[0].membership_scores_train,
+       sample_results[0].membership_scores_test))
+  lr_results = np.concatenate(  # scores by logistic regression attacker
+      (sample_results[1].membership_scores_train,
+       sample_results[1].membership_scores_test))
+
+  # Order the scores and plot them
+  threshold_orders = scipy.stats.rankdata(threshold_results)
+  lr_orders = scipy.stats.rankdata(lr_results)
+  fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
+  axes.scatter(threshold_orders, lr_orders, alpha=0.2, linewidths=0)
+  m, b = np.polyfit(threshold_orders, lr_orders, 1)  # linear fit
+  axes.plot(threshold_orders, m * threshold_orders + b, color="orange")
+  axes.set_aspect("equal", adjustable="box")
+  fig.show()
+
 
 if __name__ == "__main__":
   app.run(main)
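
The rank scatter plot added above is a visual check of rank correlation. If a single number suffices, Spearman's rho is the Pearson correlation of exactly these ranks; a short sketch on synthetic scores (the data here is made up):

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
threshold_scores = rng.normal(size=100)
lr_scores = 0.8 * threshold_scores + 0.2 * rng.normal(size=100)

rho, pvalue = scipy.stats.spearmanr(threshold_scores, lr_scores)
print(f"Spearman rho = {rho:.3f} (p = {pvalue:.2g})")
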


@@ -21,6 +21,7 @@ from typing import Iterable
 import numpy as np
 from sklearn import metrics
+from sklearn import model_selection
 
 from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack import models
 from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack.data_structures import AttackInputData
@@ -44,49 +45,61 @@ def _get_slice_spec(data: AttackInputData) -> SingleSliceSpec:
   return SingleSliceSpec()
 
 
+# TODO(b/220394926): Allow users to specify their own attack models.
 def _run_trained_attack(attack_input: AttackInputData,
                         attack_type: AttackType,
-                        balance_attacker_training: bool = True):
+                        balance_attacker_training: bool = True,
+                        cross_validation_folds: int = 2):
   """Classification attack done by ML models."""
-  attacker = None
-
-  if attack_type == AttackType.LOGISTIC_REGRESSION:
-    attacker = models.LogisticRegressionAttacker()
-  elif attack_type == AttackType.MULTI_LAYERED_PERCEPTRON:
-    attacker = models.MultilayerPerceptronAttacker()
-  elif attack_type == AttackType.RANDOM_FOREST:
-    attacker = models.RandomForestAttacker()
-  elif attack_type == AttackType.K_NEAREST_NEIGHBORS:
-    attacker = models.KNearestNeighborsAttacker()
-  else:
-    raise NotImplementedError('Attack type %s not implemented yet.' %
-                              attack_type)
-
   prepared_attacker_data = models.create_attacker_data(
       attack_input, balance=balance_attacker_training)
+  indices = prepared_attacker_data.fold_indices
+  left_out_indices = prepared_attacker_data.left_out_indices
+  features = prepared_attacker_data.features_all
+  labels = prepared_attacker_data.labels_all
 
-  attacker.train_model(prepared_attacker_data.features_train,
-                       prepared_attacker_data.is_training_labels_train)
+  # We are going to train multiple models on disjoint subsets of the data
+  # (`features`, `labels`), so we can get the membership scores of all samples,
+  # and each example gets its score assigned only once.
+  # An alternative implementation is to train multiple models on overlapping
+  # subsets of the data, and take an average to get the score for each sample.
+  # `scores` will record the membership score of each sample, initialized to nan
+  scores = np.full(features.shape[0], np.nan)
 
-  # Run the attacker on (permuted) test examples.
-  predictions_test = attacker.predict(prepared_attacker_data.features_test)
+  # We use StratifiedKFold to create disjoint subsets of samples. Notice that
+  # the index it returns is with respect to the samples shuffled with `indices`.
+  kf = model_selection.StratifiedKFold(cross_validation_folds, shuffle=False)
+  for train_indices_in_shuffled, test_indices_in_shuffled in kf.split(
+      features[indices], labels[indices]):
+    # `train_indices_in_shuffled` is with respect to the data shuffled with
+    # `indices`. We convert it to `train_indices` to work with the original
+    # data (`features` and 'labels').
+    train_indices = indices[train_indices_in_shuffled]
+    test_indices = indices[test_indices_in_shuffled]
+    # Make sure one sample only got score predicted once
+    assert np.all(np.isnan(scores[test_indices]))
 
-  # Generate ROC curves with predictions.
-  fpr, tpr, thresholds = metrics.roc_curve(
-      prepared_attacker_data.is_training_labels_test, predictions_test)
+    attacker = models.create_attacker(attack_type)
+    attacker.train_model(features[train_indices], labels[train_indices])
+    scores[test_indices] = attacker.predict(features[test_indices])
+
+  # Predict the left out with the last attacker
+  if left_out_indices.size:
+    assert np.all(np.isnan(scores[left_out_indices]))
+    scores[left_out_indices] = attacker.predict(features[left_out_indices])
+
+  assert not np.any(np.isnan(scores))
+
+  # Generate ROC curves with scores.
+  fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
 
   roc_curve = RocCurve(tpr=tpr, fpr=fpr, thresholds=thresholds)
 
-  # NOTE: In the current setup we can't obtain membership scores for all
-  # samples, since some of them were used to train the attacker. This can be
-  # fixed by training several attackers to ensure each sample was left out
-  # in exactly one attacker (basically, this means performing cross-validation).
-  # TODO(b/175870479): Implement membership scores for predicted attackers.
-
+  in_train_indices = (labels == 0)
   return SingleAttackResult(
       slice_spec=_get_slice_spec(attack_input),
       data_size=prepared_attacker_data.data_size,
       attack_type=attack_type,
+      membership_scores_train=scores[in_train_indices],
+      membership_scores_test=scores[~in_train_indices],
       roc_curve=roc_curve)
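
The rewritten _run_trained_attack gives every example an out-of-fold score: each fold's attacker predicts only samples it never trained on. A condensed sketch of the same scheme (a plain sklearn classifier stands in for the attacker; balancing and left-out handling are omitted):

import numpy as np
from sklearn import linear_model, model_selection

def out_of_fold_scores(features, labels, folds=2):
  scores = np.full(len(labels), np.nan)
  kf = model_selection.StratifiedKFold(folds, shuffle=False)
  for train_idx, test_idx in kf.split(features, labels):
    clf = linear_model.LogisticRegression()
    clf.fit(features[train_idx], labels[train_idx])
    # Score = predicted probability of being a non-member (label 1).
    scores[test_idx] = clf.predict_proba(features[test_idx])[:, 1]
  assert not np.any(np.isnan(scores))  # every example scored exactly once
  return scores

rng = np.random.default_rng(1)
features = np.concatenate((rng.normal(0, 1, (50, 2)), rng.normal(1, 1, (50, 2))))
labels = np.concatenate((np.zeros(50), np.ones(50)))
print(out_of_fold_scores(features, labels)[:5])
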
@@ -107,8 +120,8 @@ def _run_threshold_attack(attack_input: AttackInputData):
       slice_spec=_get_slice_spec(attack_input),
       data_size=DataSize(ntrain=ntrain, ntest=ntest),
       attack_type=AttackType.THRESHOLD_ATTACK,
-      membership_scores_train=-attack_input.get_loss_train(),
-      membership_scores_test=-attack_input.get_loss_test(),
+      membership_scores_train=attack_input.get_loss_train(),
+      membership_scores_test=attack_input.get_loss_test(),
       roc_curve=roc_curve)
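
The sign flip above aligns the threshold attacker with the trained attackers: with labels 0 = member and 1 = non-member, higher scores should mean "more likely non-member", and raw per-example loss already has that orientation because members tend to be fit better. A toy illustration (values invented):

import numpy as np
from sklearn import metrics

loss_train = np.array([0.1, 0.2, 0.3])  # members: lower loss
loss_test = np.array([0.8, 0.9, 1.1])   # non-members: higher loss
labels = np.concatenate((np.zeros(3), np.ones(3)))
scores = np.concatenate((loss_train, loss_test))
fpr, tpr, _ = metrics.roc_curve(labels, scores)
print(metrics.auc(fpr, tpr))  # 1.0 on this perfectly separated toy data
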


@@ -90,6 +90,31 @@ class RunAttacksTest(absltest.TestCase):
     self.assertLen(result.membership_scores_train, 100)
     self.assertLen(result.membership_scores_test, 50)
 
+  def test_run_attack_trained_sets_membership_scores(self):
+    attack_input = AttackInputData(
+        logits_train=np.tile([500., -500.], (100, 1)),
+        logits_test=np.tile([0., 0.], (50, 1)))
+    result = mia._run_trained_attack(
+        attack_input,
+        AttackType.LOGISTIC_REGRESSION,
+        balance_attacker_training=True)
+    self.assertLen(result.membership_scores_train, 100)
+    self.assertLen(result.membership_scores_test, 50)
+    # Scores for all training (resp. test) examples should be close
+    np.testing.assert_allclose(
+        result.membership_scores_train,
+        result.membership_scores_train[0],
+        rtol=1e-3)
+    np.testing.assert_allclose(
+        result.membership_scores_test,
+        result.membership_scores_test[0],
+        rtol=1e-3)
+    # Training score should be smaller than test score
+    self.assertLess(result.membership_scores_train[0],
+                    result.membership_scores_test[0])
+
   def test_run_attack_threshold_calculates_correct_auc(self):
     result = mia._run_attack(
         AttackInputData(
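
Why this test can expect near-identical scores within each group: the extreme train logits saturate the softmax, so every train example presents the attacker with the same feature vector, and likewise for the all-zero test logits. A quick standalone check (numerically stable softmax):

import numpy as np

def softmax(z):
  z = z - z.max()  # subtract the max to avoid overflow at exp(500)
  e = np.exp(z)
  return e / e.sum()

print(softmax(np.array([500., -500.])))  # [1. 0.]   -> all train rows identical
print(softmax(np.array([0., 0.])))       # [0.5 0.5] -> all test rows identical
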


@@ -15,7 +15,6 @@
 import dataclasses
 from typing import Optional
 
 import numpy as np
 from sklearn import ensemble
 from sklearn import linear_model
@@ -23,30 +22,34 @@ from sklearn import model_selection
 from sklearn import neighbors
 from sklearn import neural_network
-from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack.data_structures import AttackInputData
-from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack.data_structures import DataSize
+from tensorflow_privacy.privacy.privacy_tests.membership_inference_attack import data_structures
 
 
 @dataclasses.dataclass
 class AttackerData:
   """Input data for an ML classifier attack.
 
   This includes only the data, and not configuration.
+  Labels in this class correspond to whether an example was in the
+  train or test set.
   """
+  # Features of in-training and out-of-training examples.
+  features_all: Optional[np.ndarray] = None
+  # Indicator for whether the example is in-training (0) or out-of-training (1).
+  labels_all: Optional[np.ndarray] = None
-  features_train: Optional[np.ndarray] = None
-  # element-wise boolean array denoting if the example was part of training.
-  is_training_labels_train: Optional[np.ndarray] = None
+
+  # Indices for `features_all` and `labels_all` that are going to be used for
+  # training the attackers.
+  fold_indices: Optional[np.ndarray] = None
-  features_test: Optional[np.ndarray] = None
-  # element-wise boolean array denoting if the example was part of training.
-  is_training_labels_test: Optional[np.ndarray] = None
+
+  # Indices for `features_all` and `labels_all` that were left out due to
+  # balancing. Disjoint from `fold_indices`.
+  left_out_indices: Optional[np.ndarray] = None
 
-  data_size: Optional[DataSize] = None
+  # Number of in-training and out-of-training examples.
+  data_size: Optional[data_structures.DataSize] = None
 
 
-def create_attacker_data(attack_input_data: AttackInputData,
-                         test_fraction: float = 0.25,
+def create_attacker_data(attack_input_data: data_structures.AttackInputData,
                          balance: bool = True) -> AttackerData:
   """Prepare AttackInputData to train ML attackers.
@@ -54,7 +57,6 @@ def create_attacker_data(attack_input_data: AttackInputData,
 
   Args:
     attack_input_data: Original AttackInputData
-    test_fraction: Fraction of the dataset to include in the test split.
     balance: Whether the training and test sets for the membership inference
       attacker should have a balanced (roughly equal) number of samples from the
       training and test sets used to develop the model under attack.
@@ -67,25 +69,49 @@ def create_attacker_data(attack_input_data: AttackInputData,
   attack_input_test = _column_stack(attack_input_data.logits_or_probs_test,
                                     attack_input_data.get_loss_test())
 
-  if balance:
-    min_size = min(attack_input_data.get_train_size(),
-                   attack_input_data.get_test_size())
-    attack_input_train = _sample_multidimensional_array(attack_input_train,
-                                                        min_size)
-    attack_input_test = _sample_multidimensional_array(attack_input_test,
-                                                       min_size)
-
   ntrain, ntest = attack_input_train.shape[0], attack_input_test.shape[0]
   features_all = np.concatenate((attack_input_train, attack_input_test))
-  labels_all = np.concatenate((np.zeros(ntrain), np.ones(ntest)))
+  labels_all = np.concatenate(((np.zeros(ntrain)), (np.ones(ntest))))
 
-  # Perform a train-test split
-  features_train, features_test, is_training_labels_train, is_training_labels_test = model_selection.train_test_split(
-      features_all, labels_all, test_size=test_fraction, stratify=labels_all)
-  return AttackerData(features_train, is_training_labels_train, features_test,
-                      is_training_labels_test,
-                      DataSize(ntrain=ntrain, ntest=ntest))
+  fold_indices = np.arange(ntrain + ntest)
+  left_out_indices = np.asarray([], dtype=np.int32)
+
+  if balance:
+    idx_train, idx_test = range(ntrain), range(ntrain, ntrain + ntest)
+    min_size = min(ntrain, ntest)
+    if ntrain > min_size:
+      left_out_size = ntrain - min_size
+      perm_train = np.random.permutation(idx_train)  # shuffle training
+      left_out_indices = perm_train[:left_out_size]
+      fold_indices = np.concatenate((perm_train[left_out_size:], idx_test))
+    elif ntest > min_size:
+      left_out_size = ntest - min_size
+      perm_test = np.random.permutation(idx_test)  # shuffle test
+      left_out_indices = perm_test[:left_out_size]
+      fold_indices = np.concatenate((perm_test[left_out_size:], idx_train))
+
+  # Shuffle indices for the downstream attackers.
+  fold_indices = np.random.permutation(fold_indices)
+
+  return AttackerData(
+      features_all=features_all,
+      labels_all=labels_all,
+      fold_indices=fold_indices,
+      left_out_indices=left_out_indices,
+      data_size=data_structures.DataSize(ntrain=ntrain, ntest=ntest))
+
+
+def create_attacker(attack_type):
+  """Returns the corresponding attacker for the provided attack_type."""
+  if attack_type == data_structures.AttackType.LOGISTIC_REGRESSION:
+    return LogisticRegressionAttacker()
+  if attack_type == data_structures.AttackType.MULTI_LAYERED_PERCEPTRON:
+    return MultilayerPerceptronAttacker()
+  if attack_type == data_structures.AttackType.RANDOM_FOREST:
+    return RandomForestAttacker()
+  if attack_type == data_structures.AttackType.K_NEAREST_NEIGHBORS:
+    return KNearestNeighborsAttacker()
+  raise NotImplementedError('Attack type %s not implemented yet.' % attack_type)
 
 
 def _sample_multidimensional_array(array, size):
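
The new balance branch drops a random surplus of the larger side into left_out_indices and keeps the rest in fold_indices, instead of subsampling the feature arrays themselves. A toy run of the same logic (assumed sizes ntrain=5, ntest=3, not taken from the commit):

import numpy as np

ntrain, ntest = 5, 3
idx_train, idx_test = range(ntrain), range(ntrain, ntrain + ntest)
perm_train = np.random.permutation(idx_train)
left_out_indices = perm_train[:ntrain - ntest]  # 2 surplus train examples
fold_indices = np.random.permutation(
    np.concatenate((perm_train[ntrain - ntest:], idx_test)))
print(sorted(left_out_indices), sorted(fold_indices))
# Disjoint sets that together cover all 8 indices; the folds are now 3/3.
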


@@ -33,9 +33,8 @@ class TrainedAttackerTest(absltest.TestCase):
   def test_create_attacker_data_loss_only(self):
     attack_input = AttackInputData(
         loss_train=np.array([1, 3]), loss_test=np.array([2, 4]))
-    attacker_data = models.create_attacker_data(attack_input, 0.5)
-    self.assertLen(attacker_data.features_test, 2)
-    self.assertLen(attacker_data.features_train, 2)
+    attacker_data = models.create_attacker_data(attack_input, 2)
+    self.assertLen(attacker_data.features_all, 4)
 
   def test_create_attacker_data_loss_and_logits(self):
     attack_input = AttackInputData(
@@ -43,15 +42,22 @@ class TrainedAttackerTest(absltest.TestCase):
         logits_test=np.array([[10, 11], [14, 15]]),
         loss_train=np.array([3, 7, 10]),
         loss_test=np.array([12, 16]))
-    attacker_data = models.create_attacker_data(
-        attack_input, 0.25, balance=False)
-    self.assertLen(attacker_data.features_test, 2)
-    self.assertLen(attacker_data.features_train, 3)
+    attacker_data = models.create_attacker_data(attack_input, balance=False)
+    self.assertLen(attacker_data.features_all, 5)
+    self.assertLen(attacker_data.fold_indices, 5)
+    self.assertEmpty(attacker_data.left_out_indices)
 
-    for i, feature in enumerate(attacker_data.features_train):
-      self.assertLen(feature, 3)  # each feature has two logits and one loss
-      expected = feature[:2] not in attack_input.logits_train
-      self.assertEqual(attacker_data.is_training_labels_train[i], expected)
+  def test_unbalanced_create_attacker_data_loss_and_logits(self):
+    attack_input = AttackInputData(
+        logits_train=np.array([[1, 2], [5, 6], [8, 9]]),
+        logits_test=np.array([[10, 11], [14, 15]]),
+        loss_train=np.array([3, 7, 10]),
+        loss_test=np.array([12, 16]))
+    attacker_data = models.create_attacker_data(attack_input, balance=True)
+    self.assertLen(attacker_data.features_all, 5)
+    self.assertLen(attacker_data.fold_indices, 4)
+    self.assertLen(attacker_data.left_out_indices, 1)
+    self.assertIn(attacker_data.left_out_indices[0], [0, 1, 2])
 
   def test_balanced_create_attacker_data_loss_and_logits(self):
     attack_input = AttackInputData(
@@ -59,14 +65,10 @@ class TrainedAttackerTest(absltest.TestCase):
         logits_test=np.array([[10, 11], [14, 15], [17, 18]]),
         loss_train=np.array([3, 7, 10]),
         loss_test=np.array([12, 16, 19]))
-    attacker_data = models.create_attacker_data(attack_input, 0.33)
-    self.assertLen(attacker_data.features_test, 2)
-    self.assertLen(attacker_data.features_train, 4)
-
-    for i, feature in enumerate(attacker_data.features_train):
-      self.assertLen(feature, 3)  # each feature has two logits and one loss
-      expected = feature[:2] not in attack_input.logits_train
-      self.assertEqual(attacker_data.is_training_labels_train[i], expected)
+    attacker_data = models.create_attacker_data(attack_input)
+    self.assertLen(attacker_data.features_all, 6)
+    self.assertLen(attacker_data.fold_indices, 6)
+    self.assertEmpty(attacker_data.left_out_indices)
 
 
 if __name__ == '__main__':