From 438da5a09bf4b4393351fbd86ac0643982d5d1e3 Mon Sep 17 00:00:00 2001 From: Shuang Song Date: Mon, 31 Jan 2022 13:26:57 -0800 Subject: [PATCH] For secret sharer exposures, allow more dictionary key types, and break ties for the same perplexities. Fix a bug in the test. PiperOrigin-RevId: 425446829 --- .../privacy_tests/secret_sharer/exposures.py | 87 ++++++++++--------- .../secret_sharer/exposures_test.py | 46 +++++----- 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures.py b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures.py index 82ec230..1ad2097 100644 --- a/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures.py +++ b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures.py @@ -13,74 +13,81 @@ # limitations under the License. """Measuring exposure for secret sharer attack.""" -from typing import Dict, List - +from typing import Iterable, TypeVar, Mapping import numpy as np from scipy import stats +_KT = TypeVar('_KT') + + def compute_exposure_interpolation( - perplexities: Dict[int, List[float]], - perplexities_reference: List[float]) -> Dict[int, List[float]]: - """Get exposure using interpolation. + perplexities: Mapping[_KT, Iterable[float]], + perplexities_reference: Iterable[float]) -> Mapping[_KT, Iterable[float]]: + """Gets exposure using interpolation. Args: - perplexities: a dictionary, key is number of secret repetitions, value is a - list of perplexities - perplexities_reference: a list, perplexities of the random sequences that - did not appear in the training data + perplexities: a `Mapping` where the key is an identifier for the secrets + set, e.g. number of secret repetitions, and the value is an iterable of + perplexities. + perplexities_reference: perplexities of the random sequences that did not + appear in the training data. 
Returns: The exposure of every secret measured using interpolation (not necessarily - in the same order as the input) + in the same order as the input), keyed in the same way as perplexities. """ - repetitions = list(perplexities.keys()) - # Concatenate all perplexities, including those for references - perplexities_concat = np.concatenate([perplexities[r] for r in repetitions] + - [perplexities_reference]) - # Concatenate the number of repetitions for each secret - repetitions_concat = np.concatenate([[r] * len(perplexities[r]) - for r in repetitions] + - [[0] * len(perplexities_reference)]) + # Get the keys in some fixed order which will be used internally only + # further down. + keys = list(perplexities) + # Concatenate all perplexities, including those from `perplexities_reference`. + # Add another dimension indicating which set the perplexity is from: -1 for + # reference, {0, ..., len(perplexities) - 1} for secrets + perplexities_concat = [(p, -1) for p in perplexities_reference] + for i, k in enumerate(keys): + perplexities_concat.extend((p, i) for p in perplexities[k]) - # Sort the repetition list according to the corresponding perplexity - idx = np.argsort(perplexities_concat) - repetitions_concat = repetitions_concat[idx] + # Get the indices list sorted according to the corresponding perplexity, + # in case of tie, keep the reference before the secret + indices_concat = np.fromiter((i for _, i in sorted(perplexities_concat)), + dtype=int) - # In the sorted repetition list, if there are m examples with repetition 0 - # (does not appear in training) in front of an example, then its rank is - # (m + 1). To get the number of examples with repetition 0 in front of - # any example, we use the cummulative sum of the indicator vecotr - # (repetitions_concat == 0). 
- cum_sum = np.cumsum(repetitions_concat == 0) - ranks = {r: cum_sum[repetitions_concat == r] + 1 for r in repetitions} + # In the sorted indices list, if there are m examples with index -1 + # (from the reference set) in front of an example, then its rank is + # (m + 1). To get the number of examples with index -1 in front of + # any example, we use the cumulative sum of the indicator vector + # (indices_concat == -1). + cum_sum = np.cumsum(indices_concat == -1) + ranks = {k: cum_sum[indices_concat == i] + 1 for i, k in enumerate(keys)} exposures = { - r: np.log2(len(perplexities_reference)) - np.log2(ranks[r]) - for r in repetitions + k: np.log2(len(list(perplexities_reference))) - np.log2(ranks[k]) + for k in ranks } return exposures def compute_exposure_extrapolation( - perplexities: Dict[int, List[float]], - perplexities_reference: List[float]) -> Dict[int, List[float]]: - """Get exposure using extrapolation. + perplexities: Mapping[_KT, Iterable[float]], + perplexities_reference: Iterable[float]) -> Mapping[_KT, Iterable[float]]: + """Gets exposure using extrapolation. Args: - perplexities: a dictionary, key is number of secret repetitions, value is a - list of perplexities - perplexities_reference: a list, perplexities of the random sequences that - did not appear in the training data + perplexities: a `Mapping` where the key is an identifier for the secrets + set, e.g. number of secret repetitions, and the value is an iterable of + perplexities. + perplexities_reference: perplexities of the random sequences that did not + appear in the training data. Returns: - The exposure of every secret measured using extrapolation + The exposure of every secret measured using extrapolation, keyed in the same + way as perplexities. 
""" # Fit a skew normal distribution using the perplexities of the references snormal_param = stats.skewnorm.fit(perplexities_reference) # Estimate exposure using the fitted distribution exposures = { - r: -np.log2(stats.skewnorm.cdf(perplexities[r], *snormal_param)) - for r in perplexities.keys() + r: -np.log2(stats.skewnorm.cdf(p, *snormal_param)) + for r, p in perplexities.items() } return exposures diff --git a/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures_test.py b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures_test.py index bc95e51..dc83c6d 100644 --- a/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures_test.py +++ b/tensorflow_privacy/privacy/privacy_tests/secret_sharer/exposures_test.py @@ -15,8 +15,8 @@ from absl.testing import absltest import numpy as np from scipy import stats -from tensorflow_privacy.privacy.privacy_tests.secret_sharer.exposures import compute_exposure_extrapolation -from tensorflow_privacy.privacy.privacy_tests.secret_sharer.exposures import compute_exposure_interpolation + +from tensorflow_privacy.privacy.privacy_tests.secret_sharer import exposures class UtilsTest(absltest.TestCase): @@ -28,42 +28,46 @@ class UtilsTest(absltest.TestCase): def test_exposure_interpolation(self): """Test exposure by interpolation.""" perplexities = { - 1: [0, 0.1], # smallest perplexities - 2: [20.0], # largest perplexities - 5: [3.5] - } # rank = 4 + '1': [0, 0.1], # smallest perplexities + '2': [20.0], # largest perplexities + '5': [3.5], # rank = 4 + '8': [3.5], # rank = 4 + } perplexities_reference = [float(x) for x in range(1, 17)] - exposures = compute_exposure_interpolation(perplexities, - perplexities_reference) + resulted_exposures = exposures.compute_exposure_interpolation( + perplexities, perplexities_reference) num_perplexities_reference = len(perplexities_reference) exposure_largest = np.log2(num_perplexities_reference) exposure_smallest = np.log2(num_perplexities_reference) - np.log2( 
num_perplexities_reference + 1) + exposure_rank4 = np.log2(num_perplexities_reference) - np.log2(4) expected_exposures = { - 1: np.array([exposure_largest] * 2), - 2: np.array([exposure_smallest]), - 5: np.array([np.log2(num_perplexities_reference) - np.log2(4)]) + '1': np.array([exposure_largest] * 2), + '2': np.array([exposure_smallest]), + '5': np.array([exposure_rank4]), + '8': np.array([exposure_rank4]) } - self.assertEqual(exposures.keys(), expected_exposures.keys()) - for r in exposures.keys(): - np.testing.assert_almost_equal(exposures[r], exposures[r]) + self.assertEqual(resulted_exposures.keys(), expected_exposures.keys()) + for r in resulted_exposures.keys(): + np.testing.assert_almost_equal(expected_exposures[r], + resulted_exposures[r]) def test_exposure_extrapolation(self): parameters = (4, 0, 1) perplexities = { - 1: stats.skewnorm.rvs(*parameters, size=(2,)), - 10: stats.skewnorm.rvs(*parameters, size=(5,)) + '1': stats.skewnorm.rvs(*parameters, size=(2,)), + '10': stats.skewnorm.rvs(*parameters, size=(5,)) } perplexities_reference = stats.skewnorm.rvs(*parameters, size=(10000,)) - exposures = compute_exposure_extrapolation(perplexities, - perplexities_reference) + resulted_exposures = exposures.compute_exposure_extrapolation( + perplexities, perplexities_reference) fitted_parameters = stats.skewnorm.fit(perplexities_reference) - self.assertEqual(exposures.keys(), perplexities.keys()) - for r in exposures.keys(): + self.assertEqual(resulted_exposures.keys(), perplexities.keys()) + for r in resulted_exposures.keys(): np.testing.assert_almost_equal( - exposures[r], + resulted_exposures[r], -np.log2(stats.skewnorm.cdf(perplexities[r], *fitted_parameters)))