forked from 626_privacy/tensorflow_privacy
Add support for PLD Accountant in computing DP-SGD privacy statement [TF Privacy]
PiperOrigin-RevId: 587854134
This commit is contained in:
parent f51b637dda
commit 93376c9d6a
3 changed files with 170 additions and 43 deletions
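For anyone who wants to exercise the change end to end, here is a minimal sketch of the new API surface. It is illustrative only: it assumes this revision of tensorflow_privacy is installed, and the keyword names follow the library signature visible in the diff below.

```python
# Sketch (assumes this revision of tensorflow_privacy): computing a DP-SGD
# privacy statement with the new PLD accountant. Parameter values are
# illustrative, taken from the script docstring example below.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
    number_of_examples=60000,
    batch_size=256,
    num_epochs=60,
    noise_multiplier=1.12,
    delta=1e-5,
    accountant_type=compute_dp_sgd_privacy_lib.AccountantType.PLD,
)
print(statement)
```

Equivalently, the updated script accepts `--accountant_type=PLD` on the command line.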
tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy.py

@@ -18,20 +18,20 @@ The script applies the RDP accountant to estimate privacy budget of an iterated
 Sampled Gaussian Mechanism. The mechanism's parameters are controlled by flags.
 
 Example:
-  compute_dp_sgd_privacy
+  compute_dp_sgd_privacy \
     --N=60000 \
     --batch_size=256 \
     --noise_multiplier=1.12 \
     --epochs=60 \
-    --delta=1e-5
+    --delta=1e-5 \
+    --accountant_type=RDP
 
-The output states that DP-SGD with these parameters satisfies (2.92, 1e-5)-DP.
+Prints out the privacy statement corresponding to the above parameters.
 """
 
 from absl import app
 from absl import flags
 
-from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy_statement
+from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
 
 
 _NUM_EXAMPLES = flags.DEFINE_integer(
@@ -70,6 +70,9 @@ _MAX_EXAMPLES_PER_USER = flags.DEFINE_integer(
         'user-level DP guarantee.'
     ),
 )
+_ACCOUNTANT_TYPE = flags.DEFINE_enum(
+    'accountant_type', 'RDP', ['RDP', 'PLD'], 'DP accountant to use.'
+)
 
 flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
@@ -77,7 +80,7 @@ flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
 def main(argv):
   del argv  # argv is not used.
 
-  statement = compute_dp_sgd_privacy_statement(
+  statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
       _NUM_EXAMPLES.value,
       _BATCH_SIZE.value,
       _NUM_EPOCHS.value,
@@ -85,6 +88,7 @@ def main(argv):
       _DELTA.value,
       _USED_MICROBATCHING.value,
       _MAX_EXAMPLES_PER_USER.value,
+      compute_dp_sgd_privacy_lib.AccountantType(_ACCOUNTANT_TYPE.value),
   )
   print(statement)
tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py

@@ -14,6 +14,7 @@
 # ==============================================================================
 """Library for computing privacy values for DP-SGD."""
 
+import enum
 import functools
 import math
 import textwrap
@@ -34,6 +35,20 @@ def _logexpm1(x: float) -> float:
   return x + math.log(-math.expm1(-x))
 
 
+class AccountantType(enum.Enum):
+  """Accountant to use for privacy accounting."""
+
+  RDP = 'RDP'
+  PLD = 'PLD'
+
+  def get_accountant(self) -> dp_accounting.PrivacyAccountant:
+    if self == AccountantType.RDP:
+      return dp_accounting.rdp.RdpAccountant()
+    if self == AccountantType.PLD:
+      return dp_accounting.pld.PLDAccountant()
+    raise ValueError(f'Unsupported Accountant type {self.value}')
+
+
 def _compute_dp_sgd_user_privacy(
     num_epochs: float,
     noise_multiplier: float,
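The new enum doubles as a small factory: `get_accountant()` maps each variant to the corresponding `dp_accounting` accountant, and the CLI flag string converts via the enum constructor. A quick illustrative check, assuming the `dp-accounting` package is available:

```python
# Sketch: exercising the AccountantType factory added above.
import dp_accounting

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

AccountantType = compute_dp_sgd_privacy_lib.AccountantType

# Each variant yields a fresh accountant of the matching concrete type.
assert isinstance(
    AccountantType.RDP.get_accountant(), dp_accounting.rdp.RdpAccountant
)
assert isinstance(
    AccountantType.PLD.get_accountant(), dp_accounting.pld.PLDAccountant
)

# The --accountant_type flag string maps to the enum by value, as in main().
assert AccountantType('PLD') is AccountantType.PLD
```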
@@ -41,6 +56,7 @@ def _compute_dp_sgd_user_privacy(
     max_examples_per_user: int,
     used_microbatching: bool = True,
     poisson_subsampling_probability: Optional[float] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> float:
   """Computes add-or-remove-one-user DP epsilon using group privacy.
@@ -63,6 +79,10 @@ _compute_dp_sgd_user_privacy(
     used_microbatching: If true, increases sensitivity by a factor of two.
     poisson_subsampling_probability: If not None, gives the probability that
       each record is chosen in a batch. If None, assumes no subsampling.
+    accountant_type: The privacy accountant for computing epsilon. While this
+      method supports both PLD and RDP accountants, the behavior for PLD
+      accountant can sometimes be overly pessimistic. This remains to be
+      investigated and fixed (b/271341062).
 
   Returns:
     The add-or-remove-one-user DP epsilon value using group privacy.
@@ -92,6 +112,7 @@ _compute_dp_sgd_user_privacy(
       user_delta,
       used_microbatching,
       poisson_subsampling_probability,
+      accountant_type,
   )
 
   # The computation below to estimate user_eps works as follows.
@@ -188,6 +209,7 @@ def _compute_dp_sgd_example_privacy(
     example_delta: float,
     used_microbatching: bool = True,
     poisson_subsampling_probability: Optional[float] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> float:
   """Computes add-or-remove-one-example DP epsilon.
@@ -201,6 +223,7 @@ _compute_dp_sgd_example_privacy(
     used_microbatching: If true, increases sensitivity by a factor of two.
     poisson_subsampling_probability: If not None, gives the probability that
       each record is chosen in a batch. If None, assumes no subsampling.
+    accountant_type: The privacy accountant for computing epsilon.
 
   Returns:
     The epsilon value.
@@ -229,10 +252,10 @@ _compute_dp_sgd_example_privacy(
     event_ = dp_accounting.SelfComposedDpEvent(count=count, event=event_)
 
   return (
-      dp_accounting.rdp.RdpAccountant()
+      accountant_type.get_accountant()
       .compose(event_)
       .get_epsilon(example_delta)
-  )  # TODO(b/271341062)
+  )
 
 
 def compute_dp_sgd_privacy_statement(
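The rewritten return expression above is the crux of the change: the composed `DpEvent` is identical in both cases, and only the accountant consuming it differs. A standalone sketch of that accounting pattern (parameter values are illustrative; exact epsilons depend on the dp-accounting version):

```python
# Sketch: the same composed DpEvent fed to either accountant, mirroring the
# pattern in _compute_dp_sgd_example_privacy. Values are illustrative.
import dp_accounting

noise_multiplier = 1.12
sampling_probability = 256 / 60000  # batch_size / number_of_examples
count = 60 * (60000 // 256)         # epochs * steps per epoch
delta = 1e-5

event = dp_accounting.SelfComposedDpEvent(
    count=count,
    event=dp_accounting.PoissonSampledDpEvent(
        sampling_probability,
        dp_accounting.GaussianDpEvent(noise_multiplier),
    ),
)

# Both accountants share the compose()/get_epsilon() interface, which is what
# lets accountant_type.get_accountant() slot in transparently.
for accountant in (
    dp_accounting.rdp.RdpAccountant(),
    dp_accounting.pld.PLDAccountant(),
):
  print(type(accountant).__name__, accountant.compose(event).get_epsilon(delta))
```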
@@ -243,6 +266,7 @@ def compute_dp_sgd_privacy_statement(
     delta: float,
     used_microbatching: bool = True,
     max_examples_per_user: Optional[int] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> str:
   """Produces a privacy report summarizing the DP guarantee.
@@ -267,6 +291,11 @@ def compute_dp_sgd_privacy_statement(
     max_examples_per_user: If the data set is constructed to cap the maximum
       number of examples each user contributes, provide this argument to also
       print a user-level DP guarantee.
+    accountant_type: The privacy accountant for computing epsilon. Since the
+      current approach for computing user-level privacy when using PLD
+      accountant can sometimes be overly pessimistic, this method does not
+      provide user-level privacy guarantee for PLD accountant_type. This
+      remains to be investigated and fixed (b/271341062).
 
   Returns:
     A str precisely articulating the privacy guarantee.
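Given the docstring caveat above, a PLD request still yields example-level epsilons but deliberately omits the user-level numbers. A sketch of the observable behavior, reusing the parameter values that `DP_SGD_STATEMENT_KWARGS` pins down in the tests below:

```python
# Sketch: requesting PLD together with max_examples_per_user. Based on the
# branching added further down; parameters mirror DP_SGD_STATEMENT_KWARGS in
# the tests.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
    number_of_examples=10000,
    batch_size=64,
    num_epochs=5.0,
    noise_multiplier=2.0,
    delta=1e-6,
    max_examples_per_user=3,
    accountant_type=compute_dp_sgd_privacy_lib.AccountantType.PLD,
)
# The user-level section is replaced by a pointer back to RDP accounting.
assert 'not supported for PLD accounting' in statement
```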
@@ -296,12 +325,16 @@ addition to the final model.""",
   paragraph = textwrap.fill(
       f"""\
 Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
-with RDP accounting:""",
+with {accountant_type.value} accounting:""",
       width=80,
   )
 
   example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
-      num_epochs, noise_multiplier, delta, used_microbatching
+      num_epochs,
+      noise_multiplier,
+      delta,
+      used_microbatching,
+      accountant_type=accountant_type,
   )
   example_eps_subsampling = _compute_dp_sgd_example_privacy(
       num_epochs,
@@ -309,6 +342,7 @@ with RDP accounting:""",
       delta,
       used_microbatching,
       poisson_subsampling_probability=batch_size / number_of_examples,
+      accountant_type=accountant_type,
   )
 
   paragraph += f"""
@@ -320,13 +354,33 @@ with RDP accounting:""",
   paragraphs.append(paragraph)
 
   inf_user_eps = False
-  if max_examples_per_user is not None:
+  if max_examples_per_user is None:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+No user-level privacy guarantee is possible without a bound on the number of \
+examples per user.""",
+            width=80,
+        )
+    )
+  elif accountant_type == AccountantType.PLD:
+    # TODO(b/271341062): Add User level DP support for PLD.
+    paragraphs.append(
+        textwrap.fill(
+            """\
+User-level DP epsilon computation is not supported for PLD accounting at this \
+time. Use RDP accounting to obtain user-level DP guarantees.""",
+            width=80,
+        )
+    )
+  else:  # Case: max_examples_per_user is not None and accountant_type is RDP
     user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
         num_epochs,
         noise_multiplier,
         delta,
         max_examples_per_user,
         used_microbatching,
+        accountant_type=accountant_type,
     )
     user_eps_subsampling = _compute_dp_sgd_user_privacy(
         num_epochs,
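To summarize the new three-way dispatch in one place, here is a condensed, purely illustrative helper (`user_dp_paragraph_kind` is not part of the library):

```python
# Hypothetical helper (not in the library) condensing the branching above.
from typing import Optional


def user_dp_paragraph_kind(
    max_examples_per_user: Optional[int], accountant_type: str
) -> str:
  """Returns which user-level paragraph the statement will contain."""
  if max_examples_per_user is None:
    return 'no guarantee possible'  # no bound on examples per user
  if accountant_type == 'PLD':
    return 'not supported for PLD'  # TODO(b/271341062) in the library
  return 'user-level epsilons'      # RDP: group-privacy epsilons computed


assert user_dp_paragraph_kind(None, 'RDP') == 'no guarantee possible'
assert user_dp_paragraph_kind(3, 'PLD') == 'not supported for PLD'
assert user_dp_paragraph_kind(3, 'RDP') == 'user-level epsilons'
```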
@@ -335,6 +389,7 @@ with RDP accounting:""",
         max_examples_per_user,
         used_microbatching,
         poisson_subsampling_probability=batch_size / number_of_examples,
+        accountant_type=accountant_type,
     )
     if math.isinf(user_eps_no_subsampling):
       user_eps_no_subsampling_str = ' inf (**)'
@@ -350,7 +405,7 @@ with RDP accounting:""",
     paragraph = textwrap.fill(
         f"""\
 User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
-using RDP accounting and group privacy:""",
+using {accountant_type.value} accounting and group privacy:""",
         width=80,
     )
     paragraph += f"""
@@ -360,23 +415,14 @@ using RDP accounting and group privacy:""",
 {user_eps_subsampling_str}"""
 
     paragraphs.append(paragraph)
-  else:
-    paragraphs.append(
-        textwrap.fill(
-            """\
-No user-level privacy guarantee is possible without a bound on the number of \
-examples per user.""",
-            width=80,
-        )
-    )
 
   paragraphs.append(
       textwrap.fill(
           """\
 (*) Poisson sampling is not usually done in training pipelines, but assuming \
-that the data was randomly shuffled, it is believed the actual epsilon should \
-be closer to this value than the conservative assumption of an arbitrary data \
-order.""",
+that the data was randomly shuffled, it is believed that the actual epsilon \
+should be closer to this value than the conservative assumption of an \
+arbitrary data order.""",
           width=80,
       )
   )
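The tests below pin the plumbing down. One invariant worth highlighting: with `max_examples_per_user=1`, user-level DP reduces to example-level DP for either accountant, which the newly parameterized test verifies. A sketch of that check, using the same parameters as the test:

```python
# Sketch: with one example per user, user-level epsilon equals example-level
# epsilon for both accountants, as test_user_privacy_one_example_per_user
# asserts below.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib as lib

for accountant_type in (lib.AccountantType.RDP, lib.AccountantType.PLD):
  example_eps = lib._compute_dp_sgd_example_privacy(
      1.2, 0.7, 1e-5, accountant_type=accountant_type
  )
  user_eps = lib._compute_dp_sgd_user_privacy(
      1.2, 0.7, 1e-5, max_examples_per_user=1, accountant_type=accountant_type
  )
  assert user_eps == example_eps
```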
tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py

@@ -23,6 +23,8 @@ from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
 
 _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
 _user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
+_RDP = compute_dp_sgd_privacy_lib.AccountantType.RDP
+_PLD = compute_dp_sgd_privacy_lib.AccountantType.PLD
 
 
 DP_SGD_STATEMENT_KWARGS = dict(
@@ -81,13 +83,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
       _example_privacy(**args)
 
   @parameterized.named_parameters(
-      ('no_microbatching_no_subsampling', False, None, 10.8602036),
-      ('microbatching_no_subsampling', True, None, 26.2880374),
-      ('no_microbatching_with_subsampling', False, 1e-2, 3.2391922),
-      ('microbatching_with_subsampling', True, 1e-2, 22.5970358),
+      ('no_microbatching_no_subsampling_rdp', False, None, _RDP, 10.8602036),
+      ('microbatching_no_subsampling_rdp', True, None, _RDP, 26.2880374),
+      ('no_microbatching_with_subsampling_rdp', False, 1e-2, _RDP, 3.2391922),
+      ('microbatching_with_subsampling_rdp', True, 1e-2, _RDP, 22.5970358),
+      ('no_microbatching_no_subsampling_pld', False, None, _PLD, 10.1224946),
+      ('microbatching_no_subsampling_pld', True, None, _PLD, 24.7160779),
+      ('no_microbatching_with_subsampling_pld', False, 1e-2, _PLD, 2.4612381),
+      ('microbatching_with_subsampling_pld', True, 1e-2, _PLD, 18.6977407),
   )
   def test_compute_dp_sgd_example_privacy(
-      self, used_microbatching, poisson_subsampling_probability, expected_eps
+      self,
+      used_microbatching,
+      poisson_subsampling_probability,
+      accountant_type,
+      expected_eps,
   ):
     num_epochs = 1.2
     noise_multiplier = 0.7
@@ -98,6 +108,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         example_delta,
         used_microbatching,
         poisson_subsampling_probability,
+        accountant_type,
     )
     self.assertAlmostEqual(eps, expected_eps)
@@ -119,17 +130,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
     with self.assertRaises(ValueError):
       _user_privacy(**args)
 
-  def test_user_privacy_one_example_per_user(self):
+  @parameterized.named_parameters(('RDP', _RDP), ('PLD', _PLD))
+  def test_user_privacy_one_example_per_user(self, accountant_type):
     num_epochs = 1.2
     noise_multiplier = 0.7
     delta = 1e-5
 
-    example_eps = _example_privacy(num_epochs, noise_multiplier, delta)
+    example_eps = _example_privacy(
+        num_epochs, noise_multiplier, delta, accountant_type=accountant_type
+    )
     user_eps = _user_privacy(
         num_epochs,
         noise_multiplier,
         delta,
         max_examples_per_user=1,
+        accountant_type=accountant_type,
     )
     self.assertEqual(user_eps, example_eps)
@@ -146,6 +161,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         noise_multiplier=noise_multiplier,
         example_delta=example_delta,
         poisson_subsampling_probability=q,
+        accountant_type=_RDP,
     )
 
     user_delta = math.exp(
@@ -161,12 +177,14 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         user_delta=user_delta,
         max_examples_per_user=max_examples_per_user,
         poisson_subsampling_probability=q,
+        accountant_type=_RDP,
     )
     self.assertAlmostEqual(user_eps, example_eps * max_examples_per_user)
 
-  def test_dp_sgd_privacy_statement_no_user_dp(self):
+  def test_dp_sgd_privacy_statement_no_user_dp_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -185,16 +203,17 @@ No user-level privacy guarantee is possible without a bound on the number of
 examples per user.
 
 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 """
     self.assertEqual(statement, expected_statement)
 
-  def test_dp_sgd_privacy_statement_user_dp(self):
+  def test_dp_sgd_privacy_statement_user_dp_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
         max_examples_per_user=3,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -215,16 +234,17 @@ RDP accounting and group privacy:
 Epsilon assuming Poisson sampling (*): 6.425
 
 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 """
     self.assertEqual(statement, expected_statement)
 
-  def test_dp_sgd_privacy_statement_user_dp_infinite(self):
+  def test_dp_sgd_privacy_statement_user_dp_infinite_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
         max_examples_per_user=10,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -245,14 +265,71 @@ RDP accounting and group privacy:
 Epsilon assuming Poisson sampling (*): inf (**)
 
 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 
 (**) A finite example-level epsilon implies a finite user-level epsilon at any
 `max_examples_per_user`, but because conversion from example-level to user-level
 DP is not exact, it is possible for the upper bound on the user-level epsilon to
 still be infinite.
 """
     self.assertEqual(statement, expected_statement)
 
+  def test_dp_sgd_privacy_statement_no_user_dp_with_pld(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        accountant_type=_PLD,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
+examples per user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+PLD accounting:
+Epsilon with each example occurring once per epoch: 12.595
+Epsilon assuming Poisson sampling (*): 1.199
+
+No user-level privacy guarantee is possible without a bound on the number of
+examples per user.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_user_dp_with_pld(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        max_examples_per_user=3,
+        accountant_type=_PLD,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
+user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+PLD accounting:
+Epsilon with each example occurring once per epoch: 12.595
+Epsilon assuming Poisson sampling (*): 1.199
+
+User-level DP epsilon computation is not supported for PLD accounting at this
+time. Use RDP accounting to obtain user-level DP guarantees.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
+"""
+    self.assertEqual(statement, expected_statement)
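A closing observation on the new expected values: for these test parameters the PLD accountant reports somewhat tighter example-level epsilons than RDP (10.122 vs 10.860 without subsampling; 2.461 vs 3.239 with Poisson subsampling at q = 1e-2). A hedged reproduction sketch; the exact digits depend on the pinned dp-accounting version, and the test's example_delta (not visible in this excerpt) is assumed to be 1e-5:

```python
# Sketch: reproducing the RDP-vs-PLD comparison behind the parameterized
# expected values. example_delta=1e-5 is an assumption; exact digits depend on
# the pinned dp-accounting version.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib as lib

for accountant_type in (lib.AccountantType.RDP, lib.AccountantType.PLD):
  eps = lib._compute_dp_sgd_example_privacy(
      1.2,   # num_epochs, as in the test
      0.7,   # noise_multiplier, as in the test
      1e-5,  # example_delta (assumed)
      used_microbatching=False,
      poisson_subsampling_probability=1e-2,
      accountant_type=accountant_type,
  )
  print(accountant_type.value, eps)  # expect ~3.239 (RDP) and ~2.461 (PLD)
```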