Add support for PLD Accountant in computing DP-SGD privacy statement [TF Privacy]
PiperOrigin-RevId: 587854134
parent f51b637dda
commit 93376c9d6a

3 changed files with 170 additions and 43 deletions
--- tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy.py
+++ tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy.py
@@ -18,20 +18,20 @@ The script applies the RDP accountant to estimate privacy budget of an iterated
 Sampled Gaussian Mechanism. The mechanism's parameters are controlled by flags.

 Example:
-  compute_dp_sgd_privacy
+  compute_dp_sgd_privacy \
     --N=60000 \
     --batch_size=256 \
     --noise_multiplier=1.12 \
     --epochs=60 \
-    --delta=1e-5
+    --delta=1e-5 \
+    --accountant_type=RDP

-The output states that DP-SGD with these parameters satisfies (2.92, 1e-5)-DP.
+Prints out the privacy statement corresponding to the above parameters.
 """

 from absl import app
 from absl import flags
-from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy_statement
+from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib


 _NUM_EXAMPLES = flags.DEFINE_integer(
@@ -70,6 +70,9 @@ _MAX_EXAMPLES_PER_USER = flags.DEFINE_integer(
         'user-level DP guarantee.'
     ),
 )
+_ACCOUNTANT_TYPE = flags.DEFINE_enum(
+    'accountant_type', 'RDP', ['RDP', 'PLD'], 'DP accountant to use.'
+)

 flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])

@@ -77,7 +80,7 @@ flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
 def main(argv):
   del argv  # argv is not used.

-  statement = compute_dp_sgd_privacy_statement(
+  statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
       _NUM_EXAMPLES.value,
       _BATCH_SIZE.value,
       _NUM_EPOCHS.value,
@@ -85,6 +88,7 @@ def main(argv):
       _DELTA.value,
       _USED_MICROBATCHING.value,
       _MAX_EXAMPLES_PER_USER.value,
+      compute_dp_sgd_privacy_lib.AccountantType(_ACCOUNTANT_TYPE.value),
   )
   print(statement)
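A note on the wiring above: flags.DEFINE_enum rejects any value outside ['RDP', 'PLD'] at parse time, and AccountantType(_ACCOUNTANT_TYPE.value) looks the enum member up by its string value. A minimal standalone sketch of that path, using only names visible in this diff (the script body is illustrative, not the shipped file):

    # Sketch: flag string -> AccountantType enum, mirroring main() above.
    from absl import app
    from absl import flags
    from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

    _ACCOUNTANT_TYPE = flags.DEFINE_enum(
        'accountant_type', 'RDP', ['RDP', 'PLD'], 'DP accountant to use.'
    )

    def main(argv):
      del argv  # Unused.
      accountant_type = compute_dp_sgd_privacy_lib.AccountantType(
          _ACCOUNTANT_TYPE.value
      )
      print(accountant_type)  # AccountantType.RDP or AccountantType.PLD

    if __name__ == '__main__':
      app.run(main)  # Run with, e.g., --accountant_type=PLD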
--- tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py
+++ tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Library for computing privacy values for DP-SGD."""

+import enum
 import functools
 import math
 import textwrap
@@ -34,6 +35,20 @@ def _logexpm1(x: float) -> float:
   return x + math.log(-math.expm1(-x))


+class AccountantType(enum.Enum):
+  """Accountant to use for privacy accounting."""
+
+  RDP = 'RDP'
+  PLD = 'PLD'
+
+  def get_accountant(self) -> dp_accounting.PrivacyAccountant:
+    if self == AccountantType.RDP:
+      return dp_accounting.rdp.RdpAccountant()
+    if self == AccountantType.PLD:
+      return dp_accounting.pld.PLDAccountant()
+    raise ValueError(f'Unsupported Accountant type {self.value}')
+
+
 def _compute_dp_sgd_user_privacy(
     num_epochs: float,
     noise_multiplier: float,
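Both branches of get_accountant() return objects implementing the same dp_accounting.PrivacyAccountant interface (compose, get_epsilon), which is what lets the call sites below stay accountant-agnostic. A quick illustrative sketch, not library code:

    # Sketch: the selector yields interchangeable accountant objects.
    from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

    AccountantType = compute_dp_sgd_privacy_lib.AccountantType
    for accountant_type in (AccountantType.RDP, AccountantType.PLD):
      accountant = accountant_type.get_accountant()
      print(accountant_type.value, type(accountant).__name__)
    # Expected output: "RDP RdpAccountant" then "PLD PLDAccountant"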
@@ -41,6 +56,7 @@ def _compute_dp_sgd_user_privacy(
     max_examples_per_user: int,
     used_microbatching: bool = True,
     poisson_subsampling_probability: Optional[float] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> float:
   """Computes add-or-remove-one-user DP epsilon using group privacy.

@@ -63,6 +79,10 @@ def _compute_dp_sgd_user_privacy(
     used_microbatching: If true, increases sensitivity by a factor of two.
     poisson_subsampling_probability: If not None, gives the probability that
       each record is chosen in a batch. If None, assumes no subsampling.
+    accountant_type: The privacy accountant for computing epsilon. While this
+      method supports both PLD and RDP accountants, the behavior for the PLD
+      accountant can sometimes be overly pessimistic. This remains to be
+      investigated and fixed (b/271341062).

   Returns:
     The add-or-remove-one-user DP epsilon value using group privacy.
@@ -92,6 +112,7 @@ def _compute_dp_sgd_user_privacy(
       user_delta,
       used_microbatching,
       poisson_subsampling_probability,
+      accountant_type,
   )

   # The computation below to estimate user_eps works as follows.
@@ -188,6 +209,7 @@ def _compute_dp_sgd_example_privacy(
     example_delta: float,
     used_microbatching: bool = True,
     poisson_subsampling_probability: Optional[float] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> float:
   """Computes add-or-remove-one-example DP epsilon.

@@ -201,6 +223,7 @@ def _compute_dp_sgd_example_privacy(
     used_microbatching: If true, increases sensitivity by a factor of two.
     poisson_subsampling_probability: If not None, gives the probability that
       each record is chosen in a batch. If None, assumes no subsampling.
+    accountant_type: The privacy accountant for computing epsilon.

   Returns:
     The epsilon value.
@@ -229,10 +252,10 @@ def _compute_dp_sgd_example_privacy(
     event_ = dp_accounting.SelfComposedDpEvent(count=count, event=event_)

   return (
-      dp_accounting.rdp.RdpAccountant()
+      accountant_type.get_accountant()
       .compose(event_)
       .get_epsilon(example_delta)
-  )  # TODO(b/271341062)
+  )


 def compute_dp_sgd_privacy_statement(
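The swap from a hard-coded dp_accounting.rdp.RdpAccountant() to accountant_type.get_accountant() is the entire mechanism: both accountants consume the same composed DpEvent and answer get_epsilon(delta). A rough sketch of that pattern against the dp_accounting library, with made-up event parameters (the event types and the compose/get_epsilon chaining are as used in the code above):

    # Sketch: RDP vs. PLD accounting on the same DP-SGD-style event.
    import dp_accounting

    event = dp_accounting.SelfComposedDpEvent(
        count=1000,  # hypothetical number of steps
        event=dp_accounting.PoissonSampledDpEvent(
            sampling_probability=0.01,  # hypothetical batch_size / N
            event=dp_accounting.GaussianDpEvent(noise_multiplier=1.1),
        ),
    )
    for accountant in (
        dp_accounting.rdp.RdpAccountant(),
        dp_accounting.pld.PLDAccountant(),
    ):
      # Same chaining as the return statement above.
      eps = accountant.compose(event).get_epsilon(1e-5)
      print(type(accountant).__name__, eps)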
@@ -243,6 +266,7 @@ def compute_dp_sgd_privacy_statement(
     delta: float,
     used_microbatching: bool = True,
     max_examples_per_user: Optional[int] = None,
+    accountant_type: AccountantType = AccountantType.RDP,
 ) -> str:
   """Produces a privacy report summarizing the DP guarantee.

@@ -267,6 +291,11 @@ def compute_dp_sgd_privacy_statement(
     max_examples_per_user: If the data set is constructed to cap the maximum
       number of examples each user contributes, provide this argument to also
       print a user-level DP guarantee.
+    accountant_type: The privacy accountant for computing epsilon. Since the
+      current approach for computing user-level privacy when using the PLD
+      accountant can sometimes be overly pessimistic, this method does not
+      provide a user-level privacy guarantee for the PLD accountant_type.
+      This remains to be investigated and fixed (b/271341062).

   Returns:
     A str precisely articulating the privacy guarantee.
@@ -296,12 +325,16 @@ addition to the final model.""",
   paragraph = textwrap.fill(
       f"""\
 Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
-with RDP accounting:""",
+with {accountant_type.value} accounting:""",
       width=80,
   )

   example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
-      num_epochs, noise_multiplier, delta, used_microbatching
+      num_epochs,
+      noise_multiplier,
+      delta,
+      used_microbatching,
+      accountant_type=accountant_type,
   )
   example_eps_subsampling = _compute_dp_sgd_example_privacy(
       num_epochs,
@@ -309,6 +342,7 @@ with RDP accounting:""",
       delta,
       used_microbatching,
       poisson_subsampling_probability=batch_size / number_of_examples,
+      accountant_type=accountant_type,
   )

   paragraph += f"""
@@ -320,13 +354,33 @@ with RDP accounting:""",
   paragraphs.append(paragraph)

   inf_user_eps = False
-  if max_examples_per_user is not None:
+  if max_examples_per_user is None:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+No user-level privacy guarantee is possible without a bound on the number of \
+examples per user.""",
+            width=80,
+        )
+    )
+  elif accountant_type == AccountantType.PLD:
+    # TODO(b/271341062): Add user-level DP support for PLD.
+    paragraphs.append(
+        textwrap.fill(
+            """\
+User-level DP epsilon computation is not supported for PLD accounting at this \
+time. Use RDP accounting to obtain user-level DP guarantees.""",
+            width=80,
+        )
+    )
+  else:  # Case: max_examples_per_user is not None and accountant_type is RDP.
     user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
         num_epochs,
         noise_multiplier,
         delta,
         max_examples_per_user,
         used_microbatching,
+        accountant_type=accountant_type,
     )
     user_eps_subsampling = _compute_dp_sgd_user_privacy(
         num_epochs,
@@ -335,6 +389,7 @@ with RDP accounting:""",
         max_examples_per_user,
         used_microbatching,
         poisson_subsampling_probability=batch_size / number_of_examples,
+        accountant_type=accountant_type,
     )
     if math.isinf(user_eps_no_subsampling):
       user_eps_no_subsampling_str = ' inf (**)'
@@ -350,7 +405,7 @@ with RDP accounting:""",
     paragraph = textwrap.fill(
         f"""\
 User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
-using RDP accounting and group privacy:""",
+using {accountant_type.value} accounting and group privacy:""",
         width=80,
     )
     paragraph += f"""
@@ -360,23 +415,14 @@ using RDP accounting and group privacy:""",
 {user_eps_subsampling_str}"""

     paragraphs.append(paragraph)
-  else:
-    paragraphs.append(
-        textwrap.fill(
-            """\
-No user-level privacy guarantee is possible without a bound on the number of \
-examples per user.""",
-            width=80,
-        )
-    )

   paragraphs.append(
       textwrap.fill(
           """\
 (*) Poisson sampling is not usually done in training pipelines, but assuming \
-that the data was randomly shuffled, it is believed the actual epsilon should \
-be closer to this value than the conservative assumption of an arbitrary data \
-order.""",
+that the data was randomly shuffled, it is believed that the actual epsilon \
+should be closer to this value than the conservative assumption of an \
+arbitrary data order.""",
           width=80,
       )
   )
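End to end, selecting PLD changes the example-level accounting and, per the branch above, replaces the user-level section of the report with the not-supported notice. A hedged usage sketch: the keyword names are inferred from identifiers appearing in the library code above (they are not confirmed by this diff, which calls the function positionally), and the values mirror the test kwargs below:

    # Sketch: request a privacy statement under PLD accounting.
    # Keyword names are assumptions inferred from the library code above.
    from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        number_of_examples=10000,
        batch_size=64,
        num_epochs=5.0,
        noise_multiplier=2.0,
        delta=1e-6,
        used_microbatching=True,
        max_examples_per_user=3,
        accountant_type=compute_dp_sgd_privacy_lib.AccountantType.PLD,
    )
    print(statement)  # Ends the user-level section with the PLD notice.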
--- tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib_test.py
+++ tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib_test.py
@@ -23,6 +23,8 @@ from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

 _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
 _user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
+_RDP = compute_dp_sgd_privacy_lib.AccountantType.RDP
+_PLD = compute_dp_sgd_privacy_lib.AccountantType.PLD


 DP_SGD_STATEMENT_KWARGS = dict(
@@ -81,13 +83,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
       _example_privacy(**args)

   @parameterized.named_parameters(
-      ('no_microbatching_no_subsampling', False, None, 10.8602036),
-      ('microbatching_no_subsampling', True, None, 26.2880374),
-      ('no_microbatching_with_subsampling', False, 1e-2, 3.2391922),
-      ('microbatching_with_subsampling', True, 1e-2, 22.5970358),
+      ('no_microbatching_no_subsampling_rdp', False, None, _RDP, 10.8602036),
+      ('microbatching_no_subsampling_rdp', True, None, _RDP, 26.2880374),
+      ('no_microbatching_with_subsampling_rdp', False, 1e-2, _RDP, 3.2391922),
+      ('microbatching_with_subsampling_rdp', True, 1e-2, _RDP, 22.5970358),
+      ('no_microbatching_no_subsampling_pld', False, None, _PLD, 10.1224946),
+      ('microbatching_no_subsampling_pld', True, None, _PLD, 24.7160779),
+      ('no_microbatching_with_subsampling_pld', False, 1e-2, _PLD, 2.4612381),
+      ('microbatching_with_subsampling_pld', True, 1e-2, _PLD, 18.6977407),
   )
   def test_compute_dp_sgd_example_privacy(
-      self, used_microbatching, poisson_subsampling_probability, expected_eps
+      self,
+      used_microbatching,
+      poisson_subsampling_probability,
+      accountant_type,
+      expected_eps,
   ):
     num_epochs = 1.2
     noise_multiplier = 0.7
@@ -98,6 +108,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         example_delta,
         used_microbatching,
         poisson_subsampling_probability,
+        accountant_type,
     )
     self.assertAlmostEqual(eps, expected_eps)

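Each tuple above binds positionally to the test arguments; unrolled, one of the new PLD cases amounts to the direct call below. The example_delta constant is set between the hunks shown here, so the 1e-5 is an assumed stand-in:

    # Sketch: the 'no_microbatching_with_subsampling_pld' case, unrolled.
    from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

    eps = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy(
        1.2,    # num_epochs
        0.7,    # noise_multiplier
        1e-5,   # example_delta -- assumed; the actual constant is elided here
        False,  # used_microbatching
        1e-2,   # poisson_subsampling_probability
        compute_dp_sgd_privacy_lib.AccountantType.PLD,
    )
    # Expected per the parameterization above: eps ~= 2.4612381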
@@ -119,17 +130,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
     with self.assertRaises(ValueError):
       _user_privacy(**args)

-  def test_user_privacy_one_example_per_user(self):
+  @parameterized.named_parameters(('RDP', _RDP), ('PLD', _PLD))
+  def test_user_privacy_one_example_per_user(self, accountant_type):
     num_epochs = 1.2
     noise_multiplier = 0.7
     delta = 1e-5

-    example_eps = _example_privacy(num_epochs, noise_multiplier, delta)
+    example_eps = _example_privacy(
+        num_epochs, noise_multiplier, delta, accountant_type=accountant_type
+    )
     user_eps = _user_privacy(
         num_epochs,
         noise_multiplier,
         delta,
         max_examples_per_user=1,
+        accountant_type=accountant_type,
     )
     self.assertEqual(user_eps, example_eps)

@@ -146,6 +161,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         noise_multiplier=noise_multiplier,
         example_delta=example_delta,
         poisson_subsampling_probability=q,
+        accountant_type=_RDP,
     )

     user_delta = math.exp(
@@ -161,12 +177,14 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
         user_delta=user_delta,
         max_examples_per_user=max_examples_per_user,
         poisson_subsampling_probability=q,
+        accountant_type=_RDP,
     )
     self.assertAlmostEqual(user_eps, example_eps * max_examples_per_user)

-  def test_dp_sgd_privacy_statement_no_user_dp(self):
+  def test_dp_sgd_privacy_statement_no_user_dp_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -185,16 +203,17 @@ No user-level privacy guarantee is possible without a bound on the number of
 examples per user.

 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 """
     self.assertEqual(statement, expected_statement)

-  def test_dp_sgd_privacy_statement_user_dp(self):
+  def test_dp_sgd_privacy_statement_user_dp_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
         max_examples_per_user=3,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -215,16 +234,17 @@ RDP accounting and group privacy:
 Epsilon assuming Poisson sampling (*): 6.425

 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 """
     self.assertEqual(statement, expected_statement)

-  def test_dp_sgd_privacy_statement_user_dp_infinite(self):
+  def test_dp_sgd_privacy_statement_user_dp_infinite_with_rdp(self):
     statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
         **DP_SGD_STATEMENT_KWARGS,
         max_examples_per_user=10,
+        accountant_type=_RDP,
     )
     expected_statement = """\
 DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@@ -245,14 +265,71 @@ RDP accounting and group privacy:
 Epsilon assuming Poisson sampling (*): inf (**)

 (*) Poisson sampling is not usually done in training pipelines, but assuming
-that the data was randomly shuffled, it is believed the actual epsilon should be
-closer to this value than the conservative assumption of an arbitrary data
-order.
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.

 (**) A finite example-level epsilon implies a finite user-level epsilon at any
 `max_examples_per_user`, but because conversion from example-level to user-level
 DP is not exact, it is possible for the upper bound on the user-level epsilon to
 still be infinite.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_no_user_dp_with_pld(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        accountant_type=_PLD,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
+examples per user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+PLD accounting:
+Epsilon with each example occurring once per epoch: 12.595
+Epsilon assuming Poisson sampling (*): 1.199
+
+No user-level privacy guarantee is possible without a bound on the number of
+examples per user.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_user_dp_with_pld(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        max_examples_per_user=3,
+        accountant_type=_PLD,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
+user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+PLD accounting:
+Epsilon with each example occurring once per epoch: 12.595
+Epsilon assuming Poisson sampling (*): 1.199
+
+User-level DP epsilon computation is not supported for PLD accounting at this
+time. Use RDP accounting to obtain user-level DP guarantees.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed that the actual epsilon
+should be closer to this value than the conservative assumption of an arbitrary
+data order.
 """
     self.assertEqual(statement, expected_statement)