Adds compute_dp_sgd_privacy_statement for accurate privacy accounting report.

PiperOrigin-RevId: 518934979
Parent: 52806ba952
Commit: d5d60e2eac

2 changed files with 279 additions and 16 deletions
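For orientation, a minimal usage sketch of the new report function added by this commit (a sketch only: the import path is an assumption, and the argument values mirror DP_SGD_STATEMENT_KWARGS from the tests below):

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

# Prints a multi-paragraph privacy report; passing max_examples_per_user
# additionally yields a user-level DP guarantee via group privacy.
print(
    compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        number_of_examples=10000,
        batch_size=64,
        num_epochs=5.0,
        noise_multiplier=2.0,
        delta=1e-6,
        max_examples_per_user=3,
    )
)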
@@ -15,6 +15,7 @@
 """Library for computing privacy values for DP-SGD."""
 
 import math
+import textwrap
 from typing import Optional
 
 from absl import app
@@ -224,6 +225,166 @@ def _compute_dp_sgd_example_privacy(
   return accountant.get_epsilon(example_delta)
 
 
+def compute_dp_sgd_privacy_statement(
+    number_of_examples: int,
+    batch_size: int,
+    num_epochs: float,
+    noise_multiplier: float,
+    delta: float,
+    used_microbatching: bool = True,
+    max_examples_per_user: Optional[int] = None,
+) -> str:
+  """Produces a privacy report summarizing the DP guarantee.
+
+  Args:
+    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
+      "example" corresponds to one row in a minibatch. E.g., for sequence
+      models this would be a sequence of maximum length.
+    batch_size: The number of examples in a batch, *regardless of whether/how
+      they are grouped into microbatches*.
+    num_epochs: The number of epochs of training. May be fractional.
+    noise_multiplier: The ratio of the Gaussian noise to the clip norm at each
+      round. It is assumed that the noise_multiplier is constant, although the
+      clip norm may be variable if, for example, adaptive clipping is used.
+    delta: The target delta.
+    used_microbatching: Whether microbatching was used (with microbatch size
+      greater than one). Microbatching inflates sensitivity by a factor of two
+      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
+      Guide to Machine Learning with Differential Privacy",
+      https://arxiv.org/abs/2303.00654, Sec 5.6.)
+    max_examples_per_user: If the data set is constructed to cap the maximum
+      number of examples each user contributes, provide this argument to also
+      print a user-level DP guarantee.
+
+  Returns:
+    A str precisely articulating the privacy guarantee.
+  """
+
+  paragraph = f"""\
+DP-SGD performed over {number_of_examples} examples with {batch_size} \
+examples per iteration, noise multiplier {noise_multiplier} for {num_epochs} \
+epochs {'with' if used_microbatching else 'without'} microbatching"""
+
+  if max_examples_per_user is None:
+    paragraph += ', and no bound on number of examples per user.'
+  else:
+    paragraph += f', and at most {max_examples_per_user} examples per user.'
+
+  paragraphs = [textwrap.fill(paragraph, width=80)]
+
+  paragraphs.append(
+      textwrap.fill(
+          """\
+This privacy guarantee protects the release of all model checkpoints in \
+addition to the final model.""",
+          width=80,
+      )
+  )
+
+  paragraph = textwrap.fill(
+      f"""\
+Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
+with RDP accounting:""",
+      width=80,
+  )
+
+  example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
+      num_epochs, noise_multiplier, delta, used_microbatching
+  )
+  example_eps_subsampling = _compute_dp_sgd_example_privacy(
+      num_epochs,
+      noise_multiplier,
+      delta,
+      used_microbatching,
+      poisson_subsampling_probability=batch_size / number_of_examples,
+  )
+
+  paragraph += f"""
+    Epsilon with each example occurring once per epoch: \
+{example_eps_no_subsampling:12.3f}
+    Epsilon assuming Poisson sampling (*):              \
+{example_eps_subsampling:12.3f}"""
+
+  paragraphs.append(paragraph)
+
+  inf_user_eps = False
+  if max_examples_per_user is not None:
+    user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
+        num_epochs,
+        noise_multiplier,
+        delta,
+        max_examples_per_user,
+        used_microbatching,
+    )
+    user_eps_subsampling = _compute_dp_sgd_user_privacy(
+        num_epochs,
+        noise_multiplier,
+        delta,
+        max_examples_per_user,
+        used_microbatching,
+        poisson_subsampling_probability=batch_size / number_of_examples,
+    )
+    if math.isinf(user_eps_no_subsampling):
+      user_eps_no_subsampling_str = '    inf (**)'
+      inf_user_eps = True
+    else:
+      user_eps_no_subsampling_str = f'{user_eps_no_subsampling:12.3f}'
+    if math.isinf(user_eps_subsampling):
+      user_eps_subsampling_str = '    inf (**)'
+      inf_user_eps = True
+    else:
+      user_eps_subsampling_str = f'{user_eps_subsampling:12.3f}'
+
+    paragraph = textwrap.fill(
+        f"""\
+User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
+using RDP accounting and group privacy:""",
+        width=80,
+    )
+    paragraph += f"""
+    Epsilon with each example occurring once per epoch: \
+{user_eps_no_subsampling_str}
+    Epsilon assuming Poisson sampling (*):              \
+{user_eps_subsampling_str}"""
+
+    paragraphs.append(paragraph)
+  else:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+No user-level privacy guarantee is possible without a bound on the number of \
+examples per user.""",
+            width=80,
+        )
+    )
+
+  paragraphs.append(
+      textwrap.fill(
+          """\
+(*) Poisson sampling is not usually done in training pipelines, but assuming \
+that the data was randomly shuffled, it is believed the actual epsilon should \
+be closer to this value than the conservative assumption of an arbitrary data \
+order.""",
+          width=80,
+      )
+  )
+
+  if inf_user_eps:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+(**) A finite example-level epsilon implies a finite user-level epsilon at any \
+`max_examples_per_user`, but because conversion from example-level to user-\
+level DP is not exact, it is possible for the upper bound on the user-level \
+epsilon to still be infinite.""",
+            width=80,
+        )
+    )
+
+  return '\n\n'.join(paragraphs) + '\n'
+
+
 def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   """Compute epsilon based on the given hyperparameters.
 
@@ -231,12 +392,11 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   with microbatching, and assumes Poisson subsampling, which is rarely used in
   practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
   Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
-  should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
-  which provides appropriate context for the guarantee (see the reporting
-  recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
-  value under specific assumptions, it is recommended to use the `dp_accounting`
-  libraries directly to compute epsilon, with the precise and correct
-  assumptions of your application.
+  should call `compute_dp_sgd_privacy_statement`, which provides appropriate
+  context for the guarantee (see the reporting recommendations in "How to DP-fy
+  ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
+  it is recommended to use the `dp_accounting` libraries directly to compute
+  epsilon, with the precise and correct assumptions of your application.
 
   Args:
     n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   Returns:
     A 2-tuple containing the value of epsilon and the optimal RDP order.
   """
-  # TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
-  logging.warn(
-      '`compute_dp_sgd_privacy` is deprecated. It does not account '
-      'for doubling of sensitivity with microbatching, and assumes Poisson '
-      'subsampling, which is rarely used in practice. Please use the '
-      '`dp_accounting` libraries directly to compute epsilon, using the '
-      'precise and correct assumptions of your application.'
-  )
+  logging.warn("""\
+`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
+sensitivity with microbatching, and assumes Poisson subsampling, which is \
+rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
+provides appropriate context for the guarantee. To compute epsilon under \
+different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
+the `dp_accounting` libraries directly.""")
 
   q = batch_size / n  # q - the sampling ratio.
   if q > 1:
     raise app.UsageError('n must be larger than the batch size.')
-  orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
-            list(range(5, 64)) + [128, 256, 512])
+  orders = (
+      [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
+      + list(range(5, 64))
+      + [128, 256, 512]
+  )
   steps = int(math.ceil(epochs * n / batch_size))
   accountant = dp_accounting.rdp.RdpAccountant(orders)
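The deprecation warning above redirects users either to the new `compute_dp_sgd_privacy_statement` or to the `dp_accounting` libraries. A rough sketch (not part of this commit) of the direct `dp_accounting` computation that `compute_dp_sgd_privacy` performs, under the same Poisson-subsampling assumption; the event/accountant classes are from the open-source `dp_accounting` package:

import math

import dp_accounting

# Illustrative hyperparameters, mirroring the test values below.
n, batch_size, noise_multiplier, epochs, delta = 10000, 64, 2.0, 5.0, 1e-6

q = batch_size / n  # Poisson sampling probability.
steps = int(math.ceil(epochs * n / batch_size))
orders = (
    [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
    + list(range(5, 64))
    + [128, 256, 512]
)

# One DP-SGD step adds Gaussian noise to a Poisson-sampled batch; the full
# run is that event composed with itself over all steps.
event = dp_accounting.SelfComposedDpEvent(
    dp_accounting.PoissonSampledDpEvent(
        q, dp_accounting.GaussianDpEvent(noise_multiplier)
    ),
    steps,
)

accountant = dp_accounting.rdp.RdpAccountant(orders)
accountant.compose(event)
print(accountant.get_epsilon(delta))  # Example-level epsilon at this delta.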
@@ -25,6 +25,15 @@ _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
 _user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
 
 
+DP_SGD_STATEMENT_KWARGS = dict(
+    number_of_examples=10000,
+    batch_size=64,
+    num_epochs=5.0,
+    noise_multiplier=2.0,
+    delta=1e-6,
+)
+
+
 class ComputeDpSgdPrivacyTest(parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -145,6 +154,98 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
     )
     self.assertAlmostEqual(user_eps, example_eps * k)
 
+  def test_dp_sgd_privacy_statement_no_user_dp(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
+examples per user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+RDP accounting:
+    Epsilon with each example occurring once per epoch:       13.376
+    Epsilon assuming Poisson sampling (*):                     1.616
+
+No user-level privacy guarantee is possible without a bound on the number of
+examples per user.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed the actual epsilon should be
+closer to this value than the conservative assumption of an arbitrary data
+order.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_user_dp(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        max_examples_per_user=3,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
+user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+RDP accounting:
+    Epsilon with each example occurring once per epoch:       13.376
+    Epsilon assuming Poisson sampling (*):                     1.616
+
+User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
+RDP accounting and group privacy:
+    Epsilon with each example occurring once per epoch:      113.899
+    Epsilon assuming Poisson sampling (*):                     8.129
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed the actual epsilon should be
+closer to this value than the conservative assumption of an arbitrary data
+order.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_user_dp_infinite(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        max_examples_per_user=9,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per
+user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+RDP accounting:
+    Epsilon with each example occurring once per epoch:       13.376
+    Epsilon assuming Poisson sampling (*):                     1.616
+
+User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
+RDP accounting and group privacy:
+    Epsilon with each example occurring once per epoch:     inf (**)
+    Epsilon assuming Poisson sampling (*):                  inf (**)
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed the actual epsilon should be
+closer to this value than the conservative assumption of an arbitrary data
+order.
+
+(**) A finite example-level epsilon implies a finite user-level epsilon at any
+`max_examples_per_user`, but because conversion from example-level to user-level
+DP is not exact, it is possible for the upper bound on the user-level epsilon to
+still be infinite.
+"""
+    self.assertEqual(statement, expected_statement)
+
+
 if __name__ == '__main__':
   absltest.main()
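Beyond the golden-string tests above, a quick sanity check one might run by hand (illustrative only: the import path is assumed, and these underscore-prefixed helpers are private, not supported API) is that bounding examples per user can only weaken the guarantee relative to example-level DP:

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib as lib

# Example-level epsilon for the tests' hyperparameters.
example_eps = lib._compute_dp_sgd_example_privacy(5.0, 2.0, 1e-6, True)
# User-level epsilon via group privacy, with at most 3 examples per user.
user_eps = lib._compute_dp_sgd_user_privacy(5.0, 2.0, 1e-6, 3, True)

# Group privacy over k >= 1 examples can only increase epsilon.
assert user_eps >= example_eps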