forked from 626_privacy/tensorflow_privacy
Adds compute_dp_sgd_privacy_statement for accurate privacy accounting report.

PiperOrigin-RevId: 518934979

parent 52806ba952
commit d5d60e2eac
2 changed files with 279 additions and 16 deletions
@@ -15,6 +15,7 @@
"""Library for computing privacy values for DP-SGD."""

import math
import textwrap
from typing import Optional

from absl import app
@@ -224,6 +225,166 @@ def _compute_dp_sgd_example_privacy(
  return accountant.get_epsilon(example_delta)

def compute_dp_sgd_privacy_statement(
    number_of_examples: int,
    batch_size: int,
    num_epochs: float,
    noise_multiplier: float,
    delta: float,
    used_microbatching: bool = True,
    max_examples_per_user: Optional[int] = None,
) -> str:
  """Produces a privacy report summarizing the DP guarantee.

  Args:
    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
      "example" corresponds to one row in a minibatch. E.g., for sequence
      models this would be a sequence of maximum length.
    batch_size: The number of examples in a batch, *regardless of whether/how
      they are grouped into microbatches*.
    num_epochs: The number of epochs of training. May be fractional.
    noise_multiplier: The ratio of the Gaussian noise standard deviation to the
      clip norm at each round. It is assumed that the noise multiplier is
      constant, although the clip norm may be variable if, for example,
      adaptive clipping is used.
    delta: The target delta.
    used_microbatching: Whether microbatching was used (with microbatch size
      greater than one). Microbatching inflates sensitivity by a factor of two
      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
      Guide to Machine Learning with Differential Privacy",
      https://arxiv.org/abs/2303.00654, Sec 5.6.)
    max_examples_per_user: If the data set is constructed to cap the maximum
      number of examples each user contributes, provide this argument to also
      print a user-level DP guarantee.

  Returns:
    A str precisely articulating the privacy guarantee.
  """

  paragraph = f"""\
DP-SGD performed over {number_of_examples} examples with {batch_size} \
examples per iteration, noise multiplier {noise_multiplier} for {num_epochs} \
epochs {'with' if used_microbatching else 'without'} microbatching"""

  if max_examples_per_user is None:
    paragraph += ', and no bound on number of examples per user.'
  else:
    paragraph += f', and at most {max_examples_per_user} examples per user.'

  paragraphs = [textwrap.fill(paragraph, width=80)]

  paragraphs.append(
      textwrap.fill(
          """\
This privacy guarantee protects the release of all model checkpoints in \
addition to the final model.""",
          width=80,
      )
  )

  paragraph = textwrap.fill(
      f"""\
Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
with RDP accounting:""",
      width=80,
  )

  example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs, noise_multiplier, delta, used_microbatching
  )
  example_eps_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs,
      noise_multiplier,
      delta,
      used_microbatching,
      poisson_subsampling_probability=batch_size / number_of_examples,
  )

  paragraph += f"""
Epsilon with each example occurring once per epoch: \
{example_eps_no_subsampling:12.3f}
Epsilon assuming Poisson sampling (*): \
{example_eps_subsampling:12.3f}"""

  paragraphs.append(paragraph)

  inf_user_eps = False
  if max_examples_per_user is not None:
    user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
    )
    user_eps_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
        poisson_subsampling_probability=batch_size / number_of_examples,
    )
    if math.isinf(user_eps_no_subsampling):
      user_eps_no_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_no_subsampling_str = f'{user_eps_no_subsampling:12.3f}'
    if math.isinf(user_eps_subsampling):
      user_eps_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_subsampling_str = f'{user_eps_subsampling:12.3f}'

    paragraph = textwrap.fill(
        f"""\
User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
using RDP accounting and group privacy:""",
        width=80,
    )
    paragraph += f"""
Epsilon with each example occurring once per epoch: \
{user_eps_no_subsampling_str}
Epsilon assuming Poisson sampling (*): \
{user_eps_subsampling_str}"""

    paragraphs.append(paragraph)
  else:
    paragraphs.append(
        textwrap.fill(
            """\
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
            width=80,
        )
    )

  paragraphs.append(
      textwrap.fill(
          """\
(*) Poisson sampling is not usually done in training pipelines, but assuming \
that the data was randomly shuffled, it is believed the actual epsilon should \
be closer to this value than the conservative assumption of an arbitrary data \
order.""",
          width=80,
      )
  )

  if inf_user_eps:
    paragraphs.append(
        textwrap.fill(
            """\
(**) A finite example-level epsilon implies a finite user-level epsilon at any \
`max_examples_per_user`, but because conversion from example-level to user-\
level DP is not exact, it is possible for the upper bound on the user-level \
epsilon to still be infinite.""",
            width=80,
        )
    )

  return '\n\n'.join(paragraphs) + '\n'

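A minimal usage sketch of the new report function (illustrative only; the import path and all hyperparameter values below are assumptions, not part of this commit):

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
    number_of_examples=50000,
    batch_size=128,
    num_epochs=10.0,
    noise_multiplier=1.1,
    delta=1e-5,
    max_examples_per_user=1,  # omit to report only the example-level guarantee
)
print(statement)
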

def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  """Compute epsilon based on the given hyperparameters.

@@ -231,12 +392,11 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  with microbatching, and assumes Poisson subsampling, which is rarely used in
  practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
  Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
-  should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
-  which provides appropriate context for the guarantee (see the reporting
-  recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
-  value under specific assumptions, it is recommended to use the `dp_accounting`
-  libraries directly to compute epsilon, with the precise and correct
-  assumptions of your application.
+  should call `compute_dp_sgd_privacy_statement`, which provides appropriate
+  context for the guarantee (see the reporting recommendations in "How to DP-fy
+  ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
+  it is recommended to use the `dp_accounting` libraries directly to compute
+  epsilon, with the precise and correct assumptions of your application.

  Args:
    n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  Returns:
    A 2-tuple containing the value of epsilon and the optimal RDP order.
  """
-  # TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
-  logging.warn(
-      '`compute_dp_sgd_privacy` is deprecated. It does not account '
-      'for doubling of sensitivity with microbatching, and assumes Poisson '
-      'subsampling, which is rarely used in practice. Please use the '
-      '`dp_accounting` libraries directly to compute epsilon, using the '
-      'precise and correct assumptions of your application.'
-  )
+  logging.warn("""\
+`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
+sensitivity with microbatching, and assumes Poisson subsampling, which is \
+rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
+provides appropriate context for the guarantee. To compute epsilon under \
+different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
+the `dp_accounting` libraries directly.""")

  q = batch_size / n  # q - the sampling ratio.
  if q > 1:
    raise app.UsageError('n must be larger than the batch size.')
-  orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
-            list(range(5, 64)) + [128, 256, 512])
+  orders = (
+      [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
+      + list(range(5, 64))
+      + [128, 256, 512]
+  )
  steps = int(math.ceil(epochs * n / batch_size))
  accountant = dp_accounting.rdp.RdpAccountant(orders)

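As the updated guidance suggests, epsilon under assumptions other than those baked into `compute_dp_sgd_privacy_statement` can be computed with the `dp_accounting` libraries directly. A minimal sketch, assuming a Poisson-subsampled Gaussian mechanism; the hyperparameter values and the choice of RDP orders below are hypothetical:

import math

import dp_accounting

n, batch_size, num_epochs = 50000, 128, 10.0  # hypothetical values
noise_multiplier, delta = 1.1, 1e-5           # hypothetical values

sampling_probability = batch_size / n
steps = int(math.ceil(num_epochs * n / batch_size))
orders = [1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 4.0, 8.0, 16.0, 32.0, 64.0]

# Compose `steps` rounds of the Poisson-subsampled Gaussian mechanism and
# convert the resulting RDP bound to (epsilon, delta)-DP.
accountant = dp_accounting.rdp.RdpAccountant(orders)
accountant.compose(
    dp_accounting.SelfComposedDpEvent(
        dp_accounting.PoissonSampledDpEvent(
            sampling_probability, dp_accounting.GaussianDpEvent(noise_multiplier)
        ),
        steps,
    )
)
epsilon = accountant.get_epsilon(delta)
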

@@ -25,6 +25,15 @@ _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
_user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy


DP_SGD_STATEMENT_KWARGS = dict(
    number_of_examples=10000,
    batch_size=64,
    num_epochs=5.0,
    noise_multiplier=2.0,
    delta=1e-6,
)


class ComputeDpSgdPrivacyTest(parameterized.TestCase):

  @parameterized.named_parameters(

@@ -145,6 +154,98 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
    )
    self.assertAlmostEqual(user_eps, example_eps * k)

  def test_dp_sgd_privacy_statement_no_user_dp(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
examples per user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

No user-level privacy guarantee is possible without a bound on the number of
examples per user.

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
    self.assertEqual(statement, expected_statement)

  def test_dp_sgd_privacy_statement_user_dp(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
        max_examples_per_user=3,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: 113.899
Epsilon assuming Poisson sampling (*): 8.129

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
    self.assertEqual(statement, expected_statement)

  def test_dp_sgd_privacy_statement_user_dp_infinite(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
        max_examples_per_user=9,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: inf (**)
Epsilon assuming Poisson sampling (*): inf (**)

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.

(**) A finite example-level epsilon implies a finite user-level epsilon at any
`max_examples_per_user`, but because conversion from example-level to user-level
DP is not exact, it is possible for the upper bound on the user-level epsilon to
still be infinite.
"""
    self.assertEqual(statement, expected_statement)


if __name__ == '__main__':
  absltest.main()