diff --git a/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py b/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py
index 678824a..7e3d567 100644
--- a/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py
+++ b/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_lib.py
@@ -15,6 +15,7 @@
 """Library for computing privacy values for DP-SGD."""
 
 import math
+import textwrap
 from typing import Optional
 
 from absl import app
@@ -224,6 +225,166 @@ def _compute_dp_sgd_example_privacy(
   return accountant.get_epsilon(example_delta)
 
 
+def compute_dp_sgd_privacy_statement(
+    number_of_examples: int,
+    batch_size: int,
+    num_epochs: float,
+    noise_multiplier: float,
+    delta: float,
+    used_microbatching: bool = True,
+    max_examples_per_user: Optional[int] = None,
+) -> str:
+  """Produces a privacy report summarizing the DP guarantee.
+
+  Args:
+    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
+      "example" corresponds to one row in a minibatch. E.g., for sequence models
+      this would be a sequence of maximum length.
+    batch_size: The number of examples in a batch. This should be the number of
+      examples in a batch, *regardless of whether/how they are grouped into
+      microbatches*.
+    num_epochs: The number of epochs of training. May be fractional.
+    noise_multiplier: The ratio of the Gaussian noise to the clip norm at each
+      round. It is assumed that the noise_multiplier is constant although the
+      clip norm may be variable if, for example, adaptive clipping is used.
+    delta: The target delta.
+    used_microbatching: Whether microbatching was used (with microbatch size
+      greater than one). Microbatching inflates sensitivity by a factor of two
+      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
+      Guide to Machine Learning with Differential Privacy",
+      https://arxiv.org/abs/2303.00654, Sec 5.6.)
+    max_examples_per_user: If the data set is constructed to cap the maximum
+      number of examples each user contributes, provide this argument to also
+      print a user-level DP guarantee.
+
+  Returns:
+    A str precisely articulating the privacy guarantee.
+  """
+
+  paragraph = f"""\
+DP-SGD performed over {number_of_examples} examples with {batch_size} \
+examples per iteration, noise multiplier {noise_multiplier} for {num_epochs} \
+epochs {'with' if used_microbatching else 'without'} microbatching"""
+
+  if max_examples_per_user is None:
+    paragraph += ', and no bound on number of examples per user.'
+  else:
+    paragraph += f', and at most {max_examples_per_user} examples per user.'
+
+  paragraphs = [textwrap.fill(paragraph, width=80)]
+
+  paragraphs.append(
+      textwrap.fill(
+          """\
+This privacy guarantee protects the release of all model checkpoints in \
+addition to the final model.""",
+          width=80,
+      )
+  )
+
+  paragraph = textwrap.fill(
+      f"""\
+Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
+with RDP accounting:""",
+      width=80,
+  )
+
+  example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
+      num_epochs, noise_multiplier, delta, used_microbatching
+  )
+  example_eps_subsampling = _compute_dp_sgd_example_privacy(
+      num_epochs,
+      noise_multiplier,
+      delta,
+      used_microbatching,
+      poisson_subsampling_probability=batch_size / number_of_examples,
+  )
+
+  paragraph += f"""
+    Epsilon with each example occurring once per epoch: \
+{example_eps_no_subsampling:12.3f}
+    Epsilon assuming Poisson sampling (*): \
+{example_eps_subsampling:12.3f}"""
+
+  paragraphs.append(paragraph)
+
+  inf_user_eps = False
+  if max_examples_per_user is not None:
+    user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
+        num_epochs,
+        noise_multiplier,
+        delta,
+        max_examples_per_user,
+        used_microbatching,
+    )
+    user_eps_subsampling = _compute_dp_sgd_user_privacy(
+        num_epochs,
+        noise_multiplier,
+        delta,
+        max_examples_per_user,
+        used_microbatching,
+        poisson_subsampling_probability=batch_size / number_of_examples,
+    )
+    if math.isinf(user_eps_no_subsampling):
+      user_eps_no_subsampling_str = '         inf (**)'
+      inf_user_eps = True
+    else:
+      user_eps_no_subsampling_str = f'{user_eps_no_subsampling:12.3f}'
+    if math.isinf(user_eps_subsampling):
+      user_eps_subsampling_str = '         inf (**)'
+      inf_user_eps = True
+    else:
+      user_eps_subsampling_str = f'{user_eps_subsampling:12.3f}'
+
+    paragraph = textwrap.fill(
+        f"""\
+User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
+using RDP accounting and group privacy:""",
+        width=80,
+    )
+    paragraph += f"""
+    Epsilon with each example occurring once per epoch: \
+{user_eps_no_subsampling_str}
+    Epsilon assuming Poisson sampling (*): \
+{user_eps_subsampling_str}"""
+
+    paragraphs.append(paragraph)
+  else:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+No user-level privacy guarantee is possible without a bound on the number of \
+examples per user.""",
+            width=80,
+        )
+    )
+
+  paragraphs.append(
+      textwrap.fill(
+          """\
+(*) Poisson sampling is not usually done in training pipelines, but assuming \
+that the data was randomly shuffled, it is believed the actual epsilon should \
+be closer to this value than the conservative assumption of an arbitrary data \
+order.""",
+          width=80,
+      )
+  )
+
+  if inf_user_eps:
+    paragraphs.append(
+        textwrap.fill(
+            """\
+(**) A finite example-level epsilon implies a finite user-level epsilon at any \
+`max_examples_per_user`, but because conversion from example-level to user-\
+level DP is not exact, it is possible for the upper bound on the user-level \
+epsilon to still be infinite.""",
+            width=80,
+        )
+    )
+
+  return '\n\n'.join(paragraphs) + '\n'
+
+
 def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   """Compute epsilon based on the given hyperparameters.
 
@@ -231,12 +392,11 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   with microbatching, and assumes Poisson subsampling, which is rarely used in
   practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
   Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
-  should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
-  which provides appropriate context for the guarantee (see the reporting
-  recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
-  value under specific assumptions, it is recommended to use the `dp_accounting`
-  libraries directly to compute epsilon, with the precise and correct
-  assumptions of your application.
+  should call `compute_dp_sgd_privacy_statement`, which provides appropriate
+  context for the guarantee (see the reporting recommendations in "How to DP-fy
+  ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
+  it is recommended to use the `dp_accounting` libraries directly to compute
+  epsilon, with the precise and correct assumptions of your application.
 
   Args:
     n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
   Returns:
     A 2-tuple containing the value of epsilon and the optimal RDP order.
   """
-  # TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
-  logging.warn(
-      '`compute_dp_sgd_privacy` is deprecated. It does not account '
-      'for doubling of sensitivity with microbatching, and assumes Poisson '
-      'subsampling, which is rarely used in practice. Please use the '
-      '`dp_accounting` libraries directly to compute epsilon, using the '
-      'precise and correct assumptions of your application.'
-  )
+  logging.warn("""\
+`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
+sensitivity with microbatching, and assumes Poisson subsampling, which is \
+rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
+provides appropriate context for the guarantee. To compute epsilon under \
+different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
+the `dp_accounting` libraries directly.""")
 
   q = batch_size / n  # q - the sampling ratio.
   if q > 1:
     raise app.UsageError('n must be larger than the batch size.')
-  orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
-            list(range(5, 64)) + [128, 256, 512])
+  orders = (
+      [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
+      + list(range(5, 64))
+      + [128, 256, 512]
+  )
   steps = int(math.ceil(epochs * n / batch_size))
 
   accountant = dp_accounting.rdp.RdpAccountant(orders)
diff --git a/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py b/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py
index 99a9333..975bd1b 100644
--- a/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py
+++ b/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy_test.py
@@ -25,6 +25,15 @@
 _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
 _user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
 
+DP_SGD_STATEMENT_KWARGS = dict(
+    number_of_examples=10000,
+    batch_size=64,
+    num_epochs=5.0,
+    noise_multiplier=2.0,
+    delta=1e-6,
+)
+
+
 class ComputeDpSgdPrivacyTest(parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -145,6 +154,98 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
     )
     self.assertAlmostEqual(user_eps, example_eps * k)
 
+  def test_dp_sgd_privacy_statement_no_user_dp(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
+examples per user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+RDP accounting:
+    Epsilon with each example occurring once per epoch:       13.376
+    Epsilon assuming Poisson sampling (*):        1.616
+
+No user-level privacy guarantee is possible without a bound on the number of
+examples per user.
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed the actual epsilon should be
+closer to this value than the conservative assumption of an arbitrary data
+order.
+"""
+    self.assertEqual(statement, expected_statement)
+
+  def test_dp_sgd_privacy_statement_user_dp(self):
+    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
+        **DP_SGD_STATEMENT_KWARGS,
+        max_examples_per_user=3,
+    )
+    expected_statement = """\
+DP-SGD performed over 10000 examples with 64 examples per iteration, noise
+multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
+user.
+
+This privacy guarantee protects the release of all model checkpoints in addition
+to the final model.
+
+Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
+RDP accounting:
+    Epsilon with each example occurring once per epoch:       13.376
+    Epsilon assuming Poisson sampling (*):        1.616
+
+User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
+RDP accounting and group privacy:
+    Epsilon with each example occurring once per epoch:      113.899
+    Epsilon assuming Poisson sampling (*):        8.129
+
+(*) Poisson sampling is not usually done in training pipelines, but assuming
+that the data was randomly shuffled, it is believed the actual epsilon should be
+closer to this value than the conservative assumption of an arbitrary data
+order.
+""" + self.assertEqual(statement, expected_statement) + + def test_dp_sgd_privacy_statement_user_dp_infinite(self): + statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement( + **DP_SGD_STATEMENT_KWARGS, + max_examples_per_user=9, + ) + expected_statement = """\ +DP-SGD performed over 10000 examples with 64 examples per iteration, noise +multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per +user. + +This privacy guarantee protects the release of all model checkpoints in addition +to the final model. + +Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with +RDP accounting: + Epsilon with each example occurring once per epoch: 13.376 + Epsilon assuming Poisson sampling (*): 1.616 + +User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using +RDP accounting and group privacy: + Epsilon with each example occurring once per epoch: inf (**) + Epsilon assuming Poisson sampling (*): inf (**) + +(*) Poisson sampling is not usually done in training pipelines, but assuming +that the data was randomly shuffled, it is believed the actual epsilon should be +closer to this value than the conservative assumption of an arbitrary data +order. + +(**) A finite example-level epsilon implies a finite user-level epsilon at any +`max_examples_per_user`, but because conversion from example-level to user-level +DP is not exact, it is possible for the upper bound on the user-level epsilon to +still be infinite. +""" + self.assertEqual(statement, expected_statement) + if __name__ == '__main__': absltest.main()