forked from 626_privacy/tensorflow_privacy
Adds compute_dp_sgd_privacy_statement for accurate privacy accounting report.

PiperOrigin-RevId: 518934979

parent 52806ba952
commit d5d60e2eac
2 changed files with 279 additions and 16 deletions
@@ -15,6 +15,7 @@
"""Library for computing privacy values for DP-SGD."""

import math
import textwrap
from typing import Optional

from absl import app
@@ -224,6 +225,166 @@ def _compute_dp_sgd_example_privacy(
  return accountant.get_epsilon(example_delta)

def compute_dp_sgd_privacy_statement(
    number_of_examples: int,
    batch_size: int,
    num_epochs: float,
    noise_multiplier: float,
    delta: float,
    used_microbatching: bool = True,
    max_examples_per_user: Optional[int] = None,
) -> str:
  """Produces a privacy report summarizing the DP guarantee.

  Args:
    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
      "example" corresponds to one row in a minibatch. E.g., for sequence
      models this would be a sequence of maximum length.
    batch_size: The number of examples in a batch, *regardless of whether/how
      they are grouped into microbatches*.
    num_epochs: The number of epochs of training. May be fractional.
    noise_multiplier: The ratio of the Gaussian noise standard deviation to the
      clip norm at each round. It is assumed that the noise multiplier is
      constant, although the clip norm may be variable if, for example,
      adaptive clipping is used.
    delta: The target delta.
    used_microbatching: Whether microbatching was used (with microbatch size
      greater than one). Microbatching inflates sensitivity by a factor of two
      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
      Guide to Machine Learning with Differential Privacy",
      https://arxiv.org/abs/2303.00654, Sec 5.6.)
    max_examples_per_user: If the data set is constructed to cap the maximum
      number of examples each user contributes, provide this argument to also
      print a user-level DP guarantee.

  Returns:
    A str precisely articulating the privacy guarantee.
  """

  paragraph = f"""\
DP-SGD performed over {number_of_examples} examples with {batch_size} \
examples per iteration, noise multiplier {noise_multiplier} for {num_epochs} \
epochs {'with' if used_microbatching else 'without'} microbatching"""

  if max_examples_per_user is None:
    paragraph += ', and no bound on number of examples per user.'
  else:
    paragraph += f', and at most {max_examples_per_user} examples per user.'

  paragraphs = [textwrap.fill(paragraph, width=80)]

  paragraphs.append(
      textwrap.fill(
          """\
This privacy guarantee protects the release of all model checkpoints in \
addition to the final model.""",
          width=80,
      )
  )

  paragraph = textwrap.fill(
      f"""\
Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
with RDP accounting:""",
      width=80,
  )

  example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs, noise_multiplier, delta, used_microbatching
  )
  example_eps_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs,
      noise_multiplier,
      delta,
      used_microbatching,
      poisson_subsampling_probability=batch_size / number_of_examples,
  )

  paragraph += f"""
Epsilon with each example occurring once per epoch: \
{example_eps_no_subsampling:12.3f}
Epsilon assuming Poisson sampling (*): \
{example_eps_subsampling:12.3f}"""

  paragraphs.append(paragraph)

  inf_user_eps = False
  if max_examples_per_user is not None:
    user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
    )
    user_eps_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
        poisson_subsampling_probability=batch_size / number_of_examples,
    )
    if math.isinf(user_eps_no_subsampling):
      user_eps_no_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_no_subsampling_str = f'{user_eps_no_subsampling:12.3f}'
    if math.isinf(user_eps_subsampling):
      user_eps_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_subsampling_str = f'{user_eps_subsampling:12.3f}'

    paragraph = textwrap.fill(
        f"""\
User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
using RDP accounting and group privacy:""",
        width=80,
    )
    paragraph += f"""
Epsilon with each example occurring once per epoch: \
{user_eps_no_subsampling_str}
Epsilon assuming Poisson sampling (*): \
{user_eps_subsampling_str}"""

    paragraphs.append(paragraph)
  else:
    paragraphs.append(
        textwrap.fill(
            """\
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
            width=80,
        )
    )

  paragraphs.append(
      textwrap.fill(
          """\
(*) Poisson sampling is not usually done in training pipelines, but assuming \
that the data was randomly shuffled, it is believed the actual epsilon should \
be closer to this value than the conservative assumption of an arbitrary data \
order.""",
          width=80,
      )
  )

  if inf_user_eps:
    paragraphs.append(
        textwrap.fill(
            """\
(**) A finite example-level epsilon implies a finite user-level epsilon at any \
`max_examples_per_user`, but because conversion from example-level to user-\
level DP is not exact, it is possible for the upper bound on the user-level \
epsilon to still be infinite.""",
            width=80,
        )
    )

  return '\n\n'.join(paragraphs) + '\n'

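A minimal usage sketch of the new report function (illustrative only; the import path and all hyperparameter values below are assumptions, not part of this commit):

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib

statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
    number_of_examples=50000,
    batch_size=128,
    num_epochs=10.0,
    noise_multiplier=1.1,
    delta=1e-5,
    max_examples_per_user=1,  # omit to report only the example-level guarantee
)
print(statement)
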

def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  """Compute epsilon based on the given hyperparameters.

@@ -231,12 +392,11 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  with microbatching, and assumes Poisson subsampling, which is rarely used in
  practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
  Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most users
-  should call `compute_dp_sgd_privacy_statement` (which will be added shortly),
-  which provides appropriate context for the guarantee (see the reporting
-  recommendations in "How to DP-fy ML", Sec 5.3). If you need a numeric epsilon
-  value under specific assumptions, it is recommended to use the `dp_accounting`
-  libraries directly to compute epsilon, with the precise and correct
-  assumptions of your application.
+  should call `compute_dp_sgd_privacy_statement`, which provides appropriate
+  context for the guarantee (see the reporting recommendations in "How to DP-fy
+  ML", Sec 5.3). If you need a numeric epsilon value under specific assumptions,
+  it is recommended to use the `dp_accounting` libraries directly to compute
+  epsilon, with the precise and correct assumptions of your application.

  Args:
    n: Number of examples in the training data.
@@ -248,20 +408,22 @@ def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  Returns:
    A 2-tuple containing the value of epsilon and the optimal RDP order.
  """
-  # TODO(b/265168958): Update this text for `compute_dp_sgd_privacy_statement`.
-  logging.warn(
-      '`compute_dp_sgd_privacy` is deprecated. It does not account '
-      'for doubling of sensitivity with microbatching, and assumes Poisson '
-      'subsampling, which is rarely used in practice. Please use the '
-      '`dp_accounting` libraries directly to compute epsilon, using the '
-      'precise and correct assumptions of your application.'
-  )
+  logging.warn("""\
+`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
+sensitivity with microbatching, and assumes Poisson subsampling, which is \
+rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
+provides appropriate context for the guarantee. To compute epsilon under \
+different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
+the `dp_accounting` libraries directly.""")

  q = batch_size / n  # q - the sampling ratio.
  if q > 1:
    raise app.UsageError('n must be larger than the batch size.')
-  orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
-            list(range(5, 64)) + [128, 256, 512])
+  orders = (
+      [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
+      + list(range(5, 64))
+      + [128, 256, 512]
+  )
  steps = int(math.ceil(epochs * n / batch_size))
  accountant = dp_accounting.rdp.RdpAccountant(orders)

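As the updated guidance suggests, epsilon under assumptions other than those baked into `compute_dp_sgd_privacy_statement` can be computed with the `dp_accounting` libraries directly. A minimal sketch, assuming a Poisson-subsampled Gaussian mechanism; the hyperparameter values and the choice of RDP orders below are hypothetical:

import math

import dp_accounting

n, batch_size, num_epochs = 50000, 128, 10.0  # hypothetical values
noise_multiplier, delta = 1.1, 1e-5           # hypothetical values

sampling_probability = batch_size / n
steps = int(math.ceil(num_epochs * n / batch_size))
orders = [1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 4.0, 8.0, 16.0, 32.0, 64.0]

# Compose `steps` rounds of the Poisson-subsampled Gaussian mechanism and
# convert the resulting RDP bound to (epsilon, delta)-DP.
accountant = dp_accounting.rdp.RdpAccountant(orders)
accountant.compose(
    dp_accounting.SelfComposedDpEvent(
        dp_accounting.PoissonSampledDpEvent(
            sampling_probability, dp_accounting.GaussianDpEvent(noise_multiplier)
        ),
        steps,
    )
)
epsilon = accountant.get_epsilon(delta)
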

@@ -25,6 +25,15 @@ _example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
_user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy


DP_SGD_STATEMENT_KWARGS = dict(
    number_of_examples=10000,
    batch_size=64,
    num_epochs=5.0,
    noise_multiplier=2.0,
    delta=1e-6,
)


class ComputeDpSgdPrivacyTest(parameterized.TestCase):

  @parameterized.named_parameters(

@@ -145,6 +154,98 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
    )
    self.assertAlmostEqual(user_eps, example_eps * k)

  def test_dp_sgd_privacy_statement_no_user_dp(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
examples per user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

No user-level privacy guarantee is possible without a bound on the number of
examples per user.

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
    self.assertEqual(statement, expected_statement)

  def test_dp_sgd_privacy_statement_user_dp(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
        max_examples_per_user=3,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: 113.899
Epsilon assuming Poisson sampling (*): 8.129

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
"""
    self.assertEqual(statement, expected_statement)

  def test_dp_sgd_privacy_statement_user_dp_infinite(self):
    statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
        **DP_SGD_STATEMENT_KWARGS,
        max_examples_per_user=9,
    )
    expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per
user.

This privacy guarantee protects the release of all model checkpoints in addition
to the final model.

Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616

User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: inf (**)
Epsilon assuming Poisson sampling (*): inf (**)

(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.

(**) A finite example-level epsilon implies a finite user-level epsilon at any
`max_examples_per_user`, but because conversion from example-level to user-level
DP is not exact, it is possible for the upper bound on the user-level epsilon to
still be infinite.
"""
    self.assertEqual(statement, expected_statement)


if __name__ == '__main__':
  absltest.main()