Use better group privacy bound in computing user level privacy [TF Privacy]

PiperOrigin-RevId: 526852999
A. Unique TensorFlower 2023-04-24 22:16:51 -07:00
parent 60cb0dd2fb
commit 33bbc87ff2
2 changed files with 65 additions and 32 deletions


@@ -28,6 +28,11 @@ class UserLevelDPComputationError(Exception):
"""Error raised if user-level epsilon computation fails."""
def _logexpm1(x: float) -> float:
"""Returns log(exp(x) - 1)."""
return x + math.log(-math.expm1(-x))
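The stable form factors out exp(x): log(exp(x) - 1) = x + log(1 - exp(-x)), so nothing overflows for large x and precision is kept for small x. A minimal standalone check of the same formula (illustrative values only):

import math

def logexpm1(x: float) -> float:
  """Numerically stable log(exp(x) - 1) for x > 0."""
  # exp(-x) lies in (0, 1), so expm1(-x) cannot overflow, and
  # log(exp(x) - 1) = x + log(1 - exp(-x)).
  return x + math.log(-math.expm1(-x))

print(logexpm1(1000.0))  # 1000.0; naive math.log(math.exp(1000.0) - 1) overflows
print(logexpm1(1e-9))    # about -20.72, matching log(x) as x -> 0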
def _compute_dp_sgd_user_privacy(
num_epochs: float,
noise_multiplier: float,
@@ -100,21 +105,32 @@ def _compute_dp_sgd_user_privacy(
# log(G(F(example_delta), example_delta)) - log(user_delta) = 0
# Then we can return user_eps = H(F(example_delta)).
log_k = math.log(max_examples_per_user)
target_user_log_delta = math.log(user_delta)
def user_log_delta_gap(example_log_delta):
example_eps = _compute_dp_sgd_example_privacy(
num_epochs,
noise_multiplier,
math.exp(example_log_delta),
used_microbatching,
poisson_subsampling_probability,
)
# We cache the example_eps computed for each example_delta in the dict
# below, so that we don't have to recompute values for the same delta.
epsilon_cache = dict()
# Estimate user_eps, user_log_delta using Vadhan Lemma 2.2.
def user_log_delta_gap(example_log_delta):
if example_log_delta not in epsilon_cache:
epsilon_cache[example_log_delta] = _compute_dp_sgd_example_privacy(
num_epochs,
noise_multiplier,
math.exp(example_log_delta),
used_microbatching,
poisson_subsampling_probability,
)
example_eps = epsilon_cache[example_log_delta]
# Estimate user_eps, user_log_delta using Vadhan Lemma 2.2, using a tighter
# bound seen in the penultimate line of the proof, given as
# user_delta = (example_delta * (exp(k * example_eps) - 1)
# / (exp(example_eps) - 1))
user_eps = max_examples_per_user * example_eps
user_log_delta = log_k + user_eps + example_log_delta
user_log_delta = (
example_log_delta + _logexpm1(user_eps) - _logexpm1(example_eps)
)
return user_log_delta - target_user_log_delta
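For a sense of the gain, here is a small standalone sketch (the numbers are assumed for illustration, not taken from this change) comparing the classic Lemma 2.2 bound with the tighter expm1 form used in user_log_delta_gap above:

import math

k = 10                # plays the role of max_examples_per_user
example_eps = 0.1     # assumed example-level epsilon
example_delta = 1e-8  # assumed example-level delta

# Classic statement of Vadhan (2017) Lemma 2.2:
#   user_delta <= k * exp(k * example_eps) * example_delta
loose = k * math.exp(k * example_eps) * example_delta

# Tighter bound from the penultimate line of the proof:
#   user_delta <= example_delta * expm1(k * example_eps) / expm1(example_eps)
tight = example_delta * math.expm1(k * example_eps) / math.expm1(example_eps)

print(f"{loose:.3e}")  # ~2.718e-07
print(f"{tight:.3e}")  # ~1.634e-07

Because the tighter bound maps a given example_delta to a smaller user_delta, the search can settle on a larger example_delta, hence a smaller example_eps and a smaller reported user_eps, for the same user-level target.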
# We need bounds on the example-level delta. The supplied user-level delta
@@ -164,9 +180,16 @@ def _compute_dp_sgd_user_privacy(
)
# Vadhan (2017) "The complexity of differential privacy" Lemma 2.2.
# user_delta = k * exp(k * example_eps) * example_delta
# Given example_delta, we can solve for (k * example_eps) = user_eps.
return max(0, target_user_log_delta - log_k - example_log_delta)
if example_log_delta not in epsilon_cache:
epsilon_cache[example_log_delta] = _compute_dp_sgd_example_privacy(
num_epochs,
noise_multiplier,
math.exp(example_log_delta),
used_microbatching,
poisson_subsampling_probability,
)
example_eps = epsilon_cache[example_log_delta]
return max_examples_per_user * example_eps
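Putting the pieces together, the search that this return value supports can be sketched standalone. Here toy_example_eps is a hypothetical stand-in for _compute_dp_sgd_example_privacy, and the bracketing interval is assumed:

import math

k = 10                    # plays the role of max_examples_per_user
target_user_delta = 1e-6  # assumed user-level delta

def logexpm1(x: float) -> float:
  return x + math.log(-math.expm1(-x))

def toy_example_eps(example_delta: float) -> float:
  # Hypothetical accountant: epsilon grows as delta shrinks, loosely
  # shaped like a Gaussian mechanism curve.
  return 0.02 * math.sqrt(2.0 * math.log(1.25 / example_delta))

def gap(example_log_delta: float) -> float:
  eps = toy_example_eps(math.exp(example_log_delta))
  user_log_delta = example_log_delta + logexpm1(k * eps) - logexpm1(eps)
  return user_log_delta - math.log(target_user_delta)

# Bisect for the example_log_delta whose implied user-level delta hits the
# target; gap is increasing, negative at lo and positive at hi.
lo, hi = math.log(1e-12), math.log(1e-6)
for _ in range(60):
  mid = (lo + hi) / 2.0
  lo, hi = (mid, hi) if gap(mid) < 0.0 else (lo, mid)

# The user-level epsilon is then k times the example-level epsilon found
# at the root, as in the return statement above.
print(k * toy_example_eps(math.exp(hi)))  # ~1.16 for these toy inputs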
def _compute_dp_sgd_example_privacy(
@@ -354,7 +377,7 @@ using RDP accounting and group privacy:""",
paragraphs.append(
textwrap.fill(
"""\
No user-level privacy guarantee is possible witout a bound on the number of \
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
width=80,
)


@@ -134,25 +134,35 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
self.assertEqual(user_eps, example_eps)
@parameterized.parameters((0.9, 2), (1.1, 3), (2.3, 13))
def test_user_privacy_epsilon_delta_consistency(self, z, k):
def test_user_privacy_epsilon_delta_consistency(
self, noise_multiplier, max_examples_per_user
):
"""Tests example/user epsilons consistent with Vadhan (2017) Lemma 2.2."""
num_epochs = 5
user_delta = 1e-6
example_delta = 1e-8
q = 2e-4
user_eps = _user_privacy(
num_epochs,
noise_multiplier=z,
user_delta=user_delta,
max_examples_per_user=k,
poisson_subsampling_probability=q,
)
example_eps = _example_privacy(
num_epochs,
noise_multiplier=z,
example_delta=user_delta / (k * math.exp(user_eps)),
noise_multiplier=noise_multiplier,
example_delta=example_delta,
poisson_subsampling_probability=q,
)
self.assertAlmostEqual(user_eps, example_eps * k)
user_delta = math.exp(
math.log(example_delta)
+ compute_dp_sgd_privacy_lib._logexpm1(
max_examples_per_user * example_eps
)
- compute_dp_sgd_privacy_lib._logexpm1(example_eps)
)
user_eps = _user_privacy(
num_epochs,
noise_multiplier=noise_multiplier,
user_delta=user_delta,
max_examples_per_user=max_examples_per_user,
poisson_subsampling_probability=q,
)
self.assertAlmostEqual(user_eps, example_eps * max_examples_per_user)
def test_dp_sgd_privacy_statement_no_user_dp(self):
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
@@ -171,7 +181,7 @@ RDP accounting:
Epsilon with each example occurring once per epoch: 13.376
Epsilon assuming Poisson sampling (*): 1.616
No user-level privacy guarantee is possible witout a bound on the number of
No user-level privacy guarantee is possible without a bound on the number of
examples per user.
(*) Poisson sampling is not usually done in training pipelines, but assuming
@@ -201,8 +211,8 @@ RDP accounting:
User-level DP with add-or-remove-one adjacency at delta = 1e-06 computed using
RDP accounting and group privacy:
Epsilon with each example occurring once per epoch: 113.899
Epsilon assuming Poisson sampling (*): 8.129
Epsilon with each example occurring once per epoch: 85.940
Epsilon assuming Poisson sampling (*): 6.425
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
@@ -214,11 +224,11 @@ order.
def test_dp_sgd_privacy_statement_user_dp_infinite(self):
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
**DP_SGD_STATEMENT_KWARGS,
max_examples_per_user=9,
max_examples_per_user=10,
)
expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 9 examples per
multiplier 2.0 for 5.0 epochs with microbatching, and at most 10 examples per
user.
This privacy guarantee protects the release of all model checkpoints in addition