Add support for PLD Accountant in computing DP-SGD privacy statement [TF Privacy]

PiperOrigin-RevId: 587854134
This commit is contained in:
Pritish Kamath 2023-12-04 15:08:16 -08:00 committed by A. Unique TensorFlower
parent f51b637dda
commit 93376c9d6a
3 changed files with 170 additions and 43 deletions

View file

@ -18,20 +18,20 @@ The script applies the RDP accountant to estimate privacy budget of an iterated
Sampled Gaussian Mechanism. The mechanism's parameters are controlled by flags.
Example:
compute_dp_sgd_privacy
compute_dp_sgd_privacy \
--N=60000 \
--batch_size=256 \
--noise_multiplier=1.12 \
--epochs=60 \
--delta=1e-5
--delta=1e-5 \
--accountant_type=RDP
The output states that DP-SGD with these parameters satisfies (2.92, 1e-5)-DP.
Prints out the privacy statement corresponding to the above parameters.
"""
from absl import app
from absl import flags
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy_statement
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
_NUM_EXAMPLES = flags.DEFINE_integer(
@ -70,6 +70,9 @@ _MAX_EXAMPLES_PER_USER = flags.DEFINE_integer(
'user-level DP guarantee.'
),
)
_ACCOUNTANT_TYPE = flags.DEFINE_enum(
'accountant_type', 'RDP', ['RDP', 'PLD'], 'DP accountant to use.'
)
flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
@ -77,7 +80,7 @@ flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
def main(argv):
del argv # argv is not used.
statement = compute_dp_sgd_privacy_statement(
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
_NUM_EXAMPLES.value,
_BATCH_SIZE.value,
_NUM_EPOCHS.value,
@ -85,6 +88,7 @@ def main(argv):
_DELTA.value,
_USED_MICROBATCHING.value,
_MAX_EXAMPLES_PER_USER.value,
compute_dp_sgd_privacy_lib.AccountantType(_ACCOUNTANT_TYPE.value),
)
print(statement)

View file

@ -14,6 +14,7 @@
# ==============================================================================
"""Library for computing privacy values for DP-SGD."""
import enum
import functools
import math
import textwrap
@ -34,6 +35,20 @@ def _logexpm1(x: float) -> float:
return x + math.log(-math.expm1(-x))
class AccountantType(enum.Enum):
  """Selects which privacy accountant is used for privacy accounting."""

  RDP = 'RDP'
  PLD = 'PLD'

  def get_accountant(self) -> dp_accounting.PrivacyAccountant:
    """Creates a new accountant object matching this enum member.

    Returns:
      A newly constructed `dp_accounting.rdp.RdpAccountant` for `RDP`, or a
      newly constructed `dp_accounting.pld.PLDAccountant` for `PLD`.

    Raises:
      ValueError: If this member has no accountant associated with it.
    """
    # Constructors are wrapped in lambdas so that only the accountant that was
    # actually selected gets looked up and instantiated.
    builders = {
        AccountantType.RDP: lambda: dp_accounting.rdp.RdpAccountant(),
        AccountantType.PLD: lambda: dp_accounting.pld.PLDAccountant(),
    }
    build = builders.get(self)
    if build is None:
      raise ValueError(f'Unsupported Accountant type {self.value}')
    return build()
def _compute_dp_sgd_user_privacy(
num_epochs: float,
noise_multiplier: float,
@ -41,6 +56,7 @@ def _compute_dp_sgd_user_privacy(
max_examples_per_user: int,
used_microbatching: bool = True,
poisson_subsampling_probability: Optional[float] = None,
accountant_type: AccountantType = AccountantType.RDP,
) -> float:
"""Computes add-or-remove-one-user DP epsilon using group privacy.
@ -63,6 +79,10 @@ def _compute_dp_sgd_user_privacy(
used_microbatching: If true, increases sensitivity by a factor of two.
poisson_subsampling_probability: If not None, gives the probability that
each record is chosen in a batch. If None, assumes no subsampling.
accountant_type: The privacy accountant for computing epsilon. While this
method supports both PLD and RDP accountants, the behavior for PLD
accountant can sometimes be overly pessimistic. This remains to be
investigated and fixed (b/271341062).
Returns:
The add-or-remove-one-user DP epsilon value using group privacy.
@ -92,6 +112,7 @@ def _compute_dp_sgd_user_privacy(
user_delta,
used_microbatching,
poisson_subsampling_probability,
accountant_type,
)
# The computation below to estimate user_eps works as follows.
@ -188,6 +209,7 @@ def _compute_dp_sgd_example_privacy(
example_delta: float,
used_microbatching: bool = True,
poisson_subsampling_probability: Optional[float] = None,
accountant_type: AccountantType = AccountantType.RDP,
) -> float:
"""Computes add-or-remove-one-example DP epsilon.
@ -201,6 +223,7 @@ def _compute_dp_sgd_example_privacy(
used_microbatching: If true, increases sensitivity by a factor of two.
poisson_subsampling_probability: If not None, gives the probability that
each record is chosen in a batch. If None, assumes no subsampling.
accountant_type: The privacy accountant for computing epsilon.
Returns:
The epsilon value.
@ -229,10 +252,10 @@ def _compute_dp_sgd_example_privacy(
event_ = dp_accounting.SelfComposedDpEvent(count=count, event=event_)
return (
dp_accounting.rdp.RdpAccountant()
accountant_type.get_accountant()
.compose(event_)
.get_epsilon(example_delta)
) # TODO(b/271341062)
)
def compute_dp_sgd_privacy_statement(
@ -243,6 +266,7 @@ def compute_dp_sgd_privacy_statement(
delta: float,
used_microbatching: bool = True,
max_examples_per_user: Optional[int] = None,
accountant_type: AccountantType = AccountantType.RDP,
) -> str:
"""Produces a privacy report summarizing the DP guarantee.
@ -267,6 +291,11 @@ def compute_dp_sgd_privacy_statement(
max_examples_per_user: If the data set is constructed to cap the maximum
number of examples each user contributes, provide this argument to also
print a user-level DP guarantee.
accountant_type: The privacy accountant for computing epsilon. Since the
current approach for computing user-level privacy when using PLD
accountant can sometimes be overly pessimistic, this method does not
provide user-level privacy guarantee for PLD accountant_type. This remains
to be investigated and fixed (b/271341062).
Returns:
A str precisely articulating the privacy guarantee.
@ -296,12 +325,16 @@ addition to the final model.""",
paragraph = textwrap.fill(
f"""\
Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
with RDP accounting:""",
with {accountant_type.value} accounting:""",
width=80,
)
example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
num_epochs, noise_multiplier, delta, used_microbatching
num_epochs,
noise_multiplier,
delta,
used_microbatching,
accountant_type=accountant_type,
)
example_eps_subsampling = _compute_dp_sgd_example_privacy(
num_epochs,
@ -309,6 +342,7 @@ with RDP accounting:""",
delta,
used_microbatching,
poisson_subsampling_probability=batch_size / number_of_examples,
accountant_type=accountant_type,
)
paragraph += f"""
@ -320,13 +354,33 @@ with RDP accounting:""",
paragraphs.append(paragraph)
inf_user_eps = False
if max_examples_per_user is not None:
if max_examples_per_user is None:
paragraphs.append(
textwrap.fill(
"""\
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
width=80,
)
)
elif accountant_type == AccountantType.PLD:
# TODO(b/271341062): Add User level DP support for PLD.
paragraphs.append(
textwrap.fill(
"""\
User-level DP epsilon computation is not supported for PLD accounting at this \
time. Use RDP accounting to obtain user-level DP guarantees.""",
width=80,
)
)
else: # Case: max_examples_per_user is not None and accountant_type is RDP
user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
num_epochs,
noise_multiplier,
delta,
max_examples_per_user,
used_microbatching,
accountant_type=accountant_type,
)
user_eps_subsampling = _compute_dp_sgd_user_privacy(
num_epochs,
@ -335,6 +389,7 @@ with RDP accounting:""",
max_examples_per_user,
used_microbatching,
poisson_subsampling_probability=batch_size / number_of_examples,
accountant_type=accountant_type,
)
if math.isinf(user_eps_no_subsampling):
user_eps_no_subsampling_str = ' inf (**)'
@ -350,7 +405,7 @@ with RDP accounting:""",
paragraph = textwrap.fill(
f"""\
User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
using RDP accounting and group privacy:""",
using {accountant_type.value} accounting and group privacy:""",
width=80,
)
paragraph += f"""
@ -360,23 +415,14 @@ using RDP accounting and group privacy:""",
{user_eps_subsampling_str}"""
paragraphs.append(paragraph)
else:
paragraphs.append(
textwrap.fill(
"""\
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
width=80,
)
)
paragraphs.append(
textwrap.fill(
"""\
(*) Poisson sampling is not usually done in training pipelines, but assuming \
that the data was randomly shuffled, it is believed the actual epsilon should \
be closer to this value than the conservative assumption of an arbitrary data \
order.""",
that the data was randomly shuffled, it is believed that the actual epsilon \
should be closer to this value than the conservative assumption of an \
arbitrary data order.""",
width=80,
)
)

View file

@ -23,6 +23,8 @@ from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
_example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
_user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
_RDP = compute_dp_sgd_privacy_lib.AccountantType.RDP
_PLD = compute_dp_sgd_privacy_lib.AccountantType.PLD
DP_SGD_STATEMENT_KWARGS = dict(
@ -81,13 +83,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
_example_privacy(**args)
@parameterized.named_parameters(
('no_microbatching_no_subsampling', False, None, 10.8602036),
('microbatching_no_subsampling', True, None, 26.2880374),
('no_microbatching_with_subsampling', False, 1e-2, 3.2391922),
('microbatching_with_subsampling', True, 1e-2, 22.5970358),
('no_microbatching_no_subsampling_rdp', False, None, _RDP, 10.8602036),
('microbatching_no_subsampling_rdp', True, None, _RDP, 26.2880374),
('no_microbatching_with_subsampling_rdp', False, 1e-2, _RDP, 3.2391922),
('microbatching_with_subsampling_rdp', True, 1e-2, _RDP, 22.5970358),
('no_microbatching_no_subsampling_pld', False, None, _PLD, 10.1224946),
('microbatching_no_subsampling_pld', True, None, _PLD, 24.7160779),
('no_microbatching_with_subsampling_pld', False, 1e-2, _PLD, 2.4612381),
('microbatching_with_subsampling_pld', True, 1e-2, _PLD, 18.6977407),
)
def test_compute_dp_sgd_example_privacy(
self, used_microbatching, poisson_subsampling_probability, expected_eps
self,
used_microbatching,
poisson_subsampling_probability,
accountant_type,
expected_eps,
):
num_epochs = 1.2
noise_multiplier = 0.7
@ -98,6 +108,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
example_delta,
used_microbatching,
poisson_subsampling_probability,
accountant_type,
)
self.assertAlmostEqual(eps, expected_eps)
@ -119,17 +130,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
with self.assertRaises(ValueError):
_user_privacy(**args)
def test_user_privacy_one_example_per_user(self):
@parameterized.named_parameters(('RDP', _RDP), ('PLD', _PLD))
def test_user_privacy_one_example_per_user(self, accountant_type):
num_epochs = 1.2
noise_multiplier = 0.7
delta = 1e-5
example_eps = _example_privacy(num_epochs, noise_multiplier, delta)
example_eps = _example_privacy(
num_epochs, noise_multiplier, delta, accountant_type=accountant_type
)
user_eps = _user_privacy(
num_epochs,
noise_multiplier,
delta,
max_examples_per_user=1,
accountant_type=accountant_type,
)
self.assertEqual(user_eps, example_eps)
@ -146,6 +161,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
noise_multiplier=noise_multiplier,
example_delta=example_delta,
poisson_subsampling_probability=q,
accountant_type=_RDP,
)
user_delta = math.exp(
@ -161,12 +177,14 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
user_delta=user_delta,
max_examples_per_user=max_examples_per_user,
poisson_subsampling_probability=q,
accountant_type=_RDP,
)
self.assertAlmostEqual(user_eps, example_eps * max_examples_per_user)
def test_dp_sgd_privacy_statement_no_user_dp(self):
def test_dp_sgd_privacy_statement_no_user_dp_with_rdp(self):
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
**DP_SGD_STATEMENT_KWARGS,
accountant_type=_RDP,
)
expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@ -185,16 +203,17 @@ No user-level privacy guarantee is possible without a bound on the number of
examples per user.
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
that the data was randomly shuffled, it is believed that the actual epsilon
should be closer to this value than the conservative assumption of an arbitrary
data order.
"""
self.assertEqual(statement, expected_statement)
def test_dp_sgd_privacy_statement_user_dp(self):
def test_dp_sgd_privacy_statement_user_dp_with_rdp(self):
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
**DP_SGD_STATEMENT_KWARGS,
max_examples_per_user=3,
accountant_type=_RDP,
)
expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@ -215,16 +234,17 @@ RDP accounting and group privacy:
Epsilon assuming Poisson sampling (*): 6.425
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
that the data was randomly shuffled, it is believed that the actual epsilon
should be closer to this value than the conservative assumption of an arbitrary
data order.
"""
self.assertEqual(statement, expected_statement)
def test_dp_sgd_privacy_statement_user_dp_infinite(self):
def test_dp_sgd_privacy_statement_user_dp_infinite_with_rdp(self):
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
**DP_SGD_STATEMENT_KWARGS,
max_examples_per_user=10,
accountant_type=_RDP,
)
expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
@ -245,14 +265,71 @@ RDP accounting and group privacy:
Epsilon assuming Poisson sampling (*): inf (**)
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed the actual epsilon should be
closer to this value than the conservative assumption of an arbitrary data
order.
that the data was randomly shuffled, it is believed that the actual epsilon
should be closer to this value than the conservative assumption of an arbitrary
data order.
(**) A finite example-level epsilon implies a finite user-level epsilon at any
`max_examples_per_user`, but because conversion from example-level to user-level
DP is not exact, it is possible for the upper bound on the user-level epsilon to
still be infinite.
"""
self.assertEqual(statement, expected_statement)
def test_dp_sgd_privacy_statement_no_user_dp_with_pld(self):
  """Checks the exact statement text produced when PLD accounting is selected.

  The call passes only the shared `DP_SGD_STATEMENT_KWARGS` plus
  `accountant_type=_PLD`; no `max_examples_per_user` is given here, and the
  expected text accordingly contains the "No user-level privacy guarantee"
  paragraph.
  """
  statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
      **DP_SGD_STATEMENT_KWARGS,
      accountant_type=_PLD,
  )
  # The comparison below is an exact string match, so every character of the
  # literal matters.
  # NOTE(review): the literal is reproduced exactly as it appears in this view;
  # blank lines or leading spaces inside it may have been lost in rendering --
  # confirm against the checked-in file.
  expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
examples per user.
This privacy guarantee protects the release of all model checkpoints in addition
to the final model.
Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
PLD accounting:
Epsilon with each example occurring once per epoch: 12.595
Epsilon assuming Poisson sampling (*): 1.199
No user-level privacy guarantee is possible without a bound on the number of
examples per user.
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed that the actual epsilon
should be closer to this value than the conservative assumption of an arbitrary
data order.
"""
  self.assertEqual(statement, expected_statement)
def test_dp_sgd_privacy_statement_user_dp_with_pld(self):
  """Checks the statement text for PLD accounting with a per-user example cap.

  The call passes `max_examples_per_user=3` together with
  `accountant_type=_PLD`. The expected text contains no user-level epsilon
  values; instead it carries the "not supported for PLD accounting" paragraph.
  """
  statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
      **DP_SGD_STATEMENT_KWARGS,
      max_examples_per_user=3,
      accountant_type=_PLD,
  )
  # The comparison below is an exact string match, so every character of the
  # literal matters.
  # NOTE(review): the literal is reproduced exactly as it appears in this view;
  # blank lines or leading spaces inside it may have been lost in rendering --
  # confirm against the checked-in file.
  expected_statement = """\
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
user.
This privacy guarantee protects the release of all model checkpoints in addition
to the final model.
Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
PLD accounting:
Epsilon with each example occurring once per epoch: 12.595
Epsilon assuming Poisson sampling (*): 1.199
User-level DP epsilon computation is not supported for PLD accounting at this
time. Use RDP accounting to obtain user-level DP guarantees.
(*) Poisson sampling is not usually done in training pipelines, but assuming
that the data was randomly shuffled, it is believed that the actual epsilon
should be closer to this value than the conservative assumption of an arbitrary
data order.
"""
  self.assertEqual(statement, expected_statement)