Add support for PLD Accountant in computing DP-SGD privacy statement [TF Privacy]
PiperOrigin-RevId: 587854134
This commit is contained in:
parent
f51b637dda
commit
93376c9d6a
3 changed files with 170 additions and 43 deletions
|
@ -18,20 +18,20 @@ The script applies the RDP accountant to estimate privacy budget of an iterated
|
|||
Sampled Gaussian Mechanism. The mechanism's parameters are controlled by flags.
|
||||
|
||||
Example:
|
||||
compute_dp_sgd_privacy
|
||||
compute_dp_sgd_privacy \
|
||||
--N=60000 \
|
||||
--batch_size=256 \
|
||||
--noise_multiplier=1.12 \
|
||||
--epochs=60 \
|
||||
--delta=1e-5
|
||||
--delta=1e-5 \
|
||||
--accountant_type=RDP
|
||||
|
||||
The output states that DP-SGD with these parameters satisfies (2.92, 1e-5)-DP.
|
||||
Prints out the privacy statement corresponding to the above parameters.
|
||||
"""
|
||||
|
||||
from absl import app
|
||||
from absl import flags
|
||||
|
||||
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy_statement
|
||||
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
|
||||
|
||||
|
||||
_NUM_EXAMPLES = flags.DEFINE_integer(
|
||||
|
@ -70,6 +70,9 @@ _MAX_EXAMPLES_PER_USER = flags.DEFINE_integer(
|
|||
'user-level DP guarantee.'
|
||||
),
|
||||
)
|
||||
# Selects which privacy accountant produces the reported epsilon. The allowed
# strings are the same literals the library's AccountantType enum uses as its
# member values, so the flag value can be passed straight to that enum.
_ACCOUNTANT_TYPE = flags.DEFINE_enum(
    name='accountant_type',
    default='RDP',
    enum_values=['RDP', 'PLD'],
    help='DP accountant to use.',
)

# These four flags must be supplied on the command line.
flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
|
||||
|
||||
|
@ -77,7 +80,7 @@ flags.mark_flags_as_required(['N', 'batch_size', 'noise_multiplier', 'epochs'])
|
|||
def main(argv):
|
||||
del argv # argv is not used.
|
||||
|
||||
statement = compute_dp_sgd_privacy_statement(
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
_NUM_EXAMPLES.value,
|
||||
_BATCH_SIZE.value,
|
||||
_NUM_EPOCHS.value,
|
||||
|
@ -85,6 +88,7 @@ def main(argv):
|
|||
_DELTA.value,
|
||||
_USED_MICROBATCHING.value,
|
||||
_MAX_EXAMPLES_PER_USER.value,
|
||||
compute_dp_sgd_privacy_lib.AccountantType(_ACCOUNTANT_TYPE.value),
|
||||
)
|
||||
print(statement)
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
# ==============================================================================
|
||||
"""Library for computing privacy values for DP-SGD."""
|
||||
|
||||
import enum
|
||||
import functools
|
||||
import math
|
||||
import textwrap
|
||||
|
@ -34,6 +35,20 @@ def _logexpm1(x: float) -> float:
|
|||
return x + math.log(-math.expm1(-x))
|
||||
|
||||
|
||||
class AccountantType(enum.Enum):
  """Selects the privacy accountant used to compute epsilon.

  Each member's value is its own name as a string, so a member can be looked
  up from the matching string with `AccountantType('RDP')` or
  `AccountantType('PLD')`.
  """

  RDP = 'RDP'
  PLD = 'PLD'

  def get_accountant(self) -> dp_accounting.PrivacyAccountant:
    """Builds a new accountant instance matching this enum member.

    Returns:
      A newly constructed `dp_accounting.rdp.RdpAccountant` for `RDP`, or a
      newly constructed `dp_accounting.pld.PLDAccountant` for `PLD`.

    Raises:
      ValueError: If the member is neither `RDP` nor `PLD`.
    """
    # Enum members are singletons, so identity comparison is equivalent to
    # equality here.
    if self is AccountantType.RDP:
      accountant = dp_accounting.rdp.RdpAccountant()
    elif self is AccountantType.PLD:
      accountant = dp_accounting.pld.PLDAccountant()
    else:
      # Defensive guard for members added later without a matching branch.
      raise ValueError(f'Unsupported Accountant type {self.value}')
    return accountant
|
||||
|
||||
|
||||
def _compute_dp_sgd_user_privacy(
|
||||
num_epochs: float,
|
||||
noise_multiplier: float,
|
||||
|
@ -41,6 +56,7 @@ def _compute_dp_sgd_user_privacy(
|
|||
max_examples_per_user: int,
|
||||
used_microbatching: bool = True,
|
||||
poisson_subsampling_probability: Optional[float] = None,
|
||||
accountant_type: AccountantType = AccountantType.RDP,
|
||||
) -> float:
|
||||
"""Computes add-or-remove-one-user DP epsilon using group privacy.
|
||||
|
||||
|
@ -63,6 +79,10 @@ def _compute_dp_sgd_user_privacy(
|
|||
used_microbatching: If true, increases sensitivity by a factor of two.
|
||||
poisson_subsampling_probability: If not None, gives the probability that
|
||||
each record is chosen in a batch. If None, assumes no subsampling.
|
||||
accountant_type: The privacy accountant for computing epsilon. While this
|
||||
method supports both PLD and RDP accountants, the behavior for PLD
|
||||
accountant can sometimes be overly pessimistic. This remains to be
|
||||
investigated and fixed (b/271341062).
|
||||
|
||||
Returns:
|
||||
The add-or-remove-one-user DP epsilon value using group privacy.
|
||||
|
@ -92,6 +112,7 @@ def _compute_dp_sgd_user_privacy(
|
|||
user_delta,
|
||||
used_microbatching,
|
||||
poisson_subsampling_probability,
|
||||
accountant_type,
|
||||
)
|
||||
|
||||
# The computation below to estimate user_eps works as follows.
|
||||
|
@ -188,6 +209,7 @@ def _compute_dp_sgd_example_privacy(
|
|||
example_delta: float,
|
||||
used_microbatching: bool = True,
|
||||
poisson_subsampling_probability: Optional[float] = None,
|
||||
accountant_type: AccountantType = AccountantType.RDP,
|
||||
) -> float:
|
||||
"""Computes add-or-remove-one-example DP epsilon.
|
||||
|
||||
|
@ -201,6 +223,7 @@ def _compute_dp_sgd_example_privacy(
|
|||
used_microbatching: If true, increases sensitivity by a factor of two.
|
||||
poisson_subsampling_probability: If not None, gives the probability that
|
||||
each record is chosen in a batch. If None, assumes no subsampling.
|
||||
accountant_type: The privacy accountant for computing epsilon.
|
||||
|
||||
Returns:
|
||||
The epsilon value.
|
||||
|
@ -229,10 +252,10 @@ def _compute_dp_sgd_example_privacy(
|
|||
event_ = dp_accounting.SelfComposedDpEvent(count=count, event=event_)
|
||||
|
||||
return (
|
||||
dp_accounting.rdp.RdpAccountant()
|
||||
accountant_type.get_accountant()
|
||||
.compose(event_)
|
||||
.get_epsilon(example_delta)
|
||||
) # TODO(b/271341062)
|
||||
)
|
||||
|
||||
|
||||
def compute_dp_sgd_privacy_statement(
|
||||
|
@ -243,6 +266,7 @@ def compute_dp_sgd_privacy_statement(
|
|||
delta: float,
|
||||
used_microbatching: bool = True,
|
||||
max_examples_per_user: Optional[int] = None,
|
||||
accountant_type: AccountantType = AccountantType.RDP,
|
||||
) -> str:
|
||||
"""Produces a privacy report summarizing the DP guarantee.
|
||||
|
||||
|
@ -267,6 +291,11 @@ def compute_dp_sgd_privacy_statement(
|
|||
max_examples_per_user: If the data set is constructed to cap the maximum
|
||||
number of examples each user contributes, provide this argument to also
|
||||
print a user-level DP guarantee.
|
||||
accountant_type: The privacy accountant for computing epsilon. Since the
|
||||
current approach for computing user-level privacy when using PLD
|
||||
accountant can sometimes be overly pessimistic, this method does not
|
||||
provide user-level privacy guarantee for PLD accountant_type. This remains
|
||||
to be investigated and fixed (b/271341062).
|
||||
|
||||
Returns:
|
||||
A str precisely articulating the privacy guarantee.
|
||||
|
@ -296,12 +325,16 @@ addition to the final model.""",
|
|||
paragraph = textwrap.fill(
|
||||
f"""\
|
||||
Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
|
||||
with RDP accounting:""",
|
||||
with {accountant_type.value} accounting:""",
|
||||
width=80,
|
||||
)
|
||||
|
||||
example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
|
||||
num_epochs, noise_multiplier, delta, used_microbatching
|
||||
num_epochs,
|
||||
noise_multiplier,
|
||||
delta,
|
||||
used_microbatching,
|
||||
accountant_type=accountant_type,
|
||||
)
|
||||
example_eps_subsampling = _compute_dp_sgd_example_privacy(
|
||||
num_epochs,
|
||||
|
@ -309,6 +342,7 @@ with RDP accounting:""",
|
|||
delta,
|
||||
used_microbatching,
|
||||
poisson_subsampling_probability=batch_size / number_of_examples,
|
||||
accountant_type=accountant_type,
|
||||
)
|
||||
|
||||
paragraph += f"""
|
||||
|
@ -320,13 +354,33 @@ with RDP accounting:""",
|
|||
paragraphs.append(paragraph)
|
||||
|
||||
inf_user_eps = False
|
||||
if max_examples_per_user is not None:
|
||||
if max_examples_per_user is None:
|
||||
paragraphs.append(
|
||||
textwrap.fill(
|
||||
"""\
|
||||
No user-level privacy guarantee is possible without a bound on the number of \
|
||||
examples per user.""",
|
||||
width=80,
|
||||
)
|
||||
)
|
||||
elif accountant_type == AccountantType.PLD:
|
||||
# TODO(b/271341062): Add User level DP support for PLD.
|
||||
paragraphs.append(
|
||||
textwrap.fill(
|
||||
"""\
|
||||
User-level DP epsilon computation is not supported for PLD accounting at this \
|
||||
time. Use RDP accounting to obtain user-level DP guarantees.""",
|
||||
width=80,
|
||||
)
|
||||
)
|
||||
else: # Case: max_examples_per_user is not None and accountant_type is RDP
|
||||
user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
|
||||
num_epochs,
|
||||
noise_multiplier,
|
||||
delta,
|
||||
max_examples_per_user,
|
||||
used_microbatching,
|
||||
accountant_type=accountant_type,
|
||||
)
|
||||
user_eps_subsampling = _compute_dp_sgd_user_privacy(
|
||||
num_epochs,
|
||||
|
@ -335,6 +389,7 @@ with RDP accounting:""",
|
|||
max_examples_per_user,
|
||||
used_microbatching,
|
||||
poisson_subsampling_probability=batch_size / number_of_examples,
|
||||
accountant_type=accountant_type,
|
||||
)
|
||||
if math.isinf(user_eps_no_subsampling):
|
||||
user_eps_no_subsampling_str = ' inf (**)'
|
||||
|
@ -350,7 +405,7 @@ with RDP accounting:""",
|
|||
paragraph = textwrap.fill(
|
||||
f"""\
|
||||
User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
|
||||
using RDP accounting and group privacy:""",
|
||||
using {accountant_type.value} accounting and group privacy:""",
|
||||
width=80,
|
||||
)
|
||||
paragraph += f"""
|
||||
|
@ -360,23 +415,14 @@ using RDP accounting and group privacy:""",
|
|||
{user_eps_subsampling_str}"""
|
||||
|
||||
paragraphs.append(paragraph)
|
||||
else:
|
||||
paragraphs.append(
|
||||
textwrap.fill(
|
||||
"""\
|
||||
No user-level privacy guarantee is possible without a bound on the number of \
|
||||
examples per user.""",
|
||||
width=80,
|
||||
)
|
||||
)
|
||||
|
||||
paragraphs.append(
|
||||
textwrap.fill(
|
||||
"""\
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming \
|
||||
that the data was randomly shuffled, it is believed the actual epsilon should \
|
||||
be closer to this value than the conservative assumption of an arbitrary data \
|
||||
order.""",
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon \
|
||||
should be closer to this value than the conservative assumption of an \
|
||||
arbitrary data order.""",
|
||||
width=80,
|
||||
)
|
||||
)
|
||||
|
|
|
@ -23,6 +23,8 @@ from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy_lib
|
|||
|
||||
_example_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_example_privacy
|
||||
_user_privacy = compute_dp_sgd_privacy_lib._compute_dp_sgd_user_privacy
|
||||
_RDP = compute_dp_sgd_privacy_lib.AccountantType.RDP
|
||||
_PLD = compute_dp_sgd_privacy_lib.AccountantType.PLD
|
||||
|
||||
|
||||
DP_SGD_STATEMENT_KWARGS = dict(
|
||||
|
@ -81,13 +83,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
|
|||
_example_privacy(**args)
|
||||
|
||||
@parameterized.named_parameters(
|
||||
('no_microbatching_no_subsampling', False, None, 10.8602036),
|
||||
('microbatching_no_subsampling', True, None, 26.2880374),
|
||||
('no_microbatching_with_subsampling', False, 1e-2, 3.2391922),
|
||||
('microbatching_with_subsampling', True, 1e-2, 22.5970358),
|
||||
('no_microbatching_no_subsampling_rdp', False, None, _RDP, 10.8602036),
|
||||
('microbatching_no_subsampling_rdp', True, None, _RDP, 26.2880374),
|
||||
('no_microbatching_with_subsampling_rdp', False, 1e-2, _RDP, 3.2391922),
|
||||
('microbatching_with_subsampling_rdp', True, 1e-2, _RDP, 22.5970358),
|
||||
('no_microbatching_no_subsampling_pld', False, None, _PLD, 10.1224946),
|
||||
('microbatching_no_subsampling_pld', True, None, _PLD, 24.7160779),
|
||||
('no_microbatching_with_subsampling_pld', False, 1e-2, _PLD, 2.4612381),
|
||||
('microbatching_with_subsampling_pld', True, 1e-2, _PLD, 18.6977407),
|
||||
)
|
||||
def test_compute_dp_sgd_example_privacy(
|
||||
self, used_microbatching, poisson_subsampling_probability, expected_eps
|
||||
self,
|
||||
used_microbatching,
|
||||
poisson_subsampling_probability,
|
||||
accountant_type,
|
||||
expected_eps,
|
||||
):
|
||||
num_epochs = 1.2
|
||||
noise_multiplier = 0.7
|
||||
|
@ -98,6 +108,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
|
|||
example_delta,
|
||||
used_microbatching,
|
||||
poisson_subsampling_probability,
|
||||
accountant_type,
|
||||
)
|
||||
self.assertAlmostEqual(eps, expected_eps)
|
||||
|
||||
|
@ -119,17 +130,21 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
|
|||
with self.assertRaises(ValueError):
|
||||
_user_privacy(**args)
|
||||
|
||||
def test_user_privacy_one_example_per_user(self):
|
||||
@parameterized.named_parameters(('RDP', _RDP), ('PLD', _PLD))
|
||||
def test_user_privacy_one_example_per_user(self, accountant_type):
|
||||
num_epochs = 1.2
|
||||
noise_multiplier = 0.7
|
||||
delta = 1e-5
|
||||
|
||||
example_eps = _example_privacy(num_epochs, noise_multiplier, delta)
|
||||
example_eps = _example_privacy(
|
||||
num_epochs, noise_multiplier, delta, accountant_type=accountant_type
|
||||
)
|
||||
user_eps = _user_privacy(
|
||||
num_epochs,
|
||||
noise_multiplier,
|
||||
delta,
|
||||
max_examples_per_user=1,
|
||||
accountant_type=accountant_type,
|
||||
)
|
||||
self.assertEqual(user_eps, example_eps)
|
||||
|
||||
|
@ -146,6 +161,7 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
|
|||
noise_multiplier=noise_multiplier,
|
||||
example_delta=example_delta,
|
||||
poisson_subsampling_probability=q,
|
||||
accountant_type=_RDP,
|
||||
)
|
||||
|
||||
user_delta = math.exp(
|
||||
|
@ -161,12 +177,14 @@ class ComputeDpSgdPrivacyTest(parameterized.TestCase):
|
|||
user_delta=user_delta,
|
||||
max_examples_per_user=max_examples_per_user,
|
||||
poisson_subsampling_probability=q,
|
||||
accountant_type=_RDP,
|
||||
)
|
||||
self.assertAlmostEqual(user_eps, example_eps * max_examples_per_user)
|
||||
|
||||
def test_dp_sgd_privacy_statement_no_user_dp(self):
|
||||
def test_dp_sgd_privacy_statement_no_user_dp_with_rdp(self):
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
**DP_SGD_STATEMENT_KWARGS,
|
||||
accountant_type=_RDP,
|
||||
)
|
||||
expected_statement = """\
|
||||
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
|
||||
|
@ -185,16 +203,17 @@ No user-level privacy guarantee is possible without a bound on the number of
|
|||
examples per user.
|
||||
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming
|
||||
that the data was randomly shuffled, it is believed the actual epsilon should be
|
||||
closer to this value than the conservative assumption of an arbitrary data
|
||||
order.
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon
|
||||
should be closer to this value than the conservative assumption of an arbitrary
|
||||
data order.
|
||||
"""
|
||||
self.assertEqual(statement, expected_statement)
|
||||
|
||||
def test_dp_sgd_privacy_statement_user_dp(self):
|
||||
def test_dp_sgd_privacy_statement_user_dp_with_rdp(self):
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
**DP_SGD_STATEMENT_KWARGS,
|
||||
max_examples_per_user=3,
|
||||
accountant_type=_RDP,
|
||||
)
|
||||
expected_statement = """\
|
||||
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
|
||||
|
@ -215,16 +234,17 @@ RDP accounting and group privacy:
|
|||
Epsilon assuming Poisson sampling (*): 6.425
|
||||
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming
|
||||
that the data was randomly shuffled, it is believed the actual epsilon should be
|
||||
closer to this value than the conservative assumption of an arbitrary data
|
||||
order.
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon
|
||||
should be closer to this value than the conservative assumption of an arbitrary
|
||||
data order.
|
||||
"""
|
||||
self.assertEqual(statement, expected_statement)
|
||||
|
||||
def test_dp_sgd_privacy_statement_user_dp_infinite(self):
|
||||
def test_dp_sgd_privacy_statement_user_dp_infinite_with_rdp(self):
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
**DP_SGD_STATEMENT_KWARGS,
|
||||
max_examples_per_user=10,
|
||||
accountant_type=_RDP,
|
||||
)
|
||||
expected_statement = """\
|
||||
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
|
||||
|
@ -245,14 +265,71 @@ RDP accounting and group privacy:
|
|||
Epsilon assuming Poisson sampling (*): inf (**)
|
||||
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming
|
||||
that the data was randomly shuffled, it is believed the actual epsilon should be
|
||||
closer to this value than the conservative assumption of an arbitrary data
|
||||
order.
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon
|
||||
should be closer to this value than the conservative assumption of an arbitrary
|
||||
data order.
|
||||
|
||||
(**) A finite example-level epsilon implies a finite user-level epsilon at any
|
||||
`max_examples_per_user`, but because conversion from example-level to user-level
|
||||
DP is not exact, it is possible for the upper bound on the user-level epsilon to
|
||||
still be infinite.
|
||||
"""
|
||||
self.assertEqual(statement, expected_statement)
|
||||
|
||||
def test_dp_sgd_privacy_statement_no_user_dp_with_pld(self):
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
**DP_SGD_STATEMENT_KWARGS,
|
||||
accountant_type=_PLD,
|
||||
)
|
||||
expected_statement = """\
|
||||
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
|
||||
multiplier 2.0 for 5.0 epochs with microbatching, and no bound on number of
|
||||
examples per user.
|
||||
|
||||
This privacy guarantee protects the release of all model checkpoints in addition
|
||||
to the final model.
|
||||
|
||||
Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
|
||||
PLD accounting:
|
||||
Epsilon with each example occurring once per epoch: 12.595
|
||||
Epsilon assuming Poisson sampling (*): 1.199
|
||||
|
||||
No user-level privacy guarantee is possible without a bound on the number of
|
||||
examples per user.
|
||||
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon
|
||||
should be closer to this value than the conservative assumption of an arbitrary
|
||||
data order.
|
||||
"""
|
||||
self.assertEqual(statement, expected_statement)
|
||||
|
||||
def test_dp_sgd_privacy_statement_user_dp_with_pld(self):
|
||||
statement = compute_dp_sgd_privacy_lib.compute_dp_sgd_privacy_statement(
|
||||
**DP_SGD_STATEMENT_KWARGS,
|
||||
max_examples_per_user=3,
|
||||
accountant_type=_PLD,
|
||||
)
|
||||
expected_statement = """\
|
||||
DP-SGD performed over 10000 examples with 64 examples per iteration, noise
|
||||
multiplier 2.0 for 5.0 epochs with microbatching, and at most 3 examples per
|
||||
user.
|
||||
|
||||
This privacy guarantee protects the release of all model checkpoints in addition
|
||||
to the final model.
|
||||
|
||||
Example-level DP with add-or-remove-one adjacency at delta = 1e-06 computed with
|
||||
PLD accounting:
|
||||
Epsilon with each example occurring once per epoch: 12.595
|
||||
Epsilon assuming Poisson sampling (*): 1.199
|
||||
|
||||
User-level DP epsilon computation is not supported for PLD accounting at this
|
||||
time. Use RDP accounting to obtain user-level DP guarantees.
|
||||
|
||||
(*) Poisson sampling is not usually done in training pipelines, but assuming
|
||||
that the data was randomly shuffled, it is believed that the actual epsilon
|
||||
should be closer to this value than the conservative assumption of an arbitrary
|
||||
data order.
|
||||
"""
|
||||
self.assertEqual(statement, expected_statement)
|
||||
|
||||
|
|
Loading…
Reference in a new issue