Automated rollback of commit cff47686f6

PiperOrigin-RevId: 471104040
Authored by Steve Chien on 2022-08-30 15:22:41 -07:00; committed by A. Unique TensorFlower
parent cff47686f6
commit 875b7f46bd
4 changed files with 200 additions and 295 deletions

View file

@@ -61,14 +61,9 @@ else:
from tensorflow_privacy.privacy.keras_models.dp_keras_model import make_dp_model_class
# Optimizers
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import GenericDPAdagradOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import GenericDPAdamOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import GenericDPSGDOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdagradOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import make_gaussian_query_optimizer_class
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import make_keras_generic_optimizer_class
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import make_keras_optimizer_class
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras_vectorized import VectorizedDPKerasAdagradOptimizer
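With this rollback, the package-level exports above revert to the three DPKeras* optimizer classes; the Generic* classes and the make_* factory helpers, apparently introduced by the rolled-back commit cff47686f6, disappear again. A minimal usage sketch of the surviving surface, assuming this hunk edits the package __init__ (the file path is not shown in this view) and that the constructor keeps the signature restored later in this diff:

# Sketch only; not part of the change itself.
from tensorflow_privacy import DPKerasSGDOptimizer

optimizer = DPKerasSGDOptimizer(
    l2_norm_clip=1.0,      # max L2 norm of each microbatch gradient
    noise_multiplier=1.1,  # noise stddev = l2_norm_clip * noise_multiplier
    num_microbatches=32,   # batch size must be divisible by this
    learning_rate=0.1)     # forwarded to the underlying Keras SGD optimizer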

View file

@@ -18,18 +18,6 @@ py_library(
deps = ["//tensorflow_privacy/privacy/dp_query:gaussian_query"],
)
py_library(
name = "dp_optimizer_factory",
srcs = [
"dp_optimizer_keras.py",
],
srcs_version = "PY3",
deps = [
"//tensorflow_privacy/privacy/dp_query",
"//tensorflow_privacy/privacy/dp_query:gaussian_query",
],
)
py_library(
name = "dp_optimizer_vectorized",
srcs = [
@@ -44,10 +32,7 @@ py_library(
"dp_optimizer_keras.py",
],
srcs_version = "PY3",
deps = [
"//tensorflow_privacy/privacy/dp_query",
"//tensorflow_privacy/privacy/dp_query:gaussian_query",
],
deps = ["//tensorflow_privacy/privacy/dp_query:gaussian_query"],
)
py_library(
@@ -99,7 +84,7 @@ py_test(
python_version = "PY3",
srcs_version = "PY3",
deps = [
":dp_optimizer_keras",
":dp_optimizer_keras_vectorized",
"//tensorflow_privacy/privacy/optimizers:dp_optimizer_keras",
"//tensorflow_privacy/privacy/optimizers:dp_optimizer_keras_vectorized",
],
)

View file

@@ -13,28 +13,21 @@
# limitations under the License.
# ==============================================================================
"""Differentially private version of Keras optimizer v2."""
from typing import Optional, Type
import warnings
import tensorflow as tf
from tensorflow_privacy.privacy.dp_query import dp_query
from tensorflow_privacy.privacy.dp_query import gaussian_query
def _normalize(microbatch_gradient: tf.Tensor,
num_microbatches: float) -> tf.Tensor:
"""Normalizes `microbatch_gradient` by `num_microbatches`."""
return tf.truediv(microbatch_gradient,
tf.cast(num_microbatches, microbatch_gradient.dtype))
def make_keras_generic_optimizer_class(
cls: Type[tf.keras.optimizers.Optimizer]):
"""Returns a differentially private (DP) subclass of `cls`.
def make_keras_optimizer_class(cls):
"""Given a subclass of `tf.keras.optimizers.Optimizer`, returns a DP-SGD subclass of it.
Args:
cls: Class from which to derive a DP subclass. Should be a subclass of
`tf.keras.optimizers.Optimizer`.
Returns:
A DP-SGD subclass of `cls`.
"""
class DPOptimizerClass(cls): # pylint: disable=empty-docstring
@@ -145,23 +138,24 @@ def make_keras_generic_optimizer_class(
def __init__(
self,
dp_sum_query: dp_query.DPQuery,
num_microbatches: Optional[int] = None,
gradient_accumulation_steps: int = 1,
l2_norm_clip,
noise_multiplier,
num_microbatches=None,
gradient_accumulation_steps=1,
*args, # pylint: disable=keyword-arg-before-vararg, g-doc-args
**kwargs):
"""Initializes the DPOptimizerClass.
"""Initialize the DPOptimizerClass.
Args:
dp_sum_query: `DPQuery` object, specifying differential privacy
mechanism to use.
l2_norm_clip: Clipping norm (max L2 norm of per microbatch gradients).
noise_multiplier: Ratio of the standard deviation to the clipping norm.
num_microbatches: Number of microbatches into which each minibatch is
split. Default is `None` which means that number of microbatches is
equal to batch size (i.e. each microbatch contains exactly one
split. Default is `None` which means that number of microbatches
is equal to batch size (i.e. each microbatch contains exactly one
example). If `gradient_accumulation_steps` is greater than 1 and
`num_microbatches` is not `None` then the effective number of
microbatches is equal to `num_microbatches *
gradient_accumulation_steps`.
microbatches is equal to
`num_microbatches * gradient_accumulation_steps`.
gradient_accumulation_steps: If greater than 1 then optimizer will be
accumulating gradients for this number of optimizer steps before
applying them to update model weights. If this argument is set to 1
@@ -171,13 +165,13 @@ def make_keras_generic_optimizer_class(
"""
super().__init__(*args, **kwargs)
self.gradient_accumulation_steps = gradient_accumulation_steps
self._l2_norm_clip = l2_norm_clip
self._noise_multiplier = noise_multiplier
self._num_microbatches = num_microbatches
self._dp_sum_query = dp_sum_query
self._was_dp_gradients_called = False
# We initialize the self.`_global_state` within the gradient functions
# (and not here) because tensors must be initialized within the graph.
self._dp_sum_query = gaussian_query.GaussianSumQuery(
l2_norm_clip, l2_norm_clip * noise_multiplier)
self._global_state = None
self._was_dp_gradients_called = False
def _create_slots(self, var_list):
super()._create_slots(var_list) # pytype: disable=attribute-error
@@ -241,62 +235,66 @@ def make_keras_generic_optimizer_class(
"""DP-SGD version of base class method."""
self._was_dp_gradients_called = True
if self._global_state is None:
self._global_state = self._dp_sum_query.initial_global_state()
# Compute loss.
if not callable(loss) and tape is None:
raise ValueError('`tape` is required when a `Tensor` loss is passed.')
tape = tape if tape is not None else tf.GradientTape()
with tape:
if callable(loss):
if callable(loss):
with tape:
if not callable(var_list):
tape.watch(var_list)
loss = loss()
if self._num_microbatches is None:
num_microbatches = tf.shape(input=loss)[0]
else:
num_microbatches = self._num_microbatches
microbatch_losses = tf.reduce_mean(
tf.reshape(loss, [num_microbatches, -1]), axis=1)
if self._num_microbatches is None:
num_microbatches = tf.shape(input=loss)[0]
else:
num_microbatches = self._num_microbatches
microbatch_losses = tf.reduce_mean(
tf.reshape(loss, [num_microbatches, -1]), axis=1)
if callable(var_list):
var_list = var_list()
if callable(var_list):
var_list = var_list()
else:
with tape:
if self._num_microbatches is None:
num_microbatches = tf.shape(input=loss)[0]
else:
num_microbatches = self._num_microbatches
microbatch_losses = tf.reduce_mean(
tf.reshape(loss, [num_microbatches, -1]), axis=1)
var_list = tf.nest.flatten(var_list)
sample_params = (
self._dp_sum_query.derive_sample_params(self._global_state))
# Compute the per-microbatch losses using helpful jacobian method.
with tf.keras.backend.name_scope(self._name + '/gradients'):
jacobian_per_var = tape.jacobian(
jacobian = tape.jacobian(
microbatch_losses, var_list, unconnected_gradients='zero')
def process_microbatch(sample_state, microbatch_jacobians):
"""Process one microbatch (record) with privacy helper."""
sample_state = self._dp_sum_query.accumulate_record(
sample_params, sample_state, microbatch_jacobians)
return sample_state
# Clip gradients to given l2_norm_clip.
def clip_gradients(g):
return tf.clip_by_global_norm(g, self._l2_norm_clip)[0]
sample_state = self._dp_sum_query.initial_sample_state(var_list)
for idx in range(num_microbatches):
microbatch_jacobians_per_var = [
jacobian[idx] for jacobian in jacobian_per_var
]
sample_state = process_microbatch(sample_state,
microbatch_jacobians_per_var)
clipped_gradients = tf.map_fn(clip_gradients, jacobian)
grad_sums, self._global_state, _ = (
self._dp_sum_query.get_noised_result(sample_state,
self._global_state))
final_grads = tf.nest.map_structure(_normalize, grad_sums,
[num_microbatches] * len(grad_sums))
def reduce_noise_normalize_batch(g):
# Sum gradients over all microbatches.
summed_gradient = tf.reduce_sum(g, axis=0)
return list(zip(final_grads, var_list))
# Add noise to summed gradients.
noise_stddev = self._l2_norm_clip * self._noise_multiplier
noise = tf.random.normal(
tf.shape(input=summed_gradient), stddev=noise_stddev)
noised_gradient = tf.add(summed_gradient, noise)
# Normalize by number of microbatches and return.
return tf.truediv(noised_gradient,
tf.cast(num_microbatches, tf.float32))
final_gradients = tf.nest.map_structure(reduce_noise_normalize_batch,
clipped_gradients)
return list(zip(final_gradients, var_list))
def get_gradients(self, loss, params):
"""DP-SGD version of base class method."""
@@ -324,13 +322,17 @@ def make_keras_generic_optimizer_class(
sample_state = self._dp_sum_query.initial_sample_state(params)
for idx in range(self._num_microbatches):
sample_state = process_microbatch(idx, sample_state)
grad_sums, self._global_state, _ = (
self._dp_sum_query.get_noised_result(sample_state,
self._global_state))
final_grads = tf.nest.map_structure(
_normalize, grad_sums, [self._num_microbatches] * len(grad_sums))
def normalize(v):
try:
return tf.truediv(v, tf.cast(self._num_microbatches, tf.float32))
except TypeError:
return None
final_grads = tf.nest.map_structure(normalize, grad_sums)
return final_grads
@@ -366,87 +368,7 @@ def make_keras_generic_optimizer_class(
return DPOptimizerClass
def make_gaussian_query_optimizer_class(cls):
"""Returns a differentially private optimizer using the `GaussianSumQuery`.
Args:
cls: `DPOptimizerClass`, the output of `make_keras_optimizer_class`.
"""
def return_gaussian_query_optimizer(
l2_norm_clip: float,
noise_multiplier: float,
num_microbatches: Optional[int] = None,
gradient_accumulation_steps: int = 1,
*args, # pylint: disable=keyword-arg-before-vararg, g-doc-args
**kwargs):
"""Returns a `DPOptimizerClass` `cls` using the `GaussianSumQuery`.
This function is a thin wrapper around
`make_keras_optimizer_class.<locals>.DPOptimizerClass` which can be used to
apply a `GaussianSumQuery` to any `DPOptimizerClass`.
When combined with stochastic gradient descent, this creates the canonical
DP-SGD algorithm of "Deep Learning with Differential Privacy"
(see https://arxiv.org/abs/1607.00133).
Args:
l2_norm_clip: Clipping norm (max L2 norm of per microbatch gradients).
noise_multiplier: Ratio of the standard deviation to the clipping norm.
num_microbatches: Number of microbatches into which each minibatch is
split. Default is `None` which means that number of microbatches is
equal to batch size (i.e. each microbatch contains exactly one example).
If `gradient_accumulation_steps` is greater than 1 and
`num_microbatches` is not `None` then the effective number of
microbatches is equal to `num_microbatches *
gradient_accumulation_steps`.
gradient_accumulation_steps: If greater than 1 then optimizer will be
accumulating gradients for this number of optimizer steps before
applying them to update model weights. If this argument is set to 1 then
updates will be applied on each optimizer step.
*args: These will be passed on to the base class `__init__` method.
**kwargs: These will be passed on to the base class `__init__` method.
"""
dp_sum_query = gaussian_query.GaussianSumQuery(
l2_norm_clip, l2_norm_clip * noise_multiplier)
return cls(
dp_sum_query=dp_sum_query,
num_microbatches=num_microbatches,
gradient_accumulation_steps=gradient_accumulation_steps,
*args,
**kwargs)
return return_gaussian_query_optimizer
def make_keras_optimizer_class(cls: Type[tf.keras.optimizers.Optimizer]):
"""Returns a differentially private optimizer using the `GaussianSumQuery`.
For backwards compatibility, we create this symbol to match the previous
output of `make_keras_optimizer_class` but using the new logic.
Args:
cls: Class from which to derive a DP subclass. Should be a subclass of
`tf.keras.optimizers.Optimizer`.
"""
warnings.warn(
'`make_keras_optimizer_class` will be deprecated on 2023-02-23. '
'Please switch to `make_gaussian_query_optimizer_class` and the '
'generic optimizers (`make_keras_generic_optimizer_class`).')
return make_gaussian_query_optimizer_class(
make_keras_generic_optimizer_class(cls))
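The two factory paths above mirror the two sides of this hunk. For contrast, a sketch of how each would be called: the composed factory is part of the code being removed by the rollback, while make_keras_optimizer_class, which builds its own GaussianSumQuery internally, is what remains. The two variants never coexist in one checkout; this only juxtaposes their call patterns.

# Illustrative sketch; neither call appears in the diff itself.
import tensorflow as tf

# Pre-rollback path (removed here): generic wrapper + Gaussian-query factory.
opt_removed = make_gaussian_query_optimizer_class(
    make_keras_generic_optimizer_class(tf.keras.optimizers.SGD))(
        l2_norm_clip=1.0, noise_multiplier=0.5, num_microbatches=1,
        learning_rate=0.1)

# Post-rollback path (restored here): self-contained DP-SGD subclass.
opt_restored = make_keras_optimizer_class(tf.keras.optimizers.SGD)(
    l2_norm_clip=1.0, noise_multiplier=0.5, num_microbatches=1,
    learning_rate=0.1)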
GenericDPAdagradOptimizer = make_keras_generic_optimizer_class(
DPKerasAdagradOptimizer = make_keras_optimizer_class(
tf.keras.optimizers.Adagrad)
GenericDPAdamOptimizer = make_keras_generic_optimizer_class(
tf.keras.optimizers.Adam)
GenericDPSGDOptimizer = make_keras_generic_optimizer_class(
tf.keras.optimizers.SGD)
# We keep the same names for backwards compatibility.
DPKerasAdagradOptimizer = make_gaussian_query_optimizer_class(
GenericDPAdagradOptimizer)
DPKerasAdamOptimizer = make_gaussian_query_optimizer_class(
GenericDPAdamOptimizer)
DPKerasSGDOptimizer = make_gaussian_query_optimizer_class(GenericDPSGDOptimizer)
DPKerasAdamOptimizer = make_keras_optimizer_class(tf.keras.optimizers.Adam)
DPKerasSGDOptimizer = make_keras_optimizer_class(tf.keras.optimizers.SGD)
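For completeness, a sketch of how one of these exported optimizers is typically attached to a Keras model: the loss is left un-reduced so that _compute_gradients receives a per-example loss vector it can reshape into num_microbatches microbatches. The model, data, and hyperparameters below are placeholders, not part of this change.

# Sketch only; assumes the DPKerasSGDOptimizer defined above.
import tensorflow as tf

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(10, activation='softmax')])
optimizer = DPKerasSGDOptimizer(
    l2_norm_clip=1.0,
    noise_multiplier=1.1,
    num_microbatches=32,  # the fit() batch size must be divisible by this
    learning_rate=0.1)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE),  # keep per-example losses
    metrics=['accuracy'])
# model.fit(x_train, y_train, batch_size=256, epochs=1)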

View file

@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
@@ -28,29 +29,36 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
return 0.5 * tf.reduce_sum(
input_tensor=tf.math.squared_difference(val0, val1), axis=1)
# Parameters for testing: optimizer, num_microbatches, expected gradient for
# var0, expected gradient for var1.
@parameterized.named_parameters(
('DPGradientDescent_1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPAdam_2', dp_optimizer_keras.DPKerasAdamOptimizer, 2),
('DPAdagrad_4', dp_optimizer_keras.DPKerasAdagradOptimizer, 4),
('DPGradientDescentVectorized_1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1),
('DPAdamVectorized_2',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer, 2),
('DPAdagradVectorized_4',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, 4),
('DPAdagradVectorized_None',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, None),
('DPGradientDescent 1', dp_optimizer_keras.DPKerasSGDOptimizer, 1,
[-2.5, -2.5], [-0.5]),
('DPAdam 2', dp_optimizer_keras.DPKerasAdamOptimizer, 2, [-2.5, -2.5
], [-0.5]),
('DPAdagrad 4', dp_optimizer_keras.DPKerasAdagradOptimizer, 4,
[-2.5, -2.5], [-0.5]),
('DPGradientDescentVectorized 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1,
[-2.5, -2.5], [-0.5]),
('DPAdamVectorized 2',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer, 2,
[-2.5, -2.5], [-0.5]),
('DPAdagradVectorized 4',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, 4,
[-2.5, -2.5], [-0.5]),
('DPAdagradVectorized None',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, None,
[-2.5, -2.5], [-0.5]),
)
def testBaselineWithCallableLossNoNoise(self, optimizer_class,
num_microbatches):
def testBaselineWithCallableLoss(self, cls, num_microbatches, expected_grad0,
expected_grad1):
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0])
data0 = tf.Variable([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
data1 = tf.Variable([[8.0], [2.0], [3.0], [1.0]])
expected_grad0 = [-2.5, -2.5]
expected_grad1 = [-0.5]
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=100.0,
noise_multiplier=0.0,
num_microbatches=num_microbatches,
@@ -58,34 +66,40 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
loss = lambda: self._loss(data0, var0) + self._loss(data1, var1)
grads_and_vars = optimizer._compute_gradients(loss, [var0, var1])
grads_and_vars = opt._compute_gradients(loss, [var0, var1])
self.assertAllCloseAccordingToType(expected_grad0, grads_and_vars[0][0])
self.assertAllCloseAccordingToType(expected_grad1, grads_and_vars[1][0])
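The expected gradients in the parameter lists above follow directly from the quadratic _loss helper: with noise_multiplier=0.0 and an effectively unbounded clipping norm of 100.0, the DP gradient reduces to the per-example gradient averaged over the batch, independent of num_microbatches. A quick numeric check of those values (illustrative only):

# Sanity-check sketch; not part of the test file.
import numpy as np

var0, var1 = np.array([1.0, 2.0]), np.array([3.0])
data0 = np.array([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
data1 = np.array([[8.0], [2.0], [3.0], [1.0]])
print(np.mean(var0 - data0, axis=0))  # [-2.5 -2.5] == expected_grad0
print(np.mean(var1 - data1, axis=0))  # [-0.5]      == expected_grad1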
# Parameters for testing: optimizer, num_microbatches, expected gradient for
# var0, expected gradient for var1.
@parameterized.named_parameters(
('DPGradientDescent_1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPAdam_2', dp_optimizer_keras.DPKerasAdamOptimizer, 2),
('DPAdagrad_4', dp_optimizer_keras.DPKerasAdagradOptimizer, 4),
('DPGradientDescentVectorized_1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1),
('DPAdamVectorized_2',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer, 2),
('DPAdagradVectorized_4',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, 4),
('DPAdagradVectorized_None',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, None),
('DPGradientDescent 1', dp_optimizer_keras.DPKerasSGDOptimizer, 1,
[-2.5, -2.5], [-0.5]),
('DPAdam 2', dp_optimizer_keras.DPKerasAdamOptimizer, 2, [-2.5, -2.5
], [-0.5]),
('DPAdagrad 4', dp_optimizer_keras.DPKerasAdagradOptimizer, 4,
[-2.5, -2.5], [-0.5]),
('DPGradientDescentVectorized 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1,
[-2.5, -2.5], [-0.5]),
('DPAdamVectorized 2',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer, 2,
[-2.5, -2.5], [-0.5]),
('DPAdagradVectorized 4',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, 4,
[-2.5, -2.5], [-0.5]),
('DPAdagradVectorized None',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdagradOptimizer, None,
[-2.5, -2.5], [-0.5]),
)
def testBaselineWithTensorLossNoNoise(self, optimizer_class,
num_microbatches):
def testBaselineWithTensorLoss(self, cls, num_microbatches, expected_grad0,
expected_grad1):
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0])
data0 = tf.Variable([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0]])
data1 = tf.Variable([[8.0], [2.0], [3.0], [1.0]])
expected_grad0 = [-2.5, -2.5]
expected_grad1 = [-0.5]
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=100.0,
noise_multiplier=0.0,
num_microbatches=num_microbatches,
@@ -95,7 +109,7 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
with tape:
loss = self._loss(data0, var0) + self._loss(data1, var1)
grads_and_vars = optimizer._compute_gradients(loss, [var0, var1], tape=tape)
grads_and_vars = opt._compute_gradients(loss, [var0, var1], tape=tape)
self.assertAllCloseAccordingToType(expected_grad0, grads_and_vars[0][0])
self.assertAllCloseAccordingToType(expected_grad1, grads_and_vars[1][0])
@@ -104,11 +118,11 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
('DPGradientDescentVectorized',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer),
)
def testClippingNorm(self, optimizer_class):
def testClippingNorm(self, cls):
var0 = tf.Variable([0.0, 0.0])
data0 = tf.Variable([[3.0, 4.0], [6.0, 8.0]])
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=1.0,
noise_multiplier=0.0,
num_microbatches=1,
@@ -116,39 +130,37 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
loss = lambda: self._loss(data0, var0)
# Expected gradient is sum of differences.
grads_and_vars = optimizer._compute_gradients(loss, [var0])
grads_and_vars = opt._compute_gradients(loss, [var0])
self.assertAllCloseAccordingToType([-0.6, -0.8], grads_and_vars[0][0])
@parameterized.named_parameters(
('DPGradientDescent_2_4_1', dp_optimizer_keras.DPKerasSGDOptimizer, 2.0,
('DPGradientDescent 2 4 1', dp_optimizer_keras.DPKerasSGDOptimizer, 2.0,
4.0, 1),
('DPGradientDescent_4_1_4', dp_optimizer_keras.DPKerasSGDOptimizer, 4.0,
('DPGradientDescent 4 1 4', dp_optimizer_keras.DPKerasSGDOptimizer, 4.0,
1.0, 4),
('DPGradientDescentVectorized_2_4_1',
('DPGradientDescentVectorized 2 4 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 2.0, 4.0,
1),
('DPGradientDescentVectorized_4_1_4',
('DPGradientDescentVectorized 4 1 4',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 4.0, 1.0,
4),
)
def testNoiseMultiplier(self, optimizer_class, l2_norm_clip, noise_multiplier,
def testNoiseMultiplier(self, cls, l2_norm_clip, noise_multiplier,
num_microbatches):
tf.random.set_seed(2)
var0 = tf.Variable(tf.zeros([1000], dtype=tf.float32))
data0 = tf.Variable(tf.zeros([16, 1000], dtype=tf.float32))
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=l2_norm_clip,
noise_multiplier=noise_multiplier,
num_microbatches=num_microbatches,
learning_rate=2.0)
loss = lambda: self._loss(data0, var0)
grads_and_vars = optimizer._compute_gradients(loss, [var0])
grads_and_vars = opt._compute_gradients(loss, [var0])
grads = grads_and_vars[0][0].numpy()
# Test standard deviation is close to l2_norm_clip * noise_multiplier.
self.assertNear(
np.std(grads), l2_norm_clip * noise_multiplier / num_microbatches, 0.5)
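The tolerance asserted here reflects how both optimizer variants add noise: Gaussian noise with standard deviation l2_norm_clip * noise_multiplier is added to the clipped gradient sum (exactly zero in this test, since var0 and data0 are all zeros), and that sum is then divided by num_microbatches, so each coordinate of the returned gradient has standard deviation l2_norm_clip * noise_multiplier / num_microbatches. A standalone sketch of the relation:

# Illustrative sketch; values match the 'DPGradientDescent 4 1 4' case above.
import numpy as np

l2_norm_clip, noise_multiplier, num_microbatches = 4.0, 1.0, 4
noise = np.random.normal(0.0, l2_norm_clip * noise_multiplier, size=100000)
print(np.std(noise / num_microbatches))  # close to 4.0 * 1.0 / 4 = 1.0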
@@ -163,9 +175,9 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
('DPAdamVectorized',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer),
)
def testRaisesOnNoCallOfComputeGradients(self, optimizer_class):
def testAssertOnNoCallOfComputeGradients(self, cls):
"""Tests that assertion fails when DP gradients are not computed."""
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=100.0,
noise_multiplier=0.0,
num_microbatches=1,
@@ -173,14 +185,14 @@ class DPOptimizerComputeGradientsTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(AssertionError):
grads_and_vars = tf.Variable([0.0])
optimizer.apply_gradients(grads_and_vars)
opt.apply_gradients(grads_and_vars)
# Expect no exception if _compute_gradients is called.
var0 = tf.Variable([0.0])
data0 = tf.Variable([[0.0]])
loss = lambda: self._loss(data0, var0)
grads_and_vars = optimizer._compute_gradients(loss, [var0])
optimizer.apply_gradients(grads_and_vars)
grads_and_vars = opt._compute_gradients(loss, [var0])
opt.apply_gradients(grads_and_vars)
class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
@@ -190,8 +202,8 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
the Estimator framework.
"""
def _make_linear_model_fn(self, optimizer_class, l2_norm_clip,
noise_multiplier, num_microbatches, learning_rate):
def _make_linear_model_fn(self, opt_cls, l2_norm_clip, noise_multiplier,
num_microbatches, learning_rate):
"""Returns a model function for a linear regressor."""
def linear_model_fn(features, labels, mode):
@@ -206,7 +218,7 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
vector_loss = 0.5 * tf.math.squared_difference(labels, preds)
scalar_loss = tf.reduce_mean(input_tensor=vector_loss)
optimizer = optimizer_class(
optimizer = opt_cls(
l2_norm_clip=l2_norm_clip,
noise_multiplier=noise_multiplier,
num_microbatches=num_microbatches,
@@ -222,25 +234,26 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
return linear_model_fn
# Parameters for testing: optimizer, num_microbatches.
@parameterized.named_parameters(
('DPGradientDescent_1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPGradientDescent_2', dp_optimizer_keras.DPKerasSGDOptimizer, 2),
('DPGradientDescent_4', dp_optimizer_keras.DPKerasSGDOptimizer, 4),
('DPGradientDescentVectorized_1',
('DPGradientDescent 1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPGradientDescent 2', dp_optimizer_keras.DPKerasSGDOptimizer, 2),
('DPGradientDescent 4', dp_optimizer_keras.DPKerasSGDOptimizer, 4),
('DPGradientDescentVectorized 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1),
('DPGradientDescentVectorized_2',
('DPGradientDescentVectorized 2',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 2),
('DPGradientDescentVectorized_4',
('DPGradientDescentVectorized 4',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 4),
('DPGradientDescentVectorized_None',
('DPGradientDescentVectorized None',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, None),
)
def testBaselineNoNoise(self, optimizer_class, num_microbatches):
def testBaseline(self, cls, num_microbatches):
"""Tests that DP optimizers work with tf.estimator."""
linear_regressor = tf_estimator.Estimator(
model_fn=self._make_linear_model_fn(optimizer_class, 100.0, 0.0,
num_microbatches, 0.05))
model_fn=self._make_linear_model_fn(cls, 100.0, 0.0, num_microbatches,
0.05))
true_weights = np.array([[-5], [4], [3], [2]]).astype(np.float32)
true_bias = np.array([6.0]).astype(np.float32)
@@ -263,12 +276,13 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(
linear_regressor.get_variable_value('dense/bias'), true_bias, atol=0.05)
# Parameters for testing: optimizer, num_microbatches.
@parameterized.named_parameters(
('DPGradientDescent_1', dp_optimizer_keras.DPKerasSGDOptimizer),
('DPGradientDescentVectorized_1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer),
('DPGradientDescent 1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPGradientDescentVectorized 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 1),
)
def testClippingNorm(self, optimizer_class):
def testClippingNorm(self, cls, num_microbatches):
"""Tests that DP optimizers work with tf.estimator."""
true_weights = np.array([[6.0], [0.0], [0], [0]]).astype(np.float32)
@@ -282,12 +296,8 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
(train_data, train_labels)).batch(1)
unclipped_linear_regressor = tf_estimator.Estimator(
model_fn=self._make_linear_model_fn(
optimizer_class=optimizer_class,
l2_norm_clip=1.0e9,
noise_multiplier=0.0,
num_microbatches=1,
learning_rate=1.0))
model_fn=self._make_linear_model_fn(cls, 1.0e9, 0.0, num_microbatches,
1.0))
unclipped_linear_regressor.train(input_fn=train_input_fn, steps=1)
kernel_value = unclipped_linear_regressor.get_variable_value('dense/kernel')
@@ -295,12 +305,8 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
global_norm = np.linalg.norm(np.concatenate((kernel_value, [bias_value])))
clipped_linear_regressor = tf_estimator.Estimator(
model_fn=self._make_linear_model_fn(
optimizer_class=optimizer_class,
l2_norm_clip=1.0,
noise_multiplier=0.0,
num_microbatches=1,
learning_rate=1.0))
model_fn=self._make_linear_model_fn(cls, 1.0, 0.0, num_microbatches,
1.0))
clipped_linear_regressor.train(input_fn=train_input_fn, steps=1)
self.assertAllClose(
@@ -315,29 +321,29 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
# Parameters for testing: optimizer, l2_norm_clip, noise_multiplier,
# num_microbatches.
@parameterized.named_parameters(
('DPGradientDescent_2_4_1', dp_optimizer_keras.DPKerasSGDOptimizer, 2.0,
('DPGradientDescent 2 4 1', dp_optimizer_keras.DPKerasSGDOptimizer, 2.0,
4.0, 1),
('DPGradientDescent_3_2_4', dp_optimizer_keras.DPKerasSGDOptimizer, 3.0,
('DPGradientDescent 3 2 4', dp_optimizer_keras.DPKerasSGDOptimizer, 3.0,
2.0, 4),
('DPGradientDescent_8_6_8', dp_optimizer_keras.DPKerasSGDOptimizer, 8.0,
('DPGradientDescent 8 6 8', dp_optimizer_keras.DPKerasSGDOptimizer, 8.0,
6.0, 8),
('DPGradientDescentVectorized_2_4_1',
('DPGradientDescentVectorized 2 4 1',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 2.0, 4.0,
1),
('DPGradientDescentVectorized_3_2_4',
('DPGradientDescentVectorized 3 2 4',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 3.0, 2.0,
4),
('DPGradientDescentVectorized_8_6_8',
('DPGradientDescentVectorized 8 6 8',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, 8.0, 6.0,
8),
)
def testNoiseMultiplier(self, optimizer_class, l2_norm_clip, noise_multiplier,
def testNoiseMultiplier(self, cls, l2_norm_clip, noise_multiplier,
num_microbatches):
"""Tests that DP optimizers work with tf.estimator."""
linear_regressor = tf_estimator.Estimator(
model_fn=self._make_linear_model_fn(
optimizer_class,
cls,
l2_norm_clip,
noise_multiplier,
num_microbatches,
@@ -371,9 +377,9 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
('DPAdamVectorized',
dp_optimizer_keras_vectorized.VectorizedDPKerasAdamOptimizer),
)
def testRaisesOnNoCallOfGetGradients(self, optimizer_class):
def testAssertOnNoCallOfGetGradients(self, cls):
"""Tests that assertion fails when DP gradients are not computed."""
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=100.0,
noise_multiplier=0.0,
num_microbatches=1,
@@ -381,7 +387,7 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(AssertionError):
grads_and_vars = tf.Variable([0.0])
optimizer.apply_gradients(grads_and_vars)
opt.apply_gradients(grads_and_vars)
def testLargeBatchEmulationNoNoise(self):
# Test for emulation of large batch training.
@@ -402,7 +408,7 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
x2 = tf.constant([[4.0, 2.0], [2.0, 1.0]], dtype=tf.float32)
loss2 = lambda: tf.matmul(var0, x2, transpose_b=True) + var1
optimizer = dp_optimizer_keras.DPKerasSGDOptimizer(
opt = dp_optimizer_keras.DPKerasSGDOptimizer(
l2_norm_clip=100.0,
noise_multiplier=0.0,
gradient_accumulation_steps=2,
@@ -412,36 +418,35 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllCloseAccordingToType([[1.0, 2.0]], var0)
self.assertAllCloseAccordingToType([3.0], var1)
optimizer.minimize(loss1, [var0, var1])
opt.minimize(loss1, [var0, var1])
# After first call to optimizer values didn't change
self.assertAllCloseAccordingToType([[1.0, 2.0]], var0)
self.assertAllCloseAccordingToType([3.0], var1)
optimizer.minimize(loss2, [var0, var1])
opt.minimize(loss2, [var0, var1])
# After second call to optimizer updates were applied
self.assertAllCloseAccordingToType([[-1.0, 1.0]], var0)
self.assertAllCloseAccordingToType([2.0], var1)
optimizer.minimize(loss2, [var0, var1])
opt.minimize(loss2, [var0, var1])
# After third call to optimizer values didn't change
self.assertAllCloseAccordingToType([[-1.0, 1.0]], var0)
self.assertAllCloseAccordingToType([2.0], var1)
optimizer.minimize(loss2, [var0, var1])
opt.minimize(loss2, [var0, var1])
# After fourth call to optimizer updates were applied again
self.assertAllCloseAccordingToType([[-4.0, -0.5]], var0)
self.assertAllCloseAccordingToType([1.0], var1)
@parameterized.named_parameters(
('DPKerasSGDOptimizer_1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPKerasSGDOptimizer_2', dp_optimizer_keras.DPKerasSGDOptimizer, 2),
('DPKerasSGDOptimizer_4', dp_optimizer_keras.DPKerasSGDOptimizer, 4),
('DPKerasAdamOptimizer_2', dp_optimizer_keras.DPKerasAdamOptimizer, 1),
('DPKerasAdagradOptimizer_2', dp_optimizer_keras.DPKerasAdagradOptimizer,
('DPKerasSGDOptimizer 1', dp_optimizer_keras.DPKerasSGDOptimizer, 1),
('DPKerasSGDOptimizer 2', dp_optimizer_keras.DPKerasSGDOptimizer, 2),
('DPKerasSGDOptimizer 4', dp_optimizer_keras.DPKerasSGDOptimizer, 4),
('DPKerasAdamOptimizer 2', dp_optimizer_keras.DPKerasAdamOptimizer, 1),
('DPKerasAdagradOptimizer 2', dp_optimizer_keras.DPKerasAdagradOptimizer,
2),
)
def testLargeBatchEmulation(self, optimizer_class,
gradient_accumulation_steps):
def testLargeBatchEmulation(self, cls, gradient_accumulation_steps):
# Tests various optimizers with large batch emulation.
# Uses clipping and noise, thus does not test specific values
# of the variables and only tests how often variables are updated.
@@ -450,7 +455,7 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
x = tf.constant([[2.0, 0.0], [0.0, 1.0]], dtype=tf.float32)
loss = lambda: tf.matmul(var0, x, transpose_b=True) + var1
optimizer = optimizer_class(
opt = cls(
l2_norm_clip=100.0,
noise_multiplier=0.0,
gradient_accumulation_steps=gradient_accumulation_steps,
@@ -459,7 +464,7 @@ class DPOptimizerGetGradientsTest(tf.test.TestCase, parameterized.TestCase):
for _ in range(gradient_accumulation_steps):
self.assertAllCloseAccordingToType([[1.0, 2.0]], var0)
self.assertAllCloseAccordingToType([3.0], var1)
optimizer.minimize(loss, [var0, var1])
opt.minimize(loss, [var0, var1])
self.assertNotAllClose([[1.0, 2.0]], var0)
self.assertNotAllClose([3.0], var1)
@@ -496,19 +501,19 @@ class SimpleEmbeddingModel(tf.keras.Model):
return sequence_output, pooled_output
def keras_embedding_model_fn(optimizer_class,
def keras_embedding_model_fn(opt_cls,
l2_norm_clip: float,
noise_multiplier: float,
num_microbatches: int,
learning_rate: float,
use_sequence_output: bool = False,
use_seq_output: bool = False,
unconnected_gradients_to_zero: bool = False):
"""Construct a simple embedding model with a classification layer."""
# Every sample has 4 tokens (sequence length=4).
x = tf.keras.layers.Input(shape=(4,), dtype=tf.float32, name='input')
sequence_output, pooled_output = SimpleEmbeddingModel()(x)
if use_sequence_output:
if use_seq_output:
embedding = sequence_output
else:
embedding = pooled_output
@@ -517,7 +522,7 @@ def keras_embedding_model_fn(optimizer_class,
embedding)
model = tf.keras.Model(inputs=x, outputs=probs, name='model')
optimizer = optimizer_class(
optimizer = opt_cls(
l2_norm_clip=l2_norm_clip,
noise_multiplier=noise_multiplier,
num_microbatches=num_microbatches,
@@ -557,7 +562,7 @@ class DPVectorizedOptimizerUnconnectedNodesTest(tf.test.TestCase,
@parameterized.named_parameters(
('DPSGDVectorized_SeqOutput_UnconnectedGradients',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer),)
def testSeqOutputUnconnectedGradientsAsNoneFails(self, optimizer_class):
def testSeqOutputUnconnectedGradientsAsNoneFails(self, cls):
"""Tests that DP vectorized optimizers with 'None' unconnected gradients fail.
Sequence models that have unconnected gradients (with
@@ -569,16 +574,16 @@ class DPVectorizedOptimizerUnconnectedNodesTest(tf.test.TestCase,
These tests test the various combinations of this flag and the model.
Args:
optimizer_class: The DP optimizer class to test.
cls: The DP optimizer class to test.
"""
embedding_model = keras_embedding_model_fn(
optimizer_class,
cls,
l2_norm_clip=1.0,
noise_multiplier=0.5,
num_microbatches=1,
learning_rate=1.0,
use_sequence_output=True,
use_seq_output=True,
unconnected_gradients_to_zero=False)
train_data = np.random.randint(0, 10, size=(1000, 4), dtype=np.int32)
@@ -600,16 +605,16 @@ class DPVectorizedOptimizerUnconnectedNodesTest(tf.test.TestCase,
@parameterized.named_parameters(
('DPSGDVectorized_PooledOutput_UnconnectedGradients',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer),)
def testPooledOutputUnconnectedGradientsAsNonePasses(self, optimizer_class):
def testPooledOutputUnconnectedGradientsAsNonePasses(self, cls):
"""Tests that DP vectorized optimizers with 'None' unconnected gradients fail."""
embedding_model = keras_embedding_model_fn(
optimizer_class,
cls,
l2_norm_clip=1.0,
noise_multiplier=0.5,
num_microbatches=1,
learning_rate=1.0,
use_sequence_output=False,
use_seq_output=False,
unconnected_gradients_to_zero=False)
train_data = np.random.randint(0, 10, size=(1000, 4), dtype=np.int32)
@@ -633,17 +638,16 @@ class DPVectorizedOptimizerUnconnectedNodesTest(tf.test.TestCase,
('DPSGDVectorized_PooledOutput_UnconnectedGradientsAreZero',
dp_optimizer_keras_vectorized.VectorizedDPKerasSGDOptimizer, False),
)
def testUnconnectedGradientsAsZeroPasses(self, optimizer_class,
use_sequence_output):
def testUnconnectedGradientsAsZeroPasses(self, cls, use_seq_output):
"""Tests that DP vectorized optimizers with 'Zero' unconnected gradients pass."""
embedding_model = keras_embedding_model_fn(
optimizer_class,
cls,
l2_norm_clip=1.0,
noise_multiplier=0.5,
num_microbatches=1,
learning_rate=1.0,
use_sequence_output=use_sequence_output,
use_seq_output=use_seq_output,
unconnected_gradients_to_zero=True)
train_data = np.random.randint(0, 10, size=(1000, 4), dtype=np.int32)
@@ -660,6 +664,5 @@ class DPVectorizedOptimizerUnconnectedNodesTest(tf.test.TestCase,
# other exceptions are errors.
self.fail('ValueError raised by model.fit().')
if __name__ == '__main__':
tf.test.main()