Efficient DPSGD with support for microbatched losses.

PiperOrigin-RevId: 513886957
A. Unique TensorFlower 2023-03-03 12:03:15 -08:00
parent cbf34f2b04
commit 8bfafdd74d
6 changed files with 252 additions and 104 deletions


@ -21,7 +21,7 @@ of the approach given in https://arxiv.org/pdf/2009.03106.pdf (see the
`compute_gradient_norms()` function).
"""
from typing import Dict, Iterable, Text, Union
from typing import Dict, Iterable, Optional, Text, Union
import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import gradient_clipping_utils
@ -31,7 +31,9 @@ InputTensor = Union[tf.Tensor, Iterable[tf.Tensor], Dict[Text, tf.Tensor]]
def get_registry_generator_fn(
tape: tf.GradientTape, layer_registry: lr.LayerRegistry
tape: tf.GradientTape,
layer_registry: lr.LayerRegistry,
num_microbatches: Optional[lr.BatchSize] = None,
):
"""Creates the generator function for `compute_gradient_norms()`."""
if layer_registry is None:
@ -50,14 +52,14 @@ def get_registry_generator_fn(
)
registry_fn = layer_registry.lookup(layer_instance)
(layer_vars, layer_outputs, layer_sqr_norm_fn) = registry_fn(
layer_instance, args, tape
layer_instance, args, tape, num_microbatches
)
return layer_outputs, (layer_vars, layer_sqr_norm_fn)
else:
# Non-trainable layer.
return layer_instance(*args, **kwargs), None
return registry_generator_fn
def compute_gradient_norms(
@ -65,6 +67,7 @@ def compute_gradient_norms(
x_batch: InputTensor,
y_batch: tf.Tensor,
layer_registry: lr.LayerRegistry,
num_microbatches: Optional[lr.BatchSize] = None,
):
"""Computes the per-example loss gradient norms for given data.
@ -83,13 +86,21 @@ def compute_gradient_norms(
compute gradient norms quickly. See
`tensorflow_privacy.privacy.fast_gradient_clipping.layer_registry` for
more details.
num_microbatches: An optional number or scalar `tf.Tensor` for the number of
microbatches. If not None, indicates that the loss is grouped into
num_microbatches (in this case, the batch dimension needs to be a multiple
of num_microbatches). When microbatches are used, the loss is always
assumed to be the mean over a microbatch, and the gradient norm is computed
for each microbatch.
Returns:
A 1D `tf.Tensor` whose i-th entry is the norm of the gradient of the i-th
per-example loss function.
"""
tape = tf.GradientTape(persistent=True, watch_accessed_variables=False)
registry_generator_fn = get_registry_generator_fn(tape, layer_registry)
registry_generator_fn = get_registry_generator_fn(
tape, layer_registry, num_microbatches
)
# First loop computes the model outputs, summed loss, and generator outputs.
with tape:
model_outputs, generator_outputs_list = (
@ -102,6 +113,10 @@ def compute_gradient_norms(
loss_config['reduction'] = tf.keras.losses.Reduction.NONE
per_example_loss_fn = input_model.loss.from_config(loss_config)
losses = per_example_loss_fn(y_batch, model_outputs)
if num_microbatches is not None:
losses = tf.reduce_mean(
lr.add_microbatch_axis(losses, num_microbatches), axis=1
)
summed_loss = tf.reduce_sum(losses)
# Unwrap the generator outputs so that the next loop avoids duplicating
# backprop ops.
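As a quick illustration of the microbatched loss reduction above (a minimal sketch with made-up values, not library code): with four per-example losses and `num_microbatches=2`, the losses are reshaped to `[2, 2]` and averaged along axis 1, yielding one loss per microbatch.

```python
import tensorflow as tf

# Hypothetical per-example losses for a batch of 4 examples.
losses = tf.constant([0.1, 0.3, 0.2, 0.6])
num_microbatches = 2

# Group the batch into microbatches and average within each microbatch,
# mirroring the reduce_mean over the added microbatch axis above.
per_microbatch_loss = tf.reduce_mean(
    tf.reshape(losses, [num_microbatches, -1]), axis=1
)
print(per_microbatch_loss.numpy())  # [0.2 0.4]
```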
@ -149,6 +164,7 @@ def compute_pred_and_clipped_gradients(
y_batch: tf.Tensor,
l2_norm_clip: float,
layer_registry: lr.LayerRegistry,
num_microbatches: Optional[lr.BatchSize] = None,
):
"""Computes the per-example predictions and per-example clipped loss gradient.
@ -177,6 +193,10 @@ def compute_pred_and_clipped_gradients(
`output` is the pre-activation tensor, `sqr_grad_norms` is related to the
squared norms of a layer's pre-activation tensor, and `vars` are relevant
trainable weights (see `layer_registry_factories.py` for examples).
num_microbatches: An optional number or scalar `tf.Tensor` for the number of
microbatches. If not None, indicates that the loss is grouped into
num_microbatches (in this case, the batch dimension needs to be a multiple
of num_microbatches).
Returns:
A `tuple` `(y_pred, grad)`. The first element is the prediction generated by
@ -184,11 +204,21 @@ def compute_pred_and_clipped_gradients(
gradient of the loss function.
"""
gradient_norms = compute_gradient_norms(
input_model, x_batch, y_batch, layer_registry
input_model, x_batch, y_batch, layer_registry, num_microbatches
)
loss_weights = compute_clip_weights(l2_norm_clip, gradient_norms)
with tf.GradientTape() as tape:
y_pred = input_model(x_batch, training=True)
if num_microbatches is not None:
y_batch = lr.add_microbatch_axis(y_batch, num_microbatches)
y_pred = lr.add_microbatch_axis(y_pred, num_microbatches)
# Warning: When num_microbatches is not None, we need to be sure that
# `compute_loss` always computes the mean over the microbatches, as this is
# the assumption made when computing the gradient norm. This is indeed the
# case for several Keras loss functions (e.g. mean_squared_error and
# binary_crossentropy), but it is not part of the loss contract, so it may
# not hold, especially for custom losses.
loss_value = input_model.compute_loss(
x_batch, y_batch, y_pred, loss_weights
)


@ -13,7 +13,7 @@
# limitations under the License.
import itertools
from typing import Callable, Any, List, Union
from typing import Any, Callable, List, Optional, Union
from absl.testing import parameterized
import tensorflow as tf
@ -49,14 +49,17 @@ class DoubleDense(tf.keras.layers.Layer):
def double_dense_layer_computation(
layer_instance: tf.keras.layers.Layer, inputs: Any, tape: tf.GradientTape
layer_instance: tf.keras.layers.Layer,
inputs: Any,
tape: tf.GradientTape,
num_microbatches: Optional[int],
):
"""Layer registry function for the custom `DoubleDense` layer class."""
vars1, outputs, sqr_norm_fn1 = layer_registry.dense_layer_computation(
layer_instance.dense1, inputs, tape
layer_instance.dense1, inputs, tape, num_microbatches
)
vars2, outputs, sqr_norm_fn2 = layer_registry.dense_layer_computation(
layer_instance.dense2, (outputs,), tape
layer_instance.dense2, (outputs,), tape, num_microbatches
)
def sqr_norm_fn(base_vars):
@ -68,7 +71,10 @@ def double_dense_layer_computation(
def compute_true_gradient_norms(
input_model: tf.keras.Model, x_batch: tf.Tensor, y_batch: tf.Tensor
input_model: tf.keras.Model,
x_batch: tf.Tensor,
y_batch: tf.Tensor,
num_microbatches: Optional[int],
):
"""Computes the real gradient norms for an input `(model, x, y)`."""
loss_config = input_model.loss.get_config()
@ -77,13 +83,22 @@ def compute_true_gradient_norms(
with tf.GradientTape(persistent=True) as tape:
y_pred = input_model(x_batch)
loss = per_example_loss_fn(y_batch, y_pred)
if num_microbatches is not None:
loss = tf.reduce_mean(
tf.reshape(
loss,
tf.concat([[num_microbatches, -1], tf.shape(loss)[1:]], axis=0),
),
axis=1,
)
if isinstance(loss, tf.RaggedTensor):
loss = loss.to_tensor()
sqr_norms = []
for var in input_model.trainable_variables:
jacobian = tape.jacobian(loss, var, experimental_use_pfor=False)
reduction_axes = tf.range(1, len(jacobian.shape))
sqr_norms.append(tf.reduce_sum(tf.square(jacobian), axis=reduction_axes))
sqr_norm = tf.reduce_sum(tf.square(jacobian), axis=reduction_axes)
sqr_norms.append(sqr_norm)
sqr_norm_tsr = tf.stack(sqr_norms, axis=1)
return tf.sqrt(tf.reduce_sum(sqr_norm_tsr, axis=1))
@ -93,6 +108,7 @@ def get_computed_and_true_norms(
layer_generator: LayerGenerator,
input_dims: Union[int, List[int]],
output_dim: int,
num_microbatches: Optional[int],
is_eager: bool,
x_input: tf.Tensor,
rng_seed: int = 777,
@ -113,6 +129,7 @@ def get_computed_and_true_norms(
`idim` and returns output tensors of dimension `odim`.
input_dims: The input dimension(s) of the test `tf.keras.Model` instance.
output_dim: The output dimension of the test `tf.keras.Model` instance.
num_microbatches: The number of microbatches. None or an integer.
is_eager: A `bool` that is `True` if the model should be run eagerly.
x_input: `tf.Tensor` inputs to be tested.
rng_seed: An `int` used to initialize model weights.
@ -137,10 +154,16 @@ def get_computed_and_true_norms(
y_batch = tf.ones_like(y_pred)
tf.keras.utils.set_random_seed(rng_seed)
computed_norms = clip_grads.compute_gradient_norms(
model, x_input, y_batch, layer_registry=registry
model,
x_input,
y_batch,
layer_registry=registry,
num_microbatches=num_microbatches,
)
tf.keras.utils.set_random_seed(rng_seed)
true_norms = compute_true_gradient_norms(model, x_input, y_batch)
true_norms = compute_true_gradient_norms(
model, x_input, y_batch, num_microbatches
)
return (computed_norms, true_norms)
@ -322,18 +345,30 @@ class ClipGradsDenseLayerTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.product(
model_name=list(get_dense_model_generators().keys()),
layer_name=list(get_dense_layer_generators().keys()),
input_dim=[1, 2],
input_dim=[4],
output_dim=[1, 2],
num_microbatches=[None, 1, 2],
is_eager=[True, False],
)
def test_gradient_norms_on_various_models(
self, model_name, layer_name, input_dim, output_dim, is_eager
self,
model_name,
layer_name,
input_dim,
output_dim,
num_microbatches,
is_eager,
):
model_generator = get_dense_model_generators()[model_name]
layer_generator = get_dense_layer_generators()[layer_name]
x_batches = get_nd_test_batches(input_dim)
default_registry = layer_registry.make_default_layer_registry()
for x_batch in x_batches:
if (
num_microbatches is not None
and x_batch.shape[0] % num_microbatches != 0
):
continue
if model_name == 'tower1':
x_input = [x_batch, x_batch]
else:
@ -343,6 +378,7 @@ class ClipGradsDenseLayerTest(tf.test.TestCase, parameterized.TestCase):
layer_generator,
input_dim,
output_dim,
num_microbatches,
is_eager,
x_input,
registry=default_registry,
@ -362,6 +398,10 @@ class ClipGradsEmbeddingLayerTest(tf.test.TestCase, parameterized.TestCase):
tf.ragged.constant(
[[0], [1], [], [0, 0], [0, 1], [1, 0], [1, 1]], dtype=tf.int32
),
tf.ragged.constant(
[[0], [1], [], [0, 0], [0, 1], [1, 0], [1, 1], [0, 1]],
dtype=tf.int32,
),
# 3D inputs.
tf.convert_to_tensor([[[0, 1]]], dtype_hint=tf.int32),
tf.convert_to_tensor(
@ -371,14 +411,24 @@ class ClipGradsEmbeddingLayerTest(tf.test.TestCase, parameterized.TestCase):
[[[0]], [[1]], [], [[0, 0]], [[0, 1]], [[1, 0]], [[1, 1]]],
dtype=tf.int32,
),
tf.ragged.constant(
[[[0]], [[1]], [], [[0, 0]], [[0, 1]], [[1, 0]], [[1, 1]], [[0]]],
dtype=tf.int32,
),
],
model_name=list(get_embedding_model_generators().keys()),
output_dim=[1, 2],
is_eager=[True, False],
output_dim=[2],
num_microbatches=[None, 1, 2],
is_eager=[True],
)
def test_gradient_norms_on_various_models(
self, x_batch, model_name, output_dim, is_eager
self, x_batch, model_name, output_dim, num_microbatches, is_eager
):
if (
num_microbatches is not None
and x_batch.shape[0] % num_microbatches != 0
):
return
valid_test_input = (
not isinstance(x_batch, tf.RaggedTensor)
and model_name == 'weighted_bow1'
@ -391,6 +441,7 @@ class ClipGradsEmbeddingLayerTest(tf.test.TestCase, parameterized.TestCase):
layer_generator=None,
input_dims=x_batch.shape[1:],
output_dim=output_dim,
num_microbatches=num_microbatches,
is_eager=is_eager,
x_input=x_batch,
registry=default_registry,
@ -403,20 +454,27 @@ class ClipGradsCustomLayerTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.product(
input_dim=[1, 2],
output_dim=[1, 2],
num_microbatches=[None, 1, 2],
is_eager=[True, False],
)
def test_gradient_norms_on_various_models(
self, input_dim, output_dim, is_eager
self, input_dim, output_dim, num_microbatches, is_eager
):
registry = layer_registry.make_default_layer_registry()
registry.insert(DoubleDense, double_dense_layer_computation)
x_batches = get_nd_test_batches(input_dim)
for x_batch in x_batches:
if (
num_microbatches is not None
and x_batch.shape[0] % num_microbatches != 0
):
continue
(computed_norms, true_norms) = get_computed_and_true_norms(
model_generator=make_two_layer_sequential_model,
layer_generator=lambda a, b: DoubleDense(b),
input_dims=input_dim,
output_dim=output_dim,
num_microbatches=num_microbatches,
is_eager=is_eager,
x_input=x_batch,
registry=registry,


@ -157,8 +157,8 @@ def all_trainable_layers_are_registered(
def add_aggregate_noise(
input_model: tf.keras.Model,
x_batch: InputTensor,
clipped_grads: List[tf.Tensor],
clipped_grads: list[tf.Tensor],
batch_size: tf.Tensor,
l2_norm_clip: float,
noise_multiplier: float,
) -> List[tf.Tensor]:
@ -169,8 +169,9 @@ def add_aggregate_noise(
Args:
input_model: The `tf.keras.Model` to obtain the layers from.
x_batch: An `InputTensor` to be fed into the input layer of the model.
clipped_grads: A list of `tf.Tensor`s representing the clipped gradients.
batch_size: The batch size, used for normalizing the noise when the loss
reduction is AUTO or SUM_OVER_BATCH_SIZE.
l2_norm_clip: Clipping norm (max L2 norm of each gradient).
noise_multiplier: Ratio of the standard deviation to the clipping norm.
@ -186,17 +187,7 @@ def add_aggregate_noise(
]:
if input_model.loss.reduction == tf.keras.losses.Reduction.AUTO:
logging.info('Assuming that the loss reduction is `SUM_OVER_BATCH_SIZE`.')
if isinstance(x_batch, tf.Tensor):
scale /= tf.cast(tf.shape(x_batch)[0], tf.float32)
elif isinstance(x_batch, dict):
batch_sizes = [
tf.cast(tf.shape(v)[0], tf.float32) for v in x_batch.values()
]
scale /= tf.math.reduce_min(batch_sizes)
else:
raise NotImplementedError(
'Unknown container/class %s for input' % x_batch.__class__.__name__
)
scale /= tf.cast(batch_size, tf.float32)
def add_noise(g):
return g + tf.random.normal(
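A minimal sketch of the noise scaling in `add_aggregate_noise` after this change, assuming (as in standard DP-SGD) that `scale` starts at `l2_norm_clip * noise_multiplier` before the division shown above; that initialization is outside this hunk, so treat it as an assumption.

```python
import tensorflow as tf

l2_norm_clip, noise_multiplier, batch_size = 1.0, 2.0, 8

# Assumed starting value of `scale` (not shown in this hunk).
scale = l2_norm_clip * noise_multiplier
# Normalization applied when the loss reduction is AUTO / SUM_OVER_BATCH_SIZE.
scale /= tf.cast(batch_size, tf.float32)

def add_noise(g):
  # Add Gaussian noise with standard deviation `scale` to a clipped gradient.
  return g + tf.random.normal(tf.shape(g), stddev=scale)

noisy_grad = add_noise(tf.zeros([3, 3]))  # std of entries is ~0.25
```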


@ -38,9 +38,18 @@ whose i-th entry is the L2 norm of the i-th input vector, then
where `l2_row_norm(y)` computes the L2 norm for each row of an input `y`.
Details of this decomposition can be found in https://arxiv.org/abs/1510.01799
"""
from typing import Any, Callable, Dict, Iterable, Text, Tuple, Type, Union
We also extend the fast gradient norm computation to the case where the
losses are microbatched, i.e., each per-example loss is the mean of a set of
losses. This can be useful for achieving user-level privacy and for improving
the quality of DP models, through better gradient estimation due to
aggregation at the microbatch level.
"""
# copybara.strip_begin
# The detailed algorithm can be found in go/fast-dpsgd-mb.
# copybara.strip_end
from typing import Any, Callable, Dict, Iterable, Optional, Text, Tuple, Type, Union
import tensorflow as tf
@ -56,6 +65,7 @@ RegistryFunction = Callable[
]
InputTensor = Union[tf.Tensor, Iterable[tf.Tensor], Dict[Text, tf.Tensor]]
BatchSize = Union[int, tf.Tensor]
# ==============================================================================
@ -88,6 +98,37 @@ class LayerRegistry:
self._registry[layer_key] = layer_registry_function
# ==============================================================================
# Utilities
# ==============================================================================
def add_microbatch_axis(
x: tf.Tensor,
num_microbatches: Optional[BatchSize],
) -> tf.Tensor:
"""Adds the microbatch axis.
Reshape the input tensor to replace the first(batch) dimension with the
shape [num_microbatches, batch_size / num_microbatches]. The batch size
must be a multiple of num_microbatches (unless it is None, meaning
num_microbatches is the same as the batch size).
Args:
x: the input tensor.
num_microbatches: None or a numeric value or a scalar `tf.Tensor`.
Returns:
The reshaped input tensor.
"""
if num_microbatches is None:
return tf.expand_dims(x, 1)
with tf.control_dependencies(
[tf.assert_equal(tf.math.floormod(tf.shape(x)[0], num_microbatches), 0)]
):
return tf.reshape(
x, tf.concat([[num_microbatches, -1], tf.shape(x)[1:]], axis=0)
)
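A short usage sketch of the helper above, assuming the module is imported as `layer_registry` from `tensorflow_privacy.privacy.fast_gradient_clipping` (values are illustrative):

```python
import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry as lr

x = tf.reshape(tf.range(12, dtype=tf.float32), [4, 3])  # batch of 4 examples

# Two microbatches of two examples each: shape [4, 3] -> [2, 2, 3].
print(lr.add_microbatch_axis(x, 2).shape)

# num_microbatches=None adds a singleton axis, so each example is treated as
# its own microbatch of size one: shape [4, 3] -> [4, 1, 3].
print(lr.add_microbatch_axis(x, None).shape)
```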
# ==============================================================================
# Supported Keras layers
# ==============================================================================
@ -95,6 +136,7 @@ def dense_layer_computation(
layer_instance: tf.keras.layers.Dense,
inputs: Tuple[InputTensor],
tape: tf.GradientTape,
num_microbatches: Optional[tf.Tensor] = None,
) -> RegistryFunctionOutput:
"""Registry function for `tf.keras.layers.Dense`.
@ -111,6 +153,9 @@ def dense_layer_computation(
output.
tape: A `tf.GradientTape` instance that will be used to watch the output
`base_vars`.
num_microbatches: An optional numeric value or scalar `tf.Tensor` for
indicating whether and how the losses are grouped into microbatches. If
not None, num_microbatches must divide the batch size.
Returns:
A `tuple` `(base_vars, outputs, sqr_norm_fn)`. `base_vars` is the
@ -132,21 +177,29 @@ def dense_layer_computation(
tape.watch(base_vars)
layer_instance.activation = orig_activation
outputs = orig_activation(base_vars) if orig_activation else base_vars
def sqr_norm_fn(base_vars_grads):
sqr_inputs = tf.square(*inputs)
inputs_reduction_axes = tf.range(1, tf.rank(sqr_inputs))
input_sqr_norms = tf.reduce_sum(sqr_inputs, axis=inputs_reduction_axes)
def _compute_gramian(x):
if num_microbatches is not None:
x_microbatched = add_microbatch_axis(x, num_microbatches)
return tf.matmul(x_microbatched, x_microbatched, transpose_b=True)
else:
# Special handling for better efficiency when there are no microbatches:
# only the per-example squared norms (the diagonal of the Gramian) are
# needed.
return tf.reduce_sum(tf.square(x), axis=tf.range(1, tf.rank(x)))
inputs_gram = _compute_gramian(*inputs)
base_vars_grads_gram = _compute_gramian(base_vars_grads)
if layer_instance.use_bias:
# Adding a bias term is equivalent to a layer with no bias term and which
# adds an additional variable to the layer input that only takes a
# constant value of 1.0. This is thus equivalent to adding 1.0 to each
# entry of `inputs_gram` (in the non-microbatched case, to the sum of the
# squared values of the inputs).
input_sqr_norms += tf.cast(1.0, dtype=input_sqr_norms.dtype)
reduction_axes = tf.range(1, tf.rank(base_vars_grads))
base_vars_sqr_norms = tf.reduce_sum(
tf.square(base_vars_grads), axis=reduction_axes
inputs_gram += 1.0
return tf.reduce_sum(
inputs_gram * base_vars_grads_gram,
axis=tf.range(1, tf.rank(inputs_gram)),
)
return input_sqr_norms * base_vars_sqr_norms
return base_vars, outputs, sqr_norm_fn
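The Gramian trick above relies on the identity ||sum_i x_i g_i^T||_F^2 = sum_{i,j} <x_i, x_j> <g_i, g_j>, where the x_i are the layer inputs and the g_i the gradients with respect to the pre-activations within one microbatch. A small self-contained check of that identity with made-up values (independent of the library):

```python
import numpy as np

rng = np.random.default_rng(0)
microbatch_size, d_in, d_out = 3, 4, 2
x = rng.normal(size=(microbatch_size, d_in))   # layer inputs in one microbatch
g = rng.normal(size=(microbatch_size, d_out))  # grads w.r.t. pre-activations

# Direct computation: the kernel gradient of the summed microbatch loss is
# sum_i x_i g_i^T; take its squared Frobenius norm.
grad = sum(np.outer(x[i], g[i]) for i in range(microbatch_size))
direct = np.sum(grad ** 2)

# Gramian form used by `sqr_norm_fn`: elementwise product of the two Gramians.
via_gramians = np.sum((x @ x.T) * (g @ g.T))

np.testing.assert_allclose(direct, via_gramians)
```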
@ -155,6 +208,7 @@ def embedding_layer_computation(
layer_instance: tf.keras.layers.Embedding,
inputs: Tuple[InputTensor],
tape: tf.GradientTape,
num_microbatches: Optional[tf.Tensor] = None,
) -> RegistryFunctionOutput:
"""Registry function for `tf.keras.layers.Embedding`.
@ -171,6 +225,9 @@ def embedding_layer_computation(
output.
tape: A `tf.GradientTape` instance that will be used to watch the output
`base_vars`.
num_microbatches: An optional numeric value or scalar `tf.Tensor` for
indicating whether and how the losses are grouped into microbatches. If
not None, num_microbatches must divide the batch size.
Returns:
A `tuple` `(base_vars, outputs, sqr_norm_fn)`. `base_vars` is the
@ -219,6 +276,13 @@ def embedding_layer_computation(
raise NotImplementedError(
"Cannot parse input_ids of type %s" % input_ids.__class__.__name__
)
row_indices = tf.cast(row_indices, tf.int32)
if num_microbatches is not None:
microbatch_size = tf.cast(nrows / num_microbatches, tf.int32)
nrows = num_microbatches
row_indices = tf.cast(
tf.math.floordiv(row_indices, microbatch_size), tf.int32
)
# Sum-reduce the `IndexSlices` that is the result of a `tape.gradient()`
# call. The sum is reduced by the repeated embedding indices and batch
# index. It is adapted from the logic in:
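A small illustration of the row-index remapping above (hypothetical indices; in the real code `row_indices` is parsed from `input_ids` and may contain repeats): with `nrows = 6` and `num_microbatches = 2`, the microbatch size is 3, so rows 0-2 map to microbatch 0 and rows 3-5 to microbatch 1.

```python
import tensorflow as tf

nrows, num_microbatches = 6, 2
row_indices = tf.constant([0, 0, 1, 2, 3, 4, 5, 5], dtype=tf.int32)  # repeats OK

microbatch_size = tf.cast(nrows / num_microbatches, tf.int32)
mapped = tf.math.floordiv(row_indices, microbatch_size)
print(mapped.numpy())  # [0 0 0 0 1 1 1 1]
```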


@ -26,53 +26,64 @@ def make_dp_model_class(cls):
__doc__ = (
"""DP subclass of `{base_model}`.

This can be used as a differentially private replacement for
{base_model}. This class implements DP-SGD using the standard
Gaussian mechanism.

This class also utilizes a faster gradient clipping algorithm if the
following two conditions hold:
(i) the trainable layers of the model are keys in the `dict` input
`layer_registry`,
(ii) the loss `tf.Tensor` for a given batch of examples is either a
scalar or a 2D `tf.Tensor` that has only one column
`(i.e., tf.shape(loss)[1] == 1)` and whose i-th row corresponds to
the loss of the i-th example.

This clipping algorithm specifically computes clipped gradients at the
per-example or per-microbatch (when `num_microbatches` is not None)
level using the layer registry functions in `layer_registry` (see
clip_grads.py for more information about the algorithm).

WARNING: with faster gradient clipping, and when `num_microbatches` is not
None, the per-microbatch loss is assumed to be computed as the mean of the
loss over the microbatch, or, equivalently, by reshaping the loss from the
shape [batch_size, ...] to the shape
[num_microbatches, batch_size/num_microbatches, ...] and computing the
mean of the loss over the microbatches. This requires that the loss
function behaves accordingly. This is true for several common predefined
Keras loss functions (e.g. mean_squared_error, binary_crossentropy) but may
not hold for custom losses (and how such aggregation is done is
unfortunately not exposed by the loss function). It is the caller's
responsibility to make sure that the loss function does behave this way.

When instantiating this class, you need to supply several
DP-related arguments followed by the standard arguments for
`{short_base_model}`.

Examples:

```python
# Create Model instance.
model = {dp_model_class}(l2_norm_clip=1.0, noise_multiplier=0.5, use_xla=True,
<standard arguments>)
```

You should use your {dp_model_class} instance with a standard instance
of `tf.keras.Optimizer` as the optimizer, and a standard reduced loss.
You do not need to use a differentially private optimizer.

```python
# Use a standard (non-DP) optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

# Use a standard reduced loss.
loss = tf.keras.losses.MeanSquaredError()

model.compile(optimizer=optimizer, loss=loss)
model.fit(train_data, train_labels, epochs=1, batch_size=32)
```
"""
).format(
base_model='tf.keras.' + cls.__name__,
short_base_model=cls.__name__,
@ -115,6 +126,7 @@ def make_dp_model_class(cls):
if isinstance(num_microbatches, bool):
raise ValueError('Boolean value supplied for `num_microbatches`. '
'Did you intend it for `use_xla`?')
self._num_microbatches = num_microbatches
# If all the trainable layers are in the input layer registry, we
# don't need to use microbatching and can instead use the "fast"
@ -126,16 +138,8 @@ def make_dp_model_class(cls):
)
and gradient_clipping_utils.has_internal_compute_graph(self)
):
if num_microbatches is not None:
raise ValueError(
'Cannot initialize a model where num_microbatches '
'is not `None` and all trainable layers are '
'registered in layer_registry.'
)
self._num_microbatches = None
self._enable_fast_peg_computation = True
else:
self._num_microbatches = num_microbatches
self._enable_fast_peg_computation = False
if use_xla:
@ -198,10 +202,20 @@ def make_dp_model_class(cls):
# trick, and uses these norms to clip the per-example gradients.
x, y, _ = tf.keras.utils.unpack_x_y_sample_weight(data)
y_pred, clipped_grads = clip_grads.compute_pred_and_clipped_gradients(
self, x, y, self._l2_norm_clip, self._layer_registry
self,
x,
y,
self._l2_norm_clip,
self._layer_registry,
self._num_microbatches,
)
batch_size = self._num_microbatches or tf.shape(y)[0]
grads = gradient_clipping_utils.add_aggregate_noise(
self, x, clipped_grads, self._l2_norm_clip, self._noise_multiplier
self,
clipped_grads,
batch_size,
self._l2_norm_clip,
self._noise_multiplier,
)
else:
logging.info('Computing gradients using microbatching.')
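A usage sketch of the fast path with microbatching, assuming the usual import paths for `dp_keras_model` and `layer_registry` and that all layers are covered by the default registry so the fast clipping path is taken (shapes and hyperparameters are illustrative only):

```python
import numpy as np
import tensorflow as tf
from tensorflow_privacy.privacy.fast_gradient_clipping import layer_registry
from tensorflow_privacy.privacy.keras_models import dp_keras_model

model = dp_keras_model.DPSequential(
    l2_norm_clip=1.0,
    noise_multiplier=0.5,
    num_microbatches=2,  # per-microbatch clipping on the fast path
    layer_registry=layer_registry.make_default_layer_registry(),
    layers=[
        tf.keras.layers.InputLayer(input_shape=(2,)),
        tf.keras.layers.Dense(1),
    ],
)
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1.0),
    loss=tf.keras.losses.MeanSquaredError(),
)

train_data = np.random.normal(size=(8, 2)).astype(np.float32)
train_labels = np.random.normal(size=(8, 1)).astype(np.float32)
# The batch size (4) must be a multiple of num_microbatches (2).
model.fit(train_data, train_labels, epochs=1, batch_size=4)
```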


@ -139,9 +139,7 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
train_labels = np.array([[1.0], [3.0], [-2.0], [-4.0]])
learning_rate = 1.0
for test_reg, test_nm in zip(
get_layer_registries(), [num_microbatches, None]
):
for test_reg in get_layer_registries():
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
loss = tf.keras.losses.MeanSquaredError()
@ -149,7 +147,7 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
model = dp_keras_model.DPSequential(
l2_norm_clip=l2_norm_clip,
noise_multiplier=0.0,
num_microbatches=test_nm,
num_microbatches=num_microbatches,
layer_registry=test_reg,
layers=[
tf.keras.layers.InputLayer(input_shape=(2,)),
@ -173,10 +171,11 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
train_data, train_labels, w, l2_norm_clip, effective_num_microbatches
)
expected_weights = np.squeeze(-learning_rate * expected_grads)
self.assertAllClose(model_weights, expected_weights)
@parameterized.named_parameters(
('noise_multiplier 3 2 None', 3.0, 2.0, None),
('noise_multiplier 5 4 None', 5.0, 4.0, None),
('noise_multiplier 3 2 1', 3.0, 2.0, 1),
('noise_multiplier 5 4 1', 5.0, 4.0, 1),
('noise_multiplier 3 2 2', 3.0, 2.0, 2),
@ -198,9 +197,7 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
learning_rate = 1.0
for test_reg, test_nm in zip(
get_layer_registries(), [num_microbatches, None]
):
for test_reg in get_layer_registries():
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
loss = tf.keras.losses.MeanSquaredError()
@ -208,7 +205,7 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
model = dp_keras_model.DPSequential(
l2_norm_clip=l2_norm_clip,
noise_multiplier=noise_multiplier,
num_microbatches=test_nm,
num_microbatches=num_microbatches,
layer_registry=test_reg,
layers=[
tf.keras.layers.InputLayer(input_shape=(1000,)),
@ -220,11 +217,7 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
model.compile(optimizer=optimizer, loss=loss)
model.fit(train_data, train_labels, epochs=1, batch_size=4)
effective_num_microbatches = (
train_data.shape[0]
if model._num_microbatches is None
else num_microbatches
)
effective_num_microbatches = num_microbatches or train_data.shape[0]
model_weights = model.get_weights()
measured_std = np.std(model_weights[0])
@ -248,16 +241,14 @@ class DPKerasModelTest(tf.test.TestCase, parameterized.TestCase):
train_labels = np.array([[0], [1], [1], [0]])
learning_rate = 1.0
for test_reg, test_nm in zip(
get_layer_registries(), [num_microbatches, None]
):
for test_reg in get_layer_registries():
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model = dp_keras_model.DPSequential(
l2_norm_clip=1.0e9,
noise_multiplier=0.0,
num_microbatches=test_nm,
num_microbatches=num_microbatches,
layer_registry=test_reg,
layers=[
tf.keras.layers.InputLayer(input_shape=(2,)),