Clip (per-example) and aggregate gradients.

PiperOrigin-RevId: 480761907
A. Unique TensorFlower 2022-10-12 17:42:56 -07:00
parent 71837fbeec
commit c25cb4a41b
3 changed files with 480 additions and 0 deletions


@@ -9,6 +9,18 @@ py_library(
srcs = ["__init__.py"],
)
py_library(
name = "clip_and_aggregate_gradients",
srcs = [
"clip_and_aggregate_gradients.py",
],
srcs_version = "PY3",
deps = [
"//third_party/py/six",
"//third_party/tensorflow/python/ops/parallel_for:control_flow_ops",
],
)
py_library(
name = "dp_optimizer",
srcs = [
@@ -63,6 +75,14 @@ py_library(
deps = ["//tensorflow_privacy/privacy/dp_query:gaussian_query"],
)
py_test(
name = "clip_and_aggregate_gradients_test",
srcs = ["clip_and_aggregate_gradients_test.py"],
python_version = "PY3",
srcs_version = "PY3",
deps = [":clip_and_aggregate_gradients"],
)
py_test(
name = "dp_optimizer_test",
timeout = "long",


@@ -0,0 +1,247 @@
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Per example gradients clipping and aggregation for sparse gradients.
Modified from tape.jacobian to support sparse gradients.
"""
import sys
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
import six
import tensorflow as tf
from tensorflow.python.ops.parallel_for import control_flow_ops # pylint: disable=g-direct-tensorflow-import
GradientTensor = Union[tf.Tensor, tf.IndexedSlices]
T = TypeVar('T')
Nested = Union[T, Tuple[Any, ...], List[Any], Dict[str, Any]]
def _deduplicate_batch_indexed_slices(
batched_values: tf.Tensor,
indices: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
"""Removes duplication of indexed slices by summing them."""
perm = tf.concat([
tf.constant([1, 0], dtype=tf.int32),
tf.range(tf.rank(batched_values))[2:]
],
axis=0)
batched_values = tf.transpose(batched_values, perm=perm)
unique_indices, new_pos = tf.unique(indices)
summed_values = tf.math.unsorted_segment_sum(batched_values, new_pos,
tf.shape(unique_indices)[0])
return tf.transpose(summed_values, perm=perm), unique_indices
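# Illustrative example of the deduplication above: for a batch of one example,
#   values = tf.constant([[[1., 1.], [2., 2.], [3., 3.]]])  # [batch, n_indices, dim]
#   indices = tf.constant([1, 1, 2])
#   _deduplicate_batch_indexed_slices(values, indices)
# returns values [[[3., 3.], [3., 3.]]] and unique indices [1, 2]: the two rows
# gathered for index 1 are summed into a single row.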
def _batch_global_norm(vals: List[tf.Tensor]) -> tf.Tensor:
"""Computes the global norm for each row in the batch."""
def _norm_squared(v):
return tf.cast(
tf.reduce_sum(
tf.reshape(tf.square(v), tf.stack([tf.shape(v)[0], -1])), axis=1),
tf.float32)
return tf.sqrt(tf.add_n([_norm_squared(v) for v in vals if v is not None]))
def _batch_clip_by_global_norm(
vals: List[tf.Tensor], normalize: bool,
l2_norm_clip: Optional[float]) -> List[tf.Tensor]:
"""Batch clips by global norm with normalize option."""
batch_global_norm = _batch_global_norm(vals)
if l2_norm_clip is None:
l2_norm_clip = 1.0
clip_ratio = l2_norm_clip / tf.maximum(batch_global_norm, 1e-8)
if not normalize:
clip_ratio = tf.minimum(1.0, clip_ratio)
def _expand_dims(e, v):
new_shape = tf.concat(
[tf.shape(v)[0:1],
tf.ones_like(tf.shape(v), dtype=tf.int32)[:-1]],
axis=0)
return tf.reshape(e, new_shape)
return [
v *
_expand_dims(tf.cast(clip_ratio, v.dtype), v) if v is not None else None
for v in vals
]
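# Illustrative example of the clipping above: with
#   vals = [tf.constant([[3., 4.], [0.3, 0.4]])]  # per-example norms 5.0 and 0.5
#   _batch_clip_by_global_norm(vals, normalize=False, l2_norm_clip=1.0)
# the first example is scaled by 0.2 (its norm 5.0 exceeds the clip norm 1.0)
# while the second is left unchanged, so the single returned tensor is
# [[0.6, 0.8], [0.3, 0.4]]. With normalize=True both rows would instead be
# rescaled to norm 1.0.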
def clip_and_aggregate_gradients(
tape: tf.GradientTape,
target: tf.Tensor,
sources: Nested[tf.Tensor],
unconnected_gradients: tf.UnconnectedGradients = tf.UnconnectedGradients
.NONE,
normalize: bool = False,
l2_norm_clip: Optional[float] = None,
aggregate_method: str = 'mean',
keep_sparse_threshold: int = 10000) -> Nested[GradientTensor]:
"""Clips (per-example) and aggregates gradients.
This procedure computes the Jacobian with respect to a vectorized loss,
i.e. the `target` argument, clips the gradient with respect to each
individual output, and aggregates the clipped gradients. This yields correct
per-example gradients when there is a one-to-one mapping from input examples
to output losses.
Args:
tape: a persistent tape.
target: Tensor to be differentiated. It is assumed that each value in
`target` is associated with an example, so gradient clipping is
applied to the vectorized target.
sources: a list or nested structure of Tensors or Variables. `target` will
be differentiated against elements in `sources`.
unconnected_gradients: a value which can either hold 'none' or 'zero' and
alters the value which will be returned if the `target` and `sources` are
unconnected. The possible values and effects are detailed in
'UnconnectedGradients' and it defaults to 'none'.
normalize: whether to normalize each gradient.
l2_norm_clip: when `normalize` is `True`, every gradient is scaled to have
norm `l2_norm_clip` (`None` is understood as 1). When `normalize` is
`False`, regular clipping is performed, i.e. a gradient is scaled down to
norm `l2_norm_clip` only if its L2 norm is larger than `l2_norm_clip`;
when `l2_norm_clip` is `None`, nothing is done.
aggregate_method: the method for aggregating the gradients. Currently only
supports `sum` and `mean`; defaults to `mean`.
keep_sparse_threshold: when the gradient is a `tf.IndexedSlices`,
`keep_sparse_threshold` determines whether to keep it in its sparse
representation (when the number of embedding items, i.e. the vocabulary
size, is >= `keep_sparse_threshold`) or convert it into a dense tensor
(when it is < `keep_sparse_threshold`). The reason for this parameter is
that the current implementation of embedding lookup merges all the indices
in a batch, so each per-example sparse gradient has as many rows as there
are indices in the whole batch. When the number of indices exceeds the
vocabulary size, it is more efficient to convert the sparse representation
to a dense tensor, so this threshold should be set around the number of
indices in a typical batch. When it is -1, the sparse tensor is always
converted to a dense tensor.
Returns:
Gradients stored in the same structure as `sources`, with a one-to-one
mapping to the variables in `sources`. Each gradient may be a dense
tensor or a `tf.IndexedSlices`.
Raises:
RuntimeError: if `tape` is not persistent.
ValueError: if aggregate_method is not 'mean' or 'sum'.
"""
if tape._tape is None: # pylint: disable=protected-access
raise RuntimeError('A non-persistent GradientTape can only be used to '
'compute one set of gradients (or jacobians)')
if aggregate_method not in ['mean', 'sum']:
raise ValueError('Only mean and sum methods are supported. But got '
f'{aggregate_method}')
flat_sources = tf.nest.flatten(sources)
# Note that we push and pop the tape here and below. This is needed since we
# need gradients through the enclosed operations.
with tape._ensure_recording(): # pylint: disable=protected-access
target = tf.reshape(target, [-1])
target_shape = target.shape
convert_to_dense_indicator = [True for _ in flat_sources]
if keep_sparse_threshold >= 0:
convert_to_dense_indicator = [
s.shape[0] < keep_sparse_threshold for s in flat_sources
]
def _unpack_indexed_slices(x, convert_to_dense):
"""Optionally unpacks `tf.IndexedSlices` to dict of three dense tensors."""
if convert_to_dense or not isinstance(x, tf.IndexedSlices):
# If x is kept as a tf.IndexedSlices, it will be converted to a dense
# tensor in pfor.
return x
return {
'indices': x.indices,
'values': x.values,
'dense_shape': x.dense_shape
}
def loop_fn(i):
with tape._ensure_recording(): # pylint: disable=protected-access
y = tf.gather(target, i)
g = tape.gradient(
y, flat_sources, unconnected_gradients=unconnected_gradients)
g = tf.nest.map_structure(_unpack_indexed_slices, g,
convert_to_dense_indicator)
return g
try:
target_size = int(target.shape[0])
except TypeError:
# When the shape is unavailable, fall back to the tensor op.
target_size = tf.shape(target)[0]
try:
output = control_flow_ops.pfor(loop_fn, target_size)
except ValueError as err:
six.reraise(
ValueError,
ValueError(
str(err) + '\nEncountered an exception while vectorizing the '
'jacobian computation. Consider using a non-vectorized version, '
'i.e. by computing the gradient for each output sequentially.'),
sys.exc_info()[2])
grads = []
for i, out in enumerate(output):
if out is not None:
# Determines if the output is an unpacked tf.IndexedSlices. Since `sources`
# has been flattened, this is the case only when the output is a dictionary
# (of three dense tensors).
if not isinstance(out, dict):
if tf.executing_eagerly():
out.set_shape(target_shape.concatenate(flat_sources[i].shape))
grads.append((out, None, None))
else:
# Remove duplicates at per-example level. This is for both correctness
# (when the same index gets gathered more than once in the same example)
# and efficiency (for the subsequent clipping). All the examples in
# the batch should have the same indices so it suffices to take the
# first row.
values, indices = _deduplicate_batch_indexed_slices(
out['values'], out['indices'][0])
# The `dense_shape` of all the examples is the same, so we take the
# first row.
grads.append((values, indices, out['dense_shape'][0]))
else:
grads.append((None, None, None))
if normalize or l2_norm_clip is not None:
values, indices, dense_shape = zip(*grads)
values = _batch_clip_by_global_norm(values, normalize, l2_norm_clip)
grads = zip(values, indices, dense_shape)
new_output = []
for values, indices, dense_shape in grads:
if values is None:
new_output.append(None)
continue
if aggregate_method == 'sum':
values = tf.reduce_sum(values, axis=0)
else:
values = tf.reduce_mean(values, axis=0)
if indices is None:
new_output.append(values)
else:
new_output.append(
tf.IndexedSlices(
values=values, indices=indices, dense_shape=dense_shape))
return tf.nest.pack_sequence_as(sources, new_output)
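# A minimal usage sketch (hypothetical; `model`, `optimizer`, `features` and
# `labels` are placeholder names, not part of this module). The tape must be
# persistent since a gradient is taken once per example:
#
#   with tf.GradientTape(persistent=True) as tape:
#     per_example_loss = tf.keras.losses.mean_squared_error(
#         labels, model(features))  # shape [batch_size]
#   grads = clip_and_aggregate_gradients(
#       tape, per_example_loss, model.trainable_variables,
#       l2_norm_clip=1.0, aggregate_method='mean')
#   optimizer.apply_gradients(zip(grads, model.trainable_variables))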


@@ -0,0 +1,213 @@
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the correctness and sparseness of clip_and_aggregate_gradients."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow_privacy.privacy.optimizers import clip_and_aggregate_gradients as cag
class ClipAndAggregateGradientsTest(tf.test.TestCase, parameterized.TestCase):
"""Tests clip_and_aggreate_gradients."""
def _get_loss_and_vars_fn(self, n, keepdims=False):
"""Returns the function for creating the loss and variables."""
# The "model" here consists of both sparse and dense parameters to make sure
# `clip_and_aggregate_gradients` computes the gradients in the correct way
# and in the right format. The sparse layer is the embedding layer `emb0`,
# from which multiple embeddings are gathered, with indices stored
# in `ind0`. The dense parameter is the variable `var1`, which is used
# directly. The loss is the quadratic loss between the model output and the
# data stored in `data0` and `data1`. We also add a dummy variable
# `dummy_var` which does not participate in the loss computation to test
# the `unconnected` argument.
emb0 = tf.keras.layers.Embedding(
4,
2,
embeddings_initializer=tf.keras.initializers.Constant(
np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])))
ind0 = tf.constant([1, 1, 2, 3, 2])
data0 = tf.constant([[3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [-1.0, 0.0],
[-2.0, -1.0], [-3.0, -2.0]])
var1 = tf.Variable([[1.0], [1.0], [2.0], [2.0], [3.0], [3.0]])
data1 = tf.constant([[-1.0], [-2.0], [-2.0], [-3.0], [-3.0], [-4.0]])
dummy_var = tf.Variable(np.array([[1.0]]).astype(np.float64))
def _loss(val0, val1):
return 0.5 * tf.reduce_sum(
input_tensor=tf.math.squared_difference(val0, val1), axis=1)
def _loss_and_vars_fn():
# We concatenate the embeddings with some constant values to make sure
# backprop only goes through the gathered indices.
val0 = tf.concat([emb0(ind0), tf.constant([[0.0, 0.0]])], axis=0)
loss = tf.reduce_sum(
tf.reshape(_loss(data0, val0) + _loss(data1, var1), [n, -1]),
keepdims=keepdims,
axis=1)
return loss, (emb0.embeddings, var1, dummy_var)
return _loss_and_vars_fn
def _get_true_grads(self,
n,
normalize=False,
l2_norm_clip=None,
agg_method='mean',
unconnected='none'):
# The per-example gradients (or jacobians) below are computed manually.
# With the (half) quadratic loss, each gradient is the difference between
# the variable value and the data value.
grad0 = np.array([[[0., 0.], [-2., -3.], [0., 0.], [0., 0.]],
[[0., 0.], [-4., -5.], [0., 0.], [0., 0.]],
[[0., 0.], [0., 0.], [-5., -6.], [0., 0.]],
[[0., 0.], [0., 0.], [0., 0.], [4., 3.]],
[[0., 0.], [0., 0.], [4., 3.], [0., 0.]],
[[0., 0.], [0., 0.], [0., 0.], [0., 0.]]],
dtype=np.float32)
grad1 = np.array([[[2.], [0.], [0.], [0.], [0.], [0.]],
[[0.], [3.], [0.], [0.], [0.], [0.]],
[[0.], [0.], [4.], [0.], [0.], [0.]],
[[0.], [0.], [0.], [5.], [0.], [0.]],
[[0.], [0.], [0.], [0.], [6.], [0.]],
[[0.], [0.], [0.], [0.], [0.], [7.]]],
dtype=np.float32)
grad2 = np.array([[[0.]], [[0.]], [[0.]], [[0.]], [[0.]], [[0.]]],
dtype=np.float64)
grads = [
np.sum(np.reshape(g, (n, -1, g.shape[1], g.shape[2])), axis=1)
for g in [grad0, grad1, grad2]
]
if normalize or l2_norm_clip is not None:
if l2_norm_clip is None:
l2_norm_clip = 1.0
global_norm = np.sqrt(
np.sum([
np.sum(np.square(np.reshape(g, (n, -1))), axis=1) for g in grads
],
axis=0))
clip_ratio = l2_norm_clip / np.maximum(global_norm, 1e-8)
if not normalize:
clip_ratio = np.minimum(1.0, clip_ratio)
r = np.reshape(clip_ratio, [n, 1, 1])
grads = [g * r for g in grads]
if agg_method == 'sum':
grads = [np.sum(g, axis=0) for g in grads]
else:
grads = [np.mean(g, axis=0) for g in grads]
if unconnected == 'none':
grads[2] = None
return grads
def _to_dense_array(self, g):
if g is None:
return None
return np.array(tf.convert_to_tensor(g))
@parameterized.parameters(
(6, False, None, 'mean', -1, 'none'),
(6, True, None, 'sum', 1, 'none'),
(2, False, None, 'sum', 3, 'none'),
(2, True, 100.0, 'mean', 1, 'zero'),
(3, False, 1.0, 'sum', 2, 'zero'),
(1, True, 0.5, 'mean', 3, 'none'),
)
def testCorrect(self, n, normalize, l2_norm_clip, agg_method,
keep_sparse_threshold, unconnected):
"""Tests the correctness of the computation."""
loss_and_vars_fn = self._get_loss_and_vars_fn(n)
true_grads = self._get_true_grads(n, normalize, l2_norm_clip, agg_method,
unconnected)
with tf.GradientTape() as tape:
loss, test_vars = loss_and_vars_fn()
results = cag.clip_and_aggregate_gradients(
tape,
loss,
test_vars,
normalize=normalize,
l2_norm_clip=l2_norm_clip,
aggregate_method=agg_method,
unconnected_gradients=unconnected,
keep_sparse_threshold=keep_sparse_threshold)
for r, t in zip(results, true_grads):
if t is None:
self.assertIsNone(r)
else:
r = self._to_dense_array(r)
self.assertAllCloseAccordingToType(r, t)
@parameterized.parameters(
(6, True),
(6, False),
(1, True),
(1, False),
)
def testTargetShape(self, n, keepdims):
"""Tests target gets vectorized regardless of their original shape."""
loss_and_vars_fn = self._get_loss_and_vars_fn(n, keepdims)
true_grads = self._get_true_grads(n)
with tf.GradientTape() as tape:
loss, test_vars = loss_and_vars_fn()
results = cag.clip_and_aggregate_gradients(tape, loss, test_vars)
for r, t in zip(results, true_grads):
if t is None:
self.assertIsNone(r)
else:
r = self._to_dense_array(r)
self.assertAllCloseAccordingToType(r, t)
@parameterized.parameters(
(-1),
(0),
(4),
(5),
)
def testSparse(self, keep_sparse_threshold):
"""Tests the outcome is in the desired (dense or sparse) tensor form."""
loss_and_vars_fn = self._get_loss_and_vars_fn(3)
with tf.GradientTape() as tape:
loss, test_vars = loss_and_vars_fn()
results = cag.clip_and_aggregate_gradients(
tape,
loss,
test_vars,
normalize=False,
l2_norm_clip=1.0,
aggregate_method='mean',
unconnected_gradients='zero',
keep_sparse_threshold=keep_sparse_threshold)
grads0, grads1, grads2 = results
# emb0 has 4 items so grads0 should be in the sparse, i.e.
# `tf.IndexedSlices`, form iff `keep_sparse_threshold` is in [0, 4].
if keep_sparse_threshold >= 0 and keep_sparse_threshold <= 4:
self.assertIsInstance(grads0, tf.IndexedSlices)
self.assertLen(grads0.indices, 3)
else:
self.assertIsInstance(grads0, tf.Tensor)
# grads1 and grads2 should always be in the dense, i.e. `tf.Tensor`, form.
self.assertIsInstance(grads1, tf.Tensor)
self.assertIsInstance(grads2, tf.Tensor)
if __name__ == '__main__':
tf.test.main()