# Copyright 2019, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vectorized differentially private optimizers for TensorFlow."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from distutils.version import LooseVersion

import tensorflow as tf

if LooseVersion(tf.__version__) < LooseVersion('2.0.0'):
  nest = tf.contrib.framework.nest
  AdagradOptimizer = tf.train.AdagradOptimizer
  AdamOptimizer = tf.train.AdamOptimizer
  GradientDescentOptimizer = tf.train.GradientDescentOptimizer
  parent_code = tf.train.Optimizer.compute_gradients.__code__
  GATE_OP = tf.train.Optimizer.GATE_OP  # pylint: disable=invalid-name
else:
  nest = tf.nest
  AdagradOptimizer = tf.optimizers.Adagrad
  AdamOptimizer = tf.optimizers.Adam
  GradientDescentOptimizer = tf.optimizers.SGD  # pylint: disable=invalid-name
  parent_code = tf.optimizers.Optimizer._compute_gradients.__code__  # pylint: disable=protected-access
  GATE_OP = None  # pylint: disable=invalid-name


def make_vectorized_optimizer_class(cls):
  """Constructs a vectorized DP optimizer class from an existing one."""
  if LooseVersion(tf.__version__) < LooseVersion('2.0.0'):
    child_code = cls.compute_gradients.__code__
  else:
    child_code = cls._compute_gradients.__code__  # pylint: disable=protected-access
  if child_code is not parent_code:
    # tf.compat.v1.logging is used so the warning works under both TF 1.x
    # and TF 2.x (tf.logging was removed in TF 2.x).
    tf.compat.v1.logging.warning(
        'WARNING: Calling make_vectorized_optimizer_class() on class %s that '
        'overrides method compute_gradients(). Check to ensure that '
        'make_vectorized_optimizer_class() does not interfere with the '
        'overridden version.', cls.__name__)

  class DPOptimizerClass(cls):
    """Differentially private subclass of the given class cls."""

    def __init__(
        self,
        l2_norm_clip,
        noise_multiplier,
        num_microbatches=None,
        *args,  # pylint: disable=keyword-arg-before-vararg, g-doc-args
        **kwargs):
      """Initializes the DPOptimizerClass.

      Args:
        l2_norm_clip: Clipping norm (max L2 norm of each microbatch gradient).
        noise_multiplier: Ratio of the noise standard deviation to the
          clipping norm.
        num_microbatches: Number of microbatches into which the minibatch is
          split. If None, defaults to the size of the minibatch, in which case
          per-example gradients are computed.
      """
      super(DPOptimizerClass, self).__init__(*args, **kwargs)
      self._l2_norm_clip = l2_norm_clip
      self._noise_multiplier = noise_multiplier
      self._num_microbatches = num_microbatches

    def compute_gradients(self,
                          loss,
                          var_list,
                          gate_gradients=GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          grad_loss=None,
                          gradient_tape=None):
      if callable(loss):
        # TF is running in Eager mode.
        raise NotImplementedError('Vectorized optimizer unavailable for TF2.')
      else:
        # TF is running in graph mode; check that no gradient tape was passed.
        if gradient_tape:
          raise ValueError('When in graph mode, a tape should not be passed.')

        batch_size = tf.shape(loss)[0]
        if self._num_microbatches is None:
          self._num_microbatches = batch_size

        # Note: it would be closer to the correct i.i.d. sampling of records
        # if we sampled each microbatch from the appropriate binomial
        # distribution, although that still wouldn't be quite correct because
        # it would be sampling from the dataset without replacement.
        microbatch_losses = tf.reshape(loss, [self._num_microbatches, -1])

        if var_list is None:
          var_list = (
              tf.trainable_variables() + tf.get_collection(
                  tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

        def process_microbatch(microbatch_loss):
          """Computes clipped gradients for one microbatch."""
          microbatch_loss = tf.reduce_mean(microbatch_loss)
          grads, _ = zip(*super(DPOptimizerClass, self).compute_gradients(
              microbatch_loss, var_list, gate_gradients, aggregation_method,
              colocate_gradients_with_ops, grad_loss))
          grads_list = [
              g if g is not None else tf.zeros_like(v)
              for (g, v) in zip(list(grads), var_list)
          ]
          # Clip gradients to have L2 norm of at most l2_norm_clip.
          # Here, we use TF primitives rather than the built-in
          # tf.clip_by_global_norm() so that operations can be vectorized
          # across microbatches.
          grads_flat = nest.flatten(grads_list)
          squared_l2_norms = [tf.reduce_sum(tf.square(g)) for g in grads_flat]
          global_norm = tf.sqrt(tf.add_n(squared_l2_norms))
          div = tf.maximum(global_norm / self._l2_norm_clip, 1.)
          clipped_flat = [g / div for g in grads_flat]
          clipped_grads = nest.pack_sequence_as(grads_list, clipped_flat)
          return clipped_grads

        clipped_grads = tf.vectorized_map(process_microbatch,
                                          microbatch_losses)

        def reduce_noise_normalize_batch(stacked_grads):
          """Sums clipped microbatch gradients, adds noise, and normalizes."""
          summed_grads = tf.reduce_sum(stacked_grads, axis=0)
          noise_stddev = self._l2_norm_clip * self._noise_multiplier
          noise = tf.random.normal(
              tf.shape(summed_grads), stddev=noise_stddev)
          noised_grads = summed_grads + noise
          return noised_grads / tf.cast(self._num_microbatches, tf.float32)

        final_grads = nest.map_structure(reduce_noise_normalize_batch,
                                         clipped_grads)

        return list(zip(final_grads, var_list))

  return DPOptimizerClass


VectorizedDPAdagrad = make_vectorized_optimizer_class(AdagradOptimizer)
VectorizedDPAdam = make_vectorized_optimizer_class(AdamOptimizer)
VectorizedDPSGD = make_vectorized_optimizer_class(GradientDescentOptimizer)
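

# A minimal usage sketch (illustrative only, not part of this module):
# training with VectorizedDPSGD under TF 1.x-style graph execution. The loss
# handed to minimize() must be a *vector* of per-example losses (reduction
# disabled) so that compute_gradients() can reshape it into microbatches. The
# names `features` and `labels` and the model shapes below are hypothetical
# assumptions for the example.
#
#   import tensorflow as tf
#
#   features = tf.compat.v1.placeholder(tf.float32, [None, 10])
#   labels = tf.compat.v1.placeholder(tf.int32, [None])
#   logits = tf.compat.v1.layers.dense(features, 2)
#
#   # Per-example losses with shape [batch_size]; do NOT reduce to a scalar.
#   vector_loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
#       labels=labels, logits=logits,
#       reduction=tf.compat.v1.losses.Reduction.NONE)
#
#   optimizer = VectorizedDPSGD(
#       l2_norm_clip=1.0,      # max L2 norm of each microbatch gradient
#       noise_multiplier=1.1,  # noise stddev = 1.1 * l2_norm_clip
#       num_microbatches=16,   # the batch size must be divisible by this
#       learning_rate=0.1)
#   train_op = optimizer.minimize(loss=vector_loss)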