forked from 626_privacy/tensorflow_privacy
Add utility functions for unwrapping BERT encoder layers into individual Keras layers.
PiperOrigin-RevId: 588419989
parent 93376c9d6a
commit fbe5879023
3 changed files with 254 additions and 0 deletions
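For context, a rough usage sketch of the new utility follows. The encoder constructor arguments are illustrative placeholders borrowed from the test file added below; only `get_unwrapped_bert_encoder` is part of this change, and the variable names are assumptions.

import tensorflow_models as tfm

from tensorflow_privacy.privacy.fast_gradient_clipping import bert_encoder_utils

# Illustrative placeholder arguments, mirroring bert_encoder_utils_test.py.
encoder = tfm.nlp.networks.BertEncoder(
    vocab_size=3,
    hidden_size=2,
    num_attention_heads=2,
    num_layers=3,
    max_sequence_length=5,
    inner_dim=6,
    type_vocab_size=4,
)

# Rebuild the encoder so its forward pass is expressed with core Keras
# layers; the new model accepts the same inputs and should produce the
# same outputs as the original encoder.
unwrapped = bert_encoder_utils.get_unwrapped_bert_encoder(encoder)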
tensorflow_privacy/privacy/fast_gradient_clipping/BUILD
@@ -6,6 +6,20 @@ py_library(
    srcs_version = "PY3",
)

py_library(
    name = "bert_encoder_utils",
    srcs = ["bert_encoder_utils.py"],
    srcs_version = "PY3",
    deps = [":gradient_clipping_utils"],
)

py_test(
    name = "bert_encoder_utils_test",
    srcs = ["bert_encoder_utils_test.py"],
    srcs_version = "PY3",
    deps = [":bert_encoder_utils"],
)

py_library(
    name = "common_manip_utils",
    srcs = ["common_manip_utils.py"],
tensorflow_privacy/privacy/fast_gradient_clipping/bert_encoder_utils.py
@@ -0,0 +1,82 @@
# Copyright 2023, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for manipulating official TensorFlow BERT encoders."""

import tensorflow as tf
import tensorflow_models as tfm
from tensorflow_privacy.privacy.fast_gradient_clipping import gradient_clipping_utils


def dedup_bert_encoder(input_bert_encoder: tfm.nlp.networks.BertEncoder):
  """Deduplicates the layer names in a BERT encoder."""

  def _dedup(layer, attr_name, new_name):
    sublayer = getattr(layer, attr_name)
    if sublayer is None:
      return
    else:
      sublayer_config = sublayer.get_config()
      sublayer_config["name"] = new_name
      setattr(layer, attr_name, sublayer.from_config(sublayer_config))

  for layer in input_bert_encoder.layers:
    # NOTE: the ordering of the renames is important for the ordering of the
    # variables in the computed gradients. This is why we use three `for`
    # loops instead of one.
    if isinstance(layer, tfm.nlp.layers.TransformerEncoderBlock):
      # pylint: disable=protected-access
      for attr_name in ["inner_dropout_layer", "attention_dropout"]:
        _dedup(layer, "_" + attr_name, layer.name + "/" + attr_name)
      # Some layers are nested within the main attention layer (if it exists).
      if layer._attention_layer is not None:
        prefix = layer.name + "/" + layer._attention_layer.name
        _dedup(layer, "_attention_layer", prefix + "/attention_layer")
        _dedup(
            layer._attention_layer,
            "_dropout_layer",
            prefix + "/attention_inner_dropout_layer",
        )
      for attr_name in ["attention_layer_norm", "intermediate_dense"]:
        _dedup(layer, "_" + attr_name, layer.name + "/" + attr_name)
      # This is one of the few times that we cannot build from a config, due
      # to the presence of lambda functions.
      if layer._intermediate_activation_layer is not None:
        policy = tf.keras.mixed_precision.global_policy()
        if policy.name == "mixed_bfloat16":
          policy = tf.float32
        layer._intermediate_activation_layer = tf.keras.layers.Activation(
            layer._inner_activation,
            dtype=policy,
            name=layer.name + "/intermediate_activation_layer",
        )
      for attr_name in ["output_dense", "output_dropout", "output_layer_norm"]:
        _dedup(layer, "_" + attr_name, layer.name + "/" + attr_name)
      # pylint: enable=protected-access


def get_unwrapped_bert_encoder(
    input_bert_encoder: tfm.nlp.networks.BertEncoder,
) -> tfm.nlp.networks.BertEncoder:
  """Creates a new BERT encoder whose layers are core Keras layers."""
  dedup_bert_encoder(input_bert_encoder)
  core_test_outputs = (
      gradient_clipping_utils.generate_model_outputs_using_core_keras_layers(
          input_bert_encoder,
          custom_layer_set={tfm.nlp.layers.TransformerEncoderBlock},
      )
  )
  return tf.keras.Model(
      inputs=input_bert_encoder.inputs,
      outputs=core_test_outputs,
  )
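To make the renaming above concrete, here is a minimal sketch of `dedup_bert_encoder` in isolation. The encoder arguments are placeholders taken from the test file, the printed names are only illustrative, and the sketch reads the same protected `_attention_layer` attribute that the utility itself rewrites.

import tensorflow_models as tfm

from tensorflow_privacy.privacy.fast_gradient_clipping import bert_encoder_utils

encoder = tfm.nlp.networks.BertEncoder(
    vocab_size=3, hidden_size=2, num_attention_heads=2, num_layers=3,
    max_sequence_length=5, inner_dim=6, type_vocab_size=4)
bert_encoder_utils.dedup_bert_encoder(encoder)
for layer in encoder.layers:
  if isinstance(layer, tfm.nlp.layers.TransformerEncoderBlock):
    # After deduplication, each nested sublayer name is prefixed with its
    # parent block's name, so no two sublayers in the encoder collide.
    print(layer._attention_layer.name)  # pylint: disable=protected-access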
tensorflow_privacy/privacy/fast_gradient_clipping/bert_encoder_utils_test.py
@@ -0,0 +1,158 @@
# Copyright 2023, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests of `bert_encoder_utils.py`."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_models as tfm
from tensorflow_privacy.privacy.fast_gradient_clipping import bert_encoder_utils


def compute_bert_sample_inputs(
    batch_size, sequence_length, vocab_size, num_types
):
  """Returns a set of BERT encoder inputs."""
  word_id_sample = np.random.randint(
      vocab_size, size=(batch_size, sequence_length)
  )
  mask_sample = np.random.randint(2, size=(batch_size, sequence_length))
  type_id_sample = np.random.randint(
      num_types,
      size=(batch_size, sequence_length),
  )
  return [word_id_sample, mask_sample, type_id_sample]


def get_small_bert_encoder_and_sample_inputs(dict_outputs=False):
  """Returns a small BERT encoder for testing."""
  hidden_size = 2
  vocab_size = 3
  num_types = 4
  max_sequence_length = 5
  inner_dense_units = 6
  output_range = 1
  num_heads = 2
  num_transformer_layers = 3
  seed = 777

  bert_encoder = tfm.nlp.networks.BertEncoder(
      vocab_size=vocab_size,
      hidden_size=hidden_size,
      num_attention_heads=num_heads,
      num_layers=num_transformer_layers,
      max_sequence_length=max_sequence_length,
      inner_dim=inner_dense_units,
      type_vocab_size=num_types,
      output_range=output_range,
      initializer=tf.keras.initializers.GlorotUniform(seed),
      dict_outputs=dict_outputs,
  )

  batch_size = 3
  bert_sample_inputs = compute_bert_sample_inputs(
      batch_size,
      max_sequence_length,
      vocab_size,
      num_types,
  )

  return bert_encoder, bert_sample_inputs


def get_shared_trainable_variables(model1, model2):
  """Returns the shared trainable variables (by name) between models."""
  common_names = {v.name for v in model1.trainable_variables} & {
      v.name for v in model2.trainable_variables
  }
  tvars1 = [v for v in model1.trainable_variables if v.name in common_names]
  tvars2 = [v for v in model2.trainable_variables if v.name in common_names]
  return tvars1, tvars2


def custom_reduced_loss(y_batch, y_pred):
  del y_batch
  # Create a loss multiplier to avoid small gradients.
  large_value_multiplier = 1e10
  sqr_outputs = []
  for t in y_pred:
    reduction_axes = tf.range(1, len(t.shape))
    sqr_outputs.append(tf.reduce_sum(tf.square(t), axis=reduction_axes))
  sqr_tsr = tf.stack(sqr_outputs, axis=1)
  return large_value_multiplier * tf.reduce_sum(sqr_tsr, axis=1)


class BertEncoderUtilsTest(tf.test.TestCase, parameterized.TestCase):

  def test_outputs_are_equal(self):
    true_encoder, sample_inputs = get_small_bert_encoder_and_sample_inputs()
    unwrapped_encoder = bert_encoder_utils.get_unwrapped_bert_encoder(
        true_encoder
    )
    true_outputs = true_encoder(sample_inputs)
    computed_outputs = unwrapped_encoder(sample_inputs)
    self.assertAllClose(true_outputs, computed_outputs)

  def test_shared_trainable_variables_are_equal(self):
    true_encoder, sample_inputs = get_small_bert_encoder_and_sample_inputs()
    unwrapped_encoder = bert_encoder_utils.get_unwrapped_bert_encoder(
        true_encoder
    )
    # Initializes the trainable variable shapes.
    true_encoder(sample_inputs)
    unwrapped_encoder(sample_inputs)
    # The official BERT encoder may initialize trainable variables that are
    # not used in a model forward pass. Hence, they are invisible when we
    # try to unwrap layers using our utility function.
    true_vars, computed_vars = get_shared_trainable_variables(
        true_encoder, unwrapped_encoder
    )
    self.assertAllClose(true_vars, computed_vars)

  def test_shared_gradients_are_equal(self):
    true_encoder, sample_inputs = get_small_bert_encoder_and_sample_inputs()
    unwrapped_encoder = bert_encoder_utils.get_unwrapped_bert_encoder(
        true_encoder
    )
    # The custom loss below ignores its labels argument.
    dummy_labels = None
    with tf.GradientTape(persistent=True) as tape:
      true_outputs = true_encoder(sample_inputs)
      true_sqr_sum = tf.reduce_sum(
          custom_reduced_loss(dummy_labels, true_outputs)
      )
      computed_outputs = unwrapped_encoder(sample_inputs)
      computed_sqr_sum = tf.reduce_sum(
          custom_reduced_loss(dummy_labels, computed_outputs)
      )
    # The official BERT encoder may initialize trainable variables that are
    # not used in a model forward pass. Hence, they are invisible when we
    # try to unwrap layers using our utility function.
    true_vars, computed_vars = get_shared_trainable_variables(
        true_encoder, unwrapped_encoder
    )
    true_grads = tape.gradient(true_sqr_sum, true_vars)
    computed_grads = tape.gradient(computed_sqr_sum, computed_vars)
    self.assertEqual(len(true_grads), len(computed_grads))
    for g1, g2 in zip(true_grads, computed_grads):
      self.assertEqual(type(g1), type(g2))
      if isinstance(g1, tf.IndexedSlices):
        self.assertAllClose(g1.values, g2.values)
        self.assertAllEqual(g1.indices, g2.indices)
      else:
        self.assertAllClose(g1, g2)


if __name__ == '__main__':
  tf.test.main()