From 9f4feade7dd3b38b7bc6e4a54e1dc97e7cf29e1f Mon Sep 17 00:00:00 2001
From: Shuang Song
Date: Mon, 22 Aug 2022 16:16:12 -0700
Subject: [PATCH] Add more documentation for `gradient_accumulation_steps` in
 keras optimizer.

PiperOrigin-RevId: 469310667
---
 .../privacy/optimizers/dp_optimizer_keras.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tensorflow_privacy/privacy/optimizers/dp_optimizer_keras.py b/tensorflow_privacy/privacy/optimizers/dp_optimizer_keras.py
index 6d53a76..51aeac4 100644
--- a/tensorflow_privacy/privacy/optimizers/dp_optimizer_keras.py
+++ b/tensorflow_privacy/privacy/optimizers/dp_optimizer_keras.py
@@ -105,11 +105,23 @@ def make_keras_optimizer_class(cls):
     opt.minimize(loss, var_list=[var])
     ```
 
-    Note that when using this feature effective batch size is
-    `gradient_accumulation_steps * one_step_batch_size` where
-    `one_step_batch_size` size of the batch which is passed to single step
-    of the optimizer. Thus user may have to adjust learning rate, weight decay
-    and possibly other training hyperparameters accordingly.
+    Note that when using this feature:
+    1. The effective batch size is `gradient_accumulation_steps * one_step_batch_size`,
+    where `one_step_batch_size` is the size of the batch passed to a single
+    step of the optimizer. Thus the user may have to adjust the learning rate,
+    weight decay, and possibly other training hyperparameters accordingly.
+    2. The effective noise (the noise used for the privacy computation) is
+    `noise_multiplier * sqrt(gradient_accumulation_steps)`, as the optimizer
+    adds noise scaled by `self._noise_multiplier` at every step. Thus the user
+    may have to adjust the `noise_multiplier` or the privacy computation.
+    Additionally, the user may need to adjust the batch size in the data
+    generator, or the number of calls to the data generator, depending on the
+    training framework used. For example, when using Keras `model.fit(...)`
+    with a user-defined data generator, one may need to make the generator
+    return `one_step_batch_size` examples per call, and scale `steps_per_epoch`
+    by `gradient_accumulation_steps`. This is because the data generator is
+    called `steps_per_epoch` times per epoch, and each call now returns only
+    `one_step_batch_size` (instead of `effective_batch_size`) examples.
     """.format(
         base_class='tf.keras.optimizers.' + cls.__name__,
         short_base_class=cls.__name__,
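
For illustration only (not part of the patch above), here is a minimal Python sketch of the bookkeeping the new docstring describes: computing the effective batch size and effective noise, and scaling `steps_per_epoch` when the data generator yields `one_step_batch_size` examples per call. The model, dataset size, and hyperparameter values are assumptions; `DPKerasSGDOptimizer` and its `gradient_accumulation_steps` argument come from the `dp_optimizer_keras.py` file being patched.

```python
# Rough sketch of the hyperparameter bookkeeping described in the docstring.
# Model, dataset size, and hyperparameter values are illustrative assumptions.
import tensorflow as tf
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import (
    DPKerasSGDOptimizer)

one_step_batch_size = 32        # batch size passed to each optimizer step
gradient_accumulation_steps = 4
noise_multiplier = 1.1

# Effective batch size; learning rate, weight decay, etc. should be tuned
# against this value rather than one_step_batch_size.
effective_batch_size = gradient_accumulation_steps * one_step_batch_size

# Effective noise for the privacy computation, per the docstring:
# noise_multiplier * sqrt(gradient_accumulation_steps).
effective_noise = noise_multiplier * gradient_accumulation_steps**0.5

opt = DPKerasSGDOptimizer(
    l2_norm_clip=1.0,
    noise_multiplier=noise_multiplier,
    num_microbatches=one_step_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=0.1)

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
# Vector (per-example) loss, as required by the DP optimizer.
loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, reduction=tf.losses.Reduction.NONE)
model.compile(optimizer=opt, loss=loss)

# With a generator that yields one_step_batch_size examples per call, scale
# steps_per_epoch by gradient_accumulation_steps so each epoch still covers
# the whole dataset in effective batches.
num_examples = 60000  # assumed dataset size
steps_per_epoch = (num_examples // effective_batch_size) * gradient_accumulation_steps
# model.fit(data_generator, steps_per_epoch=steps_per_epoch, epochs=5)
```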