Implement the tree aggregation query in TFP.

The core `tree_aggregation` algorithm is from https://github.com/google-research/federated/tree/master/dp_ftrl. The tree_aggregation_query is partially developed by Monica Ribero Diaz when she was a student researcher at Google. PiperOrigin-RevId: 376953302
2021-06-01 17:26:38 -07:00 · 2021-06-01 17:26:38 -07:00 · 944dcd0e17
commit 944dcd0e17
parent 331a2911ac
5 changed files with 1493 additions and 0 deletions
--- a/tensorflow_privacy/init.py
+++ b/tensorflow_privacy/init.py
@ -47,6 +47,9 @@ else:
  from tensorflow_privacy.privacy.dp_query.quantile_estimator_query import QuantileEstimatorQuery
  from tensorflow_privacy.privacy.dp_query.quantile_estimator_query import NoPrivacyQuantileEstimatorQuery
  from tensorflow_privacy.privacy.dp_query.quantile_adaptive_clip_sum_query import QuantileAdaptiveClipSumQuery
+  from tensorflow_privacy.privacy.dp_query import tree_aggregation
+  from tensorflow_privacy.privacy.dp_query.tree_aggregation_query import TreeCumulativeSumQuery
+  from tensorflow_privacy.privacy.dp_query.tree_aggregation_query import TreeResidualSumQuery

  # Estimators
  from tensorflow_privacy.privacy.estimators.dnn import DNNClassifier
--- a/tensorflow_privacy/privacy/dp_query/tree_aggregation.py
+++ b/tensorflow_privacy/privacy/dp_query/tree_aggregation.py
@ -0,0 +1,367 @@
+# Copyright 2021, The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tree aggregation algorithm.
+
+This algorithm computes cumulative sums of noise based on tree aggregation. When
+using an appropriate noise function (e.g., Gaussian noise), it allows for
+efficient differentially private algorithms under continual observation, without
+prior subsampling or shuffling assumptions.
+"""
+
+import abc
+from typing import Any, Callable, Collection, Optional, Tuple, Union
+
+import attr
+import tensorflow as tf
+
+
+class ValueGenerator(metaclass=abc.ABCMeta):
+  """Base class establishing interface for stateful value generation."""
+
+  @abc.abstractmethod
+  def initialize(self):
+    """Returns initialized state."""
+
+  @abc.abstractmethod
+  def next(self, state):
+    """Returns tree node value and updated state."""
+
+
+class GaussianNoiseGenerator(ValueGenerator):
+  """Gaussian noise generator with counter as pseudo state."""
+
+  def __init__(self,
+               noise_std: float,
+               specs: Collection[tf.TensorSpec],
+               seed: Optional[int] = None):
+    self.noise_std = noise_std
+    self.specs = specs
+    self.seed = seed
+
+  def initialize(self):
+    if self.seed is None:
+      return tf.cast(
+          tf.stack([
+              tf.math.floor(tf.timestamp() * 1e6),
+              tf.math.floor(tf.math.log(tf.timestamp() * 1e6))
+          ]),
+          dtype=tf.int64)
+    else:
+      return tf.constant(self.seed, dtype=tf.int64, shape=(2,))
+
+  def next(self, state):
+    flat_structure = tf.nest.flatten(self.specs)
+    flat_seeds = [state + i for i in range(len(flat_structure))]
+    nest_seeds = tf.nest.pack_sequence_as(self.specs, flat_seeds)
+
+    def _get_noise(spec, seed):
+      return tf.random.stateless_normal(
+          shape=spec.shape, seed=seed, stddev=self.noise_std)
+
+    nest_noise = tf.nest.map_structure(_get_noise, self.specs, nest_seeds)
+    return nest_noise, flat_seeds[-1] + 1
+
+
+class StatelessValueGenerator(ValueGenerator):
+  """A wrapper for stateless value generator initialized by a no-arg function."""
+
+  def __init__(self, value_fn):
+    self.value_fn = value_fn
+
+  def initialize(self):
+    return ()
+
+  def next(self, state):
+    return self.value_fn(), state
+
+
+@attr.s(eq=False, frozen=True, slots=True)
+class TreeState(object):
+  """Class defining state of the tree.
+
+  Attributes:
+    level_buffer: A `tf.Tensor` saves the last node value of the left child
+      entered for the tree levels recorded in `level_buffer_idx`.
+    level_buffer_idx: A `tf.Tensor` for the tree level index of the
+      `level_buffer`.  The tree level index starts from 0, i.e.,
+      `level_buffer[0]` when `level_buffer_idx[0]==0` recorded the noise value
+      for the most recent leaf node.
+   value_generator_state: State of a stateful `ValueGenerator` for tree node.
+  """
+  level_buffer = attr.ib(type=tf.Tensor)
+  level_buffer_idx = attr.ib(type=tf.Tensor)
+  value_generator_state = attr.ib(type=Any)
+
+
+@tf.function
+def get_step_idx(state: TreeState) -> tf.Tensor:
+  """Returns the current leaf node index based on `TreeState.level_buffer_idx`."""
+  step_idx = tf.constant(-1, dtype=tf.int32)
+  for i in tf.range(len(state.level_buffer_idx)):
+    step_idx += tf.math.pow(2, state.level_buffer_idx[i])
+  return step_idx
+
+
+class TreeAggregator():
+  """Tree aggregator to compute accumulated noise in private algorithms.
+
+  This class implements the tree aggregation algorithm for noise values to
+  efficiently privatize streaming algorithms based on Dwork et al. (2010)
+  https://dl.acm.org/doi/pdf/10.1145/1806689.1806787. A buffer at the scale of
+  tree depth is maintained and updated when a new conceptual leaf node arrives.
+
+  Attributes:
+    value_generator: A `ValueGenerator` or a no-arg function to generate a noise
+      value for each tree node.
+  """
+
+  def __init__(self, value_generator: Union[ValueGenerator, Callable[[], Any]]):
+    """Initialize the aggregator with a noise generator."""
+    if isinstance(value_generator, ValueGenerator):
+      self.value_generator = value_generator
+    else:
+      self.value_generator = StatelessValueGenerator(value_generator)
+
+  def init_state(self) -> TreeState:
+    """Returns initial `TreeState`.
+
+    Initializes `TreeState` for a tree of a single leaf node: the respective
+    initial node value in `TreeState.level_buffer` is generated by the value
+    generator function, and the node index is 0.
+    """
+    value_generator_state = self.value_generator.initialize()
+    level_buffer_idx = tf.TensorArray(dtype=tf.int32, size=1, dynamic_size=True)
+    level_buffer_idx = level_buffer_idx.write(0, tf.constant(
+        0, dtype=tf.int32)).stack()
+
+    new_val, value_generator_state = self.value_generator.next(
+        value_generator_state)
+    level_buffer_structure = tf.nest.map_structure(
+        lambda x: tf.TensorArray(dtype=tf.float32, size=1, dynamic_size=True),
+        new_val)
+    level_buffer = tf.nest.map_structure(lambda x, y: x.write(0, y).stack(),
+                                         level_buffer_structure, new_val)
+
+    return TreeState(
+        level_buffer=level_buffer,
+        level_buffer_idx=level_buffer_idx,
+        value_generator_state=value_generator_state)
+
+  @tf.function
+  def _get_cumsum(self, level_buffer: Collection[tf.Tensor]) -> tf.Tensor:
+    return tf.nest.map_structure(lambda x: tf.reduce_sum(x, axis=0),
+                                 level_buffer)
+
+  @tf.function
+  def get_cumsum_and_update(self,
+                            state: TreeState) -> Tuple[tf.Tensor, TreeState]:
+    """Returns tree aggregated value and updated `TreeState` for one step.
+
+    `TreeState` is updated to prepare for accepting the *next* leaf node. Note
+    that `get_step_idx` can be called to get the current index of the leaf node
+    before calling this function. This function accept state for the current
+    leaf node and prepare for the next leaf node because TFF prefers to know
+    the types of state at initialization.
+
+    Args:
+      state: `TreeState` for the current leaf node, index can be queried by
+        `tree_aggregation.get_step_idx(state.level_buffer_idx)`.
+    """
+
+    level_buffer_idx, level_buffer, value_generator_state = (
+        state.level_buffer_idx, state.level_buffer, state.value_generator_state)
+    cumsum = self._get_cumsum(level_buffer)
+
+    new_level_buffer = tf.nest.map_structure(
+        lambda x: tf.TensorArray(  # pylint: disable=g-long-lambda
+            dtype=tf.float32,
+            size=0,
+            dynamic_size=True),
+        level_buffer)
+    new_level_buffer_idx = tf.TensorArray(
+        dtype=tf.int32, size=0, dynamic_size=True)
+    # `TreeState` stores the left child node necessary for computing the cumsum
+    # noise. To update the buffer, let us find the lowest level that will switch
+    # from a right child (not in the buffer) to a left child.
+    level_idx = 0  # new leaf node starts from level 0
+    while tf.less(level_idx, len(level_buffer_idx)) and tf.equal(
+        level_idx, level_buffer_idx[level_idx]):
+      level_idx += 1
+    # Left child nodes for the level lower than `level_idx` will be removed
+    # and a new node will be created at `level_idx`.
+    write_buffer_idx = 0
+    new_level_buffer_idx = new_level_buffer_idx.write(write_buffer_idx,
+                                                      level_idx)
+    new_value, value_generator_state = self.value_generator.next(
+        value_generator_state)
+    new_level_buffer = tf.nest.map_structure(
+        lambda x, y: x.write(write_buffer_idx, y), new_level_buffer, new_value)
+    write_buffer_idx += 1
+    # Buffer index will now different from level index for the old `TreeState`
+    # i.e., `level_buffer_idx[level_idx] != level_idx`. Rename parameter to
+    # buffer index for clarity.
+    buffer_idx = level_idx
+    while tf.less(buffer_idx, len(level_buffer_idx)):
+      new_level_buffer_idx = new_level_buffer_idx.write(
+          write_buffer_idx, level_buffer_idx[buffer_idx])
+      new_level_buffer = tf.nest.map_structure(
+          lambda nb, b: nb.write(write_buffer_idx, b[buffer_idx]),
+          new_level_buffer, level_buffer)
+      buffer_idx += 1
+      write_buffer_idx += 1
+    new_level_buffer_idx = new_level_buffer_idx.stack()
+    new_level_buffer = tf.nest.map_structure(lambda x: x.stack(),
+                                             new_level_buffer)
+    new_state = TreeState(
+        level_buffer=new_level_buffer,
+        level_buffer_idx=new_level_buffer_idx,
+        value_generator_state=value_generator_state)
+    return cumsum, new_state
+
+
+class EfficientTreeAggregator():
+  """Efficient tree aggregator to compute accumulated noise.
+
+  This class implements the efficient tree aggregation algorithm based on
+  Honaker 2015 "Efficient Use of Differentially Private Binary Trees".
+  The noise standard deviation for the note at depth d is roughly
+  `sigma * sqrt(2^{d-1}/(2^d-1))`. which becomes `sigma / sqrt(2)` when
+  the tree is very tall.
+
+  Attributes:
+    value_generator: A `ValueGenerator` or a no-arg function to generate a noise
+      value for each tree node.
+  """
+
+  def __init__(self, value_generator: Union[ValueGenerator, Callable[[], Any]]):
+    """Initialize the aggregator with a noise generator."""
+    if isinstance(value_generator, ValueGenerator):
+      self.value_generator = value_generator
+    else:
+      self.value_generator = StatelessValueGenerator(value_generator)
+
+  def init_state(self) -> TreeState:
+    """Returns initial `TreeState`.
+
+    Initializes `TreeState` for a tree of a single leaf node: the respective
+    initial node value in `TreeState.level_buffer` is generated by the value
+    generator function, and the node index is 0.
+    """
+    value_generator_state = self.value_generator.initialize()
+    level_buffer_idx = tf.TensorArray(dtype=tf.int32, size=1, dynamic_size=True)
+    level_buffer_idx = level_buffer_idx.write(0, tf.constant(
+        0, dtype=tf.int32)).stack()
+
+    new_val, value_generator_state = self.value_generator.next(
+        value_generator_state)
+    level_buffer_structure = tf.nest.map_structure(
+        lambda x: tf.TensorArray(dtype=tf.float32, size=1, dynamic_size=True),
+        new_val)
+    level_buffer = tf.nest.map_structure(lambda x, y: x.write(0, y).stack(),
+                                         level_buffer_structure, new_val)
+
+    return TreeState(
+        level_buffer=level_buffer,
+        level_buffer_idx=level_buffer_idx,
+        value_generator_state=value_generator_state)
+
+  @tf.function
+  def _get_cumsum(self, state: TreeState) -> tf.Tensor:
+    """Returns weighted cumulative sum of noise based on `TreeState`."""
+    # Note that the buffer saved recursive results of the weighted average of
+    # the node value (v) and its two children (l, r), i.e., node = v + (l+r)/2.
+    # To get unbiased estimation with reduced variance for each node, we have to
+    # reweight it by 1/(2-2^{-d}) where d is the depth of the node.
+    level_weights = tf.math.divide(
+        1., 2. - tf.math.pow(.5, tf.cast(state.level_buffer_idx, tf.float32)))
+
+    def _weighted_sum(buffer):
+      expand_shape = [len(level_weights)] + [1] * (len(tf.shape(buffer)) - 1)
+      weighted_buffer = tf.math.multiply(
+          buffer, tf.reshape(level_weights, expand_shape))
+      return tf.reduce_sum(weighted_buffer, axis=0)
+
+    return tf.nest.map_structure(_weighted_sum, state.level_buffer)
+
+  @tf.function
+  def get_cumsum_and_update(self,
+                            state: TreeState) -> Tuple[tf.Tensor, TreeState]:
+    """Returns tree aggregated value and updated `TreeState` for one step.
+
+    `TreeState` is updated to prepare for accepting the *next* leaf node. Note
+    that `get_step_idx` can be called to get the current index of the leaf node
+    before calling this function. This function accept state for the current
+    leaf node and prepare for the next leaf node because TFF prefers to know
+    the types of state at initialization. Note that the value of new node in
+    `TreeState.level_buffer` will depend on its two children, and is updated
+    from bottom up for the right child.
+
+    Args:
+      state: `TreeState` for the current leaf node, index can be queried by
+        `tree_aggregation.get_step_idx(state.level_buffer_idx)`.
+    """
+    cumsum = self._get_cumsum(state)
+
+    level_buffer_idx, level_buffer, value_generator_state = (
+        state.level_buffer_idx, state.level_buffer, state.value_generator_state)
+    new_level_buffer = tf.nest.map_structure(
+        lambda x: tf.TensorArray(  # pylint: disable=g-long-lambda
+            dtype=tf.float32,
+            size=0,
+            dynamic_size=True),
+        level_buffer)
+    new_level_buffer_idx = tf.TensorArray(
+        dtype=tf.int32, size=0, dynamic_size=True)
+    # `TreeState` stores the left child node necessary for computing the cumsum
+    # noise. To update the buffer, let us find the lowest level that will switch
+    # from a right child (not in the buffer) to a left child.
+    level_idx = 0  # new leaf node starts from level 0
+    new_value, value_generator_state = self.value_generator.next(
+        value_generator_state)
+    while tf.less(level_idx, len(level_buffer_idx)) and tf.equal(
+        level_idx, level_buffer_idx[level_idx]):
+      # Recursively update if the current node is a right child.
+      node_value, value_generator_state = self.value_generator.next(
+          value_generator_state)
+      new_value = tf.nest.map_structure(
+          lambda l, r, n: 0.5 * (l[level_idx] + r) + n, level_buffer, new_value,
+          node_value)
+      level_idx += 1
+    # A new (left) node will be created at `level_idx`.
+    write_buffer_idx = 0
+    new_level_buffer_idx = new_level_buffer_idx.write(write_buffer_idx,
+                                                      level_idx)
+    new_level_buffer = tf.nest.map_structure(
+        lambda x, y: x.write(write_buffer_idx, y), new_level_buffer, new_value)
+    write_buffer_idx += 1
+    # Buffer index will now different from level index for the old `TreeState`
+    # i.e., `level_buffer_idx[level_idx] != level_idx`. Rename parameter to
+    # buffer index for clarity.
+    buffer_idx = level_idx
+    while tf.less(buffer_idx, len(level_buffer_idx)):
+      new_level_buffer_idx = new_level_buffer_idx.write(
+          write_buffer_idx, level_buffer_idx[buffer_idx])
+      new_level_buffer = tf.nest.map_structure(
+          lambda nb, b: nb.write(write_buffer_idx, b[buffer_idx]),
+          new_level_buffer, level_buffer)
+      buffer_idx += 1
+      write_buffer_idx += 1
+    new_level_buffer_idx = new_level_buffer_idx.stack()
+    new_level_buffer = tf.nest.map_structure(lambda x: x.stack(),
+                                             new_level_buffer)
+    new_state = TreeState(
+        level_buffer=new_level_buffer,
+        level_buffer_idx=new_level_buffer_idx,
+        value_generator_state=value_generator_state)
+    return cumsum, new_state
--- a/tensorflow_privacy/privacy/dp_query/tree_aggregation_query.py
+++ b/tensorflow_privacy/privacy/dp_query/tree_aggregation_query.py
@ -0,0 +1,355 @@
+# Copyright 2021, The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DPQuery for continual observation queries relying on `tree_aggregation`."""
+
+import attr
+import tensorflow as tf
+
+from tensorflow_privacy.privacy.dp_query import dp_query
+from tensorflow_privacy.privacy.dp_query import tree_aggregation
+
+
+class TreeCumulativeSumQuery(dp_query.SumAggregationDPQuery):
+  """Implements dp_query for adding correlated noise through tree structure.
+
+  First clips and sums records in current sample, returns cumulative sum of
+  samples over time (instead of only current sample) with added noise for
+  cumulative sum proportional to log(T), T being the number of times the query
+  is called.
+
+  Attributes:
+    clip_fn: Callable that specifies clipping function. `clip_fn` receives two
+      arguments: a flat list of vars in a record and a `clip_value` to clip the
+      corresponding record, e.g. clip_fn(flat_record, clip_value).
+    clip_value: float indicating the value at which to clip the record.
+    record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+    tree_aggregator: `tree_aggregation.TreeAggregator` initialized with
+      user defined `noise_generator`. `noise_generator` is a
+      `tree_aggregation.ValueGenerator` to generate the noise value for a tree
+      node. Noise stdandard deviation is specified outside the `dp_query` by the
+      user when defining `noise_fn` and should have order
+      O(clip_norm*log(T)/eps) to guarantee eps-DP.
+  """
+
+  @attr.s(frozen=True)
+  class GlobalState(object):
+    """Class defining global state for Tree sum queries.
+
+    Attributes:
+      tree_state: Current state of noise tree keeping track of current leaf and
+        each level state.
+      clip_value: The clipping value to be passed to clip_fn.
+      samples_cumulative_sum: Noiseless cumulative sum of samples over time.
+    """
+    tree_state = attr.ib()
+    clip_value = attr.ib()
+    samples_cumulative_sum = attr.ib()
+
+  def __init__(self,
+               record_specs,
+               noise_generator,
+               clip_fn,
+               clip_value,
+               use_efficient=True):
+    """Initializes the `TreeCumulativeSumQuery`.
+
+    Consider using `build_l2_gaussian_query` for the construction of a
+    `TreeCumulativeSumQuery` with L2 norm clipping and Gaussian noise.
+
+    Args:
+      record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+      noise_generator: `tree_aggregation.ValueGenerator` to generate the noise
+        value for a tree node. Should be coupled with clipping norm to guarantee
+        privacy.
+      clip_fn: Callable that specifies clipping function. Input to clip is a
+        flat list of vars in a record.
+      clip_value: Float indicating the value at which to clip the record.
+      use_efficient: Boolean indicating the usage of the efficient tree
+        aggregation algorithm based on the paper "Efficient Use of
+        Differentially Private Binary Trees".
+    """
+    self._clip_fn = clip_fn
+    self._clip_value = clip_value
+    self._record_specs = record_specs
+    if use_efficient:
+      self._tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+          noise_generator)
+    else:
+      self._tree_aggregator = tree_aggregation.TreeAggregator(noise_generator)
+
+  def initial_global_state(self):
+    """Returns initial global state."""
+    initial_tree_state = self._tree_aggregator.init_state()
+    initial_samples_cumulative_sum = tf.nest.map_structure(
+        lambda spec: tf.zeros(spec.shape), self._record_specs)
+    initial_state = TreeCumulativeSumQuery.GlobalState(
+        tree_state=initial_tree_state,
+        clip_value=tf.constant(self._clip_value, tf.float32),
+        samples_cumulative_sum=initial_samples_cumulative_sum)
+    return initial_state
+
+  def derive_sample_params(self, global_state):
+    return global_state.clip_value
+
+  def preprocess_record(self, params, record):
+    """Returns the clipped record using `clip_fn` and params.
+
+    Args:
+      params: `clip_value` for the record.
+      record: The record to be processed.
+
+    Returns:
+      Structure of clipped tensors.
+    """
+    clip_value = params
+    record_as_list = tf.nest.flatten(record)
+    clipped_as_list = self._clip_fn(record_as_list, clip_value)
+    return tf.nest.pack_sequence_as(record, clipped_as_list)
+
+  def get_noised_result(self, sample_state, global_state):
+    """Updates tree, state, and returns noised cumulative sum and updated state.
+
+    Computes new cumulative sum, and returns its noised value. Grows tree_state
+    by one new leaf, and returns the new state.
+
+    Args:
+      sample_state: Sum of clipped records for this round.
+      global_state: Global state with current samples cumulative sum and tree
+        state.
+
+    Returns:
+      A tuple of (noised_cumulative_sum, new_global_state).
+    """
+    new_cumulative_sum = tf.nest.map_structure(
+        tf.add, global_state.samples_cumulative_sum, sample_state)
+    cumulative_sum_noise, new_tree_state = self._tree_aggregator.get_cumsum_and_update(
+        global_state.tree_state)
+    new_global_state = attr.evolve(
+        global_state,
+        samples_cumulative_sum=new_cumulative_sum,
+        tree_state=new_tree_state)
+    noised_cum_sum = tf.nest.map_structure(tf.add, new_cumulative_sum,
+                                           cumulative_sum_noise)
+    return noised_cum_sum, new_global_state
+
+  @classmethod
+  def build_l2_gaussian_query(cls,
+                              clip_norm,
+                              noise_multiplier,
+                              record_specs,
+                              noise_seed=None,
+                              use_efficient=True):
+    """Returns a query instance with L2 norm clipping and Gaussian noise.
+
+    Args:
+      clip_norm: Each record will be clipped so that it has L2 norm at most
+        `clip_norm`.
+      noise_multiplier: The effective noise multiplier for the sum of records.
+        Noise standard deviation is `clip_norm*noise_multiplier`.
+      record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+      noise_seed: Integer seed for the Gaussian noise generator. If `None`, a
+        nondeterministic seed based on system time will be generated.
+      use_efficient: Boolean indicating the usage of the efficient tree
+        aggregation algorithm based on the paper "Efficient Use of
+        Differentially Private Binary Trees".
+    """
+    if clip_norm <= 0:
+      raise ValueError(f'`clip_norm` must be positive, got {clip_norm}.')
+
+    if noise_multiplier < 0:
+      raise ValueError(
+          f'`noise_multiplier` must be non-negative, got {noise_multiplier}.')
+
+    gaussian_noise_generator = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=clip_norm * noise_multiplier,
+        specs=record_specs,
+        seed=noise_seed)
+
+    def l2_clip_fn(record_as_list, clip_norm):
+      clipped_record, _ = tf.clip_by_global_norm(record_as_list, clip_norm)
+      return clipped_record
+
+    return cls(
+        clip_fn=l2_clip_fn,
+        clip_value=clip_norm,
+        record_specs=record_specs,
+        noise_generator=gaussian_noise_generator,
+        use_efficient=use_efficient)
+
+
+class TreeResidualSumQuery(dp_query.SumAggregationDPQuery):
+  """Implements dp_query for adding correlated noise through tree structure.
+
+  Clips and sums records in current sample; returns the current sample adding
+  the noise residual from tree aggregation. The returned value is conceptually
+  equivalent to the following: calculates cumulative sum of samples over time
+  (instead of only current sample) with added noise for cumulative sum
+  proportional to log(T), T being the number of times the query is called;
+  returns the residual between the current noised cumsum and the previous one
+  when the query is called. Combining this query with a SGD optimizer can be
+  used to implement the DP-FTRL algorithm in
+  "Practical and Private (Deep) Learning without Sampling or Shuffling".
+
+  Attributes:
+    clip_fn: Callable that specifies clipping function. `clip_fn` receives two
+      arguments: a flat list of vars in a record and a `clip_value` to clip the
+        corresponding record, e.g. clip_fn(flat_record, clip_value).
+    clip_value: float indicating the value at which to clip the record.
+    record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+    tree_aggregator: `tree_aggregation.TreeAggregator` initialized with user
+      defined `noise_generator`. `noise_generator` is a
+      `tree_aggregation.ValueGenerator` to generate the noise value for a tree
+      node. Noise stdandard deviation is specified outside the `dp_query` by the
+      user when defining `noise_fn` and should have order
+      O(clip_norm*log(T)/eps) to guarantee eps-DP.
+  """
+
+  @attr.s(frozen=True)
+  class GlobalState(object):
+    """Class defining global state for Tree sum queries.
+
+    Attributes:
+      tree_state: Current state of noise tree keeping track of current leaf and
+        each level state.
+      clip_value: The clipping value to be passed to clip_fn.
+      previous_tree_noise: Cumulative noise by tree aggregation from the
+        previous time the query is called on a sample.
+    """
+    tree_state = attr.ib()
+    clip_value = attr.ib()
+    previous_tree_noise = attr.ib()
+
+  def __init__(self,
+               record_specs,
+               noise_generator,
+               clip_fn,
+               clip_value,
+               use_efficient=True):
+    """Initializes the `TreeResidualSumQuery`.
+
+    Consider using `build_l2_gaussian_query` for the construction of a
+    `TreeResidualSumQuery` with L2 norm clipping and Gaussian noise.
+
+    Args:
+      record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+      noise_generator: `tree_aggregation.ValueGenerator` to generate the noise
+        value for a tree node. Should be coupled with clipping norm to guarantee
+        privacy.
+      clip_fn: Callable that specifies clipping function. Input to clip is a
+        flat list of vars in a record.
+      clip_value: Float indicating the value at which to clip the record.
+      use_efficient: Boolean indicating the usage of the efficient tree
+        aggregation algorithm based on the paper "Efficient Use of
+        Differentially Private Binary Trees".
+    """
+    self._clip_fn = clip_fn
+    self._clip_value = clip_value
+    self._record_specs = record_specs
+    if use_efficient:
+      self._tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+          noise_generator)
+    else:
+      self._tree_aggregator = tree_aggregation.TreeAggregator(noise_generator)
+
+  def initial_global_state(self):
+    """Returns initial global state."""
+    initial_tree_state = self._tree_aggregator.init_state()
+    initial_noise = tf.nest.map_structure(lambda spec: tf.zeros(spec.shape),
+                                          self._record_specs)
+    return TreeResidualSumQuery.GlobalState(
+        tree_state=initial_tree_state,
+        clip_value=tf.constant(self._clip_value, tf.float32),
+        previous_tree_noise=initial_noise)
+
+  def derive_sample_params(self, global_state):
+    return global_state.clip_value
+
+  def preprocess_record(self, params, record):
+    """Returns the clipped record using `clip_fn` and params.
+
+    Args:
+      params: `clip_value` for the record.
+      record: The record to be processed.
+
+    Returns:
+      Structure of clipped tensors.
+    """
+    clip_value = params
+    record_as_list = tf.nest.flatten(record)
+    clipped_as_list = self._clip_fn(record_as_list, clip_value)
+    return tf.nest.pack_sequence_as(record, clipped_as_list)
+
+  def get_noised_result(self, sample_state, global_state):
+    """Updates tree state, and returns residual of noised cumulative sum.
+
+    Args:
+      sample_state: Sum of clipped records for this round.
+      global_state: Global state with current samples cumulative sum and tree
+        state.
+
+    Returns:
+      A tuple of (noised_cumulative_sum, new_global_state).
+    """
+    tree_noise, new_tree_state = self._tree_aggregator.get_cumsum_and_update(
+        global_state.tree_state)
+    noised_sample = tf.nest.map_structure(lambda a, b, c: a + b - c,
+                                          sample_state, tree_noise,
+                                          global_state.previous_tree_noise)
+    new_global_state = attr.evolve(
+        global_state, previous_tree_noise=tree_noise, tree_state=new_tree_state)
+    return noised_sample, new_global_state
+
+  @classmethod
+  def build_l2_gaussian_query(cls,
+                              clip_norm,
+                              noise_multiplier,
+                              record_specs,
+                              noise_seed=None,
+                              use_efficient=True):
+    """Returns `TreeResidualSumQuery` with L2 norm clipping and Gaussian noise.
+
+    Args:
+      clip_norm: Each record will be clipped so that it has L2 norm at most
+        `clip_norm`.
+      noise_multiplier: The effective noise multiplier for the sum of records.
+        Noise standard deviation is `clip_norm*noise_multiplier`.
+      record_specs: `Collection[tf.TensorSpec]` specifying shapes of records.
+      noise_seed: Integer seed for the Gaussian noise generator. If `None`, a
+        nondeterministic seed based on system time will be generated.
+      use_efficient: Boolean indicating the usage of the efficient tree
+        aggregation algorithm based on the paper "Efficient Use of
+        Differentially Private Binary Trees".
+    """
+    if clip_norm <= 0:
+      raise ValueError(f'`clip_norm` must be positive, got {clip_norm}.')
+
+    if noise_multiplier < 0:
+      raise ValueError(
+          f'`noise_multiplier` must be non-negative, got {noise_multiplier}.')
+
+    gaussian_noise_generator = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=clip_norm * noise_multiplier,
+        specs=record_specs,
+        seed=noise_seed)
+
+    def l2_clip_fn(record_as_list, clip_norm):
+      clipped_record, _ = tf.clip_by_global_norm(record_as_list, clip_norm)
+      return clipped_record
+
+    return cls(
+        clip_fn=l2_clip_fn,
+        clip_value=clip_norm,
+        record_specs=record_specs,
+        noise_generator=gaussian_noise_generator,
+        use_efficient=use_efficient)
--- a/tensorflow_privacy/privacy/dp_query/tree_aggregation_query_test.py
+++ b/tensorflow_privacy/privacy/dp_query/tree_aggregation_query_test.py
@ -0,0 +1,399 @@
+# Copyright 2021, The TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for `tree_aggregation_query`."""
+
+from absl.testing import parameterized
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow_privacy.privacy.dp_query import test_utils
+from tensorflow_privacy.privacy.dp_query import tree_aggregation
+from tensorflow_privacy.privacy.dp_query import tree_aggregation_query
+
+
+STRUCT_RECORD = [
+    tf.constant([[2.0, 0.0], [0.0, 1.0]]),
+    tf.constant([-1.0, 0.0])
+]
+
+SINGLE_VALUE_RECORDS = [tf.constant(1.), tf.constant(3.), tf.constant(5.)]
+
+STRUCTURE_SPECS = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                        STRUCT_RECORD)
+NOISE_STD = 5.0
+
+STREAMING_SCALARS = np.array(range(7), dtype=np.single)
+
+
+def _get_noise_generator(specs, stddev=NOISE_STD, seed=1):
+  return tree_aggregation.GaussianNoiseGenerator(
+      noise_std=stddev, specs=specs, seed=seed)
+
+
+def _get_noise_fn(specs, stddev=NOISE_STD, seed=1):
+  random_generator = tf.random.Generator.from_seed(seed)
+
+  def noise_fn():
+    shape = tf.nest.map_structure(lambda spec: spec.shape, specs)
+    return tf.nest.map_structure(
+        lambda x: random_generator.normal(x, stddev=stddev), shape)
+
+  return noise_fn
+
+
+def _get_no_noise_fn(specs):
+  shape = tf.nest.map_structure(lambda spec: spec.shape, specs)
+  def no_noise_fn():
+    return tf.nest.map_structure(tf.zeros, shape)
+
+  return no_noise_fn
+
+
+def _get_l2_clip_fn():
+
+  def l2_clip_fn(record_as_list, clip_value):
+    clipped_record, _ = tf.clip_by_global_norm(record_as_list, clip_value)
+    return clipped_record
+
+  return l2_clip_fn
+
+
+def _get_l_infty_clip_fn():
+
+  def l_infty_clip_fn(record_as_list, clip_value):
+    def clip(record):
+      return tf.clip_by_value(
+          record, clip_value_min=-clip_value, clip_value_max=clip_value)
+
+    clipped_record = tf.nest.map_structure(clip, record_as_list)
+    return clipped_record
+
+  return l_infty_clip_fn
+
+
+class TreeCumulativeSumQueryTest(tf.test.TestCase, parameterized.TestCase):
+
+  def test_correct_initial_global_state_struct_type(self):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=10.,
+        noise_generator=_get_no_noise_fn(STRUCTURE_SPECS),
+        record_specs=STRUCTURE_SPECS)
+    global_state = query.initial_global_state()
+
+    self.assertIsInstance(global_state.tree_state, tree_aggregation.TreeState)
+    expected_cum_sum = tf.nest.map_structure(lambda spec: tf.zeros(spec.shape),
+                                             STRUCTURE_SPECS)
+    self.assertAllClose(expected_cum_sum, global_state.samples_cumulative_sum)
+
+  def test_correct_initial_global_state_single_value_type(self):
+    record_specs = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                         SINGLE_VALUE_RECORDS[0])
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=10.,
+        noise_generator=_get_no_noise_fn(record_specs),
+        record_specs=record_specs)
+    global_state = query.initial_global_state()
+
+    self.assertIsInstance(global_state.tree_state, tree_aggregation.TreeState)
+    expected_cum_sum = tf.nest.map_structure(lambda spec: tf.zeros(spec.shape),
+                                             record_specs)
+    self.assertAllClose(expected_cum_sum, global_state.samples_cumulative_sum)
+
+  @parameterized.named_parameters(
+      ('not_clip_single_record', SINGLE_VALUE_RECORDS[0], 10.0),
+      ('clip_single_record', SINGLE_VALUE_RECORDS[1], 1.0))
+  def test_l2_clips_single_record(self, record, l2_norm_clip):
+    record_specs = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                         SINGLE_VALUE_RECORDS[0])
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=l2_norm_clip,
+        noise_generator=_get_no_noise_fn(record_specs),
+        record_specs=record_specs)
+    global_state = query.initial_global_state()
+    record_norm = tf.norm(record)
+    if record_norm > l2_norm_clip:
+      expected_clipped_record = tf.nest.map_structure(
+          lambda t: t * l2_norm_clip / record_norm, record)
+    else:
+      expected_clipped_record = record
+    clipped_record = query.preprocess_record(global_state.clip_value, record)
+    self.assertAllClose(expected_clipped_record, clipped_record)
+
+  @parameterized.named_parameters(
+      ('not_clip_structure_record', STRUCT_RECORD, 10.0),
+      ('clip_structure_record', STRUCT_RECORD, 1.0))
+  def test_l2_clips_structure_type_record(self, record, l2_norm_clip):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=l2_norm_clip,
+        noise_generator=_get_no_noise_fn(STRUCTURE_SPECS),
+        record_specs=tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                           record))
+    global_state = query.initial_global_state()
+    record_norm = tf.linalg.global_norm(record)
+    if record_norm > l2_norm_clip:
+      expected_clipped_record = tf.nest.map_structure(
+          lambda t: t * l2_norm_clip / record_norm, record)
+    else:
+      expected_clipped_record = record
+    clipped_record = query.preprocess_record(global_state.clip_value, record)
+    self.assertAllClose(expected_clipped_record, clipped_record)
+
+  @parameterized.named_parameters(
+      ('not_clip_single_record', SINGLE_VALUE_RECORDS[0], 10.0),
+      ('clip_single_record', SINGLE_VALUE_RECORDS[1], 1.0))
+  def test_l_infty_clips_single_record(self, record, norm_clip):
+    record_specs = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                         SINGLE_VALUE_RECORDS[0])
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l_infty_clip_fn(),
+        clip_value=norm_clip,
+        noise_generator=_get_no_noise_fn(record_specs),
+        record_specs=record_specs)
+    global_state = query.initial_global_state()
+    expected_clipped_record = tf.nest.map_structure(
+        lambda t: tf.clip_by_value(t, -norm_clip, norm_clip), record)
+    clipped_record = query.preprocess_record(global_state.clip_value, record)
+    self.assertAllClose(expected_clipped_record, clipped_record)
+
+  @parameterized.named_parameters(
+      ('not_clip_structure_record', STRUCT_RECORD, 10.0),
+      ('clip_structure_record', STRUCT_RECORD, 1.0))
+  def test_linfty_clips_structure_type_record(self, record, norm_clip):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l_infty_clip_fn(),
+        clip_value=norm_clip,
+        noise_generator=_get_no_noise_fn(STRUCTURE_SPECS),
+        record_specs=tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                           record))
+    global_state = query.initial_global_state()
+    expected_clipped_record = tf.nest.map_structure(
+        lambda t: tf.clip_by_value(t, -norm_clip, norm_clip), record)
+    clipped_record = query.preprocess_record(global_state.clip_value, record)
+    self.assertAllClose(expected_clipped_record, clipped_record)
+
+  def test_noiseless_query_single_value_type_record(self):
+    record_specs = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                         SINGLE_VALUE_RECORDS[0])
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=10.,
+        noise_generator=_get_no_noise_fn(record_specs),
+        record_specs=record_specs)
+    query_result, _ = test_utils.run_query(query, SINGLE_VALUE_RECORDS)
+    expected = tf.constant(9.)
+    self.assertAllClose(query_result, expected)
+
+  def test_noiseless_query_structure_type_record(self):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=10.,
+        noise_generator=_get_no_noise_fn(STRUCTURE_SPECS),
+        record_specs=STRUCTURE_SPECS)
+    query_result, _ = test_utils.run_query(query,
+                                           [STRUCT_RECORD, STRUCT_RECORD])
+    expected = tf.nest.map_structure(lambda a, b: a + b, STRUCT_RECORD,
+                                     STRUCT_RECORD)
+    self.assertAllClose(query_result, expected)
+
+  @parameterized.named_parameters(
+      ('two_records_noise_fn', [2.71828, 3.14159], _get_noise_fn),
+      ('five_records_noise_fn', np.random.uniform(size=5).tolist(),
+       _get_noise_fn),
+      ('two_records_generator', [2.71828, 3.14159], _get_noise_generator),
+      ('five_records_generator', np.random.uniform(size=5).tolist(),
+       _get_noise_generator),
+  )
+  def test_noisy_cumsum_and_state_update(self, records, value_generator):
+    num_trials = 200
+    record_specs = tf.nest.map_structure(lambda t: tf.TensorSpec(tf.shape(t)),
+                                         records[0])
+    noised_sums = []
+    for i in range(num_trials):
+      query = tree_aggregation_query.TreeCumulativeSumQuery(
+          clip_fn=_get_l2_clip_fn(),
+          clip_value=10.,
+          noise_generator=value_generator(record_specs, seed=i),
+          record_specs=record_specs)
+      query_result, _ = test_utils.run_query(query, records)
+      noised_sums.append(query_result)
+    result_stddev = np.std(noised_sums)
+    self.assertNear(result_stddev, NOISE_STD, 0.7)  # value for chi-squared test
+
+  @parameterized.named_parameters(
+      ('no_clip', STREAMING_SCALARS, 10., np.cumsum(STREAMING_SCALARS)),
+      ('all_clip', STREAMING_SCALARS, 0.5, STREAMING_SCALARS * 0.5),
+      # STREAMING_SCALARS is list(range(7)), only the last element is clipped
+      # for the following test, which makes the expected value for the last sum
+      # to be `cumsum(5)+5`.
+      ('partial_clip', STREAMING_SCALARS, 5.,
+       np.append(np.cumsum(STREAMING_SCALARS[:-1]), 20.)),
+  )
+  def test_partial_sum_scalar_no_noise(self, streaming_scalars, clip_norm,
+                                       partial_sum):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=clip_norm,
+        noise_generator=lambda: 0.,
+        record_specs=tf.TensorSpec([]),
+    )
+    global_state = query.initial_global_state()
+    params = query.derive_sample_params(global_state)
+    for scalar, expected_sum in zip(streaming_scalars, partial_sum):
+      sample_state = query.initial_sample_state(scalar)
+      sample_state = query.accumulate_record(params, sample_state, scalar)
+      query_result, global_state = query.get_noised_result(
+          sample_state, global_state)
+      self.assertEqual(query_result, expected_sum)
+
+  @parameterized.named_parameters(
+      ('s0t1step8', 0., 1., [1., 1., 2., 1., 2., 2., 3., 1.]),
+      ('s1t1step8', 1., 1., [2., 3., 5., 5., 7., 8., 10., 9.]),
+      ('s1t2step8', 1., 2., [3., 4., 7., 6., 9., 10., 13., 10.]),
+  )
+  def test_partial_sum_scalar_tree_aggregation(self, scalar_value,
+                                               tree_node_value,
+                                               expected_values):
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=scalar_value + 1.,  # no clip
+        noise_generator=lambda: tree_node_value,
+        record_specs=tf.TensorSpec([]),
+        use_efficient=False,
+    )
+    global_state = query.initial_global_state()
+    params = query.derive_sample_params(global_state)
+    for val in expected_values:
+      # For each streaming step i , the expected value is roughly
+      # `scalar_value*i + tree_aggregation(tree_node_value, i)`
+      sample_state = query.initial_sample_state(scalar_value)
+      sample_state = query.accumulate_record(params, sample_state, scalar_value)
+      query_result, global_state = query.get_noised_result(
+          sample_state, global_state)
+      self.assertEqual(query_result, val)
+
+  @parameterized.named_parameters(
+      ('efficient', True, tree_aggregation.EfficientTreeAggregator),
+      ('normal', False, tree_aggregation.TreeAggregator),
+  )
+  def test_sum_tree_aggregator_instance(self, use_efficient, tree_class):
+    specs = tf.TensorSpec([])
+    query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=1.,
+        noise_generator=_get_noise_fn(specs, 1.),
+        record_specs=specs,
+        use_efficient=use_efficient,
+    )
+    self.assertIsInstance(query._tree_aggregator, tree_class)
+
+  @parameterized.named_parameters(
+      ('r5d10n0s1s16eff', 5, 10, 0., 1, 16, 0.1, True),
+      ('r3d5n1s1s32eff', 3, 5, 1., 1, 32, 1., True),
+      ('r10d3n1s2s16eff', 10, 3, 1., 2, 16, 10., True),
+      ('r10d3n1s2s16', 10, 3, 1., 2, 16, 10., False),
+  )
+  def test_build_l2_gaussian_query(self, records_num, record_dim,
+                                   noise_multiplier, seed, total_steps, clip,
+                                   use_efficient):
+    record_specs = tf.TensorSpec(shape=[record_dim])
+    query = tree_aggregation_query.TreeCumulativeSumQuery.build_l2_gaussian_query(
+        clip_norm=clip,
+        noise_multiplier=noise_multiplier,
+        record_specs=record_specs,
+        noise_seed=seed,
+        use_efficient=use_efficient)
+    reference_query = tree_aggregation_query.TreeCumulativeSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=clip,
+        noise_generator=_get_noise_generator(record_specs,
+                                             clip * noise_multiplier, seed),
+        record_specs=record_specs,
+        use_efficient=use_efficient)
+    global_state = query.initial_global_state()
+    reference_global_state = reference_query.initial_global_state()
+
+    for _ in range(total_steps):
+      records = [
+          tf.random.uniform(shape=[record_dim], maxval=records_num)
+          for _ in range(records_num)
+      ]
+      query_result, global_state = test_utils.run_query(query, records,
+                                                        global_state)
+      reference_query_result, reference_global_state = test_utils.run_query(
+          reference_query, records, reference_global_state)
+      self.assertAllClose(query_result, reference_query_result, rtol=1e-6)
+
+
+class TreeResidualQueryTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('r5d10n0s1s16eff', 5, 10, 0., 1, 16, 0.1, True),
+      ('r3d5n1s1s32eff', 3, 5, 1., 1, 32, 1., True),
+      ('r10d3n1s2s16eff', 10, 3, 1., 2, 16, 10., True),
+      ('r10d3n1s2s16', 10, 3, 1., 2, 16, 10., False),
+  )
+  def test_sum(self, records_num, record_dim, noise_multiplier, seed,
+               total_steps, clip, use_efficient):
+    record_specs = tf.TensorSpec(shape=[record_dim])
+    query = tree_aggregation_query.TreeResidualSumQuery.build_l2_gaussian_query(
+        clip_norm=clip,
+        noise_multiplier=noise_multiplier,
+        record_specs=record_specs,
+        noise_seed=seed,
+        use_efficient=use_efficient)
+    sum_query = tree_aggregation_query.TreeCumulativeSumQuery.build_l2_gaussian_query(
+        clip_norm=clip,
+        noise_multiplier=noise_multiplier,
+        record_specs=record_specs,
+        noise_seed=seed,
+        use_efficient=use_efficient)
+    global_state = query.initial_global_state()
+    sum_global_state = sum_query.initial_global_state()
+
+    cumsum_result = tf.zeros(shape=[record_dim])
+    for _ in range(total_steps):
+      records = [
+          tf.random.uniform(shape=[record_dim], maxval=records_num)
+          for _ in range(records_num)
+      ]
+      query_result, global_state = test_utils.run_query(query, records,
+                                                        global_state)
+      sum_query_result, sum_global_state = test_utils.run_query(
+          sum_query, records, sum_global_state)
+      cumsum_result += query_result
+      self.assertAllClose(cumsum_result, sum_query_result, rtol=1e-6)
+
+  @parameterized.named_parameters(
+      ('efficient', True, tree_aggregation.EfficientTreeAggregator),
+      ('normal', False, tree_aggregation.TreeAggregator),
+  )
+  def test_sum_tree_aggregator_instance(self, use_efficient, tree_class):
+    specs = tf.TensorSpec([])
+    query = tree_aggregation_query.TreeResidualSumQuery(
+        clip_fn=_get_l2_clip_fn(),
+        clip_value=1.,
+        noise_generator=_get_noise_fn(specs, 1.),
+        record_specs=specs,
+        use_efficient=use_efficient,
+    )
+    self.assertIsInstance(query._tree_aggregator, tree_class)
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/tensorflow_privacy/privacy/dp_query/tree_aggregation_test.py
+++ b/tensorflow_privacy/privacy/dp_query/tree_aggregation_test.py
@ -0,0 +1,369 @@
+# Copyright 2021, Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for `tree_aggregation`."""
+import math
+from absl.testing import parameterized
+
+import tensorflow as tf
+from tensorflow_privacy.privacy.dp_query import tree_aggregation
+
+
+class ConstantValueGenerator(tree_aggregation.ValueGenerator):
+
+  def __init__(self, constant_value):
+    self.constant_value = constant_value
+
+  def initialize(self):
+    return ()
+
+  def next(self, state):
+    return self.constant_value, state
+
+
+class TreeAggregatorTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('total4_step1', 4, [1, 1, 2, 1], 1),
+      ('total5_step1', 5, [1, 1, 2, 1, 2], 1),
+      ('total6_step1', 6, [1, 1, 2, 1, 2, 2], 1),
+      ('total7_step1', 7, [1, 1, 2, 1, 2, 2, 3], 1),
+      ('total8_step1', 8, [1, 1, 2, 1, 2, 2, 3, 1], 1),
+      ('total8_step2', 8, [2, 2, 4, 2, 4, 4, 6, 2], 2),
+      ('total8_step0d5', 8, [0.5, 0.5, 1, 0.5, 1, 1, 1.5, 0.5], 0.5))
+  def test_tree_sum_steps_expected(self, total_steps, expected_values,
+                                   node_value):
+    # Test whether `tree_aggregator` will output `expected_value` in each step
+    # when `total_steps` of leaf nodes are traversed. The value of each tree
+    # node is a constant `node_value` for test purpose. Note that `node_value`
+    # denotes the "noise" without private values in private algorithms.
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=ConstantValueGenerator(node_value))
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      self.assertEqual(expected_values[leaf_node_idx], val)
+
+  @parameterized.named_parameters(
+      ('total16_step1', 16, 1, 1),
+      ('total17_step1', 17, 2, 1),
+      ('total18_step1', 18, 2, 1),
+      ('total19_step1', 19, 3, 1),
+      ('total20_step0d5', 20, 1, 0.5),
+      ('total21_step2', 21, 6, 2),
+      ('total1024_step1', 1024, 1, 1),
+      ('total1025_step1', 1025, 2, 1),
+      ('total1026_step1', 1026, 2, 1),
+      ('total1027_step1', 1027, 3, 1),
+      ('total1028_step0d5', 1028, 1, 0.5),
+      ('total1029_step2', 1029, 6, 2),
+  )
+  def test_tree_sum_last_step_expected(self, total_steps, expected_value,
+                                       node_value):
+    # Test whether `tree_aggregator` will output `expected_value` after
+    # `total_steps` of leaf nodes are traversed. The value of each tree node
+    # is a constant `node_value` for test purpose. Note that `node_value`
+    # denotes the "noise" without private values in private algorithms.
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=ConstantValueGenerator(node_value))
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+    self.assertEqual(expected_value, val)
+
+  @parameterized.named_parameters(
+      ('total16_step1', 16, 1, 1),
+      ('total17_step1', 17, 2, 1),
+      ('total18_step1', 18, 2, 1),
+      ('total19_step1', 19, 3, 1),
+      ('total20_step0d5', 20, 1, 0.5),
+      ('total21_step2', 21, 6, 2),
+      ('total1024_step1', 1024, 1, 1),
+      ('total1025_step1', 1025, 2, 1),
+      ('total1026_step1', 1026, 2, 1),
+      ('total1027_step1', 1027, 3, 1),
+      ('total1028_step0d5', 1028, 1, 0.5),
+      ('total1029_step2', 1029, 6, 2),
+  )
+  def test_tree_sum_last_step_expected_value_fn(self, total_steps,
+                                                expected_value, node_value):
+    # Test no-arg function as stateless value generator.
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=lambda: node_value)
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+    self.assertEqual(expected_value, val)
+
+  @parameterized.named_parameters(
+      ('total8_step1', 8, 1),
+      ('total8_step2', 8, 2),
+      ('total8_step0d5', 8, 0.5),
+      ('total32_step0d5', 32, 0.5),
+      ('total1024_step0d5', 1024, 0.5),
+      ('total2020_step0d5', 2020, 0.5),
+  )
+  def test_tree_sum_steps_max(self, total_steps, node_value):
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=ConstantValueGenerator(node_value))
+    max_val = node_value * math.ceil(math.log2(total_steps))
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      self.assertLessEqual(val, max_val)
+
+  @parameterized.named_parameters(
+      ('total4_std1_d1000', 4, [1, 1, 2, 1], 1, [1000], 0.15),
+      ('total4_std1_d10000', 4, [1, 1, 2, 1], 1, [10000], 0.05),
+      ('total7_std1_d1000', 7, [1, 1, 2, 1, 2, 2, 3], 1, [1000], 0.15),
+      ('total8_std1_d1000', 8, [1, 1, 2, 1, 2, 2, 3, 1], 1, [1000], 0.15),
+      ('total8_std2_d1000', 8, [4, 4, 8, 4, 8, 8, 12, 4], 2, [1000], 0.15),
+      ('total8_std0d5_d1000', 8, [0.25, 0.25, 0.5, 0.25, 0.5, 0.5, 0.75, 0.25
+                                 ], 0.5, [1000], 0.15))
+  def test_tree_sum_noise_expected(self, total_steps, expected_variance,
+                                   noise_std, variable_shape, tolerance):
+    # Test whether `tree_aggregator` will output `expected_variance` (within a
+    # relative `tolerance`) in each step when `total_steps` of leaf nodes are
+    # traversed. Each tree node is a `variable_shape` tensor of Gaussian noise
+    # with `noise_std`.
+    random_generator = tree_aggregation.GaussianNoiseGenerator(
+        noise_std, tf.TensorSpec(variable_shape), seed=2020)
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=random_generator)
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      self.assertAllClose(
+          math.sqrt(expected_variance[leaf_node_idx]),
+          tf.math.reduce_std(val),
+          rtol=tolerance)
+
+  def test_cumsum_vector(self, total_steps=15):
+
+    tree_aggregator = tree_aggregation.TreeAggregator(
+        value_generator=ConstantValueGenerator([
+            tf.ones([2, 2], dtype=tf.float32),
+            tf.constant([2], dtype=tf.float32)
+        ]))
+    tree_aggregator_truth = tree_aggregation.TreeAggregator(
+        value_generator=ConstantValueGenerator(1.))
+    state = tree_aggregator.init_state()
+    truth_state = tree_aggregator_truth.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      expected_val, truth_state = tree_aggregator_truth.get_cumsum_and_update(
+          truth_state)
+      self.assertEqual(
+          tree_aggregation.get_step_idx(state),
+          tree_aggregation.get_step_idx(truth_state))
+      expected_result = [
+          expected_val * tf.ones([2, 2], dtype=tf.float32),
+          expected_val * tf.constant([2], dtype=tf.float32),
+      ]
+      tf.nest.map_structure(self.assertAllEqual, val, expected_result)
+
+
+class EfficientTreeAggregatorTest(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('total1_step1', 1, 1, 1.),
+      ('total2_step1', 2, 4. / 3., 1.),
+      ('total3_step1', 3, 4. / 3. + 1., 1.),
+      ('total4_step1', 4, 12. / 7., 1.),
+      ('total5_step1', 5, 12. / 7. + 1., 1.),
+      ('total6_step1', 6, 12. / 7. + 4. / 3., 1.),
+      ('total7_step1', 7, 12. / 7. + 4. / 3. + 1., 1.),
+      ('total8_step1', 8, 32. / 15., 1.),
+      ('total1024_step1', 1024, 11. / (2 - .5**10), 1.),
+      ('total1025_step1', 1025, 11. / (2 - .5**10) + 1., 1.),
+      ('total1026_step1', 1026, 11. / (2 - .5**10) + 4. / 3., 1.),
+      ('total1027_step1', 1027, 11. / (2 - .5**10) + 4. / 3. + 1.0, 1.),
+      ('total1028_step0d5', 1028, (11. / (2 - .5**10) + 12. / 7.) * .5, .5),
+      ('total1029_step2', 1029, (11. / (2 - .5**10) + 12. / 7. + 1.) * 2., 2.),
+  )
+  def test_tree_sum_last_step_expected(self, total_steps, expected_value,
+                                       step_value):
+    # Test whether `tree_aggregator` will output `expected_value` after
+    # `total_steps` of leaf nodes are traversed. The value of each tree node
+    # is a constant `node_value` for test purpose. Note that `node_value`
+    # denotes the "noise" without private values in private algorithms. The
+    # `expected_value` is based on a weighting schema strongly depends on the
+    # depth of the binary tree.
+    tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+        value_generator=ConstantValueGenerator(step_value))
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+    self.assertAllClose(expected_value, val)
+
+  @parameterized.named_parameters(
+      ('total4_std1_d1000', 4, 4. / 7., 1., [1000], 0.15),
+      ('total4_std1_d10000', 4, 4. / 7., 1., [10000], 0.05),
+      ('total7_std1_d1000', 7, 4. / 7. + 2. / 3. + 1., 1, [1000], 0.15),
+      ('total8_std1_d1000', 8, 8. / 15., 1., [1000], 0.15),
+      ('total8_std2_d1000', 8, 8. / 15. * 4, 2., [1000], 0.15),
+      ('total8_std0d5_d1000', 8, 8. / 15. * .25, .5, [1000], 0.15))
+  def test_tree_sum_noise_expected(self, total_steps, expected_variance,
+                                   noise_std, variable_shape, tolerance):
+    # Test whether `tree_aggregator` will output `expected_variance` (within a
+    # relative `tolerance`) after  `total_steps` of leaf nodes are traversed.
+    # Each tree node is a `variable_shape` tensor of Gaussian noise with
+    # `noise_std`. Note that the variance of a tree node is smaller than
+    # the given vanilla node `noise_std` because of the update rule of
+    # `EfficientTreeAggregator`.
+    random_generator = tree_aggregation.GaussianNoiseGenerator(
+        noise_std, tf.TensorSpec(variable_shape), seed=2020)
+    tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+        value_generator=random_generator)
+    state = tree_aggregator.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+    self.assertAllClose(
+        math.sqrt(expected_variance), tf.math.reduce_std(val), rtol=tolerance)
+
+  @parameterized.named_parameters(
+      ('total4_std1_d1000', 4, 1., [1000], 1e-6),
+      ('total30_std2_d1000', 30, 2, [1000], 1e-6),
+      ('total32_std0d5_d1000', 32, .5, [1000], 1e-6),
+      ('total60_std1_d1000', 60, 1, [1000], 1e-6),
+  )
+  def test_tree_sum_noise_efficient(self, total_steps, noise_std,
+                                    variable_shape, tolerance):
+    # Test the variance returned by `EfficientTreeAggregator` is smaller than
+    # `TreeAggregator` (within a relative `tolerance`) after `total_steps` of
+    # leaf nodes are traversed. Each tree node is a `variable_shape` tensor of
+    # Gaussian noise with `noise_std`. A small `tolerance` is used for numerical
+    # stability, `tolerance==0` means `EfficientTreeAggregator` is strictly
+    # better than `TreeAggregator` for reducing variance.
+    random_generator = tree_aggregation.GaussianNoiseGenerator(
+        noise_std, tf.TensorSpec(variable_shape))
+    tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+        value_generator=random_generator)
+    tree_aggregator_baseline = tree_aggregation.TreeAggregator(
+        value_generator=random_generator)
+
+    state = tree_aggregator.init_state()
+    state_baseline = tree_aggregator_baseline.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      val_baseline, state_baseline = tree_aggregator_baseline.get_cumsum_and_update(
+          state_baseline)
+    self.assertLess(
+        tf.math.reduce_variance(val),
+        (1 + tolerance) * tf.math.reduce_variance(val_baseline))
+
+  def test_cumsum_vector(self, total_steps=15):
+
+    tree_aggregator = tree_aggregation.EfficientTreeAggregator(
+        value_generator=ConstantValueGenerator([
+            tf.ones([2, 2], dtype=tf.float32),
+            tf.constant([2], dtype=tf.float32)
+        ]))
+    tree_aggregator_truth = tree_aggregation.EfficientTreeAggregator(
+        value_generator=ConstantValueGenerator(1.))
+    state = tree_aggregator.init_state()
+    truth_state = tree_aggregator_truth.init_state()
+    for leaf_node_idx in range(total_steps):
+      self.assertEqual(leaf_node_idx, tree_aggregation.get_step_idx(state))
+      val, state = tree_aggregator.get_cumsum_and_update(state)
+      expected_val, truth_state = tree_aggregator_truth.get_cumsum_and_update(
+          truth_state)
+      self.assertEqual(
+          tree_aggregation.get_step_idx(state),
+          tree_aggregation.get_step_idx(truth_state))
+      expected_result = [
+          expected_val * tf.ones([2, 2], dtype=tf.float32),
+          expected_val * tf.constant([2], dtype=tf.float32),
+      ]
+      tf.nest.map_structure(self.assertAllClose, val, expected_result)
+
+
+class GaussianNoiseGeneratorTest(tf.test.TestCase):
+
+  def test_random_generator_tf(self,
+                               noise_mean=1.0,
+                               noise_std=1.0,
+                               samples=1000,
+                               tolerance=0.15):
+    g = tree_aggregation.GaussianNoiseGenerator(
+        noise_std, specs=tf.TensorSpec([]), seed=2020)
+    gstate = g.initialize()
+
+    @tf.function
+    def return_noise(state):
+      value, state = g.next(state)
+      return noise_mean + value, state
+
+    noise_values = []
+    for _ in range(samples):
+      value, gstate = return_noise(gstate)
+      noise_values.append(value)
+    noise_values = tf.stack(noise_values)
+    self.assertAllClose(
+        [tf.math.reduce_mean(noise_values),
+         tf.math.reduce_std(noise_values)], [noise_mean, noise_std],
+        rtol=tolerance)
+
+  def test_seed_state(self, seed=1, steps=32, noise_std=0.1):
+    g = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=tf.TensorSpec([]), seed=seed)
+    gstate = g.initialize()
+    g2 = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=tf.TensorSpec([]), seed=seed)
+    gstate2 = g.initialize()
+    self.assertAllEqual(gstate, gstate2)
+    for _ in range(steps):
+      value, gstate = g.next(gstate)
+      value2, gstate2 = g2.next(gstate2)
+      self.assertAllEqual(value, value2)
+      self.assertAllEqual(gstate, gstate2)
+
+  def test_seed_state_nondeterministic(self, steps=32, noise_std=0.1):
+    g = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=tf.TensorSpec([]))
+    gstate = g.initialize()
+    g2 = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=tf.TensorSpec([]))
+    gstate2 = g2.initialize()
+    for _ in range(steps):
+      value, gstate = g.next(gstate)
+      value2, gstate2 = g2.next(gstate2)
+      self.assertNotAllEqual(value, value2)
+      self.assertNotAllEqual(gstate, gstate2)
+
+  def test_seed_state_structure(self, seed=1, steps=32, noise_std=0.1):
+    specs = [tf.TensorSpec([]), tf.TensorSpec([1]), tf.TensorSpec([2, 2])]
+    g = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=specs, seed=seed)
+    gstate = g.initialize()
+    g2 = tree_aggregation.GaussianNoiseGenerator(
+        noise_std=noise_std, specs=specs, seed=seed)
+    gstate2 = g2.initialize()
+    for _ in range(steps):
+      value, gstate = g.next(gstate)
+      value2, gstate2 = g2.next(gstate2)
+      self.assertAllClose(value, value2)
+      self.assertAllEqual(gstate, gstate2)
+
+
+if __name__ == '__main__':
+  tf.test.main()