tensorflow_privacy/tutorials/movielens_tutorial.py

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Training a deep NN on MovieLens with differentially private Adam optimizer."""
from absl import app
from absl import flags
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import estimator as tf_estimator
from tensorflow.compat.v1 import estimator as tf_compat_v1_estimator
from tensorflow_privacy.privacy.analysis.gdp_accountant import compute_eps_poisson
from tensorflow_privacy.privacy.analysis.gdp_accountant import compute_mu_poisson
from tensorflow_privacy.privacy.optimizers import dp_optimizer
#### FLAGS
FLAGS = flags.FLAGS
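# DP-SGD hyperparameters: each microbatch gradient is clipped to l2_norm_clip,
# and Gaussian noise with standard deviation noise_multiplier * l2_norm_clip is
# added to the sum of the clipped gradients before averaging.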
flags.DEFINE_boolean(
    'dpsgd', True, 'If True, train with DP-SGD. If False, '
    'train with vanilla SGD.')
flags.DEFINE_float('learning_rate', .01, 'Learning rate for training')
flags.DEFINE_float('noise_multiplier', 0.55,
                   'Ratio of the standard deviation to the clipping norm')
flags.DEFINE_float('l2_norm_clip', 5, 'Clipping norm')
flags.DEFINE_integer('epochs', 25, 'Number of epochs')
flags.DEFINE_integer('max_mu', 2, 'GDP upper limit')
flags.DEFINE_string('model_dir', None, 'Model directory')
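
# `sampling_batch` is the expected batch size under Poisson sampling;
# `microbatches` is overwritten in main() with the actual sample size so that
# every example forms its own microbatch; `num_examples` is the size of the
# 80% MovieLens 1M training split.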
sampling_batch = 10000
microbatches = 10000
num_examples = 800167


def nn_model_fn(features, labels, mode):
  """NN adapted from github.com/hexiangnan/neural_collaborative_filtering."""
  n_latent_factors_user = 10
  n_latent_factors_movie = 10
  n_latent_factors_mf = 5

  user_input = tf.reshape(features['user'], [-1, 1])
  item_input = tf.reshape(features['movie'], [-1, 1])

  # number of users: 6040; number of movies: 3706
  mf_embedding_user = tf.keras.layers.Embedding(
      6040, n_latent_factors_mf, input_length=1)
  mf_embedding_item = tf.keras.layers.Embedding(
      3706, n_latent_factors_mf, input_length=1)
  mlp_embedding_user = tf.keras.layers.Embedding(
      6040, n_latent_factors_user, input_length=1)
  mlp_embedding_item = tf.keras.layers.Embedding(
      3706, n_latent_factors_movie, input_length=1)
  # GMF part
  # Flatten the embedding vector as latent features in GMF
  mf_user_latent = tf.keras.layers.Flatten()(mf_embedding_user(user_input))
  mf_item_latent = tf.keras.layers.Flatten()(mf_embedding_item(item_input))
  # Element-wise multiply
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # MLP part
  # Flatten the embedding vector as latent features in MLP
  mlp_user_latent = tf.keras.layers.Flatten()(mlp_embedding_user(user_input))
  mlp_item_latent = tf.keras.layers.Flatten()(mlp_embedding_item(item_input))
  # Concatenation of two latent features
  mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])

  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])
  logits = tf.keras.layers.Dense(5)(predict_vector)

  # Calculate loss as a vector (to support microbatches in DP-SGD).
  vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  # Define mean of loss across minibatch (for reporting through tf.Estimator).
  scalar_loss = tf.reduce_mean(vector_loss)
  # Configure the training op (for TRAIN mode).
  if mode == tf_estimator.ModeKeys.TRAIN:
    if FLAGS.dpsgd:
      # Use the DP version of AdamOptimizer. Other optimizers are
      # available in dp_optimizer. Most optimizers inheriting from
      # tf.compat.v1.train.Optimizer should be wrappable in differentially
      # private counterparts by calling dp_optimizer.optimizer_from_args().
      optimizer = dp_optimizer.DPAdamGaussianOptimizer(
          l2_norm_clip=FLAGS.l2_norm_clip,
          noise_multiplier=FLAGS.noise_multiplier,
          num_microbatches=microbatches,
          learning_rate=FLAGS.learning_rate)
      opt_loss = vector_loss
    else:
      optimizer = tf.compat.v1.train.AdamOptimizer(
          learning_rate=FLAGS.learning_rate)
      opt_loss = scalar_loss
    global_step = tf.compat.v1.train.get_global_step()
    train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
    # In the following, we pass the mean of the loss (scalar_loss) rather than
    # the vector_loss because tf.estimator requires a scalar loss. This is only
    # used for evaluation and debugging by tf.estimator. The actual loss being
    # minimized is opt_loss defined above and passed to optimizer.minimize().
    return tf_estimator.EstimatorSpec(
        mode=mode, loss=scalar_loss, train_op=train_op)
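
  # The 5 logits correspond to star ratings 0-4 (ratings are shifted down by
  # one in load_movielens). The eval metric below reports RMSE between the true
  # rating and the expected rating under the predicted softmax distribution,
  # i.e. the dot product of softmax(logits) with [0, 1, 2, 3, 4].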
  # Add evaluation metrics (for EVAL mode).
  if mode == tf_estimator.ModeKeys.EVAL:
    eval_metric_ops = {
        'rmse':
            tf.compat.v1.metrics.root_mean_squared_error(
                labels=tf.cast(labels, tf.float32),
                predictions=tf.tensordot(
                    a=tf.nn.softmax(logits, axis=1),
                    b=tf.constant(np.array([0, 1, 2, 3, 4]), dtype=tf.float32),
                    axes=1))
    }
    return tf_estimator.EstimatorSpec(
        mode=mode, loss=scalar_loss, eval_metric_ops=eval_metric_ops)
  return None
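

# The loader below expects the `ratings.dat` file from the MovieLens 1M
# archive (https://grouplens.org/datasets/movielens/1m) to be present in the
# current working directory.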
def load_movielens():
  """Loads MovieLens 1M from https://grouplens.org/datasets/movielens/1m."""
  data = pd.read_csv(
      'ratings.dat',
      sep='::',
      header=None,
      names=['userId', 'movieId', 'rating', 'timestamp'])
  n_users = len(set(data['userId']))
  n_movies = len(set(data['movieId']))
  print('number of movies: ', n_movies)
  print('number of users: ', n_users)

  # Give each movieId a unique dense movie index.
  data['movieIndex'] = stats.rankdata(data['movieId'], method='dense')
  # The -1 applied at return time shifts all values (indices and ratings) so
  # that they start at 0.

  print('number of ratings:', data.shape[0])
  print('percentage of sparsity:',
        (1 - data.shape[0] / n_users / n_movies) * 100, '%')
  train, test = train_test_split(data, test_size=0.2, random_state=100)

  return train.values - 1, test.values - 1, np.mean(train['rating'])


def main(unused_argv):
  tf.compat.v1.logging.set_verbosity(3)

  # Load training and test data.
  train_data, test_data, _ = load_movielens()

  # Instantiate the tf.Estimator.
  ml_classifier = tf_estimator.Estimator(
      model_fn=nn_model_fn, model_dir=FLAGS.model_dir)
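
  # After load_movielens(), each row is [userId, movieId, rating, timestamp,
  # movieIndex] with every value shifted down by one, so column 0 is the user
  # index, column 4 is the dense movie index, and column 2 is the rating in
  # 0..4.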
  # Create tf.Estimator input functions for the training and test data.
  eval_input_fn = tf_compat_v1_estimator.inputs.numpy_input_fn(
      x={
          'user': test_data[:, 0],
          'movie': test_data[:, 4]
      },
      y=test_data[:, 2],
      num_epochs=1,
      shuffle=False)
  # Training loop.
  steps_per_epoch = num_examples // sampling_batch
  test_accuracy_list = []
  for epoch in range(1, FLAGS.epochs + 1):
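    # Poisson subsampling: each training example is included independently with
    # probability sampling_batch / num_examples, so the realized batch size
    # varies around sampling_batch. This is the sampling scheme assumed by the
    # GDP accountant used below.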
    for _ in range(steps_per_epoch):
      whether = np.random.random_sample(num_examples) > (
          1 - sampling_batch / num_examples)
      subsampling = [i for i in np.arange(num_examples) if whether[i]]
      global microbatches
      microbatches = len(subsampling)

      train_input_fn = tf_compat_v1_estimator.inputs.numpy_input_fn(
          x={
              'user': train_data[subsampling, 0],
              'movie': train_data[subsampling, 4]
          },
          y=train_data[subsampling, 2],
          batch_size=len(subsampling),
          num_epochs=1,
          shuffle=True)

      # Train the model for one step.
      ml_classifier.train(input_fn=train_input_fn, steps=1)
    # Evaluate the model and print results.
    eval_results = ml_classifier.evaluate(input_fn=eval_input_fn)
    test_accuracy = eval_results['rmse']
    test_accuracy_list.append(test_accuracy)
    print('Test RMSE after %d epochs is: %.3f' % (epoch, test_accuracy))
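
    # Privacy accounting with Gaussian differential privacy (GDP):
    # compute_mu_poisson gives the mu-GDP parameter of Poisson-subsampled
    # DP-SGD after `epoch` epochs, and compute_eps_poisson converts it to an
    # (epsilon, delta) guarantee at delta = 1e-6. Training stops early once mu
    # exceeds max_mu.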
    # Compute the privacy budget expended so far.
    if FLAGS.dpsgd:
      eps = compute_eps_poisson(epoch, FLAGS.noise_multiplier, num_examples,
                                sampling_batch, 1e-6)
      mu = compute_mu_poisson(epoch, FLAGS.noise_multiplier, num_examples,
                              sampling_batch)
      print('For delta=1e-6, the current epsilon is: %.2f' % eps)
      print('For delta=1e-6, the current mu is: %.2f' % mu)

      if mu > FLAGS.max_mu:
        break
    else:
      print('Trained with vanilla non-private Adam optimizer')


if __name__ == '__main__':
  app.run(main)