Add files via upload

This commit is contained in:
woodyx218 2020-01-19 20:27:35 +08:00 committed by GitHub
parent 47a984dc25
commit 1d5c5ac2fc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 435 additions and 258 deletions

178
tutorials/adult_tutorial.py Normal file
View file

@ -0,0 +1,178 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Training a one-layer NN on Adult data with differentially private SGD optimizer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import KFold
# from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
# from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent
from tensorflow_privacy.privacy.optimizers import dp_optimizer
from tensorflow_privacy.privacy.analysis.gdp_accountant import *
#### FLAGS
# Command-line configuration for the tutorial, parsed by absl before main() runs.
FLAGS = flags.FLAGS
# NOTE: the two help-string literals are concatenated by the parser, so the
# first one must end with a space.
flags.DEFINE_boolean('dpsgd', True, 'If True, train with DP-SGD. '
                     'If False, train with vanilla SGD.')
flags.DEFINE_float('learning_rate', .15, 'Learning rate for training')
flags.DEFINE_float('noise_multiplier', 0.55,
                   'Ratio of the standard deviation to the clipping norm')
flags.DEFINE_float('l2_norm_clip', 1, 'Clipping norm')
flags.DEFINE_integer('epochs', 20, 'Number of epochs')
flags.DEFINE_integer('max_mu', 2, 'GDP upper limit')
flags.DEFINE_string('model_dir', None, 'Model directory')
# Number of microbatches handed to the DP optimizer in nn_model_fn. main()
# overwrites this module-level value with the size of each subsampled batch
# before every training step.
microbatches = 256
def nn_model_fn(features, labels, mode):
    """Model function (tf.Estimator) for a one-hidden-layer dense classifier.

    The network is Dense(16, relu) followed by Dense(2) producing the logits.

    Args:
      features: dict holding the input tensor under key 'x'; it is reshaped to
        [-1, 123] before being fed to the network.
      labels: target tensor passed to sparse softmax cross-entropy and to the
        accuracy metric (presumably integer class ids -- confirm against the
        data loader).
      mode: one of tf.estimator.ModeKeys.

    Returns:
      A tf.estimator.EstimatorSpec for TRAIN and EVAL modes; None for any
      other mode, which this tutorial does not handle.
    """
    input_layer = tf.reshape(features['x'], [-1, 123])
    # Call the layers directly: Layer.apply() is only a deprecated alias of
    # Layer.__call__().
    hidden = tf.keras.layers.Dense(16, activation='relu')(input_layer)
    logits = tf.keras.layers.Dense(2)(hidden)

    # Calculate loss as a vector (to support microbatches in DP-SGD).
    vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    # Define mean of loss across minibatch (for reporting through tf.Estimator).
    scalar_loss = tf.reduce_mean(vector_loss)

    # Configure the training op (for TRAIN mode).
    if mode == tf.estimator.ModeKeys.TRAIN:
        if FLAGS.dpsgd:
            # Use DP version of GradientDescentOptimizer. Other optimizers are
            # available in dp_optimizer. Most optimizers inheriting from
            # tf.train.Optimizer should be wrappable in differentially private
            # counterparts by calling dp_optimizer.optimizer_from_args().
            # `microbatches` is the module-level value, read each time this
            # function is invoked.
            optimizer = dp_optimizer.DPGradientDescentGaussianOptimizer(
                l2_norm_clip=FLAGS.l2_norm_clip,
                noise_multiplier=FLAGS.noise_multiplier,
                num_microbatches=microbatches,
                learning_rate=FLAGS.learning_rate)
            opt_loss = vector_loss
        else:
            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate=FLAGS.learning_rate)
            opt_loss = scalar_loss
        global_step = tf.compat.v1.train.get_global_step()
        train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
        # In the following, we pass the mean of the loss (scalar_loss) rather
        # than the vector_loss because tf.estimator requires a scalar loss.
        # This is only used for evaluation and debugging by tf.estimator. The
        # actual loss being minimized is opt_loss defined above and passed to
        # optimizer.minimize().
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=scalar_loss,
                                          train_op=train_op)

    # Add evaluation metrics (for EVAL mode).
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy':
                tf.compat.v1.metrics.accuracy(
                    labels=labels,
                    predictions=tf.argmax(input=logits, axis=1))
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=scalar_loss,
                                          eval_metric_ops=eval_metric_ops)

    # Any other mode (e.g. PREDICT) is not supported by this tutorial.
    return None
def load_adult(path="adult.csv", n_splits=10):
    """Load the Adult data from a CSV file and split it into train/test sets.

    Source of the data (ADULT a2a as in LIBSVM):
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

    Every column except the last is treated as a feature; the last column is
    the label, which is binarised as 1 where the value equals 1 and 0
    otherwise. The split is the last one produced by an unshuffled
    KFold(n_splits): with the defaults this is the same split the original
    loop ended up with.

    Args:
      path: location of the CSV file to read. Defaults to "adult.csv".
      n_splits: number of folds handed to KFold. Defaults to 10. Note that
        main() was written against the default split.

    Returns:
      A tuple (train_data, train_labels, test_data, test_labels) of NumPy
      arrays; the data arrays are float32 and the label arrays are int32.
    """
    data = pd.read_csv(path)

    # Only the final split is used, so pick it explicitly instead of looping
    # over every fold and relying on the leftover loop variables.
    train_index, test_index = list(KFold(n_splits=n_splits).split(data))[-1]
    train, test = data.iloc[train_index, :], data.iloc[test_index, :]

    # Features: all columns but the last.
    train_data = train.iloc[:, :-1].values.astype('float32')
    test_data = test.iloc[:, :-1].values.astype('float32')
    # Labels: last column, mapped to {0, 1}.
    train_labels = (train.iloc[:, -1] == 1).astype('int32').values
    test_labels = (test.iloc[:, -1] == 1).astype('int32').values
    return train_data, train_labels, test_data, test_labels
def main(unused_argv):
    """Train the Adult classifier on randomly subsampled batches.

    Each training step draws a fresh batch by including every training example
    independently with probability batch_size / num_train, trains for one
    step on it, and after each epoch evaluates on the test set and (when
    --dpsgd is set) prints the epsilon and mu reported by the GDP accountant.
    Training stops early once mu exceeds --max_mu.

    Args:
      unused_argv: positional command-line arguments left over by absl;
        ignored.
    """
    # nn_model_fn reads this module-level value when it builds the optimizer,
    # so it is updated below to the actual size of every sampled batch.
    global microbatches

    tf.compat.v1.logging.set_verbosity(0)

    # Load training and test data.
    train_data, train_labels, test_data, test_labels = load_adult()

    # Derive the sampling parameters from the data actually loaded rather than
    # hard-coding the dataset size (29305 for the default adult.csv split).
    num_train = train_data.shape[0]
    batch_size = 256  # expected size of a subsampled batch
    sampling_rate = batch_size / num_train

    # Instantiate the tf.Estimator.
    adult_classifier = tf.compat.v1.estimator.Estimator(
        model_fn=nn_model_fn, model_dir=FLAGS.model_dir)

    # Create tf.Estimator input functions for the training and test data.
    eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
        x={'x': test_data},
        y=test_labels,
        num_epochs=1,
        shuffle=False)

    # Training loop.
    steps_per_epoch = num_train // batch_size
    test_accuracy_list = []
    for epoch in range(1, FLAGS.epochs + 1):
        for _ in range(steps_per_epoch):
            # Keep each example independently with probability sampling_rate.
            whether = np.random.random_sample(num_train) > (1 - sampling_rate)
            subsampling = np.flatnonzero(whether)
            microbatches = len(subsampling)

            train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
                x={'x': train_data[subsampling]},
                y=train_labels[subsampling],
                batch_size=len(subsampling),
                num_epochs=1,
                shuffle=True)
            # Train the model for one step.
            adult_classifier.train(input_fn=train_input_fn, steps=1)

        # Evaluate the model and print results
        eval_results = adult_classifier.evaluate(input_fn=eval_input_fn)
        test_accuracy = eval_results['accuracy']
        test_accuracy_list.append(test_accuracy)
        print('Test accuracy after %d epochs is: %.3f' % (epoch, test_accuracy))

        # Compute the privacy budget expended so far.
        if FLAGS.dpsgd:
            eps = compute_eps_Poisson(epoch, FLAGS.noise_multiplier,
                                      num_train, batch_size, 1e-5)
            mu = compute_mu_Poisson(epoch, FLAGS.noise_multiplier,
                                    num_train, batch_size)
            print('For delta=1e-5, the current epsilon is: %.2f' % eps)
            print('For delta=1e-5, the current mu is: %.2f' % mu)

            # Stop as soon as the GDP budget is exhausted.
            if mu > FLAGS.max_mu:
                break
        else:
            print('Trained with vanilla non-private SGD optimizer')
# Script entry point: main is passed to absl's app.run only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    app.run(main)

View file

@ -19,46 +19,47 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from absl import app
from absl import flags
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from scipy.stats import norm from keras.preprocessing import sequence
from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp #from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent #from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent
from tensorflow_privacy.privacy.optimizers import dp_optimizer from tensorflow_privacy.privacy.optimizers import dp_optimizer
from GDprivacy_accountants import * from tensorflow_privacy.privacy.analysis.gdp_accountant import *
from keras.preprocessing import sequence
#### FLAGS #### FLAGS
tf.flags.DEFINE_boolean('dpsgd', True, 'If True, train with DP-SGD. If False, ' FLAGS = flags.FLAGS
flags.DEFINE_boolean('dpsgd', True, 'If True, train with DP-SGD. If False, '
'train with vanilla SGD.') 'train with vanilla SGD.')
tf.flags.DEFINE_float('learning_rate', 0.02, 'Learning rate for training') flags.DEFINE_float('learning_rate', 0.02, 'Learning rate for training')
tf.flags.DEFINE_float('noise_multiplier', 0.56, flags.DEFINE_float('noise_multiplier', 0.56,
'Ratio of the standard deviation to the clipping norm') 'Ratio of the standard deviation to the clipping norm')
tf.flags.DEFINE_float('l2_norm_clip', 1, 'Clipping norm') flags.DEFINE_float('l2_norm_clip', 1, 'Clipping norm')
tf.flags.DEFINE_integer('epochs', 25, 'Number of epochs') flags.DEFINE_integer('epochs', 25, 'Number of epochs')
tf.flags.DEFINE_integer('max_mu', 2, 'GDP upper limit') flags.DEFINE_integer('max_mu', 2, 'GDP upper limit')
tf.flags.DEFINE_string('model_dir', None, 'Model directory') flags.DEFINE_string('model_dir', None, 'Model directory')
FLAGS = tf.flags.FLAGS
microbatches=512 microbatches = 512
np.random.seed(0)
tf.set_random_seed(0)
max_features = 10000 max_features = 10000
# cut texts after this number of words (among top max_features most common words) # cut texts after this number of words (among top max_features most common words)
maxlen = 256 maxlen = 256
def rnn_model_fn(features, labels, mode): def nn_model_fn(features, labels, mode):
# Define CNN architecture using tf.keras.layers. '''Define NN architecture using tf.keras.layers.'''
input_layer = tf.reshape(features['x'], [-1,maxlen]) input_layer = tf.reshape(features['x'], [-1, maxlen])
y = tf.keras.layers.Embedding(max_features,16).apply(input_layer) y = tf.keras.layers.Embedding(max_features, 16).apply(input_layer)
y=tf.keras.layers.GlobalAveragePooling1D().apply(y) y = tf.keras.layers.GlobalAveragePooling1D().apply(y)
y= tf.keras.layers.Dense(16, activation='relu').apply(y) y = tf.keras.layers.Dense(16, activation='relu').apply(y)
logits= tf.keras.layers.Dense(2).apply(y) logits = tf.keras.layers.Dense(2).apply(y)
# Calculate loss as a vector (to support microbatches in DP-SGD). # Calculate loss as a vector (to support microbatches in DP-SGD).
vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( vector_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
@ -68,7 +69,6 @@ def rnn_model_fn(features, labels, mode):
# Configure the training op (for TRAIN mode). # Configure the training op (for TRAIN mode).
if mode == tf.estimator.ModeKeys.TRAIN: if mode == tf.estimator.ModeKeys.TRAIN:
if FLAGS.dpsgd: if FLAGS.dpsgd:
# Use DP version of GradientDescentOptimizer. Other optimizers are # Use DP version of GradientDescentOptimizer. Other optimizers are
# available in dp_optimizer. Most optimizers inheriting from # available in dp_optimizer. Most optimizers inheriting from
@ -81,11 +81,11 @@ def rnn_model_fn(features, labels, mode):
learning_rate=FLAGS.learning_rate) learning_rate=FLAGS.learning_rate)
opt_loss = vector_loss opt_loss = vector_loss
else: else:
optimizer = tf.train.AdamOptimizer( optimizer = tf.compat.v1.train.AdamOptimizer(
learning_rate=FLAGS.learning_rate) learning_rate=FLAGS.learning_rate)
opt_loss = scalar_loss opt_loss = scalar_loss
global_step = tf.train.get_global_step() global_step = tf.compat.v1.train.get_global_step()
train_op = optimizer.minimize(loss=opt_loss, global_step=global_step) train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
# In the following, we pass the mean of the loss (scalar_loss) rather than # In the following, we pass the mean of the loss (scalar_loss) rather than
# the vector_loss because tf.estimator requires a scalar loss. This is only # the vector_loss because tf.estimator requires a scalar loss. This is only
@ -96,39 +96,43 @@ def rnn_model_fn(features, labels, mode):
train_op=train_op) train_op=train_op)
# Add evaluation metrics (for EVAL mode). # Add evaluation metrics (for EVAL mode).
elif mode == tf.estimator.ModeKeys.EVAL: if mode == tf.estimator.ModeKeys.EVAL:
eval_metric_ops = { eval_metric_ops = {
'accuracy': 'accuracy':
tf.metrics.accuracy( tf.compat.v1.metrics.accuracy(
labels=labels, labels=labels,
predictions=tf.argmax(input=logits, axis=1)) predictions=tf.argmax(input=logits, axis=1))
} }
return tf.estimator.EstimatorSpec(mode=mode, return tf.estimator.EstimatorSpec(mode=mode,
loss=scalar_loss, loss=scalar_loss,
eval_metric_ops=eval_metric_ops) eval_metric_ops=eval_metric_ops)
return None
def load_imdb(): def load_imdb():
(train_data,train_labels), (test_data,test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_features) '''Load IMDB movie reviews data'''
(train_data, train_labels), (test_data, test_labels) = \
tf.keras.datasets.imdb.load_data(num_words=max_features)
train_data = sequence.pad_sequences(train_data, maxlen=maxlen).astype('float32') train_data = sequence.pad_sequences(train_data, maxlen=maxlen).astype('float32')
test_data = sequence.pad_sequences(test_data, maxlen=maxlen).astype('float32') test_data = sequence.pad_sequences(test_data, maxlen=maxlen).astype('float32')
return train_data,train_labels,test_data,test_labels return train_data, train_labels, test_data, test_labels
def main(unused_argv): def main(unused_argv):
tf.logging.set_verbosity(3) '''main'''
tf.compat.v1.logging.set_verbosity(3)
# Load training and test data. # Load training and test data.
train_data,train_labels,test_data,test_labels = load_imdb() train_data, train_labels, test_data, test_labels = load_imdb()
# Instantiate the tf.Estimator. # Instantiate the tf.Estimator.
imdb_classifier = tf.estimator.Estimator(model_fn=rnn_model_fn, imdb_classifier = tf.estimator.Estimator(model_fn=nn_model_fn,
model_dir=FLAGS.model_dir) model_dir=FLAGS.model_dir)
# Create tf.Estimator input functions for the training and test data. # Create tf.Estimator input functions for the training and test data.
eval_input_fn = tf.estimator.inputs.numpy_input_fn( eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
x={'x': test_data}, x={'x': test_data},
y=test_labels, y=test_labels,
num_epochs=1, num_epochs=1,
@ -139,15 +143,13 @@ def main(unused_argv):
test_accuracy_list = [] test_accuracy_list = []
for epoch in range(1, FLAGS.epochs + 1): for epoch in range(1, FLAGS.epochs + 1):
np.random.seed(epoch)
for step in range(steps_per_epoch): for step in range(steps_per_epoch):
tf.set_random_seed(0) whether = np.random.random_sample(25000) > (1-512/25000)
whether=np.random.random_sample(25000)>(1-512/25000) subsampling = [i for i in np.arange(25000) if whether[i]]
subsampling=[i for i in np.arange(25000) if whether[i]]
global microbatches global microbatches
microbatches=len(subsampling) microbatches = len(subsampling)
train_input_fn = tf.estimator.inputs.numpy_input_fn( train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
x={'x': train_data[subsampling]}, x={'x': train_data[subsampling]},
y=train_labels[subsampling], y=train_labels[subsampling],
batch_size=len(subsampling), batch_size=len(subsampling),
@ -164,16 +166,16 @@ def main(unused_argv):
# Compute the privacy budget expended so far. # Compute the privacy budget expended so far.
if FLAGS.dpsgd: if FLAGS.dpsgd:
eps = compute_epsP(epoch,FLAGS.noise_multiplier,25000,512,1e-5) eps = compute_eps_Poisson(epoch, FLAGS.noise_multiplier, 25000, 512, 1e-5)
mu= compute_muP(epoch,FLAGS.noise_multiplier,25000,512) mu = compute_mu_Poisson(epoch, FLAGS.noise_multiplier, 25000, 512)
print('For delta=1e-5, the current epsilon is: %.2f' % eps) print('For delta=1e-5, the current epsilon is: %.2f' % eps)
print('For delta=1e-5, the current mu is: %.2f' % mu) print('For delta=1e-5, the current mu is: %.2f' % mu)
if mu>FLAGS.max_mu: if mu > FLAGS.max_mu:
break break
else: else:
print('Trained with vanilla non-private SGD optimizer') print('Trained with vanilla non-private SGD optimizer')
if __name__ == '__main__': if __name__ == '__main__':
tf.app.run() app.run(main)

View file

@ -24,47 +24,45 @@ from absl import flags
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from scipy.stats import norm import pandas as pd
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp #from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent #from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent
from tensorflow_privacy.privacy.optimizers import dp_optimizer from tensorflow_privacy.privacy.optimizers import dp_optimizer
from GDprivacy_accountants import * from tensorflow_privacy.privacy.analysis.gdp_accountant import *
#### FLAGS #### FLAGS
tf.flags.DEFINE_boolean('dpsgd', True, 'If True, train with DP-SGD. If False, ' FLAGS = flags.FLAGS
flags.DEFINE_boolean('dpsgd', True, 'If True, train with DP-SGD. If False, '
'train with vanilla SGD.') 'train with vanilla SGD.')
tf.flags.DEFINE_float('learning_rate', .01, 'Learning rate for training') flags.DEFINE_float('learning_rate', .01, 'Learning rate for training')
tf.flags.DEFINE_float('noise_multiplier', 0.55, flags.DEFINE_float('noise_multiplier', 0.55,
'Ratio of the standard deviation to the clipping norm') 'Ratio of the standard deviation to the clipping norm')
tf.flags.DEFINE_float('l2_norm_clip', 5, 'Clipping norm') flags.DEFINE_float('l2_norm_clip', 5, 'Clipping norm')
tf.flags.DEFINE_integer('epochs', 25, 'Number of epochs') flags.DEFINE_integer('epochs', 25, 'Number of epochs')
tf.flags.DEFINE_integer('max_mu', 2, 'GDP upper limit') flags.DEFINE_integer('max_mu', 2, 'GDP upper limit')
tf.flags.DEFINE_string('model_dir', None, 'Model directory') flags.DEFINE_string('model_dir', None, 'Model directory')
FLAGS = tf.flags.FLAGS
microbatches=10000 microbatches = 10000
np.random.seed(0)
tf.set_random_seed(0)
n_users=6040
n_movies=3706
def nn_model_fn(features, labels, mode): def nn_model_fn(features, labels, mode):
# Adapted from https://github.com/hexiangnan/neural_collaborative_filtering '''NN adapted from github.com/hexiangnan/neural_collaborative_filtering'''
n_latent_factors_user = 10 n_latent_factors_user = 10
n_latent_factors_movie = 10 n_latent_factors_movie = 10
n_latent_factors_mf = 5 n_latent_factors_mf = 5
user_input = tf.reshape(features['user'], [-1,1]) user_input = tf.reshape(features['user'], [-1, 1])
item_input = tf.reshape(features['movie'], [-1,1]) item_input = tf.reshape(features['movie'], [-1, 1])
mf_embedding_user = tf.keras.layers.Embedding(n_users,n_latent_factors_mf,input_length=1) # number of users: 6040; number of movies: 3706
mf_embedding_item = tf.keras.layers.Embedding(n_movies,n_latent_factors_mf,input_length=1) mf_embedding_user = tf.keras.layers.Embedding(6040, n_latent_factors_mf, input_length=1)
mlp_embedding_user = tf.keras.layers.Embedding(n_users,n_latent_factors_user,input_length=1) mf_embedding_item = tf.keras.layers.Embedding(3706, n_latent_factors_mf, input_length=1)
mlp_embedding_item = tf.keras.layers.Embedding(n_movies,n_latent_factors_movie,input_length=1) mlp_embedding_user = tf.keras.layers.Embedding(6040, n_latent_factors_user, input_length=1)
mlp_embedding_item = tf.keras.layers.Embedding(3706, n_latent_factors_movie, input_length=1)
# GMF part # GMF part
# Flatten the embedding vector as latent features in GMF # Flatten the embedding vector as latent features in GMF
@ -91,7 +89,6 @@ def nn_model_fn(features, labels, mode):
# Configure the training op (for TRAIN mode). # Configure the training op (for TRAIN mode).
if mode == tf.estimator.ModeKeys.TRAIN: if mode == tf.estimator.ModeKeys.TRAIN:
if FLAGS.dpsgd: if FLAGS.dpsgd:
# Use DP version of GradientDescentOptimizer. Other optimizers are # Use DP version of GradientDescentOptimizer. Other optimizers are
# available in dp_optimizer. Most optimizers inheriting from # available in dp_optimizer. Most optimizers inheriting from
@ -104,11 +101,11 @@ def nn_model_fn(features, labels, mode):
learning_rate=FLAGS.learning_rate) learning_rate=FLAGS.learning_rate)
opt_loss = vector_loss opt_loss = vector_loss
else: else:
optimizer = tf.train.AdamOptimizer( optimizer = tf.compat.v1.train.AdamOptimizer(
learning_rate=FLAGS.learning_rate) learning_rate=FLAGS.learning_rate)
opt_loss = scalar_loss opt_loss = scalar_loss
global_step = tf.train.get_global_step() global_step = tf.compat.v1.train.get_global_step()
train_op = optimizer.minimize(loss=opt_loss, global_step=global_step) train_op = optimizer.minimize(loss=opt_loss, global_step=global_step)
# In the following, we pass the mean of the loss (scalar_loss) rather than # In the following, we pass the mean of the loss (scalar_loss) rather than
# the vector_loss because tf.estimator requires a scalar loss. This is only # the vector_loss because tf.estimator requires a scalar loss. This is only
@ -119,56 +116,58 @@ def nn_model_fn(features, labels, mode):
train_op=train_op) train_op=train_op)
# Add evaluation metrics (for EVAL mode). # Add evaluation metrics (for EVAL mode).
elif mode == tf.estimator.ModeKeys.EVAL: if mode == tf.estimator.ModeKeys.EVAL:
eval_metric_ops = { eval_metric_ops = {
'rmse': 'rmse':
tf.metrics.root_mean_squared_error( tf.compat.v1.metrics.root_mean_squared_error(
labels=tf.cast(labels, tf.float32), labels=tf.cast(labels, tf.float32),
predictions=tf.tensordot(a=tf.nn.softmax(logits,axis=1),b=tf.constant(np.array([0,1,2,3,4]),dtype=tf.float32),axes=1)) predictions=tf.tensordot(a=tf.nn.softmax(logits, axis=1),
b=tf.constant(np.array([0, 1, 2, 3, 4]),
dtype=tf.float32),
axes=1))
} }
return tf.estimator.EstimatorSpec(mode=mode, return tf.estimator.EstimatorSpec(mode=mode,
loss=scalar_loss, loss=scalar_loss,
eval_metric_ops=eval_metric_ops) eval_metric_ops=eval_metric_ops)
return None
def load_adult(): def load_adult():
import pandas as pd """Loads MovieLens 1M as from https://grouplens.org/datasets/movielens/1m"""
import numpy as np data = pd.read_csv('ratings.dat', sep='::', header=None,
names=["userId", "movieId", "rating", "timestamp"])
data = pd.read_csv('ratings.dat', sep='::', header=None,names=["userId", "movieId", "rating", "timestamp"]) n_users = len(set(data['userId']))
n_users=len(set(data['userId'])) n_movies = len(set(data['movieId']))
n_movies=len(set(data['movieId'])) print('number of movie: ', n_movies)
print('number of movie: ',n_movies) print('number of user: ', n_users)
print('number of user: ',n_users)
# give unique dense movie index to movieId # give unique dense movie index to movieId
from scipy.stats import rankdata data['movieIndex'] = rankdata(data['movieId'], method='dense')
data['movieIndex']=rankdata(data['movieId'], method='dense')
# minus one to reduce the minimum value to 0, which is the start of col index # minus one to reduce the minimum value to 0, which is the start of col index
print('number of ratings:',data.shape[0]) print('number of ratings:', data.shape[0])
print('percentage of sparsity:',(1-data.shape[0]/n_users/n_movies)*100,'%') print('percentage of sparsity:', (1-data.shape[0]/n_users/n_movies)*100, '%')
from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.2, random_state=100)
train,test=train_test_split(data,test_size=0.2,random_state=100)
return train.values-1, test.values-1, np.mean(train['rating']) return train.values-1, test.values-1, np.mean(train['rating'])
def main(unused_argv): def main(unused_argv):
tf.logging.set_verbosity(3) '''main'''
tf.compat.v1.logging.set_verbosity(3)
# Load training and test data. # Load training and test data.
train_data, test_data, mean = load_adult() train_data, test_data, mean = load_adult()
# Instantiate the tf.Estimator. # Instantiate the tf.Estimator.
adult_classifier = tf.estimator.Estimator(model_fn=nn_model_fn, ml_classifier = tf.estimator.Estimator(model_fn=nn_model_fn,
model_dir=FLAGS.model_dir) model_dir=FLAGS.model_dir)
# Create tf.Estimator input functions for the training and test data. # Create tf.Estimator input functions for the training and test data.
eval_input_fn = tf.estimator.inputs.numpy_input_fn( eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
x={'user': test_data[:,0], 'movie': test_data[:,4]}, x={'user': test_data[:, 0], 'movie': test_data[:, 4]},
y=test_data[:,2], y=test_data[:, 2],
num_epochs=1, num_epochs=1,
shuffle=False) shuffle=False)
@ -176,41 +175,39 @@ def main(unused_argv):
steps_per_epoch = 800167 // 10000 steps_per_epoch = 800167 // 10000
test_accuracy_list = [] test_accuracy_list = []
for epoch in range(1, FLAGS.epochs + 1): for epoch in range(1, FLAGS.epochs + 1):
np.random.seed(epoch)
for step in range(steps_per_epoch): for step in range(steps_per_epoch):
tf.set_random_seed(0) whether = np.random.random_sample(800167) > (1-10000/800167)
whether=np.random.random_sample(800167)>(1-10000/800167) subsampling = [i for i in np.arange(800167) if whether[i]]
subsampling=[i for i in np.arange(800167) if whether[i]]
global microbatches global microbatches
microbatches=len(subsampling) microbatches = len(subsampling)
train_input_fn = tf.estimator.inputs.numpy_input_fn( train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
x={'user': train_data[subsampling,0], 'movie': train_data[subsampling,4]}, x={'user': train_data[subsampling, 0], 'movie': train_data[subsampling, 4]},
y=train_data[subsampling,2], y=train_data[subsampling, 2],
batch_size=len(subsampling), batch_size=len(subsampling),
num_epochs=1, num_epochs=1,
shuffle=True) shuffle=True)
# Train the model for one step. # Train the model for one step.
adult_classifier.train(input_fn=train_input_fn, steps=1) ml_classifier.train(input_fn=train_input_fn, steps=1)
# Evaluate the model and print results # Evaluate the model and print results
eval_results = adult_classifier.evaluate(input_fn=eval_input_fn) eval_results = ml_classifier.evaluate(input_fn=eval_input_fn)
test_accuracy = eval_results['rmse'] test_accuracy = eval_results['rmse']
test_accuracy_list.append(test_accuracy) test_accuracy_list.append(test_accuracy)
print('Test RMSE after %d epochs is: %.3f' % (epoch, test_accuracy)) print('Test RMSE after %d epochs is: %.3f' % (epoch, test_accuracy))
# Compute the privacy budget expended so far. # Compute the privacy budget expended so far.
if FLAGS.dpsgd: if FLAGS.dpsgd:
eps = compute_epsP(epoch,FLAGS.noise_multiplier,800167,10000,1e-6) eps = compute_eps_Poisson(epoch, FLAGS.noise_multiplier, 800167, 10000, 1e-6)
mu= compute_muP(epoch,FLAGS.noise_multiplier,800167,10000) mu = compute_mu_Poisson(epoch, FLAGS.noise_multiplier, 800167, 10000)
print('For delta=1e-6, the current epsilon is: %.2f' % eps) print('For delta=1e-6, the current epsilon is: %.2f' % eps)
print('For delta=1e-6, the current mu is: %.2f' % mu) print('For delta=1e-6, the current mu is: %.2f' % mu)
if mu>FLAGS.max_mu: if mu > FLAGS.max_mu:
break break
else: else:
print('Trained with vanilla non-private SGD optimizer') print('Trained with vanilla non-private SGD optimizer')
if __name__ == '__main__': if __name__ == '__main__':
tf.app.run() app.run(main)