# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import math
import os
import sys
import tarfile

import numpy as np
from scipy.io import loadmat as loadmat
from six.moves import cPickle as pickle
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

FLAGS = tf.flags.FLAGS


def create_dir_if_needed(dest_directory):
  """
  Create the directory if it does not already exist
  :param dest_directory: path of the directory to create
  :return: True if everything went well
  """
  if not tf.gfile.IsDirectory(dest_directory):
    tf.gfile.MakeDirs(dest_directory)

  return True


def maybe_download(file_urls, directory):
  """
  Download a set of files to a local folder (only if not already present)
  :param file_urls: the URLs of the files to download
  :param directory: the directory where to download
  :return: a list of local filepaths corresponding to the files given as input
  """
  # Create directory if it doesn't exist
  assert create_dir_if_needed(directory)

  # This list will include the local paths of all downloaded files
  result = []

  # For each file of the dataset
  for file_url in file_urls:
    # Extract filename
    filename = file_url.split('/')[-1]

    # If downloading from GitHub, remove suffix ?raw=True from local filename
    if filename.endswith("?raw=true"):
      filename = filename[:-9]

    # Deduce local file path
    #filepath = os.path.join(directory, filename)
    filepath = directory + '/' + filename

    # Add to result list
    result.append(filepath)

    # Test if file already exists
    if not tf.gfile.Exists(filepath):
      def _progress(count, block_size, total_size):
        sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
            float(count * block_size) / float(total_size) * 100.0))
        sys.stdout.flush()
      filepath, _ = urllib.request.urlretrieve(file_url, filepath, _progress)
      print()
      statinfo = os.stat(filepath)
      print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')

  return result
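

# Example usage (a sketch, not part of the original module; assumes
# FLAGS.data_dir points to a writable directory and the download succeeds):
#
#   urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz']
#   local_paths = maybe_download(urls, FLAGS.data_dir)
#
# Each entry of local_paths is the local copy of the corresponding URL; the
# file is only downloaded if it is not already present on disk.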


def image_whitening(data):
  """
  Subtracts the mean of each image and divides by the adjusted standard
  deviation (for numerical stability). Operations are per image but performed
  for the entire array.
  :param data: 4D array (ID, Height, Width, Channels)
  :return: 4D array (ID, Height, Width, Channels)
  """
  assert len(np.shape(data)) == 4

  # Compute number of pixels in image
  nb_pixels = np.shape(data)[1] * np.shape(data)[2] * np.shape(data)[3]

  # Subtract mean
  mean = np.mean(data, axis=(1, 2, 3))

  ones = np.ones(np.shape(data)[1:4], dtype=np.float32)
  for i in xrange(len(data)):
    data[i, :, :, :] -= mean[i] * ones

  # Compute adjusted standard deviation
  adj_std_var = np.maximum(np.ones(len(data), dtype=np.float32) / math.sqrt(nb_pixels), np.std(data, axis=(1, 2, 3)))  #NOLINT(long-line)

  # Divide image
  for i in xrange(len(data)):
    data[i, :, :, :] = data[i, :, :, :] / adj_std_var[i]

  print(np.shape(data))

  return data
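

# For reference (a sketch of what image_whitening computes, in the spirit of
# tf.image.per_image_standardization): for each image x with N pixels,
#
#   x_white = (x - mean(x)) / max(std(x), 1 / sqrt(N))
#
# where mean and std are taken over all pixels and channels of that image.
# The 1/sqrt(N) floor keeps the division stable for near-constant images.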


def extract_svhn(local_url):
  """
  Extract a MATLAB matrix into two numpy arrays with data and labels
  :param local_url: path of the local .mat file to load
  :return: a tuple (data, labels) of numpy arrays
  """

  with tf.gfile.Open(local_url, mode='rb') as file_obj:
    # Load MATLAB matrix using scipy IO
    data_dict = loadmat(file_obj)

    # Extract the data and label arrays from the dictionary
    data, labels = data_dict["X"], data_dict["y"]

    # Set np type
    data = np.asarray(data, dtype=np.float32)
    labels = np.asarray(labels, dtype=np.int32)

    # Transpose data to match TF model input format
    data = data.transpose(3, 0, 1, 2)

    # Fix the SVHN labels which label 0s as 10s
    labels[labels == 10] = 0

    # Fix label dimensions
    labels = labels.reshape(len(labels))

    return data, labels
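

# Shape check (a sketch added for clarity; assumes the standard SVHN .mat
# layout where "X" has shape (32, 32, 3, N) and "y" has shape (N, 1)):
#
#   data, labels = extract_svhn(FLAGS.data_dir + '/train_32x32.mat')
#   # data.shape   == (N, 32, 32, 3)  -- (ID, Height, Width, Channels)
#   # labels.shape == (N,)            -- digits 0-9, the '10' class mapped to 0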


def unpickle_cifar_dic(file):
  """
  Helper function: unpickles a dictionary (used for loading CIFAR)
  :param file: filename of the pickle
  :return: tuple of (images, labels)
  """
  with open(file, 'rb') as fo:
    cifar_dict = pickle.load(fo)
  return cifar_dict['data'], cifar_dict['labels']


def extract_cifar10(local_url, data_dir):
  """
  Extracts the CIFAR-10 dataset and returns numpy arrays with the different sets
  :param local_url: where the tar.gz archive is located locally
  :param data_dir: where to extract the archive's files
  :return: a tuple (train data, train labels, test data, test labels)
  """
  # These numpy dumps can be reloaded to avoid performing the pre-processing
  # if they exist in the working directory.
  # Changing the order of this list will ruin the indices below.
  preprocessed_files = ['/cifar10_train.npy',
                        '/cifar10_train_labels.npy',
                        '/cifar10_test.npy',
                        '/cifar10_test_labels.npy']

  all_preprocessed = True
  for file in preprocessed_files:
    if not tf.gfile.Exists(data_dir + file):
      all_preprocessed = False
      break

  if all_preprocessed:
    # Reload pre-processed training data from numpy dumps
    with tf.gfile.Open(data_dir + preprocessed_files[0], mode='rb') as file_obj:
      train_data = np.load(file_obj)
    with tf.gfile.Open(data_dir + preprocessed_files[1], mode='rb') as file_obj:
      train_labels = np.load(file_obj)

    # Reload pre-processed testing data from numpy dumps
    with tf.gfile.Open(data_dir + preprocessed_files[2], mode='rb') as file_obj:
      test_data = np.load(file_obj)
    with tf.gfile.Open(data_dir + preprocessed_files[3], mode='rb') as file_obj:
      test_labels = np.load(file_obj)

  else:
    # Do everything from scratch
    # Define lists of all files we should extract
    train_files = ["data_batch_" + str(i) for i in xrange(1, 6)]
    test_file = ["test_batch"]
    cifar10_files = train_files + test_file

    # Check if all files have already been extracted
    need_to_unpack = False
    for file in cifar10_files:
      if not tf.gfile.Exists(file):
        need_to_unpack = True
        break

    # We have to unpack the archive
    if need_to_unpack:
      tarfile.open(local_url, 'r:gz').extractall(data_dir)

    # Load training images and labels
    images = []
    labels = []
    for file in train_files:
      # Construct filename
      filename = data_dir + "/cifar-10-batches-py/" + file

      # Unpickle dictionary and extract images and labels
      images_tmp, labels_tmp = unpickle_cifar_dic(filename)

      # Append to lists
      images.append(images_tmp)
      labels.append(labels_tmp)

    # Convert to numpy arrays and reshape in the expected format
    train_data = np.asarray(images, dtype=np.float32).reshape((50000, 3, 32, 32))
    train_data = np.swapaxes(train_data, 1, 3)
    train_labels = np.asarray(labels, dtype=np.int32).reshape(50000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[0], train_data)
    np.save(data_dir + preprocessed_files[1], train_labels)

    # Construct filename for test file
    filename = data_dir + "/cifar-10-batches-py/" + test_file[0]

    # Load test images and labels
    test_data, test_labels_tmp = unpickle_cifar_dic(filename)

    # Convert to numpy arrays and reshape in the expected format
    test_data = np.asarray(test_data, dtype=np.float32).reshape((10000, 3, 32, 32))
    test_data = np.swapaxes(test_data, 1, 3)
    test_labels = np.asarray(test_labels_tmp, dtype=np.int32).reshape(10000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[2], test_data)
    np.save(data_dir + preprocessed_files[3], test_labels)

  return train_data, train_labels, test_data, test_labels
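

# Note on the layout above (added for clarity): each CIFAR-10 batch stores rows
# of 3072 values in channel-major order, so reshaping to (N, 3, 32, 32) gives
# (ID, Channels, Height, Width) and np.swapaxes(..., 1, 3) yields
# (N, 32, 32, 3). Strictly, swapping axes 1 and 3 also exchanges height and
# width; since CIFAR-10 images are square this only transposes each image and
# reproduces the behaviour of the original code.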


def extract_mnist_data(filename, num_images, image_size, pixel_depth):
  """
  Extract the images into a 4D tensor [image index, y, x, channels].

  Values are rescaled from [0, 255] down to [-0.5, 0.5].
  """
  # if not os.path.exists(file):
  if not tf.gfile.Exists(filename + ".npy"):
    with gzip.open(filename) as bytestream:
      bytestream.read(16)
      buf = bytestream.read(image_size * image_size * num_images)
      data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
      data = (data - (pixel_depth / 2.0)) / pixel_depth
      data = data.reshape(num_images, image_size, image_size, 1)
      np.save(filename, data)
      return data
  else:
    with tf.gfile.Open(filename + ".npy", mode='rb') as file_obj:
      return np.load(file_obj)
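

# For reference (a sketch of the IDX format assumptions made above): MNIST
# image files start with a 16-byte header (magic number, image count, rows,
# columns), hence bytestream.read(16), and label files start with an 8-byte
# header, hence bytestream.read(8) in extract_mnist_labels below. The
# rescaling to [-0.5, 0.5] described in the docstring holds when pixel_depth
# equals the maximum pixel value, i.e. (x - 255/2) / 255.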


def extract_mnist_labels(filename, num_images):
  """
  Extract the labels into a vector of int32 label IDs.
  """
  # if not os.path.exists(file):
  if not tf.gfile.Exists(filename + ".npy"):
    with gzip.open(filename) as bytestream:
      bytestream.read(8)
      buf = bytestream.read(1 * num_images)
      labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int32)
      np.save(filename, labels)
      return labels
  else:
    with tf.gfile.Open(filename + ".npy", mode='rb') as file_obj:
      return np.load(file_obj)


def ld_svhn(extended=False, test_only=False):
  """
  Load the original SVHN data
  :param extended: include extended training data in the returned array
  :param test_only: disables loading of both train and extra -> large speed up
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  # WARNING: changing the order of this list will break indices (cf. below)
  file_urls = ['http://ufldl.stanford.edu/housenumbers/train_32x32.mat',
               'http://ufldl.stanford.edu/housenumbers/test_32x32.mat',
               'http://ufldl.stanford.edu/housenumbers/extra_32x32.mat']

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extract train and extended train data (skipped in test_only mode)
  if not test_only:
    # Load and apply whitening to train data
    train_data, train_labels = extract_svhn(local_urls[0])
    train_data = image_whitening(train_data)

    # Load and apply whitening to extended train data
    ext_data, ext_labels = extract_svhn(local_urls[2])
    ext_data = image_whitening(ext_data)

  # Load and apply whitening to test data
  test_data, test_labels = extract_svhn(local_urls[1])
  test_data = image_whitening(test_data)

  if test_only:
    return test_data, test_labels
  else:
    if extended:
      # Stack train data with the extended training data
      train_data = np.vstack((train_data, ext_data))
      train_labels = np.hstack((train_labels, ext_labels))

      return train_data, train_labels, test_data, test_labels
    else:
      # Return training and extended training data separately
      return train_data, train_labels, test_data, test_labels, ext_data, ext_labels
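

# Example usage (a sketch, not part of the original module; assumes
# FLAGS.data_dir is set and the SVHN downloads succeed):
#
#   train_data, train_labels, test_data, test_labels = ld_svhn(extended=True)
#   test_data, test_labels = ld_svhn(test_only=True)
#
# With extended=False, the extra training split is returned as two additional
# arrays instead of being stacked onto the training set.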


def ld_cifar10(test_only=False):
  """
  Load the original CIFAR10 data
  :param test_only: return only the test data and labels
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  file_urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz']

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extract archives and return different sets
  dataset = extract_cifar10(local_urls[0], FLAGS.data_dir)

  # Unpack tuple
  train_data, train_labels, test_data, test_labels = dataset

  # Apply whitening to input data
  train_data = image_whitening(train_data)
  test_data = image_whitening(test_data)

  if test_only:
    return test_data, test_labels
  else:
    return train_data, train_labels, test_data, test_labels


def ld_mnist(test_only=False):
  """
  Load the MNIST dataset
  :param test_only: return only the test data and labels
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  # WARNING: changing the order of this list will break indices (cf. below)
  file_urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
               ]

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extract it into np arrays.
  train_data = extract_mnist_data(local_urls[0], 60000, 28, 1)
  train_labels = extract_mnist_labels(local_urls[1], 60000)
  test_data = extract_mnist_data(local_urls[2], 10000, 28, 1)
  test_labels = extract_mnist_labels(local_urls[3], 10000)

  if test_only:
    return test_data, test_labels
  else:
    return train_data, train_labels, test_data, test_labels
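

# Example usage (a sketch, not part of the original module; assumes
# FLAGS.data_dir is set):
#
#   train_data, train_labels, test_data, test_labels = ld_mnist()
#   # train_data.shape == (60000, 28, 28, 1)
#   # test_data.shape  == (10000, 28, 28, 1)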


def partition_dataset(data, labels, nb_teachers, teacher_id):
  """
  Simple partitioning algorithm that returns the right portion of the data
  needed by a given teacher out of a certain nb of teachers
  :param data: input data to be partitioned
  :param labels: output data to be partitioned
  :param nb_teachers: number of teachers in the ensemble (affects size of each
                      partition)
  :param teacher_id: id of partition to retrieve
  :return: a tuple (data, labels) for the partition assigned to teacher_id
  """

  # Sanity check
  assert len(data) == len(labels)
  assert int(teacher_id) < int(nb_teachers)

  # This will floor the possible number of batches
  batch_len = int(len(data) / nb_teachers)

  # Compute start, end indices of partition
  start = teacher_id * batch_len
  end = (teacher_id + 1) * batch_len

  # Slice partition off
  partition_data = data[start:end]
  partition_labels = labels[start:end]

  return partition_data, partition_labels
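

# Worked example (added for clarity): with 60000 MNIST training samples and
# nb_teachers=250, batch_len = 60000 // 250 = 240, so teacher_id=3 receives
# samples [720:960]. When len(data) is not a multiple of nb_teachers, the
# remainder samples at the end of the array are simply dropped.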