tensorflow_privacy/research/pate_2017/input.py

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import math
import os
import sys
import tarfile

import numpy as np
from scipy.io import loadmat as loadmat
from six.moves import cPickle as pickle
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

FLAGS = tf.flags.FLAGS


def create_dir_if_needed(dest_directory):
  """
  Create directory if doesn't exist
  :param dest_directory:
  :return: True if everything went well
  """
  if not tf.gfile.IsDirectory(dest_directory):
    tf.gfile.MakeDirs(dest_directory)

  return True


def maybe_download(file_urls, directory):
  """
  Download a set of files in temporary local folder
  :param directory: the directory where to download
  :return: a tuple of filepaths corresponding to the files given as input
  """
  # Create directory if doesn't exist
  assert create_dir_if_needed(directory)

  # This list will include all URLS of the local copy of downloaded files
  result = []

  # For each file of the dataset
  for file_url in file_urls:
    # Extract filename
    filename = file_url.split('/')[-1]

    # If downloading from GitHub, remove suffix ?raw=True from local filename
    if filename.endswith("?raw=true"):
      filename = filename[:-9]

    # Deduce local file url
    #filepath = os.path.join(directory, filename)
    filepath = directory + '/' + filename

    # Add to result list
    result.append(filepath)

    # Test if file already exists
    if not tf.gfile.Exists(filepath):
      def _progress(count, block_size, total_size):
        sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
            float(count * block_size) / float(total_size) * 100.0))
        sys.stdout.flush()
      filepath, _ = urllib.request.urlretrieve(file_url, filepath, _progress)
      print()
      statinfo = os.stat(filepath)
      print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')

  return result


def image_whitening(data):
  """
  Subtracts mean of image and divides by adjusted standard variance (for
  stability). Operations are per image but performed for the entire array.
  :param image: 4D array (ID, Height, Weight, Channel)
  :return: 4D array (ID, Height, Weight, Channel)
  """
  assert len(np.shape(data)) == 4

  # Compute number of pixels in image
  nb_pixels = np.shape(data)[1] * np.shape(data)[2] * np.shape(data)[3]

  # Subtract mean
  mean = np.mean(data, axis=(1, 2, 3))

  ones = np.ones(np.shape(data)[1:4], dtype=np.float32)
  for i in xrange(len(data)):
    data[i, :, :, :] -= mean[i] * ones

  # Compute adjusted standard variance
  adj_std_var = np.maximum(np.ones(len(data), dtype=np.float32) / math.sqrt(nb_pixels), np.std(data, axis=(1, 2, 3)))  # pylint: disable=line-too-long

  # Divide image
  for i in xrange(len(data)):
    data[i, :, :, :] = data[i, :, :, :] / adj_std_var[i]

  print(np.shape(data))

  return data


def extract_svhn(local_url):
  """
  Extract a MATLAB matrix into two numpy arrays with data and labels
  :param local_url:
  :return:
  """

  with tf.gfile.Open(local_url, mode='r') as file_obj:
    # Load MATLAB matrix using scipy IO
    data_dict = loadmat(file_obj)

    # Extract each dictionary (one for data, one for labels)
    data, labels = data_dict['X'], data_dict['y']

    # Set np type
    data = np.asarray(data, dtype=np.float32)
    labels = np.asarray(labels, dtype=np.int32)

    # Transpose data to match TF model input format
    data = data.transpose(3, 0, 1, 2)

    # Fix the SVHN labels which label 0s as 10s
    labels[labels == 10] = 0

    # Fix label dimensions
    labels = labels.reshape(len(labels))

    return data, labels


def unpickle_cifar_dic(file_path):
  """
  Helper function: unpickles a dictionary (used for loading CIFAR)
  :param file_path: filename of the pickle
  :return: tuple of (images, labels)
  """
  file_obj = open(file_path, 'rb')
  data_dict = pickle.load(file_obj)
  file_obj.close()
  return data_dict['data'], data_dict['labels']


def extract_cifar10(local_url, data_dir):
  """
  Extracts the CIFAR-10 dataset and return numpy arrays with the different sets
  :param local_url: where the tar.gz archive is located locally
  :param data_dir: where to extract the archive's file
  :return: a tuple (train data, train labels, test data, test labels)
  """
  # These numpy dumps can be reloaded to avoid performing the pre-processing
  # if they exist in the working directory.
  # Changing the order of this list will ruin the indices below.
  preprocessed_files = ['/cifar10_train.npy',
                        '/cifar10_train_labels.npy',
                        '/cifar10_test.npy',
                        '/cifar10_test_labels.npy']

  all_preprocessed = True
  for file_name in preprocessed_files:
    if not tf.gfile.Exists(data_dir + file_name):
      all_preprocessed = False
      break

  if all_preprocessed:
    # Reload pre-processed training data from numpy dumps
    with tf.gfile.Open(data_dir + preprocessed_files[0], mode='r') as file_obj:
      train_data = np.load(file_obj)
    with tf.gfile.Open(data_dir + preprocessed_files[1], mode='r') as file_obj:
      train_labels = np.load(file_obj)

    # Reload pre-processed testing data from numpy dumps
    with tf.gfile.Open(data_dir + preprocessed_files[2], mode='r') as file_obj:
      test_data = np.load(file_obj)
    with tf.gfile.Open(data_dir + preprocessed_files[3], mode='r') as file_obj:
      test_labels = np.load(file_obj)

  else:
    # Do everything from scratch
    # Define lists of all files we should extract
    train_files = ['data_batch_' + str(i) for i in xrange(1, 6)]
    test_file = ['test_batch']
    cifar10_files = train_files + test_file

    # Check if all files have already been extracted
    need_to_unpack = False
    for file in cifar10_files:
      if not tf.gfile.Exists(file):
        need_to_unpack = True
        break

    # We have to unpack the archive
    if need_to_unpack:
      tarfile.open(local_url, 'r:gz').extractall(data_dir)

    # Load training images and labels
    images = []
    labels = []
    for train_file in train_files:
      # Construct filename
      filename = data_dir + '/cifar-10-batches-py/' + train_file

      # Unpickle dictionary and extract images and labels
      images_tmp, labels_tmp = unpickle_cifar_dic(filename)

      # Append to lists
      images.append(images_tmp)
      labels.append(labels_tmp)

    # Convert to numpy arrays and reshape in the expected format
    train_data = np.asarray(images, dtype=np.float32).reshape((50000, 3, 32, 32))
    train_data = np.swapaxes(train_data, 1, 3)
    train_labels = np.asarray(labels, dtype=np.int32).reshape(50000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[0], train_data)
    np.save(data_dir + preprocessed_files[1], train_labels)

    # Construct filename for test file
    filename = data_dir + '/cifar-10-batches-py/' + test_file[0]

    # Load test images and labels
    test_data, test_images = unpickle_cifar_dic(filename)

    # Convert to numpy arrays and reshape in the expected format
    test_data = np.asarray(test_data, dtype=np.float32).reshape((10000, 3, 32, 32))
    test_data = np.swapaxes(test_data, 1, 3)
    test_labels = np.asarray(test_images, dtype=np.int32).reshape(10000)

    # Save so we don't have to do this again
    np.save(data_dir + preprocessed_files[2], test_data)
    np.save(data_dir + preprocessed_files[3], test_labels)

  return train_data, train_labels, test_data, test_labels


def extract_mnist_data(filename, num_images, image_size, pixel_depth):
  """
  Extract the images into a 4D tensor [image index, y, x, channels].

  Values are rescaled from [0, 255] down to [-0.5, 0.5].
  """
  if not tf.gfile.Exists(filename+'.npy'):
    with gzip.open(filename) as bytestream:
      bytestream.read(16)
      buf = bytestream.read(image_size * image_size * num_images)
      data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
      data = (data - (pixel_depth / 2.0)) / pixel_depth
      data = data.reshape(num_images, image_size, image_size, 1)
      np.save(filename, data)
      return data
  else:
    with tf.gfile.Open(filename+'.npy', mode='r') as file_obj:
      return np.load(file_obj)


def extract_mnist_labels(filename, num_images):
  """
  Extract the labels into a vector of int64 label IDs.
  """
  if not tf.gfile.Exists(filename+'.npy'):
    with gzip.open(filename) as bytestream:
      bytestream.read(8)
      buf = bytestream.read(1 * num_images)
      labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int32)
      np.save(filename, labels)
    return labels
  else:
    with tf.gfile.Open(filename+'.npy', mode='r') as file_obj:
      return np.load(file_obj)


def ld_svhn(extended=False, test_only=False):
  """
  Load the original SVHN data
  :param extended: include extended training data in the returned array
  :param test_only: disables loading of both train and extra -> large speed up
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  # WARNING: changing the order of this list will break indices (cf. below)
  file_urls = ['http://ufldl.stanford.edu/housenumbers/train_32x32.mat',
               'http://ufldl.stanford.edu/housenumbers/test_32x32.mat',
               'http://ufldl.stanford.edu/housenumbers/extra_32x32.mat']

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extra Train, Test, and Extended Train data
  if not test_only:
    # Load and applying whitening to train data
    train_data, train_labels = extract_svhn(local_urls[0])
    train_data = image_whitening(train_data)

    # Load and applying whitening to extended train data
    ext_data, ext_labels = extract_svhn(local_urls[2])
    ext_data = image_whitening(ext_data)

  # Load and applying whitening to test data
  test_data, test_labels = extract_svhn(local_urls[1])
  test_data = image_whitening(test_data)

  if test_only:
    return test_data, test_labels
  else:
    if extended:
      # Stack train data with the extended training data
      train_data = np.vstack((train_data, ext_data))
      train_labels = np.hstack((train_labels, ext_labels))

      return train_data, train_labels, test_data, test_labels
    else:
      # Return training and extended training data separately
      return train_data, train_labels, test_data, test_labels, ext_data, ext_labels


def ld_cifar10(test_only=False):
  """
  Load the original CIFAR10 data
  :param extended: include extended training data in the returned array
  :param test_only: disables loading of both train and extra -> large speed up
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  file_urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz']

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extract archives and return different sets
  dataset = extract_cifar10(local_urls[0], FLAGS.data_dir)

  # Unpack tuple
  train_data, train_labels, test_data, test_labels = dataset

  # Apply whitening to input data
  train_data = image_whitening(train_data)
  test_data = image_whitening(test_data)

  if test_only:
    return test_data, test_labels
  else:
    return train_data, train_labels, test_data, test_labels


def ld_mnist(test_only=False):
  """
  Load the MNIST dataset
  :param extended: include extended training data in the returned array
  :param test_only: disables loading of both train and extra -> large speed up
  :return: tuple of arrays which depend on the parameters
  """
  # Define files to be downloaded
  # WARNING: changing the order of this list will break indices (cf. below)
  file_urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
               'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
              ]

  # Maybe download data and retrieve local storage urls
  local_urls = maybe_download(file_urls, FLAGS.data_dir)

  # Extract it into np arrays.
  train_data = extract_mnist_data(local_urls[0], 60000, 28, 1)
  train_labels = extract_mnist_labels(local_urls[1], 60000)
  test_data = extract_mnist_data(local_urls[2], 10000, 28, 1)
  test_labels = extract_mnist_labels(local_urls[3], 10000)

  if test_only:
    return test_data, test_labels
  else:
    return train_data, train_labels, test_data, test_labels


def partition_dataset(data, labels, nb_teachers, teacher_id):
  """
  Simple partitioning algorithm that returns the right portion of the data
  needed by a given teacher out of a certain nb of teachers
  :param data: input data to be partitioned
  :param labels: output data to be partitioned
  :param nb_teachers: number of teachers in the ensemble (affects size of each
                      partition)
  :param teacher_id: id of partition to retrieve
  :return:
  """

  # Sanity check
  assert len(data) == len(labels)
  assert int(teacher_id) < int(nb_teachers)

  # This will floor the possible number of batches
  batch_len = int(len(data) / nb_teachers)

  # Compute start, end indices of partition
  start = teacher_id * batch_len
  end = (teacher_id+1) * batch_len

  # Slice partition off
  partition_data = data[start:end]
  partition_labels = labels[start:end]

  return partition_data, partition_labels
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`# Copyright 2016 The TensorFlow Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# ==============================================================================`

			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`

			`import gzip`
			`import math`
			`import os`
			`import sys`
			`import tarfile`

reorder imports 2019-03-18 10:23:59 -06:00			`import numpy as np`
			`from scipy.io import loadmat as loadmat`
			`from six.moves import cPickle as pickle`
			`from six.moves import urllib`
			`from six.moves import xrange`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`import tensorflow as tf`

			`FLAGS = tf.flags.FLAGS`


			`def create_dir_if_needed(dest_directory):`
			`"""`
			`Create directory if doesn't exist`
			`:param dest_directory:`
			`:return: True if everything went well`
			`"""`
			`if not tf.gfile.IsDirectory(dest_directory):`
			`tf.gfile.MakeDirs(dest_directory)`

			`return True`


			`def maybe_download(file_urls, directory):`
			`"""`
			`Download a set of files in temporary local folder`
			`:param directory: the directory where to download`
			`:return: a tuple of filepaths corresponding to the files given as input`
			`"""`
			`# Create directory if doesn't exist`
			`assert create_dir_if_needed(directory)`

			`# This list will include all URLS of the local copy of downloaded files`
			`result = []`

			`# For each file of the dataset`
			`for file_url in file_urls:`
			`# Extract filename`
			`filename = file_url.split('/')[-1]`

			`# If downloading from GitHub, remove suffix ?raw=True from local filename`
			`if filename.endswith("?raw=true"):`
			`filename = filename[:-9]`

			`# Deduce local file url`
			`#filepath = os.path.join(directory, filename)`
			`filepath = directory + '/' + filename`

			`# Add to result list`
			`result.append(filepath)`

			`# Test if file already exists`
			`if not tf.gfile.Exists(filepath):`
			`def _progress(count, block_size, total_size):`
			`sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,`
			`float(count * block_size) / float(total_size) * 100.0))`
			`sys.stdout.flush()`
			`filepath, _ = urllib.request.urlretrieve(file_url, filepath, _progress)`
			`print()`
			`statinfo = os.stat(filepath)`
			`print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')`

			`return result`


			`def image_whitening(data):`
			`"""`
			`Subtracts mean of image and divides by adjusted standard variance (for`
			`stability). Operations are per image but performed for the entire array.`
			`:param image: 4D array (ID, Height, Weight, Channel)`
			`:return: 4D array (ID, Height, Weight, Channel)`
			`"""`
			`assert len(np.shape(data)) == 4`

			`# Compute number of pixels in image`
			`nb_pixels = np.shape(data)[1] * np.shape(data)[2] * np.shape(data)[3]`

			`# Subtract mean`
pylint edits 2019-03-18 10:42:59 -06:00			`mean = np.mean(data, axis=(1, 2, 3))`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`ones = np.ones(np.shape(data)[1:4], dtype=np.float32)`
			`for i in xrange(len(data)):`
			`data[i, :, :, :] -= mean[i] * ones`

			`# Compute adjusted standard variance`
pylint edits 2019-03-18 10:42:59 -06:00			`adj_std_var = np.maximum(np.ones(len(data), dtype=np.float32) / math.sqrt(nb_pixels), np.std(data, axis=(1, 2, 3))) # pylint: disable=line-too-long`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Divide image`
			`for i in xrange(len(data)):`
			`data[i, :, :, :] = data[i, :, :, :] / adj_std_var[i]`

			`print(np.shape(data))`

			`return data`


			`def extract_svhn(local_url):`
			`"""`
			`Extract a MATLAB matrix into two numpy arrays with data and labels`
			`:param local_url:`
			`:return:`
			`"""`

			`with tf.gfile.Open(local_url, mode='r') as file_obj:`
			`# Load MATLAB matrix using scipy IO`
rename var 2019-03-18 10:27:57 -06:00			`data_dict = loadmat(file_obj)`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Extract each dictionary (one for data, one for labels)`
quotes 2019-03-18 10:47:36 -06:00			`data, labels = data_dict['X'], data_dict['y']`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Set np type`
			`data = np.asarray(data, dtype=np.float32)`
			`labels = np.asarray(labels, dtype=np.int32)`

			`# Transpose data to match TF model input format`
			`data = data.transpose(3, 0, 1, 2)`

			`# Fix the SVHN labels which label 0s as 10s`
			`labels[labels == 10] = 0`

			`# Fix label dimensions`
			`labels = labels.reshape(len(labels))`

			`return data, labels`


fnames 2019-03-18 10:49:34 -06:00			`def unpickle_cifar_dic(file_path):`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`"""`
			`Helper function: unpickles a dictionary (used for loading CIFAR)`
fnames 2019-03-18 10:49:34 -06:00			`:param file_path: filename of the pickle`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`:return: tuple of (images, labels)`
			`"""`
fnames 2019-03-18 10:49:34 -06:00			`file_obj = open(file_path, 'rb')`
pylint edits 2019-03-18 10:42:59 -06:00			`data_dict = pickle.load(file_obj)`
			`file_obj.close()`
rename var 2019-03-18 10:27:57 -06:00			`return data_dict['data'], data_dict['labels']`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00

			`def extract_cifar10(local_url, data_dir):`
			`"""`
			`Extracts the CIFAR-10 dataset and return numpy arrays with the different sets`
			`:param local_url: where the tar.gz archive is located locally`
			`:param data_dir: where to extract the archive's file`
			`:return: a tuple (train data, train labels, test data, test labels)`
			`"""`
			`# These numpy dumps can be reloaded to avoid performing the pre-processing`
			`# if they exist in the working directory.`
			`# Changing the order of this list will ruin the indices below.`
			`preprocessed_files = ['/cifar10_train.npy',`
			`'/cifar10_train_labels.npy',`
			`'/cifar10_test.npy',`
			`'/cifar10_test_labels.npy']`

			`all_preprocessed = True`
pylint edits 2019-03-18 10:42:59 -06:00			`for file_name in preprocessed_files:`
			`if not tf.gfile.Exists(data_dir + file_name):`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`all_preprocessed = False`
			`break`

			`if all_preprocessed:`
			`# Reload pre-processed training data from numpy dumps`
			`with tf.gfile.Open(data_dir + preprocessed_files[0], mode='r') as file_obj:`
			`train_data = np.load(file_obj)`
			`with tf.gfile.Open(data_dir + preprocessed_files[1], mode='r') as file_obj:`
			`train_labels = np.load(file_obj)`

			`# Reload pre-processed testing data from numpy dumps`
			`with tf.gfile.Open(data_dir + preprocessed_files[2], mode='r') as file_obj:`
			`test_data = np.load(file_obj)`
			`with tf.gfile.Open(data_dir + preprocessed_files[3], mode='r') as file_obj:`
			`test_labels = np.load(file_obj)`

			`else:`
			`# Do everything from scratch`
			`# Define lists of all files we should extract`
quotes 2019-03-18 10:47:36 -06:00			`train_files = ['data_batch_' + str(i) for i in xrange(1, 6)]`
			`test_file = ['test_batch']`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`cifar10_files = train_files + test_file`

			`# Check if all files have already been extracted`
			`need_to_unpack = False`
			`for file in cifar10_files:`
			`if not tf.gfile.Exists(file):`
			`need_to_unpack = True`
			`break`

			`# We have to unpack the archive`
			`if need_to_unpack:`
			`tarfile.open(local_url, 'r:gz').extractall(data_dir)`

			`# Load training images and labels`
			`images = []`
			`labels = []`
fnames 2019-03-18 10:49:34 -06:00			`for train_file in train_files:`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`# Construct filename`
fnames 2019-03-18 10:49:34 -06:00			`filename = data_dir + '/cifar-10-batches-py/' + train_file`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Unpickle dictionary and extract images and labels`
			`images_tmp, labels_tmp = unpickle_cifar_dic(filename)`

			`# Append to lists`
			`images.append(images_tmp)`
			`labels.append(labels_tmp)`

			`# Convert to numpy arrays and reshape in the expected format`
pylint edits 2019-03-18 10:42:59 -06:00			`train_data = np.asarray(images, dtype=np.float32).reshape((50000, 3, 32, 32))`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`train_data = np.swapaxes(train_data, 1, 3)`
			`train_labels = np.asarray(labels, dtype=np.int32).reshape(50000)`

			`# Save so we don't have to do this again`
			`np.save(data_dir + preprocessed_files[0], train_data)`
			`np.save(data_dir + preprocessed_files[1], train_labels)`

			`# Construct filename for test file`
quotes 2019-03-18 10:47:36 -06:00			`filename = data_dir + '/cifar-10-batches-py/' + test_file[0]`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Load test images and labels`
			`test_data, test_images = unpickle_cifar_dic(filename)`

			`# Convert to numpy arrays and reshape in the expected format`
pylint edits 2019-03-18 10:42:59 -06:00			`test_data = np.asarray(test_data, dtype=np.float32).reshape((10000, 3, 32, 32))`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`test_data = np.swapaxes(test_data, 1, 3)`
			`test_labels = np.asarray(test_images, dtype=np.int32).reshape(10000)`

			`# Save so we don't have to do this again`
			`np.save(data_dir + preprocessed_files[2], test_data)`
			`np.save(data_dir + preprocessed_files[3], test_labels)`

			`return train_data, train_labels, test_data, test_labels`


			`def extract_mnist_data(filename, num_images, image_size, pixel_depth):`
			`"""`
			`Extract the images into a 4D tensor [image index, y, x, channels].`

			`Values are rescaled from [0, 255] down to [-0.5, 0.5].`
			`"""`
quotes 2019-03-18 10:47:36 -06:00			`if not tf.gfile.Exists(filename+'.npy'):`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`with gzip.open(filename) as bytestream:`
			`bytestream.read(16)`
			`buf = bytestream.read(image_size * image_size * num_images)`
			`data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)`
			`data = (data - (pixel_depth / 2.0)) / pixel_depth`
			`data = data.reshape(num_images, image_size, image_size, 1)`
			`np.save(filename, data)`
			`return data`
			`else:`
quotes 2019-03-18 10:47:36 -06:00			`with tf.gfile.Open(filename+'.npy', mode='r') as file_obj:`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`return np.load(file_obj)`


			`def extract_mnist_labels(filename, num_images):`
			`"""`
			`Extract the labels into a vector of int64 label IDs.`
			`"""`
quotes 2019-03-18 10:47:36 -06:00			`if not tf.gfile.Exists(filename+'.npy'):`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`with gzip.open(filename) as bytestream:`
			`bytestream.read(8)`
			`buf = bytestream.read(1 * num_images)`
			`labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int32)`
			`np.save(filename, labels)`
			`return labels`
			`else:`
quotes 2019-03-18 10:47:36 -06:00			`with tf.gfile.Open(filename+'.npy', mode='r') as file_obj:`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00			`return np.load(file_obj)`


			`def ld_svhn(extended=False, test_only=False):`
			`"""`
			`Load the original SVHN data`
			`:param extended: include extended training data in the returned array`
			`:param test_only: disables loading of both train and extra -> large speed up`
			`:return: tuple of arrays which depend on the parameters`
			`"""`
			`# Define files to be downloaded`
			`# WARNING: changing the order of this list will break indices (cf. below)`
			`file_urls = ['http://ufldl.stanford.edu/housenumbers/train_32x32.mat',`
			`'http://ufldl.stanford.edu/housenumbers/test_32x32.mat',`
			`'http://ufldl.stanford.edu/housenumbers/extra_32x32.mat']`

			`# Maybe download data and retrieve local storage urls`
			`local_urls = maybe_download(file_urls, FLAGS.data_dir)`

			`# Extra Train, Test, and Extended Train data`
			`if not test_only:`
			`# Load and applying whitening to train data`
			`train_data, train_labels = extract_svhn(local_urls[0])`
			`train_data = image_whitening(train_data)`

			`# Load and applying whitening to extended train data`
			`ext_data, ext_labels = extract_svhn(local_urls[2])`
			`ext_data = image_whitening(ext_data)`

			`# Load and applying whitening to test data`
			`test_data, test_labels = extract_svhn(local_urls[1])`
			`test_data = image_whitening(test_data)`

			`if test_only:`
			`return test_data, test_labels`
			`else:`
			`if extended:`
			`# Stack train data with the extended training data`
			`train_data = np.vstack((train_data, ext_data))`
			`train_labels = np.hstack((train_labels, ext_labels))`

			`return train_data, train_labels, test_data, test_labels`
			`else:`
			`# Return training and extended training data separately`
pylint edits 2019-03-18 10:42:59 -06:00			`return train_data, train_labels, test_data, test_labels, ext_data, ext_labels`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00

			`def ld_cifar10(test_only=False):`
			`"""`
			`Load the original CIFAR10 data`
			`:param extended: include extended training data in the returned array`
			`:param test_only: disables loading of both train and extra -> large speed up`
			`:return: tuple of arrays which depend on the parameters`
			`"""`
			`# Define files to be downloaded`
			`file_urls = ['https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz']`

			`# Maybe download data and retrieve local storage urls`
			`local_urls = maybe_download(file_urls, FLAGS.data_dir)`

			`# Extract archives and return different sets`
			`dataset = extract_cifar10(local_urls[0], FLAGS.data_dir)`

			`# Unpack tuple`
			`train_data, train_labels, test_data, test_labels = dataset`

			`# Apply whitening to input data`
			`train_data = image_whitening(train_data)`
			`test_data = image_whitening(test_data)`

			`if test_only:`
			`return test_data, test_labels`
			`else:`
			`return train_data, train_labels, test_data, test_labels`


			`def ld_mnist(test_only=False):`
			`"""`
			`Load the MNIST dataset`
			`:param extended: include extended training data in the returned array`
			`:param test_only: disables loading of both train and extra -> large speed up`
			`:return: tuple of arrays which depend on the parameters`
			`"""`
			`# Define files to be downloaded`
			`# WARNING: changing the order of this list will break indices (cf. below)`
			`file_urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',`
			`'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',`
			`'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',`
			`'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',`
pylint edits 2019-03-18 10:42:59 -06:00			`]`
Add missing licenses. PiperOrigin-RevId: 229241117 2019-01-14 14:18:31 -07:00
			`# Maybe download data and retrieve local storage urls`
			`local_urls = maybe_download(file_urls, FLAGS.data_dir)`

			`# Extract it into np arrays.`
			`train_data = extract_mnist_data(local_urls[0], 60000, 28, 1)`
			`train_labels = extract_mnist_labels(local_urls[1], 60000)`
			`test_data = extract_mnist_data(local_urls[2], 10000, 28, 1)`
			`test_labels = extract_mnist_labels(local_urls[3], 10000)`

			`if test_only:`
			`return test_data, test_labels`
			`else:`
			`return train_data, train_labels, test_data, test_labels`


			`def partition_dataset(data, labels, nb_teachers, teacher_id):`
			`"""`
			`Simple partitioning algorithm that returns the right portion of the data`
			`needed by a given teacher out of a certain nb of teachers`
			`:param data: input data to be partitioned`
			`:param labels: output data to be partitioned`
			`:param nb_teachers: number of teachers in the ensemble (affects size of each`
			`partition)`
			`:param teacher_id: id of partition to retrieve`
			`:return:`
			`"""`

			`# Sanity check`
			`assert len(data) == len(labels)`
			`assert int(teacher_id) < int(nb_teachers)`

			`# This will floor the possible number of batches`
			`batch_len = int(len(data) / nb_teachers)`

			`# Compute start, end indices of partition`
			`start = teacher_id * batch_len`
			`end = (teacher_id+1) * batch_len`

			`# Slice partition off`
			`partition_data = data[start:end]`
			`partition_labels = labels[start:end]`

			`return partition_data, partition_labels`