tensorflow_privacy/tensorflow_privacy/privacy/logistic_regression/datasets.py
A. Unique TensorFlower c665281c55 Implementation of Differentially Private Logistic Regression.
PiperOrigin-RevId: 458266079
2022-06-30 11:01:02 -07:00

130 lines
5.1 KiB
Python

# Copyright 2021, The TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for generating train and test data for logistic regression models.

Includes two types of datasets:

- Synthetic linearly separable labeled examples.
  Here, in the binary classification case, we generate training examples by
  first sampling a random weight vector w from a multivariate Gaussian
  distribution. Then, for each training example, we randomly sample a point x,
  also from a multivariate Gaussian distribution, and then set the label y
  equal to 1 if the inner product of w and x is positive, and equal to 0
  otherwise. As such, the training data is linearly separable.
  More generally, in the case where there are num_classes classes, we sample
  num_classes different w vectors. After sampling x, we set its class label y
  to the class whose corresponding w vector has the largest inner product
  with x.
- MNIST 10-class classification dataset.
"""
import dataclasses
from typing import Tuple, Optional
import numpy as np
from sklearn import preprocessing
import tensorflow as tf
@dataclasses.dataclass
class RegressionDataset:
  """Container for the labeled examples of a classification dataset.

  Attributes:
    points: array of shape (num_examples, dimension) containing the points to
      be classified.
    labels: array of shape (num_examples,) containing the corresponding labels,
      each belonging to the set {0,1,...,num_classes-1}, where num_classes is
      the number of classes.
    weights: dimension by num_classes matrix containing the coefficients of the
      linear separator, where dimension is the dimension and num_classes is the
      number of classes. None when no separator is associated with the data
      (for example, the MNIST datasets built in this module). Defaults to None.
  """
  points: np.ndarray
  labels: np.ndarray
  # Defaulting to None lets datasets without a known separator be built from
  # points and labels alone; existing three-argument callers are unaffected.
  weights: Optional[np.ndarray] = None
def linearly_separable_labeled_examples(
    num_examples: int, weights: np.ndarray) -> RegressionDataset:
  """Samples random points and labels them with the separator `weights`.

  Args:
    num_examples: how many labeled examples to produce.
    weights: dimension by num_classes matrix holding the coefficients of the
      linear separator, where dimension is the dimension of the points and
      num_classes is the number of classes.

  Returns:
    RegressionDataset holding the sampled points, their labels and the given
    weights. Each point is normalized to unit l2-norm.
  """
  sample_shape = (num_examples, weights.shape[0])
  # Draw Gaussian points, then l2-normalize each of them.
  unit_points = preprocessing.normalize(np.random.normal(size=sample_shape))
  # A point's label is the index of the weight column with the largest inner
  # product with that point.
  class_scores = np.matmul(unit_points, weights)
  return RegressionDataset(
      points=unit_points,
      labels=np.argmax(class_scores, axis=1),
      weights=weights)
def synthetic_linearly_separable_data(
    num_train: int, num_test: int, dimension: int,
    num_classes: int) -> Tuple[RegressionDataset, RegressionDataset]:
  """Builds synthetic train and test datasets sharing one linear separator.

  Args:
    num_train: number of training data points.
    num_test: number of test data points.
    dimension: the dimension of the classification problem.
    num_classes: number of classes; must be at least 2.

  Returns:
    train_dataset: num_train labeled examples, with unit l2-norm points.
    test_dataset: num_test labeled examples, with unit l2-norm points.

  Raises:
    ValueError: if num_classes is smaller than 2.
  """
  if num_classes < 2:
    raise ValueError(f'num_classes must be at least 2. It is {num_classes}.')

  # One Gaussian weight matrix serves as the separator for both splits.
  separator = np.random.normal(size=(dimension, num_classes))

  # The generator is consumed in order, so the training split is sampled
  # before the test split.
  train_dataset, test_dataset = (
      linearly_separable_labeled_examples(split_size, separator)
      for split_size in (num_train, num_test))
  return train_dataset, test_dataset
def mnist_dataset() -> Tuple[RegressionDataset, RegressionDataset]:
  """Loads MNIST and returns its (normalized) train and test datasets.

  The examples of each split are reshaped into a two-dimensional array with
  one row per example, and the rows are then l2-normalized.

  Returns:
    train_dataset: MNIST labeled examples, with unit l2-norm points.
    test_dataset: MNIST labeled examples, with unit l2-norm points.
  """
  (train_images, train_labels), (test_images, test_labels) = (
      tf.keras.datasets.mnist.load_data())

  def _flatten_and_normalize(images):
    # Collapse all trailing axes so that each example becomes a single row,
    # then l2-normalize the rows.
    flattened = images.reshape((images.shape[0], -1))
    return preprocessing.normalize(flattened)

  # MNIST has no associated linear separator, so weights is None.
  return (
      RegressionDataset(
          _flatten_and_normalize(train_images), train_labels, None),
      RegressionDataset(
          _flatten_and_normalize(test_images), test_labels, None))