# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

"""dataset loader"""
|
|
|
|
# pylint: skip-file
|
|
# pyformat: disable
|
|
|
|
import os
import ssl
import tarfile
import urllib.request

from my_logistic_regression import MyLogisticRegression
import numpy as np
import requests
from sklearn import preprocessing
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import torch
from torchvision import datasets, transforms


PATH_PREFIX = './src/datasets_directory'
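# NOTE: The line below disables TLS certificate verification for every HTTPS
# request made by this process, presumably so the dataset downloads in this
# module succeed despite certificate issues. This is an experiment-only
# convenience and weakens transport security.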
ssl._create_default_https_context = ssl._create_unverified_context


def normalize_fvec(x_train):
  """Normalizes each feature to zero mean and unit standard deviation."""
  feature_mean = np.mean(x_train, axis=0)
  feature_std = np.std(x_train, axis=0)
  # Note: a constant feature column would make feature_std zero and the
  # division produce NaNs/infs; the datasets below are assumed to avoid this.
  x_train = (x_train - feature_mean) / feature_std
  return x_train


def backtracking_ls(lrp, dir_srch, w_start, alpha=0.4, beta=0.95):
  """Backtracking line search.

  Args:
    lrp: a MyLogisticRegression problem instance.
    dir_srch: the (possibly noisy) gradient direction to search along.
    w_start: the current point.
    alpha: sufficient-decrease parameter of the Armijo condition.
    beta: multiplicative shrinkage factor; alpha and beta trade off the
      precision and the cost of the line search.

  Returns:
    A step size that is close to optimal.
  """
  # Start from a large step and shrink geometrically.
  step_size = 100
  val_0 = lrp.loss(w_start)
  inner_prod = np.dot(dir_srch, lrp.grad(w_start))
  # Shrink the step size until the Armijo sufficient-decrease condition
  # loss(w - t * d) < loss(w) - alpha * t * <d, grad(w)> holds.
  while (
      lrp.loss(w_start - step_size * dir_srch)
      >= val_0 - step_size * alpha * inner_prod
  ):
    step_size = beta * step_size
    if step_size < 1e-6:
      break
  return step_size
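# A minimal usage sketch for backtracking_ls (hypothetical names): with a
# MyLogisticRegression instance `lrp` and current iterate `w`, a line-searched
# gradient step is
#
#   d = lrp.grad(w)
#   t = backtracking_ls(lrp, d, w)
#   w = w - t * d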


def newton(dataset, w_init, bias=True):
  """Newton's method with line search, without privacy constraints.

  Args:
    dataset: a (feature_vecs, labels) pair.
    w_init: the initialization point.
    bias: whether to prepend a bias (intercept) feature of ones.

  Returns:
    The model parameters.
  """
  feature_vecs, labels = dataset
  if bias:
    feature_vecs = np.hstack(
        (np.ones(shape=(np.shape(feature_vecs)[0], 1)), feature_vecs)
    )
  lrp = MyLogisticRegression(feature_vecs, labels, reg=1e-9)
  w_cur = w_init
  for _ in range(8):
    hess = lrp.hess(w_cur)
    dir_srch = np.linalg.solve(hess, lrp.grad_wor(w_cur))
    step_size = backtracking_ls(lrp, dir_srch, w_cur)
    w_cur = w_cur - step_size * dir_srch
  # Fall back to the initialization if the iterations did not improve the loss.
  if lrp.loss_wor(w_cur) < lrp.loss_wor(w_init):
    w_out = w_cur
  else:
    w_out = w_init
  return w_out
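# A minimal usage sketch for newton (hypothetical data): with features X of
# shape (n, d) and labels y in {-1, +1}, refine a zero initialization; with
# bias=True the parameter vector has d + 1 entries, intercept first:
#
#   w = newton((X, y), np.zeros(X.shape[1] + 1))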


class Mydatasets:
  """Represents the datasets we use for experiments."""

  def __init__(self):
    data_dir = PATH_PREFIX + '/data'
    cache_dir = PATH_PREFIX + '/cache_datasets'
    if not os.path.exists(data_dir):
      os.mkdir(data_dir)
    if not os.path.exists(cache_dir):
      os.mkdir(cache_dir)

  def find_optimal_classifier(self, dataset, bias=True):
    """Finds the optimal weight vector of the logistic regression model.

    Used for the problems with real datasets: scikit-learn provides a
    (weakly regularized) warm start, which Newton's method then refines.

    Args:
      dataset: the training dataset.
      bias: whether the logistic model includes a bias term.

    Returns:
      The optimal weight vector, intercept first when bias is True.
    """
    inputs_vec, labels = dataset
    # scikit-learn's C is an inverse regularization strength, so C = 1 / reg
    # matches the reg=1e-9 ridge term used in newton() above.
    reg = 1e-9
    if bias:
      model_lr = LogisticRegression(max_iter=200, C=1 / reg).fit(
          inputs_vec, labels
      )
      w_opt1 = np.concatenate([model_lr.intercept_, np.squeeze(model_lr.coef_)])
      w_opt = newton(dataset, w_opt1, bias)
    else:
      model_lr = LogisticRegression(
          max_iter=200, fit_intercept=False, C=1 / reg
      ).fit(inputs_vec, labels)
      w_opt1 = np.squeeze(model_lr.coef_)
      w_opt = newton(dataset, w_opt1, bias)
    return w_opt
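  # A minimal usage sketch (hypothetical data): with features X and labels y
  # in {-1, +1}, the returned vector has X.shape[1] + 1 entries (intercept
  # first) under the default bias=True:
  #
  #   w_opt = Mydatasets().find_optimal_classifier((X, y))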
  def fmnist_dataset(self):
    """FashionMNIST dataset, restricted to classes 0 and 3."""
    transform_data = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
    )
    train_data_trans = datasets.FashionMNIST(
        root=PATH_PREFIX + '/data',
        download=True,
        train=True,
        transform=transform_data,
    )
    # Load the full training set as a single batch and flatten the images.
    train_loader = torch.utils.data.DataLoader(
        train_data_trans, batch_size=len(train_data_trans)
    )
    x_batch, y_batch = next(iter(train_loader))
    x_train = x_batch.numpy()
    x_train = x_train.reshape(len(x_train), -1)
    y_train = y_batch.numpy()
    # Keep only classes 0 and 3 and relabel them as -1 / +1 for binary
    # logistic regression.
    label0 = 0
    label1 = 3
    indx0 = np.nonzero(y_train == label0)[0]
    indx1 = np.nonzero(y_train == label1)[0]
    labels = y_train.copy()
    labels[indx0] = -1
    labels[indx1] = 1
    indx = np.concatenate((indx0, indx1))
    x_train = x_train[indx]
    labels = labels[indx]
    dataset = x_train, labels
    w_opt = self.find_optimal_classifier(dataset, bias=False)
    return x_train, labels, w_opt

  def a1a_dataset(self):
    """a1a dataset from the LIBSVM repository."""
    a1a_url = (
        'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t'
    )
    data_path = PATH_PREFIX + '/data/a1a'
    if not os.path.exists(data_path):
      _ = urllib.request.urlretrieve(a1a_url, data_path)
    data = sklearn.datasets.load_svmlight_file(data_path)
    inputs_vec, labels = data[0], data[1]
    inputs_vec = inputs_vec.toarray()
    # Standardize the features; the labels are already in {-1, +1}.
    scaler = preprocessing.StandardScaler().fit(inputs_vec)
    inputs_vec = scaler.transform(inputs_vec)
    labels = labels.astype(float)
    dataset = inputs_vec, labels
    w_opt = self.find_optimal_classifier(dataset)
    # Prepend the bias feature to match the intercept-first weight vector.
    inputs_vec = np.hstack(
        (np.ones(shape=(np.shape(inputs_vec)[0], 1)), inputs_vec)
    )
    return inputs_vec, labels, w_opt

  def protein_dataset(self):
    """Protein homology dataset from KDD Cup 2004."""
    path_protein = PATH_PREFIX + '/data/protein/'
    if not os.path.exists(path_protein):
      os.mkdir(path_protein)
    protein_url = (
        'https://kdd.org/cupfiles/KDDCupData/2004/data_kddcup04.tar.gz'
    )
    protein_file = PATH_PREFIX + '/data/protein/data_kddcup04.tar.gz'
    response = requests.get(protein_url, stream=True, timeout=100)
    if response.status_code == 200:
      with open(protein_file, 'wb') as file_data:
        file_data.write(response.raw.read())
      with tarfile.open(protein_file, 'r:gz') as tar:
        tar.extractall(path_protein)
    # Column 2 holds the labels; columns 3 onwards hold the features.
    train_data = np.loadtxt(PATH_PREFIX + '/data/protein/bio_train.dat')
    x_train = train_data[:, 3:]
    y_train = train_data[:, 2]
    indx0 = np.nonzero(y_train == 0)[0]
    indx1 = np.nonzero(y_train == 1)[0]
    labels = y_train.copy()
    labels[indx0] = -1
    labels[indx1] = 1
    # Subsample 50000 examples with a fixed seed so that the subset is
    # reproducible across runs, then restore nondeterministic seeding.
    indx = np.arange(len(x_train))
    np.random.seed(3000)
    indx_sample = np.random.choice(indx, 50000, replace=False)
    np.random.seed(None)
    x_train = x_train[indx_sample]
    labels = labels[indx_sample]
    x_train = normalize_fvec(x_train)
    w_opt = self.find_optimal_classifier((x_train, labels))
    x_train = np.hstack((np.ones(shape=(np.shape(x_train)[0], 1)), x_train))
    return x_train, labels, w_opt

  def synthetic_dataset(self, num_samples=10000, dim=100):
    """Generates a synthetic dataset for logistic regression.

    Features are unit vectors (by default uniformly random on the sphere).
    Labels are sampled from the logistic distribution, so w_star is the
    "true" solution.

    Args:
      num_samples: the number of samples.
      dim: the dimension of the features.
    """
    mean = np.zeros(dim)
    cov = np.eye(dim)
    # Sample standard Gaussian vectors and normalize them to unit norm.
    inputs_vec_un = np.random.multivariate_normal(mean, cov, num_samples)
    nrm = np.linalg.norm(inputs_vec_un, axis=1)
    inputs_vec = inputs_vec_un / nrm[:, None]
    w_star = np.ones(dim)
    w_star[0] = 1
    # Each label is +1 with the logistic probability sigma(<x, w_star>),
    # and -1 otherwise.
    inner_prod = np.dot(inputs_vec, w_star)
    params = np.exp(inner_prod) / (1 + np.exp(inner_prod))
    labels = 2 * np.random.binomial(1, params) - 1
    dataset = inputs_vec, labels
    w_opt = self.find_optimal_classifier(dataset, bias=False)
    return inputs_vec, labels, w_opt
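

# A minimal smoke test (assuming my_logistic_regression is importable): build
# the synthetic dataset and sanity-check the shapes the loader returns. The
# expected values in the comments follow from the arguments passed below.
if __name__ == '__main__':
  loaders = Mydatasets()
  x, y, w_opt = loaders.synthetic_dataset(num_samples=1000, dim=10)
  print('features:', x.shape)   # expected: (1000, 10)
  print('labels:', y.shape)     # expected: (1000,), values in {-1, +1}
  print('w_opt:', w_opt.shape)  # expected: (10,)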