237 lines
7.7 KiB
Python
237 lines
7.7 KiB
Python
|
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# =============================================================================
|
||
|
|
||
|
"""dataset loader"""
|
||
|
|
||
|
# pylint: skip-file
|
||
|
# pyformat: disable
|
||
|
|
||
|
import os
|
||
|
import ssl
|
||
|
import tarfile
|
||
|
import urllib.request
|
||
|
from my_logistic_regression import MyLogisticRegression
|
||
|
import numpy as np
|
||
|
import requests
|
||
|
from sklearn import preprocessing
|
||
|
import sklearn.datasets
|
||
|
from sklearn.linear_model import LogisticRegression
|
||
|
import torch
|
||
|
from torchvision import datasets, transforms
|
||
|
|
||
|
|
||
|
# Root directory (relative to the current working directory) under which the
# downloaded datasets and dataset caches are stored.
PATH_PREFIX = './src/datasets_directory'
|
||
|
# NOTE(review): this swaps the factory that the standard library uses to build
# default HTTPS contexts for the unverified one, which turns off certificate
# verification for those connections process-wide (see PEP 476). Presumably it
# is here so the dataset downloads work on hosts with a broken CA store --
# confirm, and prefer fixing the certificate bundle over keeping this patch.
ssl._create_default_https_context = ssl._create_unverified_context
|
||
|
|
||
|
|
||
|
def normalize_fvec(x_train):
    """Standardize every feature column to zero mean and unit variance.

    The mean and (population) standard deviation are computed along axis 0,
    i.e. per feature over all samples.

    Args:
      x_train: array of feature vectors, one sample per row.

    Returns:
      A new array with the same shape as `x_train`. Columns whose standard
      deviation is zero (constant features) come back as all zeros instead of
      NaN/inf.
    """
    feature_mean = np.mean(x_train, axis=0)
    feature_std = np.std(x_train, axis=0)
    # A constant column has zero spread; dividing by 1 instead keeps the
    # centered (all-zero) values finite rather than producing 0/0 = NaN.
    safe_std = np.where(feature_std == 0, 1.0, feature_std)
    return (x_train - feature_mean) / safe_std
|
||
|
|
||
|
|
||
|
def backtracking_ls(lrp, dir_srch, w_start, alpha=0.4, beta=0.95):
    """Backtracking line search along the direction `-dir_srch`.

    Starting from a step of 100, the step is multiplied by `beta` until the
    sufficient-decrease (Armijo) condition

        loss(w_start - t * dir_srch) < loss(w_start) - t * alpha * <dir_srch, grad(w_start)>

    holds, or until the step drops below 1e-6.

    Args:
      lrp: problem object exposing `loss(w)` and `grad(w)`.
      dir_srch: search direction (the "noisy" gradient direction); the step is
        taken along its negative.
      w_start: current point.
      alpha: sufficient-decrease constant of the line search.
      beta: multiplicative shrink factor applied to the step on each rejection.

    Returns:
      The accepted step size, or the first step smaller than 1e-6 if no step
      satisfied the condition.
    """
    step_size = 100
    val_0 = lrp.loss(w_start)
    inner_prod = np.dot(dir_srch, lrp.grad(w_start))
    while True:
        trial_loss = lrp.loss(w_start - step_size * dir_srch)
        # Accept only on a strict, well-defined decrease. Written as `<` so
        # that a NaN trial loss (e.g. from overflow at a huge step) counts as
        # a rejection and the step keeps shrinking, instead of being accepted.
        if trial_loss < val_0 - step_size * alpha * inner_prod:
            break
        step_size = beta * step_size
        if step_size < 1e-6:
            # Give up: return the tiny step rather than looping forever.
            break
    return step_size
|
||
|
|
||
|
|
||
|
def newton(dataset, w_init, bias=True):
    """Run a fixed number of Newton steps with line search (no privacy constraints).

    Args:
      dataset: pair `(feature_vecs, labels)`.
      w_init: initialization point.
      bias: when exactly `True`, a column of ones is prepended to the features
        before the problem object is built.

    Returns:
      The iterate after 8 Newton steps if its `loss_wor` value is strictly
      lower than that of `w_init`; otherwise `w_init` itself.
    """
    features, targets = dataset
    if bias is True:
        ones_column = np.ones(shape=(np.shape(features)[0], 1))
        features = np.hstack((ones_column, features))
    problem = MyLogisticRegression(features, targets, reg=1e-9)

    iterate = w_init
    for _ in range(8):
        # Newton direction: solve H d = g. The gradient comes from `grad_wor`
        # (presumably the variant without the regularizer -- TODO confirm).
        newton_dir = np.linalg.solve(problem.hess(iterate), problem.grad_wor(iterate))
        iterate = iterate - backtracking_ls(problem, newton_dir, iterate) * newton_dir

    # Keep the refined point only if it actually improves on the start.
    improved = problem.loss_wor(iterate) < problem.loss_wor(w_init)
    return iterate if improved else w_init
|
||
|
|
||
|
|
||
|
class Mydatasets:
    """Builds the datasets used in the experiments.

    Every `*_dataset` method returns a triple `(features, labels, w_opt)`,
    where `w_opt` is the weight vector returned by `find_optimal_classifier`
    for that data. Creating an instance makes sure the data and cache
    directories under `PATH_PREFIX` exist.
    """

    def __init__(self):
        """Create `PATH_PREFIX/data` and `PATH_PREFIX/cache_datasets` if needed."""
        for sub_dir in ('/data', '/cache_datasets'):
            # makedirs + exist_ok also creates a missing PATH_PREFIX and does
            # not fail when the directory appears between check and creation.
            os.makedirs(PATH_PREFIX + sub_dir, exist_ok=True)

    @staticmethod
    def _prepend_ones(features):
        """Return `features` with a leading column of ones (the bias feature)."""
        return np.hstack((np.ones(shape=(np.shape(features)[0], 1)), features))

    def find_optimal_classifier(self, dataset, bias=True):
        """Find the optimal logistic-regression weight vector for a real dataset.

        A scikit-learn `LogisticRegression` (max_iter=200, C = 1 / 1e-9) is
        fitted first; its coefficients are then handed to `newton` as the
        starting point.

        Args:
          dataset: training data as a pair `(inputs_vec, labels)`.
          bias: when exactly `True`, an intercept is fitted and placed in front
            of the coefficients; otherwise the model is fitted without one.

        Returns:
          The weight vector returned by `newton`.
        """
        inputs_vec, labels = dataset
        reg = 1e-9
        if bias is True:
            model_lr = LogisticRegression(max_iter=200, C=1 / reg).fit(
                inputs_vec, labels
            )
            # ravel (not squeeze) keeps a 1-D vector even with one feature, so
            # the concatenation with the intercept cannot hit a 0-d array.
            w_warm = np.concatenate([model_lr.intercept_, model_lr.coef_.ravel()])
        else:
            model_lr = LogisticRegression(
                max_iter=200, fit_intercept=False, C=1 / reg
            ).fit(inputs_vec, labels)
            w_warm = model_lr.coef_.ravel()
        return newton(dataset, w_warm, bias)

    def fmnist_dataset(self):
        """Binary Fashion-MNIST problem built from classes 0 and 3.

        Images of the training split are flattened to vectors. Class 0 is
        relabelled -1 and class 3 is relabelled +1; all other classes are
        dropped. No bias column is added.

        Returns:
          `(x_train, labels, w_opt)` with the class-0 rows first, followed by
          the class-3 rows.
        """
        transform_data = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.5), (0.5))]
        )
        train_data_trans = datasets.FashionMNIST(
            root=PATH_PREFIX + '/data',
            download=True,
            train=True,
            transform=transform_data,
        )
        train_loader = torch.utils.data.DataLoader(
            train_data_trans, batch_size=len(train_data_trans)
        )
        # The single batch is the whole training set; fetch it once instead of
        # iterating the loader separately for images and for targets.
        images, targets = next(iter(train_loader))
        x_train = images.numpy()
        x_train = x_train.reshape(len(x_train), -1)
        y_train = targets.numpy()

        label0 = 0
        label1 = 3
        indx0 = np.nonzero(y_train == label0)[0]
        indx1 = np.nonzero(y_train == label1)[0]
        labels = y_train.copy()
        labels[indx0] = -1
        labels[indx1] = 1
        indx = np.concatenate((indx0, indx1))
        x_train = x_train[indx]
        labels = labels[indx]

        w_opt = self.find_optimal_classifier((x_train, labels), bias=False)
        return x_train, labels, w_opt

    def a1a_dataset(self):
        """a1a dataset in svmlight format, standardized, with a bias column.

        The file is downloaded once to `PATH_PREFIX/data/a1a` and reused on
        later calls.

        Returns:
          `(inputs_vec, labels, w_opt)` where `inputs_vec` carries a leading
          column of ones and `labels` are the file's labels cast to float.
        """
        # NOTE(review): the URL points at the `a1a.t` file while the local copy
        # is named `a1a` -- presumably intentional; confirm.
        a1a_url = (
            'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t'
        )
        data_path = PATH_PREFIX + '/data/a1a'
        if not os.path.exists(data_path):
            urllib.request.urlretrieve(a1a_url, data_path)
        data = sklearn.datasets.load_svmlight_file(data_path)
        inputs_vec, labels = data[0], data[1]
        inputs_vec = inputs_vec.toarray()
        scaler = preprocessing.StandardScaler().fit(inputs_vec)
        inputs_vec = scaler.transform(inputs_vec)
        labels = labels.astype(float)

        # The optimum is computed on the features without the ones column;
        # `find_optimal_classifier` handles the bias itself.
        w_opt = self.find_optimal_classifier((inputs_vec, labels))
        return self._prepend_ones(inputs_vec), labels, w_opt

    def protein_dataset(self):
        """KDD Cup 2004 protein dataset, subsampled to 50000 rows.

        The archive is downloaded and extracted only when `bio_train.dat` is
        not already present. Column 2 of the file is used as the label (0 is
        mapped to -1) and columns 3 onward as features. A fixed-seed sample of
        50000 rows is drawn without replacement, features are normalized with
        `normalize_fvec`, and a bias column is prepended.

        Returns:
          `(x_train, labels, w_opt)`.

        Raises:
          requests.HTTPError: if the archive download returns an error status.
        """
        path_protein = PATH_PREFIX + '/data/protein/'
        os.makedirs(path_protein, exist_ok=True)
        protein_url = (
            'https://kdd.org/cupfiles/KDDCupData/2004/data_kddcup04.tar.gz'
        )
        protein_file = PATH_PREFIX + '/data/protein/data_kddcup04.tar.gz'
        train_file = PATH_PREFIX + '/data/protein/bio_train.dat'

        if not os.path.exists(train_file):
            if not os.path.exists(protein_file):
                with requests.get(protein_url, stream=True, timeout=100) as response:
                    # Fail here with a clear HTTP error instead of later with
                    # a confusing tarfile error on a missing archive.
                    response.raise_for_status()
                    with open(protein_file, 'wb') as file_data:
                        file_data.write(response.raw.read())
            with tarfile.open(protein_file, 'r:gz') as tar:
                # NOTE(review): extracts a downloaded archive without checking
                # member paths; consider restricting extraction to
                # `path_protein` (e.g. tarfile extraction filters).
                tar.extractall(path_protein)

        # Parse the large text file once and slice labels/features from it.
        raw_table = np.loadtxt(train_file)
        x_train = raw_table[:, 3:]
        y_train = raw_table[:, 2]
        # Map label 0 to -1; label 1 already has the target value.
        labels = np.where(y_train == 0, -1.0, y_train)

        indx = np.arange(len(x_train))
        # A private generator yields the same sample as seeding the global
        # one with 3000, without disturbing the caller's global RNG state.
        sampler = np.random.RandomState(3000)
        indx_sample = sampler.choice(indx, 50000, replace=False)
        x_train = x_train[indx_sample]
        labels = labels[indx_sample]

        x_train = normalize_fvec(x_train)
        w_opt = self.find_optimal_classifier((x_train, labels))
        return self._prepend_ones(x_train), labels, w_opt

    def synthetic_dataset(self, num_samples=10000, dim=100):
        """Generate a synthetic dataset for logistic regression.

        Features are standard-normal draws rescaled to unit Euclidean norm.
        Labels in {-1, +1} are sampled from the logistic model whose parameter
        is the all-ones vector. No bias column is added.

        Args:
          num_samples: number of samples to draw.
          dim: feature dimension.

        Returns:
          `(inputs_vec, labels, w_opt)`, where `w_opt` is the weight vector
          fitted by `find_optimal_classifier` on the sampled data (not the
          all-ones generating vector).
        """
        mean = np.zeros(dim)
        cov = np.eye(dim)
        inputs_vec_un = np.random.multivariate_normal(mean, cov, num_samples)
        nrm = np.linalg.norm(inputs_vec_un, axis=1)
        inputs_vec = inputs_vec_un / nrm[:, None]

        w_star = np.ones(dim)
        inner_prod = np.dot(inputs_vec, w_star)
        exp_inner = np.exp(inner_prod)
        params = exp_inner / (1 + exp_inner)
        labels = 2 * np.random.binomial(1, params) - 1

        w_opt = self.find_optimal_classifier((inputs_vec, labels), bias=False)
        return inputs_vec, labels, w_opt
|