# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Dataset loaders for the logistic regression experiments."""
# pylint: skip-file
# pyformat: disable

import os
import ssl
import tarfile
import urllib.request

from my_logistic_regression import MyLogisticRegression
import numpy as np
import requests
from sklearn import preprocessing
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import torch
from torchvision import datasets, transforms

PATH_PREFIX = './src/datasets_directory'

ssl._create_default_https_context = ssl._create_unverified_context


def normalize_fvec(x_train):
  """Normalizes feature vectors to zero mean and unit variance per feature."""
  feature_mean = np.mean(x_train, axis=0)
  feature_std = np.std(x_train, axis=0)
  x_train = (x_train - feature_mean) / feature_std
  return x_train


def backtracking_ls(lrp, dir_srch, w_start, alpha=0.4, beta=0.95):
  """Implements backtracking line search.

  Args:
    lrp: the logistic regression problem (provides loss and gradient).
    dir_srch: the "noisy" gradient direction.
    w_start: the current point.
    alpha: trades off the precision of the line search against its cost.
    beta: multiplicative shrinkage factor for the step size.

  Returns:
    A (close to) optimal step size.
  """
  step_size = 100
  val_0 = lrp.loss(w_start)
  inner_prod = np.dot(dir_srch, lrp.grad(w_start))
  while (
      lrp.loss(w_start - step_size * dir_srch)
      >= val_0 - step_size * alpha * inner_prod
  ):
    step_size = beta * step_size
    if step_size < 1e-6:
      break
  return step_size


def newton(dataset, w_init, bias=True):
  """Runs Newton's method with line search, without privacy constraints.

  Args:
    dataset: a (feature_vecs, labels) pair.
    w_init: the initialization point.
    bias: whether to prepend a constant bias feature.

  Returns:
    The fitted model parameters.
  """
  feature_vecs, labels = dataset
  if bias is True:
    feature_vecs = np.hstack(
        (np.ones(shape=(np.shape(feature_vecs)[0], 1)), feature_vecs)
    )
  lrp = MyLogisticRegression(feature_vecs, labels, reg=1e-9)
  w_cur = w_init
  for _ in range(8):
    hess = lrp.hess(w_cur)
    dir_srch = np.linalg.solve(hess, lrp.grad_wor(w_cur))
    step_size = backtracking_ls(lrp, dir_srch, w_cur)
    w_cur = w_cur - step_size * dir_srch
  # Keep the Newton iterate only if it actually improved on the warm start.
  if lrp.loss_wor(w_cur) < lrp.loss_wor(w_init):
    w_out = w_cur
  else:
    w_out = w_init
  return w_out
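
# Illustrative usage sketch (an addition, not part of the original module):
# it shows how `newton` consumes a (features, labels) pair with +/-1 labels.
# The helper name `_toy_newton_fit` and the planted-weights setup are
# assumptions made for this demo only.
def _toy_newton_fit(num_samples=200, dim=5, seed=0):
  """Fits a small planted logistic problem with `newton` (demo only)."""
  rng = np.random.default_rng(seed)
  features = rng.normal(size=(num_samples, dim))
  w_true = np.ones(dim)
  labels = np.sign(features.dot(w_true) + 1.0)  # The +1 acts as a bias term.
  w_init = np.zeros(dim + 1)  # `newton` prepends a bias column when bias=True.
  return newton((features, labels), w_init, bias=True)
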
class Mydatasets:
  """Represents the datasets we use for experiments."""

  def __init__(self):
    data_dir = PATH_PREFIX + '/data'
    cache_dir = PATH_PREFIX + '/cache_datasets'
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(cache_dir, exist_ok=True)

  def find_optimal_classifier(self, dataset, bias=True):
    """Finds the optimal weight vector of the logistic regression model.

    The scikit-learn solution is used as a warm start and then refined with a
    few Newton steps.

    Args:
      dataset: the training dataset.
      bias: whether the logistic model includes a bias term.

    Returns:
      The optimal weight vector.
    """
    inputs_vec, labels = dataset
    reg = 1e-9
    if bias is True:
      model_lr = LogisticRegression(max_iter=200, C=1 / reg).fit(
          inputs_vec, labels
      )
      w_opt1 = np.concatenate([model_lr.intercept_, np.squeeze(model_lr.coef_)])
      w_opt = newton(dataset, w_opt1, bias)
    else:
      model_lr = LogisticRegression(
          max_iter=200, fit_intercept=False, C=1 / reg
      ).fit(inputs_vec, labels)
      w_opt1 = np.squeeze(model_lr.coef_)
      w_opt = newton(dataset, w_opt1, bias)
    return w_opt

  def fmnist_dataset(self):
    """Loads FashionMNIST and reduces it to a binary classification task."""
    transform_data = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
    )
    train_data_trans = datasets.FashionMNIST(
        root=PATH_PREFIX + '/data',
        download=True,
        train=True,
        transform=transform_data,
    )
    train_loader = torch.utils.data.DataLoader(
        train_data_trans, batch_size=len(train_data_trans)
    )
    x_train, y_train = next(iter(train_loader))
    x_train = x_train.numpy()
    x_train = x_train.reshape(len(x_train), -1)
    y_train = y_train.numpy()
    # Keep classes 0 (t-shirt/top) and 3 (dress) and relabel them as -1/+1.
    label0 = 0
    label1 = 3
    indx0 = np.nonzero(y_train == label0)[0]
    indx1 = np.nonzero(y_train == label1)[0]
    labels = y_train.copy()
    labels[indx0] = -1
    labels[indx1] = 1
    indx = np.concatenate((indx0, indx1))
    x_train = x_train[indx]
    labels = labels[indx]
    dataset = x_train, labels
    w_opt = self.find_optimal_classifier(dataset, bias=False)
    return x_train, labels, w_opt

  def a1a_dataset(self):
    """Loads the a1a dataset from the LIBSVM repository."""
    a1a_url = (
        'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t'
    )
    data_path = PATH_PREFIX + '/data/a1a'
    if not os.path.exists(data_path):
      _ = urllib.request.urlretrieve(a1a_url, data_path)
    data = sklearn.datasets.load_svmlight_file(data_path)
    inputs_vec, labels = data[0], data[1]
    inputs_vec = inputs_vec.toarray()
    scaler = preprocessing.StandardScaler().fit(inputs_vec)
    inputs_vec = scaler.transform(inputs_vec)
    labels = labels.astype(float)
    dataset = inputs_vec, labels
    w_opt = self.find_optimal_classifier(dataset)
    # Prepend the constant bias feature so it matches w_opt's dimension.
    inputs_vec = np.hstack(
        (np.ones(shape=(np.shape(inputs_vec)[0], 1)), inputs_vec)
    )
    return inputs_vec, labels, w_opt

  def protein_dataset(self):
    """Loads the KDD Cup 2004 protein homology dataset."""
    path_protein = PATH_PREFIX + '/data/protein/'
    os.makedirs(path_protein, exist_ok=True)
    protein_url = (
        'https://kdd.org/cupfiles/KDDCupData/2004/data_kddcup04.tar.gz'
    )
    protein_file = path_protein + 'data_kddcup04.tar.gz'
    response = requests.get(protein_url, stream=True, timeout=100)
    if response.status_code == 200:
      with open(protein_file, 'wb') as file_data:
        file_data.write(response.raw.read())
    with tarfile.open(protein_file, 'r:gz') as tar:
      tar.extractall(path_protein)
    # Column 2 holds the binary label; columns 3 onward hold the features.
    train_data = np.loadtxt(path_protein + 'bio_train.dat')
    x_train = train_data[:, 3:]
    y_train = train_data[:, 2]
    indx0 = np.nonzero(y_train == 0)[0]
    indx1 = np.nonzero(y_train == 1)[0]
    labels = y_train.copy()
    labels[indx0] = -1
    labels[indx1] = 1
    # Subsample 50000 points with a fixed seed for reproducibility.
    indx = np.arange(len(x_train))
    np.random.seed(3000)
    indx_sample = np.random.choice(indx, 50000, replace=False)
    np.random.seed(None)
    x_train = x_train[indx_sample]
    labels = labels[indx_sample]
    x_train = normalize_fvec(x_train)
    w_opt = self.find_optimal_classifier((x_train, labels))
    x_train = np.hstack((np.ones(shape=(np.shape(x_train)[0], 1)), x_train))
    return x_train, labels, w_opt
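
  # Each loader in this class returns a (features, labels, w_opt) triple:
  # features include the constant bias column whenever the model uses one,
  # labels take values in {-1, +1}, and w_opt is the (near-)optimal
  # non-private logistic regression solution for that dataset.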
  def synthetic_dataset(self, num_samples=10000, dim=100):
    """Generates a synthetic dataset for logistic regression.

    Features are unit vectors (uniformly random by default). Labels are
    sampled from the logistic model, so w_star is the "true" solution.

    Args:
      num_samples: number of samples.
      dim: dimension.
    """
    mean = np.zeros(dim)
    cov = np.eye(dim)
    inputs_vec_un = np.random.multivariate_normal(mean, cov, num_samples)
    nrm = np.linalg.norm(inputs_vec_un, axis=1)
    inputs_vec = inputs_vec_un * 1 / nrm[:, None]
    w_star = np.ones(dim)
    inner_prod = np.dot(inputs_vec, w_star)
    # P(label = +1) follows the logistic (sigmoid) model.
    params = np.exp(inner_prod) / (1 + np.exp(inner_prod))
    labels = 2 * np.random.binomial(1, params) - 1
    dataset = inputs_vec, labels
    w_opt = self.find_optimal_classifier(dataset, bias=False)
    return inputs_vec, labels, w_opt
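

# A minimal smoke-test sketch (an illustrative addition, not part of the
# original loaders): it exercises only the synthetic loader, which needs no
# network downloads. Run this file directly to try it.
if __name__ == '__main__':
  loader = Mydatasets()
  features, labels, w_opt = loader.synthetic_dataset(num_samples=1000, dim=10)
  print('features:', features.shape, 'labels:', labels.shape)
  print('fitted weight norm:', np.linalg.norm(w_opt))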