mia_on_model_distillation/cifar10-fast-simple/train.py

376 lines
13 KiB
Python
Raw Normal View History

2024-11-20 12:11:10 -07:00
import time
import copy
import torch
import torch.nn as nn
import torchvision
import model
2024-11-23 17:26:51 -07:00
import studentmodel
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from datetime import datetime
2024-11-20 12:11:10 -07:00
def train(seed=0):
    """Train the CIFAR-10 network for one seed and validate it after every epoch.

    The training model is updated with a hand-written Nesterov step
    (``update_nesterov``); a deep copy of it, the validation model, only
    receives exponential moving averages of the training weights
    (``update_ema``) and is the model that gets evaluated and returned.

    Args:
        seed: Value passed to ``torch.manual_seed``. Hardware information is
            printed only when ``seed == 0``.

    Returns:
        A tuple ``(valid_acc, valid_model, train_loader, test_loader, device)``:
        the validation accuracy of the last epoch as a Python float, the
        EMA-averaged validation model, the two data loaders produced by
        ``load_cifar10`` and the ``torch.device`` that was used.
    """
    # Configurable parameters
    epochs = 10
    batch_size = 512
    momentum = 0.9
    weight_decay = 0.256
    weight_decay_bias = 0.004
    ema_update_freq = 5
    ema_rho = 0.99 ** ema_update_freq
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type != "cpu" else torch.float32

    # First, the learning rate rises from 0 to 0.002 for the first 194 batches.
    # Next, the learning rate shrinks down to 0.0002 over the next 582 batches.
    lr_schedule = torch.cat([
        torch.linspace(0e+0, 2e-3, 194),
        torch.linspace(2e-3, 2e-4, 582),
    ])
    lr_schedule_bias = 64.0 * lr_schedule

    # Print information about hardware on first run
    if seed == 0:
        if device.type == "cuda":
            print("Device :", torch.cuda.get_device_name(device.index))

        print("Dtype :", dtype)
        print()

    # Start measuring time
    start_time = time.perf_counter()

    # Set random seed to increase chance of reproducibility
    torch.manual_seed(seed)

    # Setting cudnn.benchmark to True hampers reproducibility, but is faster
    torch.backends.cudnn.benchmark = True

    # Load dataset
    train_data, train_targets, valid_data, valid_targets, train_loader, test_loader = load_cifar10(device, dtype)

    # Compute special weights for first layer from the first 10000 training
    # images; the 4:-4 slices strip the 4-pixel padding added by load_cifar10.
    weights = model.patch_whitening(train_data[:10000, :, 4:-4, 4:-4])

    # Construct the neural network
    train_model = model.Model(weights, c_in=3, c_out=10, scale_out=0.125)

    # Convert model weights to half precision
    train_model.to(dtype)

    # Convert BatchNorm back to single precision for better accuracy
    for module in train_model.modules():
        if isinstance(module, nn.BatchNorm2d):
            module.float()

    # Upload model to GPU
    train_model.to(device)

    # Collect weights and biases and create nesterov velocity values.
    # Parameters with more than one dimension are treated as weights,
    # everything else as biases; each is paired with a zeroed velocity buffer.
    weights = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) > 1
    ]
    biases = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) <= 1
    ]

    # Copy the model for validation
    valid_model = copy.deepcopy(train_model)

    print(f"Preprocessing: {time.perf_counter() - start_time:.2f} seconds")

    # Train and validate
    print("\nepoch batch train time [sec] validation accuracy")
    train_time = 0.0
    batch_count = 0
    for epoch in range(1, epochs + 1):
        # Flush CUDA pipeline for more accurate time measurement
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        # Randomly shuffle training data
        indices = torch.randperm(len(train_data), device=device)
        data = train_data[indices]
        targets = train_targets[indices]

        # Crop random 32x32 patches from 40x40 training data
        # (one crop offset is drawn per chunk of batch_size samples)
        data = [
            random_crop(data[i : i + batch_size], crop_size=(32, 32))
            for i in range(0, len(data), batch_size)
        ]
        data = torch.cat(data)

        # Randomly flip half the training data
        data[: len(data) // 2] = torch.flip(data[: len(data) // 2], [-1])

        for i in range(0, len(data), batch_size):
            # discard partial batches
            if i + batch_size > len(data):
                break

            # Slice batch from data
            inputs = data[i : i + batch_size]
            target = targets[i : i + batch_size]
            batch_count += 1

            # Compute new gradients
            train_model.zero_grad()
            train_model.train(True)

            logits = train_model(inputs)

            loss = model.label_smoothing_loss(logits, target, alpha=0.2)

            loss.sum().backward()

            # Clamp so that batches beyond the schedule keep the final rate
            lr_index = min(batch_count, len(lr_schedule) - 1)
            lr = lr_schedule[lr_index]
            lr_bias = lr_schedule_bias[lr_index]

            # Update weights and biases of training model
            update_nesterov(weights, lr, weight_decay, momentum)
            update_nesterov(biases, lr_bias, weight_decay_bias, momentum)

            # Update validation model with exponential moving averages
            if (i // batch_size % ema_update_freq) == 0:
                update_ema(train_model, valid_model, ema_rho)

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # Add training time
        train_time += time.perf_counter() - start_time

        # Validation needs no gradients: run it with autograd switched off
        # rather than building graphs and detaching the results afterwards.
        valid_model.train(False)
        valid_correct = []
        with torch.no_grad():
            for i in range(0, len(valid_data), batch_size):
                # Test time augmentation: Test model on regular and flipped data
                regular_inputs = valid_data[i : i + batch_size]
                flipped_inputs = torch.flip(regular_inputs, [-1])

                logits1 = valid_model(regular_inputs)
                logits2 = valid_model(flipped_inputs)

                # Final logits are average of augmented logits
                logits = torch.mean(torch.stack([logits1, logits2], dim=0), dim=0)

                # Compute correct predictions
                correct = logits.max(dim=1)[1] == valid_targets[i : i + batch_size]

                valid_correct.append(correct.type(torch.float64))

        # Accuracy is average number of correct predictions
        valid_acc = torch.mean(torch.cat(valid_correct)).item()

        print(f"{epoch:5} {batch_count:8d} {train_time:19.2f} {valid_acc:22.4f}")

    return valid_acc, valid_model, train_loader, test_loader, device
2024-11-20 12:11:10 -07:00
def preprocess_data(data, device, dtype):
    """Turn a raw image array into a normalized, channels-first tensor.

    Args:
        data: Image array accepted by ``torch.tensor``; the three-element
            statistics below are broadcast against its last axis, and the
            final permute treats it as NHWC.
        device: Device the resulting tensor is created on.
        dtype: Floating-point dtype used for the data and the statistics.

    Returns:
        The normalized tensor with its axes permuted to NCHW order.
    """
    # Per-channel statistics, expressed on the raw 0..255 pixel scale
    channel_mean = torch.tensor([125.31, 122.95, 113.87], device=device).to(dtype)
    channel_std = torch.tensor([62.99, 62.09, 66.70], device=device).to(dtype)

    # Move the pixels to the target device and dtype, then standardize them
    pixels = torch.tensor(data, device=device).to(dtype)
    normalized = (pixels - channel_mean) / channel_std

    # NHWC -> NCHW
    return normalized.permute(0, 3, 1, 2)
def load_cifar10(device, dtype, data_dir="~/data"):
    """Load CIFAR-10 both as preprocessed tensors and as ``DataLoader`` objects.

    The tensors feed the hand-written training loop in ``train``; the loaders
    are consumed by ``train_mse_loss`` and ``test_multiple_outputs``.

    Args:
        device: Device that receives the preprocessed tensors.
        dtype: Floating-point dtype of the preprocessed image tensors.
        data_dir: Dataset root directory. The dataset is downloaded there if
            it is missing, and every dataset object below reads from it.

    Returns:
        A tuple ``(train_data, train_targets, valid_data, valid_targets,
        train_loader, test_loader)``. ``train_data`` is reflection-padded by
        4 pixels on each side; ``valid_data`` is not padded.
    """
    train = torchvision.datasets.CIFAR10(root=data_dir, download=True)
    valid = torchvision.datasets.CIFAR10(root=data_dir, train=False)

    train_data = preprocess_data(train.data, device, dtype)
    valid_data = preprocess_data(valid.data, device, dtype)

    train_targets = torch.tensor(train.targets).to(device)
    valid_targets = torch.tensor(valid.targets).to(device)

    # Pad 32x32 to 40x40
    train_data = nn.ReflectionPad2d(4)(train_data)

    # Loaders for distillation training and student evaluation.
    # ToTensor rescales pixels to [0, 1], so the statistics that
    # preprocess_data applies on the 0..255 scale are divided by 255 here;
    # this keeps the loader batches on the scale the teacher was trained on.
    # NOTE(review): the loaders yield float32 batches, while train() converts
    # the teacher to float16 on non-CPU devices -- confirm the dtypes match.
    loader_mean = [m / 255.0 for m in (125.31, 122.95, 113.87)]
    loader_std = [s / 255.0 for s in (62.99, 62.09, 66.70)]
    transforms_cifar = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=loader_mean, std=loader_std),
    ])

    # The training loader must read the training split (train=True); both
    # datasets reuse data_dir, which the download above has already filled.
    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=False, transform=transforms_cifar)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=False, transform=transforms_cifar)

    # Dataloaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

    return train_data, train_targets, valid_data, valid_targets, train_loader, test_loader
#used for testing models
def test_multiple_outputs(model, test_loader, device):
    """Measure the accuracy of a model whose forward pass returns a pair.

    Args:
        model: Module returning a two-element tuple; only the first element
            is scored, via the arg-max over dimension 1.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the model and every batch are moved to.

    Returns:
        The accuracy in percent. It is also printed.
    """
    model.to(device)
    model.eval()

    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # The second element of the returned pair is not needed here
            scores, _ignored = model(batch_inputs)
            predictions = scores.data.max(1)[1]

            num_seen += batch_labels.size(0)
            num_correct += (predictions == batch_labels).sum().item()

    accuracy = 100 * num_correct / num_seen
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy
#used for distillation training
def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, feature_map_weight, ce_loss_weight, device):
    """Distill a teacher into a student with a feature-map MSE term plus cross-entropy.

    Both models are called as ``model(inputs)`` and must return a pair. From
    the teacher only the second element is used; from the student the first
    element goes into the cross-entropy loss against the labels and the
    second into the MSE loss against the teacher's second element.

    Args:
        teacher: Model providing the target feature map; put in eval mode.
        student: Model being optimized with Adam; put in train mode.
        train_loader: Sized iterable of ``(inputs, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``optim.Adam``.
        feature_map_weight: Weight of the MSE term in the total loss.
        ce_loss_weight: Weight of the cross-entropy term in the total loss.
        device: Device both models and every batch are moved to.

    Returns:
        None. The student is updated in place and the mean loss of every
        epoch is printed.
    """
    label_criterion = nn.CrossEntropyLoss()
    feature_criterion = nn.MSELoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.to(device)
    student.to(device)
    teacher.eval()   # the teacher only provides targets
    student.train()  # the student is the model being fitted

    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()

            # Teacher targets need no gradient; its logits are ignored
            with torch.no_grad():
                _teacher_logits, teacher_feature_map = teacher(batch_inputs)

            student_logits, regressor_feature_map = student(batch_inputs)

            # Weighted sum of the feature-matching loss and the label loss
            feature_term = feature_map_weight * feature_criterion(regressor_feature_map, teacher_feature_map)
            label_term = ce_loss_weight * label_criterion(student_logits, batch_labels)
            total_loss = feature_term + label_term

            total_loss.backward()
            optimizer.step()

            epoch_loss += total_loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")
2024-11-20 12:11:10 -07:00
def update_ema(train_model, valid_model, rho):
    """Blend the training model's state into the validation model in place.

    The trained model is not used for validation directly. Instead every
    float16/float32 entry of the validation model's state dict becomes
    ``rho * valid + (1 - rho) * train``. Entries of any other dtype are left
    untouched.

    Args:
        train_model: Model supplying the fresh values.
        valid_model: Model whose state is overwritten with the average.
        rho: Weight kept from the validation model's current value.
    """
    averaged_dtypes = (torch.float16, torch.float32)
    fresh_state = train_model.state_dict().values()
    averaged_state = valid_model.state_dict().values()

    for fresh, averaged in zip(fresh_state, averaged_state):
        if averaged.dtype not in averaged_dtypes:
            continue
        # In-place operations, so the validation model itself is updated
        averaged *= rho
        averaged += (1 - rho) * fresh
def update_nesterov(weights, lr, weight_decay, momentum):
    """Apply one Nesterov-momentum step with weight decay, entirely in place.

    Args:
        weights: Iterable of ``(parameter, velocity)`` pairs. Pairs whose
            parameter has ``requires_grad`` unset are skipped.
        lr: Learning rate for this step.
        weight_decay: Factor of the parameter value added to its gradient.
        momentum: Momentum coefficient applied to the velocity.

    Note:
        The gradient buffer of every updated parameter is overwritten; after
        the call it holds the step that was added to the parameter.
    """
    for param, velocity in weights:
        if not param.requires_grad:
            continue

        step = param.grad.data
        values = param.data

        # step <- -lr * (grad + weight_decay * param)
        step.add_(values, alpha=weight_decay)
        step.mul_(-lr)

        # velocity <- momentum * velocity + step
        velocity.mul_(momentum)
        velocity.add_(step)

        # param <- param + step + momentum * velocity
        step.add_(velocity, alpha=momentum)
        values.add_(step)
def random_crop(data, crop_size):
    """Cut the same randomly placed window out of every sample of a batch.

    One offset pair is drawn per call, so all samples in ``data`` share the
    crop position.

    Args:
        data: Tensor whose dimensions 2 and 3 are height and width.
        crop_size: ``(crop_height, crop_width)`` of the returned window.

    Returns:
        A view of ``data`` restricted to the chosen window.

    Raises:
        RuntimeError: Raised by ``torch.randint`` when the crop is larger
            than the input in either dimension.
    """
    crop_h, crop_w = crop_size
    h = data.size(2)
    w = data.size(3)

    # torch.randint's upper bound is exclusive, so "+ 1" is required to make
    # the largest valid offset reachable and to allow a crop that is exactly
    # as large as the input (offset 0 is then the only choice).
    x = torch.randint(w - crop_w + 1, size=(1,)).item()
    y = torch.randint(h - crop_h + 1, size=(1,)).item()

    return data[:, :, y : y + crop_h, x : x + crop_w]
def sha256(path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and read in one go.
    """
    import hashlib

    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()
def getrelpath(abspath):
    """Express ``abspath`` relative to the current working directory."""
    import os

    working_dir = os.getcwd()
    return os.path.relpath(abspath, start=working_dir)
def print_info():
    """Print the path and SHA-256 of this script and the model module, plus the PyTorch version."""
    # Knowing this information might improve the chance of reproducibility
    script_path = __file__
    print("File :", getrelpath(script_path), sha256(script_path))

    model_path = model.__file__
    print("Model :", getrelpath(model_path), sha256(model_path))

    print("PyTorch:", torch.__version__)
def main():
    """Train 100 teacher models (seeds 0..99) and report running accuracy statistics.

    After every run the number of runs that reached the accuracy threshold
    and the minimum, maximum, mean and population standard deviation of all
    validation accuracies so far are printed. A student model is then
    instantiated; the distillation and checkpointing steps are still
    disabled (commented out).
    """
    print_info()

    threshold = 0.94
    accuracies = []
    for run in range(100):
        valid_acc, teacher_model, train_loader, test_loader, device = train(seed=run)
        accuracies.append(valid_acc)

        # Print accumulated results
        num_runs = len(accuracies)
        within_threshold = sum(1 for value in accuracies if value >= threshold)
        threshold_percent = threshold * 100.0
        print()
        print(f"{within_threshold} of {run + 1} runs >= {threshold_percent} % accuracy")

        mean = sum(accuracies) / num_runs
        squared_deviations = [(value - mean)**2 for value in accuracies]
        std = (sum(squared_deviations) / num_runs)**0.5

        print(f"Min accuracy: {min(accuracies)}")
        print(f"Max accuracy: {max(accuracies)}")
        print(f"Mean accuracy: {mean} +- {std}")
        print()

        # NOTE(review): indentation was lost in the reviewed copy of this
        # file; the teacher/student section below is assumed to belong to the
        # run loop -- confirm against the repository.

        # saving teacher model
        current_datetime = datetime.now()
        filename = f"teachermodels/{current_datetime.strftime('%Y%m%d_%H%M%S')}.pt"
        #torch.save(model, filename)

        # distillation training for student models
        distilled_model = studentmodel.Model(num_classes=10).to(device)
        #train_mse_loss(teacher=teacher_model, student=distilled_model, train_loader=train_loader, epochs=10, learning_rate=0.001, feature_map_weight=0.25, ce_loss_weight=0.75, device=device)
        #test_student_accuracy = test_multiple_outputs(distilled_model, test_loader, device)
        #print(f"Student accuracy with CE + RegressorMSE: {test_student_accuracy:.2f}%")

        # saving student model
        #current_datetime = datetime.now()
        #filename = f"studentmodels/{current_datetime.strftime('%Y%m%d_%H%M%S')}.pt"
        #torch.save(distilled_model, filename)
2024-11-20 12:11:10 -07:00
# Start the training runs only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()