mia_on_model_distillation/cifar10-fast-simple/train.py

import time
import copy
import torch
import torch.nn as nn
import torchvision
import model
import studentmodel
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms


from datetime import datetime


def train(seed=0):
    """Train one network on CIFAR-10 and return its EMA-averaged copy.

    The training model is updated with the hand-written Nesterov step in
    `update_nesterov`. A deep copy of it (`valid_model`) only ever receives
    exponential-moving-average updates via `update_ema`; that copy is the
    model that is validated after every epoch and finally returned.

    Args:
        seed: Value passed to `torch.manual_seed`. Hardware information is
            printed only when it is 0.

    Returns:
        A tuple `(valid_acc, valid_model, train_loader, test_loader, device)`.
        `valid_acc` is the validation accuracy (a fraction in [0, 1])
        measured after the last epoch, the two loaders are passed through
        unchanged from `load_cifar10`, and `device` is the torch device
        that was selected for training.
    """
    # Configurable parameters
    epochs = 10
    batch_size = 512
    momentum = 0.9
    weight_decay = 0.256
    weight_decay_bias = 0.004
    ema_update_freq = 5
    # The EMA is only applied every `ema_update_freq` batches, so the decay
    # is raised to that power to compensate.
    ema_rho = 0.99 ** ema_update_freq
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Half precision is only used when not running on the CPU
    dtype = torch.float16 if device.type != "cpu" else torch.float32

    # First, the learning rate rises from 0 to 0.002 for the first 194 batches.
    # Next, the learning rate shrinks down to 0.0002 over the next 582 batches.
    lr_schedule = torch.cat([
        torch.linspace(0e+0, 2e-3, 194),
        torch.linspace(2e-3, 2e-4, 582),
    ])

    # Biases (all parameters with at most one dimension) use a 64x larger rate
    lr_schedule_bias = 64.0 * lr_schedule

    # Print information about hardware on first run
    if seed == 0:
        if device.type == "cuda":
            print("Device :", torch.cuda.get_device_name(device.index))

        print("Dtype  :", dtype)
        print()

    # Start measuring time
    start_time = time.perf_counter()

    # Set random seed to increase chance of reproducibility
    torch.manual_seed(seed)

    # Setting cudnn.benchmark to True hampers reproducibility, but is faster
    torch.backends.cudnn.benchmark = True

    # Load dataset
    train_data, train_targets, valid_data, valid_targets, train_loader, test_loader = load_cifar10(device, dtype)

    # Compute special weights for first layer
    # (the 4:-4 slices strip the 4-pixel padding that load_cifar10 added)
    weights = model.patch_whitening(train_data[:10000, :, 4:-4, 4:-4])

    # Construct the neural network
    train_model = model.Model(weights, c_in=3, c_out=10, scale_out=0.125)

    # Convert model weights to the working dtype (half precision unless on CPU)
    train_model.to(dtype)

    # Convert BatchNorm back to single precision for better accuracy
    for module in train_model.modules():
        if isinstance(module, nn.BatchNorm2d):
            module.float()

    # Upload model to GPU
    train_model.to(device)

    # Collect weights and biases and create nesterov velocity values
    # NOTE: `weights` is rebound here; the whitening weights computed above
    # have already been consumed by the model constructor.
    weights = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) > 1
    ]
    biases = [
        (w, torch.zeros_like(w))
        for w in train_model.parameters()
        if w.requires_grad and len(w.shape) <= 1
    ]

    # Copy the model for validation
    valid_model = copy.deepcopy(train_model)

    print(f"Preprocessing: {time.perf_counter() - start_time:.2f} seconds")

    # Train and validate
    print("\nepoch    batch    train time [sec]    validation accuracy")
    train_time = 0.0
    batch_count = 0
    for epoch in range(1, epochs + 1):
        # Flush CUDA pipeline for more accurate time measurement
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        start_time = time.perf_counter()

        # Randomly shuffle training data
        indices = torch.randperm(len(train_data), device=device)
        data = train_data[indices]
        targets = train_targets[indices]

        # Crop random 32x32 patches from 40x40 training data
        # (random_crop draws one offset per call, so every image inside a
        # chunk of `batch_size` images shares the same crop position)
        data = [
            random_crop(data[i : i + batch_size], crop_size=(32, 32))
            for i in range(0, len(data), batch_size)
        ]
        data = torch.cat(data)

        # Randomly flip half the training data
        # (the data was shuffled above, so the first half is a random half)
        data[: len(data) // 2] = torch.flip(data[: len(data) // 2], [-1])

        for i in range(0, len(data), batch_size):
            # discard partial batches
            if i + batch_size > len(data):
                break

            # Slice batch from data
            inputs = data[i : i + batch_size]
            target = targets[i : i + batch_size]
            batch_count += 1

            # Compute new gradients
            train_model.zero_grad()
            train_model.train(True)

            logits = train_model(inputs)

            loss = model.label_smoothing_loss(logits, target, alpha=0.2)

            loss.sum().backward()

            # batch_count was incremented above, so schedule entry 0 (a
            # learning rate of exactly 0) is never used; once the schedule is
            # exhausted its last entry is reused.
            lr_index = min(batch_count, len(lr_schedule) - 1)
            lr = lr_schedule[lr_index]
            lr_bias = lr_schedule_bias[lr_index]

            # Update weights and biases of training model
            update_nesterov(weights, lr, weight_decay, momentum)
            update_nesterov(biases, lr_bias, weight_decay_bias, momentum)

            # Update validation model with exponential moving averages
            # (i // batch_size is the batch index within the current epoch)
            if (i // batch_size % ema_update_freq) == 0:
                update_ema(train_model, valid_model, ema_rho)

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # Add training time
        train_time += time.perf_counter() - start_time

        # NOTE(review): validation runs without torch.no_grad(); the outputs
        # are detached afterwards, but the forward passes presumably still
        # record autograd state - confirm whether wrapping this loop in
        # no_grad would be a free memory saving.
        valid_correct = []
        for i in range(0, len(valid_data), batch_size):
            valid_model.train(False)

            # Test time augmentation: Test model on regular and flipped data
            regular_inputs = valid_data[i : i + batch_size]
            flipped_inputs = torch.flip(regular_inputs, [-1])

            logits1 = valid_model(regular_inputs).detach()
            logits2 = valid_model(flipped_inputs).detach()

            # Final logits are average of augmented logits
            logits = torch.mean(torch.stack([logits1, logits2], dim=0), dim=0)

            # Compute correct predictions
            correct = logits.max(dim=1)[1] == valid_targets[i : i + batch_size]

            valid_correct.append(correct.detach().type(torch.float64))

        # Accuracy is average number of correct predictions
        valid_acc = torch.mean(torch.cat(valid_correct)).item()

        print(f"{epoch:5} {batch_count:8d} {train_time:19.2f} {valid_acc:22.4f}")

    return valid_acc, valid_model, train_loader, test_loader, device

def preprocess_data(data, device, dtype):
    """Normalize channels-last image data and return it channels-first.

    Args:
        data: Array-like accepted by `torch.tensor` with four dimensions and
            the three colour channels last. Presumably raw pixel values on a
            0..255 scale, since the normalization constants are on that
            scale - confirm against the caller.
        device: Torch device the resulting tensor is created on.
        dtype: Torch dtype the data and the constants are converted to.

    Returns:
        Tensor of the given dtype with the dimensions permuted from
        (N, H, W, C) to (N, C, H, W).
    """
    # Per-channel statistics, broadcast over the trailing channel dimension
    channel_mean = torch.tensor([125.31, 122.95, 113.87], device=device).to(dtype)
    channel_std = torch.tensor([62.99, 62.09, 66.70], device=device).to(dtype)

    images = torch.tensor(data, device=device).to(dtype)
    normalized = (images - channel_mean) / channel_std

    # NHWC -> NCHW
    return normalized.permute(0, 3, 1, 2)


def load_cifar10(device, dtype, data_dir="~/data"):
    """Load CIFAR-10 as preprocessed tensors and as DataLoaders.

    The tensors feed the hand-written training loop in `train`; the
    DataLoaders are intended for the distillation helpers
    (`train_mse_loss`, `test_multiple_outputs`).

    Changes compared to the previous version:
      * `train_loader` now wraps the training split. It used to be built
        with `train=False`, so it iterated over exactly the same images as
        `test_loader`.
      * Both loaders read from `data_dir` instead of a hard-coded
        `'./data'`. With `download=False` the old code failed unless a
        second copy of the dataset happened to exist in the working
        directory.

    Args:
        device: Torch device the preprocessed tensors are placed on.
        dtype: Torch dtype of the preprocessed image tensors.
        data_dir: Dataset root directory; the dataset is downloaded there
            if missing.

    Returns:
        `(train_data, train_targets, valid_data, valid_targets,
        train_loader, test_loader)`. `train_data` is reflection-padded by
        4 pixels on every side; `valid_data` is not padded.
    """
    train = torchvision.datasets.CIFAR10(root=data_dir, download=True)
    valid = torchvision.datasets.CIFAR10(root=data_dir, train=False)

    train_data = preprocess_data(train.data, device, dtype)
    valid_data = preprocess_data(valid.data, device, dtype)

    train_targets = torch.tensor(train.targets).to(device)
    valid_targets = torch.tensor(valid.targets).to(device)

    # Pad 32x32 to 40x40
    train_data = nn.ReflectionPad2d(4)(train_data)

    # TODO: loading the dataset a second time through DataLoaders is a
    # stop-gap taken over from other code; unify with the tensors above.
    # NOTE(review): these loaders normalize [0, 1] images with different
    # constants than preprocess_data uses for the tensors above, and they
    # yield float32 batches - confirm that this matches what the teacher
    # model expects before enabling distillation.
    transforms_cifar = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # The files already exist in data_dir thanks to the download above
    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=False, transform=transforms_cifar)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=False, transform=transforms_cifar)

    # Dataloaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

    return train_data, train_targets, valid_data, valid_targets, train_loader, test_loader

#used for testing models
def test_multiple_outputs(model, test_loader, device):
    model.to(device)
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs, _ = model(inputs) # Disregard the second tensor of the tuple
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

#used for distillation training
def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, feature_map_weight, ce_loss_weight, device):
    """Distill `teacher` into `student` with a feature-map MSE plus label loss.

    For every batch the loss is
    `feature_map_weight * MSE(student_features, teacher_features)
    + ce_loss_weight * CrossEntropy(student_logits, labels)`,
    optimized with Adam over the student's parameters. The student is
    updated in place; nothing is returned.

    Args:
        teacher: Module whose forward pass returns a 2-tuple; only the
            second element (the feature map) is used. It is put in
            evaluation mode and run without gradients.
        student: Module whose forward pass returns `(logits, feature_map)`.
            It is put in training mode.
        train_loader: Iterable yielding `(inputs, labels)` batches. It does
            not need to support `len()`.
        epochs: Number of passes over `train_loader`.
        learning_rate: Learning rate for Adam.
        feature_map_weight: Weight of the feature-map MSE term.
        ce_loss_weight: Weight of the cross-entropy term.
        device: Torch device both models and all batches are moved to.
    """
    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.to(device)
    student.to(device)
    teacher.eval()  # Teacher set to evaluation mode
    student.train()  # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        # Count the batches actually seen instead of relying on
        # len(train_loader), which fails for unsized iterables and divides
        # by zero for an empty loader.
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Again ignore teacher logits
            with torch.no_grad():
                _, teacher_feature_map = teacher(inputs)

            # Forward pass with the student model
            student_logits, regressor_feature_map = student(inputs)

            # Calculate the loss
            hidden_rep_loss = mse_loss(regressor_feature_map, teacher_feature_map)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = feature_map_weight * hidden_rep_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Report NaN rather than crash when the loader produced no batches
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")


def update_ema(train_model, valid_model, rho):
    """Blend the training model's state into the validation model in place.

    Every float16/float32 entry of `valid_model`'s state dict becomes
    `rho * valid + (1 - rho) * train`. Entries of any other dtype are left
    untouched. The two state dicts are paired by iteration order.

    Args:
        train_model: Module providing the new values.
        valid_model: Module whose state is updated in place.
        rho: Weight kept from the current validation value.
    """
    averaged_dtypes = (torch.float16, torch.float32)
    source_tensors = train_model.state_dict().values()
    target_tensors = valid_model.state_dict().values()

    for source, target in zip(source_tensors, target_tensors):
        if target.dtype not in averaged_dtypes:
            continue
        target.mul_(rho)
        target.add_((1 - rho) * source)


def update_nesterov(weights, lr, weight_decay, momentum):
    """Apply one Nesterov-momentum step with weight decay, fully in place.

    For each `(parameter, velocity)` pair whose parameter requires grad:
      1. `step = -lr * (grad + weight_decay * parameter)`
      2. `velocity = momentum * velocity + step`
      3. `parameter += step + momentum * velocity`

    The parameter's `.grad` tensor is used as scratch space, so it holds
    the applied update afterwards rather than the gradient.

    Args:
        weights: Iterable of `(parameter, velocity)` pairs.
        lr: Learning rate.
        weight_decay: Weight-decay coefficient added to the gradient.
        momentum: Momentum coefficient.
    """
    for param, velocity in weights:
        if not param.requires_grad:
            continue

        step = param.grad.data
        values = param.data

        # step <- -lr * (grad + weight_decay * values)
        step.add_(values, alpha=weight_decay)
        step.mul_(-lr)

        # velocity <- momentum * velocity + step
        velocity.mul_(momentum)
        velocity.add_(step)

        # values <- values + step + momentum * velocity
        step.add_(velocity, alpha=momentum)
        values.add_(step)


def random_crop(data, crop_size):
    """Cut the same randomly placed window out of every image in a batch.

    One horizontal and one vertical offset are drawn per call, so all images
    in `data` share the crop position.

    The offsets are drawn from the full valid range `0 .. size - crop`
    inclusive. `torch.randint`'s upper bound is exclusive, so the previous
    `randint(size - crop)` could never select the last valid position and
    raised when the crop size equalled the input size.

    Args:
        data: Tensor indexed as (N, C, H, W).
        crop_size: `(crop_h, crop_w)` of the window; must not exceed the
            spatial size of `data` (otherwise `torch.randint` raises).

    Returns:
        A view of `data` with spatial size `(crop_h, crop_w)`.
    """
    crop_h, crop_w = crop_size
    h = data.size(2)
    w = data.size(3)
    # +1 because the upper bound of torch.randint is exclusive
    x = torch.randint(w - crop_w + 1, size=(1,)).item()
    y = torch.randint(h - crop_h + 1, size=(1,)).item()
    return data[:, :, y : y + crop_h, x : x + crop_w]


def sha256(path):
    """Return the hexadecimal SHA-256 digest of the file at `path`.

    The whole file is read into memory in one go.
    """
    import hashlib

    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        digest.update(handle.read())
    return digest.hexdigest()


def getrelpath(abspath):
    """Return `abspath` expressed relative to the current working directory."""
    from os import getcwd
    from os.path import relpath

    return relpath(abspath, start=getcwd())


def print_info():
    """Print what is needed to identify the exact code and PyTorch in use.

    For this script and for the `model` module, the path relative to the
    working directory and the SHA-256 of the file are printed, followed by
    the PyTorch version. Knowing this might improve the chance of
    reproducing a run.
    """
    script_path = __file__
    print("File   :", getrelpath(script_path), sha256(script_path))

    model_path = model.__file__
    print("Model  :", getrelpath(model_path), sha256(model_path))

    print("PyTorch:", torch.__version__)


def main():
    """Train 100 teacher models (seeds 0..99) and report running statistics.

    After each run the number of runs reaching the accuracy threshold and
    the min / max / mean / standard deviation of all validation accuracies
    so far are printed. A fresh, untrained student model is also created
    per run; the distillation and model-saving steps are still disabled.
    """
    print_info()
    accuracies = []
    threshold = 0.94
    for run in range(100):
        valid_acc, teacher_model, train_loader, test_loader, device = train(seed=run)
        accuracies.append(valid_acc)

        # Accumulated statistics over all runs so far
        num_runs = len(accuracies)
        hits = sum(value >= threshold for value in accuracies)
        threshold_percent = threshold * 100.0
        mean = sum(accuracies) / num_runs
        squared_deviations = [(value - mean)**2 for value in accuracies]
        std = (sum(squared_deviations) / num_runs)**0.5

        print()
        print(f"{hits} of {run + 1} runs >= {threshold_percent} % accuracy")
        print(f"Min  accuracy: {min(accuracies)}")
        print(f"Max  accuracy: {max(accuracies)}")
        print(f"Mean accuracy: {mean} +- {std}")
        print()

        # Saving the teacher model (currently disabled)
        # NOTE(review): the disabled call below would save the imported
        # `model` module, not `teacher_model` - fix before re-enabling.
        timestamp = datetime.now()
        filename = f"teachermodels/{timestamp.strftime('%Y%m%d_%H%M%S')}.pt"
        #torch.save(model, filename)

        # Distillation training for student models (training itself is
        # currently disabled; only the student is constructed)
        distilled_model = studentmodel.Model(num_classes=10).to(device)
        #train_mse_loss(teacher=teacher_model, student=distilled_model, train_loader=train_loader, epochs=10, learning_rate=0.001, feature_map_weight=0.25, ce_loss_weight=0.75, device=device)
        #test_student_accuracy = test_multiple_outputs(distilled_model, test_loader, device)
        #print(f"Student accuracy with CE + RegressorMSE: {test_student_accuracy:.2f}%")

        # Saving the student model (currently disabled)
        #current_datetime = datetime.now()
        #filename = f"studentmodels/{current_datetime.strftime('%Y%m%d_%H%M%S')}.pt"
        #torch.save(distilled_model, filename)

        
# Run the experiment only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
Init 2024-11-20 12:11:10 -07:00			`import time`
			`import copy`
			`import torch`
			`import torch.nn as nn`
			`import torchvision`
			`import model`
Distill: add student model 2024-11-23 17:26:51 -07:00			`import studentmodel`
			`import torch.optim as optim`
			`import torchvision.datasets as datasets`
			`import torchvision.transforms as transforms`


			`from datetime import datetime`
Init 2024-11-20 12:11:10 -07:00

			`def train(seed=0):`
			`# Configurable parameters`
			`epochs = 10`
			`batch_size = 512`
			`momentum = 0.9`
			`weight_decay = 0.256`
			`weight_decay_bias = 0.004`
			`ema_update_freq = 5`
			`ema_rho = 0.99 ** ema_update_freq`
			`device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`
			`dtype = torch.float16 if device.type != "cpu" else torch.float32`

			`# First, the learning rate rises from 0 to 0.002 for the first 194 batches.`
			`# Next, the learning rate shrinks down to 0.0002 over the next 582 batches.`
			`lr_schedule = torch.cat([`
			`torch.linspace(0e+0, 2e-3, 194),`
			`torch.linspace(2e-3, 2e-4, 582),`
			`])`

			`lr_schedule_bias = 64.0 * lr_schedule`

			`# Print information about hardware on first run`
			`if seed == 0:`
			`if device.type == "cuda":`
			`print("Device :", torch.cuda.get_device_name(device.index))`

			`print("Dtype :", dtype)`
			`print()`

			`# Start measuring time`
			`start_time = time.perf_counter()`

			`# Set random seed to increase chance of reproducability`
			`torch.manual_seed(seed)`

			`# Setting cudnn.benchmark to True hampers reproducability, but is faster`
			`torch.backends.cudnn.benchmark = True`

			`# Load dataset`
Distill: add student model 2024-11-23 17:26:51 -07:00			`train_data, train_targets, valid_data, valid_targets, train_loader, test_loader = load_cifar10(device, dtype)`
Init 2024-11-20 12:11:10 -07:00
			`# Compute special weights for first layer`
			`weights = model.patch_whitening(train_data[:10000, :, 4:-4, 4:-4])`

			`# Construct the neural network`
			`train_model = model.Model(weights, c_in=3, c_out=10, scale_out=0.125)`

			`# Convert model weights to half precision`
			`train_model.to(dtype)`

			`# Convert BatchNorm back to single precision for better accuracy`
			`for module in train_model.modules():`
			`if isinstance(module, nn.BatchNorm2d):`
			`module.float()`

			`# Upload model to GPU`
			`train_model.to(device)`

			`# Collect weights and biases and create nesterov velocity values`
			`weights = [`
			`(w, torch.zeros_like(w))`
			`for w in train_model.parameters()`
			`if w.requires_grad and len(w.shape) > 1`
			`]`
			`biases = [`
			`(w, torch.zeros_like(w))`
			`for w in train_model.parameters()`
			`if w.requires_grad and len(w.shape) <= 1`
			`]`

			`# Copy the model for validation`
			`valid_model = copy.deepcopy(train_model)`

			`print(f"Preprocessing: {time.perf_counter() - start_time:.2f} seconds")`

			`# Train and validate`
			`print("\nepoch batch train time [sec] validation accuracy")`
			`train_time = 0.0`
			`batch_count = 0`
			`for epoch in range(1, epochs + 1):`
			`# Flush CUDA pipeline for more accurate time measurement`
			`if torch.cuda.is_available():`
			`torch.cuda.synchronize()`

			`start_time = time.perf_counter()`

			`# Randomly shuffle training data`
			`indices = torch.randperm(len(train_data), device=device)`
			`data = train_data[indices]`
			`targets = train_targets[indices]`

			`# Crop random 32x32 patches from 40x40 training data`
			`data = [`
			`random_crop(data[i : i + batch_size], crop_size=(32, 32))`
			`for i in range(0, len(data), batch_size)`
			`]`
			`data = torch.cat(data)`

			`# Randomly flip half the training data`
			`data[: len(data) // 2] = torch.flip(data[: len(data) // 2], [-1])`

			`for i in range(0, len(data), batch_size):`
			`# discard partial batches`
			`if i + batch_size > len(data):`
			`break`

			`# Slice batch from data`
			`inputs = data[i : i + batch_size]`
			`target = targets[i : i + batch_size]`
			`batch_count += 1`

			`# Compute new gradients`
			`train_model.zero_grad()`
			`train_model.train(True)`

			`logits = train_model(inputs)`

			`loss = model.label_smoothing_loss(logits, target, alpha=0.2)`

			`loss.sum().backward()`

			`lr_index = min(batch_count, len(lr_schedule) - 1)`
			`lr = lr_schedule[lr_index]`
			`lr_bias = lr_schedule_bias[lr_index]`

			`# Update weights and biases of training model`
			`update_nesterov(weights, lr, weight_decay, momentum)`
			`update_nesterov(biases, lr_bias, weight_decay_bias, momentum)`

			`# Update validation model with exponential moving averages`
			`if (i // batch_size % ema_update_freq) == 0:`
			`update_ema(train_model, valid_model, ema_rho)`

			`if torch.cuda.is_available():`
			`torch.cuda.synchronize()`

			`# Add training time`
			`train_time += time.perf_counter() - start_time`

			`valid_correct = []`
			`for i in range(0, len(valid_data), batch_size):`
			`valid_model.train(False)`

			`# Test time agumentation: Test model on regular and flipped data`
			`regular_inputs = valid_data[i : i + batch_size]`
			`flipped_inputs = torch.flip(regular_inputs, [-1])`

			`logits1 = valid_model(regular_inputs).detach()`
			`logits2 = valid_model(flipped_inputs).detach()`

			`# Final logits are average of augmented logits`
			`logits = torch.mean(torch.stack([logits1, logits2], dim=0), dim=0)`

			`# Compute correct predictions`
			`correct = logits.max(dim=1)[1] == valid_targets[i : i + batch_size]`

			`valid_correct.append(correct.detach().type(torch.float64))`

			`# Accuracy is average number of correct predictions`
			`valid_acc = torch.mean(torch.cat(valid_correct)).item()`

			`print(f"{epoch:5} {batch_count:8d} {train_time:19.2f} {valid_acc:22.4f}")`

Distill: add student model 2024-11-23 17:26:51 -07:00			`return valid_acc, valid_model, train_loader, test_loader, device`
Init 2024-11-20 12:11:10 -07:00
			`def preprocess_data(data, device, dtype):`
			`# Convert to torch float16 tensor`
			`data = torch.tensor(data, device=device).to(dtype)`

			`# Normalize`
			`mean = torch.tensor([125.31, 122.95, 113.87], device=device).to(dtype)`
			`std = torch.tensor([62.99, 62.09, 66.70], device=device).to(dtype)`
			`data = (data - mean) / std`

			`# Permute data from NHWC to NCHW format`
			`data = data.permute(0, 3, 1, 2)`

			`return data`


			`def load_cifar10(device, dtype, data_dir="~/data"):`
			`train = torchvision.datasets.CIFAR10(root=data_dir, download=True)`
			`valid = torchvision.datasets.CIFAR10(root=data_dir, train=False)`

			`train_data = preprocess_data(train.data, device, dtype)`
			`valid_data = preprocess_data(valid.data, device, dtype)`

			`train_targets = torch.tensor(train.targets).to(device)`
			`valid_targets = torch.tensor(valid.targets).to(device)`

Distill: add student model 2024-11-23 17:26:51 -07:00			`#From other code... this is SOOOO BAD FIX THIS BUT JUST DOING IT TO GET CODE RUNNING FOR NOW`
			`# Loading the CIFAR-10 dataset:`
			`transforms_cifar = transforms.Compose([`
			`transforms.ToTensor(),`
			`transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),`
			`])`
			`train_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transforms_cifar)`
			`test_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transforms_cifar)`
			`#Dataloaders`
			`train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)`
			`test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)`

Init 2024-11-20 12:11:10 -07:00			`# Pad 32x32 to 40x40`
			`train_data = nn.ReflectionPad2d(4)(train_data)`

Distill: add student model 2024-11-23 17:26:51 -07:00			`return train_data, train_targets, valid_data, valid_targets, train_loader, test_loader`

			`#used for testing models`
			`def test_multiple_outputs(model, test_loader, device):`
			`model.to(device)`
			`model.eval()`

			`correct = 0`
			`total = 0`

			`with torch.no_grad():`
			`for inputs, labels in test_loader:`
			`inputs, labels = inputs.to(device), labels.to(device)`

			`outputs, _ = model(inputs) # Disregard the second tensor of the tuple`
			`_, predicted = torch.max(outputs.data, 1)`

			`total += labels.size(0)`
			`correct += (predicted == labels).sum().item()`

			`accuracy = 100 * correct / total`
			`print(f"Test Accuracy: {accuracy:.2f}%")`
			`return accuracy`

			`#used for distillation training`
			`def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, feature_map_weight, ce_loss_weight, device):`
			`ce_loss = nn.CrossEntropyLoss()`
			`mse_loss = nn.MSELoss()`
			`optimizer = optim.Adam(student.parameters(), lr=learning_rate)`

			`teacher.to(device)`
			`student.to(device)`
			`teacher.eval() # Teacher set to evaluation mode`
			`student.train() # Student to train mode`

			`for epoch in range(epochs):`
			`running_loss = 0.0`
			`for inputs, labels in train_loader:`
			`inputs, labels = inputs.to(device), labels.to(device)`

			`optimizer.zero_grad()`

			`# Again ignore teacher logits`
			`with torch.no_grad():`
			`_, teacher_feature_map = teacher(inputs)`

			`# Forward pass with the student model`
			`student_logits, regressor_feature_map = student(inputs)`

			`# Calculate the loss`
			`hidden_rep_loss = mse_loss(regressor_feature_map, teacher_feature_map)`

			`# Calculate the true label loss`
			`label_loss = ce_loss(student_logits, labels)`

			`# Weighted sum of the two losses`
			`loss = feature_map_weight * hidden_rep_loss + ce_loss_weight * label_loss`

			`loss.backward()`
			`optimizer.step()`

			`running_loss += loss.item()`

			`print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")`
Init 2024-11-20 12:11:10 -07:00

			`def update_ema(train_model, valid_model, rho):`
			`# The trained model is not used for validation directly. Instead, the`
			`# validation model weights are updated with exponential moving averages.`
			`train_weights = train_model.state_dict().values()`
			`valid_weights = valid_model.state_dict().values()`
			`for train_weight, valid_weight in zip(train_weights, valid_weights):`
			`if valid_weight.dtype in [torch.float16, torch.float32]:`
			`valid_weight *= rho`
			`valid_weight += (1 - rho) * train_weight`


			`def update_nesterov(weights, lr, weight_decay, momentum):`
			`for weight, velocity in weights:`
			`if weight.requires_grad:`
			`gradient = weight.grad.data`
			`weight = weight.data`

			`gradient.add_(weight, alpha=weight_decay).mul_(-lr)`
			`velocity.mul_(momentum).add_(gradient)`
			`weight.add_(gradient.add_(velocity, alpha=momentum))`


			`def random_crop(data, crop_size):`
			`crop_h, crop_w = crop_size`
			`h = data.size(2)`
			`w = data.size(3)`
			`x = torch.randint(w - crop_w, size=(1,))[0]`
			`y = torch.randint(h - crop_h, size=(1,))[0]`
			`return data[:, :, y : y + crop_h, x : x + crop_w]`


			`def sha256(path):`
			`import hashlib`
			`with open(path, "rb") as f:`
			`return hashlib.sha256(f.read()).hexdigest()`


			`def getrelpath(abspath):`
			`import os`
			`return os.path.relpath(abspath, os.getcwd())`


			`def print_info():`
			`# Knowing this information might improve chance of reproducability`
			`print("File :", getrelpath(__file__), sha256(__file__))`
			`print("Model :", getrelpath(model.__file__), sha256(model.__file__))`
			`print("PyTorch:", torch.__version__)`


			`def main():`
			`print_info()`
			`accuracies = []`
			`threshold = 0.94`
			`for run in range(100):`
Distill: add student model 2024-11-23 17:26:51 -07:00			`valid_acc, teacher_model, train_loader, test_loader, device = train(seed=run)`
Init 2024-11-20 12:11:10 -07:00			`accuracies.append(valid_acc)`

			`# Print accumulated results`
			`within_threshold = sum(acc >= threshold for acc in accuracies)`
			`acc = threshold * 100.0`
			`print()`
			`print(f"{within_threshold} of {run + 1} runs >= {acc} % accuracy")`
			`mean = sum(accuracies) / len(accuracies)`
			`variance = sum((acc - mean)**2 for acc in accuracies) / len(accuracies)`
			`std = variance**0.5`
			`print(f"Min accuracy: {min(accuracies)}")`
			`print(f"Max accuracy: {max(accuracies)}")`
			`print(f"Mean accuracy: {mean} +- {std}")`
			`print()`
Distill: add student model 2024-11-23 17:26:51 -07:00
			`#saving teacher model`
			`current_datetime = datetime.now()`
			`filename = f"teachermodels/{current_datetime.strftime('%Y%m%d_%H%M%S')}.pt"`
			`#torch.save(model, filename)`

			`#distillation training for student models`
			`distilled_model = studentmodel.Model(num_classes=10).to(device)`
			`#train_mse_loss(teacher=teacher_model, student=distilled_model, train_loader=train_loader, epochs=10, learning_rate=0.001, feature_map_weight=0.25, ce_loss_weight=0.75, device=device)`
			`#test_student_accuracy = test_multiple_outputs(distilled_model, test_loader, device)`
			`#print(f"Student accuracy with CE + RegressorMSE: {test_student_accuracy:.2f}%")`

			`#saving student model`
			`#current_datetime = datetime.now()`
			`#filename = f"studentmodels/{current_datetime.strftime('%Y%m%d_%H%M%S')}.pt"`
			`#torch.save(distilled_model, filename)`


Init 2024-11-20 12:11:10 -07:00			`if __name__ == "__main__":`
			`main()`