Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def set_data(self, train_dataloader, val_dataloader, model):

def evaluate_result(self, model, dist=None):
model.eval()
val_batch_size = 64 # Batch of 64 for validation
val_batch_size = 64 # Batch 64 for validation as in NanoGPT codebase
if dist is not None:
# In distributed mode, we use the distributed data generator
rank, size = dist.get_rank(), dist.get_world_size()
Expand All @@ -43,7 +43,7 @@ def evaluate_result(self, model, dist=None):
# Compute the validation loss
val_loss, n_batches = 0.0, 0
for data in val_loader:
loss, *_ = self.model(*data)
loss, *_ = model(*data)
val_loss += loss.item()
n_batches += 1
val_loss /= n_batches
Expand Down
5 changes: 4 additions & 1 deletion solvers/adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Solver(BaseSolver):
parameters = {
'learning_rate': [1e-3],
'weight_decay': [1e-4],
'num_steps': [6200],
'num_steps': [8_000],
'batch_size': [64],
"slurm_nodes": [1, 2],
"sin_init": [True],
Expand Down Expand Up @@ -63,7 +63,10 @@ def get_next(self, stop_val):
return stop_val + 250

def warm_up(self):
    """Run a short 10-step pass, then restore the configured step count.

    ``self.num_steps`` is temporarily set to 10 while
    ``self.run_once(stop_val=10)`` executes. The original value is put
    back afterwards, including when ``run_once`` raises, so a failed
    warm-up cannot leave the solver configured for only 10 steps.
    """
    configured_steps = self.num_steps
    # NOTE(review): presumably run_once reads num_steps (e.g. for the LR
    # schedule), which is why it is shortened here -- confirm against run().
    self.num_steps = 10
    try:
        self.run_once(stop_val=10)
    finally:
        # Always restore, even if the warm-up run fails.
        self.num_steps = configured_steps

def run(self, cb):

Expand Down
10 changes: 5 additions & 5 deletions solvers/muon.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from benchopt import BaseSolver
from contextlib import nullcontext

import torch
from benchmark_utils.distributed_tools import setup_distributed
from benchmark_utils.lr_scheduler import get_lr
from benchmark_utils.optimizers.muon import Muon
from benchopt import BaseSolver
from torch.optim import AdamW
from tqdm.auto import tqdm

Expand All @@ -13,11 +13,11 @@ class Solver(BaseSolver):
name = "Muon"

parameters = {
"muon_lr": [0.02],
"muon_lr": [3.6e-4],
"muon_momentum": [0.95],
"adam_lr": [3e-4],
"adam_weight_decay": [0.0],
"num_steps": [6200],
"adam_lr": [0.0036],
"adam_weight_decay": [1e-4],
"num_steps": [8_000],
"batch_size": [64],
"slurm_nodes": [1, 2],
}
Expand Down
2 changes: 1 addition & 1 deletion solvers/scion.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Solver(BaseSolver):
"momentum": [0.1],
"hidden_radius": [50.0],
"lm_head_radius": [3000.0],
"num_steps": [6200],
"num_steps": [8_000],
"batch_size": [64],
"slurm_nodes": [1, 2],
}
Expand Down
7 changes: 5 additions & 2 deletions solvers/soap.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ class Solver(BaseSolver):
name = "SOAP"

parameters = {
"learning_rate": [3e-3],
"learning_rate": [0.0036],
"weight_decay": [1e-4],
"num_steps": [6200],
"num_steps": [7_500],
"batch_size": [64],
"slurm_nodes": [1, 2],
}
Expand Down Expand Up @@ -53,7 +53,10 @@ def get_next(self, stop_val):
return stop_val + 250

def warm_up(self):
    """Run a short 10-step pass, then restore the configured step count.

    ``self.num_steps`` is temporarily set to 10 while
    ``self.run_once(stop_val=10)`` executes. The original value is put
    back afterwards, including when ``run_once`` raises, so a failed
    warm-up cannot leave the solver configured for only 10 steps.
    """
    saved_num_steps = self.num_steps
    # NOTE(review): presumably run_once reads num_steps while running,
    # which is why it is shortened here -- confirm against run().
    self.num_steps = 10
    try:
        self.run_once(stop_val=10)
    finally:
        # Always restore, even if the warm-up run fails.
        self.num_steps = saved_num_steps

def run(self, cb):
param_dict = {
Expand Down
Loading