CLI format run benchopt #2

Open · wants to merge 2 commits into main
27 changes: 27 additions & 0 deletions benchopt_benchmark/.github/workflows/main.yml
@@ -0,0 +1,27 @@
name: Tests

on:
  push:
    branches:
      - main
  create:
    tags:
      - '**'
  pull_request:
    branches:
      - main
  schedule:
    # Run every 1st of the month at 7:42am UTC.
    - cron: '42 7 1 * *'

jobs:
  benchopt_dev:
    uses: benchopt/template_benchmark/.github/workflows/test_benchmarks.yml@main
    with:
      benchopt_branch: benchopt@main
  benchopt_release:
    uses: benchopt/template_benchmark/.github/workflows/test_benchmarks.yml@main
    with:
      benchopt_version: latest
  lint:
    uses: benchopt/template_benchmark/.github/workflows/lint_benchmarks.yml@main
18 changes: 18 additions & 0 deletions benchopt_benchmark/.gitignore
@@ -0,0 +1,18 @@
# Cache directories
.pytest_cache
__pycache__
__cache__
*.egg-info
.coverage
**/outputs
joblib/
/data/

# IDE specific folders
.vscode

# Config files
benchopt.ini

.DS_Store
coverage.xml
33 changes: 33 additions & 0 deletions benchopt_benchmark/README.rst
@@ -0,0 +1,33 @@

Fast Optimizer Benchmark
========================
|Build Status| |Python 3.9+|

Benchopt is a package to simplify and make more transparent and
reproducible the comparison of optimization algorithms.
This benchmark is dedicated to optimization algorithms for training neural networks.

Install
--------

This benchmark can be run using the following commands:

.. code-block::

   $ pip install -U benchopt
   $ git clone https://github.com/automl/FOB
   $ benchopt run FOB

Apart from the problem, options can be passed to ``benchopt run`` to restrict the benchmark to some solvers or datasets, e.g.:

.. code-block::

   $ benchopt run FOB -s solver1 -d dataset2 --max-runs 10 --n-repetitions 10


Use ``benchopt run -h`` for more details about these options, or visit https://benchopt.github.io/api.html.

.. |Build Status| image:: https://github.com/automl/FOB/workflows/Tests/badge.svg
   :target: https://github.com/automl/FOB/actions
.. |Python 3.9+| image:: https://img.shields.io/badge/python-3.9%2B-blue
   :target: https://www.python.org/downloads/release/python-390/
18 changes: 18 additions & 0 deletions benchopt_benchmark/benchmark_utils/__init__.py
@@ -0,0 +1,18 @@
# `benchmark_utils` is a module in which you can define code to reuse in
# the benchmark objective, datasets, and solvers. The folder should have the
# name `benchmark_utils`, and code defined inside will be importable using
# the usual import syntax. To import external packages in this file, use a
# `safe_import_context` named "import_ctx", as follows:

from benchopt.utils import safe_import_context

with safe_import_context() as import_ctx:
    import numpy as np


def gradient_ols(X, y, beta):
    # Gradient of the unscaled least-squares loss 0.5 * ||y - X @ beta||^2
    # (note: `value_ols` below averages over samples instead).
    return X.T @ (X @ beta - y)


def value_ols(X, y, beta):
    return 0.5 * np.mean((y - X @ beta) ** 2)
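These helpers are importable from any solver or objective in the benchmark. As a minimal sketch of that reuse (the full-batch gradient-descent `Solver` below is hypothetical, not part of this PR):

    import numpy as np

    from benchopt import BaseSolver
    from benchmark_utils import gradient_ols


    class Solver(BaseSolver):
        name = 'GD'  # hypothetical full-batch gradient descent

        def set_objective(self, X, y):
            self.X, self.y = X, y

        def run(self, n_iter):
            X, y = self.X, self.y
            # Step size 1 / ||X||_2^2 guarantees descent on 0.5 * ||y - X @ beta||^2.
            step = 1.0 / np.linalg.norm(X, ord=2) ** 2
            beta = np.zeros(X.shape[1])
            for _ in range(n_iter):
                beta -= step * gradient_ols(X, y, beta)
            self.beta = beta

        def get_result(self):
            return dict(beta=self.beta)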
29 changes: 29 additions & 0 deletions benchopt_benchmark/datasets/mnist.py
@@ -0,0 +1,29 @@
from benchopt import BaseDataset, safe_import_context
from benchopt.config import get_data_path

with safe_import_context() as import_ctx:
    from pytorch_fob.tasks.mnist.data import MNISTDataModule
    from pytorch_fob.tasks.mnist.model import MNISTModel


class Dataset(BaseDataset):
    name = 'MNIST'

    parameters = {
        'num_hidden': [10],
        'activation': ['Sigmoid', 'ReLU'],
        'seed': [42, 47]
    }

    def get_data(self):
        model = MNISTModel(
            num_hidden=self.num_hidden,
            activation=self.activation
        )

        data_dir = get_data_path('mnist')
        data_module = MNISTDataModule(
            data_dir=data_dir, seed=self.seed
        )

        return dict(model=model, data_module=data_module)
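Each combination of the parameters above becomes a separate dataset variant. A single variant can be selected from the CLI with benchopt's bracket syntax, e.g. (a sketch, using the values declared in `parameters`):

    $ benchopt run . -d "MNIST[activation=ReLU,seed=42]"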
43 changes: 43 additions & 0 deletions benchopt_benchmark/datasets/simulated.py
@@ -0,0 +1,43 @@
from benchopt import BaseDataset, safe_import_context


# Protect the import with `safe_import_context()`. This allows:
# - skipping import to speed up autocompletion in CLI.
# - getting requirements info when not all dependencies are installed.
with safe_import_context() as import_ctx:
    import numpy as np


# All datasets must be named `Dataset` and inherit from `BaseDataset`
class Dataset(BaseDataset):

    # Name to select the dataset in the CLI and to display the results.
    name = "Simulated"

    # List of parameters to generate the datasets. The benchmark will consider
    # the cross product for each key in the dictionary.
    # Any parameter 'param' defined here is available as `self.param`.
    parameters = {
        'n_samples, n_features': [
            (1000, 500),
            (5000, 200),
        ],
        'random_state': [27],
    }

    # List of packages needed to run the dataset. See the corresponding
    # section in objective.py
    requirements = []

    def get_data(self):
        # The return arguments of this function are passed as keyword
        # arguments to `Objective.set_data`. This defines the benchmark's
        # API to pass data. It is customizable for each benchmark.

        # Generate pseudorandom data using `numpy`.
        rng = np.random.RandomState(self.random_state)
        X = rng.randn(self.n_samples, self.n_features)
        y = rng.randn(self.n_samples)

        # The dictionary defines the keyword arguments for `Objective.set_data`
        return dict(X=X, y=y)
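For reference, benchopt unpacks the dict returned by `get_data` as keyword arguments to `Objective.set_data`, roughly:

    data = dataset.get_data()   # -> dict(X=..., y=...)
    objective.set_data(**data)  # keys must match `set_data`'s signature

Note that the keys here (`X`, `y`) fit an OLS-style objective such as the helpers in `benchmark_utils`, whereas the FOB `Objective` below expects `model` and `data_module`, as provided by the MNIST dataset.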
41 changes: 41 additions & 0 deletions benchopt_benchmark/objective.py
@@ -0,0 +1,41 @@
from benchopt import BaseObjective, safe_import_context


with safe_import_context() as import_ctx:
    from lightning import Trainer


class Objective(BaseObjective):
    name = "FOB"

    requirements = [
        "pip::git+https://github.com/automl/FOB.git"
    ]

    def set_data(self, model, data_module):
        self.model = model
        self.data_module = data_module

    def evaluate_result(self, trainer: Trainer):
        # NOTE: train metrics are currently computed with `validate`, i.e.
        # on the validation split, so `score_train` mirrors `score_val`.
        score_train = trainer.validate(self.model, datamodule=self.data_module)
        score_val = trainer.validate(self.model, datamodule=self.data_module)
        # TODO - Need to load the best checkpoint
        score_test = trainer.test(self.model, datamodule=self.data_module)
        return dict(
            **{f'train_{k}': v for k, v in score_train[0].items()},
            **{f'val_{k}': v for k, v in score_val[0].items()},
            **{f'test_last_{k}': v for k, v in score_test[0].items()},
            value=score_val[0]['val_loss'],
        )

    def get_objective(self):
        return dict(
            model=self.model,
            data_module=self.data_module
        )

    def get_one_result(self):
        # One valid input for `evaluate_result`, used to test the benchmark
        # before any solver runs. Devices are left to Lightning's default,
        # since no device configuration is defined on this objective.
        return dict(trainer=Trainer(
            enable_progress_bar=True,
        ))
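For context, a simplified sketch of how benchopt chains these hooks for one (dataset, solver) pair (not actual benchopt source):

    objective.set_data(**dataset.get_data())           # model, data_module
    solver.set_objective(**objective.get_objective())
    solver.run(stop_val)                               # trains the model
    result = solver.get_result()                       # -> dict(trainer=...)
    metrics = objective.evaluate_result(**result)      # train/val/test scores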
77 changes: 77 additions & 0 deletions benchopt_benchmark/solvers/sgd.py
@@ -0,0 +1,77 @@
from benchopt import BaseSolver, safe_import_context


with safe_import_context() as import_ctx:
    from pytorch_fob.optimizers.sgd_baseline.optimizer import \
        configure_optimizers
    from pytorch_fob.optimizers.optimizers import OptimizerConfig
    from pytorch_fob.engine.parameter_groups import GroupedModel
    from lightning.pytorch.utilities.types import OptimizerLRScheduler
    from lightning import Trainer, Callback


class Optimizer:
    def __init__(self, config: OptimizerConfig) -> None:
        self.config = config

    def configure_optimizers(self, model: GroupedModel) -> OptimizerLRScheduler:
        return configure_optimizers(model, self.config)


class Solver(BaseSolver):
    name = 'SGD'

    parameters = {
        'learning_rate': [1e-3],
        'weight_decay': [1e-4],
        'momentum': [0.9],
        'nesterov': [True],
        'max_epochs': [200],
        'eta_min_factor': [0.1],
        'lr_interval': ['step'],
        'batch_size': [64]
    }
    sampling_strategy = 'run_once'

    def set_objective(self, model, data_module):
        self.model = model
        self.data_module = data_module

        self.data_module.set_batch_size(self.batch_size)

        config = OptimizerConfig(
            optimizer_key='sgd',
            task_key='benchopt',
            config=dict(
                sgd=dict(
                    name=self.name,
                    lr_interval=self.lr_interval,
                    learning_rate=self.learning_rate,
                    weight_decay=self.weight_decay,
                    momentum=self.momentum,
                    nesterov=self.nesterov,
                    eta_min_factor=self.eta_min_factor,
                ),
                benchopt=dict(
                    max_epochs=self.max_epochs,
                    max_steps=200,
                ),
            )
        )

        optimizer = Optimizer(config)
        self.model.set_optimizer(optimizer)

    def run(self, _):
        # class BenchoptCallback(Callback):
        #     def on_train_epoch_end(self, trainer, pl_module):
        #         trainer.should_stop = not cb()

        self.trainer = Trainer(
            max_epochs=self.max_epochs,
            # callbacks=[BenchoptCallback()]
        )
        self.trainer.fit(self.model, self.data_module)

    def get_result(self):
        return dict(trainer=self.trainer)
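The commented-out callback sketches how this solver could hand control of the stopping criterion back to benchopt. A sketch of that variant, assuming `sampling_strategy` is switched to `'callback'` (in that mode benchopt passes a `cb` callable to `run`):

    from lightning import Callback, Trainer


    class BenchoptCallback(Callback):
        def __init__(self, cb):
            self.cb = cb  # benchopt decides after each epoch whether to stop

        def on_train_epoch_end(self, trainer, pl_module):
            trainer.should_stop = not self.cb()


    # Inside Solver, with `sampling_strategy = 'callback'`:
    def run(self, cb):
        self.trainer = Trainer(
            max_epochs=self.max_epochs,
            callbacks=[BenchoptCallback(cb)],
        )
        self.trainer.fit(self.model, self.data_module)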
13 changes: 13 additions & 0 deletions benchopt_benchmark/test_config.py
@@ -0,0 +1,13 @@
import sys  # noqa: F401

import pytest  # noqa: F401


def check_test_solver_install(solver_class):
    """Hook called in `test_solver_install`.

    If one solver needs to be skipped/xfailed on some
    particular architecture, call pytest.xfail when
    detecting the situation.
    """
    pass
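For example, a hypothetical guard that xfails the SGD solver on macOS could look like:

    def check_test_solver_install(solver_class):
        if solver_class.name.lower() == 'sgd' and sys.platform == 'darwin':
            pytest.xfail('SGD solver install is not tested on macOS')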
15 changes: 11 additions & 4 deletions pytorch_fob/tasks/mnist/data.py
@@ -1,16 +1,18 @@
+import torch
 from torch.utils.data import random_split
 from torchvision.datasets import MNIST
 from torchvision import transforms
 from pytorch_fob.engine.configs import TaskConfig

 from pytorch_fob.tasks import TaskDataModule


 class MNISTDataModule(TaskDataModule):
-    def __init__(self, config: TaskConfig):
-        super().__init__(config)
+    def __init__(self, data_dir, seed=None):
+        super().__init__(data_dir)
         # split can also be a fraction self.train_val_split
         # [55000, 5000] is taken from https://lightning.ai/docs/pytorch/stable/data/datamodule.html
         self.train_val_split = [55000, 5000]
+        self.seed = seed

     # TODO: check values
     # https://lightning.ai/docs/pytorch/stable/data/datamodule.html
@@ -29,9 +31,14 @@ def setup(self, stage: str):
         """
         # Assign train/val datasets for use in dataloaders
         if stage == "fit":
+            generator = torch.Generator()
+            if self.seed is not None:
+                generator = generator.manual_seed(self.seed)
             mnist_full = MNIST(str(self.data_dir), train=True, transform=self.transform)
-            # TODO (Zachi) confirm seed everything makes this reproducable:
-            self.data_train, self.data_val = random_split(mnist_full, self.train_val_split)
+            self.data_train, self.data_val = random_split(
+                mnist_full, self.train_val_split, generator=generator
+            )

         # Assign test dataset for use in dataloader(s)
         if stage == "test":
12 changes: 6 additions & 6 deletions pytorch_fob/tasks/mnist/model.py
@@ -5,15 +5,15 @@


 class MNISTModel(TaskModel):
-    def __init__(self, optimizer: Optimizer, config: TaskConfig):
+    def __init__(self, num_hidden: int, activation: str):

         input_size = 28 * 28  # 784
         num_classes = 10
-        num_hidden = config.model.num_hidden
-        activation = config.model.activation
-        if activation.lower() == "Sigmoid".lower():
+        num_hidden = num_hidden
+        activation = activation
+        if activation.lower() == "sigmoid":
             self.activation = torch.nn.Sigmoid
-        elif activation.lower() == "ReLU".lower():
+        elif activation.lower() == "relu":
             self.activation = torch.nn.ReLU
         else:
             raise NotImplementedError(f"{activation} is not supported for mnist yet")
@@ -25,7 +25,7 @@ def __init__(self, optimizer: Optimizer, config: TaskConfig):
             self.activation(),
             torch.nn.Linear(num_hidden, num_classes, bias=True),
         )
-        super().__init__(model, optimizer, config)
+        super().__init__(model)
         # negative log likelihood loss
         self.loss_fn = torch.nn.functional.nll_loss
