Skip to content

Commit 84c264e

Browse files
authored
Merge branch 'main' into awarno/haproxy
Signed-off-by: AWarno <[email protected]>
2 parents 2d0e008 + 7ac93a3 commit 84c264e

File tree

15 files changed

+605
-82
lines changed

15 files changed

+605
-82
lines changed

.github/config/requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ sphinx
22
sphinx-autobuild # For live doc serving while editing docs
33
sphinx-autodoc2 # For documenting Python API
44
sphinx-copybutton # Adds a copy button for code blocks
5-
myst_parser # For our markdown docs
65
nvidia-sphinx-theme # Our NVIDIA theme
76
sphinxcontrib-mermaid # For mermaid diagrams
7+
myst-parser # For our markdown docs
8+
sphinx-design
9+
sphinxcontrib-mermaid
10+
swagger-plugin-for-sphinx

.github/workflows/_update_dependencies.yml

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,6 @@ on:
99
secrets:
1010
PAT:
1111
required: true
12-
AZURE_CLIENT_ID:
13-
required: true
14-
AZURE_TENANT_ID:
15-
required: true
16-
AZURE_SUBSCRIPTION_ID:
17-
required: true
1812
SSH_KEY:
1913
required: true
2014
SSH_PWD:
@@ -33,25 +27,18 @@ jobs:
3327

3428
update-lockfile:
3529
environment: nemo-ci
36-
runs-on: linux-amd64-cpu16
30+
runs-on: ubuntu-latest
3731
needs: [pre-flight]
32+
strategy:
33+
fail-fast: false
34+
matrix:
35+
include:
36+
- package: nemo-evaluator
37+
- package: nemo-evaluator-launcher
3838
env:
3939
SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
4040
TARGET_BRANCH: ${{ inputs.target-branch }}
4141
steps:
42-
- name: Install Azure CLI
43-
run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
44-
45-
- name: Azure Login
46-
uses: azure/login@v2
47-
with:
48-
client-id: ${{ secrets.AZURE_CLIENT_ID }}
49-
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
50-
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
51-
52-
- name: Azure ACR Login
53-
run: az acr login --name nemoci
54-
5542
- name: Checkout repo
5643
uses: actions/checkout@v4
5744
with:
@@ -61,7 +48,7 @@ jobs:
6148
env:
6249
GH_TOKEN: ${{ secrets.PAT }}
6350
run: |
64-
docker build -f docker/Dockerfile.ci --build-arg INFERENCE_FRAMEWORK=inframework --secret id=GH_TOKEN -t eval .
51+
docker build -f docker/Dockerfile.ci --build-arg PACKAGE=${{ matrix.package }} -t eval .
6552
6653
- name: Create bump branch if not exists
6754
run: |
@@ -81,7 +68,7 @@ jobs:
8168
run: |
8269
docker run \
8370
--rm \
84-
-v $(pwd):/workspace \
71+
-v $(pwd)/packages/${{ matrix.package }}:/workspace \
8572
-w /workspace \
8673
-e GH_TOKEN=${{ secrets.PAT }} \
8774
eval \
@@ -90,8 +77,8 @@ jobs:
9077
- name: Upload lock file
9178
uses: actions/upload-artifact@v4
9279
with:
93-
name: lock-file-${{ env.SOURCE_BRANCH }}
94-
path: uv.lock
80+
name: lock-file-${{ env.SOURCE_BRANCH }}-${{ matrix.package }}
81+
path: packages/${{ matrix.package }}/uv.lock
9582

9683
create-pr:
9784
needs: [update-lockfile, pre-flight]
@@ -128,8 +115,13 @@ jobs:
128115
129116
- name: Download lock file
130117
uses: actions/download-artifact@v4
131-
with:
132-
name: lock-file-${{ env.SOURCE_BRANCH }}
118+
119+
- name: Move files
120+
run: |
121+
mv lock-file-${{ needs.pre-flight.outputs.bump-branch }}-nemo-evaluator-launcher/uv.lock \
122+
packages/nemo-evaluator-launcher/uv.lock
123+
mv lock-file-${{ needs.pre-flight.outputs.bump-branch }}-nemo-evaluator/uv.lock \
124+
packages/nemo-evaluator/uv.lock
133125
134126
- name: Create Bump PR
135127
uses: peter-evans/create-pull-request@v6

.github/workflows/dependabot.yml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ jobs:
2424
grep -o 'r[0-9]\+\.[0-9]\+\.[0-9]\+' |
2525
sort -V |
2626
tail -n1)
27+
2728
echo "nemo_eval_release_branch=$latest_branch" >> $GITHUB_OUTPUT
2829
2930
bump-tags:
@@ -32,16 +33,13 @@ jobs:
3233
fail-fast: false
3334
matrix:
3435
include:
35-
- target-branch: ${{ needs.get-release-branch-names.outputs.eval }}
36+
# - target-branch: ${{ needs.get-release-branch-names.outputs.eval }} # re-enable this after the 25.11 code freeze
3637
- target-branch: main
3738
uses: ./.github/workflows/_update_dependencies.yml
3839
with:
3940
target-branch: ${{ matrix.target-branch }}
4041
secrets:
4142
PAT: ${{ secrets.PAT }}
42-
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
43-
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
44-
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
4543
SSH_KEY: ${{ secrets.SSH_KEY }}
4644
SSH_PWD: ${{ secrets.SSH_PWD }}
4745

@@ -59,5 +57,5 @@ jobs:
5957
run: |
6058
curl -X POST \
6159
-H 'Content-type: application/json' \
62-
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Mcore-bump-bot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
60+
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
6361
$SLACK_WEBHOOK

README.md

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@ NeMo Evaluator is an open-source platform for robust, reproducible, and scalable
1515
[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Eval/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md)
1616

1717
### Key Pillars
18-
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience.
19-
- **Reproducibility by Default** -- All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
20-
- **Scale Anywhere** -- Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
21-
- **State-of-the-Art Benchmarking** -- Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
22-
- **Extensible and Customizable** -- Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
18+
19+
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
20+
21+
- **Reproducibility by Default**: All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
22+
- **Scale Anywhere**: Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
23+
- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
24+
- **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
2325

2426
### How It Works: Launcher and Core Engine
2527

@@ -28,7 +30,7 @@ The platform consists of two main components:
2830
- **`nemo-evaluator` ([The Evaluation Core Engine](./docs/nemo-evaluator/index.md))**: A Python library that manages the interaction between an evaluation harness and the model being tested.
2931
- **`nemo-evaluator-launcher` ([The CLI and Orchestration](./docs/nemo-evaluator-launcher/index.md))**: The primary user interface and orchestration layer. It handles configuration, selects the execution environment, and launches the appropriate container to run the evaluation.
3032

31-
Most users only need to interact with the `nemo-evaluator-launcher` as universal gateway to different benchmarks and harnesses. It is however possible to interact directly with `nemo-evaluator` by following this [guide](./docs/nemo-evaluator/workflows/using-containers.md).
33+
Most users typically interact with `nemo-evaluator-launcher`, which serves as a universal gateway to different benchmarks and harnesses. However, it is also possible to interact directly with `nemo-evaluator` by following this [guide](./docs/nemo-evaluator/workflows/using-containers.md).
3234

3335
```mermaid
3436
graph TD
@@ -54,45 +56,52 @@ graph TD
5456
Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.
5557

5658
#### 1. Install the Launcher
59+
5760
The launcher is the only package required to get started.
5861

5962
```bash
6063
pip install nemo-evaluator-launcher
6164
```
6265

6366
#### 2. Set Up Your Model Endpoint
67+
6468
NeMo Evaluator works with any model that exposes an OpenAI-compatible endpoint. For this quickstart, we will use the OpenAI API.
6569

6670
**What is an OpenAI-compatible endpoint?** A server that exposes /v1/chat/completions and /v1/completions endpoints, matching the OpenAI API specification.
6771

6872
**Options for model endpoints:**
69-
- **Hosted endpoints** (fastest): Use ready-to-use hosted models from providers like build.nvidia.com that expose OpenAI-compatible APIs with no hosting required.
73+
74+
- **Hosted endpoints** (fastest): Use ready-to-use hosted models from providers like [build.nvidia.com](https://build.nvidia.com) that expose OpenAI-compatible APIs with no hosting required.
7075
- **Self-hosted options**: Host your own models using tools like NVIDIA NIM, vLLM, or TensorRT-LLM for full control over your evaluation environment.
7176

7277
For detailed setup instructions including self-hosted configurations, see the [tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md).
7378

7479
**Getting an NGC API Key for build.nvidia.com:**
80+
7581
To use out-of-the-box build.nvidia.com APIs, you need an API key:
76-
1. Register an account at [build.nvidia.com](https://build.nvidia.com)
77-
2. In the Setup menu under Keys/Secrets, generate an API key
78-
3. Set the environment variable by executing `export NGC_API_KEY=<<YOUR_API_KEY>>`
7982

83+
1. Register an account at [build.nvidia.com](https://build.nvidia.com).
84+
2. In the Setup menu under Keys/Secrets, generate an API key.
85+
3. Set the environment variable by executing `export NGC_API_KEY=<YOUR_API_KEY>`.
8086

8187
#### 3. Run Your First Evaluation
82-
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the yaml file.
88+
89+
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the YAML file.
8390

8491
**Configuration Examples**: Explore ready-to-use configuration files in [`packages/nemo-evaluator-launcher/examples/`](./packages/nemo-evaluator-launcher/examples/) for local, Lepton, and Slurm deployments with various model hosting options (vLLM, NIM, hosted endpoints).
8592

86-
Once you have the example configuration file (either by cloning this repository or downloading e.g. the `local_nvidia_nemotron_nano_9b_v2.yaml` file directly), you can run the following command:
93+
Once you have the example configuration file, either by cloning this repository or downloading one directly such as `local_nvidia_nemotron_nano_9b_v2.yaml`, you can run the following command:
94+
8795

8896
```bash
8997
nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/examples --config-name local_nvidia_nemotron_nano_9b_v2 --override execution.output_dir=<YOUR_OUTPUT_LOCAL_DIR>
9098
```
9199

92-
Upon running this command, you will be able to see a job_id, which can then be used for tracking the job and the reults with all the logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
100+
After running this command, you will see a `job_id`, which can be used to track the job and its results. All logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
93101

94102
#### 4. Check Your Results
95-
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding job id:
103+
104+
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding `job_id`:
96105

97106
```bash
98107
nemo-evaluator-launcher status <job_id_or_invocation_id>
@@ -101,15 +110,16 @@ nemo-evaluator-launcher status <job_id_or_invocation_id>
101110
#### Next Steps
102111

103112
- List all supported benchmarks:
104-
```bash
105-
nemo-evaluator-launcher ls tasks
106-
```
113+
114+
```bash
115+
nemo-evaluator-launcher ls tasks
116+
```
117+
107118
- Explore the [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
108119
- Scale up your evaluations using the [Slurm Executor](./docs/nemo-evaluator-launcher/executors/slurm.md) or [Lepton Executor](./docs/nemo-evaluator-launcher/executors/lepton.md).
109120
- Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
110121
- Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
111122

112-
113123
### Supported Benchmarks and Evaluation Harnesses
114124

115125
NeMo Evaluator Launcher provides pre-built evaluation containers for different evaluation harnesses through the NVIDIA NGC catalog. Each harness supports a variety of benchmarks, which can then be called via `nemo-evaluator`. This table provides a list of benchmark names per harness. A more detailed list of task names can be found in the [list of NGC containers](./docs/nemo-evaluator/index.md#ngc-containers).
@@ -134,8 +144,6 @@ NeMo Evaluator Launcher provides pre-built evaluation containers for different e
134144
| **tooltalk** | Tool usage evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) | `25.08.1` | ToolTalk |
135145
| **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.08.1` | AI2D, ChartQA, OCRBench, SlideVQA |
136146

137-
138-
139147
### Contribution Guide
140-
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.
141148

149+
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.

0 commit comments

Comments
 (0)