Skip to content

Commit 84c264e

Browse files
authored
Merge branch 'main' into awarno/haproxy
Signed-off-by: AWarno <[email protected]>
2 parents 2d0e008 + 7ac93a3 commit 84c264e

File tree

15 files changed

+605
-82
lines changed

15 files changed

+605
-82
lines changed

.github/config/requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ sphinx
22
sphinx-autobuild # For live doc serving while editing docs
33
sphinx-autodoc2 # For documenting Python API
44
sphinx-copybutton # Adds a copy button for code blocks
5-
myst_parser # For our markdown docs
65
nvidia-sphinx-theme # Our NVIDIA theme
76
sphinxcontrib-mermaid # For mermaid diagrams
7+
myst-parser # For our markdown docs
8+
sphinx-design
9+
sphinxcontrib-mermaid
10+
swagger-plugin-for-sphinx

.github/workflows/_update_dependencies.yml

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,6 @@ on:
99
secrets:
1010
PAT:
1111
required: true
12-
AZURE_CLIENT_ID:
13-
required: true
14-
AZURE_TENANT_ID:
15-
required: true
16-
AZURE_SUBSCRIPTION_ID:
17-
required: true
1812
SSH_KEY:
1913
required: true
2014
SSH_PWD:
@@ -33,25 +27,18 @@ jobs:
3327

3428
update-lockfile:
3529
environment: nemo-ci
36-
runs-on: linux-amd64-cpu16
30+
runs-on: ubuntu-latest
3731
needs: [pre-flight]
32+
strategy:
33+
fail-fast: false
34+
matrix:
35+
include:
36+
- package: nemo-evaluator
37+
- package: nemo-evaluator-launcher
3838
env:
3939
SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
4040
TARGET_BRANCH: ${{ inputs.target-branch }}
4141
steps:
42-
- name: Install Azure CLI
43-
run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
44-
45-
- name: Azure Login
46-
uses: azure/login@v2
47-
with:
48-
client-id: ${{ secrets.AZURE_CLIENT_ID }}
49-
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
50-
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
51-
52-
- name: Azure ACR Login
53-
run: az acr login --name nemoci
54-
5542
- name: Checkout repo
5643
uses: actions/checkout@v4
5744
with:
@@ -61,7 +48,7 @@ jobs:
6148
env:
6249
GH_TOKEN: ${{ secrets.PAT }}
6350
run: |
64-
docker build -f docker/Dockerfile.ci --build-arg INFERENCE_FRAMEWORK=inframework --secret id=GH_TOKEN -t eval .
51+
docker build -f docker/Dockerfile.ci --build-arg PACKAGE=${{ matrix.package }} -t eval .
6552
6653
- name: Create bump branch if not exists
6754
run: |
@@ -81,7 +68,7 @@ jobs:
8168
run: |
8269
docker run \
8370
--rm \
84-
-v $(pwd):/workspace \
71+
-v $(pwd)/packages/${{ matrix.package }}:/workspace \
8572
-w /workspace \
8673
-e GH_TOKEN=${{ secrets.PAT }} \
8774
eval \
@@ -90,8 +77,8 @@ jobs:
9077
- name: Upload lock file
9178
uses: actions/upload-artifact@v4
9279
with:
93-
name: lock-file-${{ env.SOURCE_BRANCH }}
94-
path: uv.lock
80+
name: lock-file-${{ env.SOURCE_BRANCH }}-${{ matrix.package }}
81+
path: packages/${{ matrix.package }}/uv.lock
9582

9683
create-pr:
9784
needs: [update-lockfile, pre-flight]
@@ -128,8 +115,13 @@ jobs:
128115
129116
- name: Download lock file
130117
uses: actions/download-artifact@v4
131-
with:
132-
name: lock-file-${{ env.SOURCE_BRANCH }}
118+
119+
- name: Move files
120+
run: |
121+
mv lock-file-${{ needs.pre-flight.outputs.bump-branch }}-nemo-evaluator-launcher/uv.lock \
122+
packages/nemo-evaluator-launcher/uv.lock
123+
mv lock-file-${{ needs.pre-flight.outputs.bump-branch }}-nemo-evaluator/uv.lock \
124+
packages/nemo-evaluator/uv.lock
133125
134126
- name: Create Bump PR
135127
uses: peter-evans/create-pull-request@v6

.github/workflows/dependabot.yml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ jobs:
2424
grep -o 'r[0-9]\+\.[0-9]\+\.[0-9]\+' |
2525
sort -V |
2626
tail -n1)
27+
2728
echo "nemo_eval_release_branch=$latest_branch" >> $GITHUB_OUTPUT
2829
2930
bump-tags:
@@ -32,16 +33,13 @@ jobs:
3233
fail-fast: false
3334
matrix:
3435
include:
35-
- target-branch: ${{ needs.get-release-branch-names.outputs.eval }}
36+
# - target-branch: ${{ needs.get-release-branch-names.outputs.eval }} # re-enable this after the 25.11 code freeze
3637
- target-branch: main
3738
uses: ./.github/workflows/_update_dependencies.yml
3839
with:
3940
target-branch: ${{ matrix.target-branch }}
4041
secrets:
4142
PAT: ${{ secrets.PAT }}
42-
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
43-
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
44-
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
4543
SSH_KEY: ${{ secrets.SSH_KEY }}
4644
SSH_PWD: ${{ secrets.SSH_PWD }}
4745

@@ -59,5 +57,5 @@ jobs:
5957
run: |
6058
curl -X POST \
6159
-H 'Content-type: application/json' \
62-
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Mcore-bump-bot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
60+
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
6361
$SLACK_WEBHOOK

README.md

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@ NeMo Evaluator is an open-source platform for robust, reproducible, and scalable
1515
[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Eval/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md)
1616

1717
### Key Pillars
18-
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience.
19-
- **Reproducibility by Default** -- All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
20-
- **Scale Anywhere** -- Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
21-
- **State-of-the-Art Benchmarking** -- Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
22-
- **Extensible and Customizable** -- Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
18+
19+
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
20+
21+
- **Reproducibility by Default**: All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
22+
- **Scale Anywhere**: Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
23+
- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
24+
- **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
2325

2426
### How It Works: Launcher and Core Engine
2527

@@ -28,7 +30,7 @@ The platform consists of two main components:
2830
- **`nemo-evaluator` ([The Evaluation Core Engine](./docs/nemo-evaluator/index.md))**: A Python library that manages the interaction between an evaluation harness and the model being tested.
2931
- **`nemo-evaluator-launcher` ([The CLI and Orchestration](./docs/nemo-evaluator-launcher/index.md))**: The primary user interface and orchestration layer. It handles configuration, selects the execution environment, and launches the appropriate container to run the evaluation.
3032

31-
Most users only need to interact with the `nemo-evaluator-launcher` as universal gateway to different benchmarks and harnesses. It is however possible to interact directly with `nemo-evaluator` by following this [guide](./docs/nemo-evaluator/workflows/using-containers.md).
33+
Most users typically interact with `nemo-evaluator-launcher`, which serves as a universal gateway to different benchmarks and harnesses. However, it is also possible to interact directly with `nemo-evaluator` by following this [guide](./docs/nemo-evaluator/workflows/using-containers.md).
3234

3335
```mermaid
3436
graph TD
@@ -54,45 +56,52 @@ graph TD
5456
Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.
5557

5658
#### 1. Install the Launcher
59+
5760
The launcher is the only package required to get started.
5861

5962
```bash
6063
pip install nemo-evaluator-launcher
6164
```
6265

6366
#### 2. Set Up Your Model Endpoint
67+
6468
NeMo Evaluator works with any model that exposes an OpenAI-compatible endpoint. For this quickstart, we will use the OpenAI API.
6569

6670
**What is an OpenAI-compatible endpoint?** A server that exposes /v1/chat/completions and /v1/completions endpoints, matching the OpenAI API specification.
6771

6872
**Options for model endpoints:**
69-
- **Hosted endpoints** (fastest): Use ready-to-use hosted models from providers like build.nvidia.com that expose OpenAI-compatible APIs with no hosting required.
73+
74+
- **Hosted endpoints** (fastest): Use ready-to-use hosted models from providers like [build.nvidia.com](https://build.nvidia.com) that expose OpenAI-compatible APIs with no hosting required.
7075
- **Self-hosted options**: Host your own models using tools like NVIDIA NIM, vLLM, or TensorRT-LLM for full control over your evaluation environment.
7176

7277
For detailed setup instructions including self-hosted configurations, see the [tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md).
7378

7479
**Getting an NGC API Key for build.nvidia.com:**
80+
7581
To use out-of-the-box build.nvidia.com APIs, you need an API key:
76-
1. Register an account at [build.nvidia.com](https://build.nvidia.com)
77-
2. In the Setup menu under Keys/Secrets, generate an API key
78-
3. Set the environment variable by executing `export NGC_API_KEY=<<YOUR_API_KEY>>`
7982

83+
1. Register an account at [build.nvidia.com](https://build.nvidia.com).
84+
2. In the Setup menu under Keys/Secrets, generate an API key.
85+
3. Set the environment variable by executing `export NGC_API_KEY=<YOUR_API_KEY>`.
8086

8187
#### 3. Run Your First Evaluation
82-
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the yaml file.
88+
89+
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the YAML file.
8390

8491
**Configuration Examples**: Explore ready-to-use configuration files in [`packages/nemo-evaluator-launcher/examples/`](./packages/nemo-evaluator-launcher/examples/) for local, Lepton, and Slurm deployments with various model hosting options (vLLM, NIM, hosted endpoints).
8592

86-
Once you have the example configuration file (either by cloning this repository or downloading e.g. the `local_nvidia_nemotron_nano_9b_v2.yaml` file directly), you can run the following command:
93+
Once you have the example configuration file, either by cloning this repository or downloading one directly such as `local_nvidia_nemotron_nano_9b_v2.yaml`, you can run the following command:
94+
8795

8896
```bash
8997
nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/examples --config-name local_nvidia_nemotron_nano_9b_v2 --override execution.output_dir=<YOUR_OUTPUT_LOCAL_DIR>
9098
```
9199

92-
Upon running this command, you will be able to see a job_id, which can then be used for tracking the job and the reults with all the logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
100+
After running this command, you will see a `job_id`, which can be used to track the job and its results. All logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
93101

94102
#### 4. Check Your Results
95-
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding job id:
103+
104+
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding `job_id`:
96105

97106
```bash
98107
nemo-evaluator-launcher status <job_id_or_invocation_id>
@@ -101,15 +110,16 @@ nemo-evaluator-launcher status <job_id_or_invocation_id>
101110
#### Next Steps
102111

103112
- List all supported benchmarks:
104-
```bash
105-
nemo-evaluator-launcher ls tasks
106-
```
113+
114+
```bash
115+
nemo-evaluator-launcher ls tasks
116+
```
117+
107118
- Explore the [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
108119
- Scale up your evaluations using the [Slurm Executor](./docs/nemo-evaluator-launcher/executors/slurm.md) or [Lepton Executor](./docs/nemo-evaluator-launcher/executors/lepton.md).
109120
- Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
110121
- Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
111122

112-
113123
### Supported Benchmarks and Evaluation Harnesses
114124

115125
NeMo Evaluator Launcher provides pre-built evaluation containers for different evaluation harnesses through the NVIDIA NGC catalog. Each harness supports a variety of benchmarks, which can then be called via `nemo-evaluator`. This table provides a list of benchmark names per harness. A more detailed list of task names can be found in the [list of NGC containers](./docs/nemo-evaluator/index.md#ngc-containers).
@@ -134,8 +144,6 @@ NeMo Evaluator Launcher provides pre-built evaluation containers for different e
134144
| **tooltalk** | Tool usage evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) | `25.08.1` | ToolTalk |
135145
| **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.08.1` | AI2D, ChartQA, OCRBench, SlideVQA |
136146

137-
138-
139147
### Contribution Guide
140-
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.
141148

149+
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.

0 commit comments

Comments
 (0)