Skip to content

Commit aa87ded

Browse files
authored
Merge branch 'main' into awarno/haproxy
2 parents 06e6a85 + 7996cbe commit aa87ded

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+1885
-510
lines changed

.github/workflows/build-docs.yml

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ on:
2323

2424
jobs:
2525
pre-flight:
26-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
26+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
2727

2828
build-docs:
2929
needs: [pre-flight]
@@ -38,10 +38,27 @@ jobs:
3838
if: |
3939
(
4040
needs.pre-flight.outputs.is_deployment_workflow == 'true'
41-
|| success()
41+
|| always()
4242
)
4343
&& !cancelled()
4444
runs-on: ubuntu-latest
4545
steps:
46-
- name: Result
47-
run: echo Build docs successful
46+
- name: Get workflow result
47+
id: result
48+
shell: bash -x -e -u -o pipefail {0}
49+
env:
50+
GH_TOKEN: ${{ github.token }}
51+
RUN_ID: ${{ github.run_id }}
52+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
53+
run: |
54+
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
55+
56+
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
57+
echo "✅ All previous jobs completed successfully"
58+
exit 0
59+
else
60+
echo "❌ Found $FAILED_JOBS failed job(s)"
61+
# Show which jobs failed
62+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
63+
exit 1
64+
fi

.github/workflows/build-test-publish-wheel.yml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ defaults:
2828

2929
jobs:
3030
pre-flight:
31-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
31+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
3232

3333
build-test-publish-wheel:
3434
needs: [pre-flight]
@@ -92,10 +92,21 @@ jobs:
9292
(
9393
needs.pre-flight.outputs.docs_only == 'true'
9494
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
95-
|| success()
95+
|| always()
9696
)
9797
&& !cancelled()
9898
runs-on: ubuntu-latest
9999
steps:
100100
- name: Result
101-
run: echo Build test publish wheel successful
101+
run: |
102+
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
103+
104+
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
105+
echo "✅ All previous jobs completed successfully"
106+
exit 0
107+
else
108+
echo "❌ Found $FAILED_JOBS failed job(s)"
109+
# Show which jobs failed
110+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
111+
exit 1
112+
fi

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ permissions:
3131

3232
jobs:
3333
pre-flight:
34-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
34+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
3535

3636
linting:
3737
runs-on: ubuntu-latest

.github/workflows/copyright-check.yml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ on:
2323

2424
jobs:
2525
pre-flight:
26-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
26+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
2727

2828
copyright-check:
2929
needs: [pre-flight]
@@ -38,10 +38,21 @@ jobs:
3838
(
3939
needs.pre-flight.outputs.docs_only == 'true'
4040
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
41-
|| success()
41+
|| always()
4242
)
4343
&& !cancelled()
4444
runs-on: ubuntu-latest
4545
steps:
4646
- name: Result
47-
run: echo Copyright check successful
47+
run: |
48+
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
49+
50+
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
51+
echo "✅ All previous jobs completed successfully"
52+
exit 0
53+
else
54+
echo "❌ Found $FAILED_JOBS failed job(s)"
55+
# Show which jobs failed
56+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
57+
exit 1
58+
fi

.github/workflows/install-test.yml

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ env:
2828

2929
jobs:
3030
pre-flight:
31-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0
31+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
3232

3333
pip-test:
3434
runs-on: ubuntu-latest
@@ -74,9 +74,26 @@ jobs:
7474
(
7575
needs.pre-flight.outputs.docs_only == 'true'
7676
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
77-
|| success()
77+
|| always()
7878
)
7979
&& !cancelled()
8080
steps:
81-
- name: Result
82-
run: echo Install check successful
81+
- name: Get workflow result
82+
id: result
83+
shell: bash -x -e -u -o pipefail {0}
84+
env:
85+
GH_TOKEN: ${{ github.token }}
86+
RUN_ID: ${{ github.run_id }}
87+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
88+
run: |
89+
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
90+
91+
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
92+
echo "✅ All previous jobs completed successfully"
93+
exit 0
94+
else
95+
echo "❌ Found $FAILED_JOBS failed job(s)"
96+
# Show which jobs failed
97+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
98+
exit 1
99+
fi

README.md

Lines changed: 85 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# NeMo Evaluator
22

3-
[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://github.com/NVIDIA-NeMo/Eval/blob/main/LICENSE)
3+
[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/LICENSE)
44
[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-green)](https://www.python.org/downloads/)
5-
[![Tests](https://github.com/NVIDIA-NeMo/Eval/actions/workflows/cicd-main.yml/badge.svg)](https://github.com/NVIDIA-NeMo/Eval/actions/workflows/cicd-main.yml)
5+
[![Tests](https://github.com/NVIDIA-NeMo/Evaluator/actions/workflows/cicd-main.yml/badge.svg)](https://github.com/NVIDIA-NeMo/Evaluator/actions/workflows/cicd-main.yml)
66
[![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
77
[![nemo-evaluator PyPI version](https://img.shields.io/pypi/v/nemo-evaluator.svg)](https://pypi.org/project/nemo-evaluator/)
88
[![nemo-evaluator PyPI downloads](https://img.shields.io/pypi/dm/nemo-evaluator.svg)](https://pypi.org/project/nemo-evaluator/)
@@ -12,9 +12,9 @@
1212

1313
NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.
1414

15-
[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Eval/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md)
15+
[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [NeMo FW model evaluations](#-evaluate-checkpoints-trained-by-nemo-framework) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md)
1616

17-
### Key Pillars
17+
## Key Pillars
1818

1919
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
2020

@@ -23,7 +23,7 @@ NeMo Evaluator is built on four core principles to provide a reliable and versat
2323
- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
2424
- **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
2525

26-
### How It Works: Launcher and Core Engine
26+
## How It Works: Launcher and Core Engine
2727

2828
The platform consists of two main components:
2929

@@ -51,7 +51,7 @@ graph TD
5151
```
5252

5353

54-
### 🚀 Quickstart
54+
## 🚀 Quickstart
5555

5656
Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.
5757

@@ -63,7 +63,7 @@ The launcher is the only package required to get started.
6363
pip install nemo-evaluator-launcher
6464
```
6565

66-
#### 2. Set Up Your Model Endpoint
66+
### 2. Set Up Your Model Endpoint
6767

6868
NeMo Evaluator works with any model that exposes an OpenAI-compatible endpoint. For this quickstart, we will use the OpenAI API.
6969

@@ -84,7 +84,7 @@ To use out-of-the-box build.nvidia.com APIs, you need an API key:
8484
2. In the Setup menu under Keys/Secrets, generate an API key.
8585
3. Set the environment variable by executing `export NGC_API_KEY=<YOUR_API_KEY>`.
8686

87-
#### 3. Run Your First Evaluation
87+
### 3. Run Your First Evaluation
8888

8989
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the YAML file.
9090

@@ -99,15 +99,15 @@ nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/exampl
9999

100100
After running this command, you will see a `job_id`, which can be used to track the job and its results. All logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
101101

102-
#### 4. Check Your Results
102+
### 4. Check Your Results
103103

104104
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding `job_id`:
105105

106106
```bash
107107
nemo-evaluator-launcher status <job_id_or_invocation_id>
108108
```
109109

110-
#### Next Steps
110+
### Next Steps
111111

112112
- List all supported benchmarks:
113113

@@ -120,7 +120,65 @@ nemo-evaluator-launcher status <job_id_or_invocation_id>
120120
- Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
121121
- Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
122122

123-
### Supported Benchmarks and Evaluation Harnesses
123+
124+
## 🧩 Evaluate checkpoints trained by NeMo Framework
125+
126+
The NeMo Framework is NVIDIA’s GPU-accelerated, end-to-end training platform for large language models (LLMs), multimodal models, and speech models. It enables seamless scaling of both pretraining and post-training workloads, from a single GPU to clusters with thousands of nodes, supporting Hugging Face/PyTorch and Megatron models. NeMo includes a suite of libraries and curated training recipes to help users build models from start to finish.
127+
128+
The NeMo Evaluator is integrated within NeMo Framework, offering streamlined deployment and advanced evaluation capabilities for models trained using NeMo, leveraging state-of-the-art evaluation harnesses.
129+
130+
### Features
131+
132+
- **Multi-Backend Deployment**: Supports PyTriton and multi-instance evaluations using the Ray Serve deployment backend
133+
- **Production-Ready**: Supports high-performance inference with CUDA graphs and flash decoding
134+
- **Multi-GPU and Multi-Node Support**: Enables distributed inference across multiple GPUs and compute nodes
135+
- **OpenAI-Compatible API**: Provides RESTful endpoints aligned with OpenAI API specifications
136+
137+
### 1. Start NeMo Framework Container
138+
139+
For optimal performance and user experience, use the latest version of the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags). Please fetch the most recent `$TAG` and run the following command to start a container:
140+
141+
```bash
142+
docker run --rm -it -w /workdir -v $(pwd):/workdir \
143+
--entrypoint bash \
144+
--gpus all \
145+
nvcr.io/nvidia/nemo:${TAG}
146+
```
147+
148+
### 2. Deploy a Model
149+
150+
```bash
151+
# Deploy a NeMo checkpoint
152+
python \
153+
/opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_inframework.py \
154+
--nemo_checkpoint "/path/to/your/checkpoint" \
155+
--model_id megatron_model \
156+
--port 8080 \
157+
--host 0.0.0.0
158+
```
159+
160+
### 3. Evaluate the Model
161+
162+
```python
163+
from nemo_evaluator.api import evaluate
164+
from nemo_evaluator.api.api_dataclasses import ApiEndpoint, EvaluationConfig, EvaluationTarget
165+
166+
# Configure evaluation
167+
api_endpoint = ApiEndpoint(
168+
url="http://0.0.0.0:8080/v1/completions/",
169+
type="completions",
170+
model_id="megatron_model"
171+
)
172+
target = EvaluationTarget(api_endpoint=api_endpoint)
173+
config = EvaluationConfig(type="gsm8k", output_dir="results")
174+
175+
# Run evaluation
176+
results = evaluate(target_cfg=target, eval_cfg=config)
177+
print(results)
178+
```
179+
180+
181+
## 📊 Supported Benchmarks and Evaluation Harnesses
124182

125183
NeMo Evaluator Launcher provides pre-built evaluation containers for different evaluation harnesses through the NVIDIA NGC catalog. Each harness supports a variety of benchmarks, which can then be called via `nemo-evaluator`. This table provides a list of benchmark names per harness. A more detailed list of task names can be found in the [list of NGC containers](./docs/nemo-evaluator/index.md#ngc-containers).
126184

@@ -144,6 +202,20 @@ NeMo Evaluator Launcher provides pre-built evaluation containers for different e
144202
| **tooltalk** | Tool usage evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) | `25.08.1` | ToolTalk |
145203
| **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.08.1` | AI2D, ChartQA, OCRBench, SlideVQA |
146204

147-
### Contribution Guide
148205

149-
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.
206+
207+
## 🤝 Contribution Guide
208+
209+
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.
210+
211+
212+
## 📄 License
213+
214+
This project is licensed under the Apache License 2.0. See the [LICENSE](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/LICENSE) file for details.
215+
216+
217+
## 📞 Support
218+
219+
- **Issues**: [GitHub Issues](https://github.com/NVIDIA-NeMo/Evaluator/issues)
220+
- **Discussions**: [GitHub Discussions](https://github.com/NVIDIA-NeMo/Evaluator/discussions)
221+
- **Documentation**: [NeMo Evaluator Documentation](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/docs/index.md)

docs/index.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ graph TD
2929
B -- " " --> D{Slurm};
3030
B -- " " --> E{Lepton};
3131
subgraph Execution Environment
32-
C -- "Launches Container" --> F[Evaluation Container];
32+
C -- "Launches Container" --> F[Evaluation Container];
3333
D -- "Launches Container" --> F;
3434
E -- "Launches Container" --> F;
3535
end
@@ -96,6 +96,10 @@ Results, logs, and run configurations are saved locally. Inspect the status of t
9696
nemo-evaluator-launcher status <job_id_or_invocation_id>
9797
```
9898

99+
/// note | About invocation and job IDs
100+
You can use shortened IDs in the `status` command, for example `abcd` instead of the full `abcdef0123456`, or `ab.0` instead of `abcdef0123456.0`, as long as the prefix is unambiguous (there are no collisions). This is syntactic sugar that makes the command slightly easier to use.
101+
///
102+
99103
## Next Steps
100104

101105
- List all supported benchmarks:
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Generic Deployment
2+
3+
Generic deployment provides flexible configuration for deploying any custom server that isn't covered by built-in deployment configurations.
4+
5+
## Configuration
6+
7+
See [Generic Config](../../../../packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/generic.yaml) for all available parameters.
8+
9+
Key arguments:
10+
- **`image`**: Docker image to use for deployment (required)
11+
- **`command`**: Command to run the server with template variables (required)
12+
- **`served_model_name`**: Name of the served model (required)
13+
- **`endpoints`**: API endpoint paths (chat, completions, health)
14+
- **`checkpoint_path`**: Path to model checkpoint for mounting (default: null)
15+
- **`extra_args`**: Additional command line arguments
16+
- **`env_vars`**: Environment variables as {name: value} dict
17+
18+
## Best Practices
19+
- Ensure the server responds to the health check endpoint (verify that the health endpoint is correctly parametrized)
20+
- Test configuration with `--dry_run`
21+
22+
## Contributing Permanent Configurations
23+
24+
If you've successfully applied the generic deployment to serve a specific model or framework, contributions are welcome! We'll turn your working configuration into a permanent config file for the community.

docs/nemo-evaluator-launcher/configuration/deployment/index.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Deployment configurations define how to provision and host model endpoints for e
77
- **[vLLM](vllm.md)**: High-performance LLM serving with optimized attention
88
- **[SGLang](sglang.md)**: Structured generation with efficient memory usage
99
- **[NIM](nim.md)**: NVIDIA-optimized inference microservices
10+
- **[Generic](generic.md)**: Custom server deployment with flexible configuration
1011
- **[None](none.md)**: Use existing endpoints (no deployment)
1112

1213
## Quick Reference
@@ -21,8 +22,17 @@ deployment:
2122
- **vLLM**: General-purpose LLM serving
2223
- **SGLang**: General-purpose LLM serving
2324
- **NIM**: NVIDIA hardware optimized deployments
25+
- **Generic**: Custom servers not covered by built-in configs
2426
- **None**: Existing endpoints
2527

28+
## Custom Server Integration
29+
30+
**Need to deploy a server not covered by built-in configs?**
31+
32+
**Quick integration**: Use [Generic deployment](generic.md) for any Docker-based server with an OpenAI-compatible API.
33+
34+
**Advanced integration**: Create a custom deployment template in `configs/deployment/` for reusable configurations.
35+
2636
## Configuration Files
2737

2838
See all available deployment configurations: [Deployment Configs](../../../../packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment)

0 commit comments

Comments
 (0)