NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.
NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
- **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
## How It Works: Launcher and Core Engine
The platform consists of two main components: the launcher (`nemo-evaluator-launcher`), which orchestrates evaluation jobs across execution backends such as your local machine, Slurm, or Lepton, and the core engine (`nemo-evaluator`), which runs the benchmarks inside the evaluation containers.
## 🚀 Quickstart
Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.
### 1. Install the Launcher

The launcher is the only package required to get started.

```bash
pip install nemo-evaluator-launcher
```
### 2. Set Up Your Model Endpoint
NeMo Evaluator works with any model that exposes an OpenAI-compatible endpoint. For this quickstart, we will use the OpenAI API.
To use out-of-the-box build.nvidia.com APIs, you need an API key:

1. Sign in at [build.nvidia.com](https://build.nvidia.com).
2. In the Setup menu under Keys/Secrets, generate an API key.
3. Set the environment variable by executing `export NGC_API_KEY=<YOUR_API_KEY>`.
### 3. Run Your First Evaluation
Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The list of benchmarks is directly configured in the YAML file.
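A sketch of the full run command; the example config directory comes from the launcher repository, while the config name and the `-o` override syntax are assumptions (check `nemo-evaluator-launcher run --help` for the exact flags):

```bash
nemo-evaluator-launcher run \
  --config-dir packages/nemo-evaluator-launcher/examples \
  --config-name <YOUR_CONFIG> \
  -o execution.output_dir=<YOUR_OUTPUT_LOCAL_DIR>
```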
After running this command, you will see a `job_id`, which can be used to track the job and its results. All logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.
### 4. Check Your Results
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding `job_id`:
```bash
nemo-evaluator-launcher status <job_id_or_invocation_id>
```
### Next Steps
- List all supported benchmarks:
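  For example (the exact subcommand is an assumption; check `nemo-evaluator-launcher --help` for the current form):

  ```bash
  nemo-evaluator-launcher ls tasks
  ```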
- Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
- Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
## 🧩 Evaluate Checkpoints Trained by NeMo Framework
The NeMo Framework is NVIDIA’s GPU-accelerated, end-to-end training platform for large language models (LLMs), multimodal models, and speech models. It enables seamless scaling of both pretraining and post-training workloads, from a single GPU to clusters with thousands of nodes, supporting Hugging Face/PyTorch and Megatron models. NeMo includes a suite of libraries and curated training recipes to help users build models from start to finish.
NeMo Evaluator is integrated within the NeMo Framework, offering streamlined deployment and advanced evaluation capabilities for models trained with NeMo, backed by state-of-the-art evaluation harnesses.
### Features
- **Multi-Backend Deployment**: Supports PyTriton and multi-instance evaluations using the Ray Serve deployment backend
- **Production-Ready**: Supports high-performance inference with CUDA graphs and flash decoding
- **Multi-GPU and Multi-Node Support**: Enables distributed inference across multiple GPUs and compute nodes
- **OpenAI-Compatible API**: Provides RESTful endpoints aligned with the OpenAI API specification
### 1. Start NeMo Framework Container
For optimal performance and user experience, use the latest version of the [NeMo Framework container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags). Please fetch the most recent `$TAG` and run the following command to start a container:
```bash
# The rest of this command was truncated in the source; the GPU flag and image
# path below are reconstructed assumptions. Substitute the latest $TAG from the NGC catalog.
docker run --rm -it -w /workdir -v $(pwd):/workdir \
  --gpus all nvcr.io/nvidia/nemo:$TAG bash
```
## 📊 Supported Benchmarks and Evaluation Harnesses
NeMo Evaluator Launcher provides pre-built evaluation containers for different evaluation harnesses through the NVIDIA NGC catalog. Each harness supports a variety of benchmarks, which can then be called via `nemo-evaluator`. This table provides a list of benchmark names per harness. A more detailed list of task names can be found in the [list of NGC containers](./docs/nemo-evaluator/index.md#ngc-containers).
| Harness | Description | Container | Latest Tag | Benchmarks |
|---|---|---|---|---|
| **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.08.1` | AI2D, ChartQA, OCRBench, SlideVQA |
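As an example, a harness container can be pulled ahead of a run; the `nvcr.io` path below is inferred from the catalog link and tag above, and is an assumption rather than a documented pull path:

```bash
# Assumed pull path derived from the NGC catalog org/team/name and the tag column.
docker pull nvcr.io/nvidia/eval-factory/vlmevalkit:25.08.1
```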
## 🤝 Contribution Guide
We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.
## 📄 License
This project is licensed under the Apache License 2.0. See the [LICENSE](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/LICENSE) file for details.
---

**`docs/index.md`**
The execution flow from the launcher's backend selection to the evaluation container:

```mermaid
graph TD
    B -- " " --> D{Slurm};
    B -- " " --> E{Lepton};
    subgraph Execution Environment
        C -- "Launches Container" --> F[Evaluation Container];
        D -- "Launches Container" --> F;
        E -- "Launches Container" --> F;
    end
```
Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job:

```bash
nemo-evaluator-launcher status <job_id_or_invocation_id>
```
/// note | About invocation and job IDs
It is possible to use a short version of an ID in the `status` command, for example `abcd` instead of the full `abcdef0123456`, or `ab.0` instead of `abcdef0123456.0`, as long as the short form is unambiguous. This is syntactic sugar that makes the command slightly easier to use.
///
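For instance, with the hypothetical IDs from the note above, these calls are equivalent:

```bash
nemo-evaluator-launcher status abcdef0123456.0  # full job ID
nemo-evaluator-launcher status ab.0             # short form, valid while unambiguous
```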
---

**`generic.md`**

Generic deployment provides flexible configuration for deploying any custom server that isn't covered by the built-in deployment configurations.
## Configuration
See [Generic Config](../../../../packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/generic.yaml) for all available parameters.
Key arguments (illustrated in the example config after this list):
- **`image`**: Docker image to use for deployment (required)
- **`command`**: Command to run the server, with template variables (required)
- **`served_model_name`**: Name of the served model (required)
- **`endpoints`**: API endpoint paths (chat, completions, health)
- **`checkpoint_path`**: Path to the model checkpoint to mount (default: null)
- **`extra_args`**: Additional command-line arguments
- **`env_vars`**: Environment variables as a `{name: value}` dict
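A minimal sketch of a generic deployment config: only the key names come from the documented arguments above, while every value (image, command template, paths) is a hypothetical example for a vLLM-style OpenAI-compatible server. See the linked `generic.yaml` for the authoritative schema:

```yaml
deployment:
  type: generic                       # assumed discriminator for the generic template
  image: vllm/vllm-openai:latest      # any Docker image exposing an OpenAI-compatible API
  command: >-                         # template variables are assumptions
    vllm serve {checkpoint_path}
    --served-model-name {served_model_name}
  served_model_name: my-model
  checkpoint_path: /models/my-model   # mounted into the container (default: null)
  endpoints:
    chat: /v1/chat/completions
    completions: /v1/completions
    health: /health
  extra_args: "--max-model-len 8192"  # appended to the server command
  env_vars:
    HF_TOKEN: ${HF_TOKEN}
```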
## Best Practices
- Ensure the server responds on its health check endpoint (and that the `health` endpoint path is correctly parametrized)
- Test your configuration with `--dry_run` (see the example below)
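A sketch of a dry run; the `run` invocation mirrors the quickstart, and placing `--dry_run` on it is an assumption (check `nemo-evaluator-launcher run --help`):

```bash
nemo-evaluator-launcher run --config-dir <YOUR_CONFIG_DIR> --config-name <YOUR_CONFIG> --dry_run
```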
## Contributing Permanent Configurations
If you've successfully applied the generic deployment to serve a specific model or framework, contributions are welcome! We'll turn your working configuration into a permanent config file for the community.
---

**Deployment options overview**

- **[Generic](generic.md)**: Custom server deployment with flexible configuration
- **[None](none.md)**: Use existing endpoints (no deployment)
## Quick Reference
- **vLLM**: General-purpose LLM serving
- **SGLang**: General-purpose LLM serving
- **NIM**: NVIDIA hardware-optimized deployments
- **Generic**: Custom servers not covered by built-in configs
- **None**: Existing endpoints
## Custom Server Integration
**Need to deploy a server not covered by built-in configs?**
**Quick integration**: Use [Generic deployment](generic.md) for any Docker-based server with an OpenAI-compatible API.
**Advanced integration**: Create a custom deployment template in `configs/deployment/` for reusable configurations.
## Configuration Files
See all available deployment configurations: [Deployment Configs](../../../../packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment)