Skip to content

Commit

Permalink
Add NeMoLauncherKubernetesCommandGenStrategy
Browse files Browse the repository at this point in the history
  • Loading branch information
TaekyungHeo committed Sep 30, 2024
1 parent 9fdb8fa commit c074ebb
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
from .schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
from .schema.test_template.nccl_test.template import NcclTest
from .schema.test_template.nemo_launcher.grading_strategy import NeMoLauncherGradingStrategy
from .schema.test_template.nemo_launcher.kubernetes_command_gen_strategy import NeMoLauncherKubernetesCommandGenStrategy
from .schema.test_template.nemo_launcher.kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy
from .schema.test_template.nemo_launcher.report_generation_strategy import NeMoLauncherReportGenerationStrategy
from .schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
Expand Down Expand Up @@ -128,6 +129,9 @@
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
Registry().add_strategy(
CommandGenStrategy, [KubernetesSystem], [NeMoLauncher], NeMoLauncherKubernetesCommandGenStrategy
)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Dict, List

from cloudai import CommandGenStrategy

from .kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy


class NeMoLauncherKubernetesCommandGenStrategy(CommandGenStrategy):
"""Command generation strategy for NeMo Megatron Launcher on Kubernetes systems."""

def gen_exec_command(
self,
cmd_args: Dict[str, str],
extra_env_vars: Dict[str, str],
extra_cmd_args: str,
output_path: Path,
num_nodes: int,
nodes: List[str],
) -> str:
final_env_vars = {**self.system.global_env_vars, **extra_env_vars}

launcher_path = (
self.system.install_path
/ NeMoLauncherKubernetesInstallStrategy.SUBDIR_PATH
/ NeMoLauncherKubernetesInstallStrategy.REPOSITORY_NAME
)

self.final_cmd_args = {**self.default_cmd_args, **cmd_args}
self.final_cmd_args["launcher_scripts_path"] = str(launcher_path / "launcher_scripts")

self.final_cmd_args.update({f"env_vars.{key}": value for key, value in final_env_vars.items()})

self.final_cmd_args["cluster"] = self.final_cmd_args.pop("cluster.value", "")
self.final_cmd_args["training"] = self.final_cmd_args.pop("training.values", "")

for key in ["repository_url", "repository_commit_hash", "docker_image_url"]:
self.final_cmd_args.pop(key, None)

if self.final_cmd_args.get("data_dir") == "DATA_DIR":
raise ValueError(
"The 'data_dir' field of the NeMo launcher test contains the placeholder 'DATA_DIR'. "
"Please update the test schema TOML file with a valid path to the dataset."
)

cmd_args_str = self._generate_cmd_args_str(self.final_cmd_args)

full_cmd = f"python {launcher_path}/launcher_scripts/main.py {cmd_args_str}"

if extra_cmd_args:
full_cmd += f" {extra_cmd_args}"

env_vars_str = " ".join(f"{key}={value}" for key, value in final_env_vars.items())
return f"{env_vars_str} {full_cmd}".strip() if env_vars_str else full_cmd.strip()

def _generate_cmd_args_str(self, args: Dict[str, str]) -> str:
"""
Generate a string of command-line arguments, wrapping values in quotes when necessary.
Args:
args (Dict[str, str]): The command-line arguments.
Returns:
str: A string of command-line arguments.
"""
cmd_arg_str_parts = []
env_var_str_parts = []
special_chars = ["[", "]", "\\"]

for key, value in args.items():
if any(char in value for char in special_chars):
value = f'"{value}"'

if key.startswith("env_vars."):
env_var_str_parts.append(f"+{key}={value}")
else:
cmd_arg_str_parts.append(f"{key}={value}")

return " ".join(cmd_arg_str_parts + env_var_str_parts)
26 changes: 26 additions & 0 deletions src/cloudai/test_definitions/nemo_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,35 @@ class NumaMapping(BaseModel):
enable: bool = True


class PersistentVolumeClaim(BaseModel):
"""Persistent volume claim configuration."""

model_config = ConfigDict(extra="forbid")
claim_name: str = "nemo-workspace"


class ClusterWorkspace(BaseModel):
"""Cluster workspace configuration."""

model_config = ConfigDict(extra="forbid")
persistent_volume_claim: PersistentVolumeClaim = Field(default_factory=PersistentVolumeClaim)
mount_path: str = "/nemo-workspace"


class ClusterVolume(BaseModel):
"""Cluster volume configuration."""

model_config = ConfigDict(extra="forbid")
workspace: ClusterWorkspace = Field(default_factory=ClusterWorkspace)


class Cluster(BaseModel):
"""Cluster configuration."""

model_config = ConfigDict(extra="forbid")
value: str = "k8s_v2"
gpus_per_node: int = 8
volumes: ClusterVolume = Field(default_factory=ClusterVolume)


class ExpManager(BaseModel):
Expand All @@ -50,6 +74,8 @@ class Trainer(BaseModel):
val_check_interval: int = 100
log_every_n_steps: Literal["1", "2"] = "1"
enable_checkpointing: bool = False
num_nodes: int = 3
devices: int = 8


class TrainingModelData(BaseModel):
Expand Down
144 changes: 144 additions & 0 deletions tests/test_kubernetes_command_gen_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import pytest
from cloudai.schema.test_template.nemo_launcher.kubernetes_command_gen_strategy import (
NeMoLauncherKubernetesCommandGenStrategy,
)
from cloudai.systems.kubernetes import KubernetesSystem


@pytest.fixture
def kube_config_tempfile():
kube_config_content = """
apiVersion: v1
kind: Config
clusters:
- cluster:
server: https://127.0.0.1:6443
name: local-cluster
contexts:
- context:
cluster: local-cluster
user: local-user
name: local-context
current-context: local-context
users:
- name: local-user
user:
token: fake-token
"""

home_dir = Path.home()
kube_config_dir = home_dir / ".kube"
kube_config_path = kube_config_dir / "config"

kube_config_dir.mkdir(parents=True, exist_ok=True)

with kube_config_path.open("w") as config_file:
config_file.write(kube_config_content)

yield kube_config_path


@pytest.fixture
def k8s_system(kube_config_tempfile):
system_data = {
"name": "test-system",
"install_path": "/fake/install/path",
"output_path": "/fake/output/path",
"kube_config_path": kube_config_tempfile,
"default_namespace": "default",
"default_image": "test-image",
"scheduler": "kubernetes",
"global_env_vars": {},
"monitor_interval": 1,
}

k8s_system = KubernetesSystem(**system_data)
k8s_system.model_post_init(None)

validated_system = KubernetesSystem.model_validate(system_data)

yield validated_system


class TestNeMoLauncherKubernetesCommandGenStrategy__GenExecCommand:
@pytest.fixture
def nemo_cmd_gen(self, k8s_system: KubernetesSystem) -> NeMoLauncherKubernetesCommandGenStrategy:
cmd_args = {"test_arg": "test_value"}
strategy = NeMoLauncherKubernetesCommandGenStrategy(k8s_system, cmd_args)
strategy.system = k8s_system
strategy.default_cmd_args = {
"repository_url": "mock_repo_url",
"repository_commit_hash": "mock_commit_hash",
"docker_image_url": "mock_docker_image",
"data_dir": "mock_data_dir",
"training.values": "mock_training_values",
"training.model.data.data_prefix": "\\mock_prefix",
}
return strategy

def test_custom_gen_exec_command(self, nemo_cmd_gen: NeMoLauncherKubernetesCommandGenStrategy):
extra_env_vars = {}
cmd_args = {
"launcher_scripts_path": "/fake/install/path/NeMoLauncherKubernetesInstallStrategy/launcher_scripts",
"data_dir": "/nemo-workspace/pile",
"cluster.value": "k8s_v2",
"cluster.volumes.workspace.persistent_volume_claim.claim_name": "nemo-workspace",
"cluster.volumes.workspace.mount_path": "/nemo-workspace",
"stages": "[training]",
"training.values": "gpt3/1b",
"training.exp_manager.explicit_log_dir": "/nemo-workspace/gpt3/1b/training_gpt3/1b/results",
"training.model.data.data_prefix": "[0.5,/nemo-workspace/pile/my-gpt3_00_text_document]",
"training.trainer.num_nodes": "3",
"training.trainer.devices": "8",
"training.trainer.max_steps": "1000",
"training.trainer.val_check_interval": "100",
"training.model.global_batch_size": "48",
}

cmd = nemo_cmd_gen.gen_exec_command(
cmd_args=cmd_args,
extra_env_vars=extra_env_vars,
extra_cmd_args="",
output_path=Path("/fake/output/path"),
num_nodes=3,
nodes=["node1", "node2", "node3"],
)

expected_parts = [
"python /fake/install/path/NeMo-Launcher/NeMo-Launcher/launcher_scripts/main.py",
"data_dir=/nemo-workspace/pile",
'training.model.data.data_prefix="[0.5,/nemo-workspace/pile/my-gpt3_00_text_document]"',
"launcher_scripts_path=/fake/install/path/NeMo-Launcher/NeMo-Launcher/launcher_scripts",
"cluster=k8s_v2",
"cluster.volumes.workspace.persistent_volume_claim.claim_name=nemo-workspace",
"cluster.volumes.workspace.mount_path=/nemo-workspace",
'stages="[training]"',
"training.exp_manager.explicit_log_dir=/nemo-workspace/gpt3/1b/training_gpt3/1b/results",
"training.trainer.num_nodes=3",
"training.trainer.devices=8",
"training.trainer.max_steps=1000",
"training.trainer.val_check_interval=100",
"training.model.global_batch_size=48",
"training=gpt3/1b",
]

for part in expected_parts:
assert part in cmd, f"Part '{part}' was not found in the generated command."

0 comments on commit c074ebb

Please sign in to comment.