Skip to content

Commit

Permalink
Allow using an online container for the NeMo framework
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffnvidia committed May 22, 2024
1 parent 8971bc7 commit af0f3be
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,14 @@ def _parse_slurm_args(
) -> Dict[str, Any]:
base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, nodes)

image_path = os.path.join(
self.install_path,
NcclTestSlurmInstallStrategy.SUBDIR_PATH,
NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)
if os.path.isfile(cmd_args["docker_image_url"]):
image_path = cmd_args["docker_image_url"]
else:
image_path = os.path.join(
self.install_path,
NcclTestSlurmInstallStrategy.SUBDIR_PATH,
NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)

container_mounts = ""
if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,15 @@ def gen_exec_command(
nodes = self.slurm_system.parse_nodes(nodes)
if nodes:
self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes))
self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"]
if os.path.isfile(self.final_cmd_args["docker_image_url"]):
self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"]
else:
self.final_cmd_args["container"] = os.path.join(
self.install_path,
NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH,
NeMoLauncherSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)

del self.final_cmd_args["repository_url"]
del self.final_cmd_args["repository_commit_hash"]
del self.final_cmd_args["docker_image_url"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,10 @@ def is_installed(self) -> bool:
docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME)
repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME)
repo_installed = os.path.isdir(repo_path)
docker_image_installed = os.path.isfile(docker_image_path)
if not os.path.isfile(self.docker_image_url):
docker_image_installed = os.path.isfile(docker_image_path)
else:
docker_image_installed = True

data_dir_path = self.default_cmd_args["data_dir"]
datasets_ready = self._check_datasets_on_nodes(data_dir_path)
Expand Down Expand Up @@ -142,7 +145,8 @@ def install(self) -> None:
)

self._clone_repository(subdir_path)
self._setup_docker_image(self.slurm_system, subdir_path)
if not os.path.isfile(self.docker_image_url):
self._setup_docker_image(self.slurm_system, subdir_path)

def _check_install_path_access(self):
"""
Expand Down

0 comments on commit af0f3be

Please sign in to comment.