Patch summary (free text before the first "diff --git" header):
  * nemo_launcher.toml: replaces the placeholder defaults for repository_url and
    docker_image_url with concrete values, updates repository_commit_hash, and
    switches the training value/default from gpt3/43b to gpt3/40b_improved.
  * __main__.py: a RuntimeError raised by asyncio.run(runner.run()) is logged
    (traceback plus message) instead of propagating.
  * slurm_runner.py, standalone_runner.py: stdout and stderr of the executed
    shell command are logged (INFO on job submission, DEBUG in the job-status
    check, matching the DEBUG command log that precedes it).
  * slurm_system.py: parse_sinfo_output passes the whole NODELIST field to
    parse_node_list() instead of splitting it on "," first.
  * test_slurm_system.py: the fixture uses node-115..node-122 and a new test
    feeds sinfo output containing bracketed, comma-separated node ranges.

diff --git a/conf/v0.6/general/test_template/nemo_launcher.toml b/conf/v0.6/general/test_template/nemo_launcher.toml
index 705afa4f2..cb8a7c1ca 100644
--- a/conf/v0.6/general/test_template/nemo_launcher.toml
+++ b/conf/v0.6/general/test_template/nemo_launcher.toml
@@ -3,15 +3,15 @@ name = "NeMoLauncher"
 [cmd_args]
   [cmd_args.repository_url]
   type = "str"
-  default = "NEMO_REPOSITORY_URL"
+  default = "https://github.com/NVIDIA/NeMo-Framework-Launcher.git"
 
   [cmd_args.repository_commit_hash]
   type = "str"
-  default = "6528780fba8185bf61e7c2396fdd2331ee5933a1"
+  default = "cf411a9ede3b466677df8ee672bcc6c396e71e1a"
 
   [cmd_args.docker_image_url]
   type = "str"
-  default = "DOCKER_IMAGE_URL"
+  default = "nvcr.io/nvidian/nemofw-training:24.01.01"
 
   [cmd_args.stages]
   type = "str"
@@ -33,8 +33,8 @@ name = "NeMoLauncher"
   default = "8"
 
   [cmd_args.training]
-  values = ["gpt3/43b"]
-  default = "gpt3/43b"
+  values = ["gpt3/40b_improved"]
+  default = "gpt3/40b_improved"
     [cmd_args.training.exp_manager]
       [cmd_args.training.exp_manager.create_checkpoint_callback]
       type = "bool"
diff --git a/src/cloudai/__main__.py b/src/cloudai/__main__.py
index f063bc0a4..49bf87f90 100644
--- a/src/cloudai/__main__.py
+++ b/src/cloudai/__main__.py
@@ -17,6 +17,7 @@
 import logging
 import os
 import sys
+import traceback
 
 from cloudai import Installer, Parser, ReportGenerator, Runner, SystemObjectUpdater
 
@@ -178,7 +179,11 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> None:
     test_scenario.pretty_print()
 
     runner = Runner(args.mode, system, test_scenario)
-    asyncio.run(runner.run())
+    try:
+        asyncio.run(runner.run())
+    except RuntimeError as e:
+        logging.error(traceback.format_exc())
+        logging.error(f"Error running asyncio loop: {e}")
 
     print(f"All test scenario results stored at: {runner.runner.output_path}")
 
diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py
index 91b41006b..ff3dc0d89 100644
--- a/src/cloudai/runner/slurm/slurm_runner.py
+++ b/src/cloudai/runner/slurm/slurm_runner.py
@@ -72,6 +72,8 @@ def _submit_test(self, test: Test) -> Optional[SlurmJob]:
         job_id = None
         if self.mode == "run":
             stdout, stderr = self.cmd_shell.execute(exec_cmd).communicate()
+            self.logger.info(f"\tstdout: {stdout}")
+            self.logger.info(f"\tstderr: {stderr}")
             job_id = test.get_job_id(stdout, stderr)
         else:
             job_id = 0
diff --git a/src/cloudai/runner/standalone/standalone_runner.py b/src/cloudai/runner/standalone/standalone_runner.py
index 4aecbf6a8..eb28c7063 100644
--- a/src/cloudai/runner/standalone/standalone_runner.py
+++ b/src/cloudai/runner/standalone/standalone_runner.py
@@ -104,7 +104,9 @@ def is_job_completed(self, job: BaseJob) -> bool:
         s_job = cast(StandaloneJob, job)
         command = f"ps -p {s_job.id}"
         self.logger.debug(f"Checking job status with command: {command}")
-        stdout = self.cmd_shell.execute(command).communicate()[0]
+        stdout, stderr = self.cmd_shell.execute(command).communicate()
+        self.logger.debug(f"\tstdout: {stdout}")
+        self.logger.debug(f"\tstderr: {stderr}")
         return str(s_job.id) not in stdout
 
     def kill_job(self, job: BaseJob):
diff --git a/src/cloudai/schema/system/slurm/slurm_system.py b/src/cloudai/schema/system/slurm/slurm_system.py
index 6e590f6d6..09f1cb34d 100644
--- a/src/cloudai/schema/system/slurm/slurm_system.py
+++ b/src/cloudai/schema/system/slurm/slurm_system.py
@@ -589,20 +589,18 @@ def parse_sinfo_output(self, sinfo_output: str, node_user_map: Dict[str, str]) -
             parts = line.split()
             partition, _, _, _, state, nodelist = parts[:6]
             partition = partition.rstrip("*")
-
-            node_groups = nodelist.split(",")
-            for node_group in node_groups:
-                node_names = self.parse_node_list([node_group.strip()])
-                state_enum = self.convert_state_to_enum(state)
-
-                for node_name in node_names:
-                    for part_name, nodes in self.partitions.items():
-                        if part_name != partition:
-                            continue
-                        for node in nodes:
-                            if node.name == node_name:
-                                node.state = state_enum
-                                node.user = node_user_map.get(node_name, "N/A")
+            node_names = self.parse_node_list([nodelist])
+
+            # Convert state to enum, handling states with suffixes
+            state_enum = self.convert_state_to_enum(state)
+            for node_name in node_names:
+                for part_name, nodes in self.partitions.items():
+                    if part_name != partition:
+                        continue
+                    for node in nodes:
+                        if node.name == node_name:
+                            node.state = state_enum
+                            node.user = node_user_map.get(node_name, "N/A")
 
     def convert_state_to_enum(self, state_str: str) -> SlurmNodeState:
         """
diff --git a/tests/test_slurm_system.py b/tests/test_slurm_system.py
index 5b98e9b96..997676d51 100644
--- a/tests/test_slurm_system.py
+++ b/tests/test_slurm_system.py
@@ -8,8 +8,14 @@
 @pytest.fixture
 def slurm_system():
     nodes = [
-        SlurmNode(name="nodeA001", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
-        SlurmNode(name="nodeB001", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-115", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-116", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-117", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-118", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-119", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-120", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-121", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-122", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
     ]
     system = SlurmSystem(
         name="test_system",
@@ -48,6 +54,22 @@ def test_parse_sinfo_output(slurm_system):
     slurm_system.parse_sinfo_output(sinfo_output, node_user_map)
     assert slurm_system.partitions["main"][0].state == SlurmNodeState.IDLE
     assert slurm_system.partitions["main"][1].state == SlurmNodeState.IDLE
+
+
+def test_parse_sinfo_output2(slurm_system):
+    """Bracketed, comma-separated NODELIST ranges set the state of each listed node."""
+    sinfo_output = """
+    PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
+    main up 3:00:00 1 inval node-081
+    main up 3:00:00 5 drain node-[065-066,114,124-125]
+    main up 3:00:00 2 resv node-[034-035]
+    main up 3:00:00 88 alloc node-[033,036-064,067-080,082-113,115-123,126-128]
+    backup up 12:00:00 16 alloc node-[01-16]
+    """
+    node_user_map = {'': 'user1', 'node-033': 'user2', 'node-[036-064': 'user3', '067-080': 'user3', '082-113': 'user3', '115-118]': 'user3', 'node-[119-123': 'user4', '126-128]': 'user4', 'node-01': 'user5', 'node-02': 'user5', 'node-03': 'user5', 'node-04': 'user5', 'node-05': 'user5', 'node-06': 'user5', 'node-07': 'user5', 'node-08': 'user5', 'node-09': 'user5', 'node-10': 'user5', 'node-11': 'user5', 'node-12': 'user5', 'node-13': 'user5', 'node-14': 'user5', 'node-15': 'user5', 'node-16': 'user5'}
+    slurm_system.parse_sinfo_output(sinfo_output, node_user_map)
+    assert slurm_system.partitions["main"][0].state == SlurmNodeState.ALLOCATED
+    assert slurm_system.partitions["main"][1].state == SlurmNodeState.ALLOCATED
 
 
 @patch("cloudai.schema.system.SlurmSystem.get_squeue")