Skip to content

Commit 962817c

Browse files
Keep track of launched instance IDs in a file
1. The only place where instances are launched is `launch_ec2_instances`. Therefore, at the end of that function, this commit appends the launched instance IDs to a file to keep track of them. 2. `get_cluster_instances` used to rely solely on retrieving instances with the tags `parallelcluster:cluster-name` and `parallelcluster:node-type`. It ceases to function when users manually remove tags from instances. With this commit, in addition to getting instances by tags, it also gets instances from the file written by `launch_ec2_instances`. At the end of the function, this commit removes non-existing instances from the file. 3. Remove the argument `alive_states_only` for code simplicity. With this commit, cluster scaling should work with or without tags: the tag-based logic and the file-based logic are redundant with each other, which increases resilience. This commit is work-in-progress for the following reasons: 1. The code in `get_cluster_instances` could be simplified. 2. It requires changes in the CLI and the Cookbook: a. In the CLI, the IAM policy for the head node and the clean-up lambda is ``` { "Action": "ec2:TerminateInstances", "Resource": "*", "Effect": "Allow", "Sid": "EC2Terminate", "Condition": { "StringEquals": { "ec2:ResourceTag/parallelcluster:cluster-name": <cluster name> } }, }, ``` and it should be changed to ``` { "Action": "ec2:TerminateInstances", "Resource": "*", "Effect": "Allow", "Sid": "EC2Terminate", "Condition": { "StringEquals": { "ec2:ResourceTag/aws:ec2launchtemplate:id": [ <Launch template id 1>, <Launch template id 2>, ... It should contain all launch templates of compute and login nodes. ] } } }, ``` b. The Cookbook should create the file `/etc/parallelcluster/slurm_plugin/running_nodes` during the config stage and set its owner to `pcluster-admin:pcluster-admin`. This is necessary because the node package does not have permission to create the file.
1 parent e5f0f9c commit 962817c

File tree

3 files changed

+50
-23
lines changed

3 files changed

+50
-23
lines changed

src/slurm_plugin/clustermgtd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ def _get_ec2_instances(self):
649649
time.sleep(5)
650650
log.info("Retrieving list of EC2 instances associated with the cluster")
651651
try:
652-
return self._instance_manager.get_cluster_instances(include_head_node=False, alive_states_only=True)
652+
return self._instance_manager.get_cluster_instances(include_head_node=False)
653653
except Exception as e:
654654
log.error("Failed when getting instance info from EC2 with exception %s", e)
655655
raise ClusterManager.EC2InstancesInfoUnavailable

src/slurm_plugin/fleet_manager.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,15 @@ def launch_ec2_instances(self, count, job_id=None):
193193
launch_params = self._evaluate_launch_params(count)
194194
assigned_nodes = self._launch_instances(launch_params)
195195
if len(assigned_nodes.get("Instances")) > 0:
196+
instance_ids = [instance.get("InstanceId") for instance in assigned_nodes.get("Instances") if instance.get("InstanceId") ]
196197
logger.info(
197198
"Launched the following instances %s",
198-
print_with_count([instance.get("InstanceId", "") for instance in assigned_nodes.get("Instances")]),
199+
print_with_count(instance_ids),
199200
)
200201
logger.debug("Launched instances information: %s", assigned_nodes.get("Instances"))
202+
running_nodes_file_path = "/etc/parallelcluster/slurm_plugin/running_nodes"
203+
with open(running_nodes_file_path, "a") as f:
204+
f.write('\n'.join(instance_ids)+'\n')
201205

202206
return [EC2Instance.from_describe_instance_data(instance_info) for instance_info in assigned_nodes["Instances"]]
203207

src/slurm_plugin/instance_manager.py

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -258,46 +258,69 @@ def get_unhealthy_cluster_instance_status(self, cluster_instance_ids):
258258
return list(instance_health_states.values())
259259

260260
@log_exception(logger, "getting cluster instances from EC2", raise_on_error=True)
261-
def get_cluster_instances(self, include_head_node=False, alive_states_only=True):
261+
def get_cluster_instances(self, include_head_node=False):
262262
"""
263263
Get instances that are associated with the cluster.
264264
265265
Instances without all the info set are ignored and not returned
266266
"""
267+
running_instances_from_file = set()
268+
running_instances_file_path = "/etc/parallelcluster/slurm_plugin/running_nodes"
269+
with open(running_instances_file_path, "r") as f:
270+
running_instances_from_file.update(set([line.strip() for line in f.readlines() if line.strip()]))
271+
untracked_instances = running_instances_from_file.copy()
272+
267273
ec2_client = boto3.client("ec2", region_name=self._region, config=self._boto3_config)
268274
paginator = ec2_client.get_paginator("describe_instances")
269275
args = {
270276
"Filters": [{"Name": "tag:parallelcluster:cluster-name", "Values": [self._cluster_name]}],
271277
}
272-
if alive_states_only:
273-
args["Filters"].append({"Name": "instance-state-name", "Values": list(EC2_INSTANCE_ALIVE_STATES)})
278+
args["Filters"].append({"Name": "instance-state-name", "Values": list(EC2_INSTANCE_ALIVE_STATES)})
274279
if not include_head_node:
275280
args["Filters"].append({"Name": "tag:parallelcluster:node-type", "Values": ["Compute"]})
276281
response_iterator = paginator.paginate(PaginationConfig={"PageSize": BOTO3_PAGINATION_PAGE_SIZE}, **args)
277282
filtered_iterator = response_iterator.search("Reservations[].Instances[]")
278283

279284
instances = []
280285
for instance_info in filtered_iterator:
281-
try:
282-
private_ip, private_dns_name, all_private_ips = get_private_ip_address_and_dns_name(instance_info)
283-
instances.append(
284-
EC2Instance(
285-
instance_info["InstanceId"],
286-
private_ip,
287-
private_dns_name.split(".")[0],
288-
all_private_ips,
289-
instance_info["LaunchTime"],
290-
)
291-
)
292-
except Exception as e:
293-
logger.warning(
294-
"Ignoring instance %s because not all EC2 info are available, exception: %s, message: %s",
286+
untracked_instances.discard(instance_info["InstanceId"])
287+
self._create_ec2_instance_object(instance_info, instances)
288+
non_existing_instances = untracked_instances.copy()
289+
for instance_ids in self.chunks(list(untracked_instances),150):
290+
filters=[{"Name": "instance-id", "Values": instance_ids}, {"Name": "instance-state-name", "Values": list(EC2_INSTANCE_ALIVE_STATES)}]
291+
response_iterator = paginator.paginate(PaginationConfig={"PageSize": BOTO3_PAGINATION_PAGE_SIZE}, Filters=filters)
292+
filtered_iterator = response_iterator.search("Reservations[].Instances[]")
293+
for instance_info in filtered_iterator:
294+
non_existing_instances.discard(instance_info["InstanceId"])
295+
self._create_ec2_instance_object(instance_info, instances)
296+
with open(running_instances_file_path, "w") as f:
297+
f.write('\n'.join(list(running_instances_from_file - non_existing_instances))+'\n')
298+
return instances
299+
300+
301+
def chunks(self, lst, n):
302+
"""Yield successive n-sized chunks from lst."""
303+
for i in range(0, len(lst), n):
304+
yield lst[i:i + n]
305+
def _create_ec2_instance_object(self, instance_info, instances):
306+
try:
307+
private_ip, private_dns_name, all_private_ips = get_private_ip_address_and_dns_name(instance_info)
308+
instances.append(
309+
EC2Instance(
295310
instance_info["InstanceId"],
296-
type(e).__name__,
297-
e,
311+
private_ip,
312+
private_dns_name.split(".")[0],
313+
all_private_ips,
314+
instance_info["LaunchTime"],
298315
)
299-
300-
return instances
316+
)
317+
except Exception as e:
318+
logger.warning(
319+
"Ignoring instance %s because not all EC2 info are available, exception: %s, message: %s",
320+
instance_info["InstanceId"],
321+
type(e).__name__,
322+
e,
323+
)
301324

302325
def terminate_all_compute_nodes(self, terminate_batch_size):
303326
try:

0 commit comments

Comments
 (0)