|  | 
| 56 | 56 | # or some combination thereof. | 
| 57 | 57 | # Refer to qstat man page for additional details. | 
| 58 | 58 | # o(rphaned) is not considered as busy since we assume a node in orphaned state is not present in ASG anymore | 
| 59 |  | -SGE_BUSY_STATES = ["u", "C", "s", "d", "D", "E", "P"] | 
|  | 59 | +SGE_BUSY_STATES = ["u", "C", "s", "D", "E", "P"] | 
|  | 60 | + | 
|  | 61 | +# This state is set by nodewatcher when the node is locked and is being terminated. | 
|  | 62 | +SGE_DISABLED_STATE = "d" | 
| 60 | 63 | 
 | 
| 61 | 64 | # If an o(rphaned) state is displayed for a queue instance, it indicates that the queue instance is no longer demanded | 
| 62 | 65 | # by the current cluster queue configuration or the host group configuration. The queue instance is kept because jobs | 
| @@ -133,10 +136,11 @@ def remove_hosts_from_queue(hosts): | 
| 133 | 136 | def install_sge_on_compute_nodes(hosts, cluster_user): | 
| 134 | 137 |     """Start sge on compute nodes in parallel.""" | 
| 135 | 138 |     command = ( | 
| 136 |  | -        "sudo sh -c 'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf'" | 
|  | 139 | +        "sudo sh -c 'ps aux | grep [s]ge_execd || " | 
|  | 140 | +        "(cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf)'" | 
| 137 | 141 |     ).format(sge.SGE_ROOT) | 
| 138 | 142 |     hostnames = [host.hostname for host in hosts] | 
| 139 |  | -    result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user) | 
|  | 143 | +    result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user, timeout=20) | 
| 140 | 144 | 
 | 
| 141 | 145 |     succeeded_hosts = [] | 
| 142 | 146 |     for host in hosts: | 
| @@ -206,6 +210,7 @@ def get_jobs_info(hostname_filter=None, job_state_filter=None): | 
| 206 | 210 | def get_pending_jobs_info(max_slots_filter=None, skip_if_state=None): | 
| 207 | 211 |     """ | 
| 208 | 212 |     Retrieve the list of pending jobs. | 
|  | 213 | +
 | 
| 209 | 214 |     :param max_slots_filter: discard jobs that require a number of slots bigger than the given value | 
| 210 | 215 |     :param skip_if_state: discard jobs that are in the given state | 
| 211 | 216 |     :return: the list of filtered pending jobs. | 
|  | 
0 commit comments