Skip to content

Commit e7dcdfb

Browse files
committed
nodewatcher - torque: remove max 20 nodes at a time
Torque takes about 2 seconds per node when deleting multiple nodes at once. Limit the number of nodes removed in a single command to 20 so that each command completes within its timeout. Signed-off-by: Francesco De Martino <[email protected]>
1 parent e6ac373 commit e7dcdfb

File tree

2 files changed

+31
-12
lines changed

2 files changed

+31
-12
lines changed

src/common/schedulers/torque_commands.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,24 @@ def delete_nodes(hosts):
106106
# rerun the job.
107107
if hosts:
108108
run_command(TORQUE_BIN_DIR + "pbsnodes -o {0}".format(" ".join(hosts)), raise_on_error=False, log_error=False)
109-
return _qmgr_manage_nodes(
110-
operation="delete",
111-
hosts=hosts,
112-
error_messages_to_ignore=[
113-
"Unknown node",
114-
"The server was unable to communicate with the MOM to requeue or delete the job."
115-
" The node has been deleted and all jobs on the node have been purged.",
116-
],
117-
)
109+
# Process at most 20 concurrent deletions at a time since the required time linearly depends
110+
# on the number of nodes that we try to remove
111+
succeeded_hosts = set()
112+
chunk_size = 20
113+
for i in range(0, len(hosts), chunk_size):
114+
succeeded_hosts.update(
115+
_qmgr_manage_nodes(
116+
operation="delete",
117+
hosts=hosts[i : i + chunk_size],
118+
error_messages_to_ignore=[
119+
"Unknown node",
120+
"The server was unable to communicate with the MOM to requeue or delete the job."
121+
" The node has been deleted and all jobs on the node have been purged.",
122+
],
123+
)
124+
)
125+
126+
return succeeded_hosts
118127

119128

120129
def update_cluster_limits(max_nodes, node_slots):

tests/common/schedulers/test_torque_commands.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1010
# limitations under the License.
1111
import subprocess
12+
from unittest.mock import call
1213

1314
import pytest
1415

@@ -81,6 +82,7 @@ def test_add_nodes(qmgr_output, hosts, expected_succeeded_hosts, mocker):
8182
["ip-10-0-1-57", "ip-10-0-0-155"],
8283
["ip-10-0-1-57", "ip-10-0-0-155"],
8384
),
85+
("", ["ip-10-0-0-" + str(i) for i in range(0, 88)], ["ip-10-0-0-" + str(i) for i in range(0, 88)]),
8486
],
8587
ids=[
8688
"all_successful",
@@ -91,6 +93,7 @@ def test_add_nodes(qmgr_output, hosts, expected_succeeded_hosts, mocker):
9193
"no_nodes",
9294
"exception",
9395
"ignored_exception",
96+
"88_nodes",
9497
],
9598
)
9699
def test_delete_nodes(qmgr_output, hosts, expected_succeeded_hosts, mocker):
@@ -106,9 +109,16 @@ def test_delete_nodes(qmgr_output, hosts, expected_succeeded_hosts, mocker):
106109
succeeded_hosts = delete_nodes(hosts)
107110

108111
if hosts:
109-
qmgr_mock.assert_called_with(
110-
'/opt/torque/bin/qmgr -c "delete node {0} "'.format(",".join(hosts)), log_error=False
111-
)
112+
chunk_size = 20
113+
calls = []
114+
for i in range(0, len(hosts), chunk_size):
115+
calls.append(
116+
call(
117+
'/opt/torque/bin/qmgr -c "delete node {0} "'.format(",".join(hosts[i : i + chunk_size])),
118+
log_error=False,
119+
)
120+
)
121+
qmgr_mock.assert_has_calls(calls)
112122
pbsnodes_mock.assert_called_with(
113123
"/opt/torque/bin/pbsnodes -o {0}".format(" ".join(hosts)), log_error=False, raise_on_error=False
114124
)

0 commit comments

Comments
 (0)