diff --git a/CHANGELOG.md b/CHANGELOG.md index 12231c2bdf..0c4b161880 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS ParallelCluster cookbook. +3.15.0 +------ + +**CHANGES** +1. Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes + and achieve better performance at scale. + + 3.14.0 ------ diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb index 0f0ad87850..8ef48ad12d 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb @@ -32,6 +32,7 @@ dcv_port: node['cluster']['dcv_port'], dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'], dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'], - dcv_auth_user: node['cluster']['dcv']['authenticator']['user'] + dcv_auth_user: node['cluster']['dcv']['authenticator']['user'], + cfnhup_enabled: cfnhup_enabled? ) end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb index 0434761402..7d49664e10 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb @@ -57,6 +57,28 @@ end end + context "when head node and cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'HeadNode' + node.override['cluster']['dcv_enabled'] = 'head_node' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true) + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + .with_content("[program:clustermgtd]") + .with_content("[program:clusterstatusmgtd]") + .with_content("[program:pcluster_dcv_authenticator]") + .with_content("--port 8444") + end + end + context "when compute fleet" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -77,6 +99,25 @@ .with_content("[program:pcluster_dcv_authenticator]") end end + + context "when compute fleet with cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:computemgtd]") + + is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + end + end context "when login node and dcv configured" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -109,12 +150,32 @@ it 'has the correct content' do is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") .with_content("[program:loginmgtd]") is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') .with_content("[program:pcluster_dcv_authenticator]") end end + + context "when login node with cfn-hup disabled on fleet" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'LoginNode' + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'has the correct content' do + is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:loginmgtd]") + + is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') + .with_content("[program:cfn-hup]") + end + end end end end diff --git a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb index 61fc1aaf73..e98a755235 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb @@ -1,8 +1,7 @@ # Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%> # Local modifications could be overwritten. <%# HeadNode, ComputeFleet, LoginNode -%> -<% case node['cluster']['node_type'] -%> -<% when 'HeadNode', 'ComputeFleet', 'LoginNode' -%> +<% if @cfnhup_enabled -%> [program:cfn-hup] command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh autorestart = true diff --git a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb index def13a134a..4bbfffd820 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb @@ -34,3 +34,6 @@ # Default NFS mount options default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime' + +# Cluster Updates +default['cluster']['in_place_update_on_fleet_enabled'] = 'true' diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb index ce3a27532f..37e0114051 100644 --- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb @@ -106,3 +106,14 @@ def wait_sync_file(path) timeout 5 end end + +def cfnhup_enabled? + # cfn-hup is always enabled on the head node, as it is required to perform cluster updates. + # cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that + # live updates on compute and login nodes are not possible. + node['cluster']['node_type'] == 'HeadNode' || node['cluster']['in_place_update_on_fleet_enabled'] == 'true' +end + +def cluster_readiness_check_on_update_enabled? + node['cluster']['in_place_update_on_fleet_enabled'] == 'true' +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb new file mode 100644 index 0000000000..e9d180d5e0 --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb @@ -0,0 +1,42 @@ +require_relative '../../../libraries/helpers' +require 'spec_helper' + +describe 'cfnhup_enabled?' do + let(:node) { Chef::Node.new } + + context 'when node type is HeadNode' do + before { node.override['cluster']['node_type'] = 'HeadNode' } + + it 'returns true regardless of in_place_update_on_fleet_enabled setting' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + expect(cfnhup_enabled?).to be true + end + end + + %w(ComputeFleet LoginNode).each do |node_type| + context "when node type is #{node_type}" do + before { node.override['cluster']['node_type'] = node_type } + + it 'returns true when in_place_update_on_fleet_enabled is true' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'true' + expect(cfnhup_enabled?).to be true + end + + it 'returns false when in_place_update_on_fleet_enabled is false' do + node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' + expect(cfnhup_enabled?).to be false + end + end + end +end + +describe 'cluster_readiness_check_on_update_enabled?' do + let(:node) { Chef::Node.new } + + [true, false].each do |in_place_update_on_fleet_enabled| + it "returns #{in_place_update_on_fleet_enabled} when in_place_update_on_fleet_enabled is #{in_place_update_on_fleet_enabled}" do + node.override['cluster']['in_place_update_on_fleet_enabled'] = in_place_update_on_fleet_enabled.to_s + expect(cluster_readiness_check_on_update_enabled?).to be in_place_update_on_fleet_enabled + end + end +end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 9b63a4a4b6..76ba95362a 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -272,7 +272,7 @@ def update_nodes_in_queue(strategy, queues) chef_sleep '15' -wait_cluster_ready +wait_cluster_ready if cluster_readiness_check_on_update_enabled? execute 'start clustermgtd' do command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb index 38b894c013..f2f53d13d4 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb @@ -15,6 +15,7 @@ allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(are_mount_or_unmount_required) allow_any_instance_of(Object).to receive(:dig).and_return(true) allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) + allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(true) RSpec::Mocks.configuration.allow_message_expectations_on_nil = true node.override['cluster']['stack_name'] = cluster_name @@ -58,6 +59,27 @@ end end end + + context 'when cluster readiness check is disabled' do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false) + allow_any_instance_of(Object).to receive(:dig).and_return(true) + allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) + allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(false) + RSpec::Mocks.configuration.allow_message_expectations_on_nil = true + + node.override['cluster']['stack_name'] = cluster_name + node.override['cluster']['region'] = region + node.override['cluster']['cluster_config_version'] = cluster_config_version + node.override['cluster']['scripts_dir'] = scripts_dir + end + runner.converge(described_recipe) + end + it 'does not check cluster readiness' do + is_expected.not_to run_execute("Check cluster readiness") + end + end end end end