Skip to content

Commit d36b557

Browse files
committed
AC-1243 Added configurable resource constraints on argo message sending pod containers
Signed-off-by: Alex Woodhead <[email protected]>
1 parent 43a1cfb commit d36b557

File tree

7 files changed

+109
-17
lines changed

7 files changed

+109
-17
lines changed

docs/changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,5 @@ Renamed the (Generated) Argo client to ArgoGeneratedClient, added a new ArgoClie
2222
Enhanced the ArgoClient -> Argo_Get_WorkflowLogsAsync method to decode the json better and make the logs extracted from Argo more readable.
2323

2424
Added Mongo Migraions, to allow changes without breaking current stored entries
25+
26+
Added resource constraints to the generated ARGO templates, so these can operate within a Kubernetes cluster with a Resource Quota in the ARGO namespace.

src/Shared/Configuration/TaskManagerConfiguration.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,29 @@ public class ArgoPluginArguments
5050
{
5151
[ConfigurationKeyName("server_url")]
5252
public string ServerUrl { get; set; } = string.Empty;
53+
54+
[ConfigurationKeyName("initContainerCpuLimit")]
55+
public string InitContainerCpuLimit { get; set; } = "1";
56+
57+
[ConfigurationKeyName("initContainerMemoryLimit")]
58+
public string InitContainerMemoryLimit { get; set; } = "500Mi";
59+
60+
[ConfigurationKeyName("waitContainerCpuLimit")]
61+
public string WaitContainerCpuLimit { get; set; } = "1";
62+
63+
[ConfigurationKeyName("waitContainerMemoryLimit")]
64+
public string WaitContainerMemoryLimit { get; set; } = "500Mi";
65+
66+
[ConfigurationKeyName("messageGeneratorContainerCpuLimit")]
67+
public string MessageGeneratorContainerCpuLimit { get; set; } = "1";
68+
69+
[ConfigurationKeyName("messageGeneratorContainerMemoryLimit")]
70+
public string MessageGeneratorContainerMemoryLimit { get; set; } = "500Mi";
71+
72+
[ConfigurationKeyName("messageSenderContainerCpuLimit")]
73+
public string MessageSenderContainerCpuLimit { get; set; } = "1";
74+
75+
[ConfigurationKeyName("messageSenderContainerMemoryLimit")]
76+
public string MessageSenderContainerMemoryLimit { get; set; } = "500Mi";
5377
}
5478
}

src/Shared/Configuration/WorkflowManagerOptions.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,11 @@ public class WorkflowManagerOptions
6060
/// </summary>
6161
public BackgroundServiceSettings BackgroundServiceSettings { get; set; }
6262

63-
[ConfigurationKeyName("argoTtlStatergySeconds")]
64-
public int ArgoTtlStatergySeconds { get; set; } = 60 * 60 * 24 * 2; // 2 days before the pods get automatically cleaned up from argo
63+
[ConfigurationKeyName("argoTtlStrategySeconds")]
64+
public int ArgoTtlStrategySeconds { get; set; } = 60 * 60 * 24 * 2; // 2 days before the pods get automatically cleaned up from argo
6565

66-
[ConfigurationKeyName("minArgoTtlStatergySeconds")]
67-
public int MinArgoTtlStatergySeconds { get; set; } = 30; // time to get logs before cleanup !
66+
[ConfigurationKeyName("minArgoTtlStrategySeconds")]
67+
public int MinArgoTtlStrategySeconds { get; set; } = 30; // time to get logs before cleanup !
6868

6969
public WorkflowManagerOptions()
7070
{

src/TaskManager/Plug-ins/Argo/ArgoPlugin.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -618,16 +618,16 @@ private void SetArgoTtlStratergy(Workflow workflow)
618618
{
619619
if (workflow.Spec.TtlStrategy is null)
620620
{
621-
workflow.Spec.TtlStrategy = new TTLStrategy { SecondsAfterCompletion = _options.Value.ArgoTtlStatergySeconds };
621+
workflow.Spec.TtlStrategy = new TTLStrategy { SecondsAfterCompletion = _options.Value.ArgoTtlStrategySeconds };
622622
}
623623
else
624624
{
625625
if (workflow.Spec.TtlStrategy.SecondsAfterCompletion.HasValue)
626-
workflow.Spec.TtlStrategy.SecondsAfterCompletion = Math.Max(_options.Value.MinArgoTtlStatergySeconds, workflow.Spec.TtlStrategy.SecondsAfterCompletion.Value);
626+
workflow.Spec.TtlStrategy.SecondsAfterCompletion = Math.Max(_options.Value.MinArgoTtlStrategySeconds, workflow.Spec.TtlStrategy.SecondsAfterCompletion.Value);
627627
if (workflow.Spec.TtlStrategy.SecondsAfterSuccess.HasValue)
628-
workflow.Spec.TtlStrategy.SecondsAfterSuccess = Math.Max(_options.Value.MinArgoTtlStatergySeconds, workflow.Spec.TtlStrategy.SecondsAfterSuccess.Value);
628+
workflow.Spec.TtlStrategy.SecondsAfterSuccess = Math.Max(_options.Value.MinArgoTtlStrategySeconds, workflow.Spec.TtlStrategy.SecondsAfterSuccess.Value);
629629
if (workflow.Spec.TtlStrategy.SecondsAfterFailure.HasValue)
630-
workflow.Spec.TtlStrategy.SecondsAfterFailure = Math.Max(_options.Value.MinArgoTtlStatergySeconds, workflow.Spec.TtlStrategy.SecondsAfterFailure.Value);
630+
workflow.Spec.TtlStrategy.SecondsAfterFailure = Math.Max(_options.Value.MinArgoTtlStrategySeconds, workflow.Spec.TtlStrategy.SecondsAfterFailure.Value);
631631
}
632632
RemovePodGCStratergy(workflow);
633633
}

src/TaskManager/Plug-ins/Argo/ExitHookTemplate.cs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,18 @@ public Template2 GenerateMessageTemplate(S3Artifact2 artifact)
7474
Container = new Container2
7575
{
7676
Image = Strings.ExitHookGenerateMessageContainerImage,
77+
Resources = new ResourceRequirements
78+
{
79+
Limits = new Dictionary<string, string>
80+
{
81+
{"cpu", _options.TaskManager.ArgoPluginArguments.MessageGeneratorContainerCpuLimit},
82+
{"memory", _options.TaskManager.ArgoPluginArguments.MessageGeneratorContainerMemoryLimit}
83+
}
84+
},
7785
Command = new List<string> { "/bin/sh", "-c" },
7886
Args = new List<string> { $"echo \"{{{{inputs.parameters.message}}}}\" > {Strings.ExitHookOutputPath}{_messageFileName}; cat {Strings.ExitHookOutputPath}{_messageFileName};" }
7987
},
88+
PodSpecPatch = "{\"initContainers\":[{\"name\":\"init\",\"resources\":{\"limits\":{\"cpu\":\"" + _options.TaskManager.ArgoPluginArguments.InitContainerCpuLimit + "\",\"memory\": \"" + _options.TaskManager.ArgoPluginArguments.InitContainerMemoryLimit + "\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}],\"containers\":[{\"name\":\"wait\",\"resources\":{\"limits\":{\"cpu\":\"" + _options.TaskManager.ArgoPluginArguments.WaitContainerCpuLimit + "\",\"memory\":\"" + _options.TaskManager.ArgoPluginArguments.WaitContainerMemoryLimit + "\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}]}",
8089
Outputs = new Outputs
8190
{
8291
Artifacts = new List<Artifact>()
@@ -155,6 +164,10 @@ public Template2 GenerateSendTemplate(S3Artifact2 artifact)
155164
Container = new Container2
156165
{
157166
Image = _options.TaskManager.ArgoExitHookSendMessageContainerImage,
167+
Resources = new ResourceRequirements
168+
{
169+
Limits = new Dictionary<string, string>{ {"cpu", _options.TaskManager.ArgoPluginArguments.MessageSenderContainerCpuLimit}, {"memory", _options.TaskManager.ArgoPluginArguments.MessageSenderContainerMemoryLimit} }
170+
},
158171
Command = new List<string> { "/rabtap" },
159172
Args = new List<string> {
160173
"pub",
@@ -164,7 +177,8 @@ public Template2 GenerateSendTemplate(S3Artifact2 artifact)
164177
"--delay=0s",
165178
"--confirms",
166179
"--mandatory" }
167-
}
180+
},
181+
PodSpecPatch = "{\"initContainers\":[{\"name\":\"init\",\"resources\":{\"limits\":{\"cpu\":\"" + _options.TaskManager.ArgoPluginArguments.InitContainerCpuLimit + "\",\"memory\": \"" + _options.TaskManager.ArgoPluginArguments.InitContainerMemoryLimit + "\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}],\"containers\":[{\"name\":\"wait\",\"resources\":{\"limits\":{\"cpu\":\"" + _options.TaskManager.ArgoPluginArguments.WaitContainerCpuLimit + "\",\"memory\":\"" + _options.TaskManager.ArgoPluginArguments.WaitContainerMemoryLimit + "\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}]}",
168182
};
169183
}
170184
}

src/TaskManager/TaskManager/appsettings.json

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
},
66
"WorkflowManager": {
77
"taskTimeoutMinutes": 60,
8-
"argoTtlStatergySeconds": 172800, // (60*60*24*2) 2 days before the pods get automatically cleaned up from argo,
9-
"minArgoTtlStatergySeconds": 30,
8+
"argoTtlStrategySeconds": 172800, // (60*60*24*2) 2 days before the pods get automatically cleaned up from argo,
9+
"minArgoTtlStrategySeconds": 30,
1010
"endpointSettings": {
1111
"defaultPageSize": 10,
1212
"maxPageSize": 1000000
@@ -25,7 +25,15 @@
2525
"test": "Monai.Deploy.WorkflowManager.TaskManager.TestPlugin.Repositories.TestPluginRepository, Monai.Deploy.WorkflowManager.TaskManager.TestPlugin"
2626
},
2727
"argoPluginArguments": {
28-
"server_url": "http://argo-argo-workflows-server.argo:2746"
28+
"server_url": "http://argo-argo-workflows-server.argo:2746",
29+
"initContainerCpuLimit": "1",
30+
"initContainerMemoryLimit": "500Mi",
31+
"waitContainerCpuLimit": "1",
32+
"waitContainerMemoryLimit": "500Mi",
33+
"messageGeneratorContainerCpuLimit": "1",
34+
"messageGeneratorContainerMemoryLimit": "500Mi",
35+
"messageSenderContainerCpuLimit": "1",
36+
"messageSenderContainerMemoryLimit": "500Mi"
2937
},
3038
"argoExitHookSendMessageContainerImage": "ghcr.io/jandelgado/rabtap:latest"
3139
},

tests/UnitTests/TaskManager.Argo.Tests/ArgoPluginTest.cs

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ public class ArgoPluginTest
6060
private Workflow? _submittedArgoTemplate;
6161
private readonly int _argoTtlStatergySeconds = 360;
6262
private readonly int _minAgoTtlStatergySeconds = 30;
63+
private readonly string _initContainerCpuLimit = "100m";
64+
private readonly string _initContainerMemoryLimit = "200Mi";
65+
private readonly string _waitContainerCpuLimit = "200m";
66+
private readonly string _waitContainerMemoryLimit = "300Mi";
67+
private readonly string _messageGeneratorContainerCpuLimit = "300m";
68+
private readonly string _messageGeneratorContainerMemoryLimit = "400Mi";
69+
private readonly string _messageSenderContainerCpuLimit = "400m";
70+
private readonly string _messageSenderContainerMemoryLimit = "500Mi";
6371

6472
public ArgoPluginTest()
6573
{
@@ -80,8 +88,16 @@ public ArgoPluginTest()
8088
_options.Value.Messaging.PublisherSettings.Add("exchange", "exchange");
8189
_options.Value.Messaging.PublisherSettings.Add("virtualHost", "vhost");
8290
_options.Value.Messaging.Topics.TaskCallbackRequest = "md.tasks.callback";
83-
_options.Value.ArgoTtlStatergySeconds = _argoTtlStatergySeconds;
84-
_options.Value.MinArgoTtlStatergySeconds = _minAgoTtlStatergySeconds;
91+
_options.Value.ArgoTtlStrategySeconds = _argoTtlStatergySeconds;
92+
_options.Value.MinArgoTtlStrategySeconds = _minAgoTtlStatergySeconds;
93+
_options.Value.TaskManager.ArgoPluginArguments.InitContainerCpuLimit = _initContainerCpuLimit;
94+
_options.Value.TaskManager.ArgoPluginArguments.InitContainerMemoryLimit = _initContainerMemoryLimit;
95+
_options.Value.TaskManager.ArgoPluginArguments.WaitContainerCpuLimit = _waitContainerCpuLimit;
96+
_options.Value.TaskManager.ArgoPluginArguments.WaitContainerMemoryLimit = _waitContainerMemoryLimit;
97+
_options.Value.TaskManager.ArgoPluginArguments.MessageGeneratorContainerCpuLimit = _messageGeneratorContainerCpuLimit;
98+
_options.Value.TaskManager.ArgoPluginArguments.MessageGeneratorContainerMemoryLimit = _messageGeneratorContainerMemoryLimit;
99+
_options.Value.TaskManager.ArgoPluginArguments.MessageSenderContainerCpuLimit = _messageSenderContainerCpuLimit;
100+
_options.Value.TaskManager.ArgoPluginArguments.MessageSenderContainerMemoryLimit = _messageSenderContainerMemoryLimit;
85101

86102
_serviceScopeFactory.Setup(p => p.CreateScope()).Returns(_serviceScope.Object);
87103

@@ -630,8 +646,8 @@ public async Task ArgoPlugin_Copies_ImagePullSecrets()
630646
Assert.Equal(secret, _submittedArgoTemplate?.Spec.ImagePullSecrets.First());
631647
}
632648

633-
[Fact(DisplayName = "TTL gets added if not pressent")]
634-
public async Task ArgoPlugin_Ensures_TTL_Added_If_Not_pressent()
649+
[Fact(DisplayName = "TTL gets added if not present")]
650+
public async Task ArgoPlugin_Ensures_TTL_Added_If_Not_present()
635651
{
636652
var argoTemplate = LoadArgoTemplate("SimpleTemplate.yml");
637653
Assert.NotNull(argoTemplate);
@@ -647,6 +663,34 @@ public async Task ArgoPlugin_Ensures_TTL_Added_If_Not_pressent()
647663
Assert.Equal(_argoTtlStatergySeconds, _submittedArgoTemplate?.Spec.TtlStrategy?.SecondsAfterCompletion);
648664
}
649665

666+
[Fact(DisplayName = "Argo Plugin adds required resource limits")]
667+
public async Task ArgoPlugin_Adds_Container_Resource_Restrictions_Based_On_Configured_Values()
668+
{
669+
var argoTemplate = LoadArgoTemplate("SimpleTemplate.yml");
670+
Assert.NotNull(argoTemplate);
671+
672+
SetUpSimpleArgoWorkFlow(argoTemplate);
673+
674+
var message = GenerateTaskDispatchEventWithValidArguments();
675+
676+
var expectedPodSpecPatch = "{\"initContainers\":[{\"name\":\"init\",\"resources\":{\"limits\":{\"cpu\":\"" + _initContainerCpuLimit + "\",\"memory\": \"" +
677+
_initContainerMemoryLimit +
678+
"\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}],\"containers\":[{\"name\":\"wait\",\"resources\":{\"limits\":{\"cpu\":\"" +
679+
_waitContainerCpuLimit + "\",\"memory\":\"" + _waitContainerMemoryLimit +
680+
"\"},\"requests\":{\"cpu\":\"0\",\"memory\":\"0Mi\"}}}]}";
681+
682+
var runner = new ArgoPlugin(_serviceScopeFactory.Object, _logger.Object, _options, message);
683+
var result = await runner.ExecuteTask(CancellationToken.None).ConfigureAwait(false);
684+
685+
Assert.Equal(TaskExecutionStatus.Accepted, result.Status);
686+
Assert.Equal(_messageGeneratorContainerCpuLimit, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateGenerateTemplateName).Container.Resources.Limits["cpu"]);
687+
Assert.Equal(_messageGeneratorContainerMemoryLimit, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateGenerateTemplateName).Container.Resources.Limits["memory"]);
688+
Assert.Equal(expectedPodSpecPatch, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateGenerateTemplateName).PodSpecPatch);
689+
Assert.Equal(_messageSenderContainerCpuLimit, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateSendTemplateName).Container.Resources.Limits["cpu"]);
690+
Assert.Equal(_messageSenderContainerMemoryLimit, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateSendTemplateName).Container.Resources.Limits["memory"]);
691+
Assert.Equal(expectedPodSpecPatch, _submittedArgoTemplate?.Spec.Templates.FirstOrDefault(p => p.Name == Strings.ExitHookTemplateSendTemplateName).PodSpecPatch);
692+
}
693+
650694
[Theory(DisplayName = "TTL gets extended if too short")]
651695
[InlineData(31, 31, 29)]
652696
[InlineData(1, null, null)]
@@ -725,7 +769,7 @@ public async Task ArgoPlugin_Ensures_TTL_Remains(int? secondsAfterCompletion, in
725769
Assert.Equal(secondsAfterFailure, _submittedArgoTemplate?.Spec.TtlStrategy.SecondsAfterFailure);
726770
}
727771

728-
[Fact(DisplayName = "pocGC gets removed if pressent")]
772+
[Fact(DisplayName = "pocGC gets removed if present")]
729773
public async Task ArgoPlugin_Ensures_podGC_is_removed()
730774
{
731775
var argoTemplate = LoadArgoTemplate("SimpleTemplate.yml");

0 commit comments

Comments
 (0)