2727from remote_command_executor import RemoteCommandExecutor
2828from retrying import retry
2929from time_utils import seconds
30- from utils import find_stack_by_tag , generate_stack_name , is_directory_supported , random_alphanumeric
30+
31+ from utils import find_stack_by_tag , generate_stack_name , is_directory_supported , random_alphanumeric , get_quarantined_stacks , is_quarantined_stack , quarantine_stacks
3132
3233from tests .ad_integration .cluster_user import ClusterUser
3334from tests .common .utils import run_system_analyzer
35+ from constants import DO_NOT_DELETE_TAG_KEY , MAX_QUARANTINED_STACKS
3436
# NOTE(review): presumably the number of directory users provisioned for the tests;
# the code consuming this constant is outside this view - confirm.
3537NUM_USERS_TO_CREATE = 5
# NOTE(review): presumably how many of the created users are exercised by the tests;
# the code consuming this constant is outside this view - confirm.
3638NUM_USERS_TO_TEST = 3
3739
# Common name prefix of the directory (AD) stacks: it is combined with the directory
# type to build the stack name and is used to look up existing/quarantined stacks.
40+ AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'
41+
3842
3943def get_infra_stack_outputs (stack_name ):
4044 cfn = boto3 .client ("cloudformation" )
@@ -117,7 +121,7 @@ def add_tag_to_stack(stack_name, key, value):
117121 stack = cfn .Stack (stack_name )
118122 add_tag = True
119123 for tag in stack .tags :
120- if tag .get ("Key" ) == "DO-NOT-DELETE" :
124+ if tag .get ("Key" ) == DO_NOT_DELETE_TAG_KEY :
121125 add_tag = False
122126 break
123127 if add_tag :
@@ -127,6 +131,7 @@ def add_tag_to_stack(stack_name, key, value):
127131 Tags = [
128132 {"Key" : key , "Value" : value },
129133 ],
134+ DisableRollback = True ,
130135 )
131136
132137
@@ -189,7 +194,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair):
189194
190195def _create_directory_stack (cfn_stacks_factory , request , directory_type , region , vpc_stack : CfnVpcStack ):
191196 directory_stack_name = generate_stack_name (
192- f"integ-tests-MultiUserInfraStack { directory_type } " , request .config .getoption ("stackname_suffix" )
197+ f"{ AD_STACK_PREFIX } { directory_type } " , request .config .getoption ("stackname_suffix" )
193198 )
194199
195200 if directory_type not in ("MicrosoftAD" , "SimpleAD" ):
@@ -203,7 +208,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
203208 stack_parameters = _get_stack_parameters (directory_type , vpc_stack , request .config .getoption ("key_name" ))
204209 tags = [{"Key" : "parallelcluster:integ-tests-ad-stack" , "Value" : directory_type }]
205210 if request .config .getoption ("retain_ad_stack" ):
206- tags .append ({"Key" : "DO-NOT-DELETE" , "Value" : "Retained for integration testing" })
211+ tags .append ({"Key" : DO_NOT_DELETE_TAG_KEY , "Value" : "Retained for integration testing" })
207212
208213 directory_stack = CfnStack (
209214 name = directory_stack_name ,
@@ -212,12 +217,33 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
212217 parameters = stack_parameters ,
213218 capabilities = ["CAPABILITY_IAM" , "CAPABILITY_NAMED_IAM" , "CAPABILITY_AUTO_EXPAND" ],
214219 tags = tags ,
220+ disable_rollback = True ,
215221 )
216- cfn_stacks_factory .create_stack (directory_stack )
222+ try :
223+ cfn_stacks_factory .create_stack (directory_stack , stack_is_under_test = True )
224+ except BaseException as e :
225+ logging .error ("Failed to create stack %s" , directory_stack_name )
226+ # We want to retain the stack in case of failure in order to debug it.
227+ # We retain only a limited number of stacks to contain costs.
228+ n_quarantined_ad_stacks = len (get_quarantined_stacks (region , prefix = AD_STACK_PREFIX ))
229+ if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS :
230+ logging .warn ("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)" ,
231+ directory_stack_name , n_quarantined_ad_stacks , MAX_QUARANTINED_STACKS )
232+ quarantine_stacks (region , stack_names = [directory_stack_name ])
233+ else :
234+ logging .warn ("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)" ,
235+ directory_stack_name , n_quarantined_ad_stacks , MAX_QUARANTINED_STACKS )
236+ raise e
217237 logging .info ("Creation of stack %s complete" , directory_stack_name )
218238
219239 return directory_stack
220240
def get_retained_ad_stacks_count(region=None):
    """
    Count the AD stacks that failed to create and are retained via the do-not-delete tag.

    A stack is counted when its status is CREATE_FAILED, its name contains
    AD_STACK_PREFIX and it carries a tag whose key is DO_NOT_DELETE_TAG_KEY.

    :param region: AWS region to inspect; when None the default boto3 region resolution applies.
    :return: the number of matching stacks.
    """
    cfn = boto3.client("cloudformation", region_name=region)
    retained = 0
    # describe_stacks is used instead of list_stacks because the stack summaries returned
    # by list_stacks carry no tags, which made the tag filter never match.
    # The paginator makes sure stacks beyond the first result page are inspected as well.
    for page in cfn.get_paginator("describe_stacks").paginate():
        for stack in page.get("Stacks", []):
            if stack.get("StackStatus") != "CREATE_FAILED":
                continue
            if AD_STACK_PREFIX not in stack.get("StackName", ""):
                continue
            if any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in stack.get("Tags") or []):
                retained += 1
    return retained
221247
222248@retry (wait_fixed = seconds (20 ), stop_max_delay = seconds (700 ))
223249def _check_ssm_success (ssm_client , command_id , instance_id ):
@@ -243,10 +269,10 @@ def _directory_factory(
243269 directory_stack_name = created_directory_stacks .get (region , {}).get ("directory" )
244270 logging .info ("Using directory stack named %s created by another test" , directory_stack_name )
245271 else :
246- stack_prefix = f"integ-tests-MultiUserInfraStack { directory_type } "
272+ stack_prefix = f"{ AD_STACK_PREFIX } { directory_type } "
247273 directory_stack_name = find_stack_by_tag ("parallelcluster:integ-tests-ad-stack" , region , stack_prefix )
248274
249- if not directory_stack_name :
275+ if not directory_stack_name or is_quarantined_stack ( region , directory_stack_name ) :
250276 directory_stack = _create_directory_stack (
251277 cfn_stacks_factory ,
252278 request ,
@@ -257,7 +283,7 @@ def _directory_factory(
257283 directory_stack_name = directory_stack .name
258284 created_directory_stacks [region ]["directory" ] = directory_stack_name
259285 if request .config .getoption ("retain_ad_stack" ):
260- add_tag_to_stack (vpc_stack .name , "DO-NOT-DELETE" , "Retained for integration testing" )
286+ add_tag_to_stack (vpc_stack .name , DO_NOT_DELETE_TAG_KEY , "Retained for integration testing" )
261287 return directory_stack_name
262288
263289 yield _directory_factory
0 commit comments