From ab803cd50ef8655020a2ce79c3a4a14247ce75cc Mon Sep 17 00:00:00 2001 From: Geoffrey Yu Date: Wed, 15 Nov 2023 22:37:05 -0500 Subject: [PATCH] Transition resilience improvements - Wait until the deletion is reflected in the metadata - Refresh the directory as well if there is a connection failure --- src/brad/front_end/front_end.py | 4 ++++ src/brad/provisioning/rds.py | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/brad/front_end/front_end.py b/src/brad/front_end/front_end.py index c8e9abce..a188746b 100644 --- a/src/brad/front_end/front_end.py +++ b/src/brad/front_end/front_end.py @@ -263,6 +263,10 @@ async def start_session(self) -> SessionId: ) raise await asyncio.sleep(time_to_wait) + # Defensively refresh the blueprint and directory before + # retrying. Maybe we are getting outdated endpoint information + # from AWS. + await self._blueprint_mgr.load() async def end_session(self, session_id: SessionId) -> None: await self._sessions.end_session(session_id) diff --git a/src/brad/provisioning/rds.py b/src/brad/provisioning/rds.py index 1300a375..305d0fa5 100644 --- a/src/brad/provisioning/rds.py +++ b/src/brad/provisioning/rds.py @@ -83,7 +83,9 @@ def do_create_replica(): await asyncio.sleep(20) await self.wait_until_instance_is_available(instance_id) - async def delete_replica(self, instance_id: str) -> None: + async def delete_replica( + self, instance_id: str, wait_until_status_updated: bool = True + ) -> None: def do_delete(): self._rds.delete_db_instance( DBInstanceIdentifier=instance_id, @@ -94,6 +96,11 @@ def do_delete(): loop = asyncio.get_running_loop() await loop.run_in_executor(None, do_delete) + # Will poll until the instance's status is no longer "available". + if wait_until_status_updated: + await asyncio.sleep(10) + await self.wait_until_instance_is_not_available(instance_id) + async def wait_until_instance_is_available( self, instance_id: str, polling_interval: float = 20 ) -> None: @@ -108,6 +115,20 @@ async def wait_until_instance_is_available( ) await asyncio.sleep(polling_interval) + async def wait_until_instance_is_not_available( + self, instance_id: str, polling_interval: float = 20 + ) -> None: + while True: + response = await self._describe_db_instance(instance_id) + instance = response["DBInstances"][0] + status = instance["DBInstanceStatus"] + if status != "available": + break + logger.debug( + "Waiting for Aurora instance %s to be NOT available...", instance_id + ) + await asyncio.sleep(polling_interval) + async def wait_until_cluster_is_available( self, cluster_id: str, polling_interval: float = 20 ) -> None: