Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion state-manager/app/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class Settings(BaseModel):
state_manager_secret: str = Field(..., description="Secret key for API authentication")
secrets_encryption_key: str = Field(..., description="Key for encrypting secrets")
trigger_workers: int = Field(default=1, description="Number of workers to run the trigger cron")
node_timeout_minutes: int = Field(default=30, gt=0, description="Timeout in minutes for nodes stuck in QUEUED status")
Comment thread
agam1092005 marked this conversation as resolved.
Outdated

@classmethod
def from_env(cls) -> "Settings":
Expand All @@ -21,7 +22,8 @@ def from_env(cls) -> "Settings":
mongo_database_name=os.getenv("MONGO_DATABASE_NAME", "exosphere-state-manager"), # type: ignore
state_manager_secret=os.getenv("STATE_MANAGER_SECRET"), # type: ignore
secrets_encryption_key=os.getenv("SECRETS_ENCRYPTION_KEY"), # type: ignore
trigger_workers=int(os.getenv("TRIGGER_WORKERS", 1)) # type: ignore
trigger_workers=int(os.getenv("TRIGGER_WORKERS", 1)), # type: ignore
node_timeout_minutes=os.getenv("NODE_TIMEOUT_MINUTES", "30") # type: ignore
Comment thread
agam1092005 marked this conversation as resolved.
Outdated
)


Expand Down
8 changes: 6 additions & 2 deletions state-manager/app/controller/enqueue_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,21 @@


async def find_state(namespace_name: str, nodes: list[str]) -> State | None:
current_time_ms = int(time.time() * 1000)
data = await State.get_pymongo_collection().find_one_and_update(
{
"namespace_name": namespace_name,
"status": StateStatusEnum.CREATED,
"node_name": {
"$in": nodes
},
"enqueue_after": {"$lte": int(time.time() * 1000)}
"enqueue_after": {"$lte": current_time_ms}
},
{
"$set": {"status": StateStatusEnum.QUEUED}
"$set": {
"status": StateStatusEnum.QUEUED,
"queued_at": current_time_ms
}
},
return_document=ReturnDocument.AFTER
)
Comment on lines 20 to 52
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The query seems to be correct; however, how will we re-pick timed-out states?

Expand Down
10 changes: 10 additions & 0 deletions state-manager/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from .tasks.trigger_cron import trigger_cron
from .tasks.check_node_timeout import check_node_timeout

# Define models list
DOCUMENT_MODELS = [State, GraphTemplate, RegisteredNode, Store, Run, DatabaseTriggers]
Expand Down Expand Up @@ -76,6 +77,15 @@ async def lifespan(app: FastAPI):
max_instances=1,
id="every_minute_task"
)
scheduler.add_job(
check_node_timeout,
CronTrigger.from_crontab("* * * * *"),
replace_existing=True,
misfire_grace_time=60,
coalesce=True,
max_instances=1,
id="check_node_timeout_task"
)
Comment on lines +87 to +95
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not needed with db queries

scheduler.start()

# main logic of the server
Expand Down
8 changes: 8 additions & 0 deletions state-manager/app/models/db/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class State(BaseDatabaseModel):
retry_count: int = Field(default=0, description="Number of times the state has been retried")
fanout_id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Fanout ID of the state")
manual_retry_fanout_id: str = Field(default="", description="Fanout ID from a manual retry request, ensuring unique retries for unite nodes.")
queued_at: Optional[int] = Field(None, description="Unix time in milliseconds when the state was queued")

@before_event([Insert, Replace, Save])
def _generate_fingerprint(self):
Expand Down Expand Up @@ -102,5 +103,12 @@ class Settings:
("status", 1),
],
name="run_id_status_index"
),
IndexModel(
[
("status", 1),
("queued_at", 1),
],
name="timeout_query_index"
)
Comment thread
agam1092005 marked this conversation as resolved.
]
1 change: 1 addition & 0 deletions state-manager/app/models/state_status_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class StateStatusEnum(str, Enum):
# Errored
ERRORED = 'ERRORED'
NEXT_CREATED_ERROR = 'NEXT_CREATED_ERROR'
TIMEDOUT = 'TIMEDOUT'

# Success
SUCCESS = 'SUCCESS'
Expand Down
36 changes: 36 additions & 0 deletions state-manager/app/tasks/check_node_timeout.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While this model of periodic jobs will work, it's unnecessary, as we can write a database query to figure out timed-out nodes. We probably do not need to set a TIMEDOUT status at all: if the status is QUEUED and current_time > timeout_at, we can infer the timeout directly.

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import time
from app.models.db.state import State
from app.models.state_status_enum import StateStatusEnum
from app.singletons.logs_manager import LogsManager
from app.config.settings import get_settings

logger = LogsManager().get_logger()


async def check_node_timeout():
    """Scheduled task: mark long-stuck QUEUED states as TIMEDOUT.

    Computes a cutoff timestamp (now minus the configured
    ``node_timeout_minutes``) and, in a single atomic ``update_many``,
    flips every state that has been QUEUED since before that cutoff to
    TIMEDOUT with an explanatory error message.

    Never raises: any failure is logged and swallowed so the scheduler
    keeps running subsequent ticks.
    """
    try:
        settings = get_settings()
        # Cutoff in epoch milliseconds: anything queued at or before this
        # instant has exceeded the configured timeout window.
        timeout_threshold = int(time.time() * 1000) - settings.node_timeout_minutes * 60 * 1000

        logger.info(f"Checking for timed out nodes with threshold: {timeout_threshold}")

        # "$ne": None guards against documents where queued_at was
        # explicitly stored as null; "$lte" alone only matches numerics.
        outcome = await State.get_pymongo_collection().update_many(
            {
                "status": StateStatusEnum.QUEUED,
                "queued_at": {"$ne": None, "$lte": timeout_threshold},
            },
            {
                "$set": {
                    "status": StateStatusEnum.TIMEDOUT,
                    "error": f"Node execution timed out after {settings.node_timeout_minutes} minutes"
                }
            },
        )

        if outcome.modified_count:
            logger.info(f"Marked {outcome.modified_count} states as TIMEDOUT")

    except Exception:
        # Boundary handler for a cron job: log with traceback, do not crash.
        logger.error("Error checking node timeout", exc_info=True)
114 changes: 114 additions & 0 deletions state-manager/tests/unit/tasks/test_check_node_timeout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import pytest
import time
from unittest.mock import AsyncMock, MagicMock, patch
from app.models.state_status_enum import StateStatusEnum


class TestCheckNodeTimeout:
    """Unit tests for app.tasks.check_node_timeout.check_node_timeout.

    Each test patches the module-level collaborators (State, get_settings,
    and where needed logger/time) BEFORE importing the task function, so the
    task resolves the mocked names at call time. The Mongo collection is a
    MagicMock whose async ``update_many`` is an AsyncMock, letting the tests
    inspect the exact filter/update documents the task submits.
    """

    @pytest.mark.asyncio
    async def test_check_node_timeout_marks_timed_out_states(self):
        """The task issues one update_many whose filter targets QUEUED states
        with a bounded queued_at, and whose update sets TIMEDOUT + error."""
        mock_collection = MagicMock()
        mock_result = MagicMock()
        mock_result.modified_count = 3
        mock_collection.update_many = AsyncMock(return_value=mock_result)

        with patch('app.tasks.check_node_timeout.State') as mock_state, \
             patch('app.tasks.check_node_timeout.get_settings') as mock_get_settings:

            # Import inside the patch context so the task binds to the mocks.
            from app.tasks.check_node_timeout import check_node_timeout

            mock_settings = MagicMock()
            mock_settings.node_timeout_minutes = 30
            mock_get_settings.return_value = mock_settings

            mock_state.get_pymongo_collection.return_value = mock_collection

            await check_node_timeout()

            mock_collection.update_many.assert_called_once()
            call_args = mock_collection.update_many.call_args

            # Positional args: (filter_document, update_document).
            query = call_args[0][0]
            update = call_args[0][1]

            assert query["status"] == StateStatusEnum.QUEUED
            assert "$ne" in query["queued_at"]
            assert "$lte" in query["queued_at"]

            assert update["$set"]["status"] == StateStatusEnum.TIMEDOUT
            assert "timed out after 30 minutes" in update["$set"]["error"]

    @pytest.mark.asyncio
    async def test_check_node_timeout_no_timed_out_states(self):
        """With zero modified documents the task still runs exactly one
        update_many and completes without raising."""
        mock_collection = MagicMock()
        mock_result = MagicMock()
        mock_result.modified_count = 0
        mock_collection.update_many = AsyncMock(return_value=mock_result)

        with patch('app.tasks.check_node_timeout.State') as mock_state, \
             patch('app.tasks.check_node_timeout.get_settings') as mock_get_settings:

            from app.tasks.check_node_timeout import check_node_timeout

            mock_settings = MagicMock()
            mock_settings.node_timeout_minutes = 30
            mock_get_settings.return_value = mock_settings

            mock_state.get_pymongo_collection.return_value = mock_collection

            await check_node_timeout()

            mock_collection.update_many.assert_called_once()

    @pytest.mark.asyncio
    async def test_check_node_timeout_handles_exception(self):
        """A database error must be swallowed and logged via logger.error,
        never propagated to the scheduler."""
        mock_collection = MagicMock()
        mock_collection.update_many = AsyncMock(side_effect=Exception("Database error"))

        with patch('app.tasks.check_node_timeout.State') as mock_state, \
             patch('app.tasks.check_node_timeout.get_settings') as mock_get_settings, \
             patch('app.tasks.check_node_timeout.logger') as mock_logger:

            from app.tasks.check_node_timeout import check_node_timeout

            mock_settings = MagicMock()
            mock_settings.node_timeout_minutes = 30
            mock_get_settings.return_value = mock_settings

            mock_state.get_pymongo_collection.return_value = mock_collection

            # Must not raise despite the AsyncMock side effect.
            await check_node_timeout()

            mock_logger.error.assert_called_once()
            error_message = mock_logger.error.call_args[0][0]
            assert "Error checking node timeout" in error_message

    @pytest.mark.asyncio
    async def test_check_node_timeout_calculates_correct_threshold(self):
        """The queued_at $lte bound equals now_ms minus the configured
        timeout converted to milliseconds (time is patched for determinism)."""
        mock_collection = MagicMock()
        mock_result = MagicMock()
        mock_result.modified_count = 0
        mock_collection.update_many = AsyncMock(return_value=mock_result)

        with patch('app.tasks.check_node_timeout.State') as mock_state, \
             patch('app.tasks.check_node_timeout.get_settings') as mock_get_settings, \
             patch('app.tasks.check_node_timeout.time') as mock_time:

            from app.tasks.check_node_timeout import check_node_timeout

            # Fixed epoch seconds; the task converts to milliseconds.
            mock_time.time.return_value = 1700000000

            mock_settings = MagicMock()
            mock_settings.node_timeout_minutes = 45
            mock_get_settings.return_value = mock_settings

            mock_state.get_pymongo_collection.return_value = mock_collection

            await check_node_timeout()

            call_args = mock_collection.update_many.call_args
            query = call_args[0][0]

            # 45 minutes expressed in ms, subtracted from the fixed "now".
            expected_threshold = (1700000000 * 1000) - (45 * 60 * 1000)
            assert query["queued_at"]["$lte"] == expected_threshold
Loading