From 540e1703957def1583029b2d47cb0c2a7823a539 Mon Sep 17 00:00:00 2001 From: HKPA Date: Sat, 16 May 2026 01:39:16 +0200 Subject: [PATCH] feat: auto-restart browser on crash with watchdog (#14) - Add watchdog task to BrowserManager that periodically checks if running profiles are still alive by probing context.pages - If a profile crashes, automatically restart it with exponential backoff (1s, 2s, 4s, max 30s) and max 3 retries before giving up - Add auto_restart boolean field to profile model (default True) - Add auto_restart to database schema with migration for existing DBs - Start watchdog in lifespan startup in main.py - Explicitly stopped profiles are tracked to prevent unwanted restarts - cleanup_all() cancels watchdog task on shutdown - Fix lambda closure bug in context.on('close') callback - Add 12 tests: crash detection, restart, backoff, max retries, healthy profiles, auto_restart disabled, user-stopped profiles, deleted profiles --- backend/browser_manager.py | 126 +++++++++++++- backend/database.py | 10 +- backend/main.py | 1 + backend/models.py | 3 + backend/tests/test_watchdog.py | 296 +++++++++++++++++++++++++++++++++ 5 files changed, 429 insertions(+), 7 deletions(-) create mode 100644 backend/tests/test_watchdog.py diff --git a/backend/browser_manager.py b/backend/browser_manager.py index 327f91b5..c832be76 100644 --- a/backend/browser_manager.py +++ b/backend/browser_manager.py @@ -163,6 +163,10 @@ def __init__(self): self._lock = asyncio.Lock() self._next_cdp_port = BASE_CDP_PORT self._auto_launch_task: asyncio.Task | None = None + self._watchdog_task: asyncio.Task | None = None + self._restart_counts: dict[str, int] = {} # profile_id -> consecutive crash count + self._restart_times: dict[str, float] = {} # profile_id -> last restart timestamp + self._stopped_profiles: set[str] = set() # profiles explicitly stopped by user async def launch(self, profile: dict[str, Any]) -> RunningProfile: """Launch a browser instance for the given profile.""" @@ -265,8 +269,8 @@ async def launch(self, profile: dict[str, Any]) -> RunningProfile: ) # Auto-cleanup if browser crashes or user closes Chrome via VNC - context.on("close", lambda: asyncio.ensure_future( - self._on_browser_closed(profile_id) + context.on("close", lambda pid=profile_id: asyncio.ensure_future( + self._on_browser_closed(pid) )) async with self._lock: @@ -286,17 +290,24 @@ async def launch(self, profile: dict[str, Any]) -> RunningProfile: await self.vnc.stop_vnc(display) raise - async def _on_browser_closed(self, profile_id: str): - """Called when browser exits (crash, user closed via VNC, or stop()).""" + async def _on_browser_closed(self, profile_id: str) -> bool: + """Called when browser exits (crash, user closed via VNC, or stop()). + + Returns True if cleanup was performed (profile was still in self.running). + """ async with self._lock: running = self.running.pop(profile_id, None) if running: logger.info("Browser closed for profile %s, cleaning up", profile_id) await self.vnc.stop_vnc(running.display) + return True + return False async def stop(self, profile_id: str): """Stop a running browser instance.""" + # Mark as explicitly stopped so watchdog doesn't restart it + self._stopped_profiles.add(profile_id) # Pop before close so _on_browser_closed() finds nothing to clean up async with self._lock: running = self.running.pop(profile_id, None) @@ -327,6 +338,14 @@ def get_status(self, profile_id: str) -> dict[str, Any]: async def cleanup_all(self): """Stop all running profiles. Called on shutdown.""" + # Cancel watchdog before stopping profiles + if self._watchdog_task and not self._watchdog_task.done(): + self._watchdog_task.cancel() + try: + await self._watchdog_task + except asyncio.CancelledError: + pass + async with self._lock: profile_ids = list(self.running.keys()) @@ -376,6 +395,105 @@ def _allocate_cdp_port(self) -> int: continue raise ValueError("No free CDP ports available in range %d-%d" % (BASE_CDP_PORT, BASE_CDP_PORT + CDP_PORT_RANGE - 1)) + async def start_watchdog(self, interval: float = 5.0) -> None: + """Start the watchdog task that periodically checks for crashed profiles. + + Called once during application startup (from lifespan). + """ + if self._watchdog_task and not self._watchdog_task.done(): + return + self._watchdog_task = asyncio.create_task(self._watchdog_loop(interval)) + logger.info("Watchdog started (interval=%.1fs)", interval) + + async def _watchdog_loop(self, interval: float) -> None: + """Periodically check running profiles and restart crashed ones.""" + while True: + try: + await asyncio.sleep(interval) + await self._check_and_restart_crashed() + except asyncio.CancelledError: + logger.info("Watchdog cancelled") + raise + except Exception: + logger.exception("Watchdog error during health check") + + async def _check_and_restart_crashed(self) -> None: + """Check each running profile for liveness and restart if crashed.""" + from . import database as db + + # Snapshot running profile IDs + async with self._lock: + profile_ids = list(self.running.keys()) + + for profile_id in profile_ids: + running = self.running.get(profile_id) + if not running: + continue + + # Check if context is still alive by trying to get pages + alive = True + try: + pages = running.context.pages + if pages: + await pages[0].evaluate("1") + except Exception: + alive = False + + if alive: + continue + + # Profile has crashed — clean up and consider restart + logger.warning("Profile %s appears to have crashed", profile_id) + await self._on_browser_closed(profile_id) + await self._attempt_restart(profile_id, db) + + async def _attempt_restart(self, profile_id: str, db_module: Any) -> None: + """Attempt to restart a crashed profile with exponential backoff.""" + if profile_id in self._stopped_profiles: + logger.info("Profile %s was explicitly stopped, skipping restart", profile_id) + return + + profile = db_module.get_profile(profile_id) + if not profile: + logger.warning("Profile %s not found in database, skipping restart", profile_id) + return + + if not profile.get("auto_restart", True): + logger.info("Profile %s has auto_restart disabled, skipping", profile_id) + return + + count = self._restart_counts.get(profile_id, 0) + if count >= 3: + logger.error( + "Profile %s has crashed %d times, giving up on auto-restart", + profile_id, count, + ) + self._restart_counts.pop(profile_id, None) + self._restart_times.pop(profile_id, None) + return + + # Exponential backoff: 1s, 2s, 4s (capped at 30s) + delay = min(2 ** count, 30) + logger.info( + "Restarting profile %s (attempt %d/3, backoff=%ds)", + profile_id, count + 1, delay, + ) + await asyncio.sleep(delay) + + try: + await self.launch(profile) + # Success — reset counter + self._restart_counts.pop(profile_id, None) + self._restart_times.pop(profile_id, None) + logger.info("Successfully restarted profile %s", profile_id) + except Exception as exc: + self._restart_counts[profile_id] = count + 1 + self._restart_times[profile_id] = asyncio.get_event_loop().time() + logger.error( + "Failed to restart profile %s (attempt %d/3): %s", + profile_id, count + 1, exc, + ) + def _build_fingerprint_args(self, profile: dict[str, Any]) -> list[str]: """Build extra Chromium args from profile fingerprint settings.""" args: list[str] = [ diff --git a/backend/database.py b/backend/database.py index e4682564..3b9e0c04 100644 --- a/backend/database.py +++ b/backend/database.py @@ -78,6 +78,9 @@ def init_db(): if "auto_launch" not in cols: conn.execute("ALTER TABLE profiles ADD COLUMN auto_launch BOOLEAN DEFAULT 0") conn.commit() + if "auto_restart" not in cols: + conn.execute("ALTER TABLE profiles ADD COLUMN auto_restart BOOLEAN DEFAULT 1") + conn.commit() def _now() -> str: @@ -101,9 +104,9 @@ def create_profile( id, name, fingerprint_seed, proxy, timezone, locale, platform, user_agent, screen_width, screen_height, gpu_vendor, gpu_renderer, hardware_concurrency, humanize, human_preset, headless, geoip, - clipboard_sync, auto_launch, color_scheme, launch_args, notes, + clipboard_sync, auto_launch, auto_restart, color_scheme, launch_args, notes, user_data_dir, created_at, updated_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( profile_id, name, seed, fields.get("proxy"), @@ -122,6 +125,7 @@ def create_profile( fields.get("geoip", False), fields.get("clipboard_sync", True), fields.get("auto_launch", False), + fields.get("auto_restart", True), fields.get("color_scheme"), json.dumps(fields.get("launch_args") or []), fields.get("notes"), @@ -187,7 +191,7 @@ def update_profile(profile_id: str, **fields: Any) -> dict[str, Any] | None: "name", "fingerprint_seed", "proxy", "timezone", "locale", "platform", "user_agent", "screen_width", "screen_height", "gpu_vendor", "gpu_renderer", "hardware_concurrency", "humanize", "human_preset", "headless", "geoip", - "clipboard_sync", "auto_launch", "color_scheme", "launch_args", "notes", + "clipboard_sync", "auto_launch", "auto_restart", "color_scheme", "launch_args", "notes", ): if col in fields: update_cols.append(f"{col} = ?") diff --git a/backend/main.py b/backend/main.py index 92727a53..5bda1032 100644 --- a/backend/main.py +++ b/backend/main.py @@ -377,6 +377,7 @@ async def lifespan(app: FastAPI): db.init_db() await browser_mgr.cleanup_stale() browser_mgr._auto_launch_task = asyncio.create_task(browser_mgr.auto_launch_all()) + await browser_mgr.start_watchdog() logger.info("CloakBrowser Manager started") yield logger.info("Shutting down — stopping all browsers...") diff --git a/backend/models.py b/backend/models.py index e3ba3521..b0d131a5 100644 --- a/backend/models.py +++ b/backend/models.py @@ -26,6 +26,7 @@ class ProfileCreate(BaseModel): geoip: bool = False clipboard_sync: bool = True auto_launch: bool = False + auto_restart: bool = True color_scheme: Literal["light", "dark", "no-preference"] | None = None launch_args: list[str] = Field(default_factory=list) notes: str | None = None @@ -51,6 +52,7 @@ class ProfileUpdate(BaseModel): geoip: bool | None = None clipboard_sync: bool | None = None auto_launch: bool | None = None + auto_restart: bool | None = None color_scheme: Literal["light", "dark", "no-preference"] | None = Field(default=None) launch_args: list[str] | None = None notes: str | None = Field(default=None) @@ -87,6 +89,7 @@ class ProfileResponse(BaseModel): geoip: bool = False clipboard_sync: bool = True auto_launch: bool = False + auto_restart: bool = True @field_validator("clipboard_sync", mode="before") @classmethod diff --git a/backend/tests/test_watchdog.py b/backend/tests/test_watchdog.py new file mode 100644 index 00000000..9160b30c --- /dev/null +++ b/backend/tests/test_watchdog.py @@ -0,0 +1,296 @@ +"""Tests for the watchdog auto-restart feature (issue #14).""" + +from __future__ import annotations + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from backend.browser_manager import BrowserManager, RunningProfile + + +def _make_running_profile(profile_id: str = "p1") -> RunningProfile: + """Create a mock RunningProfile with a fake context.""" + context = MagicMock() + context.pages = [MagicMock()] + context.pages[0].evaluate = AsyncMock(return_value=1) + context.on = MagicMock() + return RunningProfile( + profile_id=profile_id, + context=context, + display=1, + ws_port=6080, + cdp_port=5100, + ) + + +def _make_profile_dict(profile_id: str = "p1", auto_restart: bool = True) -> dict: + """Create a minimal profile dict as returned by db.get_profile.""" + return { + "id": profile_id, + "name": f"Profile {profile_id}", + "fingerprint_seed": 12345, + "proxy": None, + "timezone": None, + "locale": None, + "platform": "windows", + "user_agent": None, + "screen_width": 1920, + "screen_height": 1080, + "gpu_vendor": None, + "gpu_renderer": None, + "hardware_concurrency": None, + "humanize": False, + "human_preset": "default", + "headless": False, + "geoip": False, + "clipboard_sync": True, + "auto_launch": False, + "auto_restart": auto_restart, + "color_scheme": None, + "launch_args": [], + "notes": None, + "user_data_dir": "/tmp/test_profile", + "tags": [], + } + + +@pytest.fixture() +def mgr() -> BrowserManager: + """Create a fresh BrowserManager for each test.""" + return BrowserManager() + + +# ── Watchdog start/stop ───────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_start_watchdog_creates_task(mgr: BrowserManager): + """start_watchdog should create a background task.""" + await mgr.start_watchdog(interval=0.1) + assert mgr._watchdog_task is not None + assert not mgr._watchdog_task.done() + # Cleanup + mgr._watchdog_task.cancel() + try: + await mgr._watchdog_task + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_start_watchdog_idempotent(mgr: BrowserManager): + """Calling start_watchdog twice should not create a second task.""" + await mgr.start_watchdog(interval=0.1) + task1 = mgr._watchdog_task + await mgr.start_watchdog(interval=0.1) + assert mgr._watchdog_task is task1 + # Cleanup + task1.cancel() + try: + await task1 + except asyncio.CancelledError: + pass + + +@pytest.mark.asyncio +async def test_cleanup_all_cancels_watchdog(mgr: BrowserManager): + """cleanup_all should cancel the watchdog task.""" + await mgr.start_watchdog(interval=0.1) + assert mgr._watchdog_task is not None + await mgr.cleanup_all() + assert mgr._watchdog_task.cancelled() or mgr._watchdog_task.done() + + +# ── Crash detection and restart ───────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_watchdog_restarts_crashed_profile(mgr: BrowserManager): + """When a profile's context raises on .pages, watchdog should restart it.""" + running = _make_running_profile("p1") + mgr.running["p1"] = running + + # Make context.pages raise to simulate a crash + type(running.context).pages = property(lambda self: (_ for _ in ()).throw(RuntimeError("browser crashed"))) + + profile_dict = _make_profile_dict("p1") + + with patch.object(mgr.vnc, "stop_vnc", new_callable=AsyncMock): + with patch.object(mgr.vnc, "allocate", new_callable=AsyncMock, return_value=(2, 6081)): + with patch.object(mgr.vnc, "start_vnc", new_callable=AsyncMock): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + new_context = MagicMock() + new_context.pages = [MagicMock()] + new_context.pages[0].evaluate = AsyncMock(return_value=1) + new_context.on = MagicMock() + new_context.add_init_script = AsyncMock() + mock_launch.return_value = new_context + + with patch("backend.database.get_profile", return_value=profile_dict): + # Run a single watchdog tick + await mgr._check_and_restart_crashed() + + # Profile should be running again + assert "p1" in mgr.running + # Restart count should be reset on success + assert "p1" not in mgr._restart_counts + + +@pytest.mark.asyncio +async def test_watchdog_no_restart_when_auto_restart_disabled(mgr: BrowserManager): + """If auto_restart=False, watchdog should not restart a crashed profile.""" + running = _make_running_profile("p1") + mgr.running["p1"] = running + + # Simulate crash + type(running.context).pages = property(lambda self: (_ for _ in ()).throw(RuntimeError("crashed"))) + + profile_dict = _make_profile_dict("p1", auto_restart=False) + + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + with patch.object(mgr.vnc, "stop_vnc", new_callable=AsyncMock): + with patch("backend.database.get_profile", return_value=profile_dict): + await mgr._check_and_restart_crashed() + + # Should not have tried to launch + mock_launch.assert_not_called() + # Profile should not be running + assert "p1" not in mgr.running + + +@pytest.mark.asyncio +async def test_watchdog_no_restart_when_stopped_by_user(mgr: BrowserManager): + """If user explicitly stopped a profile, watchdog should not restart it.""" + mgr._stopped_profiles.add("p1") + + profile_dict = _make_profile_dict("p1") + + with patch("backend.database.get_profile", return_value=profile_dict): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + + mock_launch.assert_not_called() + + +@pytest.mark.asyncio +async def test_watchdog_no_restart_when_profile_deleted(mgr: BrowserManager): + """If profile no longer exists in DB, watchdog should not restart it.""" + with patch("backend.database.get_profile", return_value=None): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + + mock_launch.assert_not_called() + + +# ── Exponential backoff ───────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_exponential_backoff_values(mgr: BrowserManager): + """Verify backoff delays: 1s, 2s, 4s.""" + profile_dict = _make_profile_dict("p1") + + with patch("backend.database.get_profile", return_value=profile_dict): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + mock_launch.side_effect = RuntimeError("launch fails") + + with patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + # Attempt 1 — backoff = min(2^0, 30) = 1 + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + mock_sleep.assert_awaited_with(1) + assert mgr._restart_counts["p1"] == 1 + + # Attempt 2 — backoff = min(2^1, 30) = 2 + mock_sleep.reset_mock() + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + mock_sleep.assert_awaited_with(2) + assert mgr._restart_counts["p1"] == 2 + + # Attempt 3 — backoff = min(2^2, 30) = 4 + mock_sleep.reset_mock() + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + mock_sleep.assert_awaited_with(4) + assert mgr._restart_counts["p1"] == 3 + + +@pytest.mark.asyncio +async def test_backoff_capped_at_30s(mgr: BrowserManager): + """Backoff should be capped at 30 seconds even with high restart count.""" + profile_dict = _make_profile_dict("p1") + mgr._restart_counts["p1"] = 10 # Simulate many prior failures + + with patch("backend.database.get_profile", return_value=profile_dict): + with patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + mock_launch.side_effect = RuntimeError("still failing") + + # count=3 triggers "give up" path, so reset to test capping + mgr._restart_counts["p1"] = 3 + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + # At count >= 3, it gives up without sleeping + mock_sleep.assert_not_awaited() + + +# ── Max retries ───────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_gives_up_after_max_retries(mgr: BrowserManager): + """After 3 failed restarts, watchdog should give up.""" + profile_dict = _make_profile_dict("p1") + mgr._restart_counts["p1"] = 3 + + with patch("backend.database.get_profile", return_value=profile_dict): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + await mgr._attempt_restart("p1", __import__("backend.database", fromlist=["database"])) + + mock_launch.assert_not_called() + # Counter should be cleaned up + assert "p1" not in mgr._restart_counts + + +@pytest.mark.asyncio +async def test_restart_counter_resets_on_success(mgr: BrowserManager): + """A successful restart should reset the crash counter for that profile.""" + running = _make_running_profile("p1") + mgr.running["p1"] = running + mgr._restart_counts["p1"] = 2 # 2 prior failures + + # Simulate crash + type(running.context).pages = property(lambda self: (_ for _ in ()).throw(RuntimeError("crashed"))) + + profile_dict = _make_profile_dict("p1") + + with patch.object(mgr.vnc, "stop_vnc", new_callable=AsyncMock): + with patch.object(mgr.vnc, "allocate", new_callable=AsyncMock, return_value=(2, 6081)): + with patch.object(mgr.vnc, "start_vnc", new_callable=AsyncMock): + with patch("backend.browser_manager.launch_persistent_context_async", new_callable=AsyncMock) as mock_launch: + new_context = MagicMock() + new_context.pages = [MagicMock()] + new_context.pages[0].evaluate = AsyncMock(return_value=1) + new_context.on = MagicMock() + new_context.add_init_script = AsyncMock() + mock_launch.return_value = new_context + + with patch("backend.database.get_profile", return_value=profile_dict): + await mgr._check_and_restart_crashed() + + assert "p1" not in mgr._restart_counts + + +@pytest.mark.asyncio +async def test_watchdog_ignores_healthy_profiles(mgr: BrowserManager): + """Watchdog should not touch profiles that are still alive.""" + running = _make_running_profile("p1") + mgr.running["p1"] = running + + # Profile is healthy — pages.evaluate returns 1 + with patch("backend.database.get_profile") as mock_get: + await mgr._check_and_restart_crashed() + + # Should not have queried the database (no restart needed) + mock_get.assert_not_called() + # Profile still running + assert "p1" in mgr.running