Skip to content

Commit f65eeca

Browse files
committed
Improve connection robustness
- Attempt reconnections from webrtc task when disconnections happen - Reconnections are handled with new task in LwsApiCalls.c - Workflow was broken with LwsApiCallsESP.c: We now handle it from webrtc_task itself - Do not reboot the device when using wifi-set CLI command
1 parent e4a847b commit f65eeca

File tree

4 files changed

+143
-41
lines changed

4 files changed

+143
-41
lines changed

esp_port/components/esp_webrtc_utils/src/wifi_cli.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,19 @@ static const char *TAG = "wifi_cli";
1414

1515
static int wifi_set_cli_handler(int argc, char *argv[])
1616
{
17+
1718
/* Just to go to the next line */
1819
printf("\n");
1920
if (argc != 3) {
2021
printf("%s: Incorrect arguments\n", TAG);
2122
return 0;
2223
}
2324

25+
/* Stop the Wi-Fi */
26+
if (esp_wifi_stop() != ESP_OK) {
27+
printf("%s: Failed to stop wifi\n", TAG);
28+
}
29+
2430
wifi_config_t wifi_cfg = {
2531
.sta = {
2632
/* Setting a password implies station will connect to all security modes including WEP/WPA.
@@ -44,9 +50,16 @@ static int wifi_set_cli_handler(int argc, char *argv[])
4450
return 0;
4551
}
4652

47-
ESP_LOGI(TAG, "Rebooting with new wi-fi configuration...");
48-
vTaskDelay(pdMS_TO_TICKS(2 * 1000));
49-
esp_restart(); // for now we simply re-start the device
53+
/* (Re)Start Wi-Fi */
54+
if (esp_wifi_start() != ESP_OK) {
55+
printf("%s: Failed to start wifi\n", TAG);
56+
}
57+
58+
/* Connect to AP */
59+
if (esp_wifi_connect() != ESP_OK) {
60+
printf("%s: Failed to connect wifi\n", TAG);
61+
}
62+
5063
return 0;
5164
}
5265

esp_port/components/kvs_webrtc/src/LwsApiCallsESP.c

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -323,20 +323,58 @@ static void esp_websocket_event_handler(void *handler_args, esp_event_base_t bas
323323

324324
case WEBSOCKET_EVENT_DISCONNECTED:
325325
ESP_LOGI(TAG, "WEBSOCKET_EVENT_DISCONNECTED");
326+
327+
BOOL connected = FALSE;
328+
STATUS retStatus = STATUS_SUCCESS;
329+
326330
MUTEX_LOCK(pEspWrapper->wsClientLock);
331+
connected = pEspWrapper->isConnected;
327332
pEspWrapper->isConnected = FALSE;
328333
MUTEX_UNLOCK(pEspWrapper->wsClientLock);
329334

335+
// Now DISCONNECT is the primary event for handling connection cleanup and reconnection
336+
BOOL wasConnected = connected;
330337
ATOMIC_STORE_BOOL(&pSignalingClient->connected, FALSE);
331-
if ((SERVICE_CALL_RESULT) ATOMIC_LOAD(&pSignalingClient->result) == SERVICE_CALL_RESULT_NOT_SET) {
332-
ATOMIC_STORE(&pSignalingClient->result, (SIZE_T) SERVICE_CALL_UNKNOWN);
333-
}
334338

335-
// Signal the condition variable to wake up the waiting thread
339+
// Signal condition variables to wake up waiting threads
336340
MUTEX_LOCK(pSignalingClient->connectedLock);
337341
CVAR_SIGNAL(pSignalingClient->connectedCvar);
338342
MUTEX_UNLOCK(pSignalingClient->connectedLock);
339343

344+
CVAR_BROADCAST(pSignalingClient->receiveCvar);
345+
CVAR_BROADCAST(pSignalingClient->sendCvar);
346+
ATOMIC_STORE(&pSignalingClient->messageResult, (SIZE_T) SERVICE_CALL_RESULT_OK);
347+
348+
// Handle reconnection logic
349+
if (wasConnected && ATOMIC_LOAD(&pSignalingClient->result) != SERVICE_CALL_RESULT_SIGNALING_RECONNECT_ICE &&
350+
!ATOMIC_LOAD_BOOL(&pSignalingClient->shutdown)) {
351+
352+
ESP_LOGI(TAG, "WebSocket disconnected from active connection, triggering reconnection via error callback");
353+
354+
// Set the result failed to indicate disconnection
355+
ATOMIC_STORE(&pSignalingClient->result, (SIZE_T) SERVICE_CALL_UNKNOWN);
356+
357+
// Use the existing error callback mechanism to trigger reconnection
358+
if (pSignalingClient->signalingClientCallbacks.errorReportFn != NULL) {
359+
CHAR errorMsg[] = "WebSocket connection lost, triggering reconnection";
360+
ESP_LOGI(TAG, "DEBUG: Calling errorReportFn with STATUS_SIGNALING_RECONNECT_FAILED=0x%08x", STATUS_SIGNALING_RECONNECT_FAILED);
361+
pSignalingClient->signalingClientCallbacks.errorReportFn(
362+
pSignalingClient->signalingClientCallbacks.customData,
363+
STATUS_SIGNALING_RECONNECT_FAILED,
364+
errorMsg,
365+
(UINT32) STRLEN(errorMsg));
366+
ESP_LOGI(TAG, "Signaled for reconnection via error callback");
367+
} else {
368+
ESP_LOGW(TAG, "No error callback registered, cannot signal for reconnection");
369+
}
370+
} else if (!wasConnected) {
371+
ESP_LOGI(TAG, "WebSocket disconnected but was not previously connected, not triggering reconnection");
372+
} else if (ATOMIC_LOAD_BOOL(&pSignalingClient->shutdown)) {
373+
ESP_LOGI(TAG, "WebSocket disconnected during shutdown, not triggering reconnection");
374+
} else {
375+
ESP_LOGI(TAG, "WebSocket disconnected during ICE reconnect, not triggering full reconnection");
376+
}
377+
340378
// Free the WebSocket buffer on disconnect
341379
freeWebSocketBuffer();
342380
break;
@@ -400,20 +438,19 @@ static void esp_websocket_event_handler(void *handler_args, esp_event_base_t bas
400438
}
401439
}
402440

441+
// For ERROR events, just log and wake up waiting threads
442+
// Let DISCONNECT event handle all state changes and reconnection logic
403443
if (pSignalingClient != NULL) {
404-
ATOMIC_STORE_BOOL(&pSignalingClient->connected, FALSE);
405-
// Set result if not already set
406-
if ((SERVICE_CALL_RESULT) ATOMIC_LOAD(&pSignalingClient->result) == SERVICE_CALL_RESULT_NOT_SET) {
407-
ATOMIC_STORE(&pSignalingClient->result, (SIZE_T) SERVICE_CALL_UNKNOWN);
408-
409-
// Signal the waiting thread about the error
410-
// The mutex must be locked before signaling
411-
MUTEX_LOCK(pSignalingClient->connectedLock);
412-
CVAR_SIGNAL(pSignalingClient->connectedCvar);
413-
MUTEX_UNLOCK(pSignalingClient->connectedLock);
414-
}
415-
// Reset the buffer on error
416-
resetWebSocketBuffer();
444+
ESP_LOGI(TAG, "WebSocket error occurred, waiting for disconnect event to handle cleanup and reconnection");
445+
446+
// Signal condition variables to wake up waiting threads
447+
MUTEX_LOCK(pSignalingClient->connectedLock);
448+
CVAR_SIGNAL(pSignalingClient->connectedCvar);
449+
MUTEX_UNLOCK(pSignalingClient->connectedLock);
450+
451+
CVAR_BROADCAST(pSignalingClient->receiveCvar);
452+
CVAR_BROADCAST(pSignalingClient->sendCvar);
453+
ATOMIC_STORE(&pSignalingClient->messageResult, (SIZE_T) SERVICE_CALL_RESULT_OK);
417454
} else {
418455
ESP_LOGW(TAG, "pSignalingClient is NULL in WEBSOCKET_EVENT_ERROR!");
419456
}
@@ -920,18 +957,16 @@ STATUS connectEspSignalingClient(PSignalingClient pSignalingClient)
920957
// Set ping interval to match SIGNALING_SERVICE_WSS_PING_PONG_INTERVAL_IN_SECONDS (10 seconds)
921958
ws_cfg.ping_interval_sec = 10;
922959

923-
// Enable auto reconnect to handle network issues gracefully
924-
ws_cfg.disable_auto_reconnect = FALSE;
960+
// Disable auto reconnect to let the state machine handle reconnection with fresh credentials
961+
// Auto-reconnect would fail after 5 minutes when the signed URL expires
962+
ws_cfg.disable_auto_reconnect = TRUE;
925963

926964
// Set reconnect timeout to match ESP-IDF recommendations
927965
ws_cfg.reconnect_timeout_ms = 10000; // 10 seconds
928966

929967
// Configure TCP keepalive to match SIGNALING_SERVICE_TCP_KEEPALIVE settings
930968
ws_cfg.network_timeout_ms = SIGNALING_SERVICE_TCP_KEEPALIVE_IN_SECONDS * 1000;
931969

932-
// Configure transport - since we're using wss:// in the URL, this will automatically be SSL
933-
// ws_cfg.transport = WEBSOCKET_TRANSPORT_OVER_SSL;
934-
935970
// For development, set skip_cert_common_name_check to true to skip cert verification
936971
// In production environments, this should be set to false
937972
#ifdef CONFIG_SKIP_COMMON_NAME_CHECK

esp_port/components/kvs_webrtc/src/app_webrtc.c

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -478,10 +478,17 @@ STATUS signalingClientError(UINT64 customData, STATUS status, PCHAR msg, UINT32
478478
SNPRINTF(errorMsg, SIZEOF(errorMsg), "Signaling error: 0x%08" PRIx32 " - %.*s", status, (int) msgLen, msg);
479479
raiseEvent(APP_WEBRTC_EVENT_SIGNALING_ERROR, status, NULL, errorMsg);
480480

481+
// DEBUG: Show status comparison
482+
DLOGI("DEBUG: signalingClientError received status=0x%08x, STATUS_SIGNALING_RECONNECT_FAILED=0x%08x, STATUS_SIGNALING_ICE_CONFIG_REFRESH_FAILED=0x%08x",
483+
status, STATUS_SIGNALING_RECONNECT_FAILED, STATUS_SIGNALING_ICE_CONFIG_REFRESH_FAILED);
484+
481485
// We will force re-create the signaling client on the following errors
482486
if (status == STATUS_SIGNALING_ICE_CONFIG_REFRESH_FAILED || status == STATUS_SIGNALING_RECONNECT_FAILED) {
487+
DLOGI("DEBUG: Status matches, setting recreateSignalingClient=TRUE");
483488
ATOMIC_STORE_BOOL(&pSampleConfiguration->recreateSignalingClient, TRUE);
484489
CVAR_BROADCAST(pSampleConfiguration->cvar);
490+
} else {
491+
DLOGI("DEBUG: Status does not match reconnect conditions, not setting recreateSignalingClient flag");
485492
}
486493

487494
return STATUS_SUCCESS;
@@ -1682,24 +1689,70 @@ STATUS sessionCleanupWait(PSampleConfiguration pSampleConfiguration, BOOL isSign
16821689
sessionFreed = FALSE;
16831690
}
16841691

1685-
// Check if we need to re-create the signaling client on-the-fly
1686-
if (ATOMIC_LOAD_BOOL(&pSampleConfiguration->recreateSignalingClient) &&
1692+
// Check if we need to re-create the signaling client on-the-fly with retry mechanism
1693+
BOOL needsRecreate = ATOMIC_LOAD_BOOL(&pSampleConfiguration->recreateSignalingClient);
1694+
if (needsRecreate) {
1695+
DLOGD("recreateSignalingClient flag is TRUE, checking conditions");
1696+
}
1697+
1698+
if (needsRecreate &&
16871699
gWebRtcAppConfig.pSignalingClientInterface != NULL &&
16881700
gSignalingClientData != NULL) {
16891701

1690-
DLOGI("Reconnecting signaling client");
1691-
1692-
// Disconnect and reconnect
1693-
CHK_STATUS(gWebRtcAppConfig.pSignalingClientInterface->disconnect(gSignalingClientData));
1694-
retStatus = gWebRtcAppConfig.pSignalingClientInterface->connect(gSignalingClientData);
1695-
1696-
if (STATUS_SUCCEEDED(retStatus)) {
1697-
// Re-set the variable again
1698-
ATOMIC_STORE_BOOL(&pSampleConfiguration->recreateSignalingClient, FALSE);
1702+
// Static variables to track retry attempts and timing
1703+
static UINT32 retryCount = 0;
1704+
static UINT64 lastRetryTime = 0;
1705+
UINT64 currentTime = GETTIME();
1706+
1707+
// Exponential backoff: 5s, 10s, 20s, 40s, 60s (max)
1708+
UINT64 retryDelays[] = {
1709+
5 * HUNDREDS_OF_NANOS_IN_A_SECOND, // 5 seconds
1710+
10 * HUNDREDS_OF_NANOS_IN_A_SECOND, // 10 seconds
1711+
20 * HUNDREDS_OF_NANOS_IN_A_SECOND, // 20 seconds
1712+
40 * HUNDREDS_OF_NANOS_IN_A_SECOND, // 40 seconds
1713+
60 * HUNDREDS_OF_NANOS_IN_A_SECOND // 60 seconds (max)
1714+
};
1715+
UINT32 maxRetryIndex = ARRAY_SIZE(retryDelays) - 1;
1716+
UINT32 currentRetryIndex = MIN(retryCount, maxRetryIndex);
1717+
UINT64 retryDelay = retryDelays[currentRetryIndex];
1718+
1719+
// Check if enough time has passed since last retry attempt
1720+
BOOL shouldRetry = (lastRetryTime == 0) || (currentTime - lastRetryTime >= retryDelay);
1721+
1722+
if (shouldRetry) {
1723+
DLOGI("Reconnecting signaling client (attempt %d, delay: %llu seconds)",
1724+
retryCount + 1, retryDelay / HUNDREDS_OF_NANOS_IN_A_SECOND);
1725+
1726+
// Disconnect and reconnect
1727+
CHK_STATUS(gWebRtcAppConfig.pSignalingClientInterface->disconnect(gSignalingClientData));
1728+
retStatus = gWebRtcAppConfig.pSignalingClientInterface->connect(gSignalingClientData);
1729+
1730+
if (STATUS_SUCCEEDED(retStatus)) {
1731+
// Reset retry tracking on successful reconnection
1732+
DLOGI("Signaling client reconnected successfully after %d attempts", retryCount + 1);
1733+
ATOMIC_STORE_BOOL(&pSampleConfiguration->recreateSignalingClient, FALSE);
1734+
retryCount = 0;
1735+
lastRetryTime = 0;
1736+
raiseEvent(APP_WEBRTC_EVENT_SIGNALING_CONNECTED, 0, NULL, "Signaling client reconnected");
1737+
} else {
1738+
// Update retry tracking on failure
1739+
retryCount++;
1740+
lastRetryTime = currentTime;
1741+
DLOGE("Failed to reconnect signaling client: 0x%08x (attempt %d, next retry in %llu seconds)",
1742+
retStatus, retryCount,
1743+
retryDelays[MIN(retryCount, maxRetryIndex)] / HUNDREDS_OF_NANOS_IN_A_SECOND);
1744+
1745+
// Keep the recreateSignalingClient flag set to continue retrying
1746+
// Don't reset it - we want to keep trying
1747+
1748+
// Reset status to avoid breaking the loop
1749+
retStatus = STATUS_SUCCESS;
1750+
}
16991751
} else {
1700-
DLOGE("Failed to reconnect signaling client: 0x%08x", retStatus);
1701-
// Reset status to avoid breaking the loop
1702-
retStatus = STATUS_SUCCESS;
1752+
// Not time to retry yet
1753+
UINT64 timeUntilNextRetry = retryDelay - (currentTime - lastRetryTime);
1754+
DLOGD("Waiting %llu more seconds before next reconnection attempt",
1755+
timeUntilNextRetry / HUNDREDS_OF_NANOS_IN_A_SECOND);
17031756
}
17041757
}
17051758

esp_port/components/kvs_webrtc/src/kvs_signaling.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,9 @@ static STATUS adapterErrorCallback(UINT64 customData, STATUS errorStatus, PCHAR
124124
return STATUS_SUCCESS;
125125
}
126126

127-
WEBRTC_STATUS webrtcStatus = convertStatusToWebrtcStatus(errorStatus);
128-
WEBRTC_STATUS result = pAdapterData->originalOnError(pAdapterData->originalCustomData, webrtcStatus, errorMessage, subErrorCode);
127+
// Preserve the original status code instead of converting to generic WEBRTC_STATUS_INTERNAL_ERROR
128+
// This ensures signaling reconnection status codes are passed through correctly
129+
WEBRTC_STATUS result = pAdapterData->originalOnError(pAdapterData->originalCustomData, (WEBRTC_STATUS)errorStatus, errorMessage, subErrorCode);
129130

130131
return (result == WEBRTC_STATUS_SUCCESS) ? STATUS_SUCCESS : STATUS_INTERNAL_ERROR;
131132
}

0 commit comments

Comments
 (0)