Skip to content
Merged
122 changes: 116 additions & 6 deletions src/TCPClientInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,24 @@ TCPClientInterface::TCPClientInterface(const char* name /*= "TCPClientInterface"
return false;
}

#ifdef ARDUINO
// The blocking connect() runs on its own task so it never stalls the main
// loop. read/write/frame stay on the main loop (see loop()).
// Seed _last_connect_attempt so the task's first reconnect-wait check passes
// immediately; otherwise the initial connect could be delayed up to
// RECONNECT_WAIT_MS. Unsigned wraparound keeps this correct when
// millis() < RECONNECT_WAIT_MS.
_last_connect_attempt = millis() - RECONNECT_WAIT_MS;
_task_running = true;
BaseType_t r = xTaskCreatePinnedToCore(tcp_task, "tcp", 6144, this, 1, &_task_handle, 0);
if (r != pdPASS) {
ERROR("TCPClientInterface: Failed to create connect task");
_task_running = false;
return false;
}
INFO("TCPClientInterface: connect worker running");
return true;
#else
// WiFi connection is handled externally (in main.cpp)
// Attempt initial connection
if (!connect()) {
Expand All @@ -55,16 +73,18 @@ TCPClientInterface::TCPClientInterface(const char* name /*= "TCPClientInterface"
}

return true;
#endif
}

bool TCPClientInterface::connect() {
TRACE("TCPClientInterface: Connecting to " + _target_host + ":" + std::to_string(_target_port));

#ifdef ARDUINO
// Set connection timeout
_client.setTimeout(CONNECT_TIMEOUT_MS);

if (!_client.connect(_target_host.c_str(), _target_port)) {
// 3-arg connect bounds the blocking time (the 2-arg form ignores it and can
// block ~18.5s on an unreachable host). Runs on tcp_task, off the main loop.
if (!_client.connect(_target_host.c_str(), _target_port, CONNECT_TIMEOUT_MS)) {
DEBUG("TCPClientInterface: Connection failed");
return false;
}
Expand All @@ -73,10 +93,8 @@ bool TCPClientInterface::connect() {
configure_socket();

INFO("TCPClientInterface: Connected to " + _target_host + ":" + std::to_string(_target_port));
_online = true;
_reconnected = true; // Signal that we (re)connected - main loop should announce
_last_data_received = millis(); // Reset stale timer
_frame_buffer.clear();
// task_loop() publishes the link state (_conn_state / _online / _reconnected)
// after this returns; nothing else is touched here.
return true;

#else
Expand Down Expand Up @@ -233,19 +251,105 @@ void TCPClientInterface::disconnect() {
}

void TCPClientInterface::handle_disconnect() {
#ifdef ARDUINO
// Called on the main loop while CONNECTED. Close the socket and hand it back
// to tcp_task (DISCONNECTED) for a fresh connect.
INFO("TCPClientInterface: Connection lost, will attempt reconnection");
disconnect(); // _client.stop(), _online=false, clear buffer
_last_connect_attempt = millis();
_conn_state.store(DISCONNECTED);
#else
if (_online) {
INFO("TCPClientInterface: Connection lost, will attempt reconnection");
disconnect();
// Reset connect attempt timer to enforce wait before reconnection
_last_connect_attempt = millis();
}
#endif
}

#ifdef ARDUINO
/*static*/ void TCPClientInterface::tcp_task(void* arg) {
auto* self = static_cast<TCPClientInterface*>(arg);
self->task_loop();
self->_task_done = true; // let stop() join before the object is freed
vTaskDelete(nullptr);
}

// Owns _client ONLY while connecting. When the link is down it runs the blocking
// connect() here (off the main loop); on success it publishes CONNECTED and the
// main loop takes over all socket I/O. It never touches _client while CONNECTED.
void TCPClientInterface::task_loop() {
while (_task_running) {
if (_conn_state.load() == DISCONNECTED) {
uint32_t now = millis();
if (now - _last_connect_attempt >= RECONNECT_WAIT_MS) {
_last_connect_attempt = now;
if (ESP.getMaxAllocHeap() >= 20000) { // skip under heap pressure
_conn_state.store(CONNECTING); // claim _client
if (connect()) {
_frame_buffer.clear();
_last_data_received = millis();
_online = true;
_reconnected.store(true); // main loop announces
_conn_state.store(CONNECTED); // hand _client to main loop
Comment thread
torlando-tech marked this conversation as resolved.
Comment thread
torlando-tech marked this conversation as resolved.
Outdated
} else {
_conn_state.store(DISCONNECTED);
}
}
}
}
vTaskDelay(pdMS_TO_TICKS(100));
}
}
Comment thread
torlando-tech marked this conversation as resolved.
#endif

/*virtual*/ void TCPClientInterface::stop() {
#ifdef ARDUINO
// Join the task: signal it, then wait until it has actually left task_loop()
// before tearing anything down. An in-flight connect() can overrun
// CONNECT_TIMEOUT_MS on a slow DNS server, and ~TCPClientInterface() calls
// stop() — returning early would risk a use-after-free on `this`.
_task_running = false;
if (_task_handle != nullptr) {
uint32_t deadline = millis() + CONNECT_TIMEOUT_MS + 2000;
while (!_task_done && (int32_t)(millis() - deadline) < 0) {
vTaskDelay(pdMS_TO_TICKS(20));
}
_task_handle = nullptr;
}
_conn_state.store(DISCONNECTED);
#endif
disconnect();
}
Comment thread
torlando-tech marked this conversation as resolved.

/*virtual*/ void TCPClientInterface::loop() {
#ifdef ARDUINO
// tcp_task owns _client while (re)connecting; the main loop only touches the
// socket once CONNECTED. read/write/frame all happen here (same low-latency
// path as before the task split). The legacy body below is unreachable on
// ARDUINO.
if (_conn_state.load() != CONNECTED) {
_online = false;
return;
}
_online = true;
// ESP32 WiFiClient.connected() can momentarily read false; only treat it as a
// drop when there is also no buffered data.
if (!_client.connected() && _client.available() == 0) {
handle_disconnect();
return;
}
if (_client.available() > 0) {
_last_data_received = millis();
while (_client.available() > 0) {
uint8_t byte = _client.read();
_frame_buffer.append(byte);
}
}
extract_and_process_frames();
return;
#endif
// Periodic status logging
static uint32_t last_status_log = 0;
static uint32_t loop_count = 0;
Expand Down Expand Up @@ -479,6 +583,12 @@ void TCPClientInterface::extract_and_process_frames() {
}

#ifdef ARDUINO
// Only write when CONNECTED — while (re)connecting, _client belongs to
// tcp_task. send_outgoing() runs on the main loop (same thread as loop()),
// so no lock is needed once CONNECTED.
if (_conn_state.load() != CONNECTED) {
return false; // not connected; Reticulum will retry/route
}
size_t written = _client.write(framed.data(), framed.size());
if (written != framed.size()) {
ERROR("TCPClientInterface: Write incomplete, " + std::to_string(written) +
Expand Down
39 changes: 36 additions & 3 deletions src/TCPClientInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#ifdef ARDUINO
#include <WiFi.h>
#include <WiFiClient.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <atomic>
#else
#include <netinet/in.h>
#endif
Expand Down Expand Up @@ -42,9 +45,10 @@ class TCPClientInterface : public RNS::InterfaceImpl {
static const uint32_t BITRATE_GUESS = 10 * 1000 * 1000; // 10 Mbps
static const uint32_t HW_MTU = 1064; // Match UDPInterface

// Reconnection parameters (match Python RNS)
static const uint32_t RECONNECT_WAIT_MS = 5000; // 5 seconds between reconnect attempts
static const uint32_t CONNECT_TIMEOUT_MS = 5000; // 5 second connection timeout
// Reconnection parameters. connect() runs on tcp_task (off the main loop),
// but still bound the timeout and back off so a dead host doesn't busy-retry.
static const uint32_t RECONNECT_WAIT_MS = 15000; // 15s between reconnect attempts
static const uint32_t CONNECT_TIMEOUT_MS = 3000; // 3s connect timeout (task-side)

// TCP keepalive parameters (match Python RNS)
static const int TCP_KEEPIDLE_SEC = 5;
Expand Down Expand Up @@ -78,6 +82,22 @@ class TCPClientInterface : public RNS::InterfaceImpl {
void configure_socket();
void handle_disconnect();

#ifdef ARDUINO
// Only the blocking connect()/DNS runs on tcp_task so it can't stall the
// main loop (which froze the UI). read/write/frame all stay on the main loop
// (low latency, unchanged data path). _conn_state hands _client ownership
// between the two so they never touch the socket concurrently:
// DISCONNECTED -> CONNECTING (task owns _client) -> CONNECTED (main loop
// owns _client); main loop flips CONNECTED -> DISCONNECTED on a drop.
enum ConnState : uint8_t { DISCONNECTED = 0, CONNECTING = 1, CONNECTED = 2 };
static void tcp_task(void* arg);
void task_loop();
TaskHandle_t _task_handle = nullptr;
volatile bool _task_running = false;
volatile bool _task_done = false; // task sets this right before exit; stop() joins on it
Comment thread
torlando-tech marked this conversation as resolved.
Outdated
std::atomic<uint8_t> _conn_state{DISCONNECTED};
#endif

// HDLC frame processing
void process_incoming();
void extract_and_process_frames();
Expand All @@ -88,19 +108,32 @@ class TCPClientInterface : public RNS::InterfaceImpl {

// Connection state
bool _initiator = true;
#ifdef ARDUINO
// Touched by both task_loop() (core 0) and handle_disconnect() (core 1).
std::atomic<uint32_t> _last_connect_attempt{0};
#else
uint32_t _last_connect_attempt = 0;
#endif
#ifdef ARDUINO
std::atomic<bool> _reconnected{false}; // re-established after offline (task-set)
#else
bool _reconnected = false; // Set when connection re-established after being offline
#endif
uint32_t _last_data_received = 0; // Track last data receipt for stale connection detection
static const uint32_t STALE_CONNECTION_MS = 120000; // Consider connection stale after 2 min no data

public:
// Check and clear reconnection flag (for announcing after reconnect)
bool check_reconnected() {
#ifdef ARDUINO
return _reconnected.exchange(false);
#else
if (_reconnected) {
_reconnected = false;
return true;
}
return false;
#endif
}

private:
Expand Down
8 changes: 5 additions & 3 deletions tests/hardware/tdeck_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,11 @@ def wait_for_pyxis_boot(t, timeout=90):
def wait_for_tcp_link(t, timeout=60):
log("HARNESS", "Waiting for TCP interface to connect to rnsd...")
line = t.wait_for_line(
lambda L: "TCPClientInterface" in L and (
"connected" in L.lower() or "Connected" in L or "started" in L.lower()
),
# Match the actual connection ("Connected to ..."), not interface/worker
# startup. Matching "started" let the harness proceed (and drive the
# announce) before the link was up, so the announce was lost and the
# first direct message failed.
lambda L: "TCPClientInterface" in L and "connected to" in L.lower(),
timeout,
)
if line:
Expand Down
Loading