Skip to content

Commit 91aa7f9

Browse files
authored
fix: resolve cross-compilation GLIBC issues and create deployment script (#1675)
1 parent 775a70b commit 91aa7f9

29 files changed

+1330
-27
lines changed

Cross.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[target.aarch64-unknown-linux-gnu]
2+
# Use a newer cross image with updated GLIBC
3+
image = "ghcr.io/cross-rs/aarch64-unknown-linux-gnu:edge"
4+
5+
[target.x86_64-unknown-linux-gnu]
6+
image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:edge"
9.11 MB
Binary file not shown.
25.3 MB
Binary file not shown.
10.2 MB
Binary file not shown.
29.5 MB
Binary file not shown.
9.25 MB
Binary file not shown.
25.6 MB
Binary file not shown.
10.5 MB
Binary file not shown.
30.1 MB
Binary file not shown.

crates/core/build.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
use std::process::Command;
22

33
fn main() {
4+
// Skip flatbuffers generation for cross-compilation
5+
if std::env::var("CARGO_BUILD_TARGET").is_ok() {
6+
return;
7+
}
8+
49
let status = Command::new("flatc")
510
.arg("--rust")
611
.arg("-o")
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Channel Overflow Analysis
2+
3+
## Problem
4+
- Dropping 2251 packets in 10 seconds (225 packets/second)
5+
- Channels in `peer_connection.rs` have buffer size of only 1
6+
- Channels in `connection_handler.rs` use buffer size of 100
7+
8+
## Root Cause
9+
In `peer_connection.rs` lines 261-262:
10+
```rust
11+
let (outbound_packets, outbound_packets_recv) = mpsc::channel(1);
12+
let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(1);
13+
```
14+
15+
Buffer size of 1 means:
16+
- Only 1 packet can be queued
17+
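- Any additional packet is dropped if the receiver hasn't processed the previous one yet, because the sender uses the non-blocking `try_send` (see Evidence below) rather than awaiting free capacity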
18+
- With UDP's bursty nature, this causes massive packet loss
19+
20+
## Evidence
21+
From connection_handler.rs:317-322:
22+
```rust
23+
match remote_conn.inbound_packet_sender.try_send(packet_data) {
24+
Ok(_) => { /* success */ }
25+
Err(mpsc::error::TrySendError::Full(_)) => {
26+
// Channel full - this is happening 225 times/second!
27+
}
28+
}
29+
```
30+
31+
## Solution
32+
Increase channel buffer size from 1 to 100 to match other channels in the codebase.
33+
34+
## Impact
35+
This should significantly reduce packet drops and improve connection stability.

crates/core/src/transport/connection_handler.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,20 @@ pub(crate) async fn create_connection_handler<S: Socket>(
6767
bandwidth_limit: Option<usize>,
6868
) -> Result<(OutboundConnectionHandler, InboundConnectionHandler), TransportError> {
6969
// Bind the UDP socket to the specified port
70-
let socket = S::bind((listen_host, listen_port).into()).await?;
70+
let bind_addr: SocketAddr = (listen_host, listen_port).into();
71+
tracing::info!(
72+
target: "freenet_core::transport::send_debug",
73+
%bind_addr,
74+
is_gateway,
75+
"Binding UDP socket"
76+
);
77+
let socket = S::bind(bind_addr).await?;
78+
tracing::info!(
79+
target: "freenet_core::transport::send_debug",
80+
%bind_addr,
81+
is_gateway,
82+
"UDP socket bound successfully"
83+
);
7184
let (och, new_connection_notifier) = OutboundConnectionHandler::config_listener(
7285
Arc::new(socket),
7386
keypair,

crates/core/src/transport/peer_connection.rs

Lines changed: 118 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -147,16 +147,37 @@ impl PeerConnection {
147147
let last_packet_id = remote_conn.last_packet_id.clone();
148148

149149
let keep_alive_handle = tokio::spawn(async move {
150+
tracing::info!(
151+
target: "freenet_core::transport::keepalive_lifecycle",
152+
remote = ?remote_addr,
153+
"Keep-alive task STARTED for connection"
154+
);
155+
150156
let mut interval = tokio::time::interval(KEEP_ALIVE_INTERVAL);
151157
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
152158

153159
// Skip the first immediate tick
154160
interval.tick().await;
155161

162+
let mut tick_count = 0u64;
163+
let task_start = std::time::Instant::now();
164+
156165
loop {
166+
let tick_start = std::time::Instant::now();
157167
interval.tick().await;
168+
tick_count += 1;
169+
170+
let elapsed_since_start = task_start.elapsed();
171+
let elapsed_since_last_tick = tick_start.elapsed();
158172

159-
tracing::trace!(remote = ?remote_addr, "Keep-alive timer tick - sending NoOp");
173+
tracing::info!(
174+
target: "freenet_core::transport::keepalive_lifecycle",
175+
remote = ?remote_addr,
176+
tick_count,
177+
elapsed_since_start_secs = elapsed_since_start.as_secs_f64(),
178+
tick_interval_ms = elapsed_since_last_tick.as_millis(),
179+
"Keep-alive tick - attempting to send NoOp"
180+
);
160181

161182
// Create a NoOp packet
162183
let packet_id = last_packet_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
@@ -174,17 +195,43 @@ impl PeerConnection {
174195
};
175196

176197
// Send the keep-alive packet
177-
if outbound_packets
178-
.send((remote_addr, noop_packet))
179-
.await
180-
.is_err()
181-
{
182-
tracing::debug!(remote = ?remote_addr, "Keep-alive task stopping - channel closed");
183-
break;
198+
tracing::info!(
199+
target: "freenet_core::transport::keepalive_lifecycle",
200+
remote = ?remote_addr,
201+
packet_id,
202+
"Sending keep-alive NoOp packet"
203+
);
204+
205+
match outbound_packets.send((remote_addr, noop_packet)).await {
206+
Ok(_) => {
207+
tracing::info!(
208+
target: "freenet_core::transport::keepalive_lifecycle",
209+
remote = ?remote_addr,
210+
packet_id,
211+
"Keep-alive NoOp packet sent successfully"
212+
);
213+
}
214+
Err(e) => {
215+
tracing::warn!(
216+
target: "freenet_core::transport::keepalive_lifecycle",
217+
remote = ?remote_addr,
218+
error = ?e,
219+
elapsed_since_start_secs = task_start.elapsed().as_secs_f64(),
220+
total_ticks = tick_count,
221+
"Keep-alive task STOPPING - channel closed"
222+
);
223+
break;
224+
}
184225
}
185226
}
186227

187-
tracing::debug!(remote = ?remote_addr, "Keep-alive task exiting");
228+
tracing::warn!(
229+
target: "freenet_core::transport::keepalive_lifecycle",
230+
remote = ?remote_addr,
231+
total_lifetime_secs = task_start.elapsed().as_secs_f64(),
232+
total_ticks = tick_count,
233+
"Keep-alive task EXITING"
234+
);
188235
});
189236

190237
tracing::info!(remote = ?remote_addr, "PeerConnection created with persistent keep-alive task");
@@ -211,8 +258,8 @@ impl PeerConnection {
211258
) -> PeerConnectionMock {
212259
use crate::transport::crypto::TransportKeypair;
213260
use parking_lot::Mutex;
214-
let (outbound_packets, outbound_packets_recv) = mpsc::channel(1);
215-
let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(1);
261+
let (outbound_packets, outbound_packets_recv) = mpsc::channel(100);
262+
let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(100);
216263
let keypair = TransportKeypair::new();
217264
let remote = RemoteConnection {
218265
outbound_packets,
@@ -243,8 +290,8 @@ impl PeerConnection {
243290
) -> RemoteConnectionMock {
244291
use crate::transport::crypto::TransportKeypair;
245292
use parking_lot::Mutex;
246-
let (outbound_packets, outbound_packets_recv) = mpsc::channel(1);
247-
let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(1);
293+
let (outbound_packets, outbound_packets_recv) = mpsc::channel(100);
294+
let (inbound_packet_sender, inbound_packet_recv) = mpsc::channel(100);
248295
let keypair = TransportKeypair::new();
249296
(
250297
RemoteConnection {
@@ -411,6 +458,28 @@ impl PeerConnection {
411458
confirm_receipt,
412459
payload,
413460
} = msg;
461+
462+
// Log keep-alive packets specifically
463+
if matches!(payload, SymmetricMessagePayload::NoOp) {
464+
if confirm_receipt.is_empty() {
465+
tracing::info!(
466+
target: "freenet_core::transport::keepalive_received",
467+
remote = ?self.remote_conn.remote_addr,
468+
packet_id,
469+
time_since_last_received_ms = last_received.elapsed().as_millis(),
470+
"Received NoOp keep-alive packet (no receipts)"
471+
);
472+
} else {
473+
tracing::debug!(
474+
target: "freenet_core::transport::keepalive_received",
475+
remote = ?self.remote_conn.remote_addr,
476+
packet_id,
477+
receipt_count = confirm_receipt.len(),
478+
"Received NoOp receipt packet"
479+
);
480+
}
481+
}
482+
414483
{
415484
tracing::trace!(
416485
remote = %self.remote_conn.remote_addr,
@@ -484,9 +553,43 @@ impl PeerConnection {
484553
res.map_err(|e| TransportError::Other(e.into()))??
485554
}
486555
_ = timeout_check.tick() => {
487-
if last_received.elapsed() > KILL_CONNECTION_AFTER {
488-
tracing::warn!(remote = ?self.remote_conn.remote_addr, "connection timed out - no packets received for {:?}", last_received.elapsed());
556+
let elapsed = last_received.elapsed();
557+
if elapsed > KILL_CONNECTION_AFTER {
558+
tracing::warn!(
559+
target: "freenet_core::transport::keepalive_timeout",
560+
remote = ?self.remote_conn.remote_addr,
561+
elapsed_seconds = elapsed.as_secs_f64(),
562+
timeout_threshold_secs = KILL_CONNECTION_AFTER.as_secs(),
563+
"CONNECTION TIMEOUT - no packets received for {:.8}s",
564+
elapsed.as_secs_f64()
565+
);
566+
567+
// Check if keep-alive task is still alive
568+
if let Some(ref handle) = self.keep_alive_handle {
569+
if !handle.is_finished() {
570+
tracing::error!(
571+
target: "freenet_core::transport::keepalive_timeout",
572+
remote = ?self.remote_conn.remote_addr,
573+
"Keep-alive task is STILL RUNNING despite timeout!"
574+
);
575+
} else {
576+
tracing::error!(
577+
target: "freenet_core::transport::keepalive_timeout",
578+
remote = ?self.remote_conn.remote_addr,
579+
"Keep-alive task has ALREADY FINISHED before timeout!"
580+
);
581+
}
582+
}
583+
489584
return Err(TransportError::ConnectionClosed(self.remote_addr()));
585+
} else {
586+
tracing::trace!(
587+
target: "freenet_core::transport::keepalive_health",
588+
remote = ?self.remote_conn.remote_addr,
589+
elapsed_seconds = elapsed.as_secs_f64(),
590+
remaining_seconds = (KILL_CONNECTION_AFTER - elapsed).as_secs_f64(),
591+
"Connection health check - still alive"
592+
);
490593
}
491594
}
492595
_ = resend_check.take().unwrap_or(tokio::time::sleep(Duration::from_millis(10))) => {

crates/core/src/transport/rate_limiter.rs

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,14 @@ impl<T: TimeSource> PacketRateLimiter<T> {
3939
bandwidth_limit: Option<usize>,
4040
socket: Arc<S>,
4141
) {
42-
tracing::info!(bandwidth_limit, "Rate limiter task started");
42+
tracing::info!("Rate limiter task started");
4343
while let Some((socket_addr, packet)) = self.outbound_packets.recv().await {
4444
// tracing::trace!(%socket_addr, packet_len = %packet.len(), "Sending outbound packet");
4545
if let Some(bandwidth_limit) = bandwidth_limit {
4646
self.rate_limiting(bandwidth_limit, &*socket, packet, socket_addr)
4747
.await;
48-
} else if let Err(error) = socket.send_to(&packet, socket_addr).await {
49-
tracing::error!(%socket_addr, "Error sending packet: {}", error);
50-
continue;
48+
} else {
49+
let _ = socket.send_to(&packet, socket_addr).await;
5150
}
5251
}
5352
tracing::debug!("Rate limiter task ended unexpectedly");
@@ -64,11 +63,62 @@ impl<T: TimeSource> PacketRateLimiter<T> {
6463
if let Some(wait_time) = self.can_send_packet(bandwidth_limit, packet.len()) {
6564
tokio::time::sleep(wait_time).await;
6665
tracing::debug!(%socket_addr, "Sending outbound packet after waiting {:?}", wait_time);
67-
if let Err(error) = socket.send_to(&packet, socket_addr).await {
68-
tracing::error!("Error sending packet: {}", error);
66+
67+
tracing::info!(
68+
target: "freenet_core::transport::send_debug",
69+
dest_addr = %socket_addr,
70+
packet_len = packet.len(),
71+
wait_time_ms = wait_time.as_millis(),
72+
"Attempting to send packet (after rate limit wait)"
73+
);
74+
75+
match socket.send_to(&packet, socket_addr).await {
76+
Ok(bytes_sent) => {
77+
tracing::info!(
78+
target: "freenet_core::transport::send_debug",
79+
dest_addr = %socket_addr,
80+
bytes_sent,
81+
"Socket send_to completed (after wait)"
82+
);
83+
}
84+
Err(error) => {
85+
tracing::error!(
86+
target: "freenet_core::transport::send_debug",
87+
dest_addr = %socket_addr,
88+
error = %error,
89+
"Socket send_to failed (after wait)"
90+
);
91+
}
92+
}
93+
} else {
94+
tracing::info!(
95+
target: "freenet_core::transport::send_debug",
96+
dest_addr = %socket_addr,
97+
packet_len = packet.len(),
98+
first_bytes = ?&packet[..std::cmp::min(32, packet.len())],
99+
"Attempting to send packet (with rate limit)"
100+
);
101+
102+
match socket.send_to(&packet, socket_addr).await {
103+
Ok(bytes_sent) => {
104+
tracing::info!(
105+
target: "freenet_core::transport::send_debug",
106+
dest_addr = %socket_addr,
107+
bytes_sent,
108+
expected_len = packet.len(),
109+
"Socket send_to completed (rate limited)"
110+
);
111+
}
112+
Err(error) => {
113+
tracing::error!(
114+
target: "freenet_core::transport::send_debug",
115+
dest_addr = %socket_addr,
116+
error = %error,
117+
error_kind = ?error.kind(),
118+
"Socket send_to failed (rate limited)"
119+
);
120+
}
69121
}
70-
} else if let Err(error) = socket.send_to(&packet, socket_addr).await {
71-
tracing::debug!(%socket_addr, "Error sending packet: {:?}", error);
72122
}
73123
self.add_packet(packet.len());
74124
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Keep-alive Analysis Results
2+
3+
## Key Finding: Keep-alive packets are being SENT but NOT RECEIVED
4+
5+
### Evidence from logs:
6+
7+
1. **Keep-alive task lifecycle (Hypothesis #1, "the keep-alive task exits prematurely": REFUTED)**
8+
- Keep-alive tasks START correctly when connections are established
9+
- Keep-alive tasks are STILL RUNNING when timeout occurs
10+
- ERROR log: "Keep-alive task is STILL RUNNING despite timeout!"
11+
- Tasks do NOT exit prematurely
12+
13+
2. **Keep-alive sending pattern (vega 136.62.52.28)**
14+
- First connection (13:58:59 - 13:59:30):
15+
- Sent 3 keep-alives (packet IDs: 8, 9, 10) at 10s intervals
16+
- Timeout after ~30 seconds (30.001s without a received packet, per the timeline below)
17+
- Subsequent reconnections show same pattern
18+
19+
3. **Keep-alive receiving pattern**
20+
- ZERO keep-alive packets received from vega
21+
- This explains the 30-second timeout
22+
23+
## Timeline for first vega connection:
24+
- 13:58:59.243: Keep-alive task STARTED
25+
- 13:59:09.244: Keep-alive sent (packet 8) - 10s after start
26+
- 13:59:19.244: Keep-alive sent (packet 9) - 20s after start
27+
- 13:59:29.244: Keep-alive sent (packet 10) - 30s after start
28+
- 13:59:30.131: CONNECTION TIMEOUT at 30.001s
29+
- 13:59:30.131: ERROR: Keep-alive task is STILL RUNNING
30+
31+
## Confirmed Hypothesis:
32+
**Hypothesis #2: Keep-alive packets are sent but not received**
33+
- Status: CONFIRMED
34+
- We are sending keep-alives every 10 seconds
35+
- We are NOT receiving any keep-alives back
36+
- The connection times out as soon as ~30 seconds pass without any inbound packet
37+
38+
## Next Investigation:
39+
- Why are keep-alive packets not being received?
40+
- Are they being sent on the wire?
41+
- Are they being dropped/filtered?
42+
- Is vega gateway not sending them?

0 commit comments

Comments
 (0)