Skip to content

Commit 88b6094

Browse files
committed
feat(coprocessor): coordinate dependence-chain processing across multiple workers
It provides a non-blocking, distributed locking mechanism that coordinates dependence-chain processing across multiple tfhe-workers. A worker can acquire ownership of the next available dependence-chain entry for processing ordered by last_updated_at (FIFO queue-like approach). Ownership expires after a timeout, enabling work-stealing by other workers. New CLI param --worker_id
1 parent 928594e commit 88b6094

File tree

9 files changed

+273
-5
lines changed

9 files changed

+273
-5
lines changed

coprocessor/fhevm-engine/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coprocessor/fhevm-engine/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ sqlx = { version = "0.8.6", default-features = false, features = [
7272
"time",
7373
"postgres",
7474
"uuid",
75+
"chrono"
7576
] }
7677
testcontainers = "0.24.0"
7778
thiserror = "2.0.12"
@@ -93,6 +94,7 @@ tracing-subscriber = { version = "0.3.20", features = ["fmt", "json"] }
9394
humantime = "2.2.0"
9495
bytesize = "2.0.1"
9596
http = "1.3.1"
97+
chrono = { version = "0.4.41", features = ["serde"] }
9698

9799
[profile.dev.package.tfhe]
98100
overflow-checks = false

coprocessor/fhevm-engine/tfhe-worker/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ opentelemetry = { workspace = true }
3434
opentelemetry-otlp = { workspace = true }
3535
opentelemetry_sdk = { workspace = true }
3636
opentelemetry-semantic-conventions = { workspace = true }
37+
chrono = { workspace = true }
3738

3839
# crates.io dependencies
3940
actix-web = "4.9.0"
@@ -43,6 +44,7 @@ regex = "1.10.6"
4344
tonic-health = "0.12.3"
4445
tonic-types = "0.12.3"
4546
tonic-web = "0.12.3"
47+
uuid = { version = "1", features = ["v4"] }
4648

4749
# local dependencies
4850
fhevm-engine-common = { path = "../fhevm-engine-common" }

coprocessor/fhevm-engine/tfhe-worker/benches/utils.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ async fn start_coprocessor(rx: Receiver<bool>, app_port: u16, db_url: &str) {
108108
health_check_port: 8080,
109109
metric_rerand_batch_latency: MetricsConfig::default(),
110110
metric_fhe_batch_latency: MetricsConfig::default(),
111+
worker_id: None,
111112
};
112113

113114
std::thread::spawn(move || {

coprocessor/fhevm-engine/tfhe-worker/src/daemon_cli.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use clap::Parser;
22
use fhevm_engine_common::telemetry::MetricsConfig;
33
use fhevm_engine_common::utils::DatabaseURL;
44
use tracing::Level;
5+
use uuid::Uuid;
56

67
#[derive(Parser, Debug, Clone)]
78
#[command(version, about, long_about = None)]
@@ -83,6 +84,12 @@ pub struct Args {
8384
#[arg(long, default_value = "tfhe-worker")]
8485
pub service_name: String,
8586

87+
/// Worker/replica ID for this worker instance
88+
/// If not provided, a random UUID will be generated
89+
/// Used to identify the worker in the dependence_chain table
90+
#[arg(long, value_parser = clap::value_parser!(Uuid))]
91+
pub worker_id: Option<Uuid>,
92+
8693
/// Log level for the application
8794
#[arg(
8895
long,
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
use chrono::{DateTime, Utc};
2+
use sqlx::Postgres;
3+
use tracing::{info, warn};
4+
use uuid::Uuid;
5+
6+
/// Implements a non-blocking, distributed locking mechanism
7+
/// that coordinates dependence-chain processing across multiple workers
8+
pub struct LockMngr {
9+
pool: sqlx::Pool<Postgres>,
10+
worker_id: Uuid,
11+
lock: Option<DatabaseChainLock>,
12+
}
13+
14+
/// Dependence chain lock data
15+
#[derive(Debug, sqlx::FromRow, Clone)]
16+
pub struct DatabaseChainLock {
17+
pub dependence_chain_id: Vec<u8>,
18+
pub worker_id: Option<Uuid>,
19+
pub lock_acquired_at: Option<DateTime<Utc>>,
20+
pub lock_expires_at: Option<DateTime<Utc>>,
21+
pub last_updated_at: DateTime<Utc>,
22+
}
23+
24+
impl LockMngr {
25+
pub fn new(worker_id: Uuid, pool: sqlx::Pool<Postgres>) -> Self {
26+
Self {
27+
worker_id,
28+
pool,
29+
lock: None,
30+
}
31+
}
32+
33+
/// Acquire the next available dependence-chain entry for processing
34+
/// sorted by last_updated_at (FIFO).
35+
/// Returns the dependence_chain_id if a lock was acquired
36+
pub async fn acquire_next_lock(&mut self) -> Result<Option<Vec<u8>>, sqlx::Error> {
37+
let row = sqlx::query_as::<_, DatabaseChainLock>(
38+
r#"
39+
WITH c AS (
40+
SELECT dependence_chain_id
41+
FROM dependence_chain
42+
WHERE
43+
status = 'updated' -- Marked as updated by host-listener
44+
AND
45+
(worker_id IS NULL -- Ensure no other workers own it
46+
OR lock_expires_at < NOW()) -- Work-stealing of expired locks
47+
ORDER BY last_updated_at ASC -- FIFO
48+
FOR UPDATE SKIP LOCKED -- Ensure no other worker is currently trying to lock it
49+
LIMIT 1
50+
)
51+
UPDATE dependence_chain AS dc
52+
SET
53+
worker_id = $1,
54+
status = 'processing',
55+
lock_acquired_at = NOW(),
56+
lock_expires_at = NOW() + INTERVAL '30 seconds'
57+
FROM c
58+
WHERE dc.dependence_chain_id = c.dependence_chain_id
59+
RETURNING dc.*;
60+
"#,
61+
)
62+
.bind(self.worker_id.to_string())
63+
.fetch_optional(&self.pool)
64+
.await?;
65+
66+
let row = if let Some(row) = row {
67+
row
68+
} else {
69+
return Ok(None);
70+
};
71+
72+
self.lock.replace(row.clone());
73+
74+
info!(target: "deps_chain", ?row, "Acquired lock");
75+
76+
Ok(Some(row.dependence_chain_id))
77+
}
78+
79+
/// Release all locks held by this worker
80+
///
81+
/// If host-listener has marked the dependence chain as 'updated' in the meantime,
82+
/// we don't overwrite its status
83+
pub async fn release_all_owned_locks(&self) -> Result<u64, sqlx::Error> {
84+
// Since UPDATE always aquire a row-level lock internally,
85+
// this acts as atomic_exchange
86+
let rows = sqlx::query!(
87+
r#"
88+
UPDATE dependence_chain
89+
SET
90+
worker_id = NULL,
91+
lock_acquired_at = NULL,
92+
lock_expires_at = NULL,
93+
status = CASE
94+
WHEN status = 'processing' THEN 'processed'
95+
ELSE status
96+
END
97+
WHERE worker_id = $1
98+
"#,
99+
self.worker_id
100+
)
101+
.execute(&self.pool)
102+
.await?;
103+
104+
info!(target: "deps_chain", worker_id = %self.worker_id,
105+
count = rows.rows_affected(), "Released all locks");
106+
107+
Ok(rows.rows_affected())
108+
}
109+
110+
/// Release the lock held by this worker on the current dependence chain
111+
/// If host-listener has marked the dependence chain as 'updated' in the meantime,
112+
/// we don't overwrite its status
113+
pub async fn release_current_lock(&self) -> Result<u64, sqlx::Error> {
114+
let dep_chain_id = match &self.lock {
115+
Some(lock) => lock.dependence_chain_id.clone(),
116+
None => {
117+
warn!(target: "deps_chain", "No lock to release");
118+
return Ok(0);
119+
}
120+
};
121+
122+
let rows = sqlx::query!(
123+
r#"
124+
UPDATE dependence_chain
125+
SET
126+
worker_id = NULL,
127+
lock_acquired_at = NULL,
128+
lock_expires_at = NULL,
129+
status = CASE
130+
WHEN status = 'processing' THEN 'processed'
131+
ELSE status
132+
END
133+
WHERE worker_id = $1 AND dependence_chain_id = $2
134+
"#,
135+
self.worker_id,
136+
dep_chain_id,
137+
)
138+
.execute(&self.pool)
139+
.await?;
140+
141+
info!(target: "deps_chain", ?dep_chain_id, "Released lock");
142+
143+
Ok(rows.rows_affected())
144+
}
145+
146+
/// Set error on the current dependence chain
147+
/// If host-listener has marked the dependence chain as 'updated' in the meantime,
148+
/// we don't overwrite its error
149+
///
150+
/// The error is only informational and does not affect the processing status
151+
pub async fn set_processing_error(&self, err: Option<String>) -> Result<u64, sqlx::Error> {
152+
let dep_chain_id: Vec<u8> = match &self.lock {
153+
Some(lock) => lock.dependence_chain_id.clone(),
154+
None => {
155+
warn!(target: "deps_chain", "No lock to set error on");
156+
return Ok(0);
157+
}
158+
};
159+
160+
let rows = sqlx::query!(
161+
r#"
162+
UPDATE dependence_chain
163+
SET
164+
error_message = CASE
165+
WHEN status = 'processing' THEN $3
166+
ELSE error_message
167+
END
168+
WHERE worker_id = $1 AND dependence_chain_id = $2
169+
"#,
170+
self.worker_id,
171+
dep_chain_id,
172+
err
173+
)
174+
.execute(&self.pool)
175+
.await?;
176+
177+
info!(target: "deps_chain", ?dep_chain_id, error = ?err, "Set error on lock");
178+
Ok(rows.rows_affected())
179+
}
180+
181+
/// Extend the lock expiration time on the current dependence chain
182+
pub async fn extend_current_lock(&self) -> Result<(), sqlx::Error> {
183+
let dependence_chain_id = match &self.lock {
184+
Some(lock) => lock.dependence_chain_id.clone(),
185+
None => {
186+
info!(target: "deps_chain", "No lock to extend");
187+
return Ok(());
188+
}
189+
};
190+
191+
sqlx::query!(
192+
r#"
193+
UPDATE dependence_chain
194+
SET
195+
lock_expires_at = NOW() + INTERVAL '30 seconds'
196+
WHERE dependence_chain_id = $1 AND worker_id = $2
197+
"#,
198+
dependence_chain_id,
199+
self.worker_id
200+
)
201+
.execute(&self.pool)
202+
.await?;
203+
204+
info!(target: "deps_chain", ?dependence_chain_id, "Extended lock");
205+
206+
Ok(())
207+
}
208+
}

coprocessor/fhevm-engine/tfhe-worker/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use tokio::task::JoinSet;
88

99
pub mod daemon_cli;
1010
mod db_queries;
11+
pub mod dependence_chain;
1112
pub mod health_check;
1213
pub mod server;
1314

coprocessor/fhevm-engine/tfhe-worker/src/tests/utils.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ async fn start_coprocessor(rx: Receiver<bool>, app_port: u16, db_url: &str) {
117117
health_check_port: 8081,
118118
metric_rerand_batch_latency: MetricsConfig::default(),
119119
metric_fhe_batch_latency: MetricsConfig::default(),
120+
worker_id: None,
120121
};
121122

122123
std::thread::spawn(move || {

0 commit comments

Comments
 (0)