@@ -119,6 +119,12 @@ pub enum Error {
119
119
#[ error( "Failed to send request to Instance: Channel closed" ) ]
120
120
FailedSendChannelClosed ,
121
121
122
+ #[ error(
123
+ "Failed to send request to Instance: channel at capacity \
124
+ ({QUEUE_SIZE})"
125
+ ) ]
126
+ FailedSendChannelFull ,
127
+
122
128
#[ error(
123
129
"Failed to send request from Instance Runner: Client Channel closed"
124
130
) ]
@@ -217,10 +223,10 @@ enum InstanceRequest {
217
223
tx : oneshot:: Sender < Result < ZoneBundleMetadata , BundleError > > ,
218
224
} ,
219
225
GetFilesystemPool {
220
- tx : oneshot:: Sender < Option < ZpoolName > > ,
226
+ tx : oneshot:: Sender < Result < Option < ZpoolName > , ManagerError > > ,
221
227
} ,
222
228
CurrentState {
223
- tx : oneshot:: Sender < SledVmmState > ,
229
+ tx : oneshot:: Sender < Result < SledVmmState , ManagerError > > ,
224
230
} ,
225
231
PutState {
226
232
state : VmmStateRequested ,
@@ -248,6 +254,58 @@ enum InstanceRequest {
248
254
} ,
249
255
}
250
256
257
+ impl InstanceRequest {
258
+ /// Handle an error returned by [`mpsc::Sender::try_send`] when attempting
259
+ /// to send a request to the instance.
260
+ ///
261
+ /// This is a bit complex: the returned [`mpsc::error::TrySendError`] carries
262
+ /// the [`InstanceRequest`] we were trying to send, and thus the
263
+ /// [`oneshot::Sender`] for that request's response. This function handles
264
+ /// the `TrySendError` in three steps: it inspects the error to determine
265
+ /// whether the channel is closed or full and constructs the matching
266
+ /// [`Error`], extracts the response oneshot channel from the request, and
267
+ /// sends the error back over that channel.
268
+ ///
269
+ /// Returns `Ok(())` if the error was delivered to the client. If that send
270
+ /// fails, returns an error so that the client having given up can be logged.
271
+ fn fail_try_send (
272
+ err : mpsc:: error:: TrySendError < Self > ,
273
+ ) -> Result < ( ) , Error > {
274
+ let ( error, this) = match err {
275
+ mpsc:: error:: TrySendError :: Closed ( this) => {
276
+ ( Error :: FailedSendChannelClosed , this)
277
+ }
278
+ mpsc:: error:: TrySendError :: Full ( this) => {
279
+ ( Error :: FailedSendChannelFull , this)
280
+ }
281
+ } ;
282
+
283
+ match this {
284
+ Self :: RequestZoneBundle { tx } => tx
285
+ . send ( Err ( BundleError :: FailedSend ( anyhow ! ( error) ) ) )
286
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
287
+ Self :: GetFilesystemPool { tx } => tx
288
+ . send ( Err ( error. into ( ) ) )
289
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
290
+ Self :: CurrentState { tx } => tx
291
+ . send ( Err ( error. into ( ) ) )
292
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
293
+ Self :: PutState { tx, .. } => tx
294
+ . send ( Err ( error. into ( ) ) )
295
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
296
+ Self :: Terminate { tx, .. } => tx
297
+ . send ( Err ( error. into ( ) ) )
298
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
299
+ Self :: IssueSnapshotRequest { tx, .. }
300
+ | Self :: AddExternalIp { tx, .. }
301
+ | Self :: DeleteExternalIp { tx, .. }
302
+ | Self :: RefreshExternalIps { tx } => tx
303
+ . send ( Err ( error. into ( ) ) )
304
+ . map_err ( |_| Error :: FailedSendClientClosed ) ,
305
+ }
306
+ }
307
+ }
308
+
251
309
// A small task which tracks the state of the instance, by constantly querying
252
310
// the state of Propolis for updates.
253
311
//
@@ -488,11 +546,11 @@ impl InstanceRunner {
488
546
. map_err( |_| Error :: FailedSendClientClosed )
489
547
} ,
490
548
Some ( GetFilesystemPool { tx } ) => {
491
- tx. send( self . get_filesystem_zpool( ) )
549
+ tx. send( Ok ( self . get_filesystem_zpool( ) ) )
492
550
. map_err( |_| Error :: FailedSendClientClosed )
493
551
} ,
494
552
Some ( CurrentState { tx } ) => {
495
- tx. send( self . current_state( ) )
553
+ tx. send( Ok ( self . current_state( ) ) )
496
554
. map_err( |_| Error :: FailedSendClientClosed )
497
555
} ,
498
556
Some ( PutState { state, tx } ) => {
@@ -562,9 +620,9 @@ impl InstanceRunner {
562
620
RequestZoneBundle { tx } => tx
563
621
. send ( Err ( BundleError :: InstanceTerminating ) )
564
622
. map_err ( |_| ( ) ) ,
565
- GetFilesystemPool { tx } => tx. send ( None ) . map_err ( |_| ( ) ) ,
623
+ GetFilesystemPool { tx } => tx. send ( Ok ( None ) ) . map_err ( |_| ( ) ) ,
566
624
CurrentState { tx } => {
567
- tx. send ( self . current_state ( ) ) . map_err ( |_| ( ) )
625
+ tx. send ( Ok ( self . current_state ( ) ) ) . map_err ( |_| ( ) )
568
626
}
569
627
PutState { tx, .. } => {
570
628
tx. send ( Err ( Error :: Terminating . into ( ) ) ) . map_err ( |_| ( ) )
@@ -1092,13 +1150,48 @@ fn propolis_error_code(
1092
1150
}
1093
1151
1094
1152
/// Describes a single Propolis server that incarnates a specific instance.
1153
+ #[ derive( Clone ) ]
1095
1154
pub struct Instance {
1096
1155
id : InstanceUuid ,
1097
1156
1157
+ /// Request channel for communicating with the instance task.
1158
+ ///
1159
+ /// # Extremely Serious Warning
1160
+ ///
1161
+ /// This channel is used by the `InstanceManager` task to communicate with the
1162
+ /// instance task corresponding to each instance on this sled. Note that all
1163
+ /// of the methods on this type which send [`InstanceRequest`]s over this
1164
+ /// channel use [`mpsc::Sender::try_send`], which fails if the channel is at
1165
+ /// capacity, and *not* [`mpsc::Sender::send`], which is an async method
1166
+ /// that *waits* until capacity is available. THIS IS VERY IMPORTANT.
1167
+ ///
1168
+ /// This is because the `InstanceManager` task will call these methods in
1169
+ /// its request-processing loop as it receives requests from clients, in
1170
+ /// order to forward the request to the relevant instance. If the instance's
1171
+ /// channel has filled up because the instance is currently processing a
1172
+ /// slow request, `await`ing a call to [`mpsc::Sender::send`] will block the
1173
+ /// `InstanceManager`'s main loop from proceeding until the instance task
1174
+ /// finishes what it's doing and drains the next request from the channel.
1175
+ /// Critically, this means that requests to *other, unrelated instances* on
1176
+ /// this sled would have to wait until this instance has finished what it's
1177
+ /// doing. That means a single deadlocked instance task, which is waiting
1178
+ /// for something that never completes, can render *all* instances on this
1179
+ /// sled inaccessible.
1180
+ ///
1181
+ /// Therefore, any code that sends requests to the `Instance` over this
1182
+ /// channel from within the `InstanceManager`'s run loop MUST use
1183
+ /// [`mpsc::Sender::try_send`] rather than [`mpsc::Sender::send`]. Should
1184
+ /// the channel be at capacity, we return an
1185
+ /// [`Error::FailedSendChannelFull`], which eventually becomes a 503 Service
1186
+ /// Unavailable error when returned to the client. It is acceptable to call
1187
+ /// [`mpsc::Sender::send`] on this channel ONLY from code which runs
1188
+ /// exclusively in tasks that are not blocking the `InstanceManager`'s run
1189
+ /// loop.
1098
1190
tx : mpsc:: Sender < InstanceRequest > ,
1099
1191
1192
+ /// Handle to the runner task, reference-counted so `Instance` can be cloned.
1100
1193
#[ allow( dead_code) ]
1101
- runner_handle : tokio:: task:: JoinHandle < ( ) > ,
1194
+ runner_handle : Arc < tokio:: task:: JoinHandle < ( ) > > ,
1102
1195
}
1103
1196
1104
1197
#[ derive( Debug ) ]
@@ -1250,43 +1343,39 @@ impl Instance {
1250
1343
let runner_handle =
1251
1344
tokio:: task:: spawn ( async move { runner. run ( ) . await } ) ;
1252
1345
1253
- Ok ( Instance { id, tx, runner_handle } )
1346
+ Ok ( Instance { id, tx, runner_handle : Arc :: new ( runner_handle ) } )
1254
1347
}
1255
1348
1256
1349
pub fn id ( & self ) -> InstanceUuid {
1257
1350
self . id
1258
1351
}
1259
1352
1260
1353
/// Request a zone bundle from this instance's zone; the result is sent on `tx`.
1261
- pub async fn request_zone_bundle (
1354
+ pub fn request_zone_bundle (
1262
1355
& self ,
1263
1356
tx : oneshot:: Sender < Result < ZoneBundleMetadata , BundleError > > ,
1264
- ) -> Result < ( ) , BundleError > {
1357
+ ) -> Result < ( ) , Error > {
1265
1358
self . tx
1266
- . send ( InstanceRequest :: RequestZoneBundle { tx } )
1267
- . await
1268
- . map_err ( |err| BundleError :: FailedSend ( anyhow ! ( err) ) ) ?;
1269
- Ok ( ( ) )
1359
+ . try_send ( InstanceRequest :: RequestZoneBundle { tx } )
1360
+ . or_else ( InstanceRequest :: fail_try_send)
1270
1361
}
1271
1362
1272
- pub async fn get_filesystem_zpool (
1363
+ pub fn get_filesystem_zpool (
1273
1364
& self ,
1274
- ) -> Result < Option < ZpoolName > , Error > {
1275
- let ( tx , rx ) = oneshot :: channel ( ) ;
1365
+ tx : oneshot :: Sender < Result < Option < ZpoolName > , ManagerError > > ,
1366
+ ) -> Result < ( ) , Error > {
1276
1367
self . tx
1277
- . send ( InstanceRequest :: GetFilesystemPool { tx } )
1278
- . await
1279
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1280
- Ok ( rx. await ?)
1368
+ . try_send ( InstanceRequest :: GetFilesystemPool { tx } )
1369
+ . or_else ( InstanceRequest :: fail_try_send)
1281
1370
}
1282
1371
1283
- pub async fn current_state ( & self ) -> Result < SledVmmState , Error > {
1284
- let ( tx, rx) = oneshot:: channel ( ) ;
1372
+ pub fn current_state (
1373
+ & self ,
1374
+ tx : oneshot:: Sender < Result < SledVmmState , ManagerError > > ,
1375
+ ) -> Result < ( ) , Error > {
1285
1376
self . tx
1286
- . send ( InstanceRequest :: CurrentState { tx } )
1287
- . await
1288
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1289
- Ok ( rx. await ?)
1377
+ . try_send ( InstanceRequest :: CurrentState { tx } )
1378
+ . or_else ( InstanceRequest :: fail_try_send)
1290
1379
}
1291
1380
1292
1381
/// Attempts to update the current state of the instance by launching a
@@ -1300,84 +1389,72 @@ impl Instance {
1300
1389
/// instance begins to stop when Propolis has just begun to handle a prior
1301
1390
/// request to reboot, the instance's state may proceed from Stopping to
1302
1391
/// Rebooting to Running to Stopping to Stopped.
1303
- pub async fn put_state (
1392
+ pub fn put_state (
1304
1393
& self ,
1305
1394
tx : oneshot:: Sender < Result < VmmPutStateResponse , ManagerError > > ,
1306
1395
state : VmmStateRequested ,
1307
1396
) -> Result < ( ) , Error > {
1308
1397
self . tx
1309
- . send ( InstanceRequest :: PutState { state, tx } )
1310
- . await
1311
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1312
- Ok ( ( ) )
1398
+ . try_send ( InstanceRequest :: PutState { state, tx } )
1399
+ . or_else ( InstanceRequest :: fail_try_send)
1313
1400
}
1314
1401
1315
1402
/// Rudely terminates this instance's Propolis (if it has one) and
1316
1403
/// immediately transitions the instance to the Destroyed state.
1317
- pub async fn terminate (
1404
+ pub fn terminate (
1318
1405
& self ,
1319
1406
tx : oneshot:: Sender < Result < VmmUnregisterResponse , ManagerError > > ,
1320
1407
mark_failed : bool ,
1321
1408
) -> Result < ( ) , Error > {
1322
1409
self . tx
1323
- . send ( InstanceRequest :: Terminate { mark_failed, tx } )
1324
- . await
1325
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1326
- Ok ( ( ) )
1410
+ . try_send ( InstanceRequest :: Terminate { mark_failed, tx } )
1411
+ . or_else ( InstanceRequest :: fail_try_send)
1327
1412
}
1328
1413
1329
- pub async fn issue_snapshot_request (
1414
+ pub fn issue_snapshot_request (
1330
1415
& self ,
1331
1416
tx : oneshot:: Sender < Result < ( ) , ManagerError > > ,
1332
1417
disk_id : Uuid ,
1333
1418
snapshot_id : Uuid ,
1334
1419
) -> Result < ( ) , Error > {
1335
1420
self . tx
1336
- . send ( InstanceRequest :: IssueSnapshotRequest {
1421
+ . try_send ( InstanceRequest :: IssueSnapshotRequest {
1337
1422
disk_id,
1338
1423
snapshot_id,
1339
1424
tx,
1340
1425
} )
1341
- . await
1342
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1343
- Ok ( ( ) )
1426
+ . or_else ( InstanceRequest :: fail_try_send)
1344
1427
}
1345
1428
1346
- pub async fn add_external_ip (
1429
+ pub fn add_external_ip (
1347
1430
& self ,
1348
1431
tx : oneshot:: Sender < Result < ( ) , ManagerError > > ,
1349
1432
ip : & InstanceExternalIpBody ,
1350
1433
) -> Result < ( ) , Error > {
1351
1434
self . tx
1352
- . send ( InstanceRequest :: AddExternalIp { ip : * ip, tx } )
1353
- . await
1354
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1355
- Ok ( ( ) )
1435
+ . try_send ( InstanceRequest :: AddExternalIp { ip : * ip, tx } )
1436
+ . or_else ( InstanceRequest :: fail_try_send)
1356
1437
}
1357
1438
1358
- pub async fn delete_external_ip (
1439
+ pub fn delete_external_ip (
1359
1440
& self ,
1360
1441
tx : oneshot:: Sender < Result < ( ) , ManagerError > > ,
1361
1442
ip : & InstanceExternalIpBody ,
1362
1443
) -> Result < ( ) , Error > {
1363
1444
self . tx
1364
- . send ( InstanceRequest :: DeleteExternalIp { ip : * ip, tx } )
1365
- . await
1366
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1367
- Ok ( ( ) )
1445
+ . try_send ( InstanceRequest :: DeleteExternalIp { ip : * ip, tx } )
1446
+ . or_else ( InstanceRequest :: fail_try_send)
1368
1447
}
1369
1448
1370
1449
/// Reinstalls an instance's set of external IPs within OPTE, using
1371
1450
/// up-to-date IP<->IGW mappings. This will not disrupt existing flows.
1372
- pub async fn refresh_external_ips (
1451
+ pub fn refresh_external_ips (
1373
1452
& self ,
1374
1453
tx : oneshot:: Sender < Result < ( ) , ManagerError > > ,
1375
1454
) -> Result < ( ) , Error > {
1376
1455
self . tx
1377
- . send ( InstanceRequest :: RefreshExternalIps { tx } )
1378
- . await
1379
- . map_err ( |_| Error :: FailedSendChannelClosed ) ?;
1380
- Ok ( ( ) )
1456
+ . try_send ( InstanceRequest :: RefreshExternalIps { tx } )
1457
+ . or_else ( InstanceRequest :: fail_try_send)
1381
1458
}
1382
1459
}
1383
1460
@@ -2104,7 +2181,6 @@ mod tests {
2104
2181
// pretending we're InstanceManager::ensure_state, start our "instance"
2105
2182
// (backed by fakes and propolis_mock_server)
2106
2183
inst. put_state ( put_tx, VmmStateRequested :: Running )
2107
- . await
2108
2184
. expect ( "failed to send Instance::put_state" ) ;
2109
2185
2110
2186
// even though we ignore this result at instance creation time in
@@ -2198,7 +2274,6 @@ mod tests {
2198
2274
// pretending we're InstanceManager::ensure_state, try in vain to start
2199
2275
// our "instance", but no propolis server is running
2200
2276
inst. put_state ( put_tx, VmmStateRequested :: Running )
2201
- . await
2202
2277
. expect ( "failed to send Instance::put_state" ) ;
2203
2278
2204
2279
let timeout_fut = timeout ( TIMEOUT_DURATION , put_rx) ;
@@ -2305,7 +2380,6 @@ mod tests {
2305
2380
// pretending we're InstanceManager::ensure_state, try in vain to start
2306
2381
// our "instance", but the zone never finishes installing
2307
2382
inst. put_state ( put_tx, VmmStateRequested :: Running )
2308
- . await
2309
2383
. expect ( "failed to send Instance::put_state" ) ;
2310
2384
2311
2385
// Timeout our future waiting for the instance-state-change at 1s. This
0 commit comments