diff --git a/contrib/test/run_fd_shred_cap.sh b/contrib/test/run_fd_shred_cap.sh index bdf888276b..82e9b0af72 100755 --- a/contrib/test/run_fd_shred_cap.sh +++ b/contrib/test/run_fd_shred_cap.sh @@ -90,16 +90,16 @@ echo " idx_max = 8192 alloc_max = 1073741824 file = \"$DATA_DIR/shredcap_testnet.blockstore\" +[funk] + max_account_records = 150000000 + heap_size_gib = 100 + max_database_transactions = 2000 [tiles] [tiles.shred] max_pending_shred_sets = 16384 [tiles.replay] snapshot = \"$SNAPSHOT\" incremental = \"$INCREMENTAL\" - funk_sz_gb = 100 - funk_rec_max = 150000000 - funk_txn_max = 2000 - funk_file = \"$DATA_DIR/shredcap_testnet.funk\" [tiles.store_int] shred_cap_replay = \"$SHREDCAP\" shred_cap_end_slot = 317018450 diff --git a/contrib/test/test_firedancer_leader.sh b/contrib/test/test_firedancer_leader.sh index 282f354bcb..3e9771ac58 100755 --- a/contrib/test/test_firedancer_leader.sh +++ b/contrib/test/test_firedancer_leader.sh @@ -50,10 +50,6 @@ echo " [tiles.replay] capture = \"firedancer-dev.solcap\" snapshot = \"$FULL_SNAPSHOT\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 - funk_file = \"/tmp/localnet.funk\" cluster_version = \"2.0.14\" [tiles.gui] enabled = false @@ -72,6 +68,10 @@ echo " txn_max = 1024 alloc_max = 10737418240 file = \"/tmp/localnet.blockstore\" +[funk] + max_account_records = 10000000 + heap_size_gib = 32 + max_database_transactions = 1024 [log] path = \"firedancer-dev.log\" level_stderr = \"INFO\" diff --git a/snapload.toml b/snapload.toml new file mode 100644 index 0000000000..2038d361b0 --- /dev/null +++ b/snapload.toml @@ -0,0 +1,11 @@ +[hugetlbfs] +max_page_size = "huge" + +[funk] +max_account_records = 100_000_000 +heap_size_gib = 64 + +[log] + level_stderr = "INFO" + level_logfile = "INFO" + path = "-" diff --git a/src/app/firedancer-dev/Local.mk b/src/app/firedancer-dev/Local.mk index 59bddacbaa..85d8a4b289 100644 --- a/src/app/firedancer-dev/Local.mk +++ b/src/app/firedancer-dev/Local.mk @@ -13,6 +13,7 
@@ $(call add-objs,commands/gossip,fd_firedancer_dev) $(call add-objs,commands/bench,fd_firedancer_dev) $(call add-objs,commands/dev,fd_firedancer_dev) $(call add-objs,commands/sim,fd_firedancer_dev) +$(call add-objs,commands/snapshot_load,fd_firedancer_dev) $(call add-objs,commands/backtest,fd_firedancer_dev) $(call make-bin,firedancer-dev,main,fd_firedancer_dev fd_firedancer fddev_shared fdctl_shared fd_discof fd_disco fd_choreo fd_flamenco fd_funk fd_quic fd_tls fd_reedsol fd_ballet fd_waltz fd_tango fd_util firedancer_version, $(SECP256K1_LIBS) $(ROCKSDB_LIBS)) diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index aa3207fa5a..f5df7b934e 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -15,7 +15,7 @@ */ -#include "../../shared/commands/configure/configure.h" +#include "../../firedancer/topology.h" #include "../../shared/commands/run/run.h" /* initialize_workspaces */ #include "../../shared/fd_config.h" /* config_t */ #include "../../../disco/tiles.h" @@ -23,124 +23,12 @@ #include "../../../disco/topo/fd_topob.h" #include "../../../disco/topo/fd_pod_format.h" #include "../../../discof/geyser/fd_replay_notif.h" -#include "../../../flamenco/runtime/fd_runtime.h" #include "../../../flamenco/runtime/fd_txncache.h" -#include "../../../flamenco/snapshot/fd_snapshot_base.h" #include /* pause */ extern fd_topo_obj_callbacks_t * CALLBACKS[]; fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); -static fd_topo_obj_t * -setup_topo_runtime_pub( fd_topo_t * topo, - char const * wksp_name, - ulong mem_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "runtime_pub", wksp_name ); - FD_TEST( fd_pod_insertf_ulong( topo->props, mem_max, "obj.%lu.mem_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, 12UL, "obj.%lu.wksp_tag", obj->id ) ); - return obj; -} - -static fd_topo_obj_t * -setup_topo_txncache( fd_topo_t * topo, - char const * 
wksp_name, - ulong max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_live_slots, "obj.%lu.max_live_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_txn_per_slot, "obj.%lu.max_txn_per_slot", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_constipated_slots, "obj.%lu.max_constipated_slots", obj->id ) ); - - return obj; -} - -#include -#include "../../../flamenco/runtime/fd_blockstore.h" -static fd_topo_obj_t * -setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - ulong alloc_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "blockstore", wksp_name ); - - ulong seed; - FD_TEST( sizeof(ulong) == getrandom( &seed, sizeof(ulong), 0 ) ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, 1UL, "obj.%lu.wksp_tag", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, shred_max, "obj.%lu.shred_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, block_max, "obj.%lu.block_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, idx_max, "obj.%lu.idx_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, txn_max, "obj.%lu.txn_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, alloc_max, "obj.%lu.alloc_max", obj->id ) ); - - /* DO NOT MODIFY LOOSE WITHOUT CHANGING HOW BLOCKSTORE ALLOCATES INTERNAL STRUCTURES */ - - ulong blockstore_footprint = fd_blockstore_footprint( shred_max, block_max, idx_max, txn_max ) + alloc_max; - FD_TEST( fd_pod_insertf_ulong( topo->props, blockstore_footprint, "obj.%lu.loose", obj->id ) ); - - return obj; -} - -static void 
-setup_snapshots( config_t * config, - fd_topo_tile_t * tile ) { - uchar incremental_is_file, incremental_is_url; - if( strnlen( config->tiles.replay.incremental, PATH_MAX )>0UL ) { - incremental_is_file = 1U; - } else { - incremental_is_file = 0U; - } - if( strnlen( config->tiles.replay.incremental_url, PATH_MAX )>0UL ) { - incremental_is_url = 1U; - } else { - incremental_is_url = 0U; - } - if( FD_UNLIKELY( incremental_is_file && incremental_is_url ) ) { - FD_LOG_ERR(( "At most one of the incremental snapshot source strings in the configuration file under [tiles.replay.incremental] and [tiles.replay.incremental_url] may be set." )); - } - tile->replay.incremental_src_type = INT_MAX; - if( FD_LIKELY( incremental_is_url ) ) { - strncpy( tile->replay.incremental, config->tiles.replay.incremental_url, sizeof(tile->replay.incremental) ); - tile->replay.incremental_src_type = FD_SNAPSHOT_SRC_HTTP; - } - if( FD_UNLIKELY( incremental_is_file ) ) { - strncpy( tile->replay.incremental, config->tiles.replay.incremental, sizeof(tile->replay.incremental) ); - tile->replay.incremental_src_type = FD_SNAPSHOT_SRC_FILE; - } - - uchar snapshot_is_file, snapshot_is_url; - if( strnlen( config->tiles.replay.snapshot, PATH_MAX )>0UL ) { - snapshot_is_file = 1U; - } else { - snapshot_is_file = 0U; - } - if( strnlen( config->tiles.replay.snapshot_url, PATH_MAX )>0UL ) { - snapshot_is_url = 1U; - } else { - snapshot_is_url = 0U; - } - if( FD_UNLIKELY( snapshot_is_file && snapshot_is_url ) ) { - FD_LOG_ERR(( "At most one of the full snapshot source strings in the configuration file under [tiles.replay.snapshot] and [tiles.replay.snapshot_url] may be set." 
)); - } - tile->replay.snapshot_src_type = INT_MAX; - if( FD_LIKELY( snapshot_is_url ) ) { - strncpy( tile->replay.snapshot, config->tiles.replay.snapshot_url, sizeof(tile->replay.snapshot) ); - tile->replay.snapshot_src_type = FD_SNAPSHOT_SRC_HTTP; - } - if( FD_UNLIKELY( snapshot_is_file ) ) { - strncpy( tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(tile->replay.snapshot) ); - tile->replay.snapshot_src_type = FD_SNAPSHOT_SRC_FILE; - } -} - static void backtest_topo( config_t * config ) { fd_topo_cpus_t cpus[1]; @@ -164,10 +52,7 @@ backtest_topo( config_t * config ) { /**********************************************************************/ fd_topob_wksp( topo, "metric" ); fd_topob_wksp( topo, "metric_in" ); - fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); - if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) - FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); - metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); /**********************************************************************/ /* Add the backtest tile to topo */ @@ -187,52 +72,16 @@ backtest_topo( config_t * config ) { /**********************************************************************/ fd_topob_wksp( topo, "replay" ); fd_topo_tile_t * replay_tile = fd_topob_tile( topo, "replay", "replay", "metric_in", replay_cpu_idx, 0, 0 ); - replay_tile->replay.fec_max = config->tiles.shred.max_pending_shred_sets; - replay_tile->replay.max_vote_accounts = config->firedancer.runtime.limits.max_vote_accounts; /* specified by [tiles.replay] */ - strncpy( replay_tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(replay_tile->replay.blockstore_file) ); - strncpy( 
replay_tile->replay.blockstore_checkpt, config->firedancer.blockstore.checkpt, sizeof(replay_tile->replay.blockstore_checkpt) ); - - replay_tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; - strncpy( replay_tile->replay.capture, config->tiles.replay.capture, sizeof(replay_tile->replay.capture) ); - strncpy( replay_tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(replay_tile->replay.funk_checkpt) ); - replay_tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - replay_tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - replay_tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( replay_tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(replay_tile->replay.funk_file) ); - replay_tile->replay.plugins_enabled = config->tiles.gui.enabled; - - if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) - && !strncmp( config->tiles.replay.snapshot, "", 1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); - } - strncpy( replay_tile->replay.genesis, config->tiles.replay.genesis, sizeof(replay_tile->replay.genesis) ); - - setup_snapshots( config, replay_tile ); + fd_topob_wksp( topo, "funk" ); + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); - strncpy( replay_tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(replay_tile->replay.slots_replayed) ); - strncpy( replay_tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(replay_tile->replay.status_cache) ); - strncpy( replay_tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(replay_tile->replay.cluster_version) ); - replay_tile->replay.bank_tile_count = config->layout.bank_tile_count; - replay_tile->replay.exec_tile_count = 
config->firedancer.layout.exec_tile_count; - replay_tile->replay.writer_tile_cuont = config->firedancer.layout.writer_tile_count; - strncpy( replay_tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(replay_tile->replay.tower_checkpt) ); - - replay_tile->replay.enable_features_cnt = config->tiles.replay.enable_features_cnt; - for( ulong i = 0; i < replay_tile->replay.enable_features_cnt; i++ ) { - strncpy( replay_tile->replay.enable_features[i], config->tiles.replay.enable_features[i], sizeof(replay_tile->replay.enable_features[i]) ); - } - - /* not specified by [tiles.replay] */ - - strncpy( replay_tile->replay.identity_key_path, config->paths.identity_key, sizeof(replay_tile->replay.identity_key_path) ); - replay_tile->replay.ip_addr = config->net.ip_addr; - replay_tile->replay.vote = config->firedancer.consensus.vote; - strncpy( replay_tile->replay.vote_account_path, config->paths.vote_account, sizeof(replay_tile->replay.vote_account_path) ); - replay_tile->replay.full_interval = config->tiles.batch.full_interval; - replay_tile->replay.incremental_interval = config->tiles.batch.incremental_interval; + fd_topob_tile_uses( topo, replay_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); /**********************************************************************/ /* Add the executor tiles to topo */ @@ -427,6 +276,29 @@ backtest_topo( config_t * config ) { fd_topob_tile_uses( topo, replay_tile, constipated_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FD_TEST( fd_pod_insertf_ulong( topo->props, constipated_obj->id, "constipate" ) ); + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !strcmp( tile->name, "rocksdb" ) ) { + tile->archiver.end_slot = config->tiles.archiver.end_slot; + strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); + if( FD_UNLIKELY( 0==strlen( tile->archiver.archiver_path ) ) ) { + FD_LOG_ERR(( "Rocksdb not found, check `archiver.archiver_path` in toml" )); + } else { 
+ FD_LOG_NOTICE(( "Found rocksdb path from config: %s", tile->archiver.archiver_path )); + } + } else if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + + /* Override */ + if( !strcmp( tile->name, "replay" ) ) { + tile->replay.enable_features_cnt = config->tiles.replay.enable_features_cnt; + for( ulong i = 0; i < tile->replay.enable_features_cnt; i++ ) { + strncpy( tile->replay.enable_features[i], config->tiles.replay.enable_features[i], sizeof(tile->replay.enable_features[i]) ); + } + } + } + /**********************************************************************/ /* Finish and print out the topo information */ /**********************************************************************/ @@ -436,7 +308,7 @@ backtest_topo( config_t * config ) { static void backtest_cmd_fn( args_t * args FD_PARAM_UNUSED, - config_t * config ) { + config_t * config ) { FD_LOG_NOTICE(( "Start to run the backtest cmd" )); backtest_topo( config ); @@ -459,13 +331,13 @@ backtest_cmd_fn( args_t * args FD_PARAM_UNUSED, static void backtest_cmd_perm( args_t * args FD_PARAM_UNUSED, - fd_cap_chk_t * chk FD_PARAM_UNUSED, - config_t const * config FD_PARAM_UNUSED ) {} + fd_cap_chk_t * chk FD_PARAM_UNUSED, + config_t const * config FD_PARAM_UNUSED ) {} static void backtest_cmd_args( int * pargc FD_PARAM_UNUSED, - char *** pargv FD_PARAM_UNUSED, - args_t * args FD_PARAM_UNUSED ) {} + char *** pargv FD_PARAM_UNUSED, + args_t * args FD_PARAM_UNUSED ) {} action_t fd_action_backtest = { .name = "backtest", diff --git a/src/app/firedancer-dev/commands/sim.c b/src/app/firedancer-dev/commands/sim.c index 0417dfc3fe..1b084e3968 100644 --- a/src/app/firedancer-dev/commands/sim.c +++ b/src/app/firedancer-dev/commands/sim.c @@ -17,78 +17,17 @@ a notification for the previous frag from storei_notif. 
*/ +#include "../../firedancer/topology.h" #include "../../shared/commands/run/run.h" /* initialize_workspaces */ -#include "../../shared/fd_config.h" /* config_t */ #include "../../../disco/topo/fd_cpu_topo.h" /* fd_topo_cpus */ #include "../../../disco/topo/fd_topob.h" #include "../../../disco/topo/fd_pod_format.h" -#include "../../../flamenco/runtime/fd_runtime.h" #include "../../../flamenco/runtime/fd_txncache.h" #include /* pause */ extern fd_topo_obj_callbacks_t * CALLBACKS[]; fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); -/* setup_topo_txncache, setup_topo_runtime_pub and setup_topo_blockstore - are simply copied from fd_firedancer.c */ -static fd_topo_obj_t * -setup_topo_txncache( fd_topo_t * topo, - char const * wksp_name, - ulong max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_live_slots, "obj.%lu.max_live_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_txn_per_slot, "obj.%lu.max_txn_per_slot", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_constipated_slots, "obj.%lu.max_constipated_slots", obj->id ) ); - - return obj; -} - -static fd_topo_obj_t * -setup_topo_runtime_pub( fd_topo_t * topo, - char const * wksp_name, - ulong mem_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "runtime_pub", wksp_name ); - FD_TEST( fd_pod_insertf_ulong( topo->props, mem_max, "obj.%lu.mem_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, 12UL, "obj.%lu.wksp_tag", obj->id ) ); - return obj; -} - -#include -#include "../../../flamenco/runtime/fd_blockstore.h" -static fd_topo_obj_t * -setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - 
ulong alloc_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "blockstore", wksp_name ); - - ulong seed; - FD_TEST( sizeof(ulong) == getrandom( &seed, sizeof(ulong), 0 ) ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, 1UL, "obj.%lu.wksp_tag", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, shred_max, "obj.%lu.shred_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, block_max, "obj.%lu.block_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, idx_max, "obj.%lu.idx_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, txn_max, "obj.%lu.txn_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, alloc_max, "obj.%lu.alloc_max", obj->id ) ); - - /* DO NOT MODIFY LOOSE WITHOUT CHANGING HOW BLOCKSTORE ALLOCATES INTERNAL STRUCTURES */ - - ulong blockstore_footprint = fd_blockstore_footprint( shred_max, block_max, idx_max, txn_max ) + alloc_max; - FD_TEST( fd_pod_insertf_ulong( topo->props, blockstore_footprint, "obj.%lu.loose", obj->id ) ); - - return obj; -} - static void sim_topo( config_t * config ) { fd_topo_cpus_t cpus[1]; @@ -106,95 +45,24 @@ sim_topo( config_t * config ) { static_end_idx, }; - /**********************************************************************/ - /* Add the metric tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "metric" ); fd_topob_wksp( topo, "metric_in" ); - fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); - if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) - FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); - metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + fd_topob_tile( topo, "metric", 
"metric", "metric_in", metric_cpu_idx, 0, 0 ); - /**********************************************************************/ - /* Add the playback tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "playback" ); - fd_topo_tile_t * playback_tile = fd_topob_tile( topo, "arch_p", "playback", "metric_in", playback_cpu_idx, 0, 0 ); - strncpy( playback_tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); - if( FD_UNLIKELY( 0==strlen( playback_tile->archiver.archiver_path ) ) ) { - FD_LOG_ERR(( "Archive file not found for playback" )); - } else { - FD_LOG_NOTICE(( "Found archive file from config: %s", playback_tile->archiver.archiver_path )); - } + fd_topob_tile( topo, "arch_p", "playback", "metric_in", playback_cpu_idx, 0, 0 ); - /**********************************************************************/ - /* Add the storei tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "storei" ); fd_topo_tile_t * storei_tile = fd_topob_tile( topo, "storei", "storei", "metric_in", storei_cpu_idx, 0, 0 ); - strncpy( storei_tile->store_int.blockstore_file, config->firedancer.blockstore.file, sizeof(storei_tile->store_int.blockstore_file) ); - strncpy( storei_tile->store_int.blockstore_restore, config->firedancer.blockstore.restore, sizeof(storei_tile->store_int.blockstore_restore) ); - strncpy( storei_tile->store_int.identity_key_path, config->paths.identity_key, sizeof(storei_tile->store_int.identity_key_path) ); - strncpy( storei_tile->store_int.slots_pending, config->tiles.store_int.slots_pending, sizeof( storei_tile->store_int.slots_pending ) ); - strncpy( storei_tile->store_int.shred_cap_archive, config->tiles.store_int.shred_cap_archive, sizeof(storei_tile->store_int.shred_cap_archive) ); - strncpy( storei_tile->store_int.shred_cap_replay, config->tiles.store_int.shred_cap_replay, sizeof(storei_tile->store_int.shred_cap_replay) ); - 
storei_tile->store_int.shred_cap_end_slot = config->tiles.store_int.shred_cap_end_slot; - storei_tile->store_int.expected_shred_version = config->consensus.expected_shred_version; - /**********************************************************************/ - /* Add the replay tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "replay" ); fd_topo_tile_t * replay_tile = fd_topob_tile( topo, "replay", "replay", "metric_in", replay_cpu_idx, 0, 0 ); - replay_tile->replay.fec_max = config->tiles.shred.max_pending_shred_sets; - replay_tile->replay.max_vote_accounts = config->firedancer.runtime.limits.max_vote_accounts; - - /* specified by [tiles.replay] */ - strncpy( replay_tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(replay_tile->replay.blockstore_file) ); - strncpy( replay_tile->replay.blockstore_checkpt, config->firedancer.blockstore.checkpt, sizeof(replay_tile->replay.blockstore_checkpt) ); - - replay_tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; - strncpy( replay_tile->replay.capture, config->tiles.replay.capture, sizeof(replay_tile->replay.capture) ); - strncpy( replay_tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(replay_tile->replay.funk_checkpt) ); - replay_tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - replay_tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - replay_tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( replay_tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(replay_tile->replay.funk_file) ); - replay_tile->replay.plugins_enabled = 0; - - if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) - && !strncmp( config->tiles.replay.snapshot, "", 1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); - } - strncpy( replay_tile->replay.genesis, config->tiles.replay.genesis, 
sizeof(replay_tile->replay.genesis) ); - - strncpy( replay_tile->replay.incremental, config->tiles.replay.incremental, sizeof(replay_tile->replay.incremental) ); - strncpy( replay_tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(replay_tile->replay.slots_replayed) ); - strncpy( replay_tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(replay_tile->replay.snapshot) ); - strncpy( replay_tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(replay_tile->replay.status_cache) ); - - strncpy( replay_tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(replay_tile->replay.cluster_version) ); - replay_tile->replay.bank_tile_count = config->layout.bank_tile_count; - replay_tile->replay.exec_tile_count = config->firedancer.layout.exec_tile_count; - strncpy( replay_tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(replay_tile->replay.tower_checkpt) ); - - /* not specified by [tiles.replay] */ - strncpy( replay_tile->replay.identity_key_path, config->paths.identity_key, sizeof(replay_tile->replay.identity_key_path) ); - replay_tile->replay.ip_addr = config->net.ip_addr; - replay_tile->replay.vote = config->firedancer.consensus.vote; - strncpy( replay_tile->replay.vote_account_path, config->paths.vote_account, sizeof(replay_tile->replay.vote_account_path) ); - replay_tile->replay.full_interval = config->tiles.batch.full_interval; - replay_tile->replay.incremental_interval = config->tiles.batch.incremental_interval; #define FOR(cnt) for( ulong i=0UL; ifiredancer.layout.exec_tile_count; - FOR(exec_tile_cnt) fd_topob_tile( topo, "exec", "exec", "metric_in", static_end_idx+i, 0, 0 ); + ulong exec_tile_cnt = config->firedancer.layout.exec_tile_count; + FOR(exec_tile_cnt) fd_topob_tile( topo, "exec", "exec", "metric_in", static_end_idx+i, 0, 0 ); /**********************************************************************/ /* Setup playback<->storei and storei<->replay links in topo */ @@ -307,6 
+175,28 @@ sim_topo( config_t * config ) { FD_TEST( fd_pod_insertf_ulong( topo->props, exec_fseq_obj->id, "exec_fseq.%lu", i ) ); } + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !strcmp( tile->name, "arch_p" ) ) { + strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); + if( FD_UNLIKELY( 0==strlen( tile->archiver.archiver_path ) ) ) { + FD_LOG_ERR(( "Archive file not found for playback" )); + } else { + FD_LOG_NOTICE(( "Found archive file from config: %s", tile->archiver.archiver_path )); + } + } else if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + + /* Override */ + if( !strcmp( tile->name, "replay" ) ) { + strncpy( tile->replay.incremental, config->tiles.replay.incremental, sizeof(tile->replay.incremental) ); + strncpy( tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(tile->replay.slots_replayed) ); + strncpy( tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(tile->replay.snapshot) ); + strncpy( tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(tile->replay.status_cache) ); + } + } + /**********************************************************************/ /* Finish and print out the topo information */ /**********************************************************************/ diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c new file mode 100644 index 0000000000..bc2063f117 --- /dev/null +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -0,0 +1,361 @@ +#include "../../firedancer/topology.h" +#include "../../shared/commands/configure/configure.h" +#include "../../shared/commands/run/run.h" +#include "../../../disco/metrics/fd_metrics.h" +#include "../../../disco/topo/fd_topob.h" +#include "../../../disco/topo/fd_pod_format.h" +#include "../../../util/tile/fd_tile_private.h" +#include 
"../../../flamenco/snapshot/fd_snapshot_loader.h" +#include +#include +#include +#include + +#define NAME "snapshot-load" + +extern fd_topo_obj_callbacks_t * CALLBACKS[]; + +fd_topo_run_tile_t +fdctl_tile_run( fd_topo_tile_t const * tile ); + +/* _is_zstd returns 1 if given file handle points to the beginning of a + zstd stream, otherwise zero. */ + +static int +_is_zstd( char const * path ) { + FILE * file = fopen( path, "r" ); + FD_TEST( file ); + uint magic; + ulong n = fread( &magic, 1UL, 4UL, file ); + if( FD_UNLIKELY( feof( file ) ) ) { + clearerr( file ); + fseek( file, -(long)n, SEEK_CUR ); + fclose( file ); + return 0; + } + int err = ferror( file ); + if( FD_UNLIKELY( err ) ) + FD_LOG_ERR(( "fread() failed (%d-%s)", err, strerror( err ) )); + fseek( file, -4L, SEEK_CUR ); + fclose( file ); + return ( magic==0xFD2FB528UL ); +} + +static void +snapshot_load_topo( config_t * config, + args_t const * args ) { + fd_snapshot_src_t src[1]; + char snapshot_path_copy[4096]; + memcpy( snapshot_path_copy, args->snapshot_load.snapshot_path, sizeof(snapshot_path_copy) ); + fd_snapshot_src_parse_type_unknown( src, snapshot_path_copy ); + + fd_topo_t * topo = &config->topo; + fd_topob_new( &config->topo, config->name ); + topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size ); + + fd_topob_wksp( topo, "funk" ); + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); + + static ushort tile_to_cpu[ FD_TILE_MAX ] = {0}; + if( args->tile_cpus[0] ) { + ulong cpu_cnt = fd_tile_private_cpus_parse( args->tile_cpus, tile_to_cpu ); + if( FD_UNLIKELY( cpu_cnt<6UL ) ) FD_LOG_ERR(( "--tile-cpus specifies %lu CPUs, but need at least 6", cpu_cnt )); + } + + fd_topob_wksp( topo, "metric_in" ); + fd_topob_wksp( topo, "metric" ); + fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); + + /* 
Uncompressed data stream */ + fd_topob_wksp( topo, "snap_stream" ); + fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); + fd_topo_obj_t * snapin_dcache = fd_topob_obj( topo, "dcache", "snap_stream" ); + snapin_link->dcache_obj_id = snapin_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); + + if( src->type==FD_SNAPSHOT_SRC_FILE ) { + + int is_zstd = _is_zstd( args->snapshot_load.snapshot_path ); + + /* read() tile */ + fd_topob_wksp( topo, "FileRd" ); + fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); + fd_memcpy( filerd_tile->filerd.file_path, args->snapshot_load.snapshot_path, PATH_MAX ); + FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); + FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); + + if( is_zstd ) { /* .tar.zst file */ + + /* "unzstd": Zstandard decompress tile */ + fd_topob_wksp( topo, "Unzstd" ); + fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); + (void)unzstd_tile; + + /* Compressed data stream */ + fd_topob_wksp( topo, "snap_zstd" ); + fd_topo_link_t * zstd_link = fd_topob_link( topo, "snap_zstd", "snap_zstd", 512UL, 0UL, 0UL ); + fd_topo_obj_t * zstd_dcache = fd_topob_obj( topo, "dcache", "snap_zstd"); + zstd_link->dcache_obj_id = zstd_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", zstd_dcache->id ) ); + + /* filerd tile -> compressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_zstd", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + /* compressed stream -> unzstd tile */ + fd_topob_tile_in( topo, "Unzstd", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, unzstd_tile, zstd_dcache, 
FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* unzstd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } else { /* .tar file */ + + /* filerd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } + } + else if ( src->type==FD_SNAPSHOT_SRC_HTTP ) { + + /* httpdl() tile */ + fd_topob_wksp( topo, "HttpDl" ); + fd_topo_tile_t * httpdl_tile = fd_topob_tile( topo, "HttpDl", "HttpDl", "HttpDl", tile_to_cpu[1], 0, 0 ); + fd_memcpy( httpdl_tile->httpdl.path, src->http.path, PATH_MAX ); + fd_memcpy( httpdl_tile->httpdl.snapshot_dir, args->snapshot_load.snapshot_dir, PATH_MAX ); + fd_memcpy( httpdl_tile->httpdl.dest, src->http.dest, sizeof(src->http.dest) ); + httpdl_tile->httpdl.ip4 = src->http.ip4; + httpdl_tile->httpdl.path_len = src->http.path_len; + httpdl_tile->httpdl.port = src->http.port; + + /* "unzstd": Zstandard decompress tile */ + fd_topob_wksp( topo, "Unzstd" ); + fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); + (void)unzstd_tile; + + /* Compressed data stream */ + fd_topob_wksp( topo, "snap_zstd" ); + fd_topo_link_t * zstd_link = fd_topob_link( topo, "snap_zstd", "snap_zstd", 512UL, 0UL, 0UL ); + fd_topo_obj_t * zstd_dcache = fd_topob_obj( topo, "dcache", "snap_zstd"); + zstd_link->dcache_obj_id = zstd_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", zstd_dcache->id ) ); + + /* httpdl tile -> compressed stream (HttpDl publishes into snap_zstd, so it maps that link's dcache read-write, like FileRd on the .tar.zst path) */ + fd_topob_tile_out( topo, "HttpDl", 0UL, "snap_zstd", 0UL ); + fd_topob_tile_uses( topo, httpdl_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + /* compressed stream -> unzstd tile */ + fd_topob_tile_in( topo, "Unzstd", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE,
FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, unzstd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* unzstd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + } + + /* "SnapIn": Snapshot parser tile */ + fd_topob_wksp( topo, "SnapIn" ); + fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[3], 0, 0 ); + snapin_tile->snapin.scratch_sz = (3UL<<30); + + /* uncompressed stream -> snapin tile */ + fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* snapin tile -> account frags */ + fd_topob_wksp( topo, "snap_frags" ); + fd_topo_link_t * snap_frags_link = fd_topob_link( topo, "snap_frags", "snap_frags", 512UL, 0UL, 0UL ); + snap_frags_link->dcache_obj_id = snapin_dcache->id; + fd_topob_tile_out( topo, "SnapIn", 0UL, "snap_frags", 0UL ); + + /* "ActAlc": Account allocator tile */ + fd_topob_wksp( topo, "ActAlc" ); + fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[4], 0, 0 ); + fd_topob_tile_uses( topo, actalc_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + actalc_tile->actalc.funk_obj_id = funk_obj->id; + + /* account frags -> actalc tile */ + fd_topob_tile_in( topo, "ActAlc", 0UL, "metric_in", "snap_frags", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* actalc tile -> record pointers */ + fd_topob_wksp( topo, "snap_descs" ); + fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_no_consumers = 1; + fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); + + /* "ActIdx": Account indexer tile */ + fd_topob_wksp( topo, "ActIdx" ); + fd_topo_tile_t * actidx_tile = 
fd_topob_tile( topo, "ActIdx", "ActIdx", "ActIdx", tile_to_cpu[5], 0, 0 ); + actidx_tile->actidx.funk_obj_id = funk_obj->id; + + /* record pointers -> actidx tile */ + fd_topob_tile_in( topo, "ActIdx", 0UL, "metric_in", "snap_descs", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + + for( ulong i=0UL; i<topo->tile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + fd_topo_configure_tile( tile, config ); + } + + if( !args->tile_cpus[0] ) { + fd_topob_auto_layout( topo, 0 ); + } + fd_topob_finish( topo, CALLBACKS ); + fd_topo_print_log( /* stdout */ 1, topo ); +} + +static void +snapshot_load_cmd_args( int * pargc, + char *** pargv, + args_t * args ) { + char const * tile_cpus = fd_env_strip_cmdline_cstr( pargc, pargv, "--tile-cpus", "FD_TILE_CPUS", NULL ); + char const * snapshot_src = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot", NULL, NULL ); + char const * snapshot_dir = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot-dir", NULL, NULL ); + + if( tile_cpus ) { + ulong tile_cpus_strlen = strlen( tile_cpus ); + if( FD_UNLIKELY( tile_cpus_strlen>=sizeof(args->tile_cpus) ) ) FD_LOG_ERR(( "--tile-cpus: flag too long" )); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->tile_cpus ), tile_cpus, tile_cpus_strlen ) ); + } + + if( FD_UNLIKELY( !snapshot_src ) ) FD_LOG_ERR(( "Missing --snapshot flag" )); + ulong snapshot_file_strlen = strlen( snapshot_src ); + if( FD_UNLIKELY( snapshot_file_strlen>=sizeof(args->snapshot_load.snapshot_path) ) ) FD_LOG_ERR(( "--snapshot: path too long" )); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->snapshot_load.snapshot_path ), snapshot_src, snapshot_file_strlen ) ); + + /* FIXME: check if we need the snapshot dir argument (parse the snapshot input src to see if it's http)*/ + if( snapshot_dir!=NULL ) { + ulong snapshot_dir_strlen = strlen( snapshot_dir ); /* bound-check the dir string itself, not the --snapshot path length */ + if( FD_UNLIKELY( snapshot_dir_strlen>=sizeof(args->snapshot_load.snapshot_dir) ) ) FD_LOG_ERR(( "--snapshot-dir: dir too long" )); + fd_cstr_fini( 
fd_cstr_append_text( fd_cstr_init( args->snapshot_load.snapshot_dir ), snapshot_dir, snapshot_dir_strlen ) ); + } +} + +static void +snapshot_load_cmd_perm( args_t * args, + fd_cap_chk_t * chk, + config_t const * config ) { + (void)args; + ulong mlock_limit = fd_topo_mlock_max_tile( &config->topo ); + fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_MEMLOCK, mlock_limit, "call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`" ); + fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NICE, 40, "call `setpriority(2)` to increase thread priorities" ); +} + +static void +snapshot_load_cmd_fn( args_t * args, + config_t * config ) { + snapshot_load_topo( config, args ); + fd_topo_t * topo = &config->topo; + + configure_stage( &fd_cfg_stage_hugetlbfs, CONFIGURE_CMD_INIT, config ); + initialize_workspaces( config ); + initialize_stacks( config ); + fd_topo_join_workspaces( topo, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_fill( topo ); + double tick_per_ns = fd_tempo_tick_per_ns( NULL ); + double ns_per_tick = 1.0/tick_per_ns; + fd_topo_run_single_process( topo, 2, config->uid, config->gid, fdctl_tile_run, NULL ); + + ulong httpdl_tile_idx = fd_topo_find_tile( topo, "HttpDl", 0UL ); + ulong filerd_tile_idx = fd_topo_find_tile( topo, "FileRd", 0UL ); + fd_topo_tile_t * http_dl_tile = httpdl_tile_idx!=ULONG_MAX ? &topo->tiles[ httpdl_tile_idx ] : NULL; + fd_topo_tile_t * file_rd_tile = filerd_tile_idx!=ULONG_MAX ? &topo->tiles[ filerd_tile_idx ] : NULL; + fd_topo_tile_t * const snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; + ulong const zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); + fd_topo_tile_t * const unzstd_tile = zstd_tile_idx!=ULONG_MAX ? 
&topo->tiles[ zstd_tile_idx ] : NULL; + fd_topo_tile_t * const actalc_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActAlc", 0UL ) ]; + fd_topo_tile_t * const actidx_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActIdx", 0UL ) ]; + + ulong * const snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; + ulong * const snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); + ulong volatile * file_rd_metrics = file_rd_tile ? fd_metrics_tile( file_rd_tile->metrics ) : NULL; + ulong volatile * http_dl_metrics = http_dl_tile ? fd_metrics_tile( http_dl_tile->metrics ) : NULL; + ulong volatile * const snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); + ulong volatile * const unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; + ulong volatile * const actalc_metrics = fd_metrics_tile( actalc_tile->metrics ); + ulong volatile * const actidx_metrics = fd_metrics_tile( actidx_tile->metrics ); + + ulong goff_old = 0UL; + ulong file_rd_backp_old = 0UL; + ulong file_rd_wait_old = 0UL; + ulong snap_in_backp_old = 0UL; + ulong snap_in_wait_old = 0UL; + ulong actalc_backp_old = 0UL; + ulong actalc_wait_old = 0UL; + ulong actidx_wait_old = 0UL; + ulong acc_cnt_old = 0UL; + sleep( 1 ); + puts( "" ); + puts( "Columns:" ); + puts( "- bw: Uncompressed bandwidth" ); + puts( "- backp: Backpressured by downstream tile" ); + puts( "- stall: Waiting on upstream tile" ); + puts( "- acc: Number of accounts" ); + puts( "" ); + puts( "-------------backp=(file,snap,alc ) busy=(file,snap,alc ,idx )---------------" ); + for(;;) { + ulong filerd_status = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; + ulong httpdl_status = http_dl_metrics ? FD_VOLATILE_CONST( http_dl_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; + ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); + ulong unzstd_status = unzstd_in_metrics ? 
FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; + if( FD_UNLIKELY( httpdl_status==2UL && filerd_status==2UL && unzstd_status==2UL && snapin_status == 2UL ) ) { + FD_LOG_NOTICE(( "Done" )); + break; + } + + ulong goff = FD_VOLATILE_CONST( snap_in_fseq[ 1 ] ); + ulong file_rd_backp = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ) : + http_dl_metrics ? FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ) : 0UL; + ulong file_rd_wait = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + file_rd_backp : + http_dl_metrics ? FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + file_rd_backp :0UL; + ulong snap_in_backp = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + snap_in_backp; + ulong actalc_backp = FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong actalc_wait = FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + actalc_backp; + ulong actidx_backp = FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong 
actidx_wait = FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + actidx_backp; + ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); + printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", + (double)( goff-goff_old )/1e9, + ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, + ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, + ( (double)( actalc_backp -actalc_backp_old )*ns_per_tick )/1e7, + 100-( ( (double)( file_rd_wait -file_rd_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( actidx_wait -actidx_wait_old )*ns_per_tick )/1e7 ), + (double)( acc_cnt -acc_cnt_old ) ); + fflush( stdout ); + goff_old = goff; + file_rd_backp_old = file_rd_backp; + file_rd_wait_old = file_rd_wait; + snap_in_backp_old = snap_in_backp; + snap_in_wait_old = snap_in_wait; + actalc_backp_old = actalc_backp; + actalc_wait_old = actalc_wait; + actidx_wait_old = actidx_wait; + acc_cnt_old = acc_cnt; + sleep( 1 ); + } + + FD_LOG_NOTICE(( "Loaded %g accounts", (double)FD_VOLATILE_CONST( snap_accs_sync[1] ) )); +} + +action_t fd_action_snapshot_load = { + .name = NAME, + .args = snapshot_load_cmd_args, + .perm = snapshot_load_cmd_perm, + .fn = snapshot_load_cmd_fn +}; diff --git a/src/app/firedancer-dev/config/default.toml b/src/app/firedancer-dev/config/default.toml index 0d09aa34ad..4c6c104034 100644 --- a/src/app/firedancer-dev/config/default.toml +++ b/src/app/firedancer-dev/config/default.toml @@ -15,6 +15,10 @@ idx_max = 8192 alloc_max = 10737418240 file = "{blockstore_path}" +[funk] + heap_size_gib = 140 + max_account_records = 150000000 + max_database_transactions = 2000 [tiles] [tiles.shred] 
max_pending_shred_sets = 16384 @@ -24,10 +28,6 @@ [tiles.replay] snapshot_url = "http://{validator_ip}:8899/snapshot.tar.bz2" incremental_url = "http://{validator_ip}:8899/incremental-snapshot.tar.bz2" - funk_sz_gb = 140 - funk_rec_max = 150000000 - funk_txn_max = 2000 - funk_file = "{funk_path}" [tiles.metric] prometheus_listen_address = "0.0.0.0" prometheus_listen_port = 7999 diff --git a/src/app/firedancer-dev/config/private.toml b/src/app/firedancer-dev/config/private.toml index aaa5f084c5..fea3201980 100644 --- a/src/app/firedancer-dev/config/private.toml +++ b/src/app/firedancer-dev/config/private.toml @@ -10,6 +10,10 @@ idx_max = 8192 alloc_max = 10737418240 file = "{blockstore_path}" +[funk] + heap_size_gib = 20 + max_account_records = 1048576 + max_database_transactions = 4096 [tiles] [tiles.shred] max_pending_shred_sets = 16384 @@ -18,10 +22,6 @@ repair_serve_listen_port = 8034 [tiles.replay] snapshot_url = "http://{validator_ip}:8899/snapshot.tar.bz2" - funk_sz_gb = 20 - funk_txn_max = 4096 - funk_rec_max = 1048576 - funk_file = "{funk_path}" [tiles.metric] prometheus_listen_address = "0.0.0.0" prometheus_listen_port = 7999 diff --git a/src/app/firedancer-dev/config/tiny.toml b/src/app/firedancer-dev/config/tiny.toml index 6e086482a2..55e0ce3a91 100644 --- a/src/app/firedancer-dev/config/tiny.toml +++ b/src/app/firedancer-dev/config/tiny.toml @@ -1,6 +1,11 @@ [hugetlbfs] max_page_size = "huge" +[funk] +max_account_records = 1048576 +heap_size_gib = 20 +max_database_transactions = 1024 + [runtime] heap_size_gib = 4 @@ -17,11 +22,6 @@ writer_tile_count = 1 [tiles.restart] enabled = false -[tiles.replay] -funk_sz_gb = 20 -funk_txn_max = 1024 -funk_rec_max = 1048576 - [tiles.shred] max_pending_shred_sets = 512 diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 6cfffd2f2b..2e4732223d 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -22,6 +22,7 @@ extern fd_topo_obj_callbacks_t 
fd_obj_cb_runtime_pub; extern fd_topo_obj_callbacks_t fd_obj_cb_blockstore; extern fd_topo_obj_callbacks_t fd_obj_cb_txncache; extern fd_topo_obj_callbacks_t fd_obj_cb_exec_spad; +extern fd_topo_obj_callbacks_t fd_obj_cb_funk; fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_mcache, @@ -39,6 +40,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_blockstore, &fd_obj_cb_txncache, &fd_obj_cb_exec_spad, + &fd_obj_cb_funk, NULL, }; @@ -97,9 +99,13 @@ extern fd_topo_run_tile_t fd_tile_archiver_writer; extern fd_topo_run_tile_t fd_tile_archiver_playback; extern fd_topo_run_tile_t fd_tile_archiver_backtest; -extern fd_topo_run_tile_t fd_tile_bencho; -extern fd_topo_run_tile_t fd_tile_benchg; -extern fd_topo_run_tile_t fd_tile_benchs; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActIdx; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -135,9 +141,13 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_archiver_writer, &fd_tile_archiver_playback, &fd_tile_archiver_backtest, - &fd_tile_bencho, - &fd_tile_benchg, - &fd_tile_benchs, + &fd_tile_snapshot_restore_FileRd, + &fd_tile_snapshot_restore_Unzstd, + &fd_tile_snapshot_restore_SnapIn, + &fd_tile_snapshot_restore_ActAlc, + &fd_tile_snapshot_restore_ActIdx, + &fd_tile_snapshot_restore_Unzstd, + &fd_tile_snapshot_restore_HttpDl, NULL, }; @@ -159,6 +169,7 @@ extern action_t fd_action_help; extern action_t fd_action_load; extern action_t fd_action_pktgen; extern action_t fd_action_quic_trace; +extern action_t fd_action_snapshot_load; extern action_t fd_action_txn; extern action_t fd_action_wksp; extern action_t fd_action_gossip; @@ -183,6 +194,7 @@ action_t * 
ACTIONS[] = { &fd_action_flame, &fd_action_load, &fd_action_pktgen, + &fd_action_snapshot_load, &fd_action_quic_trace, &fd_action_txn, &fd_action_wksp, diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 5380b39db2..1c9bd20b93 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -293,6 +293,33 @@ user = "" snapshot_grace_period_seconds = 409 max_vote_accounts = 2000000 +# This section configures the "funk" account database. Currently, funk +# stores all Solana accounts. In future versions of Firedancer, most +# accounts will be offloaded to the "groove" database. +[funk] + # The max amount of records that the funk instance can store. + # Each Solana account uses at least one record. Additional records + # are used for account changes that are not yet finalized by + # consensus (typically takes 13 seconds). + max_account_records = 10_000_000 + + # The size of the funk heap in gibibytes. This value must be large + # enough to store all Solana accounts uncompressed. + heap_size_gib = 32 + + # The max amount of concurrent database transactions. These are + # used to track conflicting versions of accounts until such + # conflicts are resolved by the consensus algorithm. (Not to be + # confused with Solana transactions). + # The validator uses one database transaction for each Solana block + # that is not yet finalized. It is not recommended to change this + # setting. + max_database_transactions = 1024 + +# This section configures the "groove" persistent account database. +# [groove] +# ... + # CPU cores in Firedancer are carefully managed. Where a typical # program lets the operating system scheduler determine which threads to # run on which cores and for how long, Firedancer overrides most of this @@ -1030,9 +1057,6 @@ user = "" # snapshots and frequent validator restarts are expected. 
snapshot_dir = "" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = "1.18.0" # The metric tile receives metrics updates published from the rest diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index 3ef95ce793..991093077c 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -21,6 +21,7 @@ extern fd_topo_obj_callbacks_t fd_obj_cb_runtime_pub; extern fd_topo_obj_callbacks_t fd_obj_cb_blockstore; extern fd_topo_obj_callbacks_t fd_obj_cb_txncache; extern fd_topo_obj_callbacks_t fd_obj_cb_exec_spad; +extern fd_topo_obj_callbacks_t fd_obj_cb_funk; fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_mcache, @@ -38,6 +39,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_blockstore, &fd_obj_cb_txncache, &fd_obj_cb_exec_spad, + &fd_obj_cb_funk, NULL, }; diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 39c4acfe5b..ea3ead783a 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -1,4 +1,4 @@ -#include "../shared/fd_config.h" +#include "topology.h" #include "../../discof/geyser/fd_replay_notif.h" #include "../../disco/net/fd_net_tile.h" @@ -8,8 +8,6 @@ #include "../../disco/topo/fd_cpu_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../flamenco/runtime/fd_blockstore.h" -#include "../../flamenco/runtime/fd_runtime.h" -#include "../../flamenco/runtime/fd_runtime_public.h" #include "../../flamenco/runtime/fd_txncache.h" #include "../../flamenco/snapshot/fd_snapshot_base.h" #include "../../util/tile/fd_tile_private.h" @@ -21,14 +19,14 @@ extern fd_topo_obj_callbacks_t * CALLBACKS[]; -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - ulong alloc_max ) { + char const * wksp_name, + ulong shred_max, + ulong block_max, + ulong idx_max, + ulong txn_max, + ulong alloc_max ) { fd_topo_obj_t * obj = 
fd_topob_obj( topo, "blockstore", wksp_name ); ulong seed; @@ -50,7 +48,7 @@ setup_topo_blockstore( fd_topo_t * topo, return obj; } -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_runtime_pub( fd_topo_t * topo, char const * wksp_name, ulong mem_max ) { @@ -60,13 +58,13 @@ setup_topo_runtime_pub( fd_topo_t * topo, return obj; } -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_txncache( fd_topo_t * topo, - char const * wksp_name, - ulong max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { + char const * wksp_name, + ulong max_rooted_slots, + ulong max_live_slots, + ulong max_txn_per_slot, + ulong max_constipated_slots ) { fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); @@ -77,6 +75,31 @@ setup_topo_txncache( fd_topo_t * topo, return obj; } +fd_topo_obj_t * +setup_topo_funk( fd_topo_t * topo, + char const * wksp_name, + ulong max_account_records, + ulong max_database_transactions, + ulong heap_size_gib ) { + fd_topo_obj_t * obj = fd_topob_obj( topo, "funk", wksp_name ); + FD_TEST( fd_pod_insert_ulong( topo->props, "funk", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_account_records, "obj.%lu.rec_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_database_transactions, "obj.%lu.txn_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, heap_size_gib<<30, "obj.%lu.heap_max", obj->id ) ); + ulong funk_footprint = fd_funk_footprint( max_database_transactions, max_account_records ); + if( FD_UNLIKELY( !funk_footprint ) ) FD_LOG_ERR(( "Invalid [funk] parameters" )); + + /* Increase workspace partition count */ + ulong wksp_idx = fd_topo_find_wksp( topo, wksp_name ); + FD_TEST( wksp_idx!=ULONG_MAX ); + fd_topo_wksp_t * wksp = &topo->workspaces[ wksp_idx ]; + ulong part_max = fd_wksp_part_max_est( funk_footprint, 1U<<14U ); + if( FD_UNLIKELY( !part_max ) 
) FD_LOG_ERR(( "fd_wksp_part_max_est(%lu,16KiB) failed", funk_footprint )); + wksp->part_max += part_max; + + return obj; +} + static int resolve_gossip_entrypoint( char const * host_port, fd_ip4_port_t * ip4_port ) { @@ -310,6 +333,7 @@ fd_topo_initialize( config_t * config ) { fd_topob_wksp( topo, "exec_spad" ); fd_topob_wksp( topo, "exec_fseq" ); fd_topob_wksp( topo, "writer_fseq" ); + fd_topob_wksp( topo, "funk" ); if( enable_rpc ) fd_topob_wksp( topo, "rpcsrv" ); @@ -449,11 +473,26 @@ fd_topo_initialize( config_t * config ) { FOR(writer_tile_cnt) fd_topob_tile( topo, "writer", "writer", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); fd_topo_tile_t * batch_tile = fd_topob_tile( topo, "batch", "batch", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); - if( enable_rstart ) /* */ fd_topob_tile( topo, "rstart", "restart", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); + fd_topo_tile_t * rstart_tile = NULL; + if( enable_rstart ) rstart_tile =fd_topob_tile( topo, "rstart", "restart", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); fd_topo_tile_t * rpcserv_tile = NULL; if( enable_rpc ) rpcserv_tile = fd_topob_tile( topo, "rpcsrv", "rpcsrv", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); + /* Database cache */ + + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); + + /* */ fd_topob_tile_uses( topo, batch_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + FOR(exec_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "exec", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + /* */ fd_topob_tile_uses( topo, replay_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + if(rstart_tile) fd_topob_tile_uses( topo, rstart_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + if(rpcserv_tile) fd_topob_tile_uses( topo, rpcserv_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + 
FOR(writer_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "writer", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + /* Create a shared blockstore to be used by store and replay. */ fd_topo_obj_t * blockstore_obj = setup_topo_blockstore( topo, "bstore", @@ -778,7 +817,40 @@ fd_topo_initialize( config_t * config ) { for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + } + + if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); + + fd_topob_finish( topo, CALLBACKS ); + FD_TEST( blockstore_obj->id ); + + const char * status_cache = config->tiles.replay.status_cache; + if ( strlen( status_cache ) > 0 ) { + /* Make the status cache workspace match the parameters used to create the + checkpoint. This is a bit nonintuitive because of the way + fd_topo_create_workspace works. */ + fd_wksp_preview_t preview[1]; + int err = fd_wksp_preview( status_cache, preview ); + if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "unable to preview %s: error %d", status_cache, err )); + fd_topo_wksp_t * wksp = &topo->workspaces[ topo->objs[ txncache_obj->id ].wksp_id ]; + wksp->part_max = preview->part_max; + wksp->known_footprint = 0; + wksp->total_footprint = preview->data_max; + ulong page_sz = FD_SHMEM_GIGANTIC_PAGE_SZ; + wksp->page_sz = page_sz; + ulong footprint = fd_wksp_footprint( preview->part_max, preview->data_max ); + wksp->page_cnt = footprint / page_sz; + } + + config->topo = *topo; +} +int +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config ) { if( FD_UNLIKELY( !strcmp( tile->name, "net" ) || !strcmp( tile->name, "sock" ) ) ) { tile->net.shred_listen_port = config->tiles.shred.shred_listen_port; @@ -813,7 +885,7 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "shred" ) ) ) { strncpy( tile->shred.identity_key_path, 
config->paths.identity_key, sizeof(tile->shred.identity_key_path) ); - tile->shred.depth = topo->links[ tile->out_link_id[ 0 ] ].depth; + tile->shred.depth = config->topo.links[ tile->out_link_id[ 0 ] ].depth; tile->shred.fec_resolver_depth = config->tiles.shred.max_pending_shred_sets; tile->shred.expected_shred_version = config->consensus.expected_shred_version; tile->shred.shred_listen_port = config->tiles.shred.shred_listen_port; @@ -846,7 +918,6 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "repair" ) ) ) { tile->repair.max_pending_shred_sets = config->tiles.shred.max_pending_shred_sets; - tile->repair.shred_tile_cnt = config->layout.shred_tile_count; tile->repair.repair_intake_listen_port = config->tiles.repair.repair_intake_listen_port; tile->repair.repair_serve_listen_port = config->tiles.repair.repair_serve_listen_port; strncpy( tile->repair.good_peer_cache_file, config->tiles.repair.good_peer_cache_file, sizeof(tile->repair.good_peer_cache_file) ); @@ -866,15 +937,12 @@ fd_topo_initialize( config_t * config ) { tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; strncpy( tile->replay.capture, config->tiles.replay.capture, sizeof(tile->replay.capture) ); strncpy( tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(tile->replay.funk_checkpt) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); - tile->replay.plugins_enabled = plugins_enabled; + tile->replay.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); + tile->replay.plugins_enabled = fd_topo_find_tile( &config->topo, "plugin", 0UL ) != ULONG_MAX; if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) && !strncmp( config->tiles.replay.snapshot, "", 
1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); + fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); } strncpy( tile->replay.genesis, config->tiles.replay.genesis, sizeof(tile->replay.genesis) ); @@ -883,9 +951,6 @@ fd_topo_initialize( config_t * config ) { strncpy( tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(tile->replay.slots_replayed) ); strncpy( tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(tile->replay.status_cache) ); strncpy( tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(tile->replay.cluster_version) ); - tile->replay.bank_tile_count = config->layout.bank_tile_count; - tile->replay.exec_tile_count = config->firedancer.layout.exec_tile_count; - tile->replay.writer_tile_cuont = config->firedancer.layout.writer_tile_count; strncpy( tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(tile->replay.tower_checkpt) ); /* not specified by [tiles.replay] */ @@ -897,6 +962,8 @@ fd_topo_initialize( config_t * config ) { tile->replay.full_interval = config->tiles.batch.full_interval; tile->replay.incremental_interval = config->tiles.batch.incremental_interval; + FD_TEST( tile->replay.funk_obj_id == fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ) ); + } else if( FD_UNLIKELY( !strcmp( tile->name, "sign" ) ) ) { strncpy( tile->sign.identity_key_path, config->paths.identity_key, sizeof(tile->sign.identity_key_path) ); @@ -924,19 +991,16 @@ fd_topo_initialize( config_t * config ) { strncpy( tile->eqvoc.identity_key_path, config->paths.identity_key, sizeof(tile->eqvoc.identity_key_path) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "rpcsrv" ) ) ) { strncpy( tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(tile->replay.blockstore_file) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - 
tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + tile->rpcserv.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); tile->rpcserv.rpc_port = config->rpc.port; tile->rpcserv.tpu_port = config->tiles.quic.regular_transaction_listen_port; tile->rpcserv.tpu_ip_addr = config->net.ip_addr; strncpy( tile->rpcserv.identity_key_path, config->paths.identity_key, sizeof(tile->rpcserv.identity_key_path) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "batch" ) ) ) { + tile->batch.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); tile->batch.full_interval = config->tiles.batch.full_interval; tile->batch.incremental_interval = config->tiles.batch.incremental_interval; strncpy( tile->batch.out_dir, config->tiles.batch.out_dir, sizeof(tile->batch.out_dir) ); - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "gui" ) ) ) { if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.gui.gui_listen_address, &tile->gui.listen_addr ) ) ) FD_LOG_ERR(( "failed to parse gui listen address `%s`", config->tiles.gui.gui_listen_address )); @@ -951,11 +1015,11 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "plugin" ) ) ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "exec" ) ) ) { - strncpy( tile->exec.funk_file, config->tiles.replay.funk_file, sizeof(tile->exec.funk_file) ); + tile->exec.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); } else if( FD_UNLIKELY( !strcmp( tile->name, "writer" ) ) ) { - strncpy( tile->writer.funk_file, config->tiles.replay.funk_file, sizeof(tile->writer.funk_file) ); + tile->writer.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); } else if( FD_UNLIKELY( !strcmp( 
tile->name, "rstart" ) ) ) { - strncpy( tile->restart.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + tile->restart.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); strncpy( tile->restart.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(tile->replay.tower_checkpt) ); strncpy( tile->restart.identity_key_path, config->paths.identity_key, sizeof(tile->restart.identity_key_path) ); fd_memcpy( tile->restart.genesis_hash, config->tiles.restart.genesis_hash, FD_BASE58_ENCODED_32_SZ ); @@ -963,34 +1027,9 @@ fd_topo_initialize( config_t * config ) { tile->restart.heap_mem_max = config->firedancer.runtime.heap_size_gib<<30; } else if( FD_UNLIKELY( !strcmp( tile->name, "arch_f" ) || !strcmp( tile->name, "arch_w" ) ) ) { - tile->archiver.enabled = config->tiles.archiver.enabled; strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, sizeof(tile->archiver.archiver_path) ); } else { - FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + return 0; } - } - - if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); - - fd_topob_finish( topo, CALLBACKS ); - - const char * status_cache = config->tiles.replay.status_cache; - if ( strlen( status_cache ) > 0 ) { - /* Make the status cache workspace match the parameters used to create the - checkpoint. This is a bit nonintuitive because of the way - fd_topo_create_workspace works. 
*/ - fd_wksp_preview_t preview[1]; - int err = fd_wksp_preview( status_cache, preview ); - if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "unable to preview %s: error %d", status_cache, err )); - fd_topo_wksp_t * wksp = &topo->workspaces[ topo->objs[ txncache_obj->id ].wksp_id ]; - wksp->part_max = preview->part_max; - wksp->known_footprint = 0; - wksp->total_footprint = preview->data_max; - ulong page_sz = FD_SHMEM_GIGANTIC_PAGE_SZ; - wksp->page_sz = page_sz; - ulong footprint = fd_wksp_footprint( preview->part_max, preview->data_max ); - wksp->page_cnt = footprint / page_sz; - } - - config->topo = *topo; + return 1; } diff --git a/src/app/firedancer/topology.h b/src/app/firedancer/topology.h index f998c9b847..73a2d827e9 100644 --- a/src/app/firedancer/topology.h +++ b/src/app/firedancer/topology.h @@ -1,12 +1,50 @@ #ifndef HEADER_fd_src_app_firedancer_topology_h #define HEADER_fd_src_app_firedancer_topology_h +/* topology.h contains APIs for constructing a Firedancer topology. */ + #include "../shared/fd_config.h" FD_PROTOTYPES_BEGIN +/* fd_topo_initialize constructs a full validator config according to + the given topology. Populates config->topo. 
*/ + void -fd_topo_initialize( config_t * config ); +fd_topo_initialize( fd_config_t * config ); + +fd_topo_obj_t * +setup_topo_blockstore( fd_topo_t * topo, + char const * wksp_name, + ulong shred_max, + ulong block_max, + ulong idx_max, + ulong txn_max, + ulong alloc_max ); + +fd_topo_obj_t * +setup_topo_runtime_pub( fd_topo_t * topo, + char const * wksp_name, + ulong mem_max ); + +fd_topo_obj_t * +setup_topo_txncache( fd_topo_t * topo, + char const * wksp_name, + ulong max_rooted_slots, + ulong max_live_slots, + ulong max_txn_per_slot, + ulong max_constipated_slots ); + +fd_topo_obj_t * +setup_topo_funk( fd_topo_t * topo, + char const * wksp_name, + ulong max_account_records, + ulong max_database_transactions, + ulong heap_size_gib ); + +int +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config ); FD_PROTOTYPES_END diff --git a/src/app/shared/boot/fd_boot.c b/src/app/shared/boot/fd_boot.c index 013db1efd9..4d2e7262ff 100644 --- a/src/app/shared/boot/fd_boot.c +++ b/src/app/shared/boot/fd_boot.c @@ -145,6 +145,7 @@ fd_main_init( int * pargc, if( FD_LIKELY( !gid && setegid( config->gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) )); if( FD_LIKELY( !uid && seteuid( config->uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + if( 0==strcmp( config->log.path, "-" ) ) config->log.path[0] = '\0'; int boot_silent = config_fd>=0; fd_log_private_boot_custom( log_lock, 0UL, diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h index d21f8b4461..c111ef0304 100644 --- a/src/app/shared/fd_action.h +++ b/src/app/shared/fd_action.h @@ -3,7 +3,9 @@ #include "fd_cap_chk.h" -union fdctl_args { +struct fdctl_args { + char tile_cpus[ 256UL ]; + struct { char tile_name[ 7UL ]; ulong kind_id; @@ -87,9 +89,14 @@ union fdctl_args { int event; int dump; /* whether the user requested --dump */ } quic_trace; + + struct { + char snapshot_path[ PATH_MAX ]; + char snapshot_dir[ PATH_MAX ]; + } 
snapshot_load; }; -typedef union fdctl_args args_t; +typedef struct fdctl_args args_t; struct fd_action { char const * name; diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index a658cebc95..c9f4bd5acd 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -119,6 +119,12 @@ struct fd_configf { } limits; } runtime; + struct { + ulong max_account_records; + ulong heap_size_gib; + ulong max_database_transactions; + } funk; + struct { uint exec_tile_count; /* TODO: redundant ish with bank tile cnt */ uint writer_tile_count; @@ -287,6 +293,10 @@ struct fd_config { char affinity[ AFFINITY_SZ ]; char fake_dst_ip[ 16 ]; } pktgen; + + struct { + char affinity[ AFFINITY_SZ ]; + } snapshot_load; } development; struct { @@ -366,10 +376,6 @@ struct fd_config { struct { char capture[ PATH_MAX ]; char funk_checkpt[ PATH_MAX ]; - uint funk_rec_max; - ulong funk_sz_gb; - ulong funk_txn_max; - char funk_file[ PATH_MAX ]; char genesis[ PATH_MAX ]; char incremental[ PATH_MAX ]; char incremental_url[ PATH_MAX ]; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index 9737796419..915dfe9ad2 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -332,6 +332,10 @@ fd_config_extract_podf( uchar * pod, CFG_POP ( ulong, runtime.limits.snapshot_grace_period_seconds ); CFG_POP ( ulong, runtime.limits.max_vote_accounts ); + CFG_POP ( ulong, funk.max_account_records ); + CFG_POP ( ulong, funk.heap_size_gib ); + CFG_POP ( ulong, funk.max_database_transactions ); + return config; } @@ -443,10 +447,6 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( cstr, tiles.replay.capture ); CFG_POP ( cstr, tiles.replay.funk_checkpt ); - CFG_POP ( uint, tiles.replay.funk_rec_max ); - CFG_POP ( ulong, tiles.replay.funk_sz_gb ); - CFG_POP ( ulong, tiles.replay.funk_txn_max ); - CFG_POP ( cstr, tiles.replay.funk_file ); CFG_POP ( cstr, tiles.replay.genesis ); CFG_POP ( cstr, tiles.replay.incremental ); CFG_POP 
( cstr, tiles.replay.incremental_url ); diff --git a/src/app/shared/fd_obj_callbacks.c b/src/app/shared/fd_obj_callbacks.c index 1e3aae7e2a..18fed6cad5 100644 --- a/src/app/shared/fd_obj_callbacks.c +++ b/src/app/shared/fd_obj_callbacks.c @@ -10,6 +10,7 @@ #include "../../waltz/neigh/fd_neigh4_map.h" #include "../../waltz/ip/fd_fib4.h" #include "../../disco/keyguard/fd_keyswitch.h" +#include "../../funk/fd_funk.h" #define VAL(name) (__extension__({ \ ulong __x = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.%s", obj->id, name ); \ @@ -276,6 +277,44 @@ fd_topo_obj_callbacks_t fd_obj_cb_keyswitch = { .new = keyswitch_new, }; +static ulong +funk_align( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; (void)obj; + return fd_funk_align(); +} + +static ulong +funk_footprint( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; + return fd_funk_footprint( VAL("txn_max"), VAL("rec_max") ); +} + +static ulong +funk_loose( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; + return VAL("heap_max"); +} + +static void +funk_new( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; + ulong funk_seed = fd_pod_queryf_ulong( topo->props, 0UL, "obj.%lu.seed", obj->id ); + if( !funk_seed ) FD_TEST( fd_rng_secure( &funk_seed, sizeof(ulong) ) ); + FD_TEST( fd_funk_new( fd_topo_obj_laddr( topo, obj->id ), 2UL, funk_seed, VAL("txn_max"), VAL("rec_max") ) ); +} + +fd_topo_obj_callbacks_t fd_obj_cb_funk = { + .name = "funk", + .footprint = funk_footprint, + .loose = funk_loose, + .align = funk_align, + .new = funk_new, +}; + fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); diff --git a/src/disco/stem/fd_stem.h b/src/disco/stem/fd_stem.h index 5ed398bd2b..9fd290a4a6 100644 --- a/src/disco/stem/fd_stem.h +++ b/src/disco/stem/fd_stem.h @@ -8,7 +8,7 @@ struct fd_stem_context { fd_frag_meta_t ** mcaches; ulong * seqs; - ulong * depths; + ulong const * depths; ulong * cr_avail; ulong 
cr_decrement_amount; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 6c91f3dbb0..6a693b14d8 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -89,7 +89,7 @@ typedef struct { All input links will be automatically polled by the tile infrastructure, and output links will automatically source and manage credits from consumers. */ -typedef struct { +struct fd_topo_tile { ulong id; /* The ID of this tile. Indexed from [0, tile_cnt). When placed in a topology, the ID must be the index of the tile in the tiles list. */ char name[ 7UL ]; /* The name of this tile. There can be multiple of each tile name in a topology. */ ulong kind_id; /* The ID of this tile within its name. If there are n tile of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */ @@ -272,12 +272,9 @@ typedef struct { ulong max_vote_accounts; int tx_metadata_storage; + ulong funk_obj_id; char capture[ PATH_MAX ]; char funk_checkpt[ PATH_MAX ]; - uint funk_rec_max; - ulong funk_sz_gb; - ulong funk_txn_max; - char funk_file[ PATH_MAX ]; char genesis[ PATH_MAX ]; char incremental[ PATH_MAX ]; char slots_replayed[ PATH_MAX ]; @@ -292,9 +289,6 @@ typedef struct { uint ip_addr; int vote; char vote_account_path[ PATH_MAX ]; - ulong bank_tile_count; - ulong exec_tile_count; - ulong writer_tile_cuont; ulong full_interval; ulong incremental_interval; @@ -313,7 +307,7 @@ typedef struct { struct { int in_wen_restart; int tower_checkpt_fileno; - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; char tower_checkpt[ PATH_MAX ]; char identity_key_path[ PATH_MAX ]; char genesis_hash[ FD_BASE58_ENCODED_32_SZ ]; @@ -322,11 +316,11 @@ typedef struct { } restart; struct { - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; } exec; struct { - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; } writer; struct { @@ -373,7 +367,6 @@ typedef struct { int good_peer_cache_file_fd; char identity_key_path[ PATH_MAX ]; ulong 
max_pending_shred_sets; - uint shred_tile_cnt; } repair; struct { @@ -406,6 +399,7 @@ typedef struct { } eqvoc; struct { + ulong funk_obj_id; ushort rpc_port; ushort tpu_port; uint tpu_ip_addr; @@ -413,6 +407,7 @@ typedef struct { } rpcserv; struct { + ulong funk_obj_id; ulong full_interval; ulong incremental_interval; char out_dir[ PATH_MAX ]; @@ -427,7 +422,6 @@ typedef struct { } pktgen; struct { - int enabled; ulong end_slot; char archiver_path[ PATH_MAX ]; @@ -435,8 +429,35 @@ typedef struct { int archive_fd; } archiver; + struct { + char file_path[ PATH_MAX ]; + } filerd; + + struct { + char dest[128]; + uint ip4; + ushort port; + char path[ PATH_MAX ]; + ulong path_len; + char snapshot_dir[ PATH_MAX ]; + } httpdl; + + struct { + ulong scratch_sz; + } snapin; + + struct { + ulong funk_obj_id; + } actalc; + + struct { + ulong funk_obj_id; + } actidx; + }; -} fd_topo_tile_t; +}; + +typedef struct fd_topo_tile fd_topo_tile_t; typedef struct { ulong id; @@ -515,10 +536,13 @@ fd_topo_workspace_align( void ) { return 4096UL; } -FD_FN_PURE static inline void * +static inline void * fd_topo_obj_laddr( fd_topo_t const * topo, ulong obj_id ) { fd_topo_obj_t const * obj = &topo->objs[ obj_id ]; + FD_TEST( obj_idid == obj_id ); + FD_TEST( obj->offset ); return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset); } @@ -645,6 +669,42 @@ fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo, return cnt; } +FD_FN_PURE static inline ulong +fd_topo_tile_consumer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + (void)topo; + return tile->out_cnt; +} + +FD_FN_PURE static inline ulong +fd_topo_tile_reliable_consumer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + ulong reliable_cons_cnt = 0UL; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j 
]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + reliable_cons_cnt++; + } + } + } + } + return reliable_cons_cnt; +} + +FD_FN_PURE static inline ulong +fd_topo_tile_producer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + (void)topo; + ulong in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + in_cnt++; + } + return in_cnt; +} + /* Join (map into the process) all shared memory (huge/gigantic pages) needed by the tile, in the given topology. All memory associated with the tile (aka. used by links that the tile either produces to or diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index 2ce07b9f9e..302af889c8 100644 --- a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -64,7 +64,7 @@ fd_topob_obj( fd_topo_t * topo, return obj; } -void +fd_topo_link_t * fd_topob_link( fd_topo_t * topo, char const * link_name, char const * wksp_name, @@ -100,6 +100,8 @@ fd_topob_link( fd_topo_t * topo, FD_TEST( fd_pod_insertf_ulong( topo->props, mtu, "obj.%lu.mtu", obj->id ) ); } topo->link_cnt++; + + return link; } void @@ -374,6 +376,8 @@ fd_topob_auto_layout( fd_topo_t * topo, "rpcsrv", /* FIREDANCER only */ "batch", /* FIREDANCER only */ "pktgen", + "FileRd", + "SnapIn", }; char const * CRITICAL_TILES[] = { @@ -515,13 +519,13 @@ initialize_numa_assignments( fd_topo_t * topo ) { int found_lazy = 0; for( ulong j=0UL; jtile_cnt; j++ ) { fd_topo_tile_t * tile = &topo->tiles[ j ]; - if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx!=ULONG_MAX ) ) { + if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idxworkspaces[ i ].numa_idx = fd_numa_node_idx( tile->cpu_idx ); FD_TEST( topo->workspaces[ i ].numa_idx!=ULONG_MAX ); found_strict = 1; found_lazy = 1; break; - } else if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx==ULONG_MAX ) ) { + } else if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx>=FD_TILE_MAX ) ) { 
topo->workspaces[ i ].numa_idx = 0; found_lazy = 1; break; @@ -532,12 +536,12 @@ initialize_numa_assignments( fd_topo_t * topo ) { for( ulong j=0UL; jtile_cnt; j++ ) { fd_topo_tile_t * tile = &topo->tiles[ j ]; for( ulong k=0UL; kuses_obj_cnt; k++ ) { - if( FD_LIKELY( tile->uses_obj_id[ k ]==max_obj && tile->cpu_idx!=ULONG_MAX ) ) { + if( FD_LIKELY( tile->uses_obj_id[ k ]==max_obj && tile->cpu_idxworkspaces[ i ].numa_idx = fd_numa_node_idx( tile->cpu_idx ); FD_TEST( topo->workspaces[ i ].numa_idx!=ULONG_MAX ); found_lazy = 1; break; - } else if( FD_UNLIKELY( tile->uses_obj_id[ k ]==max_obj ) && tile->cpu_idx==ULONG_MAX ) { + } else if( FD_UNLIKELY( tile->uses_obj_id[ k ]==max_obj ) && tile->cpu_idx>=FD_TILE_MAX ) { topo->workspaces[ i ].numa_idx = 0; found_lazy = 1; /* Don't break, keep looking -- a tile with a CPU assignment @@ -602,7 +606,9 @@ fd_topob_finish( fd_topo_t * topo, if( FD_UNLIKELY( cb->loose ) ) loose_sz += cb->loose( topo, obj ); } - ulong part_max = 3UL + (loose_sz / (64UL << 10)); /* 3 for initial alignment + actual alloc + residual padding */ + ulong part_max = wksp->part_max; + if( !part_max ) part_max = (loose_sz / (64UL << 10)); /* alloc + residual padding */ + part_max += 3; /* for initial alignment */ ulong offset = fd_ulong_align_up( fd_wksp_private_data_off( part_max ), fd_topo_workspace_align() ); for( ulong j=0UL; jobj_cnt; j++ ) { diff --git a/src/disco/topo/fd_topob.h b/src/disco/topo/fd_topob.h index cbb6d100b0..162237d18d 100644 --- a/src/disco/topo/fd_topob.h +++ b/src/disco/topo/fd_topob.h @@ -72,7 +72,7 @@ fd_topob_tile_uses( fd_topo_t * topo, can have no backing data buffer, a dcache, or a reassembly buffer behind it. 
*/ -void +fd_topo_link_t * fd_topob_link( fd_topo_t * topo, char const * link_name, char const * wksp_name, diff --git a/src/discof/batch/fd_batch_tile.c b/src/discof/batch/fd_batch_tile.c index ab6ea9beb8..d26f2f2685 100644 --- a/src/discof/batch/fd_batch_tile.c +++ b/src/discof/batch/fd_batch_tile.c @@ -1,7 +1,6 @@ #include "../../disco/topo/fd_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" #include "../../flamenco/runtime/fd_hashes.h" #include "../../flamenco/runtime/fd_txncache.h" #include "../../flamenco/snapshot/fd_snapshot_create.h" @@ -23,7 +22,6 @@ struct fd_snapshot_tile_ctx { ulong full_interval; ulong incremental_interval; char const * out_dir; - char funk_file[ PATH_MAX ]; /* Shared data structures. */ fd_txncache_t * status_cache; @@ -36,9 +34,6 @@ struct fd_snapshot_tile_ctx { int full_snapshot_fd; int incremental_snapshot_fd; - /* Only join funk after tiles start spinning. */ - int is_funk_active; - /* Metadata from the full snapshot used for incremental snapshots. */ ulong last_full_snap_slot; fd_hash_t last_hash; @@ -171,11 +166,9 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* We only want to join funk after it has been setup and joined in the - replay tile. - TODO: Eventually funk will be joined via a shared topology object. 
*/ - ctx->is_funk_active = 0; - memcpy( ctx->funk_file, tile->replay.funk_file, sizeof(tile->replay.funk_file) ); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->batch.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } /**********************************************************************/ /* status cache */ @@ -428,20 +421,6 @@ after_credit( fd_snapshot_tile_ctx_t * ctx, return; } - if( FD_UNLIKELY( !ctx->is_funk_active ) ) { - /* Setting these parameters are not required because we are joining the - funk that was setup in the replay tile. */ - fd_funk_t * funk = fd_funk_open_file( - ctx->funk, ctx->funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "Failed to join a funk database" )); - } - ctx->is_funk_active = 1; - - FD_LOG_WARNING(( "Joined funk database at file=%s", ctx->funk_file )); - } - if( fd_batch_fseq_is_snapshot( batch_fseq ) ) { produce_snapshot( ctx, batch_fseq ); } else { diff --git a/src/discof/exec/fd_exec_tile.c b/src/discof/exec/fd_exec_tile.c index 3de851a001..342d156430 100644 --- a/src/discof/exec/fd_exec_tile.c +++ b/src/discof/exec/fd_exec_tile.c @@ -1,5 +1,3 @@ -#include -#define _GNU_SOURCE #include "../../disco/tiles.h" #include "generated/fd_exec_tile_seccomp.h" @@ -12,7 +10,6 @@ #include "../../flamenco/runtime/program/fd_bpf_program_util.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" struct fd_exec_tile_out_ctx { ulong idx; @@ -118,9 +115,7 @@ struct fd_exec_tile_ctx { int pending_slot_pop; int pending_epoch_pop; - /* Funk-specific setup. */ fd_funk_t funk[1]; - fd_wksp_t * funk_wksp; /* Data structures related to managing and executing the transaction. 
The fd_txn_p_t is refreshed with every transaction and is sent @@ -642,19 +637,9 @@ unprivileged_init( fd_topo_t * topo, /* funk-specific setup */ /********************************************************************/ - /* Setting these parameters are not required because we are joining - the funk that was setup in the replay tile. */ - FD_LOG_NOTICE(( "Trying to join funk at file=%s", tile->exec.funk_file )); - fd_funk_txn_start_write( NULL ); - if( FD_UNLIKELY( !fd_funk_open_file( - ctx->funk, tile->exec.funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READONLY, NULL ) ) ) { - FD_LOG_ERR(( "fd_funk_open_file(%s) failed", tile->exec.funk_file )); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->exec.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); } - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( ctx->funk ); - - FD_LOG_NOTICE(( "Just joined funk at file=%s", tile->exec.funk_file )); //FIXME /********************************************************************/ diff --git a/src/discof/repair/fd_repair_tile.c b/src/discof/repair/fd_repair_tile.c index 7c22e26b7a..b00871180a 100644 --- a/src/discof/repair/fd_repair_tile.c +++ b/src/discof/repair/fd_repair_tile.c @@ -1348,7 +1348,7 @@ unprivileged_init( fd_topo_t * topo, } if( FD_UNLIKELY( sign_link_out_idx==UINT_MAX ) ) FD_LOG_ERR(( "Missing gossip_sign link" )); ctx->shred_tile_cnt = shred_tile_idx; - FD_TEST( ctx->shred_tile_cnt == tile->repair.shred_tile_cnt ); + FD_TEST( ctx->shred_tile_cnt == fd_topo_tile_name_cnt( topo, "shred" ) ); /* Scratch mem setup */ diff --git a/src/discof/replay/fd_replay_tile.c b/src/discof/replay/fd_replay_tile.c index e38d515763..a8f070fe7c 100644 --- a/src/discof/replay/fd_replay_tile.c +++ b/src/discof/replay/fd_replay_tile.c @@ -24,7 +24,6 @@ #include "../../flamenco/rewards/fd_rewards.h" #include "../../disco/metrics/fd_metrics.h" #include "../../choreo/fd_choreo.h" -#include "../../funk/fd_funk_filemap.h" 
#include "../../flamenco/snapshot/fd_snapshot_create.h" #include "../../disco/plugin/fd_plugin.h" //#include "fd_replay.h" @@ -109,7 +108,6 @@ typedef struct fd_slice_exec_ctx fd_slice_exec_ctx_t; struct fd_replay_tile_ctx { fd_wksp_t * wksp; fd_wksp_t * blockstore_wksp; - fd_wksp_t * funk_wksp; fd_wksp_t * status_cache_wksp; fd_wksp_t * runtime_public_wksp; @@ -745,7 +743,7 @@ checkpt( fd_replay_tile_ctx_t * ctx ) { FD_LOG_ERR( ( "blockstore checkpt failed: error %d", rc ) ); } } - int rc = fd_wksp_checkpt( ctx->funk_wksp, ctx->funk_checkpt, 0666, 0, NULL ); + int rc = fd_wksp_checkpt( ctx->funk->wksp, ctx->funk_checkpt, 0666, 0, NULL ); if( rc ) { FD_LOG_ERR( ( "funk checkpt failed: error %d", rc ) ); } @@ -2687,40 +2685,6 @@ privileged_init( fd_topo_t * topo, if( FD_UNLIKELY( !ctx->runtime_public ) ) { FD_LOG_ERR(( "no runtime_public" )); } - - - /* Open Funk */ - fd_funk_txn_start_write( NULL ); - fd_funk_t * funk; - const char * snapshot = tile->replay.snapshot; - if( strcmp( snapshot, "funk" ) == 0 ) { - /* Funk database already exists. The parameters are actually mostly ignored. */ - funk = fd_funk_open_file( - ctx->funk, - tile->replay.funk_file, 1, ctx->funk_seed, tile->replay.funk_txn_max, - tile->replay.funk_rec_max, tile->replay.funk_sz_gb * (1UL<<30), - FD_FUNK_READ_WRITE, NULL ); - } else if( strncmp( snapshot, "wksp:", 5 ) == 0) { - /* Recover funk database from a checkpoint. 
*/ - funk = fd_funk_recover_checkpoint( ctx->funk, tile->replay.funk_file, 1, snapshot+5, NULL ); - } else { - FD_LOG_NOTICE(( "Trying to create new funk at file=%s", tile->replay.funk_file )); - /* Create new funk database */ - funk = fd_funk_open_file( - ctx->funk, - tile->replay.funk_file, 1, ctx->funk_seed, tile->replay.funk_txn_max, - tile->replay.funk_rec_max, tile->replay.funk_sz_gb * (1UL<<30), - FD_FUNK_OVERWRITE, NULL ); - FD_LOG_NOTICE(( "Opened funk file at %s", tile->replay.funk_file )); - } - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "Failed to join funk database" )); - } - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( funk ); - if( FD_UNLIKELY( ctx->funk_wksp == NULL ) ) { - FD_LOG_ERR(( "no funk wksp" )); - } } static void @@ -2804,8 +2768,9 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* TODO: This below code needs to be shared as a topology object. This - will involve adding support to create a funk-based file here. */ + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->replay.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } ctx->is_caught_up = 0; @@ -2916,7 +2881,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* Join each of the exec spads. 
*/ - ctx->exec_cnt = tile->replay.exec_tile_count; + ctx->exec_cnt = fd_topo_tile_name_cnt( topo, "exec" ); for( ulong i=0UL; iexec_cnt; i++ ) { ulong exec_spad_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "exec_spad.%lu", i ); fd_spad_t * spad = fd_spad_join( fd_topo_obj_laddr( topo, exec_spad_id ) ); @@ -3013,8 +2978,8 @@ unprivileged_init( fd_topo_t * topo, /* bank */ /**********************************************************************/ - ctx->bank_cnt = tile->replay.bank_tile_count; - for( ulong i=0UL; ireplay.bank_tile_count; i++ ) { + ctx->bank_cnt = fd_topo_tile_name_cnt( topo, "bank" ); + for( ulong i=0UL; i<(ctx->bank_cnt); i++ ) { ulong busy_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "bank_busy.%lu", i ); FD_TEST( busy_obj_id!=ULONG_MAX ); ctx->bank_busy[ i ] = fd_fseq_join( fd_topo_obj_laddr( topo, busy_obj_id ) ); @@ -3039,7 +3004,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* exec */ /**********************************************************************/ - ctx->exec_cnt = tile->replay.exec_tile_count; + ctx->exec_cnt = fd_topo_tile_name_cnt( topo, "exec" ); if( FD_UNLIKELY( ctx->exec_cnt>FD_PACK_MAX_BANK_TILES ) ) { FD_LOG_ERR(( "replay tile has too many exec tiles %lu", ctx->exec_cnt )); } @@ -3083,7 +3048,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* writer */ /**********************************************************************/ - ctx->writer_cnt = tile->replay.writer_tile_cuont; + ctx->writer_cnt = fd_topo_tile_name_cnt( topo, "writer" ); if( FD_UNLIKELY( ctx->writer_cnt>FD_PACK_MAX_BANK_TILES ) ) { FD_LOG_CRIT(( "replay tile has too many writer tiles %lu", ctx->writer_cnt )); } diff --git a/src/discof/restart/fd_restart_tile.c b/src/discof/restart/fd_restart_tile.c index 6d7d8a0500..944d484729 100644 --- a/src/discof/restart/fd_restart_tile.c +++ b/src/discof/restart/fd_restart_tile.c 
@@ -4,7 +4,6 @@ #include "../../disco/topo/fd_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../disco/keyguard/fd_keyload.h" -#include "../../funk/fd_funk_filemap.h" #include "../../flamenco/runtime/fd_runtime.h" #define GOSSIP_IN_IDX (0UL) @@ -18,7 +17,6 @@ struct fd_restart_tile_ctx { fd_funk_t funk[1]; fd_epoch_bank_t epoch_bank; int is_funk_active; - char funk_file[ PATH_MAX ]; fd_spad_t * runtime_spad; int tower_checkpt_fileno; fd_pubkey_t identity, coordinator, genesis_hash; @@ -114,10 +112,10 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* TODO: Same as what happens in the batch tile, eventually, funk should - be joined via a shared topology object. */ + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->restart.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } ctx->is_funk_active = 0; - memcpy( ctx->funk_file, tile->restart.funk_file, sizeof(tile->restart.funk_file) ); /**********************************************************************/ /* spad */ @@ -346,16 +344,6 @@ after_credit( fd_restart_tile_ctx_t * ctx, int * opt_poll_in FD_PARAM_UNUSED, int * charge_busy FD_PARAM_UNUSED ) { if( FD_UNLIKELY( !ctx->is_funk_active ) ) { - /* Setting these parameters are not required because we are joining the - funk that was setup in the replay tile. 
*/ - fd_funk_t * funk = fd_funk_open_file( - ctx->funk, ctx->funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "fd_funk_open_file failed" )); - } else { - FD_LOG_NOTICE(("Restart tile joins funk successfully")); - } ctx->is_funk_active = 1; /* Decode the slot bank from funk, referencing fd_runtime_recover_banks() in fd_runtime_init.c */ diff --git a/src/discof/restart/test/restart_fd.sh b/src/discof/restart/test/restart_fd.sh index 399c03c341..fe248b9ffb 100755 --- a/src/discof/restart/test/restart_fd.sh +++ b/src/discof/restart/test/restart_fd.sh @@ -47,12 +47,8 @@ echo " repair_serve_listen_port = 9056 [tiles.replay] snapshot = \"funk\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = \"$CLUSTER_VERSION\" tower_checkpt = \"$TOWER_CHECKPT_FILE\" - funk_file = \"$FUNK_FILE\" [tiles.restart] in_wen_restart = true wen_restart_coordinator = \"$RESTART_COORDINATOR\" @@ -74,6 +70,10 @@ echo " idx_max = 512 alloc_max = 10737418240 file = \"$BLOCK_FILE\" +[funk] + max_account_records = 10000000 + heap_size_gib = 32 + max_database_transactions = 1024 " > wen_restart.toml sudo gdb --args build/native/gcc/bin/firedancer-dev dev --config wen_restart.toml diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk new file mode 100644 index 0000000000..334526f06b --- /dev/null +++ b/src/discof/restore/Local.mk @@ -0,0 +1,16 @@ +$(call add-objs,fd_filerd_tile,fd_discof) +ifdef FD_HAS_ZSTD +$(call add-objs,fd_unzstd_tile,fd_discof) +endif +ifdef FD_HAS_INT128 +$(call add-objs,fd_snapin_tile,fd_discof) +$(call add-objs,fd_actalc_tile,fd_discof) +endif +$(call add-objs,fd_actidx_tile,fd_discof) +$(call add-objs,fd_httpdl_tile,fd_discof) +$(call add-objs,stream/fd_stream_writer,fd_discof) +$(call add-objs,stream/fd_event_map,fd_discof) +$(call add-objs,stream/fd_stream_ctx,fd_discof) +ifdef FD_HAS_INT128 +$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof 
fd_disco fd_flamenco fd_tango fd_ballet fd_util) +endif diff --git a/src/discof/restore/README.md b/src/discof/restore/README.md new file mode 100644 index 0000000000..4eb5d87352 --- /dev/null +++ b/src/discof/restore/README.md @@ -0,0 +1,87 @@ +# Snapshot Restore + +## Philosophy + +Firedancer is optimized to restore snapshots as fast as possible, i.e. +at I/O and memory bandwidth limits. + +Fast snapshot restore time is not only helpful for operators, but +crucial for fast recovery from failures, which may be widespread in the +worst case. + +To meet these performance requirements, a multi-layer scaling approach +is used: + +- **SIMD:** Cryptographic computations (hashing) are accelerated via + AVX2 / AVX10 SIMD instructions +- **ILP:** Performance-critical logic is hand-optimized for good single-core + throughput on AMD Zen 2 (parallel random memory accesses via prefetching, + non-temporal memory copies, xxHash3 hashing) +- **Thread parallelism:** Certain algorithms redesigned as massively + parallel batch computations (e.g. parallel hashmap insert via sample sort) +- **Pipelining:** Snapshot loading step run concurrently / streaming if + possible. Each step is pinned to a core and independently scalable + for ideal throughput and efficient cache utilization. + +## Pipeline + +Phase 1: Ingest accounts into memory + +``` +FileRd -> UnZstd -> SnapIn -> FnkAlc -> FnkCpy +``` + +- FileRd: Reads a file +- UnZstd: Does Zstandard decompression +- SnapIn: Reads a snapshot +- FnkAlc: Allocates funk heap memory +- FnkCpy: Copies account data out to funk memory + +Phase 2: Index accounts + +``` +ActIdx -> ActDup +``` + +- ActIdx: Indexes accounts +- ActDup: Deletes duplicate accounts + +## Stream link conventions + +Various snapshot components use byte streams, not packet streams. + +These require custom conventions. + +**Stream fragment descriptors** + +Byte streams use `fd_stream_frag_meta_t` (defined in `fd_restore_base.h`). 
+ +These have the following changes: +- `chunk` is replaced by `goff` and `loff`, which are 64-bit offsets + describing the stream offset and dcache offset respectively +- `tsorig` / `tspub` are removed (latency is less relevant) +- `sig` is removed (cannot filter without looking at stream data) +- `sz` is widened to 32 bits. + +**Dcache allocations** + +Payloads in stream dcaches are unaligned. Payloads are addressed with +uncompressed byte offsets relative to the workspace start. + +(Compare this to the usual compact packet dcaches, which use 64 byte +aligned chunks with compressed addressing.) + +**Stream backpressure** + +Byte streams naturally require a reliable transport. + +Consumers periodically publish their progress in `fseq`. +- `fseq[0]` is the lowest sequence number not yet consumed (standard) +- `fseq[1]` is the stream offset of the next byte not yet consumed + +**Frames in streams** + +Tiles can reference stream data zero-copy style. For example, the +`SnapIn` tile publishes fragments describing the accounts it parsed out +of a snapshot stream, where each fragment refers to a byte range in the +stream dcache. 
diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c new file mode 100644 index 0000000000..1f17b0390a --- /dev/null +++ b/src/discof/restore/fd_actalc_tile.c @@ -0,0 +1,607 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include "../../flamenco/runtime/fd_acc_mgr.h" /* FD_ACC_SZ_MAX */ +#include "../../flamenco/types/fd_types.h" +#include "../../funk/fd_funk.h" + +#define NAME "ActAlc" +#define LINK_IN_MAX 1UL +#define BURST 1UL + +/* The ActAlc tile has the following responsibilities: + - Bump allocate funk records + - Heap allocate account datas + - Copy account data */ + +struct fd_actalc_tile { + /* Stream input */ + + uchar const * in_base; + union { + fd_solana_account_hdr_t acc_meta; + uchar in_buf[ 136 ]; + }; + ulong in_skip; + ulong acc_seq0; + + /* Funk database */ + + fd_funk_t funk[1]; + void * funk_base; + + fd_alloc_t * alloc; + ulong funk_seed; + + fd_funk_rec_t * rec_next; + ulong rec1_laddr; + + uint db_full : 1; + + /* Account output */ + + fd_account_frag_meta_t * out_mcache; + + ulong out_seq; + ulong out_seq_max; + ulong out_cnt; + ulong out_depth; + + /* Metrics */ + + struct { + ulong alloc_cnt; + ulong cum_alloc_sz; + } metrics; +}; + +typedef struct fd_actalc_tile fd_actalc_tile_t; + +struct fd_actalc_in { + fd_stream_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + ulong goff; + fd_stream_frag_meta_t const * mline; + ulong volatile * restrict fseq; + uint accum[6]; +}; + +typedef struct fd_actalc_in fd_actalc_in_t; + +static ulong +scratch_align( void ) { + return alignof(fd_actalc_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_actalc_tile_t); +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `ActAlc` tile" )); + + if( FD_UNLIKELY( tile->in_cnt 
!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + /* FIXME check link names */ + + fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + memset( ctx, 0, sizeof(fd_actalc_tile_t) ); + + /* Join stream input */ + + FD_TEST( fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ) ); + ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->in_skip = 0UL; + + /* Join funk database */ + + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actalc.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + + ctx->funk_base = fd_wksp_containing( ctx->funk->shmem ); + ctx->alloc = fd_funk_alloc( ctx->funk ); + ctx->funk_seed = fd_funk_seed( ctx->funk ); + + fd_funk_rec_map_t const * rec_map = fd_funk_rec_map( ctx->funk ); + ctx->rec_next = rec_map->ele; + ctx->rec1_laddr = (ulong)( rec_map->ele + rec_map->ele_max ); + + /* Join account output */ + + ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); + ctx->out_seq = 0UL; + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); +} + +static void +during_housekeeping( fd_actalc_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_actalc_tile_t * ctx ) { + (void)ctx; +} + +static void +allocate_account( fd_actalc_tile_t * ctx, + ulong acc_seq0, + fd_solana_account_hdr_t const * hdr ) { + ulong const acc_data_sz = hdr->meta.data_len; + + if( FD_UNLIKELY( acc_data_sz > FD_ACC_SZ_MAX ) ) { + FD_LOG_ERR(( "account data size (%lu) exceeds max (%lu) (possible memory corruption?)", acc_data_sz, FD_ACC_SZ_MAX )); + } + + /* Allocate account */ + + ulong const buf_min = sizeof(fd_account_meta_t)+acc_data_sz; + ulong buf_max = 0UL; + void * const buf = 
fd_alloc_malloc_at_least( ctx->alloc, 1UL, buf_min, &buf_max ); + if( FD_UNLIKELY( !buf ) ) { + FD_LOG_WARNING(( "Database full after inserting %lu records totalling %.3f GiB: fd_alloc_malloc(align=1,sz=%lu) failed", + ctx->metrics.alloc_cnt, (double)ctx->metrics.cum_alloc_sz/(double)(1UL<<30), buf_min )); + ctx->db_full = 1; + return; + } + ulong const buf_gaddr = (ulong)buf - (ulong)ctx->funk_base; + ctx->metrics.alloc_cnt++; + ctx->metrics.cum_alloc_sz += buf_min; + + /* Calculate funk hash */ + + ulong const funk_hash = fd_funk_rec_key_hash1( hdr->meta.pubkey, FD_FUNK_KEY_TYPE_ACC, ctx->funk_seed ); + + /* Copy account metadata */ + + fd_account_meta_t * meta = buf; + *meta = (fd_account_meta_t) { + .magic = FD_ACCOUNT_META_MAGIC, + .hlen = sizeof(fd_account_meta_t), + .dlen = acc_data_sz, + .slot = hdr->meta.write_version_obsolete, /* ??? */ + .info = { + .lamports = hdr->info.lamports, + .rent_epoch = hdr->info.rent_epoch, + .executable = hdr->info.executable + } + }; + memcpy( meta->info.owner, hdr->info.owner, sizeof(fd_pubkey_t) ); + + /* Allocate funk record */ + + fd_funk_rec_t * rec = ctx->rec_next; + ulong const rec_gaddr = (ulong)rec - (ulong)ctx->funk_base; + memset( rec, 0, sizeof(fd_funk_rec_t) ); + + memcpy( rec->pair.key->uc, hdr->meta.pubkey, sizeof(fd_pubkey_t) ); + rec->pair.key->ul[ 4 ] = FD_FUNK_KEY_TYPE_ACC; + + rec->map_hash = funk_hash; + rec->val_sz = (uint)buf_min; + rec->val_max = (uint)buf_max; + rec->val_gaddr = buf_gaddr; + + /* Publish account descriptor */ + + fd_mcache_publish_account( + ctx->out_mcache, + ctx->out_depth, + ctx->out_seq, + funk_hash, + rec_gaddr, + acc_seq0 + ); + + /* Wind up for next publish */ + + ctx->out_seq = fd_seq_inc( ctx->out_seq, 1UL ); + ctx->rec_next++; + if( FD_UNLIKELY( (ulong)ctx->rec_next >= ctx->rec1_laddr ) ) { + FD_LOG_WARNING(( "Funk record map full" )); + ctx->db_full = 1; + return; + } +} + +static void +on_stream_frag( fd_actalc_tile_t * ctx, + fd_stream_frag_meta_t const * meta ) { + 
ulong const seq = meta->seq; + ulong const loff = meta->loff; + ulong const sz = meta->sz; + ulong const ctl = meta->ctl; + int const som = fd_frag_meta_ctl_som( ctl ); /* first frag of account? */ + + /* Are we already done with this account? */ + if( FD_UNLIKELY( !som && !ctx->in_skip ) ) return; + if( som ) ctx->in_skip = 0UL; /* new account: discard any partial header left over from a truncated previous one */ + /* Read account header */ + ulong const want_sz = sizeof(fd_solana_account_hdr_t); + uchar const * frag = ctx->in_base + loff; + + /* Unfragmented fast path */ + if( FD_LIKELY( som && sz>=want_sz ) ) { + allocate_account( ctx, seq, (fd_solana_account_hdr_t const *)frag ); + return; + } + + /* Slow path: Recover from fragmentation */ + if( som ) ctx->acc_seq0 = seq; + if( FD_UNLIKELY( ctx->in_skip >= want_sz ) ) FD_LOG_CRIT(( "invariant violation: in_skip (%lu) >= want_sz (%lu)", ctx->in_skip, want_sz )); + ulong const chunk0 = ctx->in_skip; /* header bytes already buffered in in_buf */ + ulong const rem_sz = want_sz-chunk0; + ulong const read_sz = fd_ulong_min( rem_sz, sz ); + if( FD_UNLIKELY( !read_sz ) ) return; + fd_memcpy( ctx->in_buf+chunk0, frag, read_sz ); + ctx->in_skip += read_sz; + if( FD_LIKELY( ctx->in_skip == want_sz ) ) { + allocate_account( ctx, ctx->acc_seq0, &ctx->acc_meta ); + ctx->in_skip = 0UL; + } +} + +/* fd_actalc_in_update gets called periodically to synchronize flow control + credits back to the stream producer. Also updates link in metrics. 
*/ + +static void +fd_actalc_in_update( fd_actalc_in_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + FD_VOLATILE( in->fseq[1] ) = in->goff; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noinline)) static void +fd_actalc_run1( + fd_actalc_tile_t * ctx, + ulong in_cnt, + fd_actalc_in_t * in, /* [in_cnt] */ + fd_frag_meta_t * out_mcache, + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong ** cons_fseq, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + /* out frag stream init */ + + ulong const burst = BURST; + + if( FD_UNLIKELY( !out_mcache ) ) FD_LOG_ERR(( "NULL out_mcache" )); + + ulong const out_depth = fd_mcache_depth( out_mcache ); + ulong const cr_max = out_depth; + + for( ulong cons_idx=0UL; cons_idx=0L ) ) { + ulong event_idx = 
(ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. */ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_actalc_in_update( &in[ in_idx ] ); + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Send synchronization info */ + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[1] ) = ctx->out_cnt; + FD_COMPILER_MFENCE(); + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + ulong slowest_cons = ULONG_MAX; + ulong cr_avail = cr_max; + for( ulong cons_idx=0UL; cons_idxout_seq, cons_seq[ cons_idx ] ), 0L ), 0L ); + slowest_cons = fd_ulong_if( cons_cr_availout_seq_max = ctx->out_seq + cr_avail; + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + FD_COMPILER_MFENCE(); + } + + during_housekeeping( ctx ); + + } + + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. 
*/ + + event_seq++; + if( FD_UNLIKELY( event_seq>=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_actalc_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. */ + + if( FD_UNLIKELY( ctx->db_full || ctx->out_seq+burst > ctx->out_seq_max ) ) { + metric_backp_cnt += (ulong)!metric_in_backp; + metric_in_backp = 1UL; + FD_SPIN_PAUSE(); + metric_regime_ticks[2] += housekeeping_ticks; + long next = fd_tickcount(); + metric_regime_ticks[5] += (ulong)(next - now); + now = next; + continue; + } + metric_in_backp = 0UL; + + /* Select which in to poll next (randomized round robin) */ + + if( FD_UNLIKELY( !in_cnt ) ) { + metric_regime_ticks[0] += housekeeping_ticks; + long next = fd_tickcount(); + metric_regime_ticks[3] += (ulong)(next - now); + now = next; + continue; + } + + ulong prefrag_ticks = 0UL; + + fd_actalc_in_t * this_in = &in[ in_seq ]; + in_seq++; + if( in_seq>=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_stream_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime 
= &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + fd_stream_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + on_stream_frag( ctx, &meta ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)meta.sz; + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu: seq_found=%lu seq_test=%lu", in_seq, seq_found, seq_test )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->goff = meta.goff + meta.sz; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +static void +fd_actalc_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + 
FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + FD_TEST( tile->out_cnt==1UL ); + fd_frag_meta_t * const out_mcache = topo->links[ tile->out_link_id[ 0 ] ].mcache; + FD_TEST( out_mcache ); + + ulong reliable_cons_cnt = 0UL; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->goff = 0UL; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+polled_in_cnt+reliable_cons_cnt ]; /* fd_actalc_run1 expects [1+in_cnt+cons_cnt]: housekeeping + one slot per in + one per reliable consumer */ + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_actalc_run1( ctx, polled_in_cnt, polled_in, out_mcache, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc = { + .name = "ActAlc", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_actalc_run, +}; +#endif diff --git a/src/discof/restore/fd_actidx_tile.c 
b/src/discof/restore/fd_actidx_tile.c new file mode 100644 index 0000000000..f58fa243c2 --- /dev/null +++ b/src/discof/restore/fd_actidx_tile.c @@ -0,0 +1,473 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../funk/fd_funk.h" +#include "../../disco/metrics/fd_metrics.h" + +#define NAME "ActIdx" +#define LINK_IN_MAX 1UL +#define BURST 1UL + +typedef fd_funk_rec_map_shmem_private_chain_t fd_funk_rec_chain_t; + +struct fd_actidx_tile { + /* Stream input */ + + uchar const * in_base; + + /* Funk database */ + + fd_funk_t funk[1]; + void * funk_base; + fd_funk_rec_t * rec0; + fd_funk_rec_chain_t * chain0; + ulong chain_mask; +}; + +typedef struct fd_actidx_tile fd_actidx_tile_t; + +struct fd_actidx_in { + fd_account_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + fd_account_frag_meta_t const * mline; + ulong volatile * restrict fseq; + uint accum[6]; +}; + +typedef struct fd_actidx_in fd_actidx_in_t; + +static ulong +scratch_align( void ) { + return alignof(fd_actidx_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_actidx_tile_t); +} + +static void +during_housekeeping( fd_actidx_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_actidx_tile_t * ctx ) { + (void)ctx; +} + +static void +on_account_frag( fd_actidx_tile_t * ctx, + fd_account_frag_meta_t const * meta ) { /* links the record at meta->gaddr in as the new head of the hash chain selected by meta->rec_hash */ + ulong const chain_mask = ctx->chain_mask; + ulong const rec_hash = meta->rec_hash; + ulong const rec_gaddr = meta->gaddr; + + fd_funk_rec_t * const rec = (fd_funk_rec_t *)( (ulong)ctx->funk_base + rec_gaddr ); + ulong const rec_idx = (ulong)( rec - ctx->rec0 ); /* pointer difference is already in units of records; do not divide by sizeof again */ + ulong const chain_idx = rec_hash & chain_mask; + fd_funk_rec_chain_t * const chain = ctx->chain0 + chain_idx; + + ulong ver_cnt = chain->ver_cnt; + ulong version = fd_funk_rec_map_private_vcnt_ver( ver_cnt ); + ulong ele_cnt = fd_funk_rec_map_private_vcnt_cnt( ver_cnt ); + + uint 
old_head = chain->head_cidx; + uint new_head = (uint)rec_idx; + rec->map_next = old_head; + + chain->head_cidx = new_head; + chain->ver_cnt = fd_funk_rec_map_private_vcnt( version, ele_cnt+1UL ); +} + +static void +fd_actidx_in_update( fd_actidx_in_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `ActIdx` tile" )); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + //if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + + fd_actidx_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + memset( ctx, 0, sizeof(fd_actidx_tile_t) ); + + /* Join funk database */ + + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actidx.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + ctx->funk_base = fd_wksp_containing( ctx->funk->shmem ); + + fd_funk_rec_map_t const * rec_map = fd_funk_rec_map( ctx->funk ); + ctx->rec0 = rec_map->ele; + ctx->chain0 = fd_funk_rec_map_shmem_private_chain( rec_map->map, 0UL ); + ctx->chain_mask = rec_map->map->chain_cnt-1UL; +} + +__attribute__((noinline)) static void +fd_actidx_run1( + fd_actidx_tile_t * ctx, + ulong in_cnt, + fd_actidx_in_t * in, /* [in_cnt] */ + ulong 
out_cnt, + fd_frag_meta_t ** out_mcache, /* [out_cnt] */ + ulong * out_depth, /* [out_cnt] */ + ulong * out_seq, /* [out_cnt] */ + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong * cons_out, /* [cons_cnt] */ + ulong ** cons_fseq, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* out flow control state */ + ulong cr_avail; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + /* out frag stream init */ + + cr_avail = 0UL; + + ulong const burst = BURST; + + ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); + + for( ulong out_idx=0UL; out_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. 
*/ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_actidx_in_update( &in[ in_idx ] ); + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + if( FD_LIKELY( cr_avail=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_actidx_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. 
*/ + + if( FD_UNLIKELY( cr_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_account_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + fd_account_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + on_account_frag( ctx, &meta ); + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu: seq_found=%lu seq_test=%lu", in_seq, seq_found, seq_test )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now 
= next; + } +} + +static void +fd_actidx_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_account_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_frag_meta_t * out_mcache[ tile->out_cnt ]; + ulong out_depth [ tile->out_cnt ]; + ulong out_seq [ tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; + FD_TEST( out_mcache[ i ] ); + out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); + out_seq [ i ] = 0UL; + } + + ulong reliable_cons_cnt = 0UL; + ulong cons_out[ FD_TOPO_MAX_LINKS ]; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_out[ reliable_cons_cnt ] = k; + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + 
this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_actidx_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+polled_in_cnt+reliable_cons_cnt ]; /* fd_actidx_run1 expects [1+in_cnt+cons_cnt]: housekeeping + one slot per in + one per reliable consumer */ + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_actidx_run1( ctx, polled_in_cnt, polled_in, tile->out_cnt /* out arrays are sized by tile->out_cnt, not by the consumer count */, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_ActIdx = { + .name = "ActIdx", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_actidx_run, +}; +#endif diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c new file mode 100644 index 0000000000..da683448da --- /dev/null +++ b/src/discof/restore/fd_filerd_tile.c @@ -0,0 +1,139 @@ +#include "fd_restore_base.h" +#include "stream/fd_stream_ctx.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include +#include +#include +#include + +#define NAME "FileRd" +#define FILE_READ_MAX (8UL<<20) /* parenthesized so the shift cannot re-associate at the expansion site */ + +struct fd_filerd_tile { + fd_stream_writer_t * writer; + int fd; +}; + +typedef struct fd_filerd_tile fd_filerd_tile_t; + +static ulong +scratch_align( void ) { + return alignof(fd_filerd_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_filerd_tile_t); +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + fd_memset( ctx, 0, sizeof(fd_filerd_tile_t) ); + + if( FD_UNLIKELY( !tile->filerd.file_path[0] ) ) FD_LOG_ERR(( "File path not set" )); + ctx->fd = open( tile->filerd.file_path, O_RDONLY|O_CLOEXEC ); + if( FD_UNLIKELY( ctx->fd<0 ) ) FD_LOG_ERR(( "open() failed (%i-%s)", errno, fd_io_strerror( errno ) )); +} + +static void 
+unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + (void)topo; + if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); +} + +static void +fd_filerd_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_filerd_tile_t * ctx = _ctx; + ctx->writer = fd_stream_writer_join( stream_ctx->writers[0] ); + FD_TEST( ctx->writer ); + fd_stream_writer_set_frag_sz_max( ctx->writer, FILE_READ_MAX ); +} + +__attribute__((noreturn)) FD_FN_UNUSED static void +fd_filerd_shutdown( fd_filerd_tile_t * ctx ) { + if( FD_UNLIKELY( close( ctx->fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } + ctx->fd = -1; + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + fd_stream_writer_close( ctx->writer ); + FD_COMPILER_MFENCE(); + FD_LOG_INFO(( "Reached end of file" )); + + for(;;) pause(); +} + +static void +after_credit( void * _ctx, + fd_stream_ctx_t * stream_ctx, + int * poll_in FD_PARAM_UNUSED ) { + fd_filerd_tile_t * ctx = _ctx; + (void)stream_ctx; + + uchar * out = fd_stream_writer_prepare( ctx->writer ); + ulong out_max = fd_stream_writer_publish_sz_max( ctx->writer ); + + /* technically, this is not needed because fd_stream_ctx_run_loop + checks for backpresure on all outgoing links and there is only one + outgoing link anyways. But, it is added for clarity that + callbacks should handle backpressure for their out links. 
*/ + if( FD_UNLIKELY( !out_max ) ) return; + + int fd = ctx->fd; + if( FD_UNLIKELY( fd<0 ) ) return; + + long res = read( fd, out, out_max ); + if( FD_UNLIKELY( res<=0L ) ) { + if( FD_UNLIKELY( res==0 ) ) { /* read() returned 0: end of file */ + fd_filerd_shutdown( ctx ); + return; + } + if( FD_LIKELY( errno==EAGAIN ) ) return; + FD_LOG_ERR(( "read() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + /* aborts app */ + } + + fd_stream_writer_publish( ctx->writer, (ulong)res, 0UL ); +} + +__attribute__((noinline)) static void +fd_filerd_run1( fd_filerd_tile_t * ctx, + fd_stream_ctx_t * stream_ctx ) { + FD_LOG_INFO(( "Running filerd tile" )); + + fd_stream_ctx_run( stream_ctx, + ctx, + fd_filerd_init_from_stream_ctx, + NULL, + NULL, + NULL, + after_credit, + NULL ); +} + +static void +fd_filerd_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + void * ctx_mem = fd_alloca_check( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_footprint( topo, tile ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile ); + fd_filerd_run1( ctx, stream_ctx ); +} + +fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd = { + .name = NAME, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, + .run = fd_filerd_run, +}; + +#undef NAME diff --git a/src/discof/restore/fd_httpdl_tile.c b/src/discof/restore/fd_httpdl_tile.c new file mode 100644 index 0000000000..fa03623d31 --- /dev/null +++ b/src/discof/restore/fd_httpdl_tile.c @@ -0,0 +1,163 @@ +#include "../../disco/topo/fd_topo.h" +#include "../../flamenco/snapshot/fd_snapshot_http.h" +#include "stream/fd_stream_writer.h" +#include "stream/fd_stream_ctx.h" +#include + +#define NAME "HttpDl" +#define HTTP_CHUNK_SZ (8UL * 1024UL * 1024UL) /* parenthesized so the product cannot re-associate at the expansion site */ + +struct fd_httpdl_tile { + fd_snapshot_http_t * http; + fd_stream_writer_t * writer; +}; +typedef struct fd_httpdl_tile fd_httpdl_tile_t; + +FD_FN_PURE static 
ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_httpdl_tile_t), + fd_ulong_max( fd_snapshot_http_align(), fd_stream_writer_align() ) ); +} + +FD_FN_PURE static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_httpdl_tile_t), sizeof(fd_httpdl_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_snapshot_http_align(), fd_snapshot_http_footprint() ); + return FD_LAYOUT_FINI( l, scratch_align() ); +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + fd_httpdl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_httpdl_tile_t), sizeof(fd_httpdl_tile_t) ); + void * http_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_http_align(), fd_snapshot_http_footprint() ); + + fd_memset( ctx, 0, sizeof(fd_httpdl_tile_t) ); + + if( FD_UNLIKELY( !tile->httpdl.dest[0] ) ) { + FD_LOG_ERR(( "http dest not set" )); + } + + /* TODO: is null ok for the name? 
*/ + ctx->http = fd_snapshot_http_new( http_mem, + tile->httpdl.dest, + tile->httpdl.ip4, + tile->httpdl.port, + tile->httpdl.snapshot_dir, + NULL ); + + fd_snapshot_http_privileged_init( ctx->http ); +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + (void)topo; + if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); +} + +static void +fd_httpdl_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + + /* join writer */ + ctx->writer = fd_stream_writer_join( stream_ctx->writers[0] ); + FD_TEST( ctx->writer ); + fd_stream_writer_set_frag_sz_max( ctx->writer, HTTP_CHUNK_SZ ); +} + +__attribute__((noreturn)) FD_FN_UNUSED static void +fd_httpdl_shutdown( fd_httpdl_tile_t * ctx ) { + fd_snapshot_http_cleanup_fds( ctx->http ); + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + fd_stream_writer_close( ctx->writer ); + FD_COMPILER_MFENCE(); + FD_LOG_WARNING(("Done downloading snapshot")); + + for(;;) pause(); +} + +__attribute__((unused)) static void +after_credit_chunk( void * _ctx, + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in FD_PARAM_UNUSED ) { + fd_httpdl_tile_t * ctx = _ctx; + (void)stream_ctx; + + /* Output */ + uchar * const out = fd_stream_writer_prepare( ctx->writer ); + uchar * const out_end = out + fd_stream_writer_publish_sz_max( ctx->writer ); + uchar * out_cur = out; + + while( out_curhttp, out_cur, (ulong)out_end-(ulong)out_cur, &chunk_sz ); /* size argument is the space still free in the prepared buffer, not the bytes already written */ + if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); + else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); + out_cur += chunk_sz; + } + + fd_stream_writer_publish( ctx->writer, (ulong)out_cur-(ulong)out, 0UL ); +} + +__attribute__((unused)) static void +after_credit_stream( void * _ctx, + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in 
FD_PARAM_UNUSED ) { + fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + (void)stream_ctx; + + /* Output */ + uchar * const out = fd_stream_writer_prepare( ctx->writer ); + ulong const out_max = fd_stream_writer_publish_sz_max( ctx->writer ); + + ulong chunk_sz; + int err = fd_io_istream_snapshot_http_read( ctx->http, out, out_max, &chunk_sz ); + if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); + else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); + + fd_stream_writer_publish( ctx->writer, chunk_sz, 0UL ); +} + +__attribute__((noinline)) static void +fd_httpdl_run1( + fd_httpdl_tile_t * ctx, + fd_stream_ctx_t * stream_ctx ) { + + FD_LOG_INFO(( "Running httpdl tile" )); + + fd_stream_ctx_run( stream_ctx, + ctx, + fd_httpdl_init_from_stream_ctx, + NULL, + NULL, + NULL, + after_credit_stream, + NULL ); +} + +static void +fd_httpdl_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_httpdl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + void * ctx_mem = fd_alloca_check( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_footprint( topo, tile ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile ); + FD_TEST( stream_ctx ); + fd_httpdl_run1( ctx, stream_ctx ); +} + +fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl = { + .name = NAME, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, + .run = fd_httpdl_run, +}; + +#undef NAME diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h new file mode 100644 index 0000000000..0dd55ecc26 --- /dev/null +++ b/src/discof/restore/fd_restore_base.h @@ -0,0 +1,111 @@ +#ifndef HEADER_fd_src_discof_restore_fd_restore_base_h +#define HEADER_fd_src_discof_restore_fd_restore_base_h + +#include "../../tango/mcache/fd_mcache.h" + +/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for + stream I/O. 
*/ + +union fd_stream_frag_meta { + + struct { + + ulong seq; /* frag sequence number */ + uint sz; + ushort unused; + ushort ctl; + + ulong goff; /* stream offset */ + ulong loff; /* dcache offset */ + + }; + + fd_frag_meta_t f[1]; + +}; + +typedef union fd_stream_frag_meta fd_stream_frag_meta_t; + +FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); + +/* fd_account_frag_meta_t is a variation of fd_frag_meta_t optimized for + accounts. */ + +union fd_account_frag_meta { + + struct { + + ulong seq; + ulong rec_hash; + + ulong gaddr; + ulong frag_seq; + + }; + + fd_frag_meta_t f[1]; + + fd_stream_frag_meta_t acc[1]; + +}; + +typedef union fd_account_frag_meta fd_account_frag_meta_t; + +FD_STATIC_ASSERT( alignof(fd_account_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_account_frag_meta_t)==32, abi ); + +/* fd_stream_frag_meta_ctx_t tracks receiving state from a stream */ +struct fd_stream_frag_meta_ctx { + uchar const * in_buf; + ulong goff_translate; + ulong loff_translate; + ulong in_skip; +}; +typedef struct fd_stream_frag_meta_ctx fd_stream_frag_meta_ctx_t; + +FD_PROTOTYPES_BEGIN + +static inline void +fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, + ulong depth, + ulong seq, + ulong goff, + ulong loff, + ulong sz, + ulong ctl ) { + fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + meta->goff = goff; + meta->sz = (uint)sz; + meta->ctl = (ushort)ctl; + meta->loff = loff; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + +static inline void +fd_mcache_publish_account( fd_account_frag_meta_t * mcache, + ulong depth, + ulong seq, + ulong rec_hash, + ulong gaddr, + ulong frag_seq ) { + fd_account_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + 
meta->rec_hash = rec_hash; + meta->gaddr = gaddr; + meta->frag_seq = frag_seq; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_restore_base_h */ diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c new file mode 100644 index 0000000000..e14eb3bf2f --- /dev/null +++ b/src/discof/restore/fd_snapin_tile.c @@ -0,0 +1,1240 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include "../../util/archive/fd_tar.h" +#include "../../flamenco/runtime/fd_acc_mgr.h" /* FD_ACC_SZ_MAX */ +#include "../../flamenco/types/fd_types.h" +#include +#include +#include +#include + +#define NAME "SnapIn" +#define LINK_IN_MAX 1UL +#define BURST 16UL + +#define SNAP_STATE_IGNORE ((uchar)0) /* ignore file content */ +#define SNAP_STATE_TAR ((uchar)1) /* reading tar header (buffered) */ +#define SNAP_STATE_MANIFEST ((uchar)2) /* reading manifest (buffered) */ +#define SNAP_STATE_ACCOUNT_HDR ((uchar)3) /* reading account hdr (buffered) */ +#define SNAP_STATE_ACCOUNT_DATA ((uchar)4) /* reading account data (zero copy) */ +#define SNAP_STATE_DONE ((uchar)5) /* expect no more data */ + +struct fd_snapshot_accv_key { + ulong slot; + ulong id; +}; + +typedef struct fd_snapshot_accv_key fd_snapshot_accv_key_t; + +static const fd_snapshot_accv_key_t +fd_snapshot_accv_key_null = { 0UL, 0UL }; + +FD_FN_PURE static inline ulong +fd_snapshot_accv_key_hash( fd_snapshot_accv_key_t key ) { + return fd_hash( 0x39c49607bf16463aUL, &key, sizeof(fd_snapshot_accv_key_t) ); +} + +struct fd_snapshot_accv_map { + fd_snapshot_accv_key_t key; + ulong sz; + ulong hash; /* use uint or ulong hash? 
*/ +}; + +typedef struct fd_snapshot_accv_map fd_snapshot_accv_map_t; + +#define MAP_NAME fd_snapshot_accv_map +#define MAP_T fd_snapshot_accv_map_t +#define MAP_LG_SLOT_CNT 23 /* 8.39 million */ +#define MAP_KEY_T fd_snapshot_accv_key_t +#define MAP_KEY_NULL fd_snapshot_accv_key_null +#define MAP_KEY_INVAL(k) ( ((k).slot==0UL) & ((k).id==0UL) ) +#define MAP_KEY_EQUAL(k0,k1) ( ((k0).slot==(k1).slot) & ((k0).id==(k1).id) ) +#define MAP_KEY_EQUAL_IS_SLOW 0 +#define MAP_HASH_T ulong +#define MAP_KEY_HASH(k0) fd_snapshot_accv_key_hash(k0) +#include "../../util/tmpl/fd_map.c" + +#define SNAP_FLAG_FAILED 1 +#define SNAP_FLAG_BLOCKED 2 +#define SNAP_FLAG_DONE 4 + +struct fd_snapin_tile { + uchar state; + uchar flags; + uchar manifest_done; + + /* Stream input */ + + uchar const * in_base; + ulong goff_translate; + ulong in_skip; + ulong const * in_sync; + + /* Frame buffer */ + + uchar * buf; + ulong buf_ctr; /* number of bytes allocated in buffer */ + ulong buf_sz; /* target buffer size (buf_ctrflags = SNAP_FLAG_DONE; + + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + FD_LOG_WARNING(( "Finished parsing snapshot" )); + + /* Send synchronization info */ + ulong volatile * out_sync = fd_mcache_seq_laddr( ctx->out_mcache->f ); + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[2] ) = 1; + FD_COMPILER_MFENCE(); + + for(;;) pause(); +} + +static void +fd_snapshot_restore_discard_buf( fd_snapin_tile_t * self ) { + self->buf_ctr = 0UL; + self->buf_sz = 0UL; +} + +static void * +fd_snapshot_restore_prepare_buf( fd_snapin_tile_t * self, + ulong sz ) { + self->buf_ctr = 0UL; + self->buf_sz = 0UL; + + fd_snapshot_restore_discard_buf( self ); + if( FD_UNLIKELY( sz > self->buf_max ) ) { + FD_LOG_WARNING(( "Alloc failed (need %lu bytes, have %lu)", sz, self->buf_max )); + self->state = SNAP_FLAG_FAILED; + return NULL; + } + + return self->buf; +} + +static int +fd_snapshot_expect_account_hdr( fd_snapin_tile_t * restore ) { + + ulong accv_sz = 
restore->accv_sz; + if( accv_sz < sizeof(fd_solana_account_hdr_t) ) { + if( FD_LIKELY( accv_sz==0UL ) ) { + restore->state = SNAP_STATE_ACCOUNT_HDR; + return 0; + } + FD_LOG_WARNING(( "encountered unexpected EOF while reading account header" )); + restore->flags |= SNAP_FLAG_FAILED; + return EINVAL; + } + + restore->state = SNAP_STATE_ACCOUNT_HDR; + restore->buf_ctr = 0UL; + restore->buf_sz = sizeof(fd_solana_account_hdr_t); + + return 0; +} + +static int +fd_snapshot_restore_accv_prepare( fd_snapin_tile_t * const restore, + fd_tar_meta_t const * const meta, + ulong const real_sz ) { + + if( FD_UNLIKELY( !fd_snapshot_restore_prepare_buf( restore, sizeof(fd_solana_account_hdr_t) ) ) ) { + FD_LOG_WARNING(( "Failed to allocate read buffer while restoring accounts from snapshot" )); + return ENOMEM; + } + + /* Parse file name */ + ulong id, slot; + if( FD_UNLIKELY( sscanf( meta->name, "accounts/%lu.%lu", &slot, &id )!=2 ) ) { + /* Ignore entire file if file name invalid */ + restore->state = SNAP_STATE_IGNORE; + return 0; + } + + /* Lookup account vec file size */ + fd_snapshot_accv_key_t key = { .slot = slot, .id = id }; + fd_snapshot_accv_map_t * rec = fd_snapshot_accv_map_query( restore->accv_map, key, NULL ); + if( FD_UNLIKELY( !rec ) ) { + /* Ignore account vec files that are not explicitly mentioned in the + manifest. 
*/ + FD_LOG_DEBUG(( "Ignoring %s (sz %lu)", meta->name, real_sz )); + restore->state = SNAP_STATE_IGNORE; + return 0; + } + ulong sz = rec->sz; + + /* Validate the supposed file size against real size */ + if( FD_UNLIKELY( sz > real_sz ) ) { + FD_LOG_WARNING(( "AppendVec %lu.%lu is %lu bytes long according to manifest, but actually only %lu bytes", + slot, id, sz, real_sz )); + restore->flags |= SNAP_FLAG_FAILED; + return EINVAL; + } + restore->accv_sz = sz; + restore->accv_slot = slot; + restore->accv_id = id; + + /* Prepare read of account header */ + FD_LOG_DEBUG(( "Loading account vec %s", meta->name )); + return fd_snapshot_expect_account_hdr( restore ); +} + + +/* fd_snapshot_restore_manifest_prepare prepares for consumption of the + snapshot manifest. */ + +static int +fd_snapshot_restore_manifest_prepare( fd_snapin_tile_t * restore, + ulong sz ) { + /* Only read once */ + if( restore->manifest_done ) { + FD_LOG_WARNING(( "Snapshot file contains multiple manifests" )); + restore->state = SNAP_STATE_IGNORE; + return 0; + } + + /* We don't support streaming manifest deserialization yet. Thus, + buffer the whole manifest in one place. */ + if( FD_UNLIKELY( !fd_snapshot_restore_prepare_buf( restore, sz ) ) ) { + restore->flags |= SNAP_FLAG_FAILED; + return ENOMEM; + } + + restore->state = SNAP_STATE_MANIFEST; + restore->buf_sz = sz; + + return 0; +} + +static void +restore_file( void * restore_, + fd_tar_meta_t const * meta, + ulong sz ) { + fd_snapin_tile_t * restore = restore_; + + restore->buf_ctr = 0UL; /* reset buffer */ + restore->state = SNAP_STATE_IGNORE; + + if( (sz==0UL) | (!fd_tar_meta_is_reg( meta )) ) return; + + /* Detect account vec files. These are files that contain a vector + of accounts in Solana Labs "AppendVec" format. 
*/ + assert( sizeof("accounts/")name, "accounts/", sizeof("accounts/")-1) ) { + if( FD_UNLIKELY( !restore->manifest_done ) ) { + FD_LOG_WARNING(( "Unsupported snapshot: encountered AppendVec before manifest" )); + restore->flags |= SNAP_FLAG_FAILED; + return; + } + fd_snapshot_restore_accv_prepare( restore, meta, sz ); + } else if( fd_memeq( meta->name, "snapshots/status_cache", sizeof("snapshots/status_cache") ) ) { + /* TODO */ + } else if(0==strncmp( meta->name, "snapshots/", sizeof("snapshots/")-1 ) ) { + fd_snapshot_restore_manifest_prepare( restore, sz ); + } + +} + +static uchar const * +snapshot_read_buffered( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + /* Should not be called if read is complete */ + FD_TEST( restore->buf_ctr < restore->buf_sz ); + + /* Determine number of bytes to buffer */ + ulong sz = restore->buf_sz - restore->buf_ctr; + if( sz>bufsz ) sz = bufsz; + + /* Append to buffer */ + fd_memcpy( restore->buf + restore->buf_ctr, buf, sz ); + restore->buf_ctr += sz; + + return buf+sz; +} + +FD_FN_PURE static inline int +snapshot_read_is_complete( fd_snapin_tile_t const * restore ) { + return restore->buf_ctr == restore->buf_sz; +} + +static int +snapshot_restore_account_hdr( fd_snapin_tile_t * restore ) { + fd_solana_account_hdr_t const * hdr = fd_type_pun_const( restore->buf ); + if( FD_UNLIKELY( hdr->meta.data_len > FD_ACC_SZ_MAX ) ) { + FD_LOG_ERR(( "account data size (%lu) exceeds max (%lu) (possible memory corruption?)", hdr->meta.data_len, FD_ACC_SZ_MAX )); + } + + ulong data_sz = hdr->meta.data_len; + restore->acc_sz = data_sz; + restore->acc_rem = data_sz; + restore->acc_pad = fd_ulong_align_up( data_sz, 8UL ) - data_sz; + + if( FD_UNLIKELY( data_sz>(10UL<<20) ) ) { + FD_LOG_ERR(( "Oversize account found (%lu bytes)", data_sz )); + } + + /* Next step */ + if( data_sz == 0UL ) { + return fd_snapshot_expect_account_hdr( restore ); + } + + restore->state = SNAP_STATE_ACCOUNT_DATA; + restore->buf_ctr = 0UL; + 
restore->buf_sz = 0UL; + return 0; +} + +static uchar const * +snapshot_read_account_hdr_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + if( !restore->accv_sz ) { + /* Reached end of AppendVec */ + restore->state = SNAP_STATE_IGNORE; + restore->buf_ctr = restore->buf_sz = 0UL; + return buf; + } + bufsz = fd_ulong_min( bufsz, restore->accv_sz ); + + int som = restore->buf_ctr == 0UL; + + ulong frag_goff = (ulong)buf - restore->goff_translate; + ulong frag_loff = (ulong)buf - (ulong)restore->in_base; + + uchar const * buf_next = snapshot_read_buffered( restore, buf, bufsz ); + ulong hdr_read = (ulong)(buf_next-buf); + restore->accv_sz -= hdr_read; + bufsz -= hdr_read; + + ulong peek_sz = 0UL; + if( FD_LIKELY( snapshot_read_is_complete( restore ) ) ) { + if( FD_UNLIKELY( 0!=snapshot_restore_account_hdr( restore ) ) ) { + return buf; /* parse error */ + } + peek_sz = fd_ulong_min( restore->acc_rem, bufsz ); + } + + int eom = bufsz >= restore->acc_rem; + + /* Publish header-only fragment or header+data fragment. + If data was included, skip ahead. (Combining header+data into the + same fragment reduces the amount of descriptor frags published.) 
*/ + + ulong const frag_sz = hdr_read + peek_sz; + fd_mcache_publish_stream( + restore->out_mcache, + restore->out_depth, + restore->out_seq, + frag_goff, + frag_loff, + frag_sz, + fd_frag_meta_ctl( 0UL, som, eom, 0 ) + ); + restore->out_seq = fd_seq_inc( restore->out_seq, 1UL ); + restore->out_cnt += !!som; + restore->acc_rem -= peek_sz; + restore->accv_sz -= peek_sz; + buf_next += peek_sz; + + return buf_next; +} + +static uchar const * +snapshot_read_account_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + + ulong chunk_sz = fd_ulong_min( restore->acc_rem, bufsz ); + if( FD_UNLIKELY( chunk_sz > restore->accv_sz ) ) + FD_LOG_CRIT(( "OOB account vec read: chunk_sz=%lu accv_sz=%lu", chunk_sz, restore->accv_sz )); + + if( FD_LIKELY( chunk_sz ) ) { + + int eom = restore->acc_rem == chunk_sz; + + fd_mcache_publish_stream( + restore->out_mcache, + restore->out_depth, + restore->out_seq, + (ulong)buf - restore->goff_translate, + (ulong)buf - (ulong)restore->in_base, + chunk_sz, + fd_frag_meta_ctl( 0UL, 0, eom, 0 ) + ); + + restore->out_seq = fd_seq_inc( restore->out_seq, 1UL ); + restore->acc_rem -= chunk_sz; + restore->accv_sz -= chunk_sz; + buf += chunk_sz; + bufsz -= chunk_sz; + + } + + if( restore->acc_rem == 0UL ) { + ulong pad_sz = fd_ulong_min( fd_ulong_min( restore->acc_pad, bufsz ), restore->accv_sz ); + buf += pad_sz; + bufsz -= pad_sz; + restore->acc_pad -= pad_sz; + restore->accv_sz -= pad_sz; + + if( restore->accv_sz == 0UL ) { + restore->state = SNAP_STATE_IGNORE; + return buf; + } + if( restore->acc_pad == 0UL ) { + return (0==fd_snapshot_expect_account_hdr( restore )) ? buf : NULL; + } + } + + return buf; +} + + +/* fd_snapshot_accv_index populates the index of account vecs. This + index will be used when loading accounts. Returns errno-compatible + error code. 
*/ + +static int +fd_snapshot_accv_index( fd_snapshot_accv_map_t * map, + fd_solana_accounts_db_fields_t const * fields ) { + + for( ulong i=0UL; i < fields->storages_len; i++ ) { + + fd_snapshot_slot_acc_vecs_t * slot = &fields->storages[ i ]; + + for( ulong j=0UL; j < slot->account_vecs_len; j++ ) { + fd_snapshot_acc_vec_t * accv = &slot->account_vecs[ j ]; + + /* Insert new AppendVec */ + fd_snapshot_accv_key_t key = { .slot = slot->slot, .id = accv->id }; + fd_snapshot_accv_map_t * rec = fd_snapshot_accv_map_insert( map, key ); + if( FD_UNLIKELY( !rec ) ) { + FD_LOG_WARNING(( "fd_snapshot_accv_map_insert failed" )); + return ENOMEM; + } + + /* Remember size */ + rec->sz = accv->file_sz; + } + + } + + return 0; +} + +/* snapshot_restore_manifest imports a snapshot manifest into the + given slot context. Also populates the accv index. Destroys the + existing bank structure. */ + +static void +snapshot_restore_manifest( fd_snapin_tile_t * restore ) { + + /* Decode manifest placing dynamic data structures onto slot context + heap. Once the epoch context heap is separated out, we need to + revisit this. + + This is horrible. Plenty of room for optimization, including: + - Streaming decoding + - Fixing the decoder (does 2 walks in decode_footprint, decode) + - Unpack directly into slot_ctx */ + + long dt = -fd_log_wallclock(); + + fd_bincode_decode_ctx_t decode = { + .data = restore->buf, + .dataend = restore->buf + restore->buf_sz + }; + + ulong total_sz = 0UL; + int err = fd_solana_manifest_decode_footprint( &decode, &total_sz ); + if( FD_UNLIKELY( err ) ) { + FD_LOG_ERR(( "fd_solana_manifest_decode_footprint failed (%d)", err )); + } + + uchar * scratch = (uchar *)fd_ulong_align_up( (ulong)decode.dataend, fd_solana_manifest_align() ); + ulong scratch_sz = (ulong)( restore->buf + restore->buf_max - scratch ); + if( FD_UNLIKELY( total_sz > scratch_sz ) ) { + FD_LOG_ERR(( "Cannot decode snapshot. 
Insufficient scratch buffer size (need %lu, have %lu bytes)", + (ulong)scratch + total_sz - (ulong)restore->buf, restore->buf_max )); + } + fd_solana_manifest_t * manifest = fd_solana_manifest_decode( scratch, &decode ); + + char acc_hash_cstr[ FD_BASE58_ENCODED_32_SZ ]; + fd_base58_encode_32( manifest->accounts_db.bank_hash_info.accounts_hash.uc, NULL, acc_hash_cstr ); + if( manifest->bank_incremental_snapshot_persistence ) { + FD_LOG_ERR(( "Incremental snapshots not yet supported TODO" )); + } else { + FD_LOG_NOTICE(( "Full snapshot acc_hash=%s", acc_hash_cstr )); + } + + dt += fd_log_wallclock(); + FD_LOG_NOTICE(( "Snapshot manifest decode took %.2g seconds", (double)dt/1e9 )); + + /* Move over accounts DB fields */ + + fd_solana_accounts_db_fields_t accounts_db = manifest->accounts_db; + fd_memset( &manifest->accounts_db, 0, sizeof(fd_solana_accounts_db_fields_t) ); + + /* Remember slot number */ + + //ulong slot = manifest->bank.slot; + + /* Copy objects into slot context */ + + //if( restore->cb_manifest ) { + // err = restore->cb_manifest( restore->cb_manifest_ctx, manifest, restore->spad ); + //} + + /* Read AccountVec map */ + + if( FD_LIKELY( !err ) ) { + err = fd_snapshot_accv_index( restore->accv_map, &accounts_db ); + } + + /* Discard buffer to reclaim heap space */ + + fd_snapshot_restore_discard_buf( restore ); + + restore->manifest_done = 1; +} + +/* snapshot_read_manifest_chunk reads partial manifest content. 
*/ + +static uchar const * +snapshot_read_manifest_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + uchar const * end = snapshot_read_buffered( restore, buf, bufsz ); + if( snapshot_read_is_complete( restore ) ) { + snapshot_restore_manifest( restore ); + restore->state = SNAP_STATE_IGNORE; + } + return end; +} + +static ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_snapin_tile_t), fd_snapshot_accv_map_align() ); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); + l = FD_LAYOUT_APPEND( l, 16UL, tile->snapin.scratch_sz ); + return l; +} + +static fd_snapin_tile_t * +scratch_init( void * mem, + fd_topo_tile_t const * tile ) { + if( FD_UNLIKELY( !mem ) ) return NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, scratch_align() ) ) ) return NULL; + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_snapin_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); + void * accv_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); + void * scratch_mem = FD_SCRATCH_ALLOC_APPEND( l, 16UL, tile->snapin.scratch_sz ); + + fd_memset( ctx, 0, sizeof(fd_snapin_tile_t) ); + ctx->accv_map = fd_snapshot_accv_map_join( fd_snapshot_accv_map_new( accv_map_mem ) ); + FD_TEST( ctx->accv_map ); + ctx->buf = scratch_mem; + + return ctx; +} + +FD_FN_UNUSED static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `" NAME "` tile" )); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, 
expected 1", tile->out_cnt )); + /* FIXME check link names */ + + if( FD_UNLIKELY( !tile->snapin.scratch_sz ) ) FD_LOG_ERR(( "scratch_sz param not set" )); + + fd_snapin_tile_t * ctx = scratch_init( fd_topo_obj_laddr( topo, tile->tile_obj_id ), tile ); + if( FD_UNLIKELY( !ctx ) ) FD_LOG_ERR(( "scratch_init failed" )); + + /* Init state */ + + ctx->state = SNAP_STATE_TAR; + ctx->flags = 0; + ctx->manifest_done = 0; + + /* Join stream input */ + + FD_TEST( fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ) ); + ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->in_skip = 0UL; + ctx->in_sync = fd_mcache_seq_laddr_const( topo->links[ tile->in_link_id[ 0 ] ].mcache ); + + /* Join frame buffer */ + + ctx->buf_sz = 0UL; + ctx->buf_ctr = 0UL; + ctx->buf_max = tile->snapin.scratch_sz; + + /* Join account output */ + + ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); + ctx->out_seq_max = 0UL; + ctx->out_seq = 0UL; + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); + +} + +static void +during_housekeeping( fd_snapin_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_snapin_tile_t * ctx ) { + (void)ctx; +} + +static void +tar_process_hdr( fd_snapin_tile_t * reader, + uchar const * cur ) { + + fd_tar_meta_t const * hdr = (fd_tar_meta_t const *)reader->buf; + + /* "ustar\x00" and "ustar \x00" (overlaps with version) are both + valid values for magic. These are POSIX ustar and OLDGNU versions + respectively. */ + if( FD_UNLIKELY( 0!=memcmp( hdr->magic, FD_TAR_MAGIC, 5UL ) ) ) { + + /* Detect EOF. A TAR EOF is marked by 1024 bytes of zeros. + We abort after 512 bytes. 
*/ + int not_zero=0; + for( ulong i=0UL; ibuf[ i ]; + if( !not_zero ) { + cur += sizeof(fd_tar_meta_t); + fd_snapin_shutdown( reader ); + return; + } + /* Not an EOF, so must be a protocol error */ + ulong goff = (ulong)cur - reader->goff_translate - sizeof(fd_tar_meta_t); + FD_LOG_WARNING(( "Invalid tar header magic at goff=0x%lx", goff )); + FD_LOG_HEXDUMP_WARNING(( "Tar header", hdr, sizeof(fd_tar_meta_t) )); + reader->flags |= SNAP_FLAG_FAILED; + return; + } + + ulong file_sz = fd_tar_meta_get_size( hdr ); + if( FD_UNLIKELY( file_sz==ULONG_MAX ) ) { + FD_LOG_WARNING(( "Failed to parse file size in tar header" )); + reader->flags |= SNAP_FLAG_FAILED; + return; + } + reader->tar_file_rem = file_sz; + reader->buf_ctr = (ushort)0U; + + /* Call back to recipient */ + restore_file( reader, hdr, file_sz ); +} + +static uchar const * +tar_read_hdr( fd_snapin_tile_t * reader, + uchar const * cur, + ulong bufsz ) { + uchar const * end = cur+bufsz; + + /* Skip padding */ + if( reader->buf_ctr==0UL ) { + ulong goff = (ulong)cur - reader->goff_translate; + ulong pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; + pad_sz = fd_ulong_min( pad_sz, (ulong)( end-cur ) ); + cur += pad_sz; + } + + /* Determine number of bytes to read */ + long chunk_sz = (long)sizeof(fd_tar_meta_t) - (long)reader->buf_ctr; + FD_TEST( chunk_sz>=0L ); + if( end-cur < chunk_sz ) chunk_sz = end-cur; + + /* Copy to header */ + fd_memcpy( reader->buf + reader->buf_ctr, cur, (ulong)chunk_sz ); + cur += chunk_sz; + reader->buf_ctr += (ulong)chunk_sz; + + /* Handle complete header */ + if( FD_LIKELY( reader->buf_ctr == sizeof(fd_tar_meta_t) ) ) { + tar_process_hdr( reader, cur ); + } + + return cur; +} + +static uchar const * +snapshot_read_discard( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + ulong avail = fd_ulong_min( bufsz, restore->tar_file_rem ); + return buf + avail; +} + +static uchar const * +restore_chunk1( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { 
+ if( FD_UNLIKELY( restore->state==SNAP_STATE_TAR ) ) { + return tar_read_hdr( restore, buf, bufsz ); + } + bufsz = fd_ulong_min( bufsz, restore->tar_file_rem ); + + uchar const * buf_next = NULL; + switch( restore->state ) { + case SNAP_STATE_IGNORE: + buf_next = snapshot_read_discard ( restore, buf, bufsz ); + break; + case SNAP_STATE_MANIFEST: + buf_next = snapshot_read_manifest_chunk ( restore, buf, bufsz ); + break; + case SNAP_STATE_ACCOUNT_HDR: + buf_next = snapshot_read_account_hdr_chunk( restore, buf, bufsz ); + break; + case SNAP_STATE_ACCOUNT_DATA: + buf_next = snapshot_read_account_chunk ( restore, buf, bufsz ); + break; + default: + FD_LOG_ERR(( "Invalid parser state %u (this is a bug)", restore->state )); + } + + ulong consumed = (ulong)buf_next - (ulong)buf; + if( FD_UNLIKELY( consumed>bufsz ) ) FD_LOG_CRIT(( "Buffer overflow (consumed=%lu bufsz=%lu)", consumed, bufsz )); + restore->tar_file_rem -= consumed; + if( restore->tar_file_rem==0UL ) { + restore->buf_ctr = 0UL; + restore->buf_sz = 0UL; + restore->state = SNAP_STATE_TAR; + } + return buf_next; +} + +/* on_stream_frag consumes an incoming stream data fragment. This frag + may be up to the dcache size (e.g. 8 MiB), therefore could contain + thousands of accounts. This function will publish a message for each + account to consumers. Slow consumers may cause backpressure and + force this function to exit early (before all accounts in this frag + were published). In that case, this function is called repeatedly + once the backpressure condition resolves (see in_skip). 
*/

static int
on_stream_frag( fd_snapin_tile_t *            ctx,
                fd_snapin_in_t *              in,
                fd_stream_frag_meta_t const * frag,
                ulong *                       read_sz ) {

  /* Any sticky flag short-circuits parsing:
     - failed: fatal
     - done:   swallow the whole frag
     - other:  leave the frag unconsumed */

  if( FD_UNLIKELY( ctx->flags ) ) {
    if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) FD_LOG_ERR(( "Failed to restore snapshot" ));
    if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_DONE ) ) {
      *read_sz = frag->sz;
      return 1;
    }
    return 0;
  }

  (void)in;

  /* Locate the frag payload and resume after the bytes that were
     already parsed by a previous (backpressured) call. */

  uchar const * const frag_begin   = ctx->in_base + frag->loff;
  uchar const * const frag_end     = frag_begin + frag->sz;
  uchar const * const resume_point = frag_begin + ctx->in_skip;

  ctx->goff_translate = (ulong)frag_begin - frag->goff;

  int           frag_done = 1;
  uchar const * pos       = resume_point;
  while( FD_LIKELY( pos<frag_end ) ) {

    pos = restore_chunk1( ctx, pos, (ulong)( frag_end-pos ) );

    if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) {
      FD_LOG_ERR(( "Failed to restore snapshot" ));
    }

    if( FD_UNLIKELY( fd_seq_ge( ctx->out_seq, ctx->out_seq_max ) ) ) {
      /* Out of output credits: remember how far we got and retry
         this frag later. */
      frag_done     = 0;
      ctx->in_skip += (ulong)(uint)( pos-resume_point );
      break;
    }

  }

  /* Frag fully parsed: the next frag starts from its beginning */
  if( frag_done ) ctx->in_skip = 0U;

  ulong const parsed_sz = (ulong)( pos-resume_point );
  in->goff += parsed_sz;
  *read_sz  = parsed_sz;
  return frag_done;
}

/* fd_snapin_in_update gets called periodically synchronize flow control
   credits back to the stream producer.  Also updates link in metrics.
*/ + +static void +fd_snapin_in_update( fd_snapin_tile_t * ctx, + fd_snapin_in_t * in, + ulong const * restrict cons_seq ) { + int const downstream_active = !!ctx->manifest_done; + ulong const downstream_seq = cons_seq[ 0 ]; + ulong const downstream_goff = cons_seq[ 1 ]; + + /* Defend against buggy consumer */ + if( FD_UNLIKELY( fd_seq_gt( downstream_seq, ctx->out_seq ) | + fd_seq_gt( downstream_goff, in->goff ) ) ) { + FD_LOG_CRIT(( "Consumer skipped ahead of me: self=(%lu,%lu) consumer=(%lu,%lu)", + ctx->out_seq, in->goff, + downstream_seq, downstream_goff )); + } + + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + if( !downstream_active ) { + /* Initially, just send this tile's progress */ + FD_VOLATILE( in->fseq[1] ) = in->goff; + } else { + /* Once downstream tiles are active, forward backpressure signals */ + FD_VOLATILE( in->fseq[1] ) = downstream_goff; + } + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noinline)) static void +fd_snapin_run1( + fd_snapin_tile_t * ctx, + ulong in_cnt, + fd_snapin_in_t * in, /* [in_cnt] */ + ulong out_cnt, + fd_frag_meta_t ** out_mcache, /* [out_cnt] */ + ulong * out_depth, /* [out_cnt] */ + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong ** cons_fseq, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [2*cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* out flow control state */ + ulong 
cr_avail; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + FD_TEST( in_cnt==1 ); + + /* out frag stream init */ + + cr_avail = 0UL; + + ulong const burst = BURST; + + ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); + + for( ulong out_idx=0UL; out_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. */ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_snapin_in_update( ctx, &in[ in_idx ], cons_seq ); + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Send synchronization info */ + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[1] ) = ctx->out_cnt; + FD_COMPILER_MFENCE(); + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + if( FD_LIKELY( cr_availout_seq, cons_seq[ cons_idx ] ), 0L ), 0L ); + slowest_cons = fd_ulong_if( cons_cr_availout_seq_max = ctx->out_seq + cr_avail; + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + 
FD_COMPILER_MFENCE(); + } + } + + during_housekeeping( ctx ); + + } + + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. */ + + event_seq++; + if( FD_UNLIKELY( event_seq>=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_snapin_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. */ + + if( FD_UNLIKELY( cr_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_stream_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + 
continue; + } + + FD_COMPILER_MFENCE(); + fd_stream_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + ulong sz = 0U; + + ulong const out_seq0 = ctx->out_seq; + int consumed_frag = on_stream_frag( ctx, this_in, &meta, &sz ); + ulong const out_seq1 = ctx->out_seq; + ulong const frags_published = out_seq1-out_seq0; + if( FD_UNLIKELY( frags_published>cr_avail ) ) FD_LOG_CRIT(( "frags_published (%lu) > cr_avail (%lu)", frags_published, cr_avail )); + cr_avail -= frags_published; + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz; + + if( FD_LIKELY( consumed_frag ) ) { + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu", in_seq )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + ctx->seq = this_in->seq; + + } + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +FD_FN_UNUSED static void +fd_snapin_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_frag_meta_t * out_mcache[ tile->out_cnt ]; + ulong out_depth [ 
tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; + FD_TEST( out_mcache[ i ] ); + out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); + } + + ulong reliable_cons_cnt = 0UL; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->goff = 0UL; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_snapin_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+reliable_cons_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_snapin_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn = { + .name = "SnapIn", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_snapin_run, +}; +#endif + +#undef LINK_IN_MAX +#undef BURST diff --git 
a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c new file mode 100644 index 0000000000..a22b9e061b --- /dev/null +++ b/src/discof/restore/fd_unzstd_tile.c @@ -0,0 +1,198 @@ +#include "../../disco/topo/fd_topo.h" +#include "../../ballet/zstd/fd_zstd.h" +#include "fd_restore_base.h" +#include "stream/fd_stream_ctx.h" +#include "stream/fd_stream_writer.h" +#include /* pause */ + +#define NAME "unzstd" +#define ZSTD_WINDOW_SZ (33554432UL) +#define ZSTD_FRAME_SZ 16384UL +#define LINK_IN_MAX 1 + +struct fd_unzstd_tile { + fd_stream_frag_meta_ctx_t in_state; /* input mcache context */ + fd_zstd_dstream_t * dstream; /* zstd decompress reader */ + fd_stream_writer_t * writer; /* stream writer object */ +}; +typedef struct fd_unzstd_tile fd_unzstd_tile_t; + +FD_FN_PURE static ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_unzstd_tile_t), fd_zstd_dstream_align() ); +} + +FD_FN_PURE static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); + return FD_LAYOUT_FINI( l, scratch_align() ); +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + + fd_unzstd_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); + void * zstd_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); + + void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] 
].dcache_obj_id ) ); + FD_TEST( out_dcache ); + + fd_memset( ctx, 0, sizeof(fd_unzstd_tile_t) ); + + ctx->in_state.in_buf = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->dstream = fd_zstd_dstream_new( zstd_mem, ZSTD_WINDOW_SZ ); + + fd_zstd_dstream_reset( ctx->dstream ); +} + +static void +fd_unzstd_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + + /* There's only one writer */ + ctx->writer = fd_stream_writer_join( stream_ctx->writers[0] ); + FD_TEST( ctx->writer ); + fd_stream_writer_set_frag_sz_max( ctx->writer, ZSTD_FRAME_SZ ); +} + +__attribute__((noreturn)) static void +fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + fd_stream_writer_close( ctx->writer ); + FD_COMPILER_MFENCE(); + + for(;;) pause(); +} + +static void +fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, + fd_unzstd_tile_t * ctx ) { + ulong const volatile * in_sync = stream_ctx->in_ptrs[ 0 ]->in_sync; + if( FD_LIKELY( !FD_VOLATILE_CONST( in_sync[ 2 ] ) ) ) return; + + FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", + FD_VOLATILE_CONST( in_sync[ 0 ] ), stream_ctx->in[0].base.seq )); + fd_unzstd_shutdown( ctx ); +} + +static void +during_housekeeping( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + fd_unzstd_poll_shutdown( stream_ctx, ctx ); +} + +static int +on_stream_frag( void * _ctx, + fd_stream_reader_t * reader FD_PARAM_UNUSED, + fd_stream_frag_meta_t const * frag, + ulong * sz ) { + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + + /* Input */ + uchar const * in_chunk0 = ctx->in_state.in_buf + frag->loff; + uchar const * in_chunk_start = in_chunk0 + ctx->in_state.in_skip; + uchar const * in_chunk_end = in_chunk0 + frag->sz; + uchar const * in_cur = in_chunk_start; + int in_consume = 0; + + /* Output */ + uchar * const out = fd_stream_writer_prepare( ctx->writer ); + uchar * const out_end = out + fd_stream_writer_publish_sz_max( ctx->writer ); + uchar * out_cur = out; + + while( out_curin_state.in_skip = 0UL; + in_consume = 1; + break; + } + + /* fd_zstd_dstream_read updates chunk_start and out */ + int zstd_err = fd_zstd_dstream_read( ctx->dstream, &in_cur, in_chunk_end, &out_cur, out_end, NULL ); + if( FD_UNLIKELY( zstd_err>0 ) ) { + FD_LOG_ERR(( "fd_zstd_dstream_read failed" )); + break; + } + + /* accumulate consumed bytes */ + ulong consumed_sz = (ulong)in_cur - (ulong)in_prev; + ctx->in_state.in_skip += consumed_sz; + } + + fd_stream_writer_publish( ctx->writer, (ulong)out_cur-(ulong)out, 0UL ); + + *sz = (ulong)in_cur - (ulong)in_chunk_start; + return in_consume; +} + +static void +fd_unzstd_in_update( fd_stream_reader_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->base.fseq[0] ) = in->base.seq; + FD_VOLATILE( in->base.fseq[1] ) = in->goff; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->base.idx ); + + uint * accum = in->base.accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; 
ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noinline)) static void +fd_unzstd_run1( + fd_unzstd_tile_t * ctx, + fd_stream_ctx_t * stream_ctx ) { + + FD_LOG_INFO(( "Running unzstd tile" )); + + fd_stream_ctx_run( stream_ctx, + ctx, + fd_unzstd_init_from_stream_ctx, + fd_unzstd_in_update, + during_housekeeping, + NULL, + NULL, + on_stream_frag ); +} + +static void +fd_unzstd_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_unzstd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + void * ctx_mem = fd_alloca_check( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_footprint( topo, tile ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile ); + FD_TEST( stream_ctx ); + fd_unzstd_run1( ctx, stream_ctx ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd = { + .name = "Unzstd", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_unzstd_run, +}; +#endif + +#undef NAME diff --git a/src/discof/restore/stream/fd_event_map.c b/src/discof/restore/stream/fd_event_map.c new file mode 100644 index 0000000000..9eddf5f0c5 --- /dev/null +++ b/src/discof/restore/stream/fd_event_map.c @@ -0,0 +1,29 @@ +#include "fd_event_map.h" + +fd_event_map_t * +fd_event_map_new( void * mem, + ulong in_cnt, + ulong out_cnt ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_event_map_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_event_map_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); + + ulong 
event_cnt = 1UL + in_cnt + out_cnt; + self->event_map = FD_SCRATCH_ALLOC_APPEND( l, alignof(ushort), sizeof(ushort)*event_cnt ); + self->event_cnt = event_cnt; + self->event_seq = 0UL; + + /* init event map */ + fd_event_map_init(self, in_cnt, out_cnt ); + + return self; +} diff --git a/src/discof/restore/stream/fd_event_map.h b/src/discof/restore/stream/fd_event_map.h new file mode 100644 index 0000000000..13b3fc0d08 --- /dev/null +++ b/src/discof/restore/stream/fd_event_map.h @@ -0,0 +1,99 @@ +#ifndef HEADER_fd_src_discof_restore_fd_event_map_h +#define HEADER_fd_src_discof_restore_fd_event_map_h + +#include "../../../util/fd_util_base.h" +#include "../../../util/bits/fd_bits.h" +#include "../../../util/rng/fd_rng.h" +#include "fd_stream_reader.h" + +struct fd_event_map { + ulong event_cnt; + ulong event_seq; + ushort * event_map; +}; +typedef struct fd_event_map fd_event_map_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_event_map_align( void ) { + return alignof(fd_event_map_t); +} + +FD_FN_CONST static inline ulong +fd_event_map_footprint( ulong in_cnt, + ulong out_cnt ) { + ulong event_cnt = 1UL + in_cnt + out_cnt; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND(l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); + l = FD_LAYOUT_APPEND(l, alignof(ushort), sizeof(ushort)*event_cnt ); + return FD_LAYOUT_FINI( l, fd_event_map_align() ); +} + +fd_event_map_t * +fd_event_map_new( void * mem, + ulong in_cnt, + ulong out_cnt ); + +static inline void +fd_event_map_init( fd_event_map_t * map, + ulong in_cnt, + ulong out_cnt ) { + ulong idx = 0UL; + map->event_map[ idx++ ] = (ushort)out_cnt; + for( ulong in_idx=0UL; in_idxevent_map[ idx++ ] = (ushort)(in_idx+out_cnt+1UL); + for( ulong cons_idx=0UL; cons_idxevent_map[ idx++ ] = (ushort)cons_idx; +} + +static inline ushort +fd_event_map_get_event( fd_event_map_t * map ) { + return map->event_map[ map->event_seq ]; +} + +static inline void +fd_event_map_randomize( fd_event_map_t * map, + fd_rng_t * 
rng ) { + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)map->event_cnt ); + ushort map_tmp = map->event_map[ swap_idx ]; + map->event_map[ swap_idx ] = map->event_map[ 0 ]; + map->event_map[ 0 ] = map_tmp; +} + +static inline void +fd_event_map_randomize_inputs( void ** in, + ulong in_cnt, + fd_rng_t * rng ) { + if( FD_LIKELY( in_cnt>1UL ) ) { + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + void * in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } +} + +static inline void +fd_event_map_advance( fd_event_map_t * map, + fd_rng_t * rng, + void ** in, + ulong in_cnt ) { + map->event_seq++; + if( FD_UNLIKELY( map->event_seq>=map->event_cnt) ) { + map->event_seq = 0UL; + + fd_event_map_randomize( map, rng ); + + fd_event_map_randomize_inputs( in, in_cnt, rng ); + } +} + +static inline void * +fd_event_map_delete( fd_event_map_t * map ) { + fd_memset(map, 0, sizeof(fd_event_map_t) ); + return (void *)map; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_event_map_h */ diff --git a/src/discof/restore/stream/fd_frag_reader.h b/src/discof/restore/stream/fd_frag_reader.h new file mode 100644 index 0000000000..22b89f5640 --- /dev/null +++ b/src/discof/restore/stream/fd_frag_reader.h @@ -0,0 +1,124 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_frag_reader_h +#define HEADER_fd_src_discof_restore_stream_fd_frag_reader_h + +#include "../../../disco/stem/fd_stem.h" +#include "../../../disco/metrics/fd_metrics.h" + +struct __attribute__((aligned(64))) fd_frag_reader { + fd_frag_meta_t const * mcache; /* local join to this in's mcache */ + uint depth; /* == fd_mcache_depth( mcache ), depth of this in's cache (const) */ + uint idx; /* index of this in in the list of providers, [0, in_cnt) */ + ulong seq; /* sequence number of next frag expected from the upstream producer, + updated when frag from this in is published */ + fd_frag_meta_t const * mline; /* == mcache + fd_mcache_line_idx( seq, depth ), location 
to poll next */ + ulong * fseq; /* local join to the fseq used to return flow control credits to the in */ + uint accum[6]; /* local diagnostic accumulators. These are drained during in housekeeping. */ + /* Assumes FD_FSEQ_DIAG_{PUB_CNT,PUB_SZ,FILT_CNT,FILT_SZ,OVRNP_CNT,OVRNP_FRAG_CNT} are 0:5 */ +}; +typedef struct fd_frag_reader fd_frag_reader_t; + +struct fd_frag_reader_consume_ctx { + ulong seq_found; /* the seq num at the current mline */ + ulong seq_curr; /* the seq num in the stream reader */ + fd_frag_meta_t const * mline; /* current mline being consumed */ + ulong in_idx; /* link idx being polled */ +}; +typedef struct fd_frag_reader_consume_ctx fd_frag_reader_consume_ctx_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_frag_reader_align( void ) { + return alignof(fd_frag_reader_t); +} + +FD_FN_CONST static inline ulong +fd_frag_reader_footprint( void ) { + return sizeof(fd_frag_reader_t); +} + +static inline void +fd_frag_reader_init( fd_frag_reader_t * reader, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + reader->mcache = mcache; + reader->fseq = fseq; + ulong depth = fd_mcache_depth( reader->mcache ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", in_idx )); + reader->depth = (uint)depth; + reader->idx = (uint)in_idx; + reader->seq = 0UL; + reader->mline = reader->mcache + fd_mcache_line_idx( reader->seq, reader->depth ); + + reader->accum[0] = 0U; reader->accum[1] = 0U; reader->accum[2] = 0U; + reader->accum[3] = 0U; reader->accum[4] = 0U; reader->accum[5] = 0U; +} + +static inline fd_frag_reader_t * +fd_frag_reader_new( void * mem, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_frag_reader_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_frag_reader_t * 
self = (fd_frag_reader_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_frag_reader_t), sizeof(fd_frag_reader_t) ); + + fd_frag_reader_init( self, mcache, fseq, in_idx ); + return self; +} + +static inline long +fd_frag_reader_poll_frag( fd_frag_reader_t * reader, + ulong in_idx, + fd_frag_reader_consume_ctx_t * ctx ) { + ctx->seq_curr = reader->seq; + ctx->mline = reader->mline; + ctx->in_idx = in_idx; + ctx->seq_found = fd_frag_meta_seq_query( ctx->mline ); + return fd_seq_diff( ctx->seq_curr, ctx->seq_found ); +} + +static inline void +fd_frag_reader_process_overrun( fd_frag_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + long seq_diff ) { + reader->seq = ctx->seq_curr; + reader->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + reader->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-seq_diff); +} + +static inline void +fd_frag_reader_consume_frag( fd_frag_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx ) { + /* check for overrun: when sequence number has changed */ + ulong seq_test = fd_frag_meta_seq_query( ctx->mline ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, ctx->seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu", ctx->in_idx )); + } + + /* wind up for next in poll and accumulate diagnostics */ + ctx->seq_curr = fd_seq_inc( ctx->seq_curr, 1UL ); + reader->seq = ctx->seq_curr; + reader->mline = reader->mcache + fd_mcache_line_idx( ctx->seq_curr, reader->depth ); + reader->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; +} + +static inline void * +fd_frag_reader_delete( fd_frag_reader_t * reader ) { + fd_memset( reader, 0, sizeof(fd_frag_reader_t) ); + return (void *)reader; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_frag_reader_h */ diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c new file mode 100644 index 0000000000..8743a7bb61 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -0,0 
+1,102 @@ +#include "fd_stream_ctx.h" +#include "fd_stream_writer.h" + +FD_FN_PURE ulong +fd_stream_ctx_align( void ) { + return 128UL; +} + +ulong +fd_stream_ctx_footprint( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + ulong const in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong const out_cnt = tile->out_cnt; + + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_event_map_t), fd_event_map_footprint( in_cnt, out_cnt ) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_writer_t *), out_cnt*sizeof(fd_stream_writer_t *) ); + for( ulong i=0UL; ilinks[ tile->out_link_id[ i ] ]; + ulong writer_fp = fd_stream_writer_footprint( fd_topo_link_reliable_consumer_cnt( topo, link ) ); + FD_TEST( writer_fp ); + l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), writer_fp ); + } + return FD_LAYOUT_FINI( l, fd_stream_ctx_align() ); +} + +fd_stream_ctx_t * +fd_stream_ctx_new( void * mem, + fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_ctx_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_ctx_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + fd_memset( self, 0, sizeof(fd_stream_ctx_t) ); + + ulong const in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong const out_cnt = tile->out_cnt; + + self->in = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); + self->in_ptrs = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); + 
void * event_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, out_cnt ) ); + self->writers = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_writer_t *), out_cnt*sizeof(fd_stream_writer_t *) ); + + for( ulong i=0UL; ilinks[ tile->out_link_id[ i ] ]; + ulong const cons_cnt = fd_topo_link_reliable_consumer_cnt( topo, link ); + void * writer = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( cons_cnt ) ); + + self->writers[ i ] = fd_stream_writer_new_topo( + writer, + fd_topo_link_reliable_consumer_cnt( topo, link ), + topo, + tile, + i + ); + if( FD_UNLIKELY( !self->writers[ i ] ) ) return NULL; /* logs warning */ + } + + self->in_cnt = in_cnt; + self->out_cnt = out_cnt; + + self->event_map = fd_event_map_new( event_map_mem, in_cnt, out_cnt ); + self->in_seq = 0UL; + + /* init in */ + ulong in_idx = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + fd_stream_reader_init( &self->in[ in_idx ], + fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ), + tile->in_link_fseq[ i ], + in_idx ); + in_idx++; + } + + /* init in_ptrs */ + for( ulong i=0UL; iin_cnt; i++ ) { + self->in_ptrs[ i ] = &self->in[ i ]; + } + + fd_stream_ticks_init( self->ticks, self->event_map->event_cnt, 1e3L ); + fd_stream_metrics_init( self->metrics ); + + /* FIXME: rng seed should not be 0 */ + FD_TEST( fd_rng_join( fd_rng_new( self->rng, 0, 0UL ) ) ); + + FD_SCRATCH_ALLOC_FINI( l, fd_stream_ctx_align() ); + return self; +} diff --git a/src/discof/restore/stream/fd_stream_ctx.h b/src/discof/restore/stream/fd_stream_ctx.h new file mode 100644 index 0000000000..ce46d547a4 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ctx.h @@ -0,0 +1,327 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h + +#include "../../../disco/topo/fd_topo.h" +#include "fd_stream_reader.h" +#include 
"fd_stream_writer.h" +#include "fd_event_map.h" +#include "fd_stream_ticks.h" +#include "fd_stream_metrics.h" + +struct fd_stream_ctx; +typedef struct fd_stream_ctx fd_stream_ctx_t; + +typedef void +(* fd_tile_ctx_init_run_loop_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx ); + +typedef void +(* fd_tile_update_in_fn_t)( fd_stream_reader_t * reader ); + +typedef void +(* fd_tile_housekeeping_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx ); + +typedef void +(* fd_tile_metrics_write_fn_t)( void * ctx ); + +typedef void +(* fd_tile_run_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in ); + +typedef int +(* fd_tile_on_stream_frag_fn_t)( void * ctx, + fd_stream_reader_t * reader, + fd_stream_frag_meta_t const * frag, + ulong * sz ); + +struct fd_stream_ctx { + fd_stream_reader_t * in; + fd_stream_reader_t ** in_ptrs; + fd_event_map_t * event_map; + ulong in_cnt; + ulong out_cnt; + ulong in_seq; + fd_rng_t rng[1]; + fd_stream_ticks_t ticks[1]; + fd_stream_metrics_t metrics[1]; + fd_stream_writer_t ** writers; + fd_tile_update_in_fn_t tile_update_in; + fd_tile_housekeeping_fn_t tile_housekeeping; + fd_tile_metrics_write_fn_t tile_metrics_write; + fd_tile_run_fn_t tile_run; + fd_tile_on_stream_frag_fn_t tile_on_stream_frag; +}; +typedef struct fd_stream_ctx fd_stream_ctx_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_PURE ulong +fd_stream_ctx_align( void ); + +ulong +fd_stream_ctx_footprint( fd_topo_t const * topo, + fd_topo_tile_t const * tile ); + +fd_stream_ctx_t * +fd_stream_ctx_new( void * mem, + fd_topo_t const * topo, + fd_topo_tile_t const * tile ); + +static inline void +fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t tile_init_run_loop, + fd_tile_update_in_fn_t tile_update_in, + fd_tile_housekeeping_fn_t tile_housekeeping, + fd_tile_metrics_write_fn_t tile_metrics_write, + fd_tile_run_fn_t tile_run, + fd_tile_on_stream_frag_fn_t tile_on_stream_frag ) { + if( ctx->in_cnt && !tile_update_in ) { 
+ FD_LOG_ERR(( "tile_update_in function cannot be null if there are producers to this tile!" )); + } + + if( ctx->in_cnt && !tile_on_stream_frag ) { + FD_LOG_ERR(( "tile_on_stream_frag function cannot be null if there are producers to this tile!" )); + } + + ctx->tile_update_in = tile_update_in; + ctx->tile_housekeeping = tile_housekeeping; + ctx->tile_metrics_write = tile_metrics_write; + ctx->tile_run = tile_run; + ctx->tile_on_stream_frag = tile_on_stream_frag; + + FD_MGAUGE_SET( TILE, STATUS, 1UL ); + fd_stream_ticks_init_timer( ctx->ticks ); + + if( tile_init_run_loop ) { + tile_init_run_loop( tile_ctx, ctx ); + } +} + +static inline void +fd_stream_ctx_calculate_backpressure( fd_stream_ctx_t * ctx ) { + /* Recalculate flow control credits */ + for( ulong i=0UL; iout_cnt; i++ ) { + fd_stream_writer_calculate_backpressure( ctx->writers[i] ); + } +} + +static inline void +fd_stream_ctx_housekeeping_advance( fd_stream_ctx_t * ctx ) { + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. */ + fd_event_map_advance( ctx->event_map, + ctx->rng, + (void **)ctx->in_ptrs, + ctx->in_cnt ); + + /* Reload housekeeping timer */ + fd_stream_ticks_reload_housekeeping( ctx->ticks, + ctx->rng); +} + +static inline void +fd_stream_ctx_do_housekeeping( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + if( FD_UNLIKELY( fd_stream_ticks_is_housekeeping_time( ctx->ticks ) ) ) { + ulong event_idx = fd_event_map_get_event( ctx->event_map ); + + if( FD_LIKELY( event_idxout_cnt ) ) { /* receive credits */ + ulong out_idx = event_idx; + + /* Receive flow control credits from this out. 
*/ + fd_stream_writer_receive_flow_control_credits( ctx->writers[ out_idx ] ); + + } else if( event_idx>ctx->out_cnt) { /* send credits */ + ulong in_idx = event_idx - ctx->out_cnt - 1UL; + ctx->tile_update_in( &ctx->in[ in_idx ] ); + + } else { /* event_idx==out_cnt, housekeeping event */ + + /* Update metrics counters to external viewers */ + fd_stream_metrics_update_external( ctx->metrics, + ctx->ticks->now, + ctx->tile_metrics_write, + ctx ); + fd_stream_ctx_calculate_backpressure( ctx ); + + if( ctx->tile_housekeeping ) { + ctx->tile_housekeeping( tile_ctx, ctx ); + } + } + + fd_stream_ctx_housekeeping_advance( ctx ); + } +} + +static inline void +fd_stream_ctx_process_backpressure( fd_stream_ctx_t * ctx ) { + ctx->metrics->backp_cnt += (ulong)!ctx->metrics->in_backp; + ctx->metrics->in_backp = 1UL; + FD_SPIN_PAUSE(); + ctx->metrics->regime_ticks[2] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[5] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline int +fd_stream_ctx_is_backpressured( fd_stream_ctx_t * ctx ) { + int backpressured = 1UL; + for( ulong i=0UL; iout_cnt; i++ ) { + backpressured &= !fd_stream_writer_publish_sz_max( ctx->writers[i] ); + } + return backpressured; +} + +static inline void +fd_stream_ctx_advance_poll_empty( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[0] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[3] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_advance_poll( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[1] += ctx->ticks->housekeeping_ticks; + ctx->metrics->regime_ticks[4] += ctx->ticks->prefrag_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[7] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_advance_poll_idle( fd_stream_ctx_t * ctx ) { + 
ctx->metrics->regime_ticks[0] += ctx->ticks->housekeeping_ticks; + ctx->metrics->regime_ticks[3] += ctx->ticks->prefrag_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[6] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_advance_skip_poll( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[1] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[4] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_poll( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + ctx->metrics->in_backp = 0UL; + + if( FD_UNLIKELY( !ctx->in_cnt ) ) { + fd_stream_ctx_advance_poll_empty( ctx ); + return; + } + + ctx->ticks->prefrag_ticks = 0UL; + + /* select input to poll */ + fd_stream_reader_t * this_in = &ctx->in[ ctx->in_seq ]; + ctx->in_seq++; + if( ctx->in_seq>=ctx->in_cnt ) { + ctx->in_seq = 0UL; /* cmov */ + } + + fd_frag_reader_consume_ctx_t consume_ctx; + long diff = fd_stream_reader_poll_frag( this_in, + ctx->in_seq, + &consume_ctx ); + + if( FD_UNLIKELY( diff<0L ) ) { + /* overrun case, technically impossible with reliable streams */ + fd_stream_ctx_advance_poll( ctx ); + + fd_stream_reader_process_overrun( this_in, + &consume_ctx, + diff ); + } + else if ( FD_UNLIKELY( diff ) ) { + /* nothing new to poll */ + fd_stream_ctx_advance_poll_idle( ctx ); + } + else { + FD_COMPILER_MFENCE(); + ulong sz = 0U; + fd_stream_frag_meta_t const * frag = fd_type_pun_const( consume_ctx.mline ); + int consumed_frag = ctx->tile_on_stream_frag( tile_ctx, this_in, frag, &sz ); + + fd_stream_reader_consume_bytes( this_in, sz ); + + if( FD_LIKELY( consumed_frag ) ) { + fd_stream_reader_consume_frag( this_in, + &consume_ctx ); + } + + fd_stream_ctx_advance_poll( ctx ); + } +} + +static inline void +fd_stream_ctx_run_loop( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + for(;;) { + fd_stream_ctx_do_housekeeping( ctx, tile_ctx ); + + if( 
FD_UNLIKELY( fd_stream_ctx_is_backpressured( ctx ) ) ) { + fd_stream_ctx_process_backpressure( ctx ); + continue; + } + + /* equivalent of after credit */ + if( ctx->tile_run ) { + int poll_in = 1; + ctx->tile_run( tile_ctx, ctx, &poll_in ); + + if( FD_UNLIKELY( !poll_in ) ) { + fd_stream_ctx_advance_skip_poll( ctx ); + continue; + } + } + + fd_stream_ctx_poll( ctx, tile_ctx ); + } +} + +static inline void +fd_stream_ctx_run( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t tile_init_run_loop, + fd_tile_update_in_fn_t tile_update_in, + fd_tile_housekeeping_fn_t tile_housekeeping, + fd_tile_metrics_write_fn_t tile_metrics_write, + fd_tile_run_fn_t tile_run, + fd_tile_on_stream_frag_fn_t tile_on_stream_frag ) { + fd_stream_ctx_init_run_loop( ctx, + tile_ctx, + tile_init_run_loop, + tile_update_in, + tile_housekeeping, + tile_metrics_write, + tile_run, + tile_on_stream_frag ); + + fd_stream_ctx_run_loop( ctx, tile_ctx ); +} + +static inline void * +fd_stream_ctx_delete( fd_stream_ctx_t * ctx ) { + for( ulong i=0UL; iin_cnt; i++ ) { + fd_stream_reader_delete( &ctx->in[ i ] ); + ctx->in_ptrs[ i ] = NULL; + } + + fd_event_map_delete( ctx->event_map ); + fd_memset(ctx, 0, sizeof(fd_stream_ctx_t) ); + return (void *)ctx; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h */ diff --git a/src/discof/restore/stream/fd_stream_metrics.h b/src/discof/restore/stream/fd_stream_metrics.h new file mode 100644 index 0000000000..04eb728287 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_metrics.h @@ -0,0 +1,46 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h + +#include "../../../util/fd_util_base.h" +#include "../../../disco/metrics/fd_metrics.h" + +struct fd_stream_metrics { + ulong in_backp; + ulong backp_cnt; + ulong regime_ticks[9]; +}; +typedef struct fd_stream_metrics fd_stream_metrics_t; + +typedef void 
fd_metrics_write_fn_t( void * ctx ); + +FD_PROTOTYPES_BEGIN + +static inline void +fd_stream_metrics_init( fd_stream_metrics_t * metrics ) { + metrics->in_backp = 1UL; + metrics->backp_cnt = 0UL; + fd_memset( metrics->regime_ticks, 0, sizeof(metrics->regime_ticks) ); +} + +static inline void +fd_stream_metrics_update_external( fd_stream_metrics_t * metrics, + long now, + fd_metrics_write_fn_t * metrics_write, + void * ctx ) { + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metrics->in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metrics->backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metrics->regime_ticks ); + + if( metrics_write ) { + metrics_write( ctx ); + } + + FD_COMPILER_MFENCE(); + metrics->backp_cnt = 0UL; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h */ diff --git a/src/discof/restore/stream/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h new file mode 100644 index 0000000000..bf40a9787d --- /dev/null +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -0,0 +1,108 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_reader_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_reader_h + +#include "fd_frag_reader.h" +#include "../../restore/fd_restore_base.h" + +struct fd_stream_reader { + union { + struct { + fd_stream_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + fd_stream_frag_meta_t const * mline; + ulong volatile * fseq; + uint accum[6]; + }; + + fd_frag_reader_t r[1]; /* FIXME strict aliasing violation on mcache */ + } base; + + ulong goff; + ulong const volatile * in_sync; +}; +typedef struct fd_stream_reader fd_stream_reader_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_stream_reader_align( void ) { + return alignof(fd_stream_reader_t); +} + +FD_FN_CONST static inline ulong +fd_stream_reader_footprint( void ) { + return sizeof(fd_stream_reader_t); +} 
+ +static inline void +fd_stream_reader_init( fd_stream_reader_t * reader, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + fd_frag_reader_init( reader->base.r, mcache, fseq, in_idx ); + reader->goff = 0UL; + reader->in_sync = fd_mcache_seq_laddr_const( reader->base.mcache->f ); +} + +static inline fd_stream_reader_t * +fd_stream_reader_new( void * mem, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_reader_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_reader_t * self = (fd_stream_reader_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), sizeof(fd_stream_reader_t) ); + + fd_stream_reader_init( self, mcache, fseq, in_idx ); + + return self; +} + +static inline long +fd_stream_reader_poll_frag( fd_stream_reader_t * reader, + ulong in_idx, + fd_frag_reader_consume_ctx_t * ctx ) { + return fd_frag_reader_poll_frag( reader->base.r, in_idx, ctx ); +} + +static inline void +fd_stream_reader_process_overrun( fd_stream_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + long seq_diff ) { + fd_frag_reader_process_overrun( reader->base.r, ctx, seq_diff ); +} + +static inline void +fd_stream_reader_consume_bytes( fd_stream_reader_t * reader, + ulong bytes ) { + reader->goff += bytes; + reader->base.accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)bytes; +} + +static inline void +fd_stream_reader_consume_frag( fd_stream_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx ) { + fd_frag_reader_consume_frag( reader->base.r, ctx ); +} + + +static inline void * +fd_stream_reader_delete( fd_stream_reader_t * reader ) { + fd_frag_reader_delete( reader->base.r ); + return (void *)reader; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_reader_h */ diff 
--git a/src/discof/restore/stream/fd_stream_ticks.h b/src/discof/restore/stream/fd_stream_ticks.h new file mode 100644 index 0000000000..ac181fadb1 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ticks.h @@ -0,0 +1,49 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h + +#include "../../../util/fd_util_base.h" +#include "../../../tango/tempo/fd_tempo.h" + +struct fd_stream_ticks { + ulong housekeeping_ticks; + ulong prefrag_ticks; + ulong async_min; + long lazy; + long now; + long then; +}; +typedef struct fd_stream_ticks fd_stream_ticks_t; + +static inline void +fd_stream_ticks_init( fd_stream_ticks_t * ticks, + ulong event_cnt, + long lazy ) { + fd_memset( ticks, 0, sizeof(fd_stream_ticks_t) ); + ticks->lazy = lazy; + ticks->async_min = fd_tempo_async_min( ticks->lazy, + event_cnt, + (float)fd_tempo_tick_per_ns( NULL ) ); + if( FD_UNLIKELY( !ticks->async_min ) ) FD_LOG_ERR(( "bad lazy %lu %lu", (ulong)ticks->lazy, event_cnt )); +} + +static inline void +fd_stream_ticks_init_timer( fd_stream_ticks_t * ticks ) { + ticks->then = fd_tickcount(); + ticks->now = ticks->then; +} + +static inline int +fd_stream_ticks_is_housekeeping_time( fd_stream_ticks_t * ticks ) { + ticks->housekeeping_ticks = 0UL; + return (ticks->now - ticks->then) >= 0L; +} + +static inline void +fd_stream_ticks_reload_housekeeping( fd_stream_ticks_t * ticks, fd_rng_t * rng ) { + ticks->then = ticks->now + (long)fd_tempo_async_reload( rng, ticks->async_min ); + long next = fd_tickcount(); + ticks->housekeeping_ticks = (ulong)(next - ticks->now); + ticks->now = next; +} + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h */ diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c new file mode 100644 index 0000000000..f3fbf668ea --- /dev/null +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -0,0 +1,164 @@ +#include "fd_stream_writer.h" +#include 
"../../../util/log/fd_log.h" +#include "../../../tango/dcache/fd_dcache.h" +#include "../../../disco/topo/fd_topo.h" + +fd_stream_writer_t * +fd_stream_writer_new( void * mem, + ulong cons_max, + fd_stream_frag_meta_t * mcache, + uchar * dcache ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_writer_align() ) ) ) { + FD_LOG_WARNING(( "misaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_writer_t * writer = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); + ulong * cons_seq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), cons_max*sizeof(ulong)*FD_STREAM_WRITER_CONS_SEQ_STRIDE ); + ulong volatile ** cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_max*sizeof(ulong *) ); + FD_SCRATCH_ALLOC_FINI( l, fd_stream_writer_align() ); + + fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); + + writer->mcache = mcache; + writer->out_sync = fd_mcache_seq_laddr( mcache->f ); + writer->seq = fd_mcache_seq_query( writer->out_sync ); + writer->depth = fd_mcache_depth( mcache->f ); + + writer->data = dcache; + writer->data_max = fd_dcache_data_sz( dcache ); + writer->data_cur = 0UL; + writer->base = (uchar *)fd_wksp_containing( dcache ); /* FIXME impure */ + writer->goff = 0UL; + + writer->cr_byte_avail = ULONG_MAX; + writer->cr_frag_avail = ULONG_MAX; + writer->cons_seq = cons_seq; + writer->cons_fseq = cons_fseq; + + writer->frag_sz_max = writer->data_max; + + writer->cons_cnt = 0UL; + writer->cons_max = cons_max; + /* writer->out_sync already set */ + + FD_COMPILER_MFENCE(); + writer->magic = FD_STREAM_WRITER_MAGIC; + return writer; +} + +void * +fd_stream_writer_delete( fd_stream_writer_t * writer ) { + fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); + return writer; +} + +ulong * +fd_stream_writer_register_consumer( + fd_stream_writer_t * writer, + ulong * fseq_join +) { + if( FD_UNLIKELY( 
writer->cons_cnt >= writer->cons_max ) ) {
+    FD_LOG_WARNING(( "Can't register consumer, cons_max %lu exceeded", writer->cons_max ));
+    return NULL;
+  }
+  writer->cr_byte_avail = 0UL;
+  writer->cr_frag_avail = 0UL;
+
+  ulong const cons_idx = writer->cons_cnt++;
+  ulong *     seq      = writer->cons_seq + ( cons_idx*FD_STREAM_WRITER_CONS_SEQ_STRIDE );
+  writer->cons_fseq[ cons_idx ] = fd_type_pun( fseq_join );
+  seq[ 0 ] = FD_VOLATILE_CONST( fseq_join[ 0 ] );
+  seq[ 1 ] = FD_VOLATILE_CONST( fseq_join[ 1 ] );
+  return seq;
+}
+
+fd_stream_writer_t *
+fd_stream_writer_new_topo(
+    void *                 mem,
+    ulong                  cons_max,
+    fd_topo_t const *      topo,
+    fd_topo_tile_t const * tile,
+    ulong                  out_link_idx
+) {
+  ulong const             out_link_id = tile->out_link_id[ out_link_idx ];
+  fd_topo_link_t const *  out_link    = &topo->links[ out_link_id ];
+  fd_stream_frag_meta_t * mcache      = fd_type_pun( out_link->mcache );
+  void *                  dcache      = fd_dcache_join( fd_topo_obj_laddr( topo, out_link->dcache_obj_id ) );
+  ulong                   cons_cnt    = fd_topo_link_reliable_consumer_cnt( topo, out_link );
+  if( FD_UNLIKELY( !mcache ) ) {
+    FD_LOG_WARNING(( "NULL mcache" ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( !dcache ) ) {
+    FD_LOG_WARNING(( "NULL dcache" ));
+    return NULL;
+  }
+  if( FD_UNLIKELY( cons_cnt>cons_max ) ) { /* warn only: register_consumer below rejects the excess consumer */
+    FD_LOG_WARNING(( "cons_cnt is %lu but cons_max is only %lu", cons_cnt, cons_max ));
+  }
+
+  fd_stream_writer_t * writer = fd_stream_writer_new( mem, cons_max, mcache, dcache );
+  if( FD_UNLIKELY( !writer ) ) return NULL; /* logs warning */
+
+  for( ulong i=0UL; i<topo->tile_cnt; i++ ) { /* scan every tile for reliable inputs fed by this out link */
+    fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ];
+    for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) {
+      if( consumer_tile->in_link_id[ j ]!=out_link_id ) continue;
+      if( !consumer_tile->in_link_reliable[ j ] ) continue;
+
+      ulong * fseq = consumer_tile->in_link_fseq[ j ];
+      if( FD_UNLIKELY( !fseq ) ) {
+        FD_LOG_WARNING(( "NULL fseq for consumer tile=%s:%lu in_link_idx=%lu", consumer_tile->name, consumer_tile->kind_id, j ));
+        return NULL; /* register_consumer reads fseq[0] and fseq[1]; fail instead of dereferencing NULL */
+      }
+      if( FD_UNLIKELY(
!fd_stream_writer_register_consumer( writer, fseq ) ) ) {
+        return NULL; /* logs warning */
+      }
+    }
+  }
+
+  return writer;
+}
+
+void
+fd_stream_writer_set_frag_sz_max( fd_stream_writer_t * writer,
+                                  ulong                frag_sz_max ) {
+  writer->frag_sz_max = fd_ulong_min( writer->data_max, frag_sz_max );
+}
+
+void
+fd_stream_writer_copy( fd_stream_writer_t * writer,
+                       void const *         data,
+                       ulong                data_sz,
+                       ulong const          ctl_mask ) {
+  if( FD_UNLIKELY( ( data_sz > writer->cr_byte_avail ) |
+                   ( data_sz > writer->data_max      ) ) ) {
+    FD_LOG_ERR(( "invalid data_sz %lu (cr_byte_avail=%lu data_max=%lu)",
+                 data_sz, writer->cr_byte_avail, writer->data_max ));
+  }
+
+  ulong const frag_sz_max = writer->frag_sz_max;
+  if( FD_UNLIKELY( !frag_sz_max ) ) { /* a zero frag size would never drain data_sz */
+    FD_LOG_ERR(( "zero frag_sz_max" ));
+  }
+  int som = 1;
+  while( data_sz ) { /* one iteration per published frag of at most frag_sz_max bytes */
+    ulong const op_sz   = fd_ulong_min( data_sz, frag_sz_max );
+    ulong const next_sz = data_sz-op_sz;
+    int   const eom     = next_sz==0UL;
+    ulong const ctl     = ctl_mask & fd_frag_meta_ctl( FD_FRAG_META_ORIG_MAX-1, som, eom, 1 );
+
+    fd_memcpy( fd_stream_writer_prepare( writer ), data, op_sz );
+    fd_stream_writer_publish( writer, op_sz, ctl );
+    data    = (uchar const *)data + op_sz; /* advance source so the next frag carries the next bytes, not the first op_sz again */
+    som     = 0;
+    data_sz = next_sz;
+  }
+}
diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h
new file mode 100644
index 0000000000..4d3243d6db
--- /dev/null
+++ b/src/discof/restore/stream/fd_stream_writer.h
@@ -0,0 +1,294 @@
+#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_writer_h
+#define HEADER_fd_src_discof_restore_stream_fd_stream_writer_h
+
+/* fd_stream_writer.h provides an API to publish data to SPMC shared
+   memory byte streams. */
+
+#include "../fd_restore_base.h"
+
+/* fd_stream_writer_t holds stream producer state.
*/ + +struct __attribute__((aligned(16))) fd_stream_writer { + /* Fragment descriptor output */ + fd_stream_frag_meta_t * mcache; /* frag producer mcache */ + ulong seq; /* next sequence number */ + ulong depth; /* mcache depth */ + + /* Data buffer (dcache) output */ + uchar * data; /* points to first byte of dcache data region (dcache join) */ + ulong data_max; /* dcache data region size */ + ulong data_cur; /* next dcache data offset in [0,data_sz) */ + uchar * base; /* workspace base address */ + ulong goff; /* byte stream offset */ + + /* This point is 16-byte aligned */ + + /* Backpressure */ + ulong cr_byte_avail; /* byte publish count before slowest consumer overrun */ + ulong cr_frag_avail; /* frag publish count before slowest consumer overrun */ + ulong * cons_seq; /* cons_seq[ 2*cons_idx+i ] caches cons_fseq[ cons_idx ][i] */ + ulong volatile ** cons_fseq; /* cons_fseq[ cons_idx ] points to consumer fseq */ + /* Each consumer reports a 'frag sequence number' and the 'stream offset' */ +# define FD_STREAM_WRITER_CONS_SEQ_STRIDE 2UL + + /* Fragmentation */ + ulong frag_sz_max; /* max data sz for each frag descriptor */ + + /* Cold data */ + ulong magic; + ulong cons_cnt; /* number of consumers */ + ulong cons_max; /* max number of consumers */ + ulong * out_sync; /* points to mcache 'sync' field (last published seq no) */ + + /* variable length data follows */ +}; + +typedef struct fd_stream_writer fd_stream_writer_t; + +#define FD_STREAM_WRITER_MAGIC (0xFD57337717E736C0UL) + +/* Forward declarations */ + +typedef struct fd_topo fd_topo_t; +typedef struct fd_topo_tile fd_topo_tile_t; + +FD_PROTOTYPES_BEGIN + +/* Constructor API ****************************************************/ + +/* fd_stream_writer_{align,footprint} describe a memory region suitable + to hold a stream_writer. 
*/ + +FD_FN_CONST static inline ulong +fd_stream_writer_align( void ) { + return alignof(fd_stream_writer_t); +} + +FD_FN_CONST static inline ulong +fd_stream_writer_footprint( ulong cons_max ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); + l = FD_LAYOUT_APPEND( l, alignof(ulong), cons_max*sizeof(ulong)*FD_STREAM_WRITER_CONS_SEQ_STRIDE ); + l = FD_LAYOUT_APPEND( l, alignof(ulong *), cons_max*sizeof(ulong *) ); + return FD_LAYOUT_FINI( l, fd_stream_writer_align() ); +} + +/* fd_stream_writer_new initializes the memory region at mem as a + stream_writer object. mcache_join is a local join to an mcache + (frag_meta or similar pointer) to which frags will be published. + dcache_join is a local join to a dcache into which data is written. + Returns writer object in mem on success, and NULL on failure. Logs + reason for failure. */ + +fd_stream_writer_t * +fd_stream_writer_new( void * mem, + ulong cons_max, + fd_stream_frag_meta_t * mcache_join, + uchar * dcache_join ); + +/* fd_stream_writer_delete releases the memory region backing a + stream_writer. Returns a pointer to the memory region originally + provided to fd_stream_writer_new. */ + +void * +fd_stream_writer_delete( fd_stream_writer_t * writer ); + +/* fd_stream_writer_new_topo constructs a stream writer for a topology + definition. Calls new() and register_consumer() under the hood. + tile is the actor that will be writing stream frags in topo. + out_link_idx is the index of the output link for that tile. 
*/ + +fd_stream_writer_t * +fd_stream_writer_new_topo( + void * mem, + ulong cons_max, + fd_topo_t const * topo, + fd_topo_tile_t const * tile, + ulong out_link_idx +); + +static inline fd_stream_writer_t * +fd_stream_writer_join( void * _writer ) { + fd_stream_writer_t * writer = _writer; + if( FD_UNLIKELY( !writer ) ) return NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)writer, fd_stream_writer_align() ) ) ) return NULL; + if( FD_UNLIKELY( writer->magic!=FD_STREAM_WRITER_MAGIC ) ) return NULL; + return writer; +} + +/* Control API ********************************************************/ + +/* fd_stream_writer_register_consumer registers a consumer of the + stream to the writer. fseq_join is a local join to that consumer's + fseq (points to the fseq's seq[0] field). Future backpressure checks + will include this consumer. Returns a pointer to this consumer's + seq cache field, or NULL on if cons_max exceeded (logs warning). */ + +ulong * +fd_stream_writer_register_consumer( + fd_stream_writer_t * writer, + ulong * fseq_join +); + +/* fd_stream_writer_close marks the stream as closed. */ + +static inline void +fd_stream_writer_close( fd_stream_writer_t * writer ) { + FD_VOLATILE( writer->out_sync[ 0 ] ) = writer->seq; + FD_VOLATILE( writer->out_sync[ 1 ] ) = writer->goff; + FD_COMPILER_MFENCE(); + FD_VOLATILE( writer->out_sync[ 2 ] ) = 1; +} + +/* Flow control API ***************************************************/ + +/* fd_stream_writer_set_frag_sz_max puts an upper bound on the fragment + sizes produced to the stream. This helps reduce latency. */ + +void +fd_stream_writer_set_frag_sz_max( fd_stream_writer_t * writer, + ulong frag_sz_max ); + +/* fd_stream_writer_receive_flow_control_credits updates cached consumer + progress from the consumers' fseq objects. 
+
+   FIXME Provide an API to round-robin update ins temporally spaced apart */
+
+static inline void
+fd_stream_writer_receive_flow_control_credits( fd_stream_writer_t * writer ) {
+  ulong const stride = FD_STREAM_WRITER_CONS_SEQ_STRIDE;
+  for( ulong i=0UL; i<writer->cons_cnt; i++ ) {
+    /* FIXME could be SSE aligned copy */
+    FD_COMPILER_MFENCE();
+    writer->cons_seq[ stride*i   ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][0] );
+    writer->cons_seq[ stride*i+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][1] );
+    FD_COMPILER_MFENCE();
+  }
+}
+
+/* fd_stream_writer_calculate_backpressure updates fragment and stream
+   backpressure from cached consumer progress. */
+
+static inline void
+fd_stream_writer_calculate_backpressure( fd_stream_writer_t * writer ) {
+  ulong const cr_byte_max = writer->data_max;
+  ulong const cr_frag_max = writer->depth;
+
+  ulong cr_byte_avail = ULONG_MAX;
+  ulong cr_frag_avail = ULONG_MAX;
+  ulong const stride = FD_STREAM_WRITER_CONS_SEQ_STRIDE;
+  for( ulong cons_idx=0UL; cons_idx<writer->cons_cnt; cons_idx++ ) { /* keep the minimum credit over all registered consumers */
+    ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ stride*cons_idx+1 ] ), 0L ), 0L );
+    ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)cr_frag_max-fd_long_max( fd_seq_diff( writer->seq,  writer->cons_seq[ stride*cons_idx   ] ), 0L ), 0L );
+    cr_byte_avail = fd_ulong_min( cons_cr_byte_avail, cr_byte_avail );
+    cr_frag_avail = fd_ulong_min( cons_cr_frag_avail, cr_frag_avail );
+  }
+
+  writer->cr_byte_avail = cr_byte_avail;
+  writer->cr_frag_avail = cr_frag_avail;
+}
+
+/* In-place publish API ************************************************
+
+   Example usage:
+
+     void * p  = fd_stream_writer_prepare( w );
+     ulong  sz = fd_stream_writer_publish_sz_max( w );
+     fd_memcpy( p, src, sz );
+     src += sz;
+     fd_stream_writer_publish( w, sz, ctl ); */
+
+/* fd_stream_writer_prepare prepares the caller for a frag publish.
+ Returns a pointer to a memory region of publish_sz_max() bytes, into + which the caller can write data. A subsequent publish() call makes + the data visible to consumers. U.B. return value if + publish_sz_max()==0. */ + +static inline void * +fd_stream_writer_prepare( fd_stream_writer_t * writer ) { + if( FD_UNLIKELY( writer->data_cur > writer->data_max ) ) { + FD_LOG_CRIT(( "Out-of-bounds data_cur (data_cur=%lu data_max=%lu)", writer->data_cur, writer->data_max )); + return 0; + } + return writer->data + writer->data_cur; +} + +/* fd_stream_writer_publish_sz_max returns the max amount of bytes that + can be published in the next fragment. */ + +static inline ulong +fd_stream_writer_publish_sz_max( fd_stream_writer_t * writer ) { + ulong const data_backp = writer->cr_byte_avail; + ulong const frag_backp = fd_ulong_if( !!writer->cr_frag_avail, writer->frag_sz_max, 0UL ); + ulong const buf_avail = writer->data_max - writer->data_cur; + return fd_ulong_min( fd_ulong_min( data_backp, frag_backp ), buf_avail ); +} + +/* fd_stream_writer_publish completes a publish operation. Writes a + fragment descriptor out to the mcache if frag_sz>0. 
*/ + +static inline void +fd_stream_writer_publish( fd_stream_writer_t * writer, + ulong frag_sz, + ulong ctl ) { + if( FD_UNLIKELY( !frag_sz ) ) return; + + uchar * const data = writer->data + writer->data_cur; + ulong const loff = (ulong)data - (ulong)writer->base; + + fd_mcache_publish_stream( + writer->mcache, + writer->depth, + writer->seq, + writer->goff, + loff, + frag_sz, + ctl + ); + + /* Advance fragment descriptor stream */ + writer->seq = fd_seq_inc( writer->seq, 1UL ); + writer->cr_frag_avail -= 1; + + /* Advance buffer */ + writer->data_cur += frag_sz; + writer->goff += frag_sz; + writer->cr_byte_avail -= frag_sz; + if( FD_UNLIKELY( writer->data_cur > writer->data_max ) ) { + FD_LOG_CRIT(( "Out-of-bounds data_cur (data_cur=%lu data_max=%lu)", writer->data_cur, writer->data_max )); + return; + } + if( writer->data_cur == writer->data_max ) { + writer->data_cur = 0UL; /* cmov */ + } +} + +/* Copy publish API ***************************************************/ + +/* fd_stream_writer_copy publishes the given chunk to the stream as a + sequence of stream frags. data points to the first byte of the chunk + to send. data_sz is the number of bytes (<=copy_max()). + ctl specifies how to set the 'ctl' field. All ctl bits are copied as + is, except for 'som' and 'eom', which act as a mask: + Use 'fd_frag_meta_ctl( ..., som=1, eom=1, ... )' to set the 'som' + bit on the first frag and the 'eom' bit on the last flag. Pass + 'fd_frag_meta_ctl( ..., som=0, eom=0, ... )' or just '0UL' to leave + fragmentation bits cleared on published frags. 
*/ + +void +fd_stream_writer_copy( fd_stream_writer_t * writer, + void const * data, + ulong data_sz, + ulong ctl ); + +static inline ulong +fd_stream_writer_copy_max( fd_stream_writer_t * writer ) { + ulong const data_backp = writer->cr_byte_avail; + ulong const frag_backp = fd_ulong_sat_mul( writer->cr_frag_avail, writer->frag_sz_max ); + ulong const buf_avail = writer->data_max - writer->data_cur; + return fd_ulong_min( fd_ulong_min( data_backp, frag_backp ), buf_avail ); +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_writer_h */ diff --git a/src/discof/restore/test_snapin_tile.c b/src/discof/restore/test_snapin_tile.c new file mode 100644 index 0000000000..8e084a85ad --- /dev/null +++ b/src/discof/restore/test_snapin_tile.c @@ -0,0 +1,127 @@ +#define FD_TILE_TEST +#include "fd_snapin_tile.c" +#include "stream/fd_stream_writer.h" + +static ulong +mock_stream_align( void ) { + return fd_ulong_max( fd_ulong_max( fd_stream_writer_align(), fd_mcache_align() ), fd_dcache_align() ); +} + +static ulong +mock_stream_footprint( ulong depth, + ulong dcache_data_sz ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( 0UL ) ); + l = FD_LAYOUT_APPEND( l, fd_mcache_align(), fd_mcache_footprint( depth, 0uL ) ); + l = FD_LAYOUT_APPEND( l, fd_dcache_align(), fd_dcache_footprint( dcache_data_sz, 0UL ) ); + return l; +} + +static fd_stream_writer_t * +mock_stream_init( void * mem, + ulong depth, + ulong dcache_data_sz ) { + if( FD_UNLIKELY( !mem ) ) return NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, mock_stream_align() ) ) ) return NULL; + + FD_SCRATCH_ALLOC_INIT( l, mem ); + void * writer_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( 0UL ) ); + void * mcache_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_mcache_align(), fd_mcache_footprint( depth, 0uL ) ); + void * dcache_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_dcache_align(), 
fd_dcache_footprint( dcache_data_sz, 0UL ) ); + + fd_frag_meta_t * mcache = fd_mcache_join( fd_mcache_new( mcache_mem, depth, 0UL, 0UL ) ); + uchar * dcache = fd_dcache_join( fd_dcache_new( dcache_mem, dcache_data_sz, 0UL ) ); + + return fd_stream_writer_new( writer_mem, 0UL, fd_type_pun( mcache ), dcache ); +} + +static void * +mock_stream_delete( fd_stream_writer_t * writer ) { + fd_dcache_delete( fd_dcache_leave( writer->data ) ); + fd_mcache_delete( fd_mcache_leave( fd_type_pun( writer->mcache ) ) ); + return fd_stream_writer_delete( writer ); +} + +/* Feed in snapshot stream frags and validate the resulting account + frags are sane. This variant tests handwritten edge cases. */ + +static void +test_account_frags( fd_wksp_t * wksp ) { + /* Create a snapin context */ + fd_topo_tile_t topo_tile = { + .name = "snapin", + .snapin = { + .scratch_sz = 4096UL + } + }; + void * tile_scratch = fd_wksp_alloc_laddr( wksp, scratch_align(), scratch_footprint( &topo_tile ), 1UL ); + FD_TEST( tile_scratch ); + fd_snapin_tile_t * ctx = scratch_init( tile_scratch, &topo_tile ); + FD_TEST( ctx ); + + void * out_mcache_mem = fd_wksp_alloc_laddr( wksp, fd_mcache_align(), fd_mcache_footprint( 128UL, 0UL ), 1UL ); + ctx->out_mcache = fd_type_pun( fd_mcache_join( fd_mcache_new( out_mcache_mem, 128UL, 0UL, 0UL ) ) ); + FD_TEST( ctx->out_mcache ); + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); + ctx->out_seq_max = UINT_MAX; + + ctx->tar_file_rem = ULONG_MAX; + ctx->accv_sz = ULONG_MAX; + fd_snapshot_expect_account_hdr( ctx ); + uchar scratch_buf[ 256 ]; + ctx->buf = scratch_buf; + ctx->buf_max = sizeof(scratch_buf); + + /* Create an input */ + void * in_stream_mem = fd_wksp_alloc_laddr( wksp, mock_stream_align(), mock_stream_footprint( 128UL, 4096UL ), 1UL ); + fd_stream_writer_t * in_stream = mock_stream_init( in_stream_mem, 128UL, 4096UL ); + FD_TEST( in_stream ); + fd_snapin_in_t in = { + .mcache = in_stream->mcache, + .depth = (uint)in_stream->depth, + .idx = 0U, + .seq 
= 0UL, + .goff = 0UL, + .mline = in_stream->mcache + }; + ctx->in_base = (uchar *)wksp; + + /* An empty account */ + fd_solana_account_hdr_t const acc1 = { .hash={ .uc={ 1,2,3 } } }; + fd_stream_writer_copy( in_stream, &acc1, sizeof(fd_solana_account_hdr_t), fd_frag_meta_ctl( 0, 1, 1, 0 ) ); + ulong read_sz; + FD_TEST( on_stream_frag( ctx, &in, in_stream->mcache+0, &read_sz )==1 ); + FD_TEST( ctx->out_mcache[ 0 ].seq==0UL ); + FD_TEST( ctx->out_mcache[ 0 ].sz==sizeof(fd_solana_account_hdr_t) ); + FD_TEST( ctx->out_mcache[ 0 ].ctl==fd_frag_meta_ctl( 0, 1, 1, 0 ) ); + FD_TEST( ctx->out_mcache[ 0 ].goff==0UL ); + FD_TEST( fd_memeq( ctx->in_base+ctx->out_mcache[ 0 ].loff, &acc1, sizeof(fd_solana_account_hdr_t) ) ); + + fd_wksp_free_laddr( mock_stream_delete( in_stream ) ); + fd_wksp_free_laddr( tile_scratch ); +} + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); + ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); + ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); + uint rng_seed = fd_env_strip_cmdline_uint ( &argc, &argv, "--rng-seed", NULL, 0U ); + + fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, rng_seed, 0UL ) ); + + fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); + if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "Unable to attach to wksp" )); + + test_account_frags( wksp ); + + fd_wksp_delete_anonymous( wksp ); + fd_rng_delete( fd_rng_leave( rng ) ); + + FD_LOG_NOTICE(( "pass" )); + fd_halt(); + return 0; +} diff --git a/src/discof/rpc/fd_rpcserv_tile.c b/src/discof/rpc/fd_rpcserv_tile.c index 971c209ba4..240e9dccaa 100644 --- a/src/discof/rpc/fd_rpcserv_tile.c +++ b/src/discof/rpc/fd_rpcserv_tile.c @@ -14,7 +14,6 @@ #include "../../disco/fd_disco.h" #include 
"../../disco/shred/fd_stake_ci.h" #include "../../disco/topo/fd_pod_format.h" -#include "../../funk/fd_funk_filemap.h" #include "../../disco/keyguard/fd_keyload.h" #include @@ -26,9 +25,6 @@ struct fd_rpcserv_tile_ctx { fd_rpcserver_args_t args; - char funk_file[ PATH_MAX ]; - - int activated; fd_rpc_ctx_t * ctx; @@ -82,12 +78,7 @@ before_credit( fd_rpcserv_tile_ctx_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; - - if( FD_UNLIKELY( !ctx->activated ) ) { - *charge_busy = 0; - } else { - *charge_busy = fd_rpc_ws_poll( ctx->ctx ); - } + *charge_busy = fd_rpc_ws_poll( ctx->ctx ); } static void @@ -135,20 +126,7 @@ after_frag( fd_rpcserv_tile_ctx_t * ctx, (void)stem; if( FD_LIKELY( in_idx==REPLAY_NOTIF_IDX ) ) { - if( FD_UNLIKELY( !ctx->activated ) ) { - fd_rpcserver_args_t * args = &ctx->args; - fd_funk_t * funk = fd_funk_open_file( - args->funk, ctx->funk_file, 1, 0, 0, 0, 0, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "failed to join a funky" )); - } - - ctx->activated = 1; - fd_rpc_start_service( args, ctx->ctx ); - } - fd_rpc_replay_after_frag( ctx->ctx, &ctx->replay_notif_in_state ); - } else if( FD_UNLIKELY( in_idx==STAKE_CI_IN_IDX ) ) { fd_rpc_stake_after_frag( ctx->ctx, ctx->args.stake_ci ); @@ -189,7 +167,6 @@ privileged_init( fd_topo_t * topo, uchar * spad_mem_cur = spad_mem; args->spad = fd_spad_join( fd_spad_new( spad_mem_cur, FD_RPC_SCRATCH_MAX ) ); - strncpy( ctx->funk_file, tile->replay.funk_file, sizeof(ctx->funk_file) ); /* Open funk after replay tile is booted */ /* Blockstore setup */ @@ -238,8 +215,6 @@ unprivileged_init( fd_topo_t * topo, if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) ) FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) )); - ctx->activated = 0; - fd_topo_link_t * replay_notif_in_link = &topo->links[ tile->in_link_id[ REPLAY_NOTIF_IDX ] ]; 
ctx->replay_notif_in_mem = topo->workspaces[ topo->objs[ replay_notif_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->replay_notif_in_chunk0 = fd_dcache_compact_chunk0( ctx->replay_notif_in_mem, replay_notif_in_link->dcache ); @@ -249,6 +224,12 @@ unprivileged_init( fd_topo_t * topo, ctx->stake_ci_in_mem = topo->workspaces[ topo->objs[ stake_ci_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->stake_ci_in_chunk0 = fd_dcache_compact_chunk0( ctx->stake_ci_in_mem, stake_ci_in_link->dcache ); ctx->stake_ci_in_wmark = fd_dcache_compact_wmark ( ctx->stake_ci_in_mem, stake_ci_in_link->dcache, stake_ci_in_link->mtu ); + + fd_rpcserver_args_t * args = &ctx->args; + if( FD_UNLIKELY( !fd_funk_join( args->funk, fd_topo_obj_laddr( topo, tile->rpcserv.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + fd_rpc_start_service( args, ctx->ctx ); } static ulong diff --git a/src/discof/writer/fd_writer_tile.c b/src/discof/writer/fd_writer_tile.c index f1f894b179..32e2f7b3ac 100644 --- a/src/discof/writer/fd_writer_tile.c +++ b/src/discof/writer/fd_writer_tile.c @@ -9,7 +9,6 @@ #include "../../flamenco/runtime/fd_executor.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" struct fd_writer_tile_in_ctx { fd_wksp_t * mem; @@ -31,7 +30,6 @@ struct fd_writer_tile_ctx { /* Local join of Funk. R/W. */ fd_funk_t funk[1]; - fd_wksp_t * funk_wksp; /* Link management. 
*/ fd_writer_tile_in_ctx_t exec_writer_in[ FD_PACK_MAX_BANK_TILES ]; @@ -339,23 +337,9 @@ unprivileged_init( fd_topo_t * topo, /* Funk */ /********************************************************************/ - FD_LOG_DEBUG(( "Trying to join funk at file=%s", tile->writer.funk_file )); - fd_funk_txn_start_write( NULL ); - int funk_join_ok = !!fd_funk_open_file( ctx->funk, - tile->writer.funk_file, - 1UL, - 0UL, - 0UL, - 0UL, - 0UL, - FD_FUNK_READ_WRITE, - NULL ); - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( ctx->funk ); - if( FD_UNLIKELY( !funk_join_ok ) ) { - FD_LOG_CRIT(( "Failed to join funk" )); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->writer.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); } - FD_LOG_DEBUG(( "Just joined funk at file=%s", tile->writer.funk_file )); /********************************************************************/ /* Setup fseq */ diff --git a/src/flamenco/runtime/tests/run_ledger_backtest.sh b/src/flamenco/runtime/tests/run_ledger_backtest.sh index fec021eb93..7e677504bd 100755 --- a/src/flamenco/runtime/tests/run_ledger_backtest.sh +++ b/src/flamenco/runtime/tests/run_ledger_backtest.sh @@ -157,9 +157,6 @@ echo " archiver_path = \"$DUMP/$LEDGER/rocksdb\" [tiles.replay] snapshot = \"$SNAPSHOT\" - funk_sz_gb = $FUNK_PAGES - funk_txn_max = 1024 - funk_rec_max = $INDEX_MAX cluster_version = \"$CLUSTER_VERSION\" enable_features = [ \"$ONE_OFFS\" ] funk_file = \"$DUMP/$LEDGER/backtest.funk\" @@ -171,6 +168,10 @@ echo " txn_max = 1048576 alloc_max = 10737418240 file = \"$DUMP/$LEDGER/backtest.blockstore\" + [funk] + heap_size_gib = $FUNK_PAGES + max_account_records = $INDEX_MAX + max_database_transactions = 1024 [consensus] vote = false [development] diff --git a/src/flamenco/snapshot/fd_snapshot_http.c b/src/flamenco/snapshot/fd_snapshot_http.c index bf62bee2e6..4ddb9a6342 100644 --- a/src/flamenco/snapshot/fd_snapshot_http.c +++ 
b/src/flamenco/snapshot/fd_snapshot_http.c @@ -122,14 +122,18 @@ fd_snapshot_http_new( void * mem, return this; } -static void +void fd_snapshot_http_cleanup_fds( fd_snapshot_http_t * this ) { if( this->snapshot_fd!=-1 ) { - close( this->snapshot_fd ); + if( FD_UNLIKELY( close( this->snapshot_fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } this->snapshot_fd = -1; } if( this->socket_fd!=-1 ) { - close( this->socket_fd ); + if( FD_UNLIKELY( close( this->socket_fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } this->socket_fd = -1; } } @@ -192,6 +196,18 @@ fd_snapshot_http_init( fd_snapshot_http_t * this ) { return 0; } +/* for http tile use */ +void +fd_snapshot_http_privileged_init( fd_snapshot_http_t * this ) { + fd_snapshot_http_init( this ); + + /* open snapshot fd for writing to snapshot file */ + this->snapshot_fd = open( this->snapshot_path, O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR ); + if( this->snapshot_fd<0 ) { + FD_LOG_ERR(( "open(%s) failed (%d-%s)", this->snapshot_path, errno, fd_io_strerror( errno ) )); + } +} + /* fd_snapshot_http_req writes out the request. */ static int @@ -652,6 +668,8 @@ fd_io_istream_snapshot_http_read( void * _this, return fd_snapshot_http_dl( this, dst, dst_max, dst_sz ); case FD_SNAPSHOT_HTTP_STATE_READ: return fd_snapshot_http_read( this, dst, dst_max, dst_sz ); + case FD_SNAPSHOT_HTTP_STATE_DONE: + return 1; } /* Not yet ready to read at this point. 
*/ diff --git a/src/flamenco/snapshot/fd_snapshot_http.h b/src/flamenco/snapshot/fd_snapshot_http.h index 20b36aeea4..481e27b80d 100644 --- a/src/flamenco/snapshot/fd_snapshot_http.h +++ b/src/flamenco/snapshot/fd_snapshot_http.h @@ -98,6 +98,19 @@ struct fd_snapshot_http { typedef struct fd_snapshot_http fd_snapshot_http_t; +FD_FN_PURE static inline ulong +fd_snapshot_http_align( void ) { + return fd_ulong_max( alignof(fd_snapshot_http_t), alignof(fd_snapshot_name_t) ); +} + +FD_FN_PURE static inline ulong +fd_snapshot_http_footprint( void ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_snapshot_http_t), sizeof(fd_snapshot_http_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_snapshot_name_t), sizeof(fd_snapshot_name_t) ); + return FD_LAYOUT_FINI( l, fd_snapshot_http_align() ); +} + fd_snapshot_http_t * fd_snapshot_http_new( void * mem, const char * dst_str, @@ -106,6 +119,9 @@ fd_snapshot_http_new( void * mem, const char * snapshot_dir, fd_snapshot_name_t * name_out ); +void +fd_snapshot_http_privileged_init( fd_snapshot_http_t * this ); + void * fd_snapshot_http_delete( fd_snapshot_http_t * this ); @@ -142,6 +158,9 @@ fd_io_istream_snapshot_http_virtual( fd_snapshot_http_t * this ) { }; } +void +fd_snapshot_http_cleanup_fds( fd_snapshot_http_t * this ); + FD_PROTOTYPES_END #endif /* HEADER_fd_src_flamenco_snapshot_fd_snapshot_http_h */ diff --git a/src/funk/fd_funk.c b/src/funk/fd_funk.c index 0ca50cee87..0700e7aae3 100644 --- a/src/funk/fd_funk.c +++ b/src/funk/fd_funk.c @@ -132,8 +132,8 @@ fd_funk_new( void * shmem, } fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ) { +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ) { if( FD_UNLIKELY( !shfunk ) ) { FD_LOG_WARNING(( "NULL shfunk" )); return NULL; diff --git a/src/funk/fd_funk.h b/src/funk/fd_funk.h index 3a13a0ade3..140003634f 100644 --- a/src/funk/fd_funk.h +++ b/src/funk/fd_funk.h @@ -327,8 +327,8 @@ fd_funk_new( void * shmem, (joins are local to a thread group). 
*/ fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ); +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ); /* fd_funk_leave leaves a funk join. Returns the memory region used for join on success (caller has ownership on return and the caller is no diff --git a/src/funk/fd_funk_base.h b/src/funk/fd_funk_base.h index 3d3decc5ac..068616fc5b 100644 --- a/src/funk/fd_funk_base.h +++ b/src/funk/fd_funk_base.h @@ -162,6 +162,24 @@ fd_xxh3_mix16b( ulong i0, ulong i1, return fd_xxh3_mul128_fold64( i0 ^ (s0 + seed), i1 ^ (s1 - seed) ); } +FD_FN_PURE static inline ulong +fd_funk_rec_key_hash1( uchar const key[ 32 ], + ulong rec_type, + ulong seed ) { + seed ^= rec_type; + ulong k0 = FD_LOAD( ulong, key+ 0 ); + ulong k1 = FD_LOAD( ulong, key+ 8 ); + ulong k2 = FD_LOAD( ulong, key+16 ); + ulong k3 = FD_LOAD( ulong, key+24 ); + ulong acc = 32 * 0x9E3779B185EBCA87ULL; + acc += fd_xxh3_mix16b( k0, k1, 0xbe4ba423396cfeb8UL, 0x1cad21f72c81017cUL, seed ); + acc += fd_xxh3_mix16b( k2, k3, 0xdb979083e96dd4deUL, 0x1f67b3b7a4a44072UL, seed ); + acc = acc ^ (acc >> 37); + acc *= 0x165667919E3779F9ULL; + acc = acc ^ (acc >> 32); + return acc; +} + FD_FN_PURE static inline ulong fd_funk_rec_key_hash( fd_funk_rec_key_t const * k, ulong seed ) { @@ -182,13 +200,22 @@ fd_funk_rec_key_hash( fd_funk_rec_key_t const * k, FIXME This version is vulnerable to HashDoS */ +FD_FN_PURE static inline ulong +fd_funk_rec_key_hash1( uchar const key[ 32 ], + ulong rec_type, + ulong seed ) { + seed ^= rec_type; + /* tons of ILP */ + return (fd_ulong_hash( seed ^ (1UL<<0) ^ FD_LOAD( ulong, key+ 0 ) ) ^ + fd_ulong_hash( seed ^ (1UL<<1) ^ FD_LOAD( ulong, key+ 8 ) ) ) ^ + (fd_ulong_hash( seed ^ (1UL<<2) ^ FD_LOAD( ulong, key+16 ) ) ^ + fd_ulong_hash( seed ^ (1UL<<3) ^ FD_LOAD( ulong, key+24 ) ) ); +} + FD_FN_PURE static inline ulong fd_funk_rec_key_hash( fd_funk_rec_key_t const * k, ulong seed ) { - seed ^= k->ul[4]; - /* tons of ILP */ - return (fd_ulong_hash( seed ^ (1UL<<0) ^ k->ul[0] ) ^ 
fd_ulong_hash( seed ^ (1UL<<1) ^ k->ul[1] ) ) ^ - (fd_ulong_hash( seed ^ (1UL<<2) ^ k->ul[2] ) ^ fd_ulong_hash( seed ^ (1UL<<3) ^ k->ul[3] ) ); + return fd_funk_rec_key_hash1( k->uc, k->ul[4], seed ); } #endif /* FD_HAS_INT128 */ diff --git a/src/funk/fd_funk_filemap.c b/src/funk/fd_funk_filemap.c index 1ff9405e9e..61ffd5bda2 100644 --- a/src/funk/fd_funk_filemap.c +++ b/src/funk/fd_funk_filemap.c @@ -180,7 +180,7 @@ fd_funk_open_file( void * ljoin, ulong part_max = fd_wksp_part_max_est( total_sz, 1U<<18U ); if( FD_UNLIKELY( !part_max ) ) { - FD_LOG_WARNING(( "fd_wksp_part_max_est(%lu,64KiB) failed", total_sz )); + FD_LOG_WARNING(( "fd_wksp_part_max_est(%lu,256KiB) failed", total_sz )); munmap( shmem, total_sz ); close( fd ); return NULL;