From 44ee4e58b5bfd854781ff4683d0424a5a1a84d08 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Fri, 25 Apr 2025 19:59:44 +0000 Subject: [PATCH 01/34] restore 2.0: add FileRd tile --- src/app/firedancer-dev/main.c | 8 +- src/disco/stem/fd_stem.h | 2 +- src/disco/topo/fd_topo.h | 4 + src/disco/topo/fd_topob.c | 4 +- src/disco/topo/fd_topob.h | 2 +- src/discof/restore/Local.mk | 1 + src/discof/restore/README.md | 35 +++ src/discof/restore/fd_filerd_tile.c | 347 +++++++++++++++++++++++++++ src/discof/restore/fd_restore_base.h | 62 +++++ 9 files changed, 456 insertions(+), 9 deletions(-) create mode 100644 src/discof/restore/Local.mk create mode 100644 src/discof/restore/README.md create mode 100644 src/discof/restore/fd_filerd_tile.c create mode 100644 src/discof/restore/fd_restore_base.h diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 5d73968776..c03e46344f 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -95,9 +95,7 @@ extern fd_topo_run_tile_t fd_tile_archiver_feeder; extern fd_topo_run_tile_t fd_tile_archiver_writer; extern fd_topo_run_tile_t fd_tile_archiver_playback; -extern fd_topo_run_tile_t fd_tile_bencho; -extern fd_topo_run_tile_t fd_tile_benchg; -extern fd_topo_run_tile_t fd_tile_benchs; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -131,9 +129,7 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_archiver_feeder, &fd_tile_archiver_writer, &fd_tile_archiver_playback, - &fd_tile_bencho, - &fd_tile_benchg, - &fd_tile_benchs, + &fd_tile_snapshot_restore_FileRd, NULL, }; diff --git a/src/disco/stem/fd_stem.h b/src/disco/stem/fd_stem.h index 5ed398bd2b..9fd290a4a6 100644 --- a/src/disco/stem/fd_stem.h +++ b/src/disco/stem/fd_stem.h @@ -8,7 +8,7 @@ struct fd_stem_context { fd_frag_meta_t ** mcaches; ulong * seqs; - ulong * depths; + ulong const * depths; ulong * cr_avail; ulong cr_decrement_amount; diff --git a/src/disco/topo/fd_topo.h 
b/src/disco/topo/fd_topo.h index 3c08d34dde..96831ca240 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -428,6 +428,10 @@ typedef struct { int archive_fd; } archiver; + struct { + char file_path[ PATH_MAX ]; + } filerd; + }; } fd_topo_tile_t; diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index ca2654d384..b5c889eff3 100644 --- a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -64,7 +64,7 @@ fd_topob_obj( fd_topo_t * topo, return obj; } -void +fd_topo_link_t * fd_topob_link( fd_topo_t * topo, char const * link_name, char const * wksp_name, @@ -100,6 +100,8 @@ fd_topob_link( fd_topo_t * topo, FD_TEST( fd_pod_insertf_ulong( topo->props, mtu, "obj.%lu.mtu", obj->id ) ); } topo->link_cnt++; + + return link; } void diff --git a/src/disco/topo/fd_topob.h b/src/disco/topo/fd_topob.h index cbb6d100b0..162237d18d 100644 --- a/src/disco/topo/fd_topob.h +++ b/src/disco/topo/fd_topob.h @@ -72,7 +72,7 @@ fd_topob_tile_uses( fd_topo_t * topo, can have no backing data buffer, a dcache, or a reassembly buffer behind it. */ -void +fd_topo_link_t * fd_topob_link( fd_topo_t * topo, char const * link_name, char const * wksp_name, diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk new file mode 100644 index 0000000000..da3369e6a5 --- /dev/null +++ b/src/discof/restore/Local.mk @@ -0,0 +1 @@ +$(call add-objs,fd_filerd_tile,fd_discof) diff --git a/src/discof/restore/README.md b/src/discof/restore/README.md new file mode 100644 index 0000000000..db0be4decc --- /dev/null +++ b/src/discof/restore/README.md @@ -0,0 +1,35 @@ +# Snapshot Restore + +## Stream link conventions + +Various snapshot components use byte streams, not packet streams. + +These require custom conventions. + +**Stream fragment descriptors** + +Byte streams use `fd_frag_stream_meta_t` (defined in `fd_restore_base.h`). 
+ +These have the following changes: +- `chunk` is replaced by `goff` and `loff`, which are 64-bit offsets + describing the stream offset and dcache offset respectively +- `tsorig` / `tspub` are removed (latency is less relevant) +- `sig` is removed (cannot filter without looking at stream data) +- `sz` is widened to 32 bits. + +`**Dcache allocations** + +Payloads in stream dcaches are unaligned. Payloads are addressed with +uncompressed byte offsets relative to the workspace start. + +(Compare this to the usual compact packet dcaches, which use 64 byte +aligned chunks with compressed addressing.) + +**Stream backpressure** + +Byte streams naturally require a reliable transport. + +Consumers periodically publish their progress in `fseq`. +- `fseq[0]` is the lowest sequence number not yet consumed (standard) +- `fseq[1]` is the stream offset of the next byte not yet consumed +` \ No newline at end of file diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c new file mode 100644 index 0000000000..7c2574a193 --- /dev/null +++ b/src/discof/restore/fd_filerd_tile.c @@ -0,0 +1,347 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include +#include +#include +#include + +#define NAME "FileRd" + +struct fd_filerd_tile { + int fd; + + uchar * buf; /* dcache */ + ulong buf_off; + ulong buf_sz; + ulong goff; +}; + +typedef struct fd_filerd_tile fd_filerd_tile_t; + +static ulong +scratch_align( void ) { + return alignof(fd_filerd_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_filerd_tile_t); +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + fd_memset( ctx, 0, sizeof(fd_filerd_tile_t) ); + + if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); + 
if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + + ctx->fd = open( tile->filerd.file_path, O_RDONLY|O_CLOEXEC ); + if( FD_UNLIKELY( ctx->fd<0 ) ) FD_LOG_ERR(( "open() failed (%i-%s)", errno, fd_io_strerror( errno ) )); +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + + void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] ].dcache_obj_id ) ); + FD_TEST( out_dcache ); + + ctx->buf = out_dcache; + ctx->buf_off = 0UL; + ctx->buf_sz = fd_dcache_data_sz( out_dcache ); + ctx->goff = 0UL; +} + +static void +during_housekeeping( fd_filerd_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_filerd_tile_t * ctx ) { + (void)ctx; +} + +static void +close_file( fd_filerd_tile_t * ctx ) { + if( FD_UNLIKELY( ctx->fd<0 ) ) return; + if( FD_UNLIKELY( close( ctx->fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } + ctx->fd = -1; +} + +static void +after_credit( fd_filerd_tile_t * ctx, + fd_frag_stream_meta_t * out_mcache, + ulong const out_depth, + ulong * restrict out_seq, + ulong * restrict cr_frag_avail, + ulong * restrict cr_byte_avail, + int * restrict charge_busy_after ) { + /* Assumes *cr_frag_avail>=2 */ + + int fd = ctx->fd; + if( FD_UNLIKELY( fd<0 ) ) return; + + if( FD_UNLIKELY( ctx->buf_off >= ctx->buf_sz ) ) { + FD_LOG_CRIT(( "Buffer overflow (buf_off=%lu buf_sz=%lu)", ctx->buf_off, ctx->buf_sz )); + } + + ulong const iov0_sz = fd_ulong_min( *cr_byte_avail, ctx->buf_sz - ctx->buf_off ); + struct iovec iov[2]; + iov[ 0 ].iov_base = ctx->buf + ctx->buf_off; + iov[ 0 ].iov_len = iov0_sz; + iov[ 1 ].iov_base = ctx->buf; + iov[ 1 ].iov_len = fd_ulong_min( (ulong)fd_long_max( 0L, (long)*cr_byte_avail-(long)iov0_sz ), ctx->buf_off ); + + long res = readv( fd, iov, 2 ); + if( FD_UNLIKELY( res<=0L ) ) { + 
if( FD_UNLIKELY( res==0 ) ) { + FD_LOG_INFO(( "Reached end of file" )); + close_file( ctx ); + return; + } + if( FD_LIKELY( errno==EAGAIN ) ) return; + FD_LOG_ERR(( "readv() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + /* aborts app */ + } + + ulong sz = (ulong)res; + cr_byte_avail[0] -= sz; + *charge_busy_after = 1; + + ulong frag0_sz = fd_ulong_min( iov0_sz, sz ); + ulong frag1_sz = (ulong)res - frag0_sz; + + fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, ctx->buf_off, frag0_sz ); + out_seq[0] = fd_seq_inc( out_seq[0], 1UL ); + cr_frag_avail[0]--; + ctx->goff += frag0_sz; + ctx->buf_off += frag0_sz; + if( ctx->buf_off >= ctx->buf_sz ) ctx->buf_off = 0UL; /* cmov */ + + if( FD_UNLIKELY( frag1_sz ) ) { + fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, 0UL, frag1_sz ); + out_seq[0] = fd_seq_inc( out_seq[0], 1UL ); + cr_frag_avail[0]--; + ctx->goff += frag1_sz; + ctx->buf_off += frag1_sz; + } +} + +/* run/run1 are a custom run loop based on fd_stem.c. 
*/ + +__attribute__((noinline)) static void +fd_filerd_run1( + fd_filerd_tile_t * ctx, + fd_frag_stream_meta_t * out_mcache, + void * out_dcache, + ulong cons_cnt, + ushort * restrict event_map, /* cnt=1+cons_cnt */ + ulong ** restrict cons_fseq, /* cnt= cons_cnt points to each consumer's fseq */ + ulong volatile ** restrict cons_slow, /* cnt= cons_cnt points to 'slow' metrics */ + ulong * restrict cons_seq, /* cnt=2*cons_cnt cache of recent fseq observations */ + long lazy, + fd_rng_t * rng +) { + + /* out flow control state */ + ulong cr_byte_avail; /* byte burst quota */ + ulong cr_frag_avail; /* frag burst quota */ + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; /* min number of ticks between a housekeeping event */ + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* out frag stream init */ + + cr_byte_avail = 0UL; + cr_frag_avail = 0UL; + + ulong const out_depth = fd_mcache_depth( out_mcache->f ); + ulong out_seq = 0UL; + + ulong const out_bufsz = fd_dcache_data_sz( out_dcache ); + + ulong const cr_byte_max = out_bufsz; + ulong const cr_frag_max = out_depth; + + ulong const burst_byte = 512UL; /* don't produce frags smaller than this */ + ulong const burst_frag = 2UL; + + for( ulong cons_idx=0UL; cons_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxgoff, cons_seq[ 2*cons_idx+1 ] ), 0L ), 0L ); + slowest_cons = fd_ulong_if( cons_cr_byte_avail=event_cnt ) ) { + event_seq = 0UL; + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); +
housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. */ + + if( FD_UNLIKELY( cr_byte_availlinks[ tile->out_link_id[ 0 ] ].mcache ); + FD_TEST( out_mcache ); + + ulong reliable_cons_cnt = 0UL; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[0] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cnttile_obj_id ); + ushort event_map[ 1+reliable_cons_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ 2*reliable_cons_cnt ]; + fd_filerd_run1( ctx, out_mcache, ctx->buf, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, 0L, rng ); +} + +fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd = { + .name = NAME, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, + .run = fd_filerd_run, +}; + +#undef NAME diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h new file mode 100644 index 0000000000..de6b489842 --- /dev/null +++ b/src/discof/restore/fd_restore_base.h @@ -0,0 +1,62 @@ +#include "../../tango/mcache/fd_mcache.h" + +/* fd_frag_stream_meta_t is a variation of fd_frag_meta_t optimized for + stream I/O. 
*/ + +union fd_frag_stream_meta { + + struct { + + ulong seq; /* frag sequence number */ + ulong goff; /* global offset */ + + uint sz; + uint unused; + ulong loff; /* dcache offset */ + + }; + + fd_frag_meta_t f[1]; + +}; + +typedef union fd_frag_stream_meta fd_frag_stream_meta_t; + +FD_PROTOTYPES_BEGIN + +#if FD_HAS_SSE + +FD_FN_CONST static inline __m128i +fd_frag_stream_meta_sse0( ulong seq, + ulong goff ) { + return _mm_set_epi64x( (long)goff, (long)seq ); +} + +FD_FN_CONST static inline __m128i +fd_frag_stream_meta_sse1( ulong sz, /* Assumed 32-bit */ + ulong loff ) { + return _mm_set_epi64x( (long)loff, (long)(sz) ); +} + +#endif /* FD_HAS_SSE */ + +static inline void +fd_mcache_publish_stream( fd_frag_stream_meta_t * mcache, + ulong depth, + ulong seq, + ulong goff, + ulong loff, + ulong sz ) { + fd_frag_stream_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + meta->goff = goff; + meta->loff = loff; + meta->sz = (uint)sz; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + +FD_PROTOTYPES_END From 283c736b2721686196c06dd339b5f66b4330be9a Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 5 May 2025 16:01:41 +0000 Subject: [PATCH 02/34] wip --- contrib/test/run_fd_shred_cap.sh | 11 +- contrib/test/test_firedancer_leader.sh | 11 +- snapload.toml | 7 + src/app/firedancer-dev/Local.mk | 1 + .../firedancer-dev/commands/snapshot_load.c | 174 +++ src/app/firedancer-dev/main.c | 6 + src/app/firedancer/commands/configure/funk.c | 59 + src/app/firedancer/config/default.toml | 90 +- src/app/firedancer/topology.c | 20 +- src/app/shared/boot/fd_boot.c | 1 + src/app/shared/fd_action.h | 10 +- src/app/shared/fd_config.h | 18 +- src/app/shared/fd_config_parse.c | 10 +- src/disco/bundle/fd_bundle_client.c | 277 ++++ src/disco/bundle/test_bundle_client.c | 76 ++ src/disco/stem/fd_stem.c | 4 +- src/disco/topo/fd_topo.h | 4 + src/disco/topo/fd_topob.c | 2 + 
src/discof/restart/test/restart_fd.sh | 11 +- src/discof/restore/Local.mk | 2 + src/discof/restore/README.md | 58 +- src/discof/restore/fd_actalc_tile.c | 472 +++++++ src/discof/restore/fd_actcpy_tile.c | 0 src/discof/restore/fd_filerd_tile.c | 81 +- src/discof/restore/fd_restore_base.h | 69 +- src/discof/restore/fd_restore_manifest.c | 261 ++++ src/discof/restore/fd_snapin_tile.c | 1188 +++++++++++++++++ src/discof/restore/fd_unzstd_tile.c | 0 src/discof/restore/test_snapin_tile.c | 20 + src/funk/fd_funk.c | 4 +- src/funk/fd_funk.h | 4 +- src/funk/fd_funk_filemap.h | 111 +- src/util/archive/fd_tar.h | 24 +- 33 files changed, 2900 insertions(+), 186 deletions(-) create mode 100644 snapload.toml create mode 100644 src/app/firedancer-dev/commands/snapshot_load.c create mode 100644 src/app/firedancer/commands/configure/funk.c create mode 100644 src/disco/bundle/fd_bundle_client.c create mode 100644 src/disco/bundle/test_bundle_client.c create mode 100644 src/discof/restore/fd_actalc_tile.c create mode 100644 src/discof/restore/fd_actcpy_tile.c create mode 100644 src/discof/restore/fd_restore_manifest.c create mode 100644 src/discof/restore/fd_snapin_tile.c create mode 100644 src/discof/restore/fd_unzstd_tile.c create mode 100644 src/discof/restore/test_snapin_tile.c diff --git a/contrib/test/run_fd_shred_cap.sh b/contrib/test/run_fd_shred_cap.sh index bdf888276b..4ae7856b17 100755 --- a/contrib/test/run_fd_shred_cap.sh +++ b/contrib/test/run_fd_shred_cap.sh @@ -90,16 +90,19 @@ echo " idx_max = 8192 alloc_max = 1073741824 file = \"$DATA_DIR/shredcap_testnet.blockstore\" +[funk] + max_account_records = 150000000 + heap_size_gb = 100 + max_database_transactions = 2000 + [funk.filemap] + enabled = true + path = \"$DATA_DIR/shredcap_testnet.funk\" [tiles] [tiles.shred] max_pending_shred_sets = 16384 [tiles.replay] snapshot = \"$SNAPSHOT\" incremental = \"$INCREMENTAL\" - funk_sz_gb = 100 - funk_rec_max = 150000000 - funk_txn_max = 2000 - funk_file = 
\"$DATA_DIR/shredcap_testnet.funk\" [tiles.store_int] shred_cap_replay = \"$SHREDCAP\" shred_cap_end_slot = 317018450 diff --git a/contrib/test/test_firedancer_leader.sh b/contrib/test/test_firedancer_leader.sh index 282f354bcb..141395cfc4 100755 --- a/contrib/test/test_firedancer_leader.sh +++ b/contrib/test/test_firedancer_leader.sh @@ -50,10 +50,6 @@ echo " [tiles.replay] capture = \"firedancer-dev.solcap\" snapshot = \"$FULL_SNAPSHOT\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 - funk_file = \"/tmp/localnet.funk\" cluster_version = \"2.0.14\" [tiles.gui] enabled = false @@ -72,6 +68,13 @@ echo " txn_max = 1024 alloc_max = 10737418240 file = \"/tmp/localnet.blockstore\" +[funk] + max_account_records = 10000000 + heap_size_gb = 32 + max_database_transactions = 1024 + [funk.filemap] + enabled = true + path = \"/tmp/localnet.funk\" [log] path = \"firedancer-dev.log\" level_stderr = \"INFO\" diff --git a/snapload.toml b/snapload.toml new file mode 100644 index 0000000000..38945b3cef --- /dev/null +++ b/snapload.toml @@ -0,0 +1,7 @@ +[hugetlbfs] +max_page_size = "huge" + +[log] + level_stderr = "INFO" + level_logfile = "INFO" + path = "-" diff --git a/src/app/firedancer-dev/Local.mk b/src/app/firedancer-dev/Local.mk index 7cbdc4d078..ec14ed284e 100644 --- a/src/app/firedancer-dev/Local.mk +++ b/src/app/firedancer-dev/Local.mk @@ -12,6 +12,7 @@ $(call add-objs,commands/gossip,fd_firedancer_dev) $(call add-objs,commands/bench,fd_firedancer_dev) $(call add-objs,commands/dev,fd_firedancer_dev) $(call add-objs,commands/sim,fd_firedancer_dev) +$(call add-objs,commands/snapshot_load,fd_firedancer_dev) $(call make-bin,firedancer-dev,main,fd_firedancer_dev fd_firedancer fddev_shared fdctl_shared fd_discof fd_disco fd_choreo fd_flamenco fd_funk fd_quic fd_tls fd_reedsol fd_ballet fd_waltz fd_tango fd_util firedancer_version, $(SECP256K1_LIBS)) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c 
new file mode 100644 index 0000000000..2d0582e29b --- /dev/null +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -0,0 +1,174 @@ +#include "../../shared/fd_config.h" +#include "../../shared/commands/configure/configure.h" +#include "../../shared/commands/run/run.h" +#include "../../../disco/metrics/fd_metrics.h" +#include "../../../disco/topo/fd_topob.h" +#include "../../../disco/topo/fd_pod_format.h" +#include "../../../util/tile/fd_tile_private.h" +#include +#include +#include + +#define NAME "snapshot-load" + +extern fd_topo_obj_callbacks_t * CALLBACKS[]; + +fd_topo_run_tile_t +fdctl_tile_run( fd_topo_tile_t const * tile ); + +static void +snapshot_load_topo( config_t * config, + args_t const * args ) { + fd_topo_t * topo = &config->topo; + fd_topob_new( &config->topo, config->name ); + topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size ); + + static ushort tile_to_cpu[ FD_TILE_MAX ] = {0}; + if( args->tile_cpus[0] ) { + ulong cpu_cnt = fd_tile_private_cpus_parse( args->tile_cpus, tile_to_cpu ); + if( FD_UNLIKELY( cpu_cnt<4UL ) ) FD_LOG_ERR(( "--tile-cpus specifies %lu CPUs, but need at least 4", cpu_cnt )); + } + + fd_topob_wksp( topo, "metric_in" ); + fd_topob_wksp( topo, "metric" ); + fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); + if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) + FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); + metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + + fd_topob_wksp( topo, "FileRd" ); + fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); + fd_memcpy( filerd_tile->filerd.file_path, args->snapshot_load.snapshot_path, PATH_MAX ); + FD_STATIC_ASSERT( 
sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); + FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); + + fd_topob_wksp( topo, "SnapIn" ); + fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[2], 0, 0 ); + snapin_tile->snapin.scratch_sz = (3UL<<30); + + fd_topob_wksp( topo, "ActAlc" ); + fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[3], 0, 0 ); + (void)actalc_tile; + + fd_topob_wksp( topo, "snap_stream" ); + fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); + fd_topo_obj_t * snapin_dcache = fd_topob_obj( topo, "dcache", "snap_stream" ); + snapin_link->dcache_obj_id = snapin_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); + fd_topob_tile_out ( topo, "FileRd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + fd_topob_wksp( topo, "snap_frags" ); + fd_topob_link( topo, "snap_frags", "snap_frags", 65536UL, 0UL, 0UL ); + fd_topob_tile_out( topo, "SnapIn", 0UL, "snap_frags", 0UL ); + fd_topob_tile_in ( topo, "ActAlc", 0UL, "metric_in", "snap_frags", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + + fd_topob_wksp( topo, "snap_descs" ); + fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_unused = 1; + fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); + + if( !args->tile_cpus[0] ) { + fd_topob_auto_layout( topo, 0 ); + } + fd_topob_finish( topo, CALLBACKS ); + fd_topo_print_log( /* stdout */ 1, topo ); +} + +static void 
+snapshot_load_cmd_args( int * pargc, + char *** pargv, + args_t * args ) { + char const * tile_cpus = fd_env_strip_cmdline_cstr( pargc, pargv, "--tile-cpus", "FD_TILE_CPUS", NULL ); + char const * snapshot_file = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot", NULL, NULL ); + + if( tile_cpus ) { + ulong tile_cpus_strlen = strlen( tile_cpus ); + if( FD_UNLIKELY( tile_cpus_strlen>=sizeof(args->tile_cpus) ) ) FD_LOG_ERR(( "--tile-cpus: flag too long" )); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->tile_cpus ), tile_cpus, tile_cpus_strlen ) ); + } + + if( FD_UNLIKELY( !snapshot_file ) ) FD_LOG_ERR(( "Missing --snapshot flag" )); + ulong snapshot_file_strlen = strlen( snapshot_file ); + if( FD_UNLIKELY( snapshot_file_strlen>=sizeof(args->snapshot_load.snapshot_path) ) ) FD_LOG_ERR(( "--snapshot: path too long" )); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->snapshot_load.snapshot_path ), snapshot_file, snapshot_file_strlen ) ); +} + +static void +snapshot_load_cmd_perm( args_t * args, + fd_cap_chk_t * chk, + config_t const * config ) { + (void)args; + ulong mlock_limit = fd_topo_mlock_max_tile( &config->topo ); + fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_MEMLOCK, mlock_limit, "call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`" ); + fd_cap_chk_raise_rlimit( chk, NAME, RLIMIT_NICE, 40, "call `setpriority(2)` to increase thread priorities" ); +} + +static void +snapshot_load_cmd_fn( args_t * args, + config_t * config ) { + snapshot_load_topo( config, args ); + fd_topo_t * topo = &config->topo; + + configure_stage( &fd_cfg_stage_hugetlbfs, CONFIGURE_CMD_INIT, config ); + initialize_workspaces( config ); + initialize_stacks( config ); + fd_topo_join_workspaces( topo, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topo_fill( topo ); + double tick_per_ns = fd_tempo_tick_per_ns( NULL ); + double ns_per_tick = 1.0/tick_per_ns; + fd_topo_run_single_process( topo, 2, config->uid, config->gid, fdctl_tile_run, NULL ); 
+ + fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; + fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; + + ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; + ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); + ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); + ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); + + ulong goff_old = 0UL; + ulong file_rd_backp_old = 0UL; + ulong snap_in_wait_old = 0UL; + ulong acc_cnt_old = 0UL; + ulong frag_cnt_old = 0UL; + for(;;) { + sleep( 1 ); + + ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); + ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); + if( FD_UNLIKELY( filerd_status==2UL || snapin_status==2UL ) ) { + FD_LOG_NOTICE(( "Done" )); + break; + } + + ulong goff = FD_VOLATILE_CONST( snap_in_fseq[ 1 ] ); + ulong file_rd_backp = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); + ulong frag_cnt = FD_VOLATILE_CONST( snap_accs_sync[0] ); + ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); + FD_LOG_NOTICE(( "rate=%4.2g GB/s back=%3.0f%% parser_busy=%3.0f%% acc=%8.3g/s frag=%8.3g/s", + (double)( goff-goff_old )/1e9, + ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, + ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7, + (double)( acc_cnt -acc_cnt_old ), + (double)( frag_cnt-frag_cnt_old ) ) ); + goff_old = goff; + file_rd_backp_old = file_rd_backp; + snap_in_wait_old = snap_in_wait; + acc_cnt_old = acc_cnt; + frag_cnt_old 
= frag_cnt; + } + + FD_LOG_NOTICE(( "Loaded %g accounts", (double)FD_VOLATILE_CONST( snap_accs_sync[1] ) )); +} + +action_t fd_action_snapshot_load = { + .name = NAME, + .args = snapshot_load_cmd_args, + .perm = snapshot_load_cmd_perm, + .fn = snapshot_load_cmd_fn +}; diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index c03e46344f..866fc62a47 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -96,6 +96,8 @@ extern fd_topo_run_tile_t fd_tile_archiver_writer; extern fd_topo_run_tile_t fd_tile_archiver_playback; extern fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -130,6 +132,8 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_archiver_writer, &fd_tile_archiver_playback, &fd_tile_snapshot_restore_FileRd, + &fd_tile_snapshot_restore_SnapIn, + &fd_tile_snapshot_restore_ActAlc, NULL, }; @@ -151,6 +155,7 @@ extern action_t fd_action_help; extern action_t fd_action_load; extern action_t fd_action_pktgen; extern action_t fd_action_quic_trace; +extern action_t fd_action_snapshot_load; extern action_t fd_action_txn; extern action_t fd_action_wksp; extern action_t fd_action_gossip; @@ -174,6 +179,7 @@ action_t * ACTIONS[] = { &fd_action_flame, &fd_action_load, &fd_action_pktgen, + &fd_action_snapshot_load, &fd_action_quic_trace, &fd_action_txn, &fd_action_wksp, diff --git a/src/app/firedancer/commands/configure/funk.c b/src/app/firedancer/commands/configure/funk.c new file mode 100644 index 0000000000..ece80b79bb --- /dev/null +++ b/src/app/firedancer/commands/configure/funk.c @@ -0,0 +1,59 @@ +#include "../../../shared/commands/configure/configure.h" +#include "../../../../funk/fd_funk_filemap.h" + +#define NAME "funk" + +static int +enabled( config_t const * config ) { + (void)config; + return 1; +} + +static void +funk_init_file( config_t const * 
config ) { + +} + +static void +funk_init_mem( config_t const * config ) { + +} + +static void +init( config_t const * config ) { + if( config->firedancer.funk.filemap.enabled ) funk_init_file( config ); + else funk_init_mem ( config ); +} + +static void +fini( config_t const * config, + int pre_init ) { + (void)pre_init; +} + +static void +funk_check_file( config_t const * config ) { + fd_funk_open_file( funk, funk_path, 1UL, ) +} + +static void +funk_check_mem( config_t const * config ) { + +} + +static configure_result_t +check( config_t const * config ) { + if( config->firedancer.funk.filemap.enabled ) funk_check_file( config ); + else funk_check_mem ( config ); +} + +configure_stage_t fd_cfg_stage_funk = { + .name = NAME, + .always_recreate = 0, + .enabled = enabled, + .init = init, + .fini = fini, + .check = check, +}; + +#undef NAME diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 97976e9df6..899ec2c946 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -11,9 +11,9 @@ name = "fd1" # or as root, so that it can configure kernel bypass networking. Once # this configuration has been performed, the process will enter a highly # restrictive sandbox, drop all privileges, and switch to the user given -# here. When running the configuration steps of `fdctl configure` data -# will be permissioned so that is it writable for this user and not the -# user which is performing the configuration. +# here. When running the configuration steps of `firedancer configure` +# data will be permissioned so that it is writable for this user and not +# the user which is performing the configuration. # # Firedancer requires nothing from this user, and it should be as # minimally permissioned as is possible. It is suggested to run @@ -266,6 +266,54 @@ user = "" # entrypoint validator is using. expected_shred_version = 0 +# This section configures the "funk" account database.
Currently, funk +# stores all Solana accounts. In future versions of Firedancer, most +# accounts will be offloaded to the "groove" database. +[funk] + # The max amount of records that the funk instance can store. + # Each Solana account uses at least one record. Additional records + # are used for account changes that are not yet finalized by + # consensus (typically takes 13 seconds). + max_account_records = 10_000_000 + + # The size of the funk heap in gigabytes. This value must be large + # enough to store all Solana accounts uncompressed. + heap_size_gb = 32 + + # The max amount of concurrent database transactions. These are + # used to track conflicting versions of accounts until such + # conflicts are resolved by the consensus algorithm. (Not to be + # confused with Solana transactions). + # The validator uses one database transaction for each Solana block + # that is not yet finalized. It is not recommended to change this + # setting. + max_database_transactions = 1024 + + # If this option is set to 'false', `firedancer configure` may + # destroy a funk database and all its data (e.g. because config + # parameters changed, or `configure fini` requested). + # If set to 'true', running the `firedancer` command will never + # delete the funk database. + delete_protection = true + + # Optionally, a funk database can be memory-mapped from a regular + # file. This feature reduces performance significantly. It allows + # a validator to operate even when account set size exceeds the + # amount of available DRAM. This feature will be eventually + # replaced by the "groove" database, see below. + [funk.filemap] + # If set to "true", uses a regular file to store the funk + # database. Otherwise, uses shared memory via hugetlbfs/tmpfs. + enable = false + + # The absolute path to the funk file. Ignored if filemap is not + # enabled. + path = "/tmp/funk.bin" + +# This section configures the "groove" persistent account database. +# [groove] +# ...
+ # CPU cores in Firedancer are carefully managed. Where a typical # program lets the operating system scheduler determine which threads to # run on which cores and for how long, Firedancer overrides most of this @@ -616,7 +664,7 @@ user = "" # to use this mode in production. # # If the kernel/hardware does not support driver mode, then - # `fdctl run` will fail to start up with "operation not + # `firedancer run` will fail to start up with "operation not # supported". xdp_mode = "skb" @@ -628,7 +676,7 @@ user = "" # # Only works when XDP is provided by the network driver (see # the xdp_mode option above). If the kernel/hardware does not - # support zero copy, `fdctl run` will fail to start up with + # support zero copy, `firedancer run` will fail to start up with # "operation not supported". xdp_zero_copy = false @@ -987,9 +1035,6 @@ user = "" incremental = "" incremental_url = "" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = "1.18.0" # The metric tile receives metrics updates published from the rest @@ -1071,7 +1116,7 @@ user = "" # but might be too restrictive during development. # # In development, you can disable the sandbox for testing and - # debugging with the `--no-sandbox` argument to `fddev`. + # debugging with the `--no-sandbox` argument to `firedancer-dev`. sandbox = true # As part of the security sandboxing, Firedancer will run every tile @@ -1083,7 +1128,7 @@ user = "" # # This option cannot be enabled in production. In development, you # can also launch Firedancer as a single process for with the - # `--no-clone` argument to `fddev`. + # `--no-clone` argument to `firedancer-dev`. # # TODO: No clone should not be on by default, fix sandbox. no_clone = true @@ -1114,20 +1159,21 @@ user = "" # solution, they allow us to create a pair of virtual interfaces on # the machine which can route to each other. 
# - # If this configuration is enabled, `fdctl dev` will create two + # If this configuration is enabled, `firedancer-dev` will create two # network namespaces and a link between them to send packets back # and forth. When this option is enabled, the interface to bind to # in the net configuration must be one of the virtual interfaces. - # Firedancer will be launched by `fdctl` within that namespace. + # Firedancer will be launched by `firedancer` within that namespace. # # This is a development only configuration, network namespaces are # not suitable for production use due to performance overhead. In - # development when running with `fddev`, this can also be enabled - # with the `--netns` command line argument. + # development when running with `firedancer-dev`, this can also be + # enabled with the `--netns` command line argument. [development.netns] - # If enabled, `fdctl dev` will ensure the network namespaces are - # configured properly, can route to each other, and that running - # Firedancer will run it inside the namespace for interface0 + # If enabled, `firedancer-dev` will ensure the network + # namespaces are configured properly, can route to each other, + # and that running Firedancer will run it inside the namespace + # for interface0 enabled = false # Name of the first network namespace. @@ -1260,11 +1306,11 @@ user = "" # the rest of the network. larger_shred_limits_per_block = false - # The following options relate to the 'fddev pktgen' tool only. - # pktgen tests net tile transmit throughput by generating a flood of - # non-routable Ethernet frames. Only use 'fddev pktgen' if you know - # what you are doing! It can cause damage to your system and network - # infrastructure. + # The following options relate to the 'firedancer-dev pktgen' tool + # only. pktgen tests net tile transmit throughput by generating a + # flood of non-routable Ethernet frames. Only use 'firedancer-dev + # pktgen' if you know what you are doing!
It can cause damage to + # your system and network infrastructure. [development.pktgen] # Which cores to run the 'net' and 'pktgen' tiles on. # By default, two fixed cores will be used. To get reliable diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 7daed5e5a6..f0f8e6eb6d 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -838,10 +838,10 @@ fd_topo_initialize( config_t * config ) { tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; strncpy( tile->replay.capture, config->tiles.replay.capture, sizeof(tile->replay.capture) ); strncpy( tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(tile->replay.funk_checkpt) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + tile->replay.funk_rec_max = (uint)config->firedancer.funk.max_account_records; + tile->replay.funk_sz_gb = (uint)config->firedancer.funk.heap_size_gb; + tile->replay.funk_txn_max = (uint)config->firedancer.funk.max_database_transactions; + strncpy( tile->replay.funk_file, config->firedancer.funk.filemap.path, sizeof(tile->replay.funk_file) ); tile->replay.plugins_enabled = plugins_enabled; if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) @@ -895,11 +895,6 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "eqvoc" ) ) ) { strncpy( tile->eqvoc.identity_key_path, config->paths.identity_key, sizeof(tile->eqvoc.identity_key_path) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "rpcsrv" ) ) ) { - strncpy( tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(tile->replay.blockstore_file) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - tile->replay.funk_sz_gb = 
config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); tile->rpcserv.rpc_port = config->rpc.port; tile->rpcserv.tpu_port = config->tiles.quic.regular_transaction_listen_port; tile->rpcserv.tpu_ip_addr = config->net.ip_addr; @@ -908,7 +903,6 @@ fd_topo_initialize( config_t * config ) { tile->batch.full_interval = config->tiles.batch.full_interval; tile->batch.incremental_interval = config->tiles.batch.incremental_interval; strncpy( tile->batch.out_dir, config->tiles.batch.out_dir, sizeof(tile->batch.out_dir) ); - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "gui" ) ) ) { if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.gui.gui_listen_address, &tile->gui.listen_addr ) ) ) FD_LOG_ERR(( "failed to parse gui listen address `%s`", config->tiles.gui.gui_listen_address )); @@ -923,12 +917,12 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "plugin" ) ) ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "exec" ) ) ) { - strncpy( tile->exec.funk_file, config->tiles.replay.funk_file, sizeof(tile->exec.funk_file) ); + strncpy( tile->exec.funk_file, config->firedancer.funk.filemap.path, sizeof(tile->exec.funk_file) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "writer" ) ) ) { - strncpy( tile->writer.funk_file, config->tiles.replay.funk_file, sizeof(tile->writer.funk_file) ); + strncpy( tile->writer.funk_file, config->firedancer.funk.filemap.path, sizeof(tile->writer.funk_file) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "rstart" ) ) ) { tile->restart.in_wen_restart = config->tiles.restart.in_wen_restart; - strncpy( tile->restart.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + strncpy( tile->restart.funk_file, config->firedancer.funk.filemap.path, 
sizeof(tile->replay.funk_file) ); strncpy( tile->restart.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(tile->replay.tower_checkpt) ); strncpy( tile->restart.identity_key_path, config->paths.identity_key, sizeof(tile->restart.identity_key_path) ); fd_memcpy( tile->restart.genesis_hash, config->tiles.restart.genesis_hash, FD_BASE58_ENCODED_32_SZ ); diff --git a/src/app/shared/boot/fd_boot.c b/src/app/shared/boot/fd_boot.c index d0f60f4330..4fff2e3b17 100644 --- a/src/app/shared/boot/fd_boot.c +++ b/src/app/shared/boot/fd_boot.c @@ -145,6 +145,7 @@ fd_main_init( int * pargc, if( FD_LIKELY( !gid && setegid( config->gid ) ) ) FD_LOG_ERR(( "setegid() failed (%i-%s)", errno, fd_io_strerror( errno ) )); if( FD_LIKELY( !uid && seteuid( config->uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + if( 0==strcmp( config->log.path, "-" ) ) config->log.path[0] = '\0'; int boot_silent = config_fd>=0; fd_log_private_boot_custom( log_lock, 0UL, diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h index d21f8b4461..cd30022936 100644 --- a/src/app/shared/fd_action.h +++ b/src/app/shared/fd_action.h @@ -3,7 +3,9 @@ #include "fd_cap_chk.h" -union fdctl_args { +struct fdctl_args { + char tile_cpus[ 256UL ]; + struct { char tile_name[ 7UL ]; ulong kind_id; @@ -87,9 +89,13 @@ union fdctl_args { int event; int dump; /* whether the user requested --dump */ } quic_trace; + + struct { + char snapshot_path[ PATH_MAX ]; + } snapshot_load; }; -typedef union fdctl_args args_t; +typedef struct fdctl_args args_t; struct fd_action { char const * name; diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index a3523e5d81..504651d7a3 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -106,6 +106,16 @@ struct fd_configf { char restore[PATH_MAX]; } blockstore; + struct { + ulong max_account_records; + ulong heap_size_gb; + ulong max_database_transactions; + struct { + int enabled; + char path[ 
PATH_MAX ]; + } filemap; + } funk; + struct { uint exec_tile_count; /* TODO: redundant ish with bank tile cnt */ uint writer_tile_count; @@ -274,6 +284,10 @@ struct fd_config { char affinity[ AFFINITY_SZ ]; char fake_dst_ip[ 16 ]; } pktgen; + + struct { + char affinity[ AFFINITY_SZ ]; + } snapshot_load; } development; struct { @@ -353,10 +367,6 @@ struct fd_config { struct { char capture[ PATH_MAX ]; char funk_checkpt[ PATH_MAX ]; - uint funk_rec_max; - ulong funk_sz_gb; - ulong funk_txn_max; - char funk_file[ PATH_MAX ]; char genesis[ PATH_MAX ]; char incremental[ PATH_MAX ]; char incremental_url[ PATH_MAX ]; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index b683c7c6ea..3a77582c8d 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -311,6 +311,12 @@ fd_config_extract_podf( uchar * pod, CFG_POP ( bool, consensus.vote ); + CFG_POP ( ulong, funk.max_account_records ); + CFG_POP ( ulong, funk.heap_size_gb ); + CFG_POP ( ulong, funk.max_database_transactions ); + CFG_POP ( bool, funk.filemap.enabled ); + CFG_POP ( cstr, funk.filemap.path ); + return config; } @@ -422,10 +428,6 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( cstr, tiles.replay.capture ); CFG_POP ( cstr, tiles.replay.funk_checkpt ); - CFG_POP ( uint, tiles.replay.funk_rec_max ); - CFG_POP ( ulong, tiles.replay.funk_sz_gb ); - CFG_POP ( ulong, tiles.replay.funk_txn_max ); - CFG_POP ( cstr, tiles.replay.funk_file ); CFG_POP ( cstr, tiles.replay.genesis ); CFG_POP ( cstr, tiles.replay.incremental ); CFG_POP ( cstr, tiles.replay.incremental_url ); diff --git a/src/disco/bundle/fd_bundle_client.c b/src/disco/bundle/fd_bundle_client.c new file mode 100644 index 0000000000..3fb16c6422 --- /dev/null +++ b/src/disco/bundle/fd_bundle_client.c @@ -0,0 +1,277 @@ +#include "fd_bundle_client_private.h" +#include "../../waltz/h2/fd_h2_rbuf_ossl.h" +#include "../../waltz/grpc/fd_grpc.h" +#include +#include +#include + +/* Forward declarations */ + 
+static fd_h2_callbacks_t const fd_bundle_h2_callbacks; + +ulong +fd_bundle_client_align( void ) { + return alignof(fd_bundle_client_t); +} + +ulong +fd_bundle_client_footprint( void ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_bundle_client_t), sizeof(fd_bundle_client_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_bundle_client_bufs_t), sizeof(fd_bundle_client_bufs_t) ); + l = FD_LAYOUT_APPEND( l, fd_bundle_h2_stream_pool_align(), fd_bundle_h2_stream_pool_footprint( FD_BUNDLE_CLIENT_MAX_STREAMS ) ); + return FD_LAYOUT_FINI( l, fd_bundle_client_align() ); +} + +fd_bundle_client_t * +fd_bundle_client_new( void * mem, + SSL * ssl, + fd_bundle_client_metrics_t * metrics ) { + FD_SCRATCH_ALLOC_INIT( l, mem ); + void * client_mem = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_bundle_client_t), sizeof(fd_bundle_client_t) ); + void * bufs_mem = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_bundle_client_bufs_t), sizeof(fd_bundle_client_bufs_t) ); + void * stream_pool_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_bundle_h2_stream_pool_align(), fd_bundle_h2_stream_pool_footprint( FD_BUNDLE_CLIENT_MAX_STREAMS ) ); + FD_SCRATCH_ALLOC_FINI( l, fd_bundle_client_align() ); + + fd_bundle_client_t * client = client_mem; + fd_bundle_client_bufs_t * bufs = bufs_mem; + + fd_bundle_h2_stream_t * stream_pool = + fd_bundle_h2_stream_pool_join( fd_bundle_h2_stream_pool_new( stream_pool_mem, FD_BUNDLE_CLIENT_MAX_STREAMS ) ); + if( FD_UNLIKELY( !stream_pool ) ) FD_LOG_CRIT(( "Failed to create stream pool" )); /* unreachable */ + + *client = (fd_bundle_client_t){ + .ssl = ssl, + .stream_pool = stream_pool, + .nanopb_rx = bufs->nanopb_rx, + .nanopb_tx = bufs->nanopb_tx, + .frame_scratch = bufs->frame_scratch, + .metrics = metrics + }; + fd_h2_rbuf_init( client->frame_rx, bufs->frame_rx_buf, sizeof(bufs->frame_rx_buf) ); + fd_h2_rbuf_init( client->frame_tx, bufs->frame_tx_buf, sizeof(bufs->frame_tx_buf) ); + + fd_h2_conn_init_client( client->conn ); + client->conn->ctx = client; + + /* Don't 
memset bufs for better performance */ + + return client; +} + +void * +fd_bundle_client_delete( fd_bundle_client_t * client ) { + return client; +} + +static int +fd_ossl_log_error( char const * str, + ulong len, + void * ctx ) { + (void)ctx; + FD_LOG_WARNING(( "%.*s", (int)len, str )); + return 0; +} + +void +fd_bundle_client_rxtx( fd_bundle_client_t * client ) { + SSL * ssl = client->ssl; + if( FD_UNLIKELY( !client->ssl_hs_done ) ) { + int res = SSL_do_handshake( ssl ); + if( res<=0 ) { + int error = SSL_get_error( ssl, res ); + if( FD_LIKELY( error==SSL_ERROR_WANT_READ || error==SSL_ERROR_WANT_WRITE ) ) return; + ERR_print_errors_cb( fd_ossl_log_error, NULL ); + client->failed = 1; + return; + } else { + client->ssl_hs_done = 1; + } + } + + fd_h2_conn_t * conn = client->conn; + fd_h2_rbuf_ssl_read( client->frame_rx, ssl ); + if( FD_UNLIKELY( conn->flags ) ) fd_h2_tx_control( conn, client->frame_tx ); + fd_h2_rx( conn, client->frame_rx, client->frame_tx, client->frame_scratch, FD_BUNDLE_CLIENT_BUFSZ, &fd_bundle_h2_callbacks ); + fd_h2_rbuf_ssl_write( client->frame_tx, ssl ); +} + +/* fd_bundle_client_request continue attempts to write a request data + frame. 
*/ + +static int +fd_bundle_client_request_continue1( fd_bundle_client_t * client ) { + fd_h2_stream_t * stream = client->request_stream; + fd_h2_tx_op_copy( client->conn, stream, client->frame_tx, client->request_tx_op ); + if( FD_UNLIKELY( client->request_tx_op->chunk_sz ) ) return 0; + if( FD_UNLIKELY( stream->state != FD_H2_STREAM_STATE_CLOSING_TX ) ) return 0; + /* Request finished */ + client->request_stream = NULL; + return 1; +} + +static int +fd_bundle_client_request_continue( fd_bundle_client_t * client ) { + if( FD_UNLIKELY( client->conn->flags & FD_H2_CONN_FLAGS_DEAD ) ) return 0; + if( FD_UNLIKELY( !client->request_stream ) ) return 0; + if( FD_UNLIKELY( !client->request_tx_op->chunk_sz ) ) return 0; + return fd_bundle_client_request_continue1( client ); +} + +/* fd_bundle_client_stream_acquire grabs a new stream ID and a stream + object. */ + +static inline int +fd_bundle_client_stream_acquire_is_safe( fd_bundle_client_t * client ) { + /* Sufficient quota to start a stream? (Refuse if one more stream would exceed the peer's max_concurrent_streams) */ + if( FD_UNLIKELY( client->conn->stream_active_cnt[1]+1 > client->conn->peer_settings.max_concurrent_streams ) ) return 0; + + /* Free stream object available?
*/ + if( FD_UNLIKELY( !fd_bundle_h2_stream_pool_free( client->stream_pool ) ) ) return 0; + if( FD_UNLIKELY( client->stream_cnt >= FD_BUNDLE_CLIENT_MAX_STREAMS ) ) return 0; + + return 1; +} + +static fd_h2_stream_t * +fd_bundle_client_stream_acquire( fd_bundle_client_t * client ) { + if( FD_UNLIKELY( client->stream_cnt >= FD_BUNDLE_CLIENT_MAX_STREAMS ) ) { + FD_LOG_CRIT(( "stream pool exhausted" )); + } + + fd_h2_conn_t * conn = client->conn; + uint const stream_id = client->conn->rx_stream_next; + conn->rx_stream_next += 2U; + + fd_bundle_h2_stream_t * stream_node = fd_bundle_h2_stream_pool_ele_acquire( client->stream_pool ); + + fd_h2_stream_t * stream = fd_h2_stream_open( fd_h2_stream_init( &stream_node->s ), conn, stream_id ); + client->request_stream = stream; + client->stream_ids[ stream_id ] = stream_id; + client->stream_cnt++; + return stream; +} + +static void +fd_bundle_client_stream_release( fd_bundle_client_t * client, + fd_h2_stream_t * stream ) { + if( FD_UNLIKELY( !client->stream_cnt ) ) FD_LOG_CRIT(( "stream map corrupt" )); /* unreachable */ + + /* Deallocate tx_op */ + if( FD_UNLIKELY( stream == client->request_stream ) ) { + client->request_stream = NULL; + *client->request_tx_op = (fd_h2_tx_op_t){0}; + } + + /* Remove stream from map */ + int map_idx = -1; + for( int i=0UL; istream_ids[ i ] == stream->stream_id ) { + map_idx = i; + } + } + if( FD_UNLIKELY( map_idx<0 ) ) FD_LOG_CRIT(( "stream map corrupt" )); /* unreachable */ + if( (ulong)map_idx+1 < client->stream_cnt ) { + client->stream_ids[ map_idx ] = client->stream_ids[ client->stream_cnt-1 ]; + client->streams [ map_idx ] = client->streams [ client->stream_cnt-1 ]; + client->stream_cnt--; + } + + fd_bundle_h2_stream_t * stream_node = (void *)( (ulong)stream - offsetof(fd_bundle_h2_stream_t, s) ); + fd_bundle_h2_stream_pool_ele_release( client->stream_pool, stream_node ); +} + +int +fd_bundle_client_request_start( + fd_bundle_client_t * client, + char const * path, + ulong path_len, + 
pb_msgdesc_t const * fields, + void const * message, + char const * auth_token, + ulong auth_token_sz +) { + /* Sanity check conn */ + if( FD_UNLIKELY( client->conn->flags & FD_H2_CONN_FLAGS_DEAD ) ) return 0; + if( FD_UNLIKELY( !fd_h2_rbuf_is_empty( client->frame_tx ) ) ) return 0; + if( FD_UNLIKELY( !fd_bundle_client_stream_acquire_is_safe( client ) ) ) return 0; + + /* Encode message */ + FD_STATIC_ASSERT( sizeof((fd_bundle_client_bufs_t *)0)->nanopb_rx == sizeof(fd_grpc_hdr_t)+FD_BUNDLE_CLIENT_MSG_SZ_MAX, sz ); + uchar * proto_buf = client->nanopb_rx + sizeof(fd_grpc_hdr_t); + pb_ostream_t ostream = pb_ostream_from_buffer( proto_buf, FD_BUNDLE_CLIENT_MSG_SZ_MAX ); + if( FD_UNLIKELY( !pb_encode( &ostream, fields, message ) ) ) { + FD_LOG_WARNING(( "Failed to encode Protobuf message (%.*s). This is a bug (insufficient buffer space?)", (int)path_len, path )); + return 0; + } + ulong const serialized_sz = ostream.bytes_written; + + /* Create gRPC length prefix */ + fd_grpc_hdr_t hdr = { .compressed=0, .msg_sz=(uint)serialized_sz }; + memcpy( client->nanopb_rx, &hdr, sizeof(fd_grpc_hdr_t) ); + ulong const payload_sz = serialized_sz + sizeof(fd_grpc_hdr_t); + + /* Allocate stream descriptor */ + fd_h2_stream_t * stream = fd_bundle_client_stream_acquire( client ); + uint const stream_id = stream->stream_id; + + /* Write HTTP/2 request headers */ + fd_h2_tx_prepare( client->conn, client->frame_tx, FD_H2_FRAME_TYPE_HEADERS, FD_H2_FLAG_END_HEADERS, stream_id ); + fd_grpc_req_hdrs_t req_meta = { + .path = path, + .path_len = path_len, + .https = 1, /* bundle_client assumes TLS encryption for now */ + + .bearer_auth = auth_token, + .bearer_auth_len = auth_token_sz + }; + if( FD_UNLIKELY( !fd_grpc_h2_gen_request_hdrs( &req_meta, client->frame_tx ) ) ) { + FD_LOG_WARNING(( "Failed to generate gRPC request headers (%.*s). 
This is a bug", (int)path_len, path )); + return 0; + } + fd_h2_tx_commit( client->conn, client->frame_tx ); + + /* Queue request payload for send + (Protobuf message might have to be fragmented into multiple HTTP/2 + DATA frames if the client gets blocked) */ + fd_h2_tx_op_init( client->request_tx_op, client->nanopb_rx, payload_sz, FD_H2_FLAG_END_STREAM ); + fd_bundle_client_request_continue1( client ); + client->metrics->requests_sent++; + + FD_LOG_DEBUG(( "gRPC request path=%.*s sz=%lu", (int)path_len, path, serialized_sz )); + + return 1; +} + +/* A HTTP/2 flow control change might unblock a queued request send op */ + +void +fd_bundle_h2_window_update( fd_h2_conn_t * conn, + uint increment ) { + (void)increment; + fd_bundle_client_request_continue( conn->ctx ); +} + +void +fd_bundle_h2_stream_window_update( fd_h2_conn_t * conn, + fd_h2_stream_t * stream, + uint increment ) { + (void)stream; (void)increment; + fd_bundle_client_request_continue( conn->ctx ); +} + +/* fd_bundle_h2_callbacks specifies h2->bundle_client callbacks. + Stored in .rodata for security. Must be kept in sync with fd_h2 to + avoid NULL pointers. */ + +static fd_h2_callbacks_t const fd_bundle_h2_callbacks = { + .stream_create = fd_h2_noop_stream_create, + .stream_query = fd_bundle_h2_stream_query, + .conn_established = fd_h2_noop_conn_established, + .conn_final = fd_h2_noop_conn_final, + .headers = fd_bundle_h2_cb_headers, + .data = fd_bundle_h2_cb_data, + .rst_stream = fd_bundle_h2_rst_stream, + .window_update = fd_bundle_h2_window_update, + .stream_window_update = fd_bundle_h2_stream_window_update, +}; diff --git a/src/disco/bundle/test_bundle_client.c b/src/disco/bundle/test_bundle_client.c new file mode 100644 index 0000000000..853892ee5f --- /dev/null +++ b/src/disco/bundle/test_bundle_client.c @@ -0,0 +1,76 @@ +/* test_bundle_client.c creates a gRPC connection and fetches auth + tokens. 
*/ + +#include "fd_bundle_client.h" + +#include +#include +#include +#include +#include + +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); + if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL; + + char const * endpoint = fd_env_strip_cmdline_cstr ( &argc, &argv, "--endpoint", NULL, NULL ); + char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); + ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); + ulong numa_idx = fd_env_strip_cmdline_ulong( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx(cpu_idx) ); + + if( FD_UNLIKELY( !endpoint ) ) FD_LOG_ERR(( "Missing --endpoint" )); + ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); + if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" )); + + FD_LOG_NOTICE(( "Creating workspace with --page-cnt %lu --page-sz %s pages on --numa-idx %lu", page_cnt, _page_sz, numa_idx )); + fd_wksp_t * wksp = fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); + FD_TEST( wksp ); + + SSL_library_init(); + SSL_load_error_strings(); + + SSL_CTX * ssl_ctx = SSL_CTX_new( TLS_client_method() ); + if( FD_UNLIKELY( !ssl_ctx ) ) { + FD_LOG_ERR(( "SSL_CTX_new failed" )); + } + + if( FD_UNLIKELY( !SSL_CTX_set_mode( ssl_ctx, SSL_MODE_ENABLE_PARTIAL_WRITE|SSL_MODE_AUTO_RETRY ) ) ) { + FD_LOG_ERR(( "SSL_CTX_set_mode failed" )); + } + + if( FD_UNLIKELY( !SSL_CTX_set_min_proto_version( ssl_ctx, TLS1_3_VERSION ) ) ) { + FD_LOG_ERR(( "SSL_CTX_set_min_proto_version(ssl_ctx,TLS1_3_VERSION) failed" )); + } + + BIO * bio = BIO_new_ssl_connect( ssl_ctx ); + if( FD_UNLIKELY( !bio ) ) FD_LOG_ERR(( "BIO_new_ssl_connect failed" )); + + BIO_set_conn_hostname( bio, endpoint ); + BIO_set_nbio( bio, 1 ); + + SSL * ssl = NULL; + BIO_get_ssl( bio, &ssl ); + if( FD_UNLIKELY( !ssl ) ) FD_LOG_ERR(( "BIO_get_ssl failed" )); + + void * client_mem = fd_wksp_alloc_laddr( wksp, 
fd_bundle_client_align(), fd_bundle_client_footprint(), 1UL ); + if( FD_UNLIKELY( !client_mem ) ) FD_LOG_ERR(( "Failed to alloc bundle client" )); + static fd_bundle_client_metrics_t metrics[1]; + fd_bundle_client_t * client = fd_bundle_client_new( client_mem, ssl, metrics ); + + for(;;) + fd_bundle_client_rxtx( client ); + + fd_wksp_free_laddr( fd_bundle_client_delete( client ) ); + + BIO_free_all( bio ); + SSL_CTX_free( ssl_ctx ); + + fd_wksp_delete_anonymous( wksp ); + + fd_halt(); + return 0; +} diff --git a/src/disco/stem/fd_stem.c b/src/disco/stem/fd_stem.c index 65368dac41..312df2ca99 100644 --- a/src/disco/stem/fd_stem.c +++ b/src/disco/stem/fd_stem.c @@ -260,8 +260,6 @@ STEM_(run1)( ulong in_cnt, FD_SCRATCH_ALLOC_INIT( l, scratch ); in = (fd_stem_tile_in_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stem_tile_in_t), in_cnt*sizeof(fd_stem_tile_in_t) ); - ulong min_in_depth = (ulong)LONG_MAX; - if( FD_UNLIKELY( !!in_cnt && !in_mcache ) ) FD_LOG_ERR(( "NULL in_mcache" )); if( FD_UNLIKELY( !!in_cnt && !in_fseq ) ) FD_LOG_ERR(( "NULL in_fseq" )); if( FD_UNLIKELY( in_cnt > UINT_MAX ) ) FD_LOG_ERR(( "in_cnt too large" )); @@ -275,7 +273,7 @@ STEM_(run1)( ulong in_cnt, this_in->mcache = in_mcache[ in_idx ]; this_in->fseq = in_fseq [ in_idx ]; - ulong depth = fd_mcache_depth( this_in->mcache ); min_in_depth = fd_ulong_min( min_in_depth, depth ); + ulong depth = fd_mcache_depth( this_in->mcache ); if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", in_idx )); this_in->depth = (uint)depth; this_in->idx = (uint)in_idx; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 96831ca240..48508a7c23 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -432,6 +432,10 @@ typedef struct { char file_path[ PATH_MAX ]; } filerd; + struct { + ulong scratch_sz; + } snapin; + }; } fd_topo_tile_t; diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index b5c889eff3..f59888e954 100644 --- 
a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -376,6 +376,8 @@ fd_topob_auto_layout( fd_topo_t * topo, "rpcsrv", /* FIREDANCER only */ "batch", /* FIREDANCER only */ "pktgen", + "FileRd", + "SnapIn", }; char const * CRITICAL_TILES[] = { diff --git a/src/discof/restart/test/restart_fd.sh b/src/discof/restart/test/restart_fd.sh index 399c03c341..7ebf5b851c 100755 --- a/src/discof/restart/test/restart_fd.sh +++ b/src/discof/restart/test/restart_fd.sh @@ -47,12 +47,8 @@ echo " repair_serve_listen_port = 9056 [tiles.replay] snapshot = \"funk\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = \"$CLUSTER_VERSION\" tower_checkpt = \"$TOWER_CHECKPT_FILE\" - funk_file = \"$FUNK_FILE\" [tiles.restart] in_wen_restart = true wen_restart_coordinator = \"$RESTART_COORDINATOR\" @@ -74,6 +70,13 @@ echo " idx_max = 512 alloc_max = 10737418240 file = \"$BLOCK_FILE\" +[funk] + max_account_records = 10000000 + heap_size_gb = 32 + max_database_transactions = 1024 + [funk.filemap] + enabled = true + path = \"$FUNK_FILE\" " > wen_restart.toml sudo gdb --args build/native/gcc/bin/firedancer-dev dev --config wen_restart.toml diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index da3369e6a5..ab4a7d08c5 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -1 +1,3 @@ $(call add-objs,fd_filerd_tile,fd_discof) +$(call add-objs,fd_snapin_tile,fd_discof) +$(call add-objs,fd_actalc_tile,fd_discof) diff --git a/src/discof/restore/README.md b/src/discof/restore/README.md index db0be4decc..4eb5d87352 100644 --- a/src/discof/restore/README.md +++ b/src/discof/restore/README.md @@ -1,5 +1,51 @@ # Snapshot Restore +## Philosophy + +Firedancer is optimized to restore snapshots as fast as possible, i.e. +at I/O and memory bandwidth limits. + +Fast snapshot restore time is not only helpful for operators, but +crucial for fast recovery from failures, which may be widespread in the +worst case. 
+ +To meet these performance requirements, a multi-layer scaling approach +is used: + +- **SIMD:** Cryptographic computations (hashing) are accelerated via + AVX2 / AVX10 SIMD instructions +- **ILP:** Performance-critical logic is hand-optimized for good single-core + throughput on AMD Zen 2 (parallel random memory accesses via prefetching, + non-temporal memory copies, xxHash3 hashing) +- **Thread parallelism:** Certain algorithms are redesigned as massively + parallel batch computations (e.g. parallel hashmap insert via sample sort) +- **Pipelining:** Snapshot loading steps run concurrently / streaming if + possible. Each step is pinned to a core and independently scalable + for ideal throughput and efficient cache utilization. + +## Pipeline + +Phase 1: Ingest accounts into memory + +``` +FileRd -> UnZstd -> SnapIn -> FnkAlc -> FnkCpy +``` + +- FileRd: Reads a file +- UnZstd: Does Zstandard decompression +- SnapIn: Reads a snapshot +- FnkAlc: Allocates funk heap memory +- FnkCpy: Copies account data out to funk memory + +Phase 2: Index accounts + +``` +ActIdx -> ActDup +``` + +- ActIdx: Indexes accounts +- ActDup: Deletes duplicate accounts + ## Stream link conventions Various snapshot components use byte streams, not packet streams. @@ -8,7 +54,7 @@ These require custom conventions. **Stream fragment descriptors** -Byte streams use `fd_frag_stream_meta_t` (defined in `fd_restore_base.h`). +Byte streams use `fd_stream_frag_meta_t` (defined in `fd_restore_base.h`). These have the following changes: - `chunk` is replaced by `goff` and `loff`, which are 64-bit offsets @@ -17,7 +63,7 @@ These have the following changes: - `sig` is removed (cannot filter without looking at stream data) - `sz` is widened to 32 bits. -`**Dcache allocations** +**Dcache allocations** Payloads in stream dcaches are unaligned. Payloads are addressed with uncompressed byte offsets relative to the workspace start. @@ -32,4 +78,10 @@ Byte streams naturally require a reliable transport.
Consumers periodically publish their progress in `fseq`. - `fseq[0]` is the lowest sequence number not yet consumed (standard) - `fseq[1]` is the stream offset of the next byte not yet consumed -` \ No newline at end of file + +**Frames in streams** + +Tiles can reference stream data zero-copy style. For example, the +`SnapIn` tile publishes fragments describing the accounts it parsed out +of a snapshot stream, where each fragment refers to a byte range in the +stream dcache. diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c new file mode 100644 index 0000000000..d6e8ee0cc7 --- /dev/null +++ b/src/discof/restore/fd_actalc_tile.c @@ -0,0 +1,472 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include "../../flamenco/types/fd_types.h" + +#define LINK_IN_MAX 2UL +#define BURST 1UL + +struct fd_actalc_tile { + fd_solana_account_stored_meta_t acc_meta; + + /* Account output */ + + fd_stream_frag_meta_t * out_mcache; + + ulong out_seq_max; + ulong out_seq; + ulong out_cnt; + ulong out_depth; +}; + +typedef struct fd_actalc_tile fd_actalc_tile_t; + +struct fd_actalc_in { + fd_stream_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + ulong goff; + fd_stream_frag_meta_t const * mline; + ulong volatile * restrict fseq; + uint accum[6]; +}; + +typedef struct fd_actalc_in fd_actalc_in_t; + +static ulong +scratch_align( void ) { + return alignof(fd_actalc_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_actalc_tile_t); +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `ActAlc` tile" )); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `ActAlc` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `ActAlc` has %lu outs,
expected 1", tile->out_cnt )); + /* FIXME check link names */ + + fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + + /* Join account output */ + + ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); + ctx->out_seq_max = 0UL; + ctx->out_seq = 0UL; + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); +} + +static void +during_housekeeping( fd_actalc_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_actalc_tile_t * ctx ) { + (void)ctx; +} + +static int +on_stream_frag( fd_actalc_tile_t * ctx, + fd_actalc_in_t * in, + fd_stream_frag_meta_t const * frag, + ulong * read_sz ) { + (void)ctx; (void)in; (void)frag; (void)read_sz; + FD_LOG_NOTICE(( "frag" )); + return 1; +} + +/* fd_actalc_in_update gets called periodically to publish in->seq and in->goff to fseq[0] and fseq[1] + (flow control credits for the stream producer). Also adds in->accum[0..5] to the link in metrics and zeroes accum. */ + +static void +fd_actalc_in_update( fd_actalc_in_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + FD_VOLATILE( in->fseq[1] ) = in->goff; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noinline)) static void +fd_actalc_run1( + fd_actalc_tile_t * ctx, + ulong in_cnt, + fd_actalc_in_t * in, /* [in_cnt] */ + ulong out_cnt, + fd_frag_meta_t ** out_mcache, /* [out_cnt] */ + ulong * out_depth, /* [out_cnt] */ + ulong * out_seq, /* [out_cnt] */ + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong * cons_out, /* [cons_cnt] */ + ulong ** cons_fseq, /*
[cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* out flow control state */ + ulong cr_avail; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + /* out frag stream init */ + + cr_avail = 0UL; + + ulong const burst = BURST; + + ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); + + for( ulong out_idx=0UL; out_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. 
*/ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_actalc_in_update( &in[ in_idx ] ); + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Send synchronization info */ + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[1] ) = ctx->out_cnt; + FD_COMPILER_MFENCE(); + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + if( FD_LIKELY( cr_availout_seq_max = ctx->out_seq + cr_avail; + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + FD_COMPILER_MFENCE(); + } + } + + during_housekeeping( ctx ); + + } + + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. */ + + event_seq++; + if( FD_UNLIKELY( event_seq>=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_actalc_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. 
*/ + + if( FD_UNLIKELY( cr_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_stream_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + fd_stream_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + ulong sz = 0U; + int consumed_frag = on_stream_frag( ctx, this_in, &meta, &sz ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz; + + if( FD_LIKELY( consumed_frag ) ) { + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu", in_seq )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + + } + + metric_regime_ticks[1] += housekeeping_ticks; + 
metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +static void +fd_actalc_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_frag_meta_t * out_mcache[ tile->out_cnt ]; + ulong out_depth [ tile->out_cnt ]; + ulong out_seq [ tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; + FD_TEST( out_mcache[ i ] ); + out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); + out_seq [ i ] = 0UL; + } + + ulong reliable_cons_cnt = 0UL; + ulong cons_out[ FD_TOPO_MAX_LINKS ]; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_out[ reliable_cons_cnt ] = k; + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->goff = 0UL; + this_in->mline = 
this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+reliable_cons_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_actalc_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc = { + .name = "ActAlc", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_actalc_run, +}; +#endif diff --git a/src/discof/restore/fd_actcpy_tile.c b/src/discof/restore/fd_actcpy_tile.c new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c index 7c2574a193..8df32496bb 100644 --- a/src/discof/restore/fd_filerd_tile.c +++ b/src/discof/restore/fd_filerd_tile.c @@ -3,8 +3,8 @@ #include "../../disco/metrics/fd_metrics.h" #include #include +#include #include -#include #define NAME "FileRd" @@ -12,9 +12,13 @@ struct fd_filerd_tile { int fd; uchar * buf; /* dcache */ + ulong buf_base; ulong buf_off; ulong buf_sz; ulong goff; + ulong read_max; + + ulong * out_sync; /* mcache seq sync */ }; typedef struct fd_filerd_tile fd_filerd_tile_t; @@ -36,9 +40,7 @@ privileged_init( fd_topo_t * topo, fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); fd_memset( ctx, 0, sizeof(fd_filerd_tile_t) ); - if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); - if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", 
tile->out_cnt )); - + if( FD_UNLIKELY( !tile->filerd.file_path[0] ) ) FD_LOG_ERR(( "File path not set" )); ctx->fd = open( tile->filerd.file_path, O_RDONLY|O_CLOEXEC ); if( FD_UNLIKELY( ctx->fd<0 ) ) FD_LOG_ERR(( "open() failed (%i-%s)", errno, fd_io_strerror( errno ) )); } @@ -48,13 +50,19 @@ unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] ].dcache_obj_id ) ); FD_TEST( out_dcache ); - ctx->buf = out_dcache; - ctx->buf_off = 0UL; - ctx->buf_sz = fd_dcache_data_sz( out_dcache ); - ctx->goff = 0UL; + ctx->buf = out_dcache; + ctx->buf_base = (ulong)out_dcache - (ulong)fd_wksp_containing( out_dcache ); + ctx->buf_off = 0UL; + ctx->buf_sz = fd_dcache_data_sz( out_dcache ); + ctx->goff = 0UL; + ctx->read_max = (8UL<<20); + ctx->out_sync = fd_mcache_seq_laddr( topo->links[ tile->out_link_id[ 0 ] ].mcache ); } static void @@ -67,18 +75,24 @@ metrics_write( fd_filerd_tile_t * ctx ) { (void)ctx; } -static void -close_file( fd_filerd_tile_t * ctx ) { - if( FD_UNLIKELY( ctx->fd<0 ) ) return; +__attribute__((noreturn)) FD_FN_UNUSED static void +fd_filerd_shutdown( fd_filerd_tile_t * ctx, + ulong seq_final ) { if( FD_UNLIKELY( close( ctx->fd ) ) ) { FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); } ctx->fd = -1; + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + FD_VOLATILE( ctx->out_sync[ 3 ] ) = seq_final; + FD_COMPILER_MFENCE(); + FD_LOG_INFO(( "Reached end of file" )); + + for(;;) pause(); } static void after_credit( fd_filerd_tile_t * ctx, - fd_frag_stream_meta_t * out_mcache, + fd_stream_frag_meta_t * out_mcache, ulong const out_depth, ulong * 
restrict out_seq, ulong * restrict cr_frag_avail, @@ -93,18 +107,13 @@ after_credit( fd_filerd_tile_t * ctx, FD_LOG_CRIT(( "Buffer overflow (buf_off=%lu buf_sz=%lu)", ctx->buf_off, ctx->buf_sz )); } - ulong const iov0_sz = fd_ulong_min( *cr_byte_avail, ctx->buf_sz - ctx->buf_off ); - struct iovec iov[2]; - iov[ 0 ].iov_base = ctx->buf + ctx->buf_off; - iov[ 0 ].iov_len = iov0_sz; - iov[ 1 ].iov_base = ctx->buf; - iov[ 1 ].iov_len = fd_ulong_min( (ulong)fd_long_max( 0L, (long)*cr_byte_avail-(long)iov0_sz ), ctx->buf_off ); + ulong const read_max = fd_ulong_min( *cr_byte_avail, ctx->read_max ); + ulong const read_sz = fd_ulong_min( read_max, ctx->buf_sz - ctx->buf_off ); - long res = readv( fd, iov, 2 ); + long res = read( fd, ctx->buf + ctx->buf_off, read_sz ); if( FD_UNLIKELY( res<=0L ) ) { if( FD_UNLIKELY( res==0 ) ) { - FD_LOG_INFO(( "Reached end of file" )); - close_file( ctx ); + fd_filerd_shutdown( ctx, out_seq[0] ); return; } if( FD_LIKELY( errno==EAGAIN ) ) return; @@ -116,23 +125,15 @@ after_credit( fd_filerd_tile_t * ctx, cr_byte_avail[0] -= sz; *charge_busy_after = 1; - ulong frag0_sz = fd_ulong_min( iov0_sz, sz ); - ulong frag1_sz = (ulong)res - frag0_sz; + ulong frag_sz = fd_ulong_min( read_sz, sz ); - fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, ctx->buf_off, frag0_sz ); + ulong loff = ctx->buf_base + ctx->buf_off; + fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, loff, frag_sz, 0 ); out_seq[0] = fd_seq_inc( out_seq[0], 1UL ); cr_frag_avail[0]--; - ctx->goff += frag0_sz; - ctx->buf_off += frag0_sz; + ctx->goff += frag_sz; + ctx->buf_off += frag_sz; if( ctx->buf_off >= ctx->buf_sz ) ctx->buf_off = 0UL; /* cmov */ - - if( FD_UNLIKELY( frag1_sz ) ) { - fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, 0UL, frag1_sz ); - out_seq[0] = fd_seq_inc( out_seq[0], 1UL ); - cr_frag_avail[0]--; - ctx->goff += frag1_sz; - ctx->buf_off += frag1_sz; - } } /* run/run1 are a custom run loop based 
on fd_stem.c. */ @@ -140,7 +141,7 @@ after_credit( fd_filerd_tile_t * ctx, __attribute__((noinline)) static void fd_filerd_run1( fd_filerd_tile_t * ctx, - fd_frag_stream_meta_t * out_mcache, + fd_stream_frag_meta_t * out_mcache, void * out_dcache, ulong cons_cnt, ushort * restrict event_map, /* cnt=1+cons_cnt */ @@ -194,7 +195,8 @@ fd_filerd_run1( /* housekeeping init */ - if( lazy<=0L ) lazy = fd_tempo_lazy_default( out_depth ); + //if( lazy<=0L ) lazy = fd_tempo_lazy_default( out_depth ); + lazy = 1e3L; FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); /* Initial event sequence */ @@ -284,7 +286,7 @@ fd_filerd_run1( if( FD_UNLIKELY( cr_byte_availlinks[ tile->out_link_id[ 0 ] ].mcache ); + fd_stream_frag_meta_t * out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); FD_TEST( out_mcache ); ulong reliable_cons_cnt = 0UL; @@ -332,7 +335,7 @@ fd_filerd_run( fd_topo_t * topo, ushort event_map[ 1+reliable_cons_cnt ]; ulong volatile * cons_slow[ reliable_cons_cnt ]; ulong cons_seq [ 2*reliable_cons_cnt ]; - fd_filerd_run1( ctx, out_mcache, ctx->buf, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, 0L, rng ); + fd_filerd_run1( ctx, out_mcache, ctx->buf, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); } fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd = { diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index de6b489842..fa4f6c13a1 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -1,17 +1,21 @@ +#ifndef HEADER_fd_src_discof_restore_fd_restore_base_h +#define HEADER_fd_src_discof_restore_fd_restore_base_h + #include "../../tango/mcache/fd_mcache.h" -/* fd_frag_stream_meta_t is a variation of fd_frag_meta_t optimized for +/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for stream I/O. 
*/ -union fd_frag_stream_meta { +union fd_stream_frag_meta { struct { ulong seq; /* frag sequence number */ - ulong goff; /* global offset */ + ulong goff; /* stream offset */ uint sz; - uint unused; + ushort unused; + ushort ctl; ulong loff; /* dcache offset */ }; @@ -20,43 +24,58 @@ union fd_frag_stream_meta { }; -typedef union fd_frag_stream_meta fd_frag_stream_meta_t; - -FD_PROTOTYPES_BEGIN - -#if FD_HAS_SSE +typedef union fd_stream_frag_meta fd_stream_frag_meta_t; -FD_FN_CONST static inline __m128i -fd_frag_stream_meta_sse0( ulong seq, - ulong goff ) { - return _mm_set_epi64x( (long)goff, (long)seq ); -} - -FD_FN_CONST static inline __m128i -fd_frag_stream_meta_sse1( ulong sz, /* Assumed 32-bit */ - ulong loff ) { - return _mm_set_epi64x( (long)loff, (long)(sz) ); -} +FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); -#endif /* FD_HAS_SSE */ +FD_PROTOTYPES_BEGIN static inline void -fd_mcache_publish_stream( fd_frag_stream_meta_t * mcache, +fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, ulong depth, ulong seq, ulong goff, ulong loff, - ulong sz ) { - fd_frag_stream_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + ulong sz, + ulong ctl ) { + fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); FD_COMPILER_MFENCE(); meta->seq = fd_seq_dec( seq, 1UL ); FD_COMPILER_MFENCE(); meta->goff = goff; - meta->loff = loff; meta->sz = (uint)sz; + meta->ctl = (ushort)ctl; + meta->loff = loff; FD_COMPILER_MFENCE(); meta->seq = seq; FD_COMPILER_MFENCE(); } FD_PROTOTYPES_END + +/* fd_account_frag_meta_t is a variation of fd_frag_meta_t optimized for + accounts. 
*/ + +union fd_account_frag_meta { + + struct { + + ulong seq; + ulong rec_hash; + + ulong frag_seq; + ulong rec_goff; + + }; + + fd_frag_meta_t f[1]; + +}; + +typedef union fd_account_frag_meta fd_account_frag_meta_t; + +FD_STATIC_ASSERT( alignof(fd_account_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_account_frag_meta_t)==32, abi ); + +#endif /* HEADER_fd_src_discof_restore_fd_restore_base_h */ diff --git a/src/discof/restore/fd_restore_manifest.c b/src/discof/restore/fd_restore_manifest.c new file mode 100644 index 0000000000..28d916cd33 --- /dev/null +++ b/src/discof/restore/fd_restore_manifest.c @@ -0,0 +1,261 @@ +/* fd_restore_manifest.c implements streaming decode of a Solana + snapshot "manifest" file. The "manifest" is an abomination of + variable-length bincode structures. When deserializing everything + upfront, the scratch memory use is hard to control (potentially + unbounded). */ + +#include "../../flamenco/types/fd_types.h" + +/* Decode steps */ + +#define FD_MANIFEST_PT_1_1 0x01 /* bank / blockhash queue */ +#define FD_MANIFEST_PT_1_2 0x02 /* bank / blockhash queue / last hash */ +#define FD_MANIFEST_PT_1_3 0x03 /* bank / blockhash queue */ +#define FD_MANIFEST_PT_3 0x05 /* bank */ +#define FD_MANIFEST_PT_4 0x06 /* ancestors list */ +#define FD_MANIFEST_PT_5 0x07 /* bank */ +#define FD_MANIFEST_PT_6 0x08 /* hard forks list */ +#define FD_MANIFEST_PT_7 0x09 /* bank */ +#define FD_MANIFEST_PT_8 0x0a /* "hashes per tick" */ +#define FD_MANIFEST_PT_9 0x0b /* bank */ +#define FD_MANIFEST_PT_10_1 0x0c /* vote account header */ +#define FD_MANIFEST_PT_10_2 0x0d /* vote account data */ +#define FD_MANIFEST_PT_10_3 0x0e /* vote account trailer */ +#define FD_MANIFEST_PT_11 0x0f /* bank / stakes */ +#define FD_MANIFEST_PT_12 0x10 /* stake delegations */ +#define FD_MANIFEST_PT_13 0x11 /* bank / stakes */ +#define FD_MANIFEST_PT_14 0x12 /* stake history */ +#define FD_MANIFEST_PT_15 0x13 /* bank */ +#define FD_MANIFEST_PT_16 0x14 /* unused account */ 
+#define FD_MANIFEST_PT_17 0x15 /* bank */ +#define FD_MANIFEST_PT_18_1 0x16 /* epoch stakes */ +#define FD_MANIFEST_PT_18_1_1 0x17 /* vote account header */ +#define FD_MANIFEST_PT_18_1_2 0x18 /* vote account data */ +#define FD_MANIFEST_PT_18_1_3 0x19 /* vote account trailer */ +#define FD_MANIFEST_PT_18_2 0x1a /* epoch stakes / stakes */ +#define FD_MANIFEST_PT_18_3 0x1b /* epoch stakes / stake delegations */ +#define FD_MANIFEST_PT_18_4 0x1c /* epoch stakes / stakes */ +#define FD_MANIFEST_PT_18_5 0x1d /* epoch stakes / stake history */ +#define FD_MANIFEST_PT_18_6 0x1e /* epoch stakes */ +#define FD_MANIFEST_PT_18_7_1 0x1f /* epoch stakes / node id mapping */ +#define FD_MANIFEST_PT_18_7_2 0x20 /* epoch stakes / node id mapping */ +#define FD_MANIFEST_PT_18_7_3 0x21 /* epoch stakes / node id mapping */ +#define FD_MANIFEST_PT_18_8 0x22 /* epoch stakes */ +#define FD_MANIFEST_PT_18_9 0x23 /* epoch stakes */ +#define FD_MANIFEST_PT_19 0x24 /* bank, db */ +#define FD_MANIFEST_PT_20 0x25 /* db / storages */ + +/* Data structures */ + +struct __attribute__((packed)) fd_manifest_pt1_1 { + ulong bhq_last_hash_index; + uchar bhq_last_hash_present; +}; +typedef struct fd_manifest_pt1 fd_manifest_pt1_t; + +struct __attribute__((packed)) fd_manifest_pt1_3 { + ulong bhq_last_hash_index; + uchar bhq_last_hash_present; +}; +typedef struct fd_manifest_pt1 fd_manifest_pt1_t; + +struct __attribute__((packed)) fd_manifest_pt1 { + ulong bhq_last_hash_index; + uchar bhq_last_hash_present; +}; +typedef struct fd_manifest_pt1 fd_manifest_pt1_t; + +struct __attribute__((packed)) fd_manifest_pt3 { + ulong bhq_max_age; + ulong ancestors_len; +}; +typedef struct fd_manifest_pt3 fd_manifest_pt3_t; + +struct __attribute__((packed)) fd_manifest_pt5 { + fd_hash_t hash; + fd_hash_t parent_hash; + ulong parent_slot; + ulong hard_forks_len; +}; +typedef struct fd_manifest_pt5 fd_manifest_pt5_t; + +struct __attribute__((packed)) fd_manifest_pt7 { + ulong transaction_count; + ulong tick_height; 
+ ulong signature_count; + ulong capitalization; + ulong max_tick_height; + uchar hashes_per_tick_present; +}; +typedef struct fd_manifest_pt7 fd_manifest_pt7_t; + +struct __attribute__((packed)) fd_manifest_pt9 { + ulong ticks_per_slot; + ulong ns_per_slot_lo; + ulong ns_per_slot_hi; + ulong genesis_creation_time; + double slots_per_year; + ulong accounts_data_len; + + ulong slot; + ulong epoch; + ulong block_height; + + fd_pubkey_t collector_id; + ulong collector_fees; + fd_fee_calculator_t fee_calculator; + fd_fee_rate_governor_t fee_rate_governor; + ulong collected_rent; + fd_rent_collector_t rent_collector; + fd_inflation_t inflation; + + ulong vote_accounts_len; +}; +typedef struct fd_manifest_pt9 fd_manifest_pt9_t; + +struct __attribute__((packed)) fd_manifest_pt10_1 { + fd_pubkey_t key; + ulong stake; + ulong lamports; + ulong data_len; +}; +typedef struct fd_manifest_pt10_1 fd_manifest_pt10_1_t; + +struct __attribute__((packed)) fd_manifest_pt10_3 { + uchar executable; + ulong rent_epoch; +}; +typedef struct fd_manifest_pt10_3 fd_manifest_pt10_3_t; + +struct __attribute__((packed)) fd_manifest_pt13 { + ulong unused; + ulong epoch; + ulong stake_history_len; +}; +typedef struct fd_manifest_pt13 fd_manifest_pt13_t; + +struct __attribute__((packed)) fd_manifest_pt15 { + fd_pubkey_t unused1; + fd_pubkey_t unused2; + ulong unused3_len; +}; +typedef struct fd_manifest_pt15 fd_manifest_pt15_t; + +struct __attribute__((packed)) fd_manifest_pt18_1 { + ulong key; + ulong vote_accounts_len; +}; +typedef struct fd_manifest_pt18_1 fd_manifest_pt18_1_t; + +struct __attribute__((packed)) fd_manifest_pt18_6 { + ulong total_stake; + ulong node_id_mapping_len; +}; +typedef struct fd_manifest_pt18_6 fd_manifest_pt18_6_t; + +struct __attribute__((packed)) fd_manifest_pt18_7_1 { + fd_pubkey_t pubkey; + ulong vote_accounts_len; +}; +typedef struct fd_manifest_pt18_7_1 fd_manifest_pt18_7_1_t; + +struct __attribute__((packed)) fd_manifest_pt19 { + uchar is_delta; + ulong 
storages_len; +}; +typedef struct fd_manifest_pt19 fd_manifest_pt19_t; + +struct __attribute__((packed)) fd_manifest_pt20 { + +}; +typedef struct fd_manifest_pt20 fd_manifest_pt20_t; + +struct fd_restore_manifest_ctx { + uint state; + uchar * buf; + ulong buf_sz; + ulong buf_max; + + ulong statev[3]; +}; + +typedef struct fd_restore_manifest_ctx fd_restore_manifest_ctx_t; + +static void const * +buf_frag( fd_restore_manifest_ctx_t * ctx, + void const * frag, + ulong * p_frag_sz, + ulong want_sz ) { + FD_TEST( want_sz<=ctx->buf_max ); + ulong frag_sz = *p_frag_sz; + if( FD_UNLIKELY( frag_szbuf_sz < want_sz ); + ulong rem_sz = want_sz - ctx->buf_sz; + frag_sz = fd_ulong_min( frag_sz, rem_sz ); + fd_memcpy( ctx->buf + ctx->buf_sz, frag, frag_sz ); + ctx->buf_sz += frag_sz; + (*p_frag_sz) -= frag_sz; + if( ctx->buf_sz == want_sz ) { + ctx->buf_sz = 0UL; + return ctx->buf; + } + return NULL; + } + (*p_frag_sz) -= want_sz; + return frag; +} + +static ushort const +manifest_node_len[] = { + [ FD_MANIFEST_PT_1_1 ] = sizeof(fd_manifest_pt1_t), + [ FD_MANIFEST_PT_1_2 ] = sizeof(fd_pubkey_t), + [ FD_MANIFEST_PT_1_3 ] = FD_HASH_HASH_AGE_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_3 ] = sizeof(fd_manifest_pt3_t), + [ FD_MANIFEST_PT_4 ] = FD_SLOT_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_5 ] = sizeof(fd_manifest_pt5_t), + [ FD_MANIFEST_PT_6 ] = FD_SLOT_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_7 ] = sizeof(fd_manifest_pt7_t), + [ FD_MANIFEST_PT_8 ] = sizeof(uchar), + [ FD_MANIFEST_PT_9 ] = sizeof(fd_manifest_pt9_t), + [ FD_MANIFEST_PT_10_1 ] = sizeof(fd_manifest_pt10_1_t), + [ FD_MANIFEST_PT_10_2 ] = 0, + [ FD_MANIFEST_PT_10_3 ] = sizeof(fd_manifest_pt10_3_t), + [ FD_MANIFEST_PT_11 ] = sizeof(ulong), + [ FD_MANIFEST_PT_12 ] = FD_DELEGATION_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_13 ] = sizeof(fd_manifest_pt13_t), + [ FD_MANIFEST_PT_14 ] = FD_STAKE_HISTORY_ENTRY_FOOTPRINT, + [ FD_MANIFEST_PT_15 ] = sizeof(fd_manifest_pt15_t), + [ FD_MANIFEST_PT_16 ] = FD_PUBKEY_U64_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_17 ] = 
sizeof(ulong), + [ FD_MANIFEST_PT_18_1 ] = sizeof(fd_manifest_pt18_1_t), + [ FD_MANIFEST_PT_18_1_1 ] = sizeof(fd_manifest_pt10_1_t), + [ FD_MANIFEST_PT_18_1_2 ] = 0, + [ FD_MANIFEST_PT_18_1_3 ] = sizeof(fd_manifest_pt10_3_t), + [ FD_MANIFEST_PT_18_2 ] = sizeof(ulong), + [ FD_MANIFEST_PT_18_3 ] = FD_DELEGATION_PAIR_FOOTPRINT, + [ FD_MANIFEST_PT_18_4 ] = sizeof(fd_manifest_pt13_t), + [ FD_MANIFEST_PT_18_5 ] = FD_STAKE_HISTORY_ENTRY_FOOTPRINT, + [ FD_MANIFEST_PT_18_6 ] = sizeof(fd_manifest_pt18_6_t), + [ FD_MANIFEST_PT_18_7_1 ] = sizeof(fd_manifest_pt18_7_1_t), + [ FD_MANIFEST_PT_18_7_2 ] = sizeof(fd_pubkey_t), + [ FD_MANIFEST_PT_18_7_3 ] = sizeof(ulong), + [ FD_MANIFEST_PT_18_8 ] = sizeof(ulong), + [ FD_MANIFEST_PT_18_9 ] = FD_PUBKEY_PUBKEY_PAIR_FOOTPRINT, +}; + +ulong +fd_restore_manifest_frag( fd_restore_manifest_ctx_t * ctx, + void const * restrict frag, + ulong frag_sz ) { + switch( ctx->state ) { + case FD_MANIFEST_PT_1: { + fd_manifest_pt1_t const * pt1 = buf_frag( ctx, frag, &frag_sz, sizeof(fd_manifest_pt1_t) ); + (void)pt1; + ctx->state = FD_MANIFEST_PT_2; + ctx->statev[0] = pt1->bhq_last_hash_present; + return frag_sz; + } + case FD_MANIFEST_PT_2: + break; + } +} diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c new file mode 100644 index 0000000000..e154756411 --- /dev/null +++ b/src/discof/restore/fd_snapin_tile.c @@ -0,0 +1,1188 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../disco/metrics/fd_metrics.h" +#include "../../util/archive/fd_tar.h" +#include "../../flamenco/types/fd_types.h" +#include +#include +#include +#include + +#define LINK_IN_MAX 1UL +#define BURST 16UL + +#define SNAP_STATE_IGNORE ((uchar)0) /* ignore file content */ +#define SNAP_STATE_TAR ((uchar)1) /* reading tar header (buffered) */ +#define SNAP_STATE_MANIFEST ((uchar)2) /* reading manifest (buffered) */ +#define SNAP_STATE_ACCOUNT_HDR ((uchar)3) /* reading account hdr (buffered) */ +#define 
SNAP_STATE_ACCOUNT_DATA ((uchar)4) /* reading account data (zero copy) */ +#define SNAP_STATE_DONE ((uchar)5) /* expect no more data */ + +struct fd_snapshot_accv_key { + ulong slot; + ulong id; +}; + +typedef struct fd_snapshot_accv_key fd_snapshot_accv_key_t; + +static const fd_snapshot_accv_key_t +fd_snapshot_accv_key_null = { 0UL, 0UL }; + +FD_FN_PURE static inline ulong +fd_snapshot_accv_key_hash( fd_snapshot_accv_key_t key ) { + return fd_hash( 0x39c49607bf16463aUL, &key, sizeof(fd_snapshot_accv_key_t) ); +} + +struct fd_snapshot_accv_map { + fd_snapshot_accv_key_t key; + ulong sz; + ulong hash; /* use uint or ulong hash? */ +}; + +typedef struct fd_snapshot_accv_map fd_snapshot_accv_map_t; + +#define MAP_NAME fd_snapshot_accv_map +#define MAP_T fd_snapshot_accv_map_t +#define MAP_LG_SLOT_CNT 23 /* 8.39 million */ +#define MAP_KEY_T fd_snapshot_accv_key_t +#define MAP_KEY_NULL fd_snapshot_accv_key_null +#define MAP_KEY_INVAL(k) ( ((k).slot==0UL) & ((k).id==0UL) ) +#define MAP_KEY_EQUAL(k0,k1) ( ((k0).slot==(k1).slot) & ((k0).id==(k1).id) ) +#define MAP_KEY_EQUAL_IS_SLOW 0 +#define MAP_HASH_T ulong +#define MAP_KEY_HASH(k0) fd_snapshot_accv_key_hash(k0) +#include "../../util/tmpl/fd_map.c" + +#define SNAP_FLAG_FAILED 1 +#define SNAP_FLAG_BLOCKED 2 + +struct fd_snapin_tile { + uchar state; + uchar flags; + uchar manifest_done; + + /* Stream input */ + + uchar const * in_base; + ulong goff_translate; + ulong loff_translate; + ulong in_skip; + + /* Frame buffer */ + + uchar * buf; + ulong buf_ctr; /* number of bytes allocated in buffer */ + ulong buf_sz; /* target buffer size (buf_ctrbuf_ctr = 0UL; + self->buf_sz = 0UL; +} + +static void * +fd_snapshot_restore_prepare_buf( fd_snapin_tile_t * self, + ulong sz ) { + self->buf_ctr = 0UL; + self->buf_sz = 0UL; + + fd_snapshot_restore_discard_buf( self ); + if( FD_UNLIKELY( sz > self->buf_max ) ) { + FD_LOG_WARNING(( "Alloc failed (need %lu bytes, have %lu)", sz, self->buf_max )); + self->state = SNAP_FLAG_FAILED; + 
return NULL; + } + + return self->buf; +} + +static int +fd_snapshot_expect_account_hdr( fd_snapin_tile_t * restore ) { + + ulong accv_sz = restore->accv_sz; + if( accv_sz < sizeof(fd_solana_account_hdr_t) ) { + if( FD_LIKELY( accv_sz==0UL ) ) { + restore->state = SNAP_STATE_ACCOUNT_HDR; + return 0; + } + FD_LOG_WARNING(( "encountered unexpected EOF while reading account header" )); + restore->flags |= SNAP_FLAG_FAILED; + return EINVAL; + } + + restore->state = SNAP_STATE_ACCOUNT_HDR; + restore->buf_ctr = 0UL; + restore->buf_sz = sizeof(fd_solana_account_hdr_t); + + return 0; +} + +static int +fd_snapshot_restore_accv_prepare( fd_snapin_tile_t * const restore, + fd_tar_meta_t const * const meta, + ulong const real_sz ) { + + if( FD_UNLIKELY( !fd_snapshot_restore_prepare_buf( restore, sizeof(fd_solana_account_hdr_t) ) ) ) { + FD_LOG_WARNING(( "Failed to allocate read buffer while restoring accounts from snapshot" )); + return ENOMEM; + } + + /* Parse file name */ + ulong id, slot; + if( FD_UNLIKELY( sscanf( meta->name, "accounts/%lu.%lu", &slot, &id )!=2 ) ) { + /* Ignore entire file if file name invalid */ + restore->state = SNAP_STATE_IGNORE; + return 0; + } + + /* Lookup account vec file size */ + fd_snapshot_accv_key_t key = { .slot = slot, .id = id }; + fd_snapshot_accv_map_t * rec = fd_snapshot_accv_map_query( restore->accv_map, key, NULL ); + if( FD_UNLIKELY( !rec ) ) { + /* Ignore account vec files that are not explicitly mentioned in the + manifest. 
*/ + FD_LOG_DEBUG(( "Ignoring %s (sz %lu)", meta->name, real_sz )); + restore->state = SNAP_STATE_IGNORE; + return 0; + } + ulong sz = rec->sz; + + /* Validate the supposed file size against real size */ + if( FD_UNLIKELY( sz > real_sz ) ) { + FD_LOG_WARNING(( "AppendVec %lu.%lu is %lu bytes long according to manifest, but actually only %lu bytes", + slot, id, sz, real_sz )); + restore->flags |= SNAP_FLAG_FAILED; + return EINVAL; + } + restore->accv_sz = sz; + restore->accv_slot = slot; + restore->accv_id = id; + + /* Prepare read of account header */ + FD_LOG_DEBUG(( "Loading account vec %s", meta->name )); + return fd_snapshot_expect_account_hdr( restore ); +} + + +/* fd_snapshot_restore_manifest_prepare prepares for consumption of the + snapshot manifest. */ + +static int +fd_snapshot_restore_manifest_prepare( fd_snapin_tile_t * restore, + ulong sz ) { + /* Only read once */ + if( restore->manifest_done ) { + FD_LOG_WARNING(( "Snapshot file contains multiple manifests" )); + restore->state = SNAP_STATE_IGNORE; + return 0; + } + + /* We don't support streaming manifest deserialization yet. Thus, + buffer the whole manifest in one place. */ + if( FD_UNLIKELY( !fd_snapshot_restore_prepare_buf( restore, sz ) ) ) { + restore->flags |= SNAP_FLAG_FAILED; + return ENOMEM; + } + + restore->state = SNAP_STATE_MANIFEST; + restore->buf_sz = sz; + + return 0; +} + +static void +restore_file( void * restore_, + fd_tar_meta_t const * meta, + ulong sz ) { + fd_snapin_tile_t * restore = restore_; + + restore->buf_ctr = 0UL; /* reset buffer */ + restore->state = SNAP_STATE_IGNORE; + + if( (sz==0UL) | (!fd_tar_meta_is_reg( meta )) ) return; + + /* Detect account vec files. These are files that contain a vector + of accounts in Solana Labs "AppendVec" format. 
*/ + assert( sizeof("accounts/")name, "accounts/", sizeof("accounts/")-1) ) { + if( FD_UNLIKELY( !restore->manifest_done ) ) { + FD_LOG_WARNING(( "Unsupported snapshot: encountered AppendVec before manifest" )); + restore->flags |= SNAP_FLAG_FAILED; + return; + } + fd_snapshot_restore_accv_prepare( restore, meta, sz ); + } else if( fd_memeq( meta->name, "snapshots/status_cache", sizeof("snapshots/status_cache") ) ) { + /* TODO */ + } else if(0==strncmp( meta->name, "snapshots/", sizeof("snapshots/")-1 ) ) { + fd_snapshot_restore_manifest_prepare( restore, sz ); + } + +} + +static uchar const * +snapshot_read_buffered( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + /* Should not be called if read is complete */ + FD_TEST( restore->buf_ctr < restore->buf_sz ); + + /* Determine number of bytes to buffer */ + ulong sz = restore->buf_sz - restore->buf_ctr; + if( sz>bufsz ) sz = bufsz; + + /* Append to buffer */ + fd_memcpy( restore->buf + restore->buf_ctr, buf, sz ); + restore->buf_ctr += sz; + + return buf+sz; +} + +FD_FN_PURE static inline int +snapshot_read_is_complete( fd_snapin_tile_t const * restore ) { + return restore->buf_ctr == restore->buf_sz; +} + +static int +snapshot_restore_account_hdr( fd_snapin_tile_t * restore ) { + fd_solana_account_hdr_t const * hdr = fd_type_pun_const( restore->buf ); + + ulong data_sz = hdr->meta.data_len; + restore->acc_sz = data_sz; + restore->acc_rem = data_sz; + restore->acc_pad = fd_ulong_align_up( data_sz, 8UL ) - data_sz; + + if( FD_UNLIKELY( data_sz>(10UL<<20) ) ) { + FD_LOG_ERR(( "Oversize account found (%lu bytes)", data_sz )); + } + + /* Next step */ + if( data_sz == 0UL ) { + return fd_snapshot_expect_account_hdr( restore ); + } + + restore->state = SNAP_STATE_ACCOUNT_DATA; + restore->buf_ctr = 0UL; + restore->buf_sz = 0UL; + return 0; +} + +static uchar const * +snapshot_read_account_hdr_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + if( !restore->accv_sz ) { + /* 
Reached end of AppendVec */ + restore->state = SNAP_STATE_IGNORE; + restore->buf_ctr = restore->buf_sz = 0UL; + return buf; + } + bufsz = fd_ulong_min( bufsz, restore->accv_sz ); + + int som = restore->buf_ctr == 0UL; + + ulong frag_goff = (ulong)buf - restore->goff_translate; + ulong frag_loff = (ulong)buf - restore->loff_translate; + + uchar const * buf_next = snapshot_read_buffered( restore, buf, bufsz ); + ulong hdr_read = (ulong)(buf_next-buf); + restore->accv_sz -= hdr_read; + bufsz -= hdr_read; + + ulong peek_sz = 0UL; + if( FD_LIKELY( snapshot_read_is_complete( restore ) ) ) { + if( FD_UNLIKELY( 0!=snapshot_restore_account_hdr( restore ) ) ) { + return buf; /* parse error */ + } + peek_sz = fd_ulong_min( restore->acc_rem, bufsz ); + } + + int eom = bufsz > restore->acc_rem; + + /* Publish header-only fragment or header+data fragment. + If data was included, skip ahead. (Combining header+data into the + same fragment reduces the amount of descriptor frags published.) */ + + fd_mcache_publish_stream( + restore->out_mcache, + restore->out_depth, + restore->out_seq, + frag_goff, + frag_loff, + hdr_read + peek_sz, + fd_frag_meta_ctl( 0UL, som, eom, 0 ) + ); + restore->out_seq = fd_seq_inc( restore->out_seq, 1UL ); + restore->out_cnt += !!som; + restore->acc_rem -= peek_sz; + restore->accv_sz -= peek_sz; + buf_next += peek_sz; + + return buf_next; +} + +static uchar const * +snapshot_read_account_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + + ulong chunk_sz = fd_ulong_min( restore->acc_rem, bufsz ); + if( FD_UNLIKELY( chunk_sz > restore->accv_sz ) ) + FD_LOG_CRIT(( "OOB account vec read: chunk_sz=%lu accv_sz=%lu", chunk_sz, restore->accv_sz )); + + if( FD_LIKELY( chunk_sz ) ) { + + int eom = restore->acc_rem == chunk_sz; + + fd_mcache_publish_stream( + restore->out_mcache, + restore->out_depth, + restore->out_seq, + (ulong)buf - restore->goff_translate, + (ulong)buf - restore->loff_translate, + chunk_sz, + fd_frag_meta_ctl( 0UL, 0, 
eom, 0 ) + ); + + restore->out_seq = fd_seq_inc( restore->out_seq, 1UL ); + restore->acc_rem -= chunk_sz; + restore->accv_sz -= chunk_sz; + buf += chunk_sz; + bufsz -= chunk_sz; + + } + + if( restore->acc_rem == 0UL ) { + ulong pad_sz = fd_ulong_min( fd_ulong_min( restore->acc_pad, bufsz ), restore->accv_sz ); + buf += pad_sz; + bufsz -= pad_sz; + restore->acc_pad -= pad_sz; + restore->accv_sz -= pad_sz; + + if( restore->accv_sz == 0UL ) { + restore->state = SNAP_STATE_IGNORE; + return buf; + } + if( restore->acc_pad == 0UL ) { + return (0==fd_snapshot_expect_account_hdr( restore )) ? buf : NULL; + } + } + + return buf; +} + + +/* fd_snapshot_accv_index populates the index of account vecs. This + index will be used when loading accounts. Returns errno-compatible + error code. */ + +static int +fd_snapshot_accv_index( fd_snapshot_accv_map_t * map, + fd_solana_accounts_db_fields_t const * fields ) { + + for( ulong i=0UL; i < fields->storages_len; i++ ) { + + fd_snapshot_slot_acc_vecs_t * slot = &fields->storages[ i ]; + + for( ulong j=0UL; j < slot->account_vecs_len; j++ ) { + fd_snapshot_acc_vec_t * accv = &slot->account_vecs[ j ]; + + /* Insert new AppendVec */ + fd_snapshot_accv_key_t key = { .slot = slot->slot, .id = accv->id }; + fd_snapshot_accv_map_t * rec = fd_snapshot_accv_map_insert( map, key ); + if( FD_UNLIKELY( !rec ) ) { + FD_LOG_WARNING(( "fd_snapshot_accv_map_insert failed" )); + return ENOMEM; + } + + /* Remember size */ + rec->sz = accv->file_sz; + } + + } + + return 0; +} + +/* snapshot_restore_manifest imports a snapshot manifest into the + given slot context. Also populates the accv index. Destroys the + existing bank structure. */ + +static void +snapshot_restore_manifest( fd_snapin_tile_t * restore ) { + + /* Decode manifest placing dynamic data structures onto slot context + heap. Once the epoch context heap is separated out, we need to + revisit this. + + This is horrible. 
Plenty of room for optimization, including: + - Streaming decoding + - Fixing the decoder (does 2 walks in decode_footprint, decode) + - Unpack directly into slot_ctx */ + + long dt = -fd_log_wallclock(); + + fd_bincode_decode_ctx_t decode = { + .data = restore->buf, + .dataend = restore->buf + restore->buf_sz + }; + + ulong total_sz = 0UL; + int err = fd_solana_manifest_decode_footprint( &decode, &total_sz ); + if( FD_UNLIKELY( err ) ) { + FD_LOG_ERR(( "fd_solana_manifest_decode_footprint failed (%d)", err )); + } + + uchar * scratch = (uchar *)fd_ulong_align_up( (ulong)decode.dataend, fd_solana_manifest_align() ); + ulong scratch_sz = (ulong)( restore->buf + restore->buf_max - scratch ); + if( FD_UNLIKELY( total_sz > scratch_sz ) ) { + FD_LOG_ERR(( "Cannot decode snapshot. Insufficient scratch buffer size (need %lu, have %lu bytes)", + (ulong)scratch + total_sz - (ulong)restore->buf, restore->buf_max )); + } + fd_solana_manifest_t * manifest = fd_solana_manifest_decode( scratch, &decode ); + + char acc_hash_cstr[ FD_BASE58_ENCODED_32_SZ ]; + fd_base58_encode_32( manifest->accounts_db.bank_hash_info.accounts_hash.uc, NULL, acc_hash_cstr ); + if( manifest->bank_incremental_snapshot_persistence ) { + FD_LOG_ERR(( "Incremental snapshots not yet supported TODO" )); + } else { + FD_LOG_NOTICE(( "Full snapshot acc_hash=%s", acc_hash_cstr )); + } + + dt += fd_log_wallclock(); + FD_LOG_NOTICE(( "Snapshot manifest decode took %.2g seconds", (double)dt/1e9 )); + + /* Move over accounts DB fields */ + + fd_solana_accounts_db_fields_t accounts_db = manifest->accounts_db; + fd_memset( &manifest->accounts_db, 0, sizeof(fd_solana_accounts_db_fields_t) ); + + /* Remember slot number */ + + //ulong slot = manifest->bank.slot; + + /* Copy objects into slot context */ + + //if( restore->cb_manifest ) { + // err = restore->cb_manifest( restore->cb_manifest_ctx, manifest, restore->spad ); + //} + + /* Read AccountVec map */ + + if( FD_LIKELY( !err ) ) { + err = fd_snapshot_accv_index( 
restore->accv_map, &accounts_db ); + } + + /* Discard buffer to reclaim heap space */ + + fd_snapshot_restore_discard_buf( restore ); + + restore->manifest_done = 1; +} + +/* snapshot_read_manifest_chunk reads partial manifest content. */ + +static uchar const * +snapshot_read_manifest_chunk( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + uchar const * end = snapshot_read_buffered( restore, buf, bufsz ); + if( snapshot_read_is_complete( restore ) ) { + snapshot_restore_manifest( restore ); + restore->state = SNAP_STATE_IGNORE; + } + return end; +} + +static ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_snapin_tile_t), fd_snapshot_accv_map_align() ); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); + l = FD_LAYOUT_APPEND( l, 16UL, tile->snapin.scratch_sz ); + return l; +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `FileRd` tile" )); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `FileRd` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `FileRd` has %lu outs, expected 1", tile->out_cnt )); + /* FIXME check link names */ + + if( FD_UNLIKELY( !tile->snapin.scratch_sz ) ) FD_LOG_ERR(( "scratch_sz param not set" )); + + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + fd_snapin_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); + void * accv_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); + void * scratch_mem = FD_SCRATCH_ALLOC_APPEND( l, 16UL, tile->snapin.scratch_sz ); + fd_memset( ctx, 
0, sizeof(fd_snapin_tile_t) ); + + /* Init state */ + + ctx->state = SNAP_STATE_TAR; + ctx->flags = 0; + ctx->manifest_done = 0; + + /* Join stream input */ + + ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->in_skip = 0UL; + + /* Join frame buffer */ + + ctx->buf = scratch_mem; + ctx->buf_sz = 0UL; + ctx->buf_ctr = 0UL; + ctx->buf_max = tile->snapin.scratch_sz; + + /* Join snapshot file parser */ + + ctx->accv_map = fd_snapshot_accv_map_join( fd_snapshot_accv_map_new( accv_map_mem ) ); + FD_TEST( ctx->accv_map ); + + /* Join account output */ + + ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); + ctx->out_seq_max = 0UL; + ctx->out_seq = 0UL; + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); + +} + +static void +during_housekeeping( fd_snapin_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_snapin_tile_t * ctx ) { + (void)ctx; +} + +static void +tar_process_hdr( fd_snapin_tile_t * reader, + uchar const * cur ) { + + fd_tar_meta_t const * hdr = (fd_tar_meta_t const *)reader->buf; + + /* "ustar\x00" and "ustar \x00" (overlaps with version) are both + valid values for magic. These are POSIX ustar and OLDGNU versions + respectively. */ + if( FD_UNLIKELY( 0!=memcmp( hdr->magic, FD_TAR_MAGIC, 5UL ) ) ) { + + /* Detect EOF. A TAR EOF is marked by 1024 bytes of zeros. + We abort after 512 bytes. 
*/ + int not_zero=0; + for( ulong i=0UL; ibuf[ i ]; + if( !not_zero ) return; + + /* Not an EOF, so must be a protocol error */ + ulong goff = (ulong)cur - reader->goff_translate - sizeof(fd_tar_meta_t); + FD_LOG_WARNING(( "Invalid tar header magic at goff=0x%lx", goff )); + FD_LOG_HEXDUMP_WARNING(( "Tar header", hdr, sizeof(fd_tar_meta_t) )); + reader->flags |= SNAP_FLAG_FAILED; + return; + } + + ulong file_sz = fd_tar_meta_get_size( hdr ); + if( FD_UNLIKELY( file_sz==ULONG_MAX ) ) { + FD_LOG_WARNING(( "Failed to parse file size in tar header" )); + reader->flags |= SNAP_FLAG_FAILED; + return; + } + reader->tar_file_rem = file_sz; + reader->buf_ctr = (ushort)0U; + + /* Call back to recipient */ + restore_file( reader, hdr, file_sz ); +} + +static uchar const * +tar_read_hdr( fd_snapin_tile_t * reader, + uchar const * cur, + ulong bufsz ) { + uchar const * end = cur+bufsz; + + /* Skip padding */ + if( reader->buf_ctr==0UL ) { + ulong goff = (ulong)cur - reader->goff_translate; + ulong pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; + pad_sz = fd_ulong_min( pad_sz, (ulong)( end-cur ) ); + cur += pad_sz; + } + + /* Determine number of bytes to read */ + long chunk_sz = (long)sizeof(fd_tar_meta_t) - (long)reader->buf_ctr; + FD_TEST( chunk_sz>=0L ); + if( end-cur < chunk_sz ) chunk_sz = end-cur; + + /* Copy to header */ + fd_memcpy( reader->buf + reader->buf_ctr, cur, (ulong)chunk_sz ); + cur += chunk_sz; + reader->buf_ctr += (ulong)chunk_sz; + + /* Handle complete header */ + if( FD_LIKELY( reader->buf_ctr == sizeof(fd_tar_meta_t) ) ) { + tar_process_hdr( reader, cur ); + } + + return cur; +} + +static uchar const * +snapshot_read_discard( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + ulong avail = fd_ulong_min( bufsz, restore->tar_file_rem ); + return buf + avail; +} + +static uchar const * +restore_chunk1( fd_snapin_tile_t * restore, + uchar const * buf, + ulong bufsz ) { + if( FD_UNLIKELY( restore->state==SNAP_STATE_TAR ) ) { + return 
tar_read_hdr( restore, buf, bufsz ); + } + bufsz = fd_ulong_min( bufsz, restore->tar_file_rem ); + + uchar const * buf_next = NULL; + switch( restore->state ) { + case SNAP_STATE_IGNORE: + buf_next = snapshot_read_discard ( restore, buf, bufsz ); + break; + case SNAP_STATE_MANIFEST: + buf_next = snapshot_read_manifest_chunk ( restore, buf, bufsz ); + break; + case SNAP_STATE_ACCOUNT_HDR: + buf_next = snapshot_read_account_hdr_chunk( restore, buf, bufsz ); + break; + case SNAP_STATE_ACCOUNT_DATA: + buf_next = snapshot_read_account_chunk ( restore, buf, bufsz ); + break; + default: + FD_LOG_ERR(( "Invalid parser state %u (this is a bug)", restore->state )); + } + + ulong consumed = (ulong)buf_next - (ulong)buf; + if( FD_UNLIKELY( consumed>bufsz ) ) FD_LOG_CRIT(( "Buffer overflow (consumed=%lu bufsz=%lu)", consumed, bufsz )); + restore->tar_file_rem -= consumed; + if( restore->tar_file_rem==0UL ) { + restore->buf_ctr = 0UL; + restore->buf_sz = 0UL; + restore->state = SNAP_STATE_TAR; + } + return buf_next; +} + +/* on_stream_frag consumes an incoming stream data fragment. This frag + may be up to the dcache size (e.g. 8 MiB), therefore could contain + thousands of accounts. This function will publish a message for each + account to consumers. Slow consumers may cause backpressure and + force this function to exit early (before all accounts in this frag + were published). In that case, this function is called repeatedly + once the backpressure condition resolves (see in_skip). 
*/
+
+static int
+on_stream_frag( fd_snapin_tile_t *            ctx,
+                fd_snapin_in_t *              in,
+                fd_stream_frag_meta_t const * frag,
+                ulong *                       read_sz ) {
+  /* A flag is already set from an earlier call: consume nothing and
+     report the frag as not consumed.  Neither in->goff nor the value
+     behind read_sz is written on this path. */
+  if( FD_UNLIKELY( ctx->flags ) ) {
+    if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) FD_LOG_ERR(( "Failed to restore snapshot" ));
+    return 0;
+  }
+
+  /* [chunk0,chunk1) is the frag payload.  start skips the in_skip
+     bytes that a previous partial pass over this frag already
+     handled. */
+  uchar const * const chunk0 = ctx->in_base + frag->loff;
+  uchar const * const chunk1 = chunk0 + frag->sz;
+  uchar const * const start  = chunk0 + ctx->in_skip;
+  uchar const *       cur    = start;
+
+  /* Chosen such that (ulong)chunk0 - goff_translate == frag->goff and
+     (ulong)chunk0 - loff_translate == frag->loff, so the same
+     subtraction maps any pointer into this frag to its offsets. */
+  ctx->goff_translate = (ulong)chunk0 - frag->goff;
+  ctx->loff_translate = (ulong)chunk0 - frag->loff;
+
+  int consume_frag = 1;
+  for(;;) {
+    if( FD_UNLIKELY( cur>=chunk1 ) ) {
+      ctx->in_skip = 0U; /* frag fully processed */
+      break;
+    }
+    cur = restore_chunk1( ctx, cur, (ulong)( chunk1-cur ) );
+    if( FD_UNLIKELY( ctx->flags ) ) {
+      if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) {
+        FD_LOG_ERR(( "Failed to restore snapshot" ));
+      }
+      /* NOTE(review): presumably FD_LOG_ERR does not return, which
+         would make the retry bookkeeping below unreachable -- confirm
+         whether "blocked" is a temporary placeholder. */
+      FD_LOG_ERR(( "blocked" ));
+      consume_frag = 0; /* retry this frag */
+      /* Keep the pointer difference at full ulong width (a uint cast
+         would truncate it).  Afterwards in_skip == cur-chunk0. */
+      ulong consumed_sz = (ulong)( cur-start );
+      ctx->in_skip += consumed_sz;
+      break;
+    }
+  }
+
+  /* Report the bytes handled by this call and advance in->goff by the
+     same amount. */
+  ulong consumed_sz = (ulong)( cur-start );
+  in->goff += consumed_sz;
+  *read_sz  = consumed_sz;
+  return consume_frag;
+}
+
+/* fd_snapin_in_update gets called periodically to synchronize flow
+   control credits back to the stream producer.  Also updates link in
+   metrics.
*/ + +static void +fd_snapin_in_update( fd_snapin_in_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + FD_VOLATILE( in->fseq[1] ) = in->goff; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noreturn)) static void +fd_snapin_shutdown( void ) { + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + /* FIXME set final sequence number */ + FD_COMPILER_MFENCE(); + FD_LOG_INFO(( "Finished parsing snapshot" )); + + for(;;) pause(); +} + +__attribute__((noinline)) static void +fd_snapin_run1( + fd_snapin_tile_t * ctx, + ulong in_cnt, + fd_snapin_in_t * in, /* [in_cnt] */ + ulong out_cnt, + fd_frag_meta_t ** out_mcache, /* [out_cnt] */ + ulong * out_depth, /* [out_cnt] */ + ulong * out_seq, /* [out_cnt] */ + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong * cons_out, /* [cons_cnt] */ + ulong ** cons_fseq, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* out flow control state */ + ulong cr_avail; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong 
min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + FD_TEST( in_cnt==1 ); + ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( in[0].mcache->f ) + 3; + + /* out frag stream init */ + + cr_avail = 0UL; + + ulong const burst = BURST; + + ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); + + for( ulong out_idx=0UL; out_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. */ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_snapin_in_update( &in[ in_idx ] ); + + /* Input tile finished? */ + ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); + if( FD_UNLIKELY( in_seq_max == in[ 0 ].seq ) ) { + fd_snapin_shutdown(); + } + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Send synchronization info */ + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[1] ) = ctx->out_cnt; + FD_COMPILER_MFENCE(); + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + if( FD_LIKELY( cr_availout_seq_max = ctx->out_seq + cr_avail; + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + FD_COMPILER_MFENCE(); + } + } + + during_housekeeping( ctx ); + + } + + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. 
*/ + + event_seq++; + if( FD_UNLIKELY( event_seq>=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_snapin_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. */ + + if( FD_UNLIKELY( cr_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_stream_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + fd_stream_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + ulong sz = 0U; + int consumed_frag = on_stream_frag( 
ctx, this_in, &meta, &sz ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz; + + if( FD_LIKELY( consumed_frag ) ) { + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu", in_seq )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + + } + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +static void +fd_snapin_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_frag_meta_t * out_mcache[ tile->out_cnt ]; + ulong out_depth [ tile->out_cnt ]; + ulong out_seq [ tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; + FD_TEST( out_mcache[ i ] ); + out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); + out_seq [ i ] = 0UL; + } + + ulong reliable_cons_cnt = 0UL; + ulong cons_out[ FD_TOPO_MAX_LINKS ]; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; 
jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_out[ reliable_cons_cnt ] = k; + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->goff = 0UL; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_snapin_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+reliable_cons_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_snapin_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn = { + .name = "SnapIn", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_snapin_run, +}; +#endif + +#undef LINK_IN_MAX +#undef BURST diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/discof/restore/test_snapin_tile.c b/src/discof/restore/test_snapin_tile.c new file mode 100644 index 0000000000..43ff1bb075 --- /dev/null +++ b/src/discof/restore/test_snapin_tile.c @@ -0,0 +1,20 @@ +#include "fd_snapin_tile.c" 
+ +int +main( int argc, + char ** argv ) { + fd_boot( &argc, &argv ); + + char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); + ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); + ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); + + fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); + if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "Unable to attach to wksp" )); + + fd_wksp_delete_anonymous( wksp ); + + FD_LOG_NOTICE(( "pass" )); + fd_halt(); + return 0; +} diff --git a/src/funk/fd_funk.c b/src/funk/fd_funk.c index 8966633899..9dd9ce51a3 100644 --- a/src/funk/fd_funk.c +++ b/src/funk/fd_funk.c @@ -132,8 +132,8 @@ fd_funk_new( void * shmem, } fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ) { +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ) { if( FD_UNLIKELY( !shfunk ) ) { FD_LOG_WARNING(( "NULL shfunk" )); return NULL; diff --git a/src/funk/fd_funk.h b/src/funk/fd_funk.h index 3a13a0ade3..140003634f 100644 --- a/src/funk/fd_funk.h +++ b/src/funk/fd_funk.h @@ -327,8 +327,8 @@ fd_funk_new( void * shmem, (joins are local to a thread group). */ fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ); +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ); /* fd_funk_leave leaves a funk join. Returns the memory region used for join on success (caller has ownership on return and the caller is no diff --git a/src/funk/fd_funk_filemap.h b/src/funk/fd_funk_filemap.h index 3b56d0656f..18ac700685 100644 --- a/src/funk/fd_funk_filemap.h +++ b/src/funk/fd_funk_filemap.h @@ -1,49 +1,74 @@ #ifndef HEADER_fd_src_funk_fd_funk_filemap_h #define HEADER_fd_src_funk_fd_funk_filemap_h +/* fd_funk_filemap.h provides an experimental API to access a funk DB + via mmap(2) of a regular file. Useful to test large funk DBs without + much memory, but not suitable for production use. 
*/ + #include "fd_funk.h" -enum fd_funk_file_mode { - FD_FUNK_READONLY, /* Only open the file if it already exists, memory is marked readonly */ - FD_FUNK_READ_WRITE, /* Only open the file if it already exists, can be written to */ - FD_FUNK_CREATE, /* Use an existing file if available, otherwise create */ - FD_FUNK_OVERWRITE, /* Create new or overwrite existing with a fresh instance */ - FD_FUNK_CREATE_EXCL /* Fail if file exists, only create new */ +/* fd_funk_filemap_join_t describes a join to a file-backed funk + instance. */ + +struct fd_funk_filemap_join { + fd_funk_t funk[1]; + int funk_fd; + void * map_start; + ulong map_size; }; -typedef enum fd_funk_file_mode fd_funk_file_mode_t; -/* fd_funk_close_file_args_t contains the parameters needed by - * fd_funk_close_file. It is initialized in fd_funk_open_file. */ +typedef struct fd_funk_filemap_join fd_funk_filemap_join_t; + +/* Arguments for creating a funk filemap. */ -struct fd_funk_close_file_args { - void * shmem; - int fd; - ulong total_sz; +struct fd_funk_filemap_create_args { + ulong wksp_tag; /* partition tag for workspace allocations (arbitrary, usually just 1) */ + ulong seed; /* randomized funk hash seed */ + ulong txn_max; /* maximum number of funk transactions */ + ulong rec_max; /* maximum number of funk records */ + ulong total_sz; /* ignored if restoring from a checkpoint */ + int perm_bits; /* third argument to open(2) */ + + /* name of the temporary shm object to be registered via + fd_shmem_join_anonymous */ + char shmem_join_name[ FD_SHMEM_NAME_MAX ]; }; -typedef struct fd_funk_close_file_args fd_funk_close_file_args_t; - -/* Open or create a funk instance with an optional mmap backing file. - filename is the backing file, or NULL for a local/anonymous - instance. wksp_tag is the workspace partition tag for funk (usually - just 1). seed is the randomized hash seed. txn_max is the maximum - number of funk transactions. rec_max is the maximum number of funk - records. 
total_sz is the total size of the funk workspace. mode is - the file mode (see above). close_args_opt is an optional pointer to a - structure which is filled in. This is needed for fd_funk_close_file. - - Note that seed, txn_max, rec_max, and total_sz are ignored if - an existing file is opened without being overwritten. */ - -fd_funk_t * -fd_funk_open_file( void * ljoin, - const char * filename, - ulong wksp_tag, - ulong seed, - ulong txn_max, - ulong rec_max, - ulong total_sz, - fd_funk_file_mode_t mode, - fd_funk_close_file_args_t * close_args_out ); + +typedef struct fd_funk_filemap_create_args fd_funk_filemap_create_args_t; + +FD_PROTOTYPES_BEGIN + +/* fd_funk_filemap_create creates a funk instance backed by a file. + file_path is the path to the file to create. Returns 0 on success. + On failure, returns an errno-compatible code and logs a warning. + Reasons for failure include: file_path already exists, not enough + space on file system, permission error, invalid (wksp_tag, txn_max, + rec_max, total_sz). */ + +int +fd_funk_filemap_create( char const * file_path, + fd_funk_filemap_create_args_t const * args, + char const * shmem_join_name ); + +/* fd_funk_filemap_open joins a previously created file-backed funk + instance using mmap(2). file_path is the path of a funk workspace + file previously created with fd_funk_filemap_create. map_hint is + passed as the first argument of mmap(2). If read_write=={0,1}, the + file and mapping is opened as {read-only,read-write}. Returns ljoin + populated with join info on success. On failure, logs warning, + returns NULL and leaves ljoin in an undefined state. Reasons for + failure include: error opening file, error mapping memory, corrupt + workspace headers, or corrupt funk. + + Security: This API is not hardened against malicious funk instances. + Attempting to access a corrupt funk file can result in memory + corruption. 
*/ + +fd_funk_filemap_join_t * +fd_funk_filemap_open( fd_funk_filemap_join_t * ljoin, + char const * file_path, + void * map_hint, + int read_write ); /* Load a workspace checkpoint containing a funk instance. funk_filename is the backing file, or NULL for a @@ -52,17 +77,17 @@ fd_funk_open_file( void * ljoin, file. close_args_opt is an optional pointer to a structure which is filled in. This is needed for fd_funk_close_file. */ -fd_funk_t * -fd_funk_recover_checkpoint( void * ljoin, - const char * funk_filename, +int +fd_funk_recover_checkpoint( char const * funk_filename, ulong wksp_tag, - const char * checkpt_filename, - fd_funk_close_file_args_t * close_args_out ); + char const * checkpt_filename ); /* Release the resources associated with a funk file map. The funk pointer is invalid after this is called. */ void -fd_funk_close_file( fd_funk_close_file_args_t * close_args ); +fd_funk_filemap_close( fd_funk_filemap_join_t * close_args ); + +FD_PROTOTYPES_END #endif /* HEADER_fd_src_funk_fd_funk_filemap_h */ diff --git a/src/util/archive/fd_tar.h b/src/util/archive/fd_tar.h index 09de84c442..5aa10aadf7 100644 --- a/src/util/archive/fd_tar.h +++ b/src/util/archive/fd_tar.h @@ -84,7 +84,7 @@ fd_tar_set_octal( char buf[ static 12 ], ulong val ); /* fd_tar_meta_set_size sets the size field. Returns 1 on success, 0 - if sz is too large to be represented in TAR header. Set size using the + if sz is too large to be represented in TAR header. Set size using the OLDGNU size extension to allow for unlimited file sizes. The first byte must be 0x80 followed by 0s and then the size in binary. */ @@ -240,30 +240,30 @@ fd_tar_read( void * reader, 2. Write out file data with fd_tar_writer_write_file_data( writer, data, data_sz ). This can be done as many times as you want. 3. Finish the current file with fd_tar_writer_fini_file( writer ). 
- - When you are done, call fd_tar_writer_delete( writer ) to write out the + + When you are done, call fd_tar_writer_delete( writer ) to write out the tar archive trailer and close otu the file descriptor. - If you want to reserve space for an existing file and write back to it + If you want to reserve space for an existing file and write back to it at some point in the future see the below comments for fd_tar_writer_{make,fill}_space(). - + */ struct fd_tar_writer { int fd; /* The file descriptor for the tar archive. */ ulong header_pos; /* The position in the file for the current files header. - If there is no current file that is being streamed out, + If there is no current file that is being streamed out, the header_pos will be equal to ULONG_MAX. */ ulong data_sz; /* The size of the current files data. If there is no current file that is being streamed out, the data_sz will be equal to ULONG_MAX. */ ulong wb_pos; /* If this value is not equal to ULONG_MAX that means that - this is the position at which to write back to with a + this is the position at which to write back to with a call to fd_tar_writer_fill_space. */ - /* TODO: Right now, the stream to the tar writer just uses fd_io_write. + /* TODO: Right now, the stream to the tar writer just uses fd_io_write. This can eventually be abstracted to use write callbacks that use - fd_io streaming under the hood. This adds some additional complexity + fd_io streaming under the hood. This adds some additional complexity that's related to writing back into the header: if the header is still in the ostream buf, modify the buffer. Otherwise, read the header directly from the file. */ @@ -333,15 +333,15 @@ fd_tar_writer_fini_file( fd_tar_writer_t * writer ); /* fd_tar_writer_make_space and fd_tar_writer_fill_space, allow for writing back to a specific place in the tar stream. This can be used by first making a call to fd_tar_write_new_file, fd_tar_writer_make_space, and - fd_tar_writer_fini_file. 
This will populate the header and write out + fd_tar_writer_fini_file. This will populate the header and write out random bytes. The start of this data file will be saved by the tar writer. - Up to n data files can be appended to the tar archive before a call to + Up to n data files can be appended to the tar archive before a call to fd_tar_writer_fill_space. fd_tar_writer_fill_space should only be called after an unpaired call to fd_tar_writer_make_space and it requires a valid fd_tar_writer_t handle. It allows the user to write back to the point at which they made space. _make_space and _fill_space should be paired together. There can only be one oustanding call to make_space at a time. - + TODO: This can be extended to support multiple write backs. */ int From d02fe78de073fce20975216623af05f5a0b0102b Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Fri, 9 May 2025 09:29:19 -0500 Subject: [PATCH 03/34] working unzstd tile but needs cleanup (#5061) --- .../firedancer-dev/commands/snapshot_load.c | 26 +- src/app/firedancer-dev/main.c | 2 + src/discof/restore/Local.mk | 2 + src/discof/restore/fd_actalc_tile.c | 2 +- src/discof/restore/fd_filerd_tile.c | 1 - src/discof/restore/fd_frag_reader.h | 127 +++++ src/discof/restore/fd_restore_base.h | 8 + src/discof/restore/fd_snapin_tile.c | 33 +- src/discof/restore/fd_stream_reader.h | 100 ++++ src/discof/restore/fd_stream_writer.c | 73 +++ src/discof/restore/fd_stream_writer.h | 150 ++++++ src/discof/restore/fd_unzstd_tile.c | 440 ++++++++++++++++++ 12 files changed, 942 insertions(+), 22 deletions(-) create mode 100644 src/discof/restore/fd_frag_reader.h create mode 100644 src/discof/restore/fd_stream_reader.h create mode 100644 src/discof/restore/fd_stream_writer.c create mode 100644 src/discof/restore/fd_stream_writer.h diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 2d0582e29b..3aca227766 100644 --- 
a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -42,22 +42,35 @@ snapshot_load_topo( config_t * config, FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); + fd_topob_wksp( topo, "Unzstd" ); + fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); + (void)unzstd_tile; + fd_topob_wksp( topo, "SnapIn" ); - fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[2], 0, 0 ); + fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[3], 0, 0 ); snapin_tile->snapin.scratch_sz = (3UL<<30); fd_topob_wksp( topo, "ActAlc" ); - fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[3], 0, 0 ); + fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[4], 0, 0 ); (void)actalc_tile; + fd_topob_wksp( topo, "snap_unzstd" ); fd_topob_wksp( topo, "snap_stream" ); + fd_topo_link_t * unzstd_link = fd_topob_link( topo, "snap_unzstd", "snap_unzstd", 512UL, 0UL, 0UL ); fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); fd_topo_obj_t * snapin_dcache = fd_topob_obj( topo, "dcache", "snap_stream" ); + fd_topo_obj_t * unzstd_dcache = fd_topob_obj( topo, "dcache", "snap_unzstd"); + unzstd_link->dcache_obj_id = unzstd_dcache->id; snapin_link->dcache_obj_id = snapin_dcache->id; FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); - fd_topob_tile_out ( topo, "FileRd", 0UL, "snap_stream", 0UL ); + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", unzstd_dcache->id ) ); + fd_topob_tile_out ( topo, "FileRd", 0UL, "snap_unzstd", 0UL ); + fd_topob_tile_in (topo, "Unzstd", 0UL, "metric_in", 
"snap_unzstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED); + fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, filerd_tile, unzstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_uses( topo, unzstd_tile, unzstd_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); @@ -123,11 +136,13 @@ snapshot_load_cmd_fn( args_t * args, fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; + fd_topo_tile_t * unzstd_tile = &topo->tiles[ fd_topo_find_tile( topo, "Unzstd", 0UL ) ]; ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); + ulong volatile * unzstd_in_metrics = fd_metrics_tile( unzstd_tile->metrics ); ulong goff_old = 0UL; ulong file_rd_backp_old = 0UL; @@ -139,7 +154,8 @@ snapshot_load_cmd_fn( args_t * args, ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); - if( FD_UNLIKELY( filerd_status==2UL || snapin_status==2UL ) ) { + ulong unzstd_status = FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); + if( FD_UNLIKELY( filerd_status==2UL || 
snapin_status==2UL || unzstd_status==2UL ) ) { FD_LOG_NOTICE(( "Done" )); break; } diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 866fc62a47..3010cb9ea9 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -98,6 +98,7 @@ extern fd_topo_run_tile_t fd_tile_archiver_playback; extern fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd; extern fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn; extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -134,6 +135,7 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_snapshot_restore_FileRd, &fd_tile_snapshot_restore_SnapIn, &fd_tile_snapshot_restore_ActAlc, + &fd_tile_snapshot_restore_Unzstd, NULL, }; diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index ab4a7d08c5..7937fc2bb8 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -1,3 +1,5 @@ $(call add-objs,fd_filerd_tile,fd_discof) $(call add-objs,fd_snapin_tile,fd_discof) $(call add-objs,fd_actalc_tile,fd_discof) +$(call add-objs,fd_stream_writer,fd_discof) +$(call add-objs,fd_unzstd_tile,fd_discof) diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index d6e8ee0cc7..bc45df5101 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -80,7 +80,7 @@ on_stream_frag( fd_actalc_tile_t * ctx, fd_stream_frag_meta_t const * frag, ulong * read_sz ) { (void)ctx; (void)in; (void)frag; (void)read_sz; - FD_LOG_NOTICE(( "frag" )); + // FD_LOG_NOTICE(( "frag" )); return 1; } diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c index 8df32496bb..1ea69b9d21 100644 --- a/src/discof/restore/fd_filerd_tile.c +++ b/src/discof/restore/fd_filerd_tile.c @@ -99,7 +99,6 @@ after_credit( fd_filerd_tile_t * ctx, ulong * restrict cr_byte_avail, int * restrict charge_busy_after ) { 
/* Assumes *cr_frag_avail>=2 */ - int fd = ctx->fd; if( FD_UNLIKELY( fd<0 ) ) return; diff --git a/src/discof/restore/fd_frag_reader.h b/src/discof/restore/fd_frag_reader.h new file mode 100644 index 0000000000..9254fec21c --- /dev/null +++ b/src/discof/restore/fd_frag_reader.h @@ -0,0 +1,127 @@ +#ifndef HEADER_fd_src_discof_restore_fd_frag_reader_h +#define HEADER_fd_src_discof_restore_fd_frag_reader_h + +#include "../../disco/stem/fd_stem.h" +#include "../../disco/metrics/fd_metrics.h" + +struct __attribute__((aligned(64))) fd_frag_reader { + fd_frag_meta_t const * mcache; /* local join to this in's mcache */ + uint depth; /* == fd_mcache_depth( mcache ), depth of this in's cache (const) */ + uint idx; /* index of this in in the list of providers, [0, in_cnt) */ + ulong seq; /* sequence number of next frag expected from the upstream producer, + updated when frag from this in is published */ + fd_frag_meta_t const * mline; /* == mcache + fd_mcache_line_idx( seq, depth ), location to poll next */ + ulong * fseq; /* local join to the fseq used to return flow control credits to the in */ + uint accum[6]; /* local diagnostic accumulators. These are drained during in housekeeping. 
*/ + /* Assumes FD_FSEQ_DIAG_{PUB_CNT,PUB_SZ,FILT_CNT,FILT_SZ,OVRNP_CNT,OVRNP_FRAG_CNT} are 0:5 */ +}; +typedef struct fd_frag_reader fd_frag_reader_t; + +struct fd_frag_reader_consume_ctx { + ulong seq_found; /* the seq num at the current mline */ + ulong seq_curr; /* the seq num in the stream reader */ + fd_frag_meta_t const * mline; /* current mline being consumed */ + ulong in_idx; /* link idx being polled */ +}; +typedef struct fd_frag_reader_consume_ctx fd_frag_reader_consume_ctx_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_frag_reader_align( void ) { + return alignof(fd_frag_reader_t); +} + +FD_FN_CONST static inline ulong +fd_frag_reader_footprint( void ) { + return sizeof(fd_frag_reader_t); +} + +static inline void +fd_frag_reader_init( fd_frag_reader_t * reader, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + reader->mcache = mcache; + reader->fseq = fseq; + ulong depth = fd_mcache_depth( reader->mcache ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", in_idx )); + reader->depth = (uint)depth; + reader->idx = (uint)in_idx; + reader->seq = 0UL; + reader->mline = reader->mcache + fd_mcache_line_idx( reader->seq, reader->depth ); + + reader->accum[0] = 0U; reader->accum[1] = 0U; reader->accum[2] = 0U; + reader->accum[3] = 0U; reader->accum[4] = 0U; reader->accum[5] = 0U; +} + +static inline fd_frag_reader_t * +fd_frag_reader_new( void * mem, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_frag_reader_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_frag_reader_t * self = (fd_frag_reader_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_frag_reader_t), sizeof(fd_frag_reader_t) ); + + fd_frag_reader_init( self, mcache, fseq, in_idx ); + return self; +} + +static inline 
long +fd_frag_reader_poll_frag( fd_frag_reader_t * reader, + ulong in_idx, + fd_frag_reader_consume_ctx_t * ctx ) { + ctx->seq_curr = reader->seq; + ctx->mline = reader->mline; + ctx->in_idx = in_idx; + ctx->seq_found = fd_frag_meta_seq_query( ctx->mline ); + return fd_seq_diff( ctx->seq_curr, ctx->seq_found ); +} + +static inline void +fd_frag_reader_process_overrun( fd_frag_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + long seq_diff ) { + reader->seq = ctx->seq_curr; + reader->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + reader->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-seq_diff); +} + +static inline void +fd_frag_reader_consume_frag( fd_frag_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + ulong frag_sz ) { + reader->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)frag_sz; + + /* check for overrun: when sequence number has changed */ + ulong seq_test = fd_frag_meta_seq_query( ctx->mline ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, ctx->seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu", ctx->in_idx )); + } + + /* wind up for next in poll and accumulate diagnostics */ + ctx->seq_curr = fd_seq_inc( ctx->seq_curr, 1UL ); + reader->seq = ctx->seq_curr; + reader->mline = reader->mcache + fd_mcache_line_idx( ctx->seq_curr, reader->depth ); + reader->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; +} + +static inline void * +fd_frag_reader_destroy( fd_frag_reader_t * reader ) { + fd_memset( reader, 0, sizeof(fd_frag_reader_t) ); + return (void *)reader; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_frag_reader_h */ \ No newline at end of file diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index fa4f6c13a1..dd26a4c277 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -26,6 +26,14 @@ union fd_stream_frag_meta { typedef union fd_stream_frag_meta 
fd_stream_frag_meta_t; +struct fd_stream_frag_meta_ctx { + uchar const * in_buf; + ulong goff_translate; + ulong loff_translate; + ulong in_skip; +}; +typedef struct fd_stream_frag_meta_ctx fd_stream_frag_meta_ctx_t; + FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index e154756411..a410f03284 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -576,8 +576,9 @@ unprivileged_init( fd_topo_t * topo, /* Join stream input */ - ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; - ctx->in_skip = 0UL; + uchar const * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ); + ctx->in_base = out_dcache; + ctx->in_skip = 0UL; /* Join frame buffer */ @@ -745,6 +746,7 @@ on_stream_frag( fd_snapin_tile_t * ctx, return 0; } + (void)in; uchar const * const chunk0 = ctx->in_base + frag->loff; uchar const * const chunk1 = chunk0 + frag->sz; uchar const * const start = chunk0 + ctx->in_skip; @@ -801,15 +803,15 @@ fd_snapin_in_update( fd_snapin_in_t * in ) { accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; } -__attribute__((noreturn)) static void -fd_snapin_shutdown( void ) { - FD_MGAUGE_SET( TILE, STATUS, 2UL ); - /* FIXME set final sequence number */ - FD_COMPILER_MFENCE(); - FD_LOG_INFO(( "Finished parsing snapshot" )); +// __attribute__((noreturn)) static void +// fd_snapin_shutdown( void ) { +// FD_MGAUGE_SET( TILE, STATUS, 2UL ); +// /* FIXME set final sequence number */ +// FD_COMPILER_MFENCE(); +// FD_LOG_INFO(( "Finished parsing snapshot" )); - for(;;) pause(); -} +// for(;;) pause(); +// } __attribute__((noinline)) static void fd_snapin_run1( @@ -861,7 +863,7 @@ fd_snapin_run1( } FD_TEST( in_cnt==1 ); - ulong const volatile * restrict 
shutdown_signal = fd_mcache_seq_laddr_const( in[0].mcache->f ) + 3; + // ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( in[0].mcache->f ) + 3; /* out frag stream init */ @@ -934,10 +936,11 @@ fd_snapin_run1( fd_snapin_in_update( &in[ in_idx ] ); /* Input tile finished? */ - ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); - if( FD_UNLIKELY( in_seq_max == in[ 0 ].seq ) ) { - fd_snapin_shutdown(); - } + // ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); + // FD_LOG_WARNING(("snapin: in_seq_max is %lu", in_seq_max)); + // if( FD_UNLIKELY( in_seq_max == in[ 0 ].seq ) ) { + // fd_snapin_shutdown(); + // } } else { /* event_idx==cons_cnt, housekeeping event */ diff --git a/src/discof/restore/fd_stream_reader.h b/src/discof/restore/fd_stream_reader.h new file mode 100644 index 0000000000..75ee8f09f8 --- /dev/null +++ b/src/discof/restore/fd_stream_reader.h @@ -0,0 +1,100 @@ +#ifndef HEADER_fd_src_discof_restore_fd_stream_reader_h +#define HEADER_fd_src_discof_restore_fd_stream_reader_h + +#include "fd_restore_base.h" +#include "fd_frag_reader.h" + +struct fd_stream_reader { + union { + struct { + fd_stream_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + fd_stream_frag_meta_t const * mline; + ulong volatile * fseq; + uint accum[6]; + }; + + fd_frag_reader_t r[1]; + } base; + ulong goff; +}; +typedef struct fd_stream_reader fd_stream_reader_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_stream_reader_align( void ) { + return alignof(fd_stream_reader_t); +} + +FD_FN_CONST static inline ulong +fd_stream_reader_footprint( void ) { + return sizeof(fd_stream_reader_t); +} + +static inline void +fd_stream_reader_init( fd_stream_reader_t * reader, + fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + fd_frag_reader_init( reader->base.r, mcache, fseq, in_idx ); + reader->goff = 0UL; +} + +static inline fd_stream_reader_t * +fd_stream_reader_new( void * mem, + 
fd_frag_meta_t const * mcache, + ulong * fseq, + ulong in_idx ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_reader_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_reader_t * self = (fd_stream_reader_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), sizeof(fd_stream_reader_t) ); + + fd_stream_reader_init( self, mcache, fseq, in_idx ); + + return self; +} + +static inline long +fd_stream_reader_poll_frag( fd_stream_reader_t * reader, + ulong in_idx, + fd_frag_reader_consume_ctx_t * ctx ) { + return fd_frag_reader_poll_frag( reader->base.r, in_idx, ctx ); +} + +static inline void +fd_stream_reader_process_overrun( fd_stream_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + long seq_diff ) { + fd_frag_reader_process_overrun( reader->base.r, ctx, seq_diff ); +} + +static inline void +fd_stream_reader_consume_frag( fd_stream_reader_t * reader, + fd_frag_reader_consume_ctx_t * ctx, + ulong frag_sz ) { + reader->goff += frag_sz; + fd_frag_reader_consume_frag( reader->base.r, ctx, frag_sz ); +} + +static inline void * +fd_stream_reader_destroy( fd_stream_reader_t * reader ) { + fd_frag_reader_destroy( reader->base.r ); + reader->goff = 0UL; + return (void *)reader; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_stream_reader_h */ \ No newline at end of file diff --git a/src/discof/restore/fd_stream_writer.c b/src/discof/restore/fd_stream_writer.c new file mode 100644 index 0000000000..0dbfbf3d76 --- /dev/null +++ b/src/discof/restore/fd_stream_writer.c @@ -0,0 +1,73 @@ +#include "fd_stream_writer.h" +#include "../../util/log/fd_log.h" +#include "../../util/wksp/fd_wksp.h" +#include "../../tango/dcache/fd_dcache.h" + +fd_stream_writer_t * +fd_stream_writer_new( void * mem, + fd_topo_t * topo, + fd_topo_tile_t * tile, + ulong link_id, + ulong read_max, + 
ulong burst_byte, + ulong burst_frag ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_writer_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_writer_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); + + fd_topo_link_t const * link = &topo->links[ tile->out_link_id[ link_id ] ]; + void * dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ link_id ] ].dcache_obj_id ) ); + fd_stream_frag_meta_t * out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ link_id ] ].mcache ); + ulong cons_cnt = fd_topo_link_reliable_consumer_cnt( topo, link ); + + self->out_mcache = out_mcache; + self->buf = dcache; + self->buf_off = 0UL; + self->buf_sz = fd_dcache_data_sz( dcache ); + self->goff = 0UL; + self->read_max = read_max; + self->stream_off = 0UL; + self->out_seq = 0UL; + + /* Set up flow control state */ + self->cr_byte_avail = 0UL; + self->cr_frag_avail = 0UL; + self->cr_byte_max = fd_dcache_data_sz( dcache ); + self->cr_frag_max = fd_mcache_depth( self->out_mcache->f ); + self->burst_byte = burst_byte; + self->burst_frag = burst_frag; + self->cons_cnt = cons_cnt; + self->cons_seq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), EXPECTED_FSEQ_CNT_PER_CONS*cons_cnt*sizeof(ulong) ); + self->cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); + + /* Set up consumer fseq pointer array. + We keep track of 2 fseqs per consumer to manage stream flow control. + The first fseq tracks the consumer's mcache sequence number. + The second fseq tracks the consumer's global read offset into stream. 
*/ + ulong cons_idx = 0UL; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ link_id ] && consumer_tile->in_link_reliable[ j ] ) ) { + self->cons_fseq[ cons_idx ] = consumer_tile->in_link_fseq[ j ]; + if( FD_UNLIKELY( !self->cons_fseq[ cons_idx ] ) ) FD_LOG_ERR(( "NULL cons_fseq[%lu]", cons_idx )); + cons_idx++; + } + } + } + + fd_memset(self->cons_seq, 0, EXPECTED_FSEQ_CNT_PER_CONS*cons_cnt*sizeof(ulong) ); + /* make sure we're not tripping */ + FD_TEST( cons_idx==cons_cnt ); + + return self; +} diff --git a/src/discof/restore/fd_stream_writer.h b/src/discof/restore/fd_stream_writer.h new file mode 100644 index 0000000000..8fb0238760 --- /dev/null +++ b/src/discof/restore/fd_stream_writer.h @@ -0,0 +1,150 @@ +#ifndef HEADER_fd_src_discof_restore_fd_stream_writer_h +#define HEADER_fd_src_discof_restore_fd_stream_writer_h + +#include "../../util/fd_util_base.h" +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" + +/* A shared stream has a single producer and multiple consumers. 
+ fd_stream_writer implements the producer APIs of the shared stream */ +struct fd_stream_writer { + fd_stream_frag_meta_t * out_mcache; /* frag producer mcache */ + + uchar * buf; /* laddr of shared dcache buffer */ + + /* dcache buffer state */ + ulong buf_off; /* local write offset into dcache buffer */ + ulong buf_sz; /* dcache buffer size */ + ulong goff; /* global offset into byte stream */ + ulong read_max; /* max chunk size */ + ulong stream_off; /* start of published stream */ + ulong out_seq; /* current sequence number */ + + /* flow control */ + ulong cr_byte_avail; /* bytes available in the slowest consumer */ + ulong cr_frag_avail; /* frags available in the slowest consumer */ + ulong cr_byte_max; /* max dcache buffer credits (size of dcache buffer)*/ + ulong cr_frag_max; /* max mcache frag credits */ + ulong burst_byte; + ulong burst_frag; + ulong cons_cnt; /* number of consumers */ + ulong * cons_seq; /* consumer fseq values */ + ulong ** cons_fseq; /* consumer fseq pointers */ +}; +typedef struct fd_stream_writer fd_stream_writer_t; + +#define EXPECTED_FSEQ_CNT_PER_CONS 2 + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_stream_writer_align( void ) { + return alignof(fd_stream_writer_t); +} + +FD_FN_CONST static inline ulong +fd_stream_writer_footprint( void ) { + return sizeof(fd_stream_writer_t); +} + +static inline uchar * +fd_stream_writer_get_write_ptr( fd_stream_writer_t * writer ) { + return writer->buf + writer->buf_off; +} + +fd_stream_writer_t * +fd_stream_writer_new( void * mem, + fd_topo_t * topo, + fd_topo_tile_t * tile, + ulong link_id, + ulong read_max, + ulong burst_byte, + ulong burst_frag ); + +static inline void +fd_stream_writer_init_flow_control_credits( fd_stream_writer_t * writer ) { + for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][0] ); + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 
] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][1] ); + } +} + +static inline void +fd_stream_writer_receive_flow_control_credits( fd_stream_writer_t * writer, + ulong cons_idx) { + FD_COMPILER_MFENCE(); + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][0] ); + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][1] ); + FD_COMPILER_MFENCE(); +} + +static inline void +fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer, + ulong * slowest_cons_out ) { + ulong slowest_cons = ULONG_MAX; + if( FD_LIKELY( writer->cr_byte_availcr_byte_max || writer->cr_frag_availcr_frag_max ) ) { + ulong cr_byte_avail = writer->cr_byte_max; + ulong cr_frag_avail = writer->cr_frag_max; + for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { + ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)writer->cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ 2*cons_idx+1 ] ), 0L ), 0L ); + ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)writer->cr_frag_max-fd_long_max( fd_seq_diff( writer->out_seq, writer->cons_seq[ 2*cons_idx ] ), 0L ), 0L ); + slowest_cons = fd_ulong_if( cons_cr_byte_availcr_byte_avail = cr_byte_avail; + writer->cr_frag_avail = cr_frag_avail; + } + + if( slowest_cons_out ) { + *slowest_cons_out = slowest_cons; + } +} + +static inline ulong +fd_stream_writer_get_avail_bytes( fd_stream_writer_t * writer ) { + if( FD_UNLIKELY( writer->buf_off > writer->buf_sz ) ) { + FD_LOG_CRIT(( "Buffer overflow (buf_off=%lu buf_sz=%lu)", writer->buf_off, writer->buf_sz )); + return 0; + } + + ulong const read_max = fd_ulong_min( writer->cr_byte_avail, writer->read_max ); + return fd_ulong_min( read_max, writer->buf_sz - writer->buf_off ); +} + +static inline void +fd_stream_writer_publish( fd_stream_writer_t * writer, + ulong frag_sz ) { + ulong loff = writer->stream_off; + fd_mcache_publish_stream( writer->out_mcache, + 
fd_mcache_depth( writer->out_mcache->f ), + writer->out_seq, + writer->goff, + loff, + frag_sz, + 0 ); + writer->out_seq = fd_seq_inc( writer->out_seq, 1UL ); + writer->cr_frag_avail -= 1; + + /* rewind buf_off to start of buffer */ + if( writer->buf_off >= writer->buf_sz ) { + writer->buf_off = 0UL; + } + + writer->stream_off = writer->buf_off; +} + +static inline void +fd_stream_writer_advance( fd_stream_writer_t * writer, + ulong sz ) { + writer->goff += sz; + writer->buf_off += sz; + writer->cr_byte_avail -= sz; +} + +/* TODO: destroy / free */ + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_stream_writer_h */ \ No newline at end of file diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index e69de29bb2..4a6693faec 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -0,0 +1,440 @@ +#include "../../disco/topo/fd_topo.h" +#include "../../ballet/zstd/fd_zstd.h" +#include "fd_stream_writer.h" +#include "fd_stream_reader.h" +#include +#include + +#define NAME "unzstd" +#define ZSTD_WINDOW_SZ (33554432UL) +#define ZSTD_FRAME_SZ 16384UL +#define LINK_IN_MAX 1 + +struct fd_unzstd_tile { + fd_stream_frag_meta_ctx_t in_state; /* input mcache context */ + fd_zstd_dstream_t * dstream; /* zstd decompress reader */ + fd_stream_writer_t * writer; /* stream writer object */ +}; + +typedef struct fd_unzstd_tile fd_unzstd_tile_t; + +FD_FN_PURE static ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_unzstd_tile_t), fd_zstd_dstream_align() ); +} + +FD_FN_PURE static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); + l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint() ); + return l; +} + +static void 
+unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + fd_unzstd_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); + void * zstd_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); + void * writer_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint() ); + + void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] ].dcache_obj_id ) ); + FD_TEST( out_dcache ); + + fd_memset( ctx, 0, sizeof(fd_unzstd_tile_t) ); + + ctx->in_state.in_buf = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->dstream = fd_zstd_dstream_new( zstd_mem, ZSTD_WINDOW_SZ ); + ctx->writer = fd_stream_writer_new( writer_mem, topo, tile, 0, ZSTD_WINDOW_SZ, 512UL, 2UL ); + + fd_zstd_dstream_reset( ctx->dstream ); +} + +static void +during_housekeeping( fd_unzstd_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_unzstd_tile_t * ctx ) { + (void)ctx; +} + +static int +on_stream_frag( fd_unzstd_tile_t * ctx, + fd_stream_reader_t * reader FD_PARAM_UNUSED, + fd_stream_frag_meta_t const * frag, + ulong * sz ) { + uchar const * chunk0 = ctx->in_state.in_buf + frag->loff; + uchar const * chunk_start = chunk0 + ctx->in_state.in_skip; + uchar const * chunk_end = chunk0 + frag->sz; + + ulong total_decompressed = 0UL; + uint dirty = 0; + int consume_frag = 1; + for(;;) { + uchar const * prev_chunk_start = chunk_start; + + if( !dirty && chunk_start==chunk_end ) { + fd_stream_writer_publish( ctx->writer, total_decompressed ); + ctx->in_state.in_skip = 0UL; + break; + } + + uchar * buf_write_start = fd_stream_writer_get_write_ptr( ctx->writer ); + uchar * out = buf_write_start; + ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); + uchar * out_end = 
buf_write_start + dst_max; + + if( dst_max==0 ) { + /* we are blocked by downstream */ + fd_stream_writer_publish( ctx->writer, total_decompressed ); + // FD_LOG_WARNING(("we are blocked by downstream! consumed %lu bytes frag size is %u", ctx->in_state.in_skip, frag->sz)); + consume_frag=0; + break; + } + + int zstd_err = fd_zstd_dstream_read( ctx->dstream, &chunk_start, chunk_end, &out, out_end, NULL ); + if( FD_UNLIKELY( zstd_err>0) ) { + FD_LOG_WARNING(( "fd_zstd_dstream_read failed" )); + consume_frag=0; + break; + } + + ulong decompress_sz = (ulong)out - (ulong)buf_write_start; + total_decompressed += decompress_sz; + ctx->in_state.in_skip += (ulong)chunk_start - (ulong)prev_chunk_start; + dirty = (out==out_end); + + fd_stream_writer_advance( ctx->writer, decompress_sz ); + } + + *sz = frag->sz; + return consume_frag; +} + +static void +fd_unzstd_in_update( fd_stream_reader_t * in ) { + // FD_LOG_WARNING(("unzstd: in fseq is %lu", (ulong)in->base.fseq)); + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->base.fseq[0] ) = in->base.seq; + FD_VOLATILE( in->base.fseq[1] ) = in->goff; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->base.idx ); + + uint * accum = in->base.accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +__attribute__((noreturn)) static void +fd_unzstd_shutdown( void ) { + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + /* FIXME set final sequence number */ + FD_COMPILER_MFENCE(); + FD_LOG_INFO(( "Finished parsing snapshot" )); + + for(;;) pause(); +} + +/* ?? 
*/ +__attribute__((noinline)) static void +fd_unzstd_run1( + fd_unzstd_tile_t * ctx, + ulong in_cnt, + fd_stream_reader_t * in, /* [in_cnt] */ + ulong out_cnt FD_PARAM_UNUSED, + fd_stream_frag_meta_t ** out_mcache_arr FD_PARAM_UNUSED, + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong * cons_out FD_PARAM_UNUSED, /* [cons_cnt] */ + ulong ** cons_fseq FD_PARAM_UNUSED, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow FD_PARAM_UNUSED, /* [cons_cnt] */ + ulong * restrict cons_seq FD_PARAM_UNUSED, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng ) { + + /* in frag stream state */ + ulong in_seq; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; /* min number of ticks between a housekeeping event */ + + /* performance metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; + + /* out frag stream init */ + + ulong const burst_byte = 512UL; /* don't producing frags smaller than this */ + ulong const burst_frag = 2UL; + + fd_stream_writer_init_flow_control_credits( ctx->writer ); + + /* housekeeping init */ + + //if( lazy<=0L ) lazy = fd_tempo_lazy_default( out_depth ); + lazy = 1e3L; + FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); + ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( in[0].base.mcache->f ) + 3; + + /* Initial event sequence */ + + event_cnt = in_cnt + 1UL + cons_cnt; + event_seq = 0UL; + event_map[ event_seq++ ] = (ushort)cons_cnt; + for( ulong in_idx=0UL; in_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxwriter, cons_idx ); + + ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); + if( FD_UNLIKELY( in_seq_max == in[ 0 ].base.seq && in_seq_max != 0) ) { + FD_LOG_WARNING(("zstd shutting 
down! in_seq_max is %lu in[0].base.seq is %lu", in_seq_max, in[0].base.seq)); + fd_unzstd_shutdown(); + } + + } else if( event_idx>cons_cnt) { + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_unzstd_in_update( &in[ in_idx ] ); + } + else { /* event_idx==cons_cnt, housekeeping event */ + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + ulong slowest_cons = ULONG_MAX; + fd_stream_writer_update_flow_control_credits( ctx->writer, &slowest_cons ); + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + FD_COMPILER_MFENCE(); + } + + during_housekeeping( ctx ); + } + + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. */ + + event_seq++; + if( FD_UNLIKELY( event_seq>=event_cnt ) ) { + event_seq = 0UL; + // ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + // ushort map_tmp = event_map[ swap_idx ]; + // event_map[ swap_idx ] = event_map[ 0 ]; + // event_map[ 0 ] = map_tmp; + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next - now); + now = next; + } + + /* Check if we are backpressured. 
*/ + + if( FD_UNLIKELY( ctx->writer->cr_byte_availwriter->cr_frag_availwriter->cr_byte_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + fd_frag_reader_consume_ctx_t consume_ctx; + long diff = fd_stream_reader_poll_frag( this_in, in_seq, &consume_ctx ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + + fd_stream_reader_process_overrun( this_in, &consume_ctx, diff ); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + ulong sz = 0U; + int consumed_frag = on_stream_frag( ctx, this_in, fd_type_pun_const( consume_ctx.mline ), &sz ); + + if( FD_LIKELY( consumed_frag ) ) { + // FD_LOG_WARNING(("consuming frag with sz: %lu", sz)); + fd_stream_reader_consume_frag( this_in, &consume_ctx, sz ); + } + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +static void +fd_unzstd_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + fd_memset(in_mcache, 0, sizeof(fd_stream_frag_meta_t *)*LINK_IN_MAX); + fd_memset(in_fseq, 0, sizeof(ulong *)*LINK_IN_MAX ); + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + 
FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_stream_frag_meta_t * out_mcache[ tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = fd_type_pun( topo->links[ tile->out_link_id[ i ] ].mcache ); + FD_TEST( out_mcache[ i ] ); + } + + ulong reliable_cons_cnt = 0UL; + ulong cons_out[ FD_TOPO_MAX_LINKS ]; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_out[ reliable_cons_cnt ] = k; + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cnttile_obj_id ); + FD_LOG_WARNING(("reliable_cons_count is %lu", reliable_cons_cnt)); + ushort event_map[ 1+reliable_cons_cnt+polled_in_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ 2*reliable_cons_cnt+1 ]; + + FD_LOG_WARNING(("event map is located at %lx", (ulong)event_map)); + FD_LOG_WARNING(("cons fseq is located at %lx", (ulong)cons_fseq)); + FD_LOG_WARNING(("cons seq is located at %lx", (ulong)cons_seq)); + + fd_unzstd_run1( ctx, + polled_in_cnt, + polled_in, + reliable_cons_cnt, + out_mcache, + reliable_cons_cnt, + event_map, + cons_out, + cons_fseq, + cons_slow, + cons_seq, + (ulong)10e3, + rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd = { + .name = "Unzstd", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_unzstd_run, +}; +#endif \ No newline at end of file From e6a0803b3f65bfbbb4ba7048e96703d9a3c763c8 Mon Sep 17 
00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 00:19:06 +0000 Subject: [PATCH 04/34] fdctl: dedup topology code --- src/app/firedancer-dev/commands/backtest.c | 209 ++++----------------- src/app/firedancer-dev/commands/sim.c | 164 +++------------- src/app/firedancer/topology.c | 127 ++++++++----- src/app/firedancer/topology.h | 39 +++- src/disco/topo/fd_topo.h | 10 +- src/disco/topo/fd_topob.c | 4 +- src/discof/repair/fd_repair_tile.c | 2 +- src/discof/replay/fd_replay_tile.c | 10 +- 8 files changed, 186 insertions(+), 379 deletions(-) diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index 1feb407b35..09ed889173 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -15,7 +15,7 @@ */ -#include "../../shared/commands/configure/configure.h" +#include "../../firedancer/topology.h" #include "../../shared/commands/run/run.h" /* initialize_workspaces */ #include "../../shared/fd_config.h" /* config_t */ #include "../../../disco/tiles.h" @@ -23,124 +23,12 @@ #include "../../../disco/topo/fd_topob.h" #include "../../../disco/topo/fd_pod_format.h" #include "../../../discof/geyser/fd_replay_notif.h" -#include "../../../flamenco/runtime/fd_runtime.h" #include "../../../flamenco/runtime/fd_txncache.h" -#include "../../../flamenco/snapshot/fd_snapshot_base.h" #include /* pause */ extern fd_topo_obj_callbacks_t * CALLBACKS[]; fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); -static fd_topo_obj_t * -setup_topo_runtime_pub( fd_topo_t * topo, - char const * wksp_name, - ulong mem_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "runtime_pub", wksp_name ); - FD_TEST( fd_pod_insertf_ulong( topo->props, mem_max, "obj.%lu.mem_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, 12UL, "obj.%lu.wksp_tag", obj->id ) ); - return obj; -} - -static fd_topo_obj_t * -setup_topo_txncache( fd_topo_t * topo, - char const * wksp_name, - ulong 
max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_live_slots, "obj.%lu.max_live_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_txn_per_slot, "obj.%lu.max_txn_per_slot", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_constipated_slots, "obj.%lu.max_constipated_slots", obj->id ) ); - - return obj; -} - -#include -#include "../../../flamenco/runtime/fd_blockstore.h" -static fd_topo_obj_t * -setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - ulong alloc_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "blockstore", wksp_name ); - - ulong seed; - FD_TEST( sizeof(ulong) == getrandom( &seed, sizeof(ulong), 0 ) ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, 1UL, "obj.%lu.wksp_tag", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, shred_max, "obj.%lu.shred_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, block_max, "obj.%lu.block_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, idx_max, "obj.%lu.idx_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, txn_max, "obj.%lu.txn_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, alloc_max, "obj.%lu.alloc_max", obj->id ) ); - - /* DO NOT MODIFY LOOSE WITHOUT CHANGING HOW BLOCKSTORE ALLOCATES INTERNAL STRUCTURES */ - - ulong blockstore_footprint = fd_blockstore_footprint( shred_max, block_max, idx_max, txn_max ) + alloc_max; - FD_TEST( fd_pod_insertf_ulong( topo->props, blockstore_footprint, "obj.%lu.loose", obj->id ) ); - - return obj; -} - -static void -setup_snapshots( 
config_t * config, - fd_topo_tile_t * tile ) { - uchar incremental_is_file, incremental_is_url; - if( strnlen( config->tiles.replay.incremental, PATH_MAX )>0UL ) { - incremental_is_file = 1U; - } else { - incremental_is_file = 0U; - } - if( strnlen( config->tiles.replay.incremental_url, PATH_MAX )>0UL ) { - incremental_is_url = 1U; - } else { - incremental_is_url = 0U; - } - if( FD_UNLIKELY( incremental_is_file && incremental_is_url ) ) { - FD_LOG_ERR(( "At most one of the incremental snapshot source strings in the configuration file under [tiles.replay.incremental] and [tiles.replay.incremental_url] may be set." )); - } - tile->replay.incremental_src_type = INT_MAX; - if( FD_LIKELY( incremental_is_url ) ) { - strncpy( tile->replay.incremental, config->tiles.replay.incremental_url, sizeof(tile->replay.incremental) ); - tile->replay.incremental_src_type = FD_SNAPSHOT_SRC_HTTP; - } - if( FD_UNLIKELY( incremental_is_file ) ) { - strncpy( tile->replay.incremental, config->tiles.replay.incremental, sizeof(tile->replay.incremental) ); - tile->replay.incremental_src_type = FD_SNAPSHOT_SRC_FILE; - } - - uchar snapshot_is_file, snapshot_is_url; - if( strnlen( config->tiles.replay.snapshot, PATH_MAX )>0UL ) { - snapshot_is_file = 1U; - } else { - snapshot_is_file = 0U; - } - if( strnlen( config->tiles.replay.snapshot_url, PATH_MAX )>0UL ) { - snapshot_is_url = 1U; - } else { - snapshot_is_url = 0U; - } - if( FD_UNLIKELY( snapshot_is_file && snapshot_is_url ) ) { - FD_LOG_ERR(( "At most one of the full snapshot source strings in the configuration file under [tiles.replay.snapshot] and [tiles.replay.snapshot_url] may be set." 
)); - } - tile->replay.snapshot_src_type = INT_MAX; - if( FD_LIKELY( snapshot_is_url ) ) { - strncpy( tile->replay.snapshot, config->tiles.replay.snapshot_url, sizeof(tile->replay.snapshot) ); - tile->replay.snapshot_src_type = FD_SNAPSHOT_SRC_HTTP; - } - if( FD_UNLIKELY( snapshot_is_file ) ) { - strncpy( tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(tile->replay.snapshot) ); - tile->replay.snapshot_src_type = FD_SNAPSHOT_SRC_FILE; - } -} - static void backtest_topo( config_t * config ) { fd_topo_cpus_t cpus[1]; @@ -164,75 +52,19 @@ backtest_topo( config_t * config ) { /**********************************************************************/ fd_topob_wksp( topo, "metric" ); fd_topob_wksp( topo, "metric_in" ); - fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); - if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) - FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); - metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); /**********************************************************************/ /* Add the rocksdb tile to topo */ /**********************************************************************/ fd_topob_wksp( topo, "rocksdb" ); - fd_topo_tile_t * rocksdb_tile = fd_topob_tile( topo, "arch_b", "rocksdb", "metric_in", rocksdb_cpu_idx, 0, 0 ); - rocksdb_tile->archiver.end_slot = config->tiles.archiver.end_slot; - strncpy( rocksdb_tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); - if( FD_UNLIKELY( 0==strlen( rocksdb_tile->archiver.archiver_path ) ) ) { - FD_LOG_ERR(( "Rocksdb not found, check `archiver.archiver_path` in toml" )); - } else { - FD_LOG_NOTICE(( "Found rocksdb path from config: %s", 
rocksdb_tile->archiver.archiver_path )); - } + fd_topo_tile_t * rocksdb_tile = fd_topob_tile( topo, "arch_b", "rocksdb", "metric_in", rocksdb_cpu_idx, 0, 0 ); /**********************************************************************/ /* Add the replay tile to topo */ /**********************************************************************/ fd_topob_wksp( topo, "replay" ); fd_topo_tile_t * replay_tile = fd_topob_tile( topo, "replay", "replay", "metric_in", replay_cpu_idx, 0, 0 ); - replay_tile->replay.fec_max = config->tiles.shred.max_pending_shred_sets; - replay_tile->replay.max_vote_accounts = config->firedancer.runtime.limits.max_vote_accounts; - - /* specified by [tiles.replay] */ - - strncpy( replay_tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(replay_tile->replay.blockstore_file) ); - strncpy( replay_tile->replay.blockstore_checkpt, config->firedancer.blockstore.checkpt, sizeof(replay_tile->replay.blockstore_checkpt) ); - - replay_tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; - strncpy( replay_tile->replay.capture, config->tiles.replay.capture, sizeof(replay_tile->replay.capture) ); - strncpy( replay_tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(replay_tile->replay.funk_checkpt) ); - replay_tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - replay_tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - replay_tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( replay_tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(replay_tile->replay.funk_file) ); - replay_tile->replay.plugins_enabled = config->tiles.gui.enabled; - - if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) - && !strncmp( config->tiles.replay.snapshot, "", 1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); - } - strncpy( replay_tile->replay.genesis, config->tiles.replay.genesis, 
sizeof(replay_tile->replay.genesis) ); - - setup_snapshots( config, replay_tile ); - - strncpy( replay_tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(replay_tile->replay.slots_replayed) ); - strncpy( replay_tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(replay_tile->replay.status_cache) ); - strncpy( replay_tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(replay_tile->replay.cluster_version) ); - replay_tile->replay.bank_tile_count = config->layout.bank_tile_count; - replay_tile->replay.exec_tile_count = config->firedancer.layout.exec_tile_count; - replay_tile->replay.writer_tile_cuont = config->firedancer.layout.writer_tile_count; - strncpy( replay_tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(replay_tile->replay.tower_checkpt) ); - - replay_tile->replay.enable_features_cnt = config->tiles.replay.enable_features_cnt; - for( ulong i = 0; i < replay_tile->replay.enable_features_cnt; i++ ) { - strncpy( replay_tile->replay.enable_features[i], config->tiles.replay.enable_features[i], sizeof(replay_tile->replay.enable_features[i]) ); - } - - /* not specified by [tiles.replay] */ - - strncpy( replay_tile->replay.identity_key_path, config->paths.identity_key, sizeof(replay_tile->replay.identity_key_path) ); - replay_tile->replay.ip_addr = config->net.ip_addr; - replay_tile->replay.vote = config->firedancer.consensus.vote; - strncpy( replay_tile->replay.vote_account_path, config->paths.vote_account, sizeof(replay_tile->replay.vote_account_path) ); - replay_tile->replay.full_interval = config->tiles.batch.full_interval; - replay_tile->replay.incremental_interval = config->tiles.batch.incremental_interval; /**********************************************************************/ /* Add the executor tiles to topo */ @@ -397,7 +229,7 @@ backtest_topo( config_t * config ) { } /* root_slot_obj shared by replay and rocksdb tiles */ - fd_topob_wksp( topo, "root_slot" ); + 
fd_topob_wksp( topo, "root_slot" ); fd_topo_obj_t * root_slot_obj = fd_topob_obj( topo, "fseq", "root_slot" ); fd_topob_tile_uses( topo, replay_tile, root_slot_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, rocksdb_tile, root_slot_obj, FD_SHMEM_JOIN_MODE_READ_ONLY ); @@ -427,6 +259,29 @@ backtest_topo( config_t * config ) { fd_topob_tile_uses( topo, replay_tile, constipated_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FD_TEST( fd_pod_insertf_ulong( topo->props, constipated_obj->id, "constipate" ) ); + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !strcmp( tile->name, "rocksdb" ) ) { + tile->archiver.end_slot = config->tiles.archiver.end_slot; + strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); + if( FD_UNLIKELY( 0==strlen( tile->archiver.archiver_path ) ) ) { + FD_LOG_ERR(( "Rocksdb not found, check `archiver.archiver_path` in toml" )); + } else { + FD_LOG_NOTICE(( "Found rocksdb path from config: %s", tile->archiver.archiver_path )); + } + } else if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + + /* Override */ + if( !strcmp( tile->name, "replay" ) ) { + tile->replay.enable_features_cnt = config->tiles.replay.enable_features_cnt; + for( ulong i = 0; i < tile->replay.enable_features_cnt; i++ ) { + strncpy( tile->replay.enable_features[i], config->tiles.replay.enable_features[i], sizeof(tile->replay.enable_features[i]) ); + } + } + } + /**********************************************************************/ /* Finish and print out the topo information */ /**********************************************************************/ @@ -436,7 +291,7 @@ backtest_topo( config_t * config ) { static void backtest_cmd_fn( args_t * args FD_PARAM_UNUSED, - config_t * config ) { + config_t * config ) { FD_LOG_NOTICE(( "Start to run the backtest cmd" )); backtest_topo( config ); @@ -459,13 +314,13 @@ backtest_cmd_fn( args_t * 
args FD_PARAM_UNUSED, static void backtest_cmd_perm( args_t * args FD_PARAM_UNUSED, - fd_cap_chk_t * chk FD_PARAM_UNUSED, - config_t const * config FD_PARAM_UNUSED ) {} + fd_cap_chk_t * chk FD_PARAM_UNUSED, + config_t const * config FD_PARAM_UNUSED ) {} static void backtest_cmd_args( int * pargc FD_PARAM_UNUSED, - char *** pargv FD_PARAM_UNUSED, - args_t * args FD_PARAM_UNUSED ) {} + char *** pargv FD_PARAM_UNUSED, + args_t * args FD_PARAM_UNUSED ) {} action_t fd_action_backtest = { .name = "backtest", diff --git a/src/app/firedancer-dev/commands/sim.c b/src/app/firedancer-dev/commands/sim.c index 0417dfc3fe..1b084e3968 100644 --- a/src/app/firedancer-dev/commands/sim.c +++ b/src/app/firedancer-dev/commands/sim.c @@ -17,78 +17,17 @@ a notification for the previous frag from storei_notif. */ +#include "../../firedancer/topology.h" #include "../../shared/commands/run/run.h" /* initialize_workspaces */ -#include "../../shared/fd_config.h" /* config_t */ #include "../../../disco/topo/fd_cpu_topo.h" /* fd_topo_cpus */ #include "../../../disco/topo/fd_topob.h" #include "../../../disco/topo/fd_pod_format.h" -#include "../../../flamenco/runtime/fd_runtime.h" #include "../../../flamenco/runtime/fd_txncache.h" #include /* pause */ extern fd_topo_obj_callbacks_t * CALLBACKS[]; fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); -/* setup_topo_txncache, setup_topo_runtime_pub and setup_topo_blockstore - are simply copied from fd_firedancer.c */ -static fd_topo_obj_t * -setup_topo_txncache( fd_topo_t * topo, - char const * wksp_name, - ulong max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_live_slots, "obj.%lu.max_live_slots", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, 
max_txn_per_slot, "obj.%lu.max_txn_per_slot", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_constipated_slots, "obj.%lu.max_constipated_slots", obj->id ) ); - - return obj; -} - -static fd_topo_obj_t * -setup_topo_runtime_pub( fd_topo_t * topo, - char const * wksp_name, - ulong mem_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "runtime_pub", wksp_name ); - FD_TEST( fd_pod_insertf_ulong( topo->props, mem_max, "obj.%lu.mem_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, 12UL, "obj.%lu.wksp_tag", obj->id ) ); - return obj; -} - -#include -#include "../../../flamenco/runtime/fd_blockstore.h" -static fd_topo_obj_t * -setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - ulong alloc_max ) { - fd_topo_obj_t * obj = fd_topob_obj( topo, "blockstore", wksp_name ); - - ulong seed; - FD_TEST( sizeof(ulong) == getrandom( &seed, sizeof(ulong), 0 ) ); - - FD_TEST( fd_pod_insertf_ulong( topo->props, 1UL, "obj.%lu.wksp_tag", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, seed, "obj.%lu.seed", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, shred_max, "obj.%lu.shred_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, block_max, "obj.%lu.block_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, idx_max, "obj.%lu.idx_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, txn_max, "obj.%lu.txn_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, alloc_max, "obj.%lu.alloc_max", obj->id ) ); - - /* DO NOT MODIFY LOOSE WITHOUT CHANGING HOW BLOCKSTORE ALLOCATES INTERNAL STRUCTURES */ - - ulong blockstore_footprint = fd_blockstore_footprint( shred_max, block_max, idx_max, txn_max ) + alloc_max; - FD_TEST( fd_pod_insertf_ulong( topo->props, blockstore_footprint, "obj.%lu.loose", obj->id ) ); - - return obj; -} - static void sim_topo( config_t * config ) { fd_topo_cpus_t cpus[1]; @@ -106,95 +45,24 
@@ sim_topo( config_t * config ) { static_end_idx, }; - /**********************************************************************/ - /* Add the metric tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "metric" ); fd_topob_wksp( topo, "metric_in" ); - fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); - if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) - FD_LOG_ERR(( "failed to parse prometheus listen address `%s`", config->tiles.metric.prometheus_listen_address )); - metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + fd_topob_tile( topo, "metric", "metric", "metric_in", metric_cpu_idx, 0, 0 ); - /**********************************************************************/ - /* Add the playback tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "playback" ); - fd_topo_tile_t * playback_tile = fd_topob_tile( topo, "arch_p", "playback", "metric_in", playback_cpu_idx, 0, 0 ); - strncpy( playback_tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); - if( FD_UNLIKELY( 0==strlen( playback_tile->archiver.archiver_path ) ) ) { - FD_LOG_ERR(( "Archive file not found for playback" )); - } else { - FD_LOG_NOTICE(( "Found archive file from config: %s", playback_tile->archiver.archiver_path )); - } + fd_topob_tile( topo, "arch_p", "playback", "metric_in", playback_cpu_idx, 0, 0 ); - /**********************************************************************/ - /* Add the storei tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "storei" ); fd_topo_tile_t * storei_tile = fd_topob_tile( topo, "storei", "storei", "metric_in", storei_cpu_idx, 0, 0 ); - strncpy( storei_tile->store_int.blockstore_file, 
config->firedancer.blockstore.file, sizeof(storei_tile->store_int.blockstore_file) ); - strncpy( storei_tile->store_int.blockstore_restore, config->firedancer.blockstore.restore, sizeof(storei_tile->store_int.blockstore_restore) ); - strncpy( storei_tile->store_int.identity_key_path, config->paths.identity_key, sizeof(storei_tile->store_int.identity_key_path) ); - strncpy( storei_tile->store_int.slots_pending, config->tiles.store_int.slots_pending, sizeof( storei_tile->store_int.slots_pending ) ); - strncpy( storei_tile->store_int.shred_cap_archive, config->tiles.store_int.shred_cap_archive, sizeof(storei_tile->store_int.shred_cap_archive) ); - strncpy( storei_tile->store_int.shred_cap_replay, config->tiles.store_int.shred_cap_replay, sizeof(storei_tile->store_int.shred_cap_replay) ); - storei_tile->store_int.shred_cap_end_slot = config->tiles.store_int.shred_cap_end_slot; - storei_tile->store_int.expected_shred_version = config->consensus.expected_shred_version; - /**********************************************************************/ - /* Add the replay tile to topo */ - /**********************************************************************/ fd_topob_wksp( topo, "replay" ); fd_topo_tile_t * replay_tile = fd_topob_tile( topo, "replay", "replay", "metric_in", replay_cpu_idx, 0, 0 ); - replay_tile->replay.fec_max = config->tiles.shred.max_pending_shred_sets; - replay_tile->replay.max_vote_accounts = config->firedancer.runtime.limits.max_vote_accounts; - - /* specified by [tiles.replay] */ - strncpy( replay_tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(replay_tile->replay.blockstore_file) ); - strncpy( replay_tile->replay.blockstore_checkpt, config->firedancer.blockstore.checkpt, sizeof(replay_tile->replay.blockstore_checkpt) ); - - replay_tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; - strncpy( replay_tile->replay.capture, config->tiles.replay.capture, sizeof(replay_tile->replay.capture) ); - strncpy( 
replay_tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(replay_tile->replay.funk_checkpt) ); - replay_tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - replay_tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - replay_tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( replay_tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(replay_tile->replay.funk_file) ); - replay_tile->replay.plugins_enabled = 0; - - if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) - && !strncmp( config->tiles.replay.snapshot, "", 1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); - } - strncpy( replay_tile->replay.genesis, config->tiles.replay.genesis, sizeof(replay_tile->replay.genesis) ); - - strncpy( replay_tile->replay.incremental, config->tiles.replay.incremental, sizeof(replay_tile->replay.incremental) ); - strncpy( replay_tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(replay_tile->replay.slots_replayed) ); - strncpy( replay_tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(replay_tile->replay.snapshot) ); - strncpy( replay_tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(replay_tile->replay.status_cache) ); - - strncpy( replay_tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(replay_tile->replay.cluster_version) ); - replay_tile->replay.bank_tile_count = config->layout.bank_tile_count; - replay_tile->replay.exec_tile_count = config->firedancer.layout.exec_tile_count; - strncpy( replay_tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(replay_tile->replay.tower_checkpt) ); - - /* not specified by [tiles.replay] */ - strncpy( replay_tile->replay.identity_key_path, config->paths.identity_key, sizeof(replay_tile->replay.identity_key_path) ); - replay_tile->replay.ip_addr = config->net.ip_addr; - replay_tile->replay.vote = 
config->firedancer.consensus.vote; - strncpy( replay_tile->replay.vote_account_path, config->paths.vote_account, sizeof(replay_tile->replay.vote_account_path) ); - replay_tile->replay.full_interval = config->tiles.batch.full_interval; - replay_tile->replay.incremental_interval = config->tiles.batch.incremental_interval; #define FOR(cnt) for( ulong i=0UL; ifiredancer.layout.exec_tile_count; - FOR(exec_tile_cnt) fd_topob_tile( topo, "exec", "exec", "metric_in", static_end_idx+i, 0, 0 ); + ulong exec_tile_cnt = config->firedancer.layout.exec_tile_count; + FOR(exec_tile_cnt) fd_topob_tile( topo, "exec", "exec", "metric_in", static_end_idx+i, 0, 0 ); /**********************************************************************/ /* Setup playback<->storei and storei<->replay links in topo */ @@ -307,6 +175,28 @@ sim_topo( config_t * config ) { FD_TEST( fd_pod_insertf_ulong( topo->props, exec_fseq_obj->id, "exec_fseq.%lu", i ) ); } + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !strcmp( tile->name, "arch_p" ) ) { + strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, PATH_MAX ); + if( FD_UNLIKELY( 0==strlen( tile->archiver.archiver_path ) ) ) { + FD_LOG_ERR(( "Archive file not found for playback" )); + } else { + FD_LOG_NOTICE(( "Found archive file from config: %s", tile->archiver.archiver_path )); + } + } else if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + + /* Override */ + if( !strcmp( tile->name, "replay" ) ) { + strncpy( tile->replay.incremental, config->tiles.replay.incremental, sizeof(tile->replay.incremental) ); + strncpy( tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(tile->replay.slots_replayed) ); + strncpy( tile->replay.snapshot, config->tiles.replay.snapshot, sizeof(tile->replay.snapshot) ); + strncpy( tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(tile->replay.status_cache) ); + } + 
} + /**********************************************************************/ /* Finish and print out the topo information */ /**********************************************************************/ diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 9054fc233c..31d79530b8 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -1,4 +1,4 @@ -#include "../shared/fd_config.h" +#include "topology.h" #include "../../discof/geyser/fd_replay_notif.h" #include "../../disco/net/fd_net_tile.h" @@ -8,8 +8,6 @@ #include "../../disco/topo/fd_cpu_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../flamenco/runtime/fd_blockstore.h" -#include "../../flamenco/runtime/fd_runtime.h" -#include "../../flamenco/runtime/fd_runtime_public.h" #include "../../flamenco/runtime/fd_txncache.h" #include "../../flamenco/snapshot/fd_snapshot_base.h" #include "../../util/tile/fd_tile_private.h" @@ -21,14 +19,14 @@ extern fd_topo_obj_callbacks_t * CALLBACKS[]; -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_blockstore( fd_topo_t * topo, - char const * wksp_name, - ulong shred_max, - ulong block_max, - ulong idx_max, - ulong txn_max, - ulong alloc_max ) { + char const * wksp_name, + ulong shred_max, + ulong block_max, + ulong idx_max, + ulong txn_max, + ulong alloc_max ) { fd_topo_obj_t * obj = fd_topob_obj( topo, "blockstore", wksp_name ); ulong seed; @@ -50,7 +48,7 @@ setup_topo_blockstore( fd_topo_t * topo, return obj; } -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_runtime_pub( fd_topo_t * topo, char const * wksp_name, ulong mem_max ) { @@ -60,13 +58,13 @@ setup_topo_runtime_pub( fd_topo_t * topo, return obj; } -static fd_topo_obj_t * +fd_topo_obj_t * setup_topo_txncache( fd_topo_t * topo, - char const * wksp_name, - ulong max_rooted_slots, - ulong max_live_slots, - ulong max_txn_per_slot, - ulong max_constipated_slots ) { + char const * wksp_name, + ulong max_rooted_slots, + ulong max_live_slots, + ulong 
max_txn_per_slot, + ulong max_constipated_slots ) { fd_topo_obj_t * obj = fd_topob_obj( topo, "txncache", wksp_name ); FD_TEST( fd_pod_insertf_ulong( topo->props, max_rooted_slots, "obj.%lu.max_rooted_slots", obj->id ) ); @@ -77,6 +75,29 @@ setup_topo_txncache( fd_topo_t * topo, return obj; } +fd_topo_obj_t * +setup_topo_funk( fd_topo_t * topo, + char const * wksp_name, + ulong max_account_records, + ulong max_database_transactions ) { + fd_topo_obj_t * obj = fd_topob_obj( topo, "funk", wksp_name ); + FD_TEST( fd_pod_insert_ulong( topo->props, "funk", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_account_records, "obj.%lu.rec_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_database_transactions, "obj.%lu.txn_max", obj->id ) ); + ulong funk_footprint = fd_funk_footprint( max_database_transactions, max_account_records ); + if( FD_UNLIKELY( !funk_footprint ) ) FD_LOG_ERR(( "Invalid [funk] parameters" )); + + /* Increase workspace partition count */ + ulong wksp_idx = fd_topo_find_wksp( topo, wksp_name ); + FD_TEST( wksp_idx!=ULONG_MAX ); + fd_topo_wksp_t * wksp = &topo->workspaces[ wksp_idx ]; + ulong part_max = fd_wksp_part_max_est( funk_footprint, 1U<<18U ); + if( FD_UNLIKELY( !part_max ) ) FD_LOG_ERR(( "fd_wksp_part_max_est(%lu,256KiB) failed", funk_footprint )); + wksp->part_max += part_max; + + return obj; +} + static int resolve_gossip_entrypoint( char const * host_port, fd_ip4_port_t * ip4_port ) { @@ -763,7 +784,40 @@ fd_topo_initialize( config_t * config ) { for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; + if( !fd_topo_configure_tile( tile, config ) ) { + FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + } + } + + if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); + + fd_topob_finish( topo, CALLBACKS ); + FD_TEST( blockstore_obj->id ); + + const char * status_cache = config->tiles.replay.status_cache; + if ( strlen( status_cache ) > 0 ) { + /* Make the 
status cache workspace match the parameters used to create the + checkpoint. This is a bit nonintuitive because of the way + fd_topo_create_workspace works. */ + fd_wksp_preview_t preview[1]; + int err = fd_wksp_preview( status_cache, preview ); + if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "unable to preview %s: error %d", status_cache, err )); + fd_topo_wksp_t * wksp = &topo->workspaces[ topo->objs[ txncache_obj->id ].wksp_id ]; + wksp->part_max = preview->part_max; + wksp->known_footprint = 0; + wksp->total_footprint = preview->data_max; + ulong page_sz = FD_SHMEM_GIGANTIC_PAGE_SZ; + wksp->page_sz = page_sz; + ulong footprint = fd_wksp_footprint( preview->part_max, preview->data_max ); + wksp->page_cnt = footprint / page_sz; + } + config->topo = *topo; +} + +int +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config ) { if( FD_UNLIKELY( !strcmp( tile->name, "net" ) || !strcmp( tile->name, "sock" ) ) ) { tile->net.shred_listen_port = config->tiles.shred.shred_listen_port; @@ -797,7 +851,7 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "shred" ) ) ) { strncpy( tile->shred.identity_key_path, config->paths.identity_key, sizeof(tile->shred.identity_key_path) ); - tile->shred.depth = topo->links[ tile->out_link_id[ 0 ] ].depth; + tile->shred.depth = config->topo.links[ tile->out_link_id[ 0 ] ].depth; tile->shred.fec_resolver_depth = config->tiles.shred.max_pending_shred_sets; tile->shred.expected_shred_version = config->consensus.expected_shred_version; tile->shred.shred_listen_port = config->tiles.shred.shred_listen_port; @@ -830,7 +884,6 @@ fd_topo_initialize( config_t * config ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "repair" ) ) ) { tile->repair.max_pending_shred_sets = config->tiles.shred.max_pending_shred_sets; - tile->repair.shred_tile_cnt = config->layout.shred_tile_count; tile->repair.repair_intake_listen_port = config->tiles.repair.repair_intake_listen_port; tile->repair.repair_serve_listen_port = 
config->tiles.repair.repair_serve_listen_port; strncpy( tile->repair.good_peer_cache_file, config->tiles.repair.good_peer_cache_file, sizeof(tile->repair.good_peer_cache_file) ); @@ -854,11 +907,11 @@ fd_topo_initialize( config_t * config ) { tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); - tile->replay.plugins_enabled = plugins_enabled; + tile->replay.plugins_enabled = fd_topo_find_tile( &config->topo, "plugin", 0UL ) != ULONG_MAX; if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) && !strncmp( config->tiles.replay.snapshot, "", 1 ) ) ) { - fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); + fd_cstr_printf_check( config->tiles.replay.genesis, PATH_MAX, NULL, "%s/genesis.bin", config->paths.ledger ); } strncpy( tile->replay.genesis, config->tiles.replay.genesis, sizeof(tile->replay.genesis) ); @@ -867,9 +920,6 @@ fd_topo_initialize( config_t * config ) { strncpy( tile->replay.slots_replayed, config->tiles.replay.slots_replayed, sizeof(tile->replay.slots_replayed) ); strncpy( tile->replay.status_cache, config->tiles.replay.status_cache, sizeof(tile->replay.status_cache) ); strncpy( tile->replay.cluster_version, config->tiles.replay.cluster_version, sizeof(tile->replay.cluster_version) ); - tile->replay.bank_tile_count = config->layout.bank_tile_count; - tile->replay.exec_tile_count = config->firedancer.layout.exec_tile_count; - tile->replay.writer_tile_cuont = config->firedancer.layout.writer_tile_count; strncpy( tile->replay.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(tile->replay.tower_checkpt) ); /* not specified by [tiles.replay] */ @@ -947,34 +997,9 @@ fd_topo_initialize( config_t * config ) { tile->restart.heap_mem_max = config->firedancer.runtime.heap_size_gib<<30; } else if( FD_UNLIKELY( !strcmp( 
tile->name, "arch_f" ) || !strcmp( tile->name, "arch_w" ) ) ) { - tile->archiver.enabled = config->tiles.archiver.enabled; strncpy( tile->archiver.archiver_path, config->tiles.archiver.archiver_path, sizeof(tile->archiver.archiver_path) ); } else { - FD_LOG_ERR(( "unknown tile name %lu `%s`", i, tile->name )); + return 0; } - } - - if( FD_UNLIKELY( is_auto_affinity ) ) fd_topob_auto_layout( topo, 0 ); - - fd_topob_finish( topo, CALLBACKS ); - - const char * status_cache = config->tiles.replay.status_cache; - if ( strlen( status_cache ) > 0 ) { - /* Make the status cache workspace match the parameters used to create the - checkpoint. This is a bit nonintuitive because of the way - fd_topo_create_workspace works. */ - fd_wksp_preview_t preview[1]; - int err = fd_wksp_preview( status_cache, preview ); - if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "unable to preview %s: error %d", status_cache, err )); - fd_topo_wksp_t * wksp = &topo->workspaces[ topo->objs[ txncache_obj->id ].wksp_id ]; - wksp->part_max = preview->part_max; - wksp->known_footprint = 0; - wksp->total_footprint = preview->data_max; - ulong page_sz = FD_SHMEM_GIGANTIC_PAGE_SZ; - wksp->page_sz = page_sz; - ulong footprint = fd_wksp_footprint( preview->part_max, preview->data_max ); - wksp->page_cnt = footprint / page_sz; - } - - config->topo = *topo; + return 1; } diff --git a/src/app/firedancer/topology.h b/src/app/firedancer/topology.h index f998c9b847..3f213b1eba 100644 --- a/src/app/firedancer/topology.h +++ b/src/app/firedancer/topology.h @@ -1,12 +1,49 @@ #ifndef HEADER_fd_src_app_firedancer_topology_h #define HEADER_fd_src_app_firedancer_topology_h +/* topology.h contains APIs for constructing a Firedancer topology. */ + #include "../shared/fd_config.h" FD_PROTOTYPES_BEGIN +/* fd_topo_initialize constructs a full validator config according to + the given topology. Populates config->topo. 
*/ + void -fd_topo_initialize( config_t * config ); +fd_topo_initialize( fd_config_t * config ); + +fd_topo_obj_t * +setup_topo_blockstore( fd_topo_t * topo, + char const * wksp_name, + ulong shred_max, + ulong block_max, + ulong idx_max, + ulong txn_max, + ulong alloc_max ); + +fd_topo_obj_t * +setup_topo_runtime_pub( fd_topo_t * topo, + char const * wksp_name, + ulong mem_max ); + +fd_topo_obj_t * +setup_topo_txncache( fd_topo_t * topo, + char const * wksp_name, + ulong max_rooted_slots, + ulong max_live_slots, + ulong max_txn_per_slot, + ulong max_constipated_slots ); + +fd_topo_obj_t * +setup_topo_funk( fd_topo_t * topo, + char const * wksp_name, + ulong max_account_records, + ulong max_database_transactions ); + +int +fd_topo_configure_tile( fd_topo_tile_t * tile, + fd_config_t * config ); FD_PROTOTYPES_END diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 6c91f3dbb0..2428a87b19 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -292,9 +292,6 @@ typedef struct { uint ip_addr; int vote; char vote_account_path[ PATH_MAX ]; - ulong bank_tile_count; - ulong exec_tile_count; - ulong writer_tile_cuont; ulong full_interval; ulong incremental_interval; @@ -373,7 +370,6 @@ typedef struct { int good_peer_cache_file_fd; char identity_key_path[ PATH_MAX ]; ulong max_pending_shred_sets; - uint shred_tile_cnt; } repair; struct { @@ -427,7 +423,6 @@ typedef struct { } pktgen; struct { - int enabled; ulong end_slot; char archiver_path[ PATH_MAX ]; @@ -515,10 +510,13 @@ fd_topo_workspace_align( void ) { return 4096UL; } -FD_FN_PURE static inline void * +static inline void * fd_topo_obj_laddr( fd_topo_t const * topo, ulong obj_id ) { fd_topo_obj_t const * obj = &topo->objs[ obj_id ]; + FD_TEST( obj_idid == obj_id ); + FD_TEST( obj->offset ); return (void *)((ulong)topo->workspaces[ obj->wksp_id ].wksp + obj->offset); } diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index bf994e7bf1..7d3849d7ad 100644 --- 
a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -602,7 +602,9 @@ fd_topob_finish( fd_topo_t * topo, if( FD_UNLIKELY( cb->loose ) ) loose_sz += cb->loose( topo, obj ); } - ulong part_max = 3UL + (loose_sz / (64UL << 10)); /* 3 for initial alignment + actual alloc + residual padding */ + ulong part_max = wksp->part_max; + if( !part_max ) part_max = (loose_sz / (64UL << 10)); /* alloc + residual padding */ + part_max += 3; /* for initial alignment */ ulong offset = fd_ulong_align_up( fd_wksp_private_data_off( part_max ), fd_topo_workspace_align() ); for( ulong j=0UL; jobj_cnt; j++ ) { diff --git a/src/discof/repair/fd_repair_tile.c b/src/discof/repair/fd_repair_tile.c index a9f1afb53c..418efdabc1 100644 --- a/src/discof/repair/fd_repair_tile.c +++ b/src/discof/repair/fd_repair_tile.c @@ -1339,7 +1339,7 @@ unprivileged_init( fd_topo_t * topo, } if( FD_UNLIKELY( sign_link_out_idx==UINT_MAX ) ) FD_LOG_ERR(( "Missing gossip_sign link" )); ctx->shred_tile_cnt = shred_tile_idx; - FD_TEST( ctx->shred_tile_cnt == tile->repair.shred_tile_cnt ); + FD_TEST( ctx->shred_tile_cnt == fd_topo_tile_name_cnt( topo, "shred" ) ); /* Scratch mem setup */ diff --git a/src/discof/replay/fd_replay_tile.c b/src/discof/replay/fd_replay_tile.c index 239d24df58..60fa7cf2a1 100644 --- a/src/discof/replay/fd_replay_tile.c +++ b/src/discof/replay/fd_replay_tile.c @@ -2986,7 +2986,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* Join each of the exec spads. 
*/ - ctx->exec_cnt = tile->replay.exec_tile_count; + ctx->exec_cnt = fd_topo_tile_name_cnt( topo, "exec" ); for( ulong i=0UL; iexec_cnt; i++ ) { ulong exec_spad_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "exec_spad.%lu", i ); fd_spad_t * spad = fd_spad_join( fd_topo_obj_laddr( topo, exec_spad_id ) ); @@ -3083,8 +3083,8 @@ unprivileged_init( fd_topo_t * topo, /* bank */ /**********************************************************************/ - ctx->bank_cnt = tile->replay.bank_tile_count; - for( ulong i=0UL; ireplay.bank_tile_count; i++ ) { + ctx->bank_cnt = fd_topo_tile_name_cnt( topo, "bank" ); + for( ulong i=0UL; i<(ctx->bank_cnt); i++ ) { ulong busy_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "bank_busy.%lu", i ); FD_TEST( busy_obj_id!=ULONG_MAX ); ctx->bank_busy[ i ] = fd_fseq_join( fd_topo_obj_laddr( topo, busy_obj_id ) ); @@ -3109,7 +3109,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* exec */ /**********************************************************************/ - ctx->exec_cnt = tile->replay.exec_tile_count; + ctx->exec_cnt = fd_topo_tile_name_cnt( topo, "exec" ); if( FD_UNLIKELY( ctx->exec_cnt>FD_PACK_MAX_BANK_TILES ) ) { FD_LOG_ERR(( "replay tile has too many exec tiles %lu", ctx->exec_cnt )); } @@ -3153,7 +3153,7 @@ unprivileged_init( fd_topo_t * topo, /**********************************************************************/ /* writer */ /**********************************************************************/ - ctx->writer_cnt = tile->replay.writer_tile_cuont; + ctx->writer_cnt = fd_topo_tile_name_cnt( topo, "writer" ); if( FD_UNLIKELY( ctx->writer_cnt>FD_PACK_MAX_BANK_TILES ) ) { FD_LOG_CRIT(( "replay tile has too many writer tiles %lu", ctx->writer_cnt )); } From 1ebc94da2bbc8344a21316835d854e4015a81dc5 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 10 May 2025 00:42:20 +0000 Subject: [PATCH 05/34] fdctl: switch tiles to topo-provided funk --- 
contrib/test/run_fd_shred_cap.sh | 8 ++-- contrib/test/test_firedancer_leader.sh | 8 ++-- src/app/firedancer-dev/commands/backtest.c | 9 ++++ src/app/firedancer-dev/config/default.toml | 7 +-- src/app/firedancer-dev/config/private.toml | 7 +-- src/app/firedancer-dev/config/tiny.toml | 10 ++--- src/app/firedancer-dev/main.c | 2 + src/app/firedancer/config/default.toml | 30 +++++++++++-- src/app/firedancer/main.c | 2 + src/app/firedancer/topology.c | 37 ++++++++++------ src/app/shared/fd_config.h | 10 +++-- src/app/shared/fd_config_parse.c | 8 ++-- src/app/shared/fd_obj_callbacks.c | 31 +++++++++++++ src/disco/topo/fd_topo.h | 13 +++--- src/discof/batch/fd_batch_tile.c | 27 ++---------- src/discof/exec/fd_exec_tile.c | 19 +------- src/discof/replay/fd_replay_tile.c | 43 ++----------------- src/discof/restart/fd_restart_tile.c | 18 ++------ src/discof/restart/test/restart_fd.sh | 8 ++-- src/discof/rpc/fd_rpcserv_tile.c | 33 +++----------- src/discof/writer/fd_writer_tile.c | 20 +-------- .../runtime/tests/run_ledger_backtest.sh | 7 +-- src/funk/fd_funk.c | 4 +- src/funk/fd_funk.h | 4 +- 24 files changed, 165 insertions(+), 200 deletions(-) diff --git a/contrib/test/run_fd_shred_cap.sh b/contrib/test/run_fd_shred_cap.sh index bdf888276b..82e9b0af72 100755 --- a/contrib/test/run_fd_shred_cap.sh +++ b/contrib/test/run_fd_shred_cap.sh @@ -90,16 +90,16 @@ echo " idx_max = 8192 alloc_max = 1073741824 file = \"$DATA_DIR/shredcap_testnet.blockstore\" +[funk] + max_account_records = 150000000 + heap_size_gib = 100 + max_database_transactions = 2000 [tiles] [tiles.shred] max_pending_shred_sets = 16384 [tiles.replay] snapshot = \"$SNAPSHOT\" incremental = \"$INCREMENTAL\" - funk_sz_gb = 100 - funk_rec_max = 150000000 - funk_txn_max = 2000 - funk_file = \"$DATA_DIR/shredcap_testnet.funk\" [tiles.store_int] shred_cap_replay = \"$SHREDCAP\" shred_cap_end_slot = 317018450 diff --git a/contrib/test/test_firedancer_leader.sh b/contrib/test/test_firedancer_leader.sh index 
282f354bcb..3e9771ac58 100755 --- a/contrib/test/test_firedancer_leader.sh +++ b/contrib/test/test_firedancer_leader.sh @@ -50,10 +50,6 @@ echo " [tiles.replay] capture = \"firedancer-dev.solcap\" snapshot = \"$FULL_SNAPSHOT\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 - funk_file = \"/tmp/localnet.funk\" cluster_version = \"2.0.14\" [tiles.gui] enabled = false @@ -72,6 +68,10 @@ echo " txn_max = 1024 alloc_max = 10737418240 file = \"/tmp/localnet.blockstore\" +[funk] + max_account_records = 10000000 + heap_size_gib = 32 + max_database_transactions = 1024 [log] path = \"firedancer-dev.log\" level_stderr = \"INFO\" diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index 09ed889173..3f2cdf3739 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -66,6 +66,15 @@ backtest_topo( config_t * config ) { fd_topob_wksp( topo, "replay" ); fd_topo_tile_t * replay_tile = fd_topob_tile( topo, "replay", "replay", "metric_in", replay_cpu_idx, 0, 0 ); + /* specified by [tiles.replay] */ + + fd_topob_wksp( topo, "funk" ); + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions ); + + fd_topob_tile_uses( topo, replay_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + /**********************************************************************/ /* Add the executor tiles to topo */ /**********************************************************************/ diff --git a/src/app/firedancer-dev/config/default.toml b/src/app/firedancer-dev/config/default.toml index 0b2f0608f8..dcf912e570 100644 --- a/src/app/firedancer-dev/config/default.toml +++ b/src/app/firedancer-dev/config/default.toml @@ -13,6 +13,10 @@ txn_max = 1048576 idx_max = 8192 alloc_max = 10737418240 +[funk] + heap_size_gib = 140 + max_account_records = 150000000 + max_database_transactions = 2000 [tiles] 
[tiles.shred] max_pending_shred_sets = 16384 @@ -22,9 +26,6 @@ [tiles.replay] snapshot_url = "http://${VALIDATOR_IP}:8899/snapshot.tar.bz2" incremental_url = "http://${VALIDATOR_IP}:8899/incremental-snapshot.tar.bz2" - funk_sz_gb = 140 - funk_rec_max = 150000000 - funk_txn_max = 2000 [tiles.metric] prometheus_listen_address = "0.0.0.0" prometheus_listen_port = 7999 diff --git a/src/app/firedancer-dev/config/private.toml b/src/app/firedancer-dev/config/private.toml index f161a842e1..de95d3a82d 100644 --- a/src/app/firedancer-dev/config/private.toml +++ b/src/app/firedancer-dev/config/private.toml @@ -9,6 +9,10 @@ txn_max = 1024 idx_max = 8192 alloc_max = 10737418240 +[funk] + heap_size_gib = 20 + max_account_records = 1048576 + max_database_transactions = 4096 [tiles] [tiles.shred] max_pending_shred_sets = 16384 @@ -17,9 +21,6 @@ repair_serve_listen_port = 8034 [tiles.replay] snapshot_url = "http://${VALIDATOR_IP}:8899/snapshot.tar.bz2" - funk_sz_gb = 20 - funk_txn_max = 4096 - funk_rec_max = 1048576 [tiles.metric] prometheus_listen_address = "0.0.0.0" prometheus_listen_port = 7999 diff --git a/src/app/firedancer-dev/config/tiny.toml b/src/app/firedancer-dev/config/tiny.toml index 6e086482a2..55e0ce3a91 100644 --- a/src/app/firedancer-dev/config/tiny.toml +++ b/src/app/firedancer-dev/config/tiny.toml @@ -1,6 +1,11 @@ [hugetlbfs] max_page_size = "huge" +[funk] +max_account_records = 1048576 +heap_size_gib = 20 +max_database_transactions = 1024 + [runtime] heap_size_gib = 4 @@ -17,11 +22,6 @@ writer_tile_count = 1 [tiles.restart] enabled = false -[tiles.replay] -funk_sz_gb = 20 -funk_txn_max = 1024 -funk_rec_max = 1048576 - [tiles.shred] max_pending_shred_sets = 512 diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 0ad69c3005..dbe0fd5753 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -22,6 +22,7 @@ extern fd_topo_obj_callbacks_t fd_obj_cb_runtime_pub; extern fd_topo_obj_callbacks_t fd_obj_cb_blockstore; 
extern fd_topo_obj_callbacks_t fd_obj_cb_txncache; extern fd_topo_obj_callbacks_t fd_obj_cb_exec_spad; +extern fd_topo_obj_callbacks_t fd_obj_cb_funk; fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_mcache, @@ -39,6 +40,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_blockstore, &fd_obj_cb_txncache, &fd_obj_cb_exec_spad, + &fd_obj_cb_funk, NULL, }; diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 5380b39db2..1c9bd20b93 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -293,6 +293,33 @@ user = "" snapshot_grace_period_seconds = 409 max_vote_accounts = 2000000 +# This section configures the "funk" account database. Currently, funk +# stores all Solana accounts. In future versions of Firedancer, most +# accounts will be offloaded to the "groove" database. +[funk] + # The max amount of records that the funk instance can store. + # Each Solana account uses at least one record. Additional records + # are used for account changes that are not yet finalized by + # consensus (typically takes 13 seconds). + max_account_records = 10_000_000 + + # The size of the funk heap in gibibytes. This value must be large + # enough to store all Solana accounts uncompressed. + heap_size_gib = 32 + + # The max amount of concurrent database transactions. These are + # used to track conflicting versions of accounts until such + # conflicts are resolved by the consensus algorithm. (Not to be + # confused with Solana transactions). + # The validator uses one database transaction for each Solana block + # that is not yet finalized. It is not recommended to change this + # setting. + max_database_transactions = 1024 + +# This section configures the "groove" persistent account database. +# [groove] +# ... + # CPU cores in Firedancer are carefully managed. 
Where a typical # program lets the operating system scheduler determine which threads to # run on which cores and for how long, Firedancer overrides most of this @@ -1030,9 +1057,6 @@ user = "" # snapshots and frequent validator restarts are expected. snapshot_dir = "" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = "1.18.0" # The metric tile receives metrics updates published from the rest diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index cf05828d3f..85850e335a 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -21,6 +21,7 @@ extern fd_topo_obj_callbacks_t fd_obj_cb_runtime_pub; extern fd_topo_obj_callbacks_t fd_obj_cb_blockstore; extern fd_topo_obj_callbacks_t fd_obj_cb_txncache; extern fd_topo_obj_callbacks_t fd_obj_cb_exec_spad; +extern fd_topo_obj_callbacks_t fd_obj_cb_funk; fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_mcache, @@ -38,6 +39,7 @@ fd_topo_obj_callbacks_t * CALLBACKS[] = { &fd_obj_cb_blockstore, &fd_obj_cb_txncache, &fd_obj_cb_exec_spad, + &fd_obj_cb_funk, NULL, }; diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 31d79530b8..6454147492 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -326,6 +326,7 @@ fd_topo_initialize( config_t * config ) { fd_topob_wksp( topo, "exec_spad" ); fd_topob_wksp( topo, "exec_fseq" ); fd_topob_wksp( topo, "writer_fseq" ); + fd_topob_wksp( topo, "funk" ); if( enable_rpc ) fd_topob_wksp( topo, "rpcsrv" ); @@ -460,11 +461,25 @@ fd_topo_initialize( config_t * config ) { FOR(writer_tile_cnt) fd_topob_tile( topo, "writer", "writer", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); fd_topo_tile_t * batch_tile = fd_topob_tile( topo, "batch", "batch", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); - if( enable_rstart ) /* */ fd_topob_tile( topo, "rstart", "restart", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); + fd_topo_tile_t * rstart_tile = NULL; + if( 
enable_rstart ) rstart_tile =fd_topob_tile( topo, "rstart", "restart", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); fd_topo_tile_t * rpcserv_tile = NULL; if( enable_rpc ) rpcserv_tile = fd_topob_tile( topo, "rpcsrv", "rpcsrv", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 0 ); + /* Database cache */ + + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions ); + + /* */ fd_topob_tile_uses( topo, batch_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + FOR(exec_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "exec", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + /* */ fd_topob_tile_uses( topo, replay_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + if(rstart_tile) fd_topob_tile_uses( topo, rstart_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + if(rpcserv_tile) fd_topob_tile_uses( topo, rpcserv_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + FOR(writer_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "writer", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + /* Create a shared blockstore to be used by store and replay. 
*/ fd_topo_obj_t * blockstore_obj = setup_topo_blockstore( topo, "bstore", @@ -903,10 +918,7 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, tile->replay.tx_metadata_storage = config->rpc.extended_tx_metadata_storage; strncpy( tile->replay.capture, config->tiles.replay.capture, sizeof(tile->replay.capture) ); strncpy( tile->replay.funk_checkpt, config->tiles.replay.funk_checkpt, sizeof(tile->replay.funk_checkpt) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + tile->replay.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); tile->replay.plugins_enabled = fd_topo_find_tile( &config->topo, "plugin", 0UL ) != ULONG_MAX; if( FD_UNLIKELY( !strncmp( config->tiles.replay.genesis, "", 1 ) @@ -931,6 +943,8 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, tile->replay.full_interval = config->tiles.batch.full_interval; tile->replay.incremental_interval = config->tiles.batch.incremental_interval; + FD_TEST( tile->replay.funk_obj_id == fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ) ); + } else if( FD_UNLIKELY( !strcmp( tile->name, "sign" ) ) ) { strncpy( tile->sign.identity_key_path, config->paths.identity_key, sizeof(tile->sign.identity_key_path) ); @@ -958,19 +972,16 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, strncpy( tile->eqvoc.identity_key_path, config->paths.identity_key, sizeof(tile->eqvoc.identity_key_path) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "rpcsrv" ) ) ) { strncpy( tile->replay.blockstore_file, config->firedancer.blockstore.file, sizeof(tile->replay.blockstore_file) ); - tile->replay.funk_rec_max = config->tiles.replay.funk_rec_max; - tile->replay.funk_sz_gb = config->tiles.replay.funk_sz_gb; - tile->replay.funk_txn_max = config->tiles.replay.funk_txn_max; - strncpy( 
tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); + tile->rpcserv.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); tile->rpcserv.rpc_port = config->rpc.port; tile->rpcserv.tpu_port = config->tiles.quic.regular_transaction_listen_port; tile->rpcserv.tpu_ip_addr = config->net.ip_addr; strncpy( tile->rpcserv.identity_key_path, config->paths.identity_key, sizeof(tile->rpcserv.identity_key_path) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "batch" ) ) ) { + tile->batch.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); tile->batch.full_interval = config->tiles.batch.full_interval; tile->batch.incremental_interval = config->tiles.batch.incremental_interval; strncpy( tile->batch.out_dir, config->tiles.batch.out_dir, sizeof(tile->batch.out_dir) ); - strncpy( tile->replay.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); } else if( FD_UNLIKELY( !strcmp( tile->name, "gui" ) ) ) { if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.gui.gui_listen_address, &tile->gui.listen_addr ) ) ) FD_LOG_ERR(( "failed to parse gui listen address `%s`", config->tiles.gui.gui_listen_address )); @@ -985,11 +996,11 @@ fd_topo_configure_tile( fd_topo_tile_t * tile, } else if( FD_UNLIKELY( !strcmp( tile->name, "plugin" ) ) ) { } else if( FD_UNLIKELY( !strcmp( tile->name, "exec" ) ) ) { - strncpy( tile->exec.funk_file, config->tiles.replay.funk_file, sizeof(tile->exec.funk_file) ); + tile->exec.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); } else if( FD_UNLIKELY( !strcmp( tile->name, "writer" ) ) ) { - strncpy( tile->writer.funk_file, config->tiles.replay.funk_file, sizeof(tile->writer.funk_file) ); + tile->writer.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); } else if( FD_UNLIKELY( !strcmp( tile->name, "rstart" ) ) ) { - strncpy( tile->restart.funk_file, config->tiles.replay.funk_file, sizeof(tile->replay.funk_file) ); 
+ tile->restart.funk_obj_id = fd_pod_query_ulong( config->topo.props, "funk", ULONG_MAX ); strncpy( tile->restart.tower_checkpt, config->tiles.replay.tower_checkpt, sizeof(tile->replay.tower_checkpt) ); strncpy( tile->restart.identity_key_path, config->paths.identity_key, sizeof(tile->restart.identity_key_path) ); fd_memcpy( tile->restart.genesis_hash, config->tiles.restart.genesis_hash, FD_BASE58_ENCODED_32_SZ ); diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index 575d5693a9..4003d63d3c 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -118,6 +118,12 @@ struct fd_configf { } limits; } runtime; + struct { + ulong max_account_records; + ulong heap_size_gib; + ulong max_database_transactions; + } funk; + struct { uint exec_tile_count; /* TODO: redundant ish with bank tile cnt */ uint writer_tile_count; @@ -365,10 +371,6 @@ struct fd_config { struct { char capture[ PATH_MAX ]; char funk_checkpt[ PATH_MAX ]; - uint funk_rec_max; - ulong funk_sz_gb; - ulong funk_txn_max; - char funk_file[ PATH_MAX ]; char genesis[ PATH_MAX ]; char incremental[ PATH_MAX ]; char incremental_url[ PATH_MAX ]; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index 5ae00b297a..86d2e55f53 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -331,6 +331,10 @@ fd_config_extract_podf( uchar * pod, CFG_POP ( ulong, runtime.limits.snapshot_grace_period_seconds ); CFG_POP ( ulong, runtime.limits.max_vote_accounts ); + CFG_POP ( ulong, funk.max_account_records ); + CFG_POP ( ulong, funk.heap_size_gib ); + CFG_POP ( ulong, funk.max_database_transactions ); + return config; } @@ -442,10 +446,6 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( cstr, tiles.replay.capture ); CFG_POP ( cstr, tiles.replay.funk_checkpt ); - CFG_POP ( uint, tiles.replay.funk_rec_max ); - CFG_POP ( ulong, tiles.replay.funk_sz_gb ); - CFG_POP ( ulong, tiles.replay.funk_txn_max ); - CFG_POP ( cstr, 
tiles.replay.funk_file ); CFG_POP ( cstr, tiles.replay.genesis ); CFG_POP ( cstr, tiles.replay.incremental ); CFG_POP ( cstr, tiles.replay.incremental_url ); diff --git a/src/app/shared/fd_obj_callbacks.c b/src/app/shared/fd_obj_callbacks.c index 1e3aae7e2a..009392f6e8 100644 --- a/src/app/shared/fd_obj_callbacks.c +++ b/src/app/shared/fd_obj_callbacks.c @@ -10,6 +10,7 @@ #include "../../waltz/neigh/fd_neigh4_map.h" #include "../../waltz/ip/fd_fib4.h" #include "../../disco/keyguard/fd_keyswitch.h" +#include "../../funk/fd_funk.h" #define VAL(name) (__extension__({ \ ulong __x = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.%s", obj->id, name ); \ @@ -276,6 +277,36 @@ fd_topo_obj_callbacks_t fd_obj_cb_keyswitch = { .new = keyswitch_new, }; +static ulong +funk_footprint( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; + return fd_funk_footprint( VAL("txn_max"), VAL("rec_max") ); +} + +static ulong +funk_align( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; (void)obj; + return fd_funk_align(); +} + +static void +funk_new( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; + ulong funk_seed = fd_pod_queryf_ulong( topo->props, 0UL, "obj.%lu.seed", obj->id ); + if( !funk_seed ) FD_TEST( fd_rng_secure( &funk_seed, sizeof(ulong) ) ); + FD_TEST( fd_funk_new( fd_topo_obj_laddr( topo, obj->id ), 2UL, funk_seed, VAL("txn_max"), VAL("rec_max") ) ); +} + +fd_topo_obj_callbacks_t fd_obj_cb_funk = { + .name = "funk", + .footprint = funk_footprint, + .align = funk_align, + .new = funk_new, +}; + fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 2428a87b19..bea0c3442e 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -272,12 +272,9 @@ typedef struct { ulong max_vote_accounts; int tx_metadata_storage; + ulong funk_obj_id; char capture[ PATH_MAX ]; char funk_checkpt[ PATH_MAX ]; - uint funk_rec_max; - ulong 
funk_sz_gb; - ulong funk_txn_max; - char funk_file[ PATH_MAX ]; char genesis[ PATH_MAX ]; char incremental[ PATH_MAX ]; char slots_replayed[ PATH_MAX ]; @@ -310,7 +307,7 @@ typedef struct { struct { int in_wen_restart; int tower_checkpt_fileno; - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; char tower_checkpt[ PATH_MAX ]; char identity_key_path[ PATH_MAX ]; char genesis_hash[ FD_BASE58_ENCODED_32_SZ ]; @@ -319,11 +316,11 @@ typedef struct { } restart; struct { - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; } exec; struct { - char funk_file[ PATH_MAX ]; + ulong funk_obj_id; } writer; struct { @@ -402,6 +399,7 @@ typedef struct { } eqvoc; struct { + ulong funk_obj_id; ushort rpc_port; ushort tpu_port; uint tpu_ip_addr; @@ -409,6 +407,7 @@ typedef struct { } rpcserv; struct { + ulong funk_obj_id; ulong full_interval; ulong incremental_interval; char out_dir[ PATH_MAX ]; diff --git a/src/discof/batch/fd_batch_tile.c b/src/discof/batch/fd_batch_tile.c index b43e553ab1..2bfd22a131 100644 --- a/src/discof/batch/fd_batch_tile.c +++ b/src/discof/batch/fd_batch_tile.c @@ -1,7 +1,6 @@ #include "../../disco/topo/fd_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" #include "../../flamenco/runtime/fd_hashes.h" #include "../../flamenco/runtime/fd_txncache.h" #include "../../flamenco/snapshot/fd_snapshot_create.h" @@ -23,7 +22,6 @@ struct fd_snapshot_tile_ctx { ulong full_interval; ulong incremental_interval; char const * out_dir; - char funk_file[ PATH_MAX ]; /* Shared data structures. */ fd_txncache_t * status_cache; @@ -36,9 +34,6 @@ struct fd_snapshot_tile_ctx { int full_snapshot_fd; int incremental_snapshot_fd; - /* Only join funk after tiles start spinning. */ - int is_funk_active; - /* Metadata from the full snapshot used for incremental snapshots. 
*/ ulong last_full_snap_slot; fd_hash_t last_hash; @@ -171,11 +166,9 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* We only want to join funk after it has been setup and joined in the - replay tile. - TODO: Eventually funk will be joined via a shared topology object. */ - ctx->is_funk_active = 0; - memcpy( ctx->funk_file, tile->replay.funk_file, sizeof(tile->replay.funk_file) ); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->batch.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } /**********************************************************************/ /* status cache */ @@ -428,20 +421,6 @@ after_credit( fd_snapshot_tile_ctx_t * ctx, return; } - if( FD_UNLIKELY( !ctx->is_funk_active ) ) { - /* Setting these parameters are not required because we are joining the - funk that was setup in the replay tile. */ - fd_funk_t * funk = fd_funk_open_file( - ctx->funk, ctx->funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "Failed to join a funk database" )); - } - ctx->is_funk_active = 1; - - FD_LOG_WARNING(( "Joined funk database at file=%s", ctx->funk_file )); - } - if( fd_batch_fseq_is_snapshot( batch_fseq ) ) { produce_snapshot( ctx, batch_fseq ); } else { diff --git a/src/discof/exec/fd_exec_tile.c b/src/discof/exec/fd_exec_tile.c index 3de851a001..342d156430 100644 --- a/src/discof/exec/fd_exec_tile.c +++ b/src/discof/exec/fd_exec_tile.c @@ -1,5 +1,3 @@ -#include -#define _GNU_SOURCE #include "../../disco/tiles.h" #include "generated/fd_exec_tile_seccomp.h" @@ -12,7 +10,6 @@ #include "../../flamenco/runtime/program/fd_bpf_program_util.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" struct fd_exec_tile_out_ctx { ulong idx; @@ -118,9 +115,7 @@ struct fd_exec_tile_ctx { int pending_slot_pop; int pending_epoch_pop; - /* Funk-specific setup. 
*/ fd_funk_t funk[1]; - fd_wksp_t * funk_wksp; /* Data structures related to managing and executing the transaction. The fd_txn_p_t is refreshed with every transaction and is sent @@ -642,19 +637,9 @@ unprivileged_init( fd_topo_t * topo, /* funk-specific setup */ /********************************************************************/ - /* Setting these parameters are not required because we are joining - the funk that was setup in the replay tile. */ - FD_LOG_NOTICE(( "Trying to join funk at file=%s", tile->exec.funk_file )); - fd_funk_txn_start_write( NULL ); - if( FD_UNLIKELY( !fd_funk_open_file( - ctx->funk, tile->exec.funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READONLY, NULL ) ) ) { - FD_LOG_ERR(( "fd_funk_open_file(%s) failed", tile->exec.funk_file )); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->exec.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); } - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( ctx->funk ); - - FD_LOG_NOTICE(( "Just joined funk at file=%s", tile->exec.funk_file )); //FIXME /********************************************************************/ diff --git a/src/discof/replay/fd_replay_tile.c b/src/discof/replay/fd_replay_tile.c index 60fa7cf2a1..e3d835c16a 100644 --- a/src/discof/replay/fd_replay_tile.c +++ b/src/discof/replay/fd_replay_tile.c @@ -24,7 +24,6 @@ #include "../../flamenco/rewards/fd_rewards.h" #include "../../disco/metrics/fd_metrics.h" #include "../../choreo/fd_choreo.h" -#include "../../funk/fd_funk_filemap.h" #include "../../flamenco/snapshot/fd_snapshot_create.h" #include "../../disco/plugin/fd_plugin.h" //#include "fd_replay.h" @@ -109,7 +108,6 @@ typedef struct fd_slice_exec_ctx fd_slice_exec_ctx_t; struct fd_replay_tile_ctx { fd_wksp_t * wksp; fd_wksp_t * blockstore_wksp; - fd_wksp_t * funk_wksp; fd_wksp_t * status_cache_wksp; fd_wksp_t * runtime_public_wksp; @@ -745,7 +743,7 @@ checkpt( fd_replay_tile_ctx_t * ctx ) { FD_LOG_ERR( ( "blockstore 
checkpt failed: error %d", rc ) ); } } - int rc = fd_wksp_checkpt( ctx->funk_wksp, ctx->funk_checkpt, 0666, 0, NULL ); + int rc = fd_wksp_checkpt( ctx->funk->wksp, ctx->funk_checkpt, 0666, 0, NULL ); if( rc ) { FD_LOG_ERR( ( "funk checkpt failed: error %d", rc ) ); } @@ -2757,40 +2755,6 @@ privileged_init( fd_topo_t * topo, if( FD_UNLIKELY( !ctx->runtime_public ) ) { FD_LOG_ERR(( "no runtime_public" )); } - - - /* Open Funk */ - fd_funk_txn_start_write( NULL ); - fd_funk_t * funk; - const char * snapshot = tile->replay.snapshot; - if( strcmp( snapshot, "funk" ) == 0 ) { - /* Funk database already exists. The parameters are actually mostly ignored. */ - funk = fd_funk_open_file( - ctx->funk, - tile->replay.funk_file, 1, ctx->funk_seed, tile->replay.funk_txn_max, - tile->replay.funk_rec_max, tile->replay.funk_sz_gb * (1UL<<30), - FD_FUNK_READ_WRITE, NULL ); - } else if( strncmp( snapshot, "wksp:", 5 ) == 0) { - /* Recover funk database from a checkpoint. */ - funk = fd_funk_recover_checkpoint( ctx->funk, tile->replay.funk_file, 1, snapshot+5, NULL ); - } else { - FD_LOG_NOTICE(( "Trying to create new funk at file=%s", tile->replay.funk_file )); - /* Create new funk database */ - funk = fd_funk_open_file( - ctx->funk, - tile->replay.funk_file, 1, ctx->funk_seed, tile->replay.funk_txn_max, - tile->replay.funk_rec_max, tile->replay.funk_sz_gb * (1UL<<30), - FD_FUNK_OVERWRITE, NULL ); - FD_LOG_NOTICE(( "Opened funk file at %s", tile->replay.funk_file )); - } - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "Failed to join funk database" )); - } - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( funk ); - if( FD_UNLIKELY( ctx->funk_wksp == NULL ) ) { - FD_LOG_ERR(( "no funk wksp" )); - } } static void @@ -2874,8 +2838,9 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* TODO: This below code needs to be shared as a topology object. 
This - will involve adding support to create a funk-based file here. */ + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->replay.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } ctx->is_caught_up = 0; diff --git a/src/discof/restart/fd_restart_tile.c b/src/discof/restart/fd_restart_tile.c index 5a2f00b2ab..f618180104 100644 --- a/src/discof/restart/fd_restart_tile.c +++ b/src/discof/restart/fd_restart_tile.c @@ -4,7 +4,6 @@ #include "../../disco/topo/fd_topo.h" #include "../../disco/topo/fd_pod_format.h" #include "../../disco/keyguard/fd_keyload.h" -#include "../../funk/fd_funk_filemap.h" #include "../../flamenco/runtime/fd_runtime.h" #define GOSSIP_IN_IDX (0UL) @@ -18,7 +17,6 @@ struct fd_restart_tile_ctx { fd_funk_t funk[1]; fd_epoch_bank_t epoch_bank; int is_funk_active; - char funk_file[ PATH_MAX ]; fd_spad_t * runtime_spad; int tower_checkpt_fileno; fd_pubkey_t identity, coordinator, genesis_hash; @@ -114,10 +112,10 @@ unprivileged_init( fd_topo_t * topo, /* funk */ /**********************************************************************/ - /* TODO: Same as what happens in the batch tile, eventually, funk should - be joined via a shared topology object. */ + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->restart.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } ctx->is_funk_active = 0; - memcpy( ctx->funk_file, tile->restart.funk_file, sizeof(tile->restart.funk_file) ); /**********************************************************************/ /* spad */ @@ -340,16 +338,6 @@ after_credit( fd_restart_tile_ctx_t * ctx, int * opt_poll_in FD_PARAM_UNUSED, int * charge_busy FD_PARAM_UNUSED ) { if( FD_UNLIKELY( !ctx->is_funk_active ) ) { - /* Setting these parameters are not required because we are joining the - funk that was setup in the replay tile. 
*/ - fd_funk_t * funk = fd_funk_open_file( - ctx->funk, ctx->funk_file, - 1UL, 0UL, 0UL, 0UL, 0UL, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "fd_funk_open_file failed" )); - } else { - FD_LOG_NOTICE(("Restart tile joins funk successfully")); - } ctx->is_funk_active = 1; /* Decode the slot bank from funk, referencing fd_runtime_recover_banks() in fd_runtime_init.c */ diff --git a/src/discof/restart/test/restart_fd.sh b/src/discof/restart/test/restart_fd.sh index 399c03c341..fe248b9ffb 100755 --- a/src/discof/restart/test/restart_fd.sh +++ b/src/discof/restart/test/restart_fd.sh @@ -47,12 +47,8 @@ echo " repair_serve_listen_port = 9056 [tiles.replay] snapshot = \"funk\" - funk_sz_gb = 32 - funk_rec_max = 10000000 - funk_txn_max = 1024 cluster_version = \"$CLUSTER_VERSION\" tower_checkpt = \"$TOWER_CHECKPT_FILE\" - funk_file = \"$FUNK_FILE\" [tiles.restart] in_wen_restart = true wen_restart_coordinator = \"$RESTART_COORDINATOR\" @@ -74,6 +70,10 @@ echo " idx_max = 512 alloc_max = 10737418240 file = \"$BLOCK_FILE\" +[funk] + max_account_records = 10000000 + heap_size_gib = 32 + max_database_transactions = 1024 " > wen_restart.toml sudo gdb --args build/native/gcc/bin/firedancer-dev dev --config wen_restart.toml diff --git a/src/discof/rpc/fd_rpcserv_tile.c b/src/discof/rpc/fd_rpcserv_tile.c index ea73ce0e6f..cdec7fdc99 100644 --- a/src/discof/rpc/fd_rpcserv_tile.c +++ b/src/discof/rpc/fd_rpcserv_tile.c @@ -14,7 +14,6 @@ #include "../../disco/fd_disco.h" #include "../../disco/shred/fd_stake_ci.h" #include "../../disco/topo/fd_pod_format.h" -#include "../../funk/fd_funk_filemap.h" #include "../../disco/keyguard/fd_keyload.h" #include @@ -26,9 +25,6 @@ struct fd_rpcserv_tile_ctx { fd_rpcserver_args_t args; - char funk_file[ PATH_MAX ]; - - int activated; fd_rpc_ctx_t * ctx; @@ -85,12 +81,7 @@ before_credit( fd_rpcserv_tile_ctx_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; - - if( FD_UNLIKELY( !ctx->activated ) ) { - 
*charge_busy = 0; - } else { - *charge_busy = fd_rpc_ws_poll( ctx->ctx ); - } + *charge_busy = fd_rpc_ws_poll( ctx->ctx ); } static void @@ -138,20 +129,7 @@ after_frag( fd_rpcserv_tile_ctx_t * ctx, (void)stem; if( FD_LIKELY( in_idx==REPLAY_NOTIF_IDX ) ) { - if( FD_UNLIKELY( !ctx->activated ) ) { - fd_rpcserver_args_t * args = &ctx->args; - fd_funk_t * funk = fd_funk_open_file( - args->funk, ctx->funk_file, 1, 0, 0, 0, 0, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "failed to join a funky" )); - } - - ctx->activated = 1; - fd_rpc_start_service( args, ctx->ctx ); - } - fd_rpc_replay_after_frag( ctx->ctx, &ctx->replay_notif_in_state ); - } else if( FD_UNLIKELY( in_idx==STAKE_CI_IN_IDX ) ) { fd_rpc_stake_after_frag( ctx->ctx, ctx->args.stake_ci ); @@ -189,7 +167,6 @@ privileged_init( fd_topo_t * topo, args->stake_ci = fd_stake_ci_join( fd_stake_ci_new( stake_ci_mem, ctx->identity_key ) ); - strncpy( ctx->funk_file, tile->replay.funk_file, sizeof(ctx->funk_file) ); /* Open funk after replay tile is booted */ /* Blockstore setup */ @@ -251,8 +228,6 @@ unprivileged_init( fd_topo_t * topo, FD_TEST( ( !!smem ) & ( !!fmem ) ); fd_scratch_attach( smem, fmem, FD_RPC_SCRATCH_MAX, FD_RPC_SCRATCH_DEPTH ); - ctx->activated = 0; - fd_topo_link_t * replay_notif_in_link = &topo->links[ tile->in_link_id[ REPLAY_NOTIF_IDX ] ]; ctx->replay_notif_in_mem = topo->workspaces[ topo->objs[ replay_notif_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->replay_notif_in_chunk0 = fd_dcache_compact_chunk0( ctx->replay_notif_in_mem, replay_notif_in_link->dcache ); @@ -262,6 +237,12 @@ unprivileged_init( fd_topo_t * topo, ctx->stake_ci_in_mem = topo->workspaces[ topo->objs[ stake_ci_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->stake_ci_in_chunk0 = fd_dcache_compact_chunk0( ctx->stake_ci_in_mem, stake_ci_in_link->dcache ); ctx->stake_ci_in_wmark = fd_dcache_compact_wmark ( ctx->stake_ci_in_mem, stake_ci_in_link->dcache, stake_ci_in_link->mtu ); + + fd_rpcserver_args_t * 
args = &ctx->args; + if( FD_UNLIKELY( !fd_funk_join( args->funk, fd_topo_obj_laddr( topo, tile->rpcserv.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + fd_rpc_start_service( args, ctx->ctx ); } static ulong diff --git a/src/discof/writer/fd_writer_tile.c b/src/discof/writer/fd_writer_tile.c index f2eeca4895..7e01e438be 100644 --- a/src/discof/writer/fd_writer_tile.c +++ b/src/discof/writer/fd_writer_tile.c @@ -9,7 +9,6 @@ #include "../../flamenco/runtime/fd_executor.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" struct fd_writer_tile_in_ctx { fd_wksp_t * mem; @@ -31,7 +30,6 @@ struct fd_writer_tile_ctx { /* Local join of Funk. R/W. */ fd_funk_t funk[1]; - fd_wksp_t * funk_wksp; /* Link management. */ fd_writer_tile_in_ctx_t exec_writer_in[ FD_PACK_MAX_BANK_TILES ]; @@ -339,23 +337,9 @@ unprivileged_init( fd_topo_t * topo, /* Funk */ /********************************************************************/ - FD_LOG_DEBUG(( "Trying to join funk at file=%s", tile->writer.funk_file )); - fd_funk_txn_start_write( NULL ); - int funk_join_ok = !!fd_funk_open_file( ctx->funk, - tile->writer.funk_file, - 1UL, - 0UL, - 0UL, - 0UL, - 0UL, - FD_FUNK_READ_WRITE, - NULL ); - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( ctx->funk ); - if( FD_UNLIKELY( !funk_join_ok ) ) { - FD_LOG_CRIT(( "Failed to join funk" )); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->writer.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); } - FD_LOG_DEBUG(( "Just joined funk at file=%s", tile->writer.funk_file )); /********************************************************************/ /* Setup fseq */ diff --git a/src/flamenco/runtime/tests/run_ledger_backtest.sh b/src/flamenco/runtime/tests/run_ledger_backtest.sh index 009e2011f8..732172b9c4 100755 --- a/src/flamenco/runtime/tests/run_ledger_backtest.sh +++ b/src/flamenco/runtime/tests/run_ledger_backtest.sh @@ -157,9 +157,6 @@ 
echo " archiver_path = \"$DUMP/$LEDGER/rocksdb\" [tiles.replay] snapshot = \"$SNAPSHOT\" - funk_sz_gb = $FUNK_PAGES - funk_txn_max = 1024 - funk_rec_max = $INDEX_MAX cluster_version = \"$CLUSTER_VERSION\" enable_features = [ \"$ONE_OFFS\" ] [tiles.gui] @@ -169,6 +166,10 @@ echo " block_max = 8192 txn_max = 1048576 alloc_max = 10737418240 + [funk] + heap_size_gib = $FUNK_PAGES + max_account_records = $INDEX_MAX + max_database_transactions = 1024 [consensus] vote = false [development] diff --git a/src/funk/fd_funk.c b/src/funk/fd_funk.c index 8966633899..9dd9ce51a3 100644 --- a/src/funk/fd_funk.c +++ b/src/funk/fd_funk.c @@ -132,8 +132,8 @@ fd_funk_new( void * shmem, } fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ) { +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ) { if( FD_UNLIKELY( !shfunk ) ) { FD_LOG_WARNING(( "NULL shfunk" )); return NULL; diff --git a/src/funk/fd_funk.h b/src/funk/fd_funk.h index 3a13a0ade3..140003634f 100644 --- a/src/funk/fd_funk.h +++ b/src/funk/fd_funk.h @@ -327,8 +327,8 @@ fd_funk_new( void * shmem, (joins are local to a thread group). */ fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ); +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ); /* fd_funk_leave leaves a funk join. 
Returns the memory region used for join on success (caller has ownership on return and the caller is no From 2135ce8244c56d57b6619159b12624c613b42aec Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Mon, 12 May 2025 17:48:21 +0000 Subject: [PATCH 06/34] cleaned up unzstd tile --- src/disco/topo/fd_topo.h | 36 ++ src/discof/restore/Local.mk | 4 +- src/discof/restore/fd_restore_base.h | 53 +-- src/discof/restore/fd_snapin_tile.c | 10 +- src/discof/restore/fd_unzstd_tile.c | 346 ++++-------------- src/discof/restore/stream/fd_event_map.c | 29 ++ src/discof/restore/stream/fd_event_map.h | 99 +++++ .../restore/{ => stream}/fd_frag_reader.h | 17 +- src/discof/restore/stream/fd_stream_ctx.c | 75 ++++ src/discof/restore/stream/fd_stream_ctx.h | 173 +++++++++ src/discof/restore/stream/fd_stream_metrics.h | 79 ++++ .../restore/{ => stream}/fd_stream_reader.h | 76 +++- src/discof/restore/stream/fd_stream_ticks.h | 55 +++ .../restore/{ => stream}/fd_stream_writer.c | 8 +- .../restore/{ => stream}/fd_stream_writer.h | 29 +- 15 files changed, 724 insertions(+), 365 deletions(-) create mode 100644 src/discof/restore/stream/fd_event_map.c create mode 100644 src/discof/restore/stream/fd_event_map.h rename src/discof/restore/{ => stream}/fd_frag_reader.h (90%) create mode 100644 src/discof/restore/stream/fd_stream_ctx.c create mode 100644 src/discof/restore/stream/fd_stream_ctx.h create mode 100644 src/discof/restore/stream/fd_stream_metrics.h rename src/discof/restore/{ => stream}/fd_stream_reader.h (56%) create mode 100644 src/discof/restore/stream/fd_stream_ticks.h rename src/discof/restore/{ => stream}/fd_stream_writer.c (95%) rename src/discof/restore/{ => stream}/fd_stream_writer.h (86%) diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 48508a7c23..f4b7794025 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -646,6 +646,42 @@ fd_topo_link_reliable_consumer_cnt( fd_topo_t const * topo, return cnt; } +FD_FN_PURE static inline ulong 
+fd_topo_tile_consumer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + (void)topo; + return tile->out_cnt; +} + +FD_FN_PURE static inline ulong +fd_topo_tile_reliable_consumer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + ulong reliable_cons_cnt = 0UL; + for( ulong i=0UL; i<topo->tile_cnt; i++ ) { + fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) { + for( ulong k=0UL; k<tile->out_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + reliable_cons_cnt++; + } + } + } + } + return reliable_cons_cnt; +} + +FD_FN_PURE static inline ulong +fd_topo_tile_producer_cnt( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + (void)topo; + ulong in_cnt = 0UL; + for( ulong i=0UL; i<tile->in_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + in_cnt++; + } + return in_cnt; +} + /* Join (map into the process) all shared memory (huge/gigantic pages) needed by the tile, in the given topology. All memory associated with the tile (aka. 
used by links that the tile either produces to or diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index 7937fc2bb8..ea659cac49 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -1,5 +1,7 @@ $(call add-objs,fd_filerd_tile,fd_discof) $(call add-objs,fd_snapin_tile,fd_discof) $(call add-objs,fd_actalc_tile,fd_discof) -$(call add-objs,fd_stream_writer,fd_discof) $(call add-objs,fd_unzstd_tile,fd_discof) +$(call add-objs,stream/fd_stream_writer,fd_discof) +$(call add-objs,stream/fd_event_map,fd_discof) +$(call add-objs,stream/fd_stream_ctx,fd_discof) diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index dd26a4c277..6533d98175 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -2,29 +2,8 @@ #define HEADER_fd_src_discof_restore_fd_restore_base_h #include "../../tango/mcache/fd_mcache.h" - -/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for - stream I/O. 
*/ - -union fd_stream_frag_meta { - - struct { - - ulong seq; /* frag sequence number */ - ulong goff; /* stream offset */ - - uint sz; - ushort unused; - ushort ctl; - ulong loff; /* dcache offset */ - - }; - - fd_frag_meta_t f[1]; - -}; - -typedef union fd_stream_frag_meta fd_stream_frag_meta_t; +#include "../../disco/topo/fd_topo.h" +#include "stream/fd_stream_reader.h" struct fd_stream_frag_meta_ctx { uchar const * in_buf; @@ -34,34 +13,6 @@ struct fd_stream_frag_meta_ctx { }; typedef struct fd_stream_frag_meta_ctx fd_stream_frag_meta_ctx_t; -FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); -FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); - -FD_PROTOTYPES_BEGIN - -static inline void -fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, - ulong depth, - ulong seq, - ulong goff, - ulong loff, - ulong sz, - ulong ctl ) { - fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); - FD_COMPILER_MFENCE(); - meta->seq = fd_seq_dec( seq, 1UL ); - FD_COMPILER_MFENCE(); - meta->goff = goff; - meta->sz = (uint)sz; - meta->ctl = (ushort)ctl; - meta->loff = loff; - FD_COMPILER_MFENCE(); - meta->seq = seq; - FD_COMPILER_MFENCE(); -} - -FD_PROTOTYPES_END - /* fd_account_frag_meta_t is a variation of fd_frag_meta_t optimized for accounts. 
*/ diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index a410f03284..cda85f662e 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -74,6 +74,7 @@ struct fd_snapin_tile { ulong buf_ctr; /* number of bytes allocated in buffer */ ulong buf_sz; /* target buffer size (buf_ctrbuf_ctr==0UL ) { + if( reader->pad_sz==0UL ) { ulong goff = (ulong)cur - reader->goff_translate; - ulong pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; - pad_sz = fd_ulong_min( pad_sz, (ulong)( end-cur ) ); - cur += pad_sz; + reader->pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; + ulong pad_sz_cur = fd_ulong_min( reader->pad_sz, (ulong)( end-cur ) ); + reader->pad_sz -= pad_sz_cur; + cur += pad_sz_cur; } /* Determine number of bytes to read */ diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 4a6693faec..71dbc6609b 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -1,7 +1,8 @@ #include "../../disco/topo/fd_topo.h" #include "../../ballet/zstd/fd_zstd.h" -#include "fd_stream_writer.h" -#include "fd_stream_reader.h" +#include "fd_restore_base.h" +#include "stream/fd_stream_ctx.h" +#include "stream/fd_stream_writer.h" #include #include @@ -58,32 +59,31 @@ during_housekeeping( fd_unzstd_tile_t * ctx ) { (void)ctx; } -static void -metrics_write( fd_unzstd_tile_t * ctx ) { - (void)ctx; -} - static int -on_stream_frag( fd_unzstd_tile_t * ctx, +on_stream_frag( void * _ctx, fd_stream_reader_t * reader FD_PARAM_UNUSED, fd_stream_frag_meta_t const * frag, ulong * sz ) { - uchar const * chunk0 = ctx->in_state.in_buf + frag->loff; - uchar const * chunk_start = chunk0 + ctx->in_state.in_skip; - uchar const * chunk_end = chunk0 + frag->sz; + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + uchar const * chunk0 = ctx->in_state.in_buf + frag->loff; + uchar const * chunk_start = chunk0 + ctx->in_state.in_skip; + uchar const * chunk_end = chunk0 + 
frag->sz; + uchar const * cur = chunk_start; + ulong total_decompressed = 0UL; + int consume_frag = 0; - ulong total_decompressed = 0UL; - uint dirty = 0; - int consume_frag = 1; for(;;) { - uchar const * prev_chunk_start = chunk_start; + uchar const * prev = cur; - if( !dirty && chunk_start==chunk_end ) { + if( cur==chunk_end ) { + /* Done with frag */ fd_stream_writer_publish( ctx->writer, total_decompressed ); ctx->in_state.in_skip = 0UL; + consume_frag = 1; break; } + /* get write pointers into dcache buffer */ uchar * buf_write_start = fd_stream_writer_get_write_ptr( ctx->writer ); uchar * out = buf_write_start; ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); @@ -92,33 +92,34 @@ on_stream_frag( fd_unzstd_tile_t * ctx, if( dst_max==0 ) { /* we are blocked by downstream */ fd_stream_writer_publish( ctx->writer, total_decompressed ); - // FD_LOG_WARNING(("we are blocked by downstream! consumed %lu bytes frag size is %u", ctx->in_state.in_skip, frag->sz)); - consume_frag=0; break; } - int zstd_err = fd_zstd_dstream_read( ctx->dstream, &chunk_start, chunk_end, &out, out_end, NULL ); + + /* fd_zstd_dstream_read updates chunk_start and out */ + int zstd_err = fd_zstd_dstream_read( ctx->dstream, &cur, chunk_end, &out, out_end, NULL ); if( FD_UNLIKELY( zstd_err>0) ) { - FD_LOG_WARNING(( "fd_zstd_dstream_read failed" )); - consume_frag=0; + FD_LOG_ERR(( "fd_zstd_dstream_read failed" )); break; } - ulong decompress_sz = (ulong)out - (ulong)buf_write_start; - total_decompressed += decompress_sz; - ctx->in_state.in_skip += (ulong)chunk_start - (ulong)prev_chunk_start; - dirty = (out==out_end); + /* accumulate decompressed bytes */ + ulong decompress_sz = (ulong)out - (ulong)buf_write_start; + total_decompressed += decompress_sz; + + /* accumulate consumed bytes */ + ulong consumed_sz = (ulong)cur - (ulong)prev; + ctx->in_state.in_skip += consumed_sz; fd_stream_writer_advance( ctx->writer, decompress_sz ); } - *sz = frag->sz; + *sz = (ulong)cur - 
(ulong)chunk_start; return consume_frag; } static void fd_unzstd_in_update( fd_stream_reader_t * in ) { - // FD_LOG_WARNING(("unzstd: in fseq is %lu", (ulong)in->base.fseq)); FD_COMPILER_MFENCE(); FD_VOLATILE( in->base.fseq[0] ) = in->base.seq; FD_VOLATILE( in->base.fseq[1] ) = in->goff; @@ -147,286 +148,83 @@ fd_unzstd_shutdown( void ) { for(;;) pause(); } -/* ?? */ +static void +fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, + ulong const volatile * shutdown_signal ) { + ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); + if( FD_UNLIKELY( in_seq_max == stream_ctx->in[ 0 ].base.seq && in_seq_max != 0) ) { + FD_LOG_WARNING(( "zstd shutting down! in_seq_max is %lu in[0].base.seq is %lu", + in_seq_max, stream_ctx->in[0].base.seq)); + fd_unzstd_shutdown(); + } +} + __attribute__((noinline)) static void fd_unzstd_run1( fd_unzstd_tile_t * ctx, - ulong in_cnt, - fd_stream_reader_t * in, /* [in_cnt] */ - ulong out_cnt FD_PARAM_UNUSED, - fd_stream_frag_meta_t ** out_mcache_arr FD_PARAM_UNUSED, - ulong cons_cnt, - ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ - ulong * cons_out FD_PARAM_UNUSED, /* [cons_cnt] */ - ulong ** cons_fseq FD_PARAM_UNUSED, /* [cons_cnt] */ - ulong volatile ** restrict cons_slow FD_PARAM_UNUSED, /* [cons_cnt] */ - ulong * restrict cons_seq FD_PARAM_UNUSED, /* [cons_cnt] */ - long lazy, - fd_rng_t * rng ) { - - /* in frag stream state */ - ulong in_seq; - - /* housekeeping state */ - ulong event_cnt; - ulong event_seq; - ulong async_min; /* min number of ticks between a housekeeping event */ - - /* performance metrics */ - ulong metric_in_backp; - ulong metric_backp_cnt; - ulong metric_regime_ticks[9]; - - metric_in_backp = 1UL; - metric_backp_cnt = 0UL; - memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + fd_stream_ctx_t * stream_ctx ) { - /* in frag stream init */ - - in_seq = 0UL; - - /* out frag stream init */ - - ulong const burst_byte = 512UL; /* don't producing frags smaller than this */ - ulong 
const burst_frag = 2UL; + FD_LOG_INFO(( "Running unzstd tile" )); + /* run loop init */ + ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 3; fd_stream_writer_init_flow_control_credits( ctx->writer ); + fd_stream_ctx_init_run_loop( stream_ctx ); - /* housekeeping init */ - - //if( lazy<=0L ) lazy = fd_tempo_lazy_default( out_depth ); - lazy = 1e3L; - FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); - ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( in[0].base.mcache->f ) + 3; - - /* Initial event sequence */ - - event_cnt = in_cnt + 1UL + cons_cnt; - event_seq = 0UL; - event_map[ event_seq++ ] = (ushort)cons_cnt; - for( ulong in_idx=0UL; in_idxticks ) ) ) { + ulong event_idx = fd_event_map_get_event( stream_ctx->event_map ); - /* Do housekeeping at a low rate in the background */ - ulong housekeeping_ticks = 0UL; - if( FD_UNLIKELY( (now-then)>=0L ) ) { - ulong event_idx = (ulong)event_map[ event_seq ]; - - if( FD_LIKELY( event_idxcons_cnt ) ) { /* receive credits */ ulong cons_idx = event_idx; - + /* Receive flow control credits from this out. */ fd_stream_writer_receive_flow_control_credits( ctx->writer, cons_idx ); - ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); - if( FD_UNLIKELY( in_seq_max == in[ 0 ].base.seq && in_seq_max != 0) ) { - FD_LOG_WARNING(("zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", in_seq_max, in[0].base.seq)); - fd_unzstd_shutdown(); - } + fd_unzstd_poll_shutdown( stream_ctx, shutdown_signal ); - } else if( event_idx>cons_cnt) { - ulong in_idx = event_idx - cons_cnt - 1UL; - fd_unzstd_in_update( &in[ in_idx ] ); + } else if( event_idx>stream_ctx->cons_cnt) { /* send credits */ + ulong in_idx = event_idx - stream_ctx->cons_cnt - 1UL; + fd_unzstd_in_update( &stream_ctx->in[ in_idx ] ); } else { /* event_idx==cons_cnt, housekeeping event */ /* Update metrics counters to external viewers */ - FD_COMPILER_MFENCE(); - FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); - FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); - FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); - FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); - metrics_write( ctx ); - FD_COMPILER_MFENCE(); - metric_backp_cnt = 0UL; - - /* Receive flow control credits */ + fd_stream_metrics_update_external( stream_ctx->metrics, + stream_ctx->ticks->now, + NULL, + ctx ); + /* Recalculate flow control credits */ ulong slowest_cons = ULONG_MAX; - fd_stream_writer_update_flow_control_credits( ctx->writer, &slowest_cons ); - - if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { - FD_COMPILER_MFENCE(); - (*cons_slow[ slowest_cons ]) += metric_in_backp; - FD_COMPILER_MFENCE(); - } - + fd_stream_writer_update_flow_control_credits( ctx->writer, + &slowest_cons ); + fd_stream_ctx_update_cons_slow( stream_ctx, + slowest_cons ); during_housekeeping( ctx ); } - - /* Select which event to do next (randomized round robin) and - reload the housekeeping timer. 
*/ - - event_seq++; - if( FD_UNLIKELY( event_seq>=event_cnt ) ) { - event_seq = 0UL; - // ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); - // ushort map_tmp = event_map[ swap_idx ]; - // event_map[ swap_idx ] = event_map[ 0 ]; - // event_map[ 0 ] = map_tmp; - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - long next = fd_tickcount(); - housekeeping_ticks = (ulong)(next - now); - now = next; - } - - /* Check if we are backpressured. */ - - if( FD_UNLIKELY( ctx->writer->cr_byte_availwriter->cr_frag_availwriter->cr_byte_avail=in_cnt ) in_seq = 0UL; /* cmov */ - - /* Check if this in has any new fragments to mux */ - - fd_frag_reader_consume_ctx_t consume_ctx; - long diff = fd_stream_reader_poll_frag( this_in, in_seq, &consume_ctx ); - if( FD_UNLIKELY( diff ) ) { - ulong * housekeeping_regime = &metric_regime_ticks[0]; - ulong * prefrag_regime = &metric_regime_ticks[3]; - ulong * finish_regime = &metric_regime_ticks[6]; - if( FD_UNLIKELY( diff<0L ) ) { - housekeeping_regime = &metric_regime_ticks[1]; - prefrag_regime = &metric_regime_ticks[4]; - finish_regime = &metric_regime_ticks[7]; - - fd_stream_reader_process_overrun( this_in, &consume_ctx, diff ); - } - - /* Don't bother with spin as polling multiple locations */ - *housekeeping_regime += housekeeping_ticks; - *prefrag_regime += prefrag_ticks; - long next = fd_tickcount(); - *finish_regime += (ulong)(next - now); - now = next; - continue; + fd_stream_ctx_housekeeping_advance( stream_ctx ); } - FD_COMPILER_MFENCE(); - ulong sz = 0U; - int consumed_frag = on_stream_frag( ctx, this_in, fd_type_pun_const( consume_ctx.mline ), &sz ); - - if( FD_LIKELY( consumed_frag ) ) { - // FD_LOG_WARNING(("consuming frag with sz: %lu", sz)); - fd_stream_reader_consume_frag( this_in, &consume_ctx, sz ); + /* Check if we are backpressured, otherwise poll */ + if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { + 
fd_stream_ctx_process_backpressure( stream_ctx ); + } else { + fd_stream_ctx_poll( stream_ctx, ctx, on_stream_frag ); } - - metric_regime_ticks[1] += housekeeping_ticks; - metric_regime_ticks[4] += prefrag_ticks; - long next = fd_tickcount(); - metric_regime_ticks[7] += (ulong)(next - now); - now = next; } } static void fd_unzstd_run( fd_topo_t * topo, fd_topo_tile_t * tile ) { - fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; - ulong * in_fseq [ LINK_IN_MAX ]; - fd_memset(in_mcache, 0, sizeof(fd_stream_frag_meta_t *)*LINK_IN_MAX); - fd_memset(in_fseq, 0, sizeof(ulong *)*LINK_IN_MAX ); - - ulong polled_in_cnt = 0UL; - for( ulong i=0UL; iin_cnt; i++ ) { - if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; - - in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); - FD_TEST( in_mcache[ polled_in_cnt ] ); - in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; - FD_TEST( in_fseq[ polled_in_cnt ] ); - polled_in_cnt += 1; - } - FD_TEST( polled_in_cnt<=LINK_IN_MAX ); - - fd_stream_frag_meta_t * out_mcache[ tile->out_cnt ]; - for( ulong i=0UL; iout_cnt; i++ ) { - out_mcache[ i ] = fd_type_pun( topo->links[ tile->out_link_id[ i ] ].mcache ); - FD_TEST( out_mcache[ i ] ); - } - - ulong reliable_cons_cnt = 0UL; - ulong cons_out[ FD_TOPO_MAX_LINKS ]; - ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; - for( ulong i=0UL; itile_cnt; i++ ) { - fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; - for( ulong j=0UL; jin_cnt; j++ ) { - for( ulong k=0UL; kout_cnt; k++ ) { - if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { - cons_out[ reliable_cons_cnt ] = k; - cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; - FD_TEST( cons_fseq[ reliable_cons_cnt ] ); - reliable_cons_cnt++; - FD_TEST( reliable_cons_cnttile_obj_id ); - FD_LOG_WARNING(("reliable_cons_count is %lu", reliable_cons_cnt)); - ushort event_map[ 1+reliable_cons_cnt+polled_in_cnt ]; - ulong volatile * 
cons_slow[ reliable_cons_cnt ]; - ulong cons_seq [ 2*reliable_cons_cnt+1 ]; - - FD_LOG_WARNING(("event map is located at %lx", (ulong)event_map)); - FD_LOG_WARNING(("cons fseq is located at %lx", (ulong)cons_fseq)); - FD_LOG_WARNING(("cons seq is located at %lx", (ulong)cons_seq)); + ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong cons_cnt = fd_topo_tile_reliable_consumer_cnt( topo, tile ); + void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, cons_cnt ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, cons_cnt ); fd_unzstd_run1( ctx, - polled_in_cnt, - polled_in, - reliable_cons_cnt, - out_mcache, - reliable_cons_cnt, - event_map, - cons_out, - cons_fseq, - cons_slow, - cons_seq, - (ulong)10e3, - rng ); + stream_ctx ); } #ifndef FD_TILE_TEST diff --git a/src/discof/restore/stream/fd_event_map.c b/src/discof/restore/stream/fd_event_map.c new file mode 100644 index 0000000000..8d02c60df5 --- /dev/null +++ b/src/discof/restore/stream/fd_event_map.c @@ -0,0 +1,29 @@ +#include "fd_event_map.h" + +fd_event_map_t * +fd_event_map_new( void * mem, + ulong in_cnt, + ulong cons_cnt ) { + if( FD_UNLIKELY( !mem ) ) { + FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_event_map_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_event_map_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); + + ulong event_cnt = 1UL + in_cnt + cons_cnt; + self->event_map = FD_SCRATCH_ALLOC_APPEND( l, alignof(ushort), sizeof(ushort)*event_cnt ); + self->event_cnt = event_cnt; + self->event_seq = 0UL; + + /* init event map */ + fd_event_map_init(self, in_cnt, cons_cnt ); + + return self; +} diff --git a/src/discof/restore/stream/fd_event_map.h b/src/discof/restore/stream/fd_event_map.h new file mode 100644 index 0000000000..eaa89b9739 --- /dev/null 
+++ b/src/discof/restore/stream/fd_event_map.h @@ -0,0 +1,99 @@ +#ifndef HEADER_fd_src_discof_restore_fd_event_map_h +#define HEADER_fd_src_discof_restore_fd_event_map_h + +#include "../../../util/fd_util_base.h" +#include "../../../util/bits/fd_bits.h" +#include "../../../util/rng/fd_rng.h" +#include "fd_stream_reader.h" + +struct fd_event_map { + ulong event_cnt; + ulong event_seq; + ushort * event_map; +}; +typedef struct fd_event_map fd_event_map_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_CONST static inline ulong +fd_event_map_align( void ) { + return alignof(fd_event_map_t); +} + +FD_FN_CONST static inline ulong +fd_event_map_footprint( ulong in_cnt, + ulong cons_cnt ) { + ulong event_cnt = 1UL + in_cnt + cons_cnt; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND(l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); + l = FD_LAYOUT_APPEND(l, alignof(ushort), sizeof(ushort)*event_cnt ); + return FD_LAYOUT_FINI( l, fd_event_map_align() ); +} + +fd_event_map_t * +fd_event_map_new( void * mem, + ulong in_cnt, + ulong cons_cnt ); + +static inline void +fd_event_map_init( fd_event_map_t * map, + ulong in_cnt, + ulong cons_cnt ) { + ulong idx = 0UL; + map->event_map[ idx++ ] = (ushort)cons_cnt; + for( ulong in_idx=0UL; in_idxevent_map[ idx++ ] = (ushort)(in_idx+cons_cnt+1UL); + for( ulong cons_idx=0UL; cons_idxevent_map[ idx++ ] = (ushort)cons_idx; +} + +static inline ushort +fd_event_map_get_event( fd_event_map_t * map ) { + return map->event_map[ map->event_seq ]; +} + +static inline void +fd_event_map_randomize( fd_event_map_t * map, + fd_rng_t * rng ) { + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)map->event_cnt ); + ushort map_tmp = map->event_map[ swap_idx ]; + map->event_map[ swap_idx ] = map->event_map[ 0 ]; + map->event_map[ 0 ] = map_tmp; +} + +static inline void +fd_event_map_randomize_inputs( void ** in, + ulong in_cnt, + fd_rng_t * rng ) { + if( FD_LIKELY( in_cnt>1UL ) ) { + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + void * 
in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } +} + +static inline void +fd_event_map_advance( fd_event_map_t * map, + fd_rng_t * rng, + void ** in, + ulong in_cnt ) { + map->event_seq++; + if( FD_UNLIKELY( map->event_seq>=map->event_cnt) ) { + map->event_seq = 0UL; + + fd_event_map_randomize( map, rng ); + + fd_event_map_randomize_inputs( in, in_cnt, rng ); + } +} + +static inline void * +fd_event_map_delete( fd_event_map_t * map ) { + fd_memset(map, 0, sizeof(fd_event_map_t) ); + return (void *)map; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_fd_event_map_h */ diff --git a/src/discof/restore/fd_frag_reader.h b/src/discof/restore/stream/fd_frag_reader.h similarity index 90% rename from src/discof/restore/fd_frag_reader.h rename to src/discof/restore/stream/fd_frag_reader.h index 9254fec21c..22b89f5640 100644 --- a/src/discof/restore/fd_frag_reader.h +++ b/src/discof/restore/stream/fd_frag_reader.h @@ -1,8 +1,8 @@ -#ifndef HEADER_fd_src_discof_restore_fd_frag_reader_h -#define HEADER_fd_src_discof_restore_fd_frag_reader_h +#ifndef HEADER_fd_src_discof_restore_stream_fd_frag_reader_h +#define HEADER_fd_src_discof_restore_stream_fd_frag_reader_h -#include "../../disco/stem/fd_stem.h" -#include "../../disco/metrics/fd_metrics.h" +#include "../../../disco/stem/fd_stem.h" +#include "../../../disco/metrics/fd_metrics.h" struct __attribute__((aligned(64))) fd_frag_reader { fd_frag_meta_t const * mcache; /* local join to this in's mcache */ @@ -99,10 +99,7 @@ fd_frag_reader_process_overrun( fd_frag_reader_t * reader, static inline void fd_frag_reader_consume_frag( fd_frag_reader_t * reader, - fd_frag_reader_consume_ctx_t * ctx, - ulong frag_sz ) { - reader->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)frag_sz; - + fd_frag_reader_consume_ctx_t * ctx ) { /* check for overrun: when sequence number has changed */ ulong seq_test = fd_frag_meta_seq_query( ctx->mline ); if( FD_UNLIKELY( fd_seq_ne( seq_test, 
ctx->seq_found ) ) ) { @@ -117,11 +114,11 @@ fd_frag_reader_consume_frag( fd_frag_reader_t * reader, } static inline void * -fd_frag_reader_destroy( fd_frag_reader_t * reader ) { +fd_frag_reader_delete( fd_frag_reader_t * reader ) { fd_memset( reader, 0, sizeof(fd_frag_reader_t) ); return (void *)reader; } FD_PROTOTYPES_END -#endif /* HEADER_fd_src_discof_restore_fd_frag_reader_h */ \ No newline at end of file +#endif /* HEADER_fd_src_discof_restore_stream_fd_frag_reader_h */ diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c new file mode 100644 index 0000000000..82586929f5 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -0,0 +1,75 @@ +#include "fd_stream_ctx.h" + +void +fd_stream_ctx_init( fd_stream_ctx_t * ctx, + fd_topo_t * topo, + fd_topo_tile_t * tile ) { + /* init in */ + ulong in_idx = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + fd_stream_reader_init( &ctx->in[ in_idx ], + fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ), + tile->in_link_fseq[ i ], + in_idx ); + in_idx++; + } + + /* init in_ptrs */ + for( ulong i=0UL; iin_cnt; i++ ) { + ctx->in_ptrs[ i ] = &ctx->in[ i ]; + } + + /* init cons_fseq */ + ulong cons_idx = 0UL; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + ctx->cons_fseq[ cons_idx ] = consumer_tile->in_link_fseq[ j ]; + } + } + } + } + + fd_stream_ticks_init( ctx->ticks, ctx->event_map->event_cnt, 1e3L ); + fd_stream_metrics_init( ctx->metrics ); + FD_TEST( fd_rng_join( fd_rng_new( ctx->rng, 0, 0UL ) ) ); +} + +fd_stream_ctx_t * +fd_stream_ctx_new( void * mem, + fd_topo_t * topo, + fd_topo_tile_t * tile, + ulong in_cnt, + ulong cons_cnt ) { + if( FD_UNLIKELY( !mem ) ) { + 
FD_LOG_WARNING(( "NULL mem" )); + return NULL; + } + + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_ctx_scratch_align() ) ) ) { + FD_LOG_WARNING(( "unaligned mem" )); + return NULL; + } + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_stream_ctx_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + + self->in = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); + self->in_ptrs = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); + self->cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) ); + self->cons_slow = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); + void * event_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, cons_cnt ) ); + + self->in_cnt = in_cnt; + self->cons_cnt = cons_cnt; + + self->event_map = fd_event_map_new( event_map_mem, in_cnt, cons_cnt ); + fd_stream_ctx_init( self, topo, tile ); + self->in_seq = 0UL; + + return self; +} diff --git a/src/discof/restore/stream/fd_stream_ctx.h b/src/discof/restore/stream/fd_stream_ctx.h new file mode 100644 index 0000000000..a8bf3522b6 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ctx.h @@ -0,0 +1,173 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h + +#include "../../../disco/topo/fd_topo.h" +#include "fd_stream_reader.h" +#include "fd_event_map.h" +#include "fd_stream_ticks.h" +#include "fd_stream_metrics.h" + +struct fd_stream_ctx { + fd_stream_reader_t * in; + fd_stream_reader_t ** in_ptrs; + ulong ** cons_fseq; + ulong ** cons_slow; + fd_event_map_t * event_map; + ulong in_cnt; + ulong cons_cnt; + ulong in_seq; + fd_rng_t rng[1]; + fd_stream_ticks_t ticks[1]; + fd_stream_metrics_t metrics[1]; +}; +typedef struct fd_stream_ctx fd_stream_ctx_t; + +FD_PROTOTYPES_BEGIN + 
+FD_FN_PURE static inline ulong +fd_stream_ctx_scratch_align( void ) { + return FD_STEM_SCRATCH_ALIGN; +} + +FD_FN_PURE static inline ulong +fd_stream_ctx_scratch_footprint( ulong in_cnt, + ulong cons_cnt ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); /* in */ + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); /* in_ptrs */ + l = FD_LAYOUT_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) ); /* cons_fseq */ + l = FD_LAYOUT_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); /* cons_slow */ + l = FD_LAYOUT_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, cons_cnt ) ); /* event_map */ + return FD_LAYOUT_FINI( l, fd_stream_ctx_scratch_align() ); +} + +fd_stream_ctx_t * +fd_stream_ctx_new( void * mem, + fd_topo_t * topo, + fd_topo_tile_t * tile, + ulong in_cnt, + ulong cons_cnt ); + +void +fd_stream_ctx_init( fd_stream_ctx_t * ctx, + fd_topo_t * topo, + fd_topo_tile_t * tile ); + +static inline void +fd_stream_ctx_update_cons_slow( fd_stream_ctx_t * ctx, + ulong slowest_cons ) { +if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*ctx->cons_slow[ slowest_cons ]) += ctx->metrics->in_backp; + FD_COMPILER_MFENCE(); + } +} + +static inline void +fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx ) { + FD_MGAUGE_SET( TILE, STATUS, 1UL ); + fd_stream_ticks_init_timer( ctx->ticks ); +} + +static inline void +fd_stream_ctx_housekeeping_advance( fd_stream_ctx_t * ctx ) { + /* Select which event to do next (randomized round robin) and + reload the housekeeping timer. 
*/ + fd_event_map_advance( ctx->event_map, + ctx->rng, + (void **)ctx->in_ptrs, + ctx->in_cnt ); + + /* Reload housekeeping timer */ + fd_stream_ticks_reload_housekeeping( ctx->ticks, + ctx->rng); +} + +static inline void +fd_stream_ctx_process_backpressure( fd_stream_ctx_t * ctx ) { + fd_stream_metrics_update_backpressure( ctx->metrics, + ctx->ticks->housekeeping_ticks ); + fd_stream_ticks_reload_backpressure( ctx->ticks ); +} + +typedef int fd_on_stream_frag_fn_t( void * ctx, + fd_stream_reader_t * reader, + fd_stream_frag_meta_t const * frag, + ulong * sz ); + +static inline void +fd_stream_ctx_poll( fd_stream_ctx_t * stream_ctx, + void * ctx, + fd_on_stream_frag_fn_t * on_stream_frag ) { + stream_ctx->metrics->in_backp = 0UL; + stream_ctx->ticks->prefrag_ticks = 0UL; + + /* select input to poll */ + fd_stream_reader_t * this_in = &stream_ctx->in[ stream_ctx->in_seq ]; + stream_ctx->in_seq++; + if( stream_ctx->in_seq>=stream_ctx->in_cnt ) { + stream_ctx->in_seq = 0UL; /* cmov */ + } + + fd_frag_reader_consume_ctx_t consume_ctx; + long diff = fd_stream_reader_poll_frag( this_in, + stream_ctx->in_seq, + &consume_ctx ); + + if( FD_UNLIKELY( diff<0L ) ) { + fd_stream_metrics_update_poll( stream_ctx->metrics, + stream_ctx->ticks->housekeeping_ticks, + stream_ctx->ticks->prefrag_ticks, + &stream_ctx->ticks->now); + + fd_stream_reader_process_overrun( this_in, + &consume_ctx, + diff ); + } + else if ( FD_UNLIKELY( diff ) ) { + fd_stream_metrics_update_poll_idle( stream_ctx->metrics, + stream_ctx->ticks->housekeeping_ticks, + stream_ctx->ticks->prefrag_ticks, + &stream_ctx->ticks->now ); + } + else { + FD_COMPILER_MFENCE(); + ulong sz = 0U; + fd_stream_frag_meta_t const * frag = fd_type_pun_const( consume_ctx.mline ); + int consumed_frag = on_stream_frag( ctx, this_in, frag, &sz ); + + fd_stream_reader_consume_bytes( this_in, sz ); + + if( FD_LIKELY( consumed_frag ) ) { + fd_stream_reader_consume_frag( this_in, + &consume_ctx ); + } + + fd_stream_metrics_update_poll( 
stream_ctx->metrics, + stream_ctx->ticks->housekeeping_ticks, + stream_ctx->ticks->prefrag_ticks, + &stream_ctx->ticks->now ); + } +} + +static inline void * +fd_stream_ctx_delete( fd_stream_ctx_t * ctx ) { + for( ulong i=0UL; iin_cnt; i++ ) { + fd_stream_reader_delete( &ctx->in[ i ] ); + ctx->in_ptrs[ i ] = NULL; + } + + for( ulong i=0UL; icons_cnt; i++ ) { + ctx->cons_fseq[ i ] = NULL; + ctx->cons_slow[ i ] = NULL; + } + + fd_event_map_delete( ctx->event_map ); + fd_memset(ctx, 0, sizeof(fd_stream_ctx_t) ); + return (void *)ctx; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_ctx_h */ diff --git a/src/discof/restore/stream/fd_stream_metrics.h b/src/discof/restore/stream/fd_stream_metrics.h new file mode 100644 index 0000000000..031e5500e2 --- /dev/null +++ b/src/discof/restore/stream/fd_stream_metrics.h @@ -0,0 +1,79 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h + +#include "../../../util/fd_util_base.h" +#include "../../../disco/metrics/fd_metrics.h" + +struct fd_stream_metrics { + ulong in_backp; + ulong backp_cnt; + ulong regime_ticks[9]; +}; +typedef struct fd_stream_metrics fd_stream_metrics_t; + +typedef void fd_metrics_write_fn_t( void * ctx ); + +FD_PROTOTYPES_BEGIN + +static inline void +fd_stream_metrics_init( fd_stream_metrics_t * metrics ) { + metrics->in_backp = 1UL; + metrics->backp_cnt = 0UL; + fd_memset( metrics->regime_ticks, 0, sizeof(metrics->regime_ticks) ); +} + +static inline void +fd_stream_metrics_update_external( fd_stream_metrics_t * metrics, + long now, + fd_metrics_write_fn_t * metrics_write, + void * ctx ) { + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metrics->in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metrics->backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metrics->regime_ticks ); + + if( metrics_write ) { + 
metrics_write( ctx ); + } + + FD_COMPILER_MFENCE(); + metrics->backp_cnt = 0UL; +} + +static inline void +fd_stream_metrics_update_backpressure( fd_stream_metrics_t * metrics, + ulong housekeeping_ticks ) { + metrics->backp_cnt += (ulong)!metrics->in_backp; + metrics->in_backp = 1UL; + FD_SPIN_PAUSE(); + metrics->regime_ticks[2] += housekeeping_ticks; +} + +static inline void +fd_stream_metrics_update_poll( fd_stream_metrics_t * metrics, + ulong housekeeping_ticks, + ulong prefrag_ticks, + long * now) { + metrics->regime_ticks[1] += housekeeping_ticks; + metrics->regime_ticks[4] += prefrag_ticks; + long next = fd_tickcount(); + metrics->regime_ticks[7] += (ulong)(next - *now); + *now = next; +} + +static inline void +fd_stream_metrics_update_poll_idle( fd_stream_metrics_t * metrics, + ulong housekeeping_ticks, + ulong prefrag_ticks, + long * now) { + metrics->regime_ticks[0] += housekeeping_ticks; + metrics->regime_ticks[3] += prefrag_ticks; + long next = fd_tickcount(); + metrics->regime_ticks[6] += (ulong)(next - *now); + *now = next; +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h */ diff --git a/src/discof/restore/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h similarity index 56% rename from src/discof/restore/fd_stream_reader.h rename to src/discof/restore/stream/fd_stream_reader.h index 75ee8f09f8..e562f1f6a3 100644 --- a/src/discof/restore/fd_stream_reader.h +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -1,9 +1,34 @@ -#ifndef HEADER_fd_src_discof_restore_fd_stream_reader_h -#define HEADER_fd_src_discof_restore_fd_stream_reader_h +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_reader_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_reader_h -#include "fd_restore_base.h" #include "fd_frag_reader.h" +/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for + stream I/O. 
*/ + +union fd_stream_frag_meta { + +struct { + + ulong seq; /* frag sequence number */ + ulong goff; /* stream offset */ + + uint sz; + ushort unused; + ushort ctl; + ulong loff; /* dcache offset */ + +}; + +fd_frag_meta_t f[1]; + +}; + +typedef union fd_stream_frag_meta fd_stream_frag_meta_t; + +FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); + struct fd_stream_reader { union { struct { @@ -24,6 +49,27 @@ typedef struct fd_stream_reader fd_stream_reader_t; FD_PROTOTYPES_BEGIN +static inline void +fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, + ulong depth, + ulong seq, + ulong goff, + ulong loff, + ulong sz, + ulong ctl ) { + fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + meta->goff = goff; + meta->sz = (uint)sz; + meta->ctl = (ushort)ctl; + meta->loff = loff; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + FD_FN_CONST static inline ulong fd_stream_reader_align( void ) { return alignof(fd_stream_reader_t); @@ -75,26 +121,30 @@ fd_stream_reader_poll_frag( fd_stream_reader_t * reader, static inline void fd_stream_reader_process_overrun( fd_stream_reader_t * reader, - fd_frag_reader_consume_ctx_t * ctx, - long seq_diff ) { + fd_frag_reader_consume_ctx_t * ctx, + long seq_diff ) { fd_frag_reader_process_overrun( reader->base.r, ctx, seq_diff ); } +static inline void +fd_stream_reader_consume_bytes( fd_stream_reader_t * reader, + ulong bytes ) { + reader->goff += bytes; + reader->base.accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)bytes; +} + static inline void fd_stream_reader_consume_frag( fd_stream_reader_t * reader, - fd_frag_reader_consume_ctx_t * ctx, - ulong frag_sz ) { - reader->goff += frag_sz; - fd_frag_reader_consume_frag( reader->base.r, ctx, frag_sz ); + fd_frag_reader_consume_ctx_t * ctx ) { + 
fd_frag_reader_consume_frag( reader->base.r, ctx ); } static inline void * -fd_stream_reader_destroy( fd_stream_reader_t * reader ) { - fd_frag_reader_destroy( reader->base.r ); - reader->goff = 0UL; +fd_stream_reader_delete( fd_stream_reader_t * reader ) { + fd_frag_reader_delete( reader->base.r ); return (void *)reader; } FD_PROTOTYPES_END -#endif /* HEADER_fd_src_discof_restore_fd_stream_reader_h */ \ No newline at end of file +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_reader_h */ diff --git a/src/discof/restore/stream/fd_stream_ticks.h b/src/discof/restore/stream/fd_stream_ticks.h new file mode 100644 index 0000000000..9db346cfac --- /dev/null +++ b/src/discof/restore/stream/fd_stream_ticks.h @@ -0,0 +1,55 @@ +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h + +#include "../../../util/fd_util_base.h" +#include "../../../tango/tempo/fd_tempo.h" + +struct fd_stream_ticks { + ulong housekeeping_ticks; + ulong prefrag_ticks; + ulong async_min; + long lazy; + long now; + long then; +}; +typedef struct fd_stream_ticks fd_stream_ticks_t; + +static inline void +fd_stream_ticks_init( fd_stream_ticks_t * ticks, + ulong event_cnt, + long lazy ) { + fd_memset( ticks, 0, sizeof(fd_stream_ticks_t) ); + ticks->lazy = lazy; + ticks->async_min = fd_tempo_async_min( ticks->lazy, + event_cnt, + (float)fd_tempo_tick_per_ns( NULL ) ); + if( FD_UNLIKELY( !ticks->async_min ) ) FD_LOG_ERR(( "bad lazy %lu %lu", (ulong)ticks->lazy, event_cnt )); +} + +static inline void +fd_stream_ticks_init_timer( fd_stream_ticks_t * ticks ) { + ticks->then = fd_tickcount(); + ticks->now = ticks->then; +} + +static inline int +fd_stream_ticks_is_housekeeping_time( fd_stream_ticks_t * ticks ) { + ticks->housekeeping_ticks = 0UL; + return (ticks->now - ticks->then) >= 0L; +} + +static inline void +fd_stream_ticks_reload_housekeeping( fd_stream_ticks_t * ticks, fd_rng_t * rng ) { + ticks->then = ticks->now + 
(long)fd_tempo_async_reload( rng, ticks->async_min ); + long next = fd_tickcount(); + ticks->housekeeping_ticks = (ulong)(next - ticks->now); + ticks->now = next; +} + +static inline void +fd_stream_ticks_reload_backpressure( fd_stream_ticks_t * ticks ) { + long next = fd_tickcount(); + ticks->now = next; +} + +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h */ diff --git a/src/discof/restore/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c similarity index 95% rename from src/discof/restore/fd_stream_writer.c rename to src/discof/restore/stream/fd_stream_writer.c index 0dbfbf3d76..bd7076ac92 100644 --- a/src/discof/restore/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -1,7 +1,6 @@ #include "fd_stream_writer.h" -#include "../../util/log/fd_log.h" -#include "../../util/wksp/fd_wksp.h" -#include "../../tango/dcache/fd_dcache.h" +#include "../../../util/log/fd_log.h" +#include "../../../tango/dcache/fd_dcache.h" fd_stream_writer_t * fd_stream_writer_new( void * mem, @@ -36,7 +35,8 @@ fd_stream_writer_new( void * mem, self->goff = 0UL; self->read_max = read_max; self->stream_off = 0UL; - self->out_seq = 0UL; + self->goff_start = 0UL; + self->out_seq = 0UL; /* Set up flow control state */ self->cr_byte_avail = 0UL; diff --git a/src/discof/restore/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h similarity index 86% rename from src/discof/restore/fd_stream_writer.h rename to src/discof/restore/stream/fd_stream_writer.h index 8fb0238760..679317b41f 100644 --- a/src/discof/restore/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -1,9 +1,9 @@ -#ifndef HEADER_fd_src_discof_restore_fd_stream_writer_h -#define HEADER_fd_src_discof_restore_fd_stream_writer_h +#ifndef HEADER_fd_src_discof_restore_stream_fd_stream_writer_h +#define HEADER_fd_src_discof_restore_stream_fd_stream_writer_h -#include "../../util/fd_util_base.h" -#include "fd_restore_base.h" -#include 
"../../disco/topo/fd_topo.h" +#include "../../../util/fd_util_base.h" +#include "../../../disco/topo/fd_topo.h" +#include "fd_stream_reader.h" /* A shared stream has a single producer and multiple consumers. fd_stream_writer implements the producer APIs of the shared stream */ @@ -18,6 +18,7 @@ struct fd_stream_writer { ulong goff; /* global offset into byte stream */ ulong read_max; /* max chunk size */ ulong stream_off; /* start of published stream */ + ulong goff_start; /* start of goff in stream */ ulong out_seq; /* current sequence number */ /* flow control */ @@ -120,7 +121,7 @@ fd_stream_writer_publish( fd_stream_writer_t * writer, fd_mcache_publish_stream( writer->out_mcache, fd_mcache_depth( writer->out_mcache->f ), writer->out_seq, - writer->goff, + writer->goff_start, loff, frag_sz, 0 ); @@ -132,7 +133,10 @@ fd_stream_writer_publish( fd_stream_writer_t * writer, writer->buf_off = 0UL; } + /* update stream_off and goff_start to current values + of buf_off and goff */ writer->stream_off = writer->buf_off; + writer->goff_start = writer->goff; } static inline void @@ -143,8 +147,17 @@ fd_stream_writer_advance( fd_stream_writer_t * writer, writer->cr_byte_avail -= sz; } -/* TODO: destroy / free */ +static inline int +fd_stream_writer_is_backpressured( fd_stream_writer_t * writer ) { + return writer->cr_byte_availburst_byte || writer->cr_frag_availburst_frag; +} + +static inline void * +fd_stream_writer_delete( fd_stream_writer_t * writer ) { + fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); + return (void *)writer; +} FD_PROTOTYPES_END -#endif /* HEADER_fd_src_discof_restore_fd_stream_writer_h */ \ No newline at end of file +#endif /* HEADER_fd_src_discof_restore_stream_fd_stream_writer_h */ From e3a11b4b65e2ffa2f3990bd9d03d8c9654a747c3 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 19:27:19 +0000 Subject: [PATCH 07/34] revert unrelated changes --- src/discof/rpc/fd_rpcserv_tile.c | 33 ++- src/discof/writer/fd_writer_tile.c | 20 +- 
src/funk/Local.mk | 4 +- src/funk/fd_funk.c | 4 +- src/funk/fd_funk.h | 4 +- src/funk/fd_funk_filemap.c | 449 +++++++++++++++++++++++++++++ src/funk/fd_funk_filemap.h | 68 +++++ src/util/archive/fd_tar.h | 24 +- 8 files changed, 579 insertions(+), 27 deletions(-) create mode 100644 src/funk/fd_funk_filemap.c create mode 100644 src/funk/fd_funk_filemap.h diff --git a/src/discof/rpc/fd_rpcserv_tile.c b/src/discof/rpc/fd_rpcserv_tile.c index cdec7fdc99..ea73ce0e6f 100644 --- a/src/discof/rpc/fd_rpcserv_tile.c +++ b/src/discof/rpc/fd_rpcserv_tile.c @@ -14,6 +14,7 @@ #include "../../disco/fd_disco.h" #include "../../disco/shred/fd_stake_ci.h" #include "../../disco/topo/fd_pod_format.h" +#include "../../funk/fd_funk_filemap.h" #include "../../disco/keyguard/fd_keyload.h" #include @@ -25,6 +26,9 @@ struct fd_rpcserv_tile_ctx { fd_rpcserver_args_t args; + char funk_file[ PATH_MAX ]; + + int activated; fd_rpc_ctx_t * ctx; @@ -81,7 +85,12 @@ before_credit( fd_rpcserv_tile_ctx_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; - *charge_busy = fd_rpc_ws_poll( ctx->ctx ); + + if( FD_UNLIKELY( !ctx->activated ) ) { + *charge_busy = 0; + } else { + *charge_busy = fd_rpc_ws_poll( ctx->ctx ); + } } static void @@ -129,7 +138,20 @@ after_frag( fd_rpcserv_tile_ctx_t * ctx, (void)stem; if( FD_LIKELY( in_idx==REPLAY_NOTIF_IDX ) ) { + if( FD_UNLIKELY( !ctx->activated ) ) { + fd_rpcserver_args_t * args = &ctx->args; + fd_funk_t * funk = fd_funk_open_file( + args->funk, ctx->funk_file, 1, 0, 0, 0, 0, FD_FUNK_READ_WRITE, NULL ); + if( FD_UNLIKELY( !funk ) ) { + FD_LOG_ERR(( "failed to join a funky" )); + } + + ctx->activated = 1; + fd_rpc_start_service( args, ctx->ctx ); + } + fd_rpc_replay_after_frag( ctx->ctx, &ctx->replay_notif_in_state ); + } else if( FD_UNLIKELY( in_idx==STAKE_CI_IN_IDX ) ) { fd_rpc_stake_after_frag( ctx->ctx, ctx->args.stake_ci ); @@ -167,6 +189,7 @@ privileged_init( fd_topo_t * topo, args->stake_ci = fd_stake_ci_join( fd_stake_ci_new( 
stake_ci_mem, ctx->identity_key ) ); + strncpy( ctx->funk_file, tile->replay.funk_file, sizeof(ctx->funk_file) ); /* Open funk after replay tile is booted */ /* Blockstore setup */ @@ -228,6 +251,8 @@ unprivileged_init( fd_topo_t * topo, FD_TEST( ( !!smem ) & ( !!fmem ) ); fd_scratch_attach( smem, fmem, FD_RPC_SCRATCH_MAX, FD_RPC_SCRATCH_DEPTH ); + ctx->activated = 0; + fd_topo_link_t * replay_notif_in_link = &topo->links[ tile->in_link_id[ REPLAY_NOTIF_IDX ] ]; ctx->replay_notif_in_mem = topo->workspaces[ topo->objs[ replay_notif_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->replay_notif_in_chunk0 = fd_dcache_compact_chunk0( ctx->replay_notif_in_mem, replay_notif_in_link->dcache ); @@ -237,12 +262,6 @@ unprivileged_init( fd_topo_t * topo, ctx->stake_ci_in_mem = topo->workspaces[ topo->objs[ stake_ci_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->stake_ci_in_chunk0 = fd_dcache_compact_chunk0( ctx->stake_ci_in_mem, stake_ci_in_link->dcache ); ctx->stake_ci_in_wmark = fd_dcache_compact_wmark ( ctx->stake_ci_in_mem, stake_ci_in_link->dcache, stake_ci_in_link->mtu ); - - fd_rpcserver_args_t * args = &ctx->args; - if( FD_UNLIKELY( !fd_funk_join( args->funk, fd_topo_obj_laddr( topo, tile->rpcserv.funk_obj_id ) ) ) ) { - FD_LOG_ERR(( "Failed to join database cache" )); - } - fd_rpc_start_service( args, ctx->ctx ); } static ulong diff --git a/src/discof/writer/fd_writer_tile.c b/src/discof/writer/fd_writer_tile.c index 7e01e438be..f2eeca4895 100644 --- a/src/discof/writer/fd_writer_tile.c +++ b/src/discof/writer/fd_writer_tile.c @@ -9,6 +9,7 @@ #include "../../flamenco/runtime/fd_executor.h" #include "../../funk/fd_funk.h" +#include "../../funk/fd_funk_filemap.h" struct fd_writer_tile_in_ctx { fd_wksp_t * mem; @@ -30,6 +31,7 @@ struct fd_writer_tile_ctx { /* Local join of Funk. R/W. */ fd_funk_t funk[1]; + fd_wksp_t * funk_wksp; /* Link management. 
*/ fd_writer_tile_in_ctx_t exec_writer_in[ FD_PACK_MAX_BANK_TILES ]; @@ -337,9 +339,23 @@ unprivileged_init( fd_topo_t * topo, /* Funk */ /********************************************************************/ - if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->writer.funk_obj_id ) ) ) ) { - FD_LOG_ERR(( "Failed to join database cache" )); + FD_LOG_DEBUG(( "Trying to join funk at file=%s", tile->writer.funk_file )); + fd_funk_txn_start_write( NULL ); + int funk_join_ok = !!fd_funk_open_file( ctx->funk, + tile->writer.funk_file, + 1UL, + 0UL, + 0UL, + 0UL, + 0UL, + FD_FUNK_READ_WRITE, + NULL ); + fd_funk_txn_end_write( NULL ); + ctx->funk_wksp = fd_funk_wksp( ctx->funk ); + if( FD_UNLIKELY( !funk_join_ok ) ) { + FD_LOG_CRIT(( "Failed to join funk" )); } + FD_LOG_DEBUG(( "Just joined funk at file=%s", tile->writer.funk_file )); /********************************************************************/ /* Setup fseq */ diff --git a/src/funk/Local.mk b/src/funk/Local.mk index 747184304c..d92ae23425 100644 --- a/src/funk/Local.mk +++ b/src/funk/Local.mk @@ -1,6 +1,6 @@ ifdef FD_HAS_ATOMIC -$(call add-hdrs,fd_funk_base.h fd_funk_txn.h fd_funk_rec.h fd_funk_val.h fd_funk.h) -$(call add-objs,fd_funk_base fd_funk_txn fd_funk_rec fd_funk_val fd_funk,fd_funk) +$(call add-hdrs,fd_funk_base.h fd_funk_txn.h fd_funk_rec.h fd_funk_val.h fd_funk_filemap.h fd_funk.h) +$(call add-objs,fd_funk_base fd_funk_txn fd_funk_rec fd_funk_val fd_funk_filemap fd_funk,fd_funk) $(call make-unit-test,test_funk_base,test_funk_base,fd_funk fd_util) $(call run-unit-test,test_funk_base,) $(call make-unit-test,test_funk,test_funk,fd_funk fd_util) diff --git a/src/funk/fd_funk.c b/src/funk/fd_funk.c index 9dd9ce51a3..8966633899 100644 --- a/src/funk/fd_funk.c +++ b/src/funk/fd_funk.c @@ -132,8 +132,8 @@ fd_funk_new( void * shmem, } fd_funk_t * -fd_funk_join( fd_funk_t * ljoin, - void * shfunk ) { +fd_funk_join( void * ljoin, + void * shfunk ) { if( FD_UNLIKELY( !shfunk ) ) { 
FD_LOG_WARNING(( "NULL shfunk" )); return NULL; diff --git a/src/funk/fd_funk.h b/src/funk/fd_funk.h index 140003634f..3a13a0ade3 100644 --- a/src/funk/fd_funk.h +++ b/src/funk/fd_funk.h @@ -327,8 +327,8 @@ fd_funk_new( void * shmem, (joins are local to a thread group). */ fd_funk_t * -fd_funk_join( fd_funk_t * ljoin, - void * shfunk ); +fd_funk_join( void * ljoin, + void * shfunk ); /* fd_funk_leave leaves a funk join. Returns the memory region used for join on success (caller has ownership on return and the caller is no diff --git a/src/funk/fd_funk_filemap.c b/src/funk/fd_funk_filemap.c new file mode 100644 index 0000000000..1ff9405e9e --- /dev/null +++ b/src/funk/fd_funk_filemap.c @@ -0,0 +1,449 @@ +#define _GNU_SOURCE +#define _FILE_OFFSET_BITS 64 +#include "fd_funk_filemap.h" +#include +#include +#include +#include +#include +#include + +#define PAGESIZE (1UL<<12) /* 4 KiB */ + +fd_funk_t * +fd_funk_open_file( void * ljoin, + const char * filename, + ulong wksp_tag, + ulong seed, + ulong txn_max, + ulong rec_max, + ulong total_sz, + fd_funk_file_mode_t mode, + fd_funk_close_file_args_t * close_args_out ) { + + /* See if we already have the file open */ + + if( mode == FD_FUNK_READONLY || mode == FD_FUNK_READ_WRITE ) { + fd_shmem_join_info_t info; + if( !fd_shmem_join_query_by_name("funk", &info) ) { + void * shmem = info.join; + fd_wksp_t * wksp = fd_wksp_join( shmem ); + if( FD_UNLIKELY( !wksp ) ) { + FD_LOG_WARNING(( "fd_wksp_join(%p) failed", shmem )); + return NULL; + } + + fd_wksp_tag_query_info_t info2; + if( FD_UNLIKELY( !fd_wksp_tag_query( wksp, &wksp_tag, 1, &info2, 1 ) ) ) { + FD_LOG_WARNING(( "%s does not contain a funk database", filename )); + return NULL; + } + + void * funk_shmem = fd_wksp_laddr_fast( wksp, info2.gaddr_lo ); + fd_funk_t * funk = fd_funk_join( ljoin, funk_shmem ); + if( FD_UNLIKELY( funk == NULL ) ) { + FD_LOG_WARNING(( "Failed to join funk database at %s:0x%lx", fd_wksp_name( wksp ), info2.gaddr_lo )); + return NULL; + } + + 
if( FD_UNLIKELY( close_args_out != NULL ) ) { + close_args_out->shmem = shmem; + close_args_out->fd = -1; + close_args_out->total_sz = 0; + } + return funk; + } + } + + /* Open the file */ + + int open_flags, can_resize, can_create, do_new; + switch( mode ) { + case FD_FUNK_READONLY: + if( filename == NULL || filename[0] == '\0' ) { + FD_LOG_WARNING(( "mode FD_FUNK_READONLY can not be used with an anonymous workspace, funk file required" )); + return NULL; + } + open_flags = O_RDWR; /* We mark the memory as read-only after we are done setting up */ + can_create = 0; + can_resize = 0; + do_new = 0; + break; + case FD_FUNK_READ_WRITE: + if( filename == NULL || filename[0] == '\0' ) { + FD_LOG_WARNING(( "mode FD_FUNK_READ_WRITE can not be used with an anonymous workspace, funk file required" )); + return NULL; + } + open_flags = O_RDWR; + can_create = 0; + can_resize = 0; + do_new = 0; + break; + case FD_FUNK_CREATE: + open_flags = O_CREAT|O_RDWR; + can_create = 1; + can_resize = 0; + do_new = 0; + break; + case FD_FUNK_OVERWRITE: + open_flags = O_CREAT|O_RDWR; + can_create = 1; + can_resize = 1; + do_new = 1; + break; + case FD_FUNK_CREATE_EXCL: + open_flags = O_CREAT|O_EXCL|O_RDWR; + can_create = 1; + can_resize = 1; + do_new = 1; + break; + default: + FD_LOG_WARNING(( "invalid mode when opening %s", filename )); + return NULL; + } + + int fd; + if( FD_UNLIKELY( filename == NULL || filename[0] == '\0' ) ) { + fd = -1; /* Anonymous */ + do_new = 1; + } else { + + /* Open the file */ + FD_LOG_DEBUG(( "opening %s", filename )); + fd = open( filename, open_flags, S_IRUSR|S_IWUSR ); + if( FD_UNLIKELY( fd < 0 ) ) { + FD_LOG_WARNING(( "error opening %s: %s", filename, strerror(errno) )); + return NULL; + } + + /* Resize the file */ + + struct stat statbuf; + int r = fstat( fd, &statbuf ); + if( FD_UNLIKELY( r < 0 ) ) { + FD_LOG_WARNING(( "error opening %s: %s", filename, strerror(errno) )); + close( fd ); + return NULL; + } + if( (can_create && statbuf.st_size == 0) || + 
(can_resize && statbuf.st_size != (off_t)total_sz) ) { + FD_LOG_DEBUG(( "resizing %s to %lu", filename, total_sz )); + if( FD_UNLIKELY( ftruncate( fd, (off_t)total_sz ) < 0 ) ) { + FD_LOG_WARNING(( "error resizing %s: %s", filename, strerror(errno) )); + close( fd ); + return NULL; + } + do_new = 1; + } else { + total_sz = (ulong)statbuf.st_size; + } + } + + if( FD_UNLIKELY( total_sz & (PAGESIZE-1) ) ) { + FD_LOG_WARNING(( "file size must be a multiple of a %lu", PAGESIZE )); + close( fd ); + return NULL; + } + + /* Force all the disk blocks to be physically allocated to avoid major faults in the future */ + + if( do_new & (fd != -1) ) { + FD_LOG_DEBUG(( "zeroing %s", (filename ? filename : "(NULL)") )); + uchar zeros[4<<20]; + memset( zeros, 0, sizeof(zeros) ); + for( ulong i = 0; i < total_sz; ) { + ulong sz = fd_ulong_min( sizeof(zeros), total_sz - i ); + if( FD_UNLIKELY( pwrite( fd, zeros, sz, (__off_t)i ) < (ssize_t)sz ) ) { + FD_LOG_WARNING(( "error zeroing %s: %s", (filename ? filename : "(NULL)"), strerror(errno) )); + close( fd ); + return NULL; + } + sync_file_range( fd, (__off64_t)i, (__off64_t)sz, SYNC_FILE_RANGE_WRITE ); + i += sz; + } + } + + /* Create the memory map */ + + FD_LOG_DEBUG(( "mapping %s", (filename ? filename : "(NULL)") )); + void * shmem = mmap( NULL, total_sz, (PROT_READ|PROT_WRITE), + (fd == -1 ? (MAP_ANONYMOUS|MAP_PRIVATE) : MAP_SHARED), fd, 0 ); + if( FD_UNLIKELY ( shmem == MAP_FAILED ) ) { + FD_LOG_WARNING(( "error mapping %s: %s", (filename ? 
filename : "(NULL)"), strerror(errno) )); + close( fd ); + return NULL; + } + + if( do_new ) { + + /* Create the data structures */ + + ulong part_max = fd_wksp_part_max_est( total_sz, 1U<<18U ); + if( FD_UNLIKELY( !part_max ) ) { + FD_LOG_WARNING(( "fd_wksp_part_max_est(%lu,64KiB) failed", total_sz )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + ulong data_max = fd_wksp_data_max_est( total_sz, part_max ); + if( FD_UNLIKELY( !data_max ) ) { + FD_LOG_WARNING(( "part_max (%lu) too large for footprint %lu", part_max, total_sz )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + FD_LOG_DEBUG(( "creating workspace in %s", (filename ? filename : "(NULL)") )); + void * shwksp = fd_wksp_new( shmem, "funk", (uint)seed, part_max, data_max ); + if( FD_UNLIKELY( !shwksp ) ) { + FD_LOG_WARNING(( "fd_wksp_new(%p,\"%s\",%lu,%lu,%lu) failed", shmem, "funk", seed, part_max, data_max )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + fd_wksp_t * wksp = fd_wksp_join( shwksp ); + if( FD_UNLIKELY( !wksp ) ) { + FD_LOG_WARNING(( "fd_wksp_join(%p) failed", shwksp )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + ulong page_sz = PAGESIZE; + ulong page_cnt = total_sz/PAGESIZE; + int join_err = fd_shmem_join_anonymous( "funk", FD_SHMEM_JOIN_MODE_READ_WRITE, wksp, shmem, page_sz, page_cnt ); + if( join_err ) { + FD_LOG_WARNING(( "fd_shmem_join_anonymous failed" )); + } + + FD_LOG_DEBUG(( "creating funk in %s", (filename ? 
filename : "(NULL)") )); + void * funk_shmem = fd_wksp_alloc_laddr( wksp, fd_funk_align(), fd_funk_footprint( txn_max, rec_max ), wksp_tag ); + if( FD_UNLIKELY(funk_shmem == NULL ) ) { + FD_LOG_WARNING(( "failed to allocate a funky" )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + fd_funk_t * funk = fd_funk_join( ljoin, fd_funk_new( funk_shmem, wksp_tag, seed, txn_max, rec_max ) ); + if( FD_UNLIKELY( funk == NULL ) ) { + FD_LOG_WARNING(( "failed to allocate a funky" )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + FD_LOG_NOTICE(( "opened funk size %f GB, backing file %s", ((double)total_sz)/((double)(1LU<<30)), (filename ? filename : "(NULL)") )); + + if( FD_UNLIKELY( close_args_out != NULL ) ) { + close_args_out->shmem = shmem; + close_args_out->fd = fd; + close_args_out->total_sz = total_sz; + } + return funk; + + } else { + + /* Join the data existing structures */ + + fd_wksp_t * wksp = fd_wksp_join( shmem ); + if( FD_UNLIKELY( !wksp ) ) { + FD_LOG_WARNING(( "fd_wksp_join(%p) failed", shmem )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + ulong page_sz = PAGESIZE; + ulong page_cnt = total_sz/PAGESIZE; + int join_err = fd_shmem_join_anonymous( "funk", FD_SHMEM_JOIN_MODE_READ_WRITE, wksp, shmem, page_sz, page_cnt ); + if( FD_UNLIKELY( join_err ) ) { + FD_LOG_WARNING(( "fd_shmem_join_anonymous failed" )); + } + + fd_wksp_tag_query_info_t info; + if( FD_UNLIKELY( !fd_wksp_tag_query( wksp, &wksp_tag, 1, &info, 1 ) ) ) { + FD_LOG_WARNING(( "%s does not contain a funky", filename )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + void * funk_shmem = fd_wksp_laddr_fast( wksp, info.gaddr_lo ); + fd_funk_t * funk = fd_funk_join( ljoin, funk_shmem ); + if( FD_UNLIKELY( funk == NULL ) ) { + FD_LOG_WARNING(( "failed to join a funky" )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + if( mode == FD_FUNK_READONLY ) { + if( FD_UNLIKELY( mprotect( shmem, total_sz, PROT_READ 
) ) ) { + FD_LOG_WARNING(( "mprotect failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } + } + + FD_LOG_NOTICE(( "opened funk size %f GB, backing file %s", ((double)total_sz)/((double)(1LU<<30)), (filename ? filename : "(NULL)") )); + + if( FD_UNLIKELY( close_args_out != NULL ) ) { + close_args_out->shmem = shmem; + close_args_out->fd = fd; + close_args_out->total_sz = total_sz; + } + return funk; + } +} + +fd_funk_t * +fd_funk_recover_checkpoint( void * ljoin, + const char * funk_filename, + ulong wksp_tag, + const char * checkpt_filename, + fd_funk_close_file_args_t * close_args_out ) { + /* Make the funk workspace match the parameters used to create the + checkpoint. */ + + fd_wksp_preview_t preview[1]; + int err = fd_wksp_preview( checkpt_filename, preview ); + if( FD_UNLIKELY( err ) ) { + FD_LOG_WARNING(( "unable to preview %s (%i-%s)", checkpt_filename, err, fd_wksp_strerror( err ) )); + return NULL; + } + uint seed = preview->seed; + ulong part_max = preview->part_max; + ulong data_max = preview->data_max; + + ulong total_sz = fd_wksp_footprint( part_max, data_max ); + + int fd; + if( funk_filename == NULL || funk_filename[0] == '\0' ) { + fd = -1; /* Anonymous */ + + } else { + + /* Open the file */ + fd = open( funk_filename, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR ); + if( FD_UNLIKELY( fd < 0 ) ) { + FD_LOG_WARNING(( "error opening %s: %s", funk_filename, strerror(errno) )); + return NULL; + } + + /* Resize the file */ + + struct stat statbuf; + int r = fstat( fd, &statbuf ); + if( FD_UNLIKELY( r < 0 ) ) { + FD_LOG_WARNING(( "error opening %s: %s", funk_filename, strerror(errno) )); + close( fd ); + return NULL; + } + if( statbuf.st_size != (off_t)total_sz ) { + if( FD_UNLIKELY( ftruncate( fd, (off_t)total_sz ) < 0 ) ) { + FD_LOG_WARNING(( "error resizing %s: %s", funk_filename, strerror(errno) )); + close( fd ); + return NULL; + } + } + + /* Force all the disk blocks to be physically allocated to avoid major faults in the future */ + + uchar zeros[4<<20]; + 
memset( zeros, 0, sizeof(zeros) ); + for( ulong i = 0; i < total_sz; ) { + ulong sz = fd_ulong_min( sizeof(zeros), total_sz - i ); + if( FD_UNLIKELY ( pwrite( fd, zeros, sz, (__off_t)i ) < (ssize_t)sz ) ) { + FD_LOG_WARNING(( "error zeroing %s: %s", (funk_filename ? funk_filename : "(NULL)"), strerror(errno) )); + close( fd ); + return NULL; + } + sync_file_range( fd, (__off64_t)i, (__off64_t)sz, SYNC_FILE_RANGE_WRITE ); + i += sz; + } + } + + /* Create the memory map */ + + void * shmem = mmap( NULL, total_sz, PROT_READ|PROT_WRITE, + (fd == -1 ? (MAP_ANONYMOUS|MAP_PRIVATE) : MAP_SHARED), fd, 0 ); + + if( FD_UNLIKELY( shmem == MAP_FAILED ) ) { + FD_LOG_WARNING(( "error mapping %s: %s", (funk_filename ? funk_filename : "(NULL)"), strerror(errno) )); + close( fd ); + return NULL; + } + + /* Create the workspace */ + + void * shwksp = fd_wksp_new( shmem, "funk", seed, part_max, data_max ); + if( FD_UNLIKELY( !shwksp ) ) { + FD_LOG_WARNING(( "fd_wksp_new(%p,\"%s\",%u,%lu,%lu) failed", shmem, "funk", seed, part_max, data_max )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + fd_wksp_t * wksp = fd_wksp_join( shwksp ); + if( FD_UNLIKELY( !wksp ) ) { + FD_LOG_WARNING(( "fd_wksp_join(%p) failed", shwksp )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + ulong page_sz = PAGESIZE; + ulong page_cnt = total_sz/PAGESIZE; + int join_err = fd_shmem_join_anonymous( "funk", FD_SHMEM_JOIN_MODE_READ_WRITE, wksp, shmem, page_sz, page_cnt ); + if( FD_UNLIKELY( join_err ) ) { + FD_LOG_WARNING(( "fd_shmem_join_anonymous failed" )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + /* Restore the checkpoint */ + + if( fd_wksp_restore( wksp, checkpt_filename, seed ) ) { + FD_LOG_WARNING(( "restoring %s failed", checkpt_filename )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + /* Let's play find the funk */ + + fd_wksp_tag_query_info_t info; + if( FD_UNLIKELY( !fd_wksp_tag_query( wksp, &wksp_tag, 1, &info, 1 ) ) ) 
{ + FD_LOG_WARNING(( "%s does not contain a funky", checkpt_filename )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + void * funk_shmem = fd_wksp_laddr_fast( wksp, info.gaddr_lo ); + fd_funk_t * funk = fd_funk_join( ljoin, funk_shmem ); + if( FD_UNLIKELY( funk == NULL ) ) { + FD_LOG_WARNING(( "failed to join a funky" )); + munmap( shmem, total_sz ); + close( fd ); + return NULL; + } + + FD_LOG_NOTICE(( "opened funk size %f GB, backing file %s", ((double)total_sz)/((double)(1LU<<30)), (funk_filename ? funk_filename : "(NULL)") )); + + if( FD_UNLIKELY( close_args_out != NULL ) ) { + close_args_out->shmem = shmem; + close_args_out->fd = fd; + close_args_out->total_sz = total_sz; + } + return funk; +} + +void +fd_funk_close_file( fd_funk_close_file_args_t * close_args ) { + fd_shmem_leave_anonymous( close_args->shmem, NULL ); + munmap( close_args->shmem, close_args->total_sz ); + close( close_args->fd ); +} diff --git a/src/funk/fd_funk_filemap.h b/src/funk/fd_funk_filemap.h new file mode 100644 index 0000000000..3b56d0656f --- /dev/null +++ b/src/funk/fd_funk_filemap.h @@ -0,0 +1,68 @@ +#ifndef HEADER_fd_src_funk_fd_funk_filemap_h +#define HEADER_fd_src_funk_fd_funk_filemap_h + +#include "fd_funk.h" + +enum fd_funk_file_mode { + FD_FUNK_READONLY, /* Only open the file if it already exists, memory is marked readonly */ + FD_FUNK_READ_WRITE, /* Only open the file if it already exists, can be written to */ + FD_FUNK_CREATE, /* Use an existing file if available, otherwise create */ + FD_FUNK_OVERWRITE, /* Create new or overwrite existing with a fresh instance */ + FD_FUNK_CREATE_EXCL /* Fail if file exists, only create new */ +}; +typedef enum fd_funk_file_mode fd_funk_file_mode_t; + +/* fd_funk_close_file_args_t contains the parameters needed by + * fd_funk_close_file. It is initialized in fd_funk_open_file. 
*/ + +struct fd_funk_close_file_args { + void * shmem; + int fd; + ulong total_sz; +}; +typedef struct fd_funk_close_file_args fd_funk_close_file_args_t; + +/* Open or create a funk instance with an optional mmap backing file. + filename is the backing file, or NULL for a local/anonymous + instance. wksp_tag is the workspace partition tag for funk (usually + just 1). seed is the randomized hash seed. txn_max is the maximum + number of funk transactions. rec_max is the maximum number of funk + records. total_sz is the total size of the funk workspace. mode is + the file mode (see above). close_args_opt is an optional pointer to a + structure which is filled in. This is needed for fd_funk_close_file. + + Note that seed, txn_max, rec_max, and total_sz are ignored if + an existing file is opened without being overwritten. */ + +fd_funk_t * +fd_funk_open_file( void * ljoin, + const char * filename, + ulong wksp_tag, + ulong seed, + ulong txn_max, + ulong rec_max, + ulong total_sz, + fd_funk_file_mode_t mode, + fd_funk_close_file_args_t * close_args_out ); + +/* Load a workspace checkpoint containing a funk + instance. funk_filename is the backing file, or NULL for a + local/anonymous instance. wksp_tag is the workspace partition tag + for funk (usually just 1). checkpt_filename is the checkpoint + file. close_args_opt is an optional pointer to a structure which is + filled in. This is needed for fd_funk_close_file. */ + +fd_funk_t * +fd_funk_recover_checkpoint( void * ljoin, + const char * funk_filename, + ulong wksp_tag, + const char * checkpt_filename, + fd_funk_close_file_args_t * close_args_out ); + +/* Release the resources associated with a funk file map. The funk + pointer is invalid after this is called. 
*/ + +void +fd_funk_close_file( fd_funk_close_file_args_t * close_args ); + +#endif /* HEADER_fd_src_funk_fd_funk_filemap_h */ diff --git a/src/util/archive/fd_tar.h b/src/util/archive/fd_tar.h index 5aa10aadf7..09de84c442 100644 --- a/src/util/archive/fd_tar.h +++ b/src/util/archive/fd_tar.h @@ -84,7 +84,7 @@ fd_tar_set_octal( char buf[ static 12 ], ulong val ); /* fd_tar_meta_set_size sets the size field. Returns 1 on success, 0 - if sz is too large to be represented in TAR header. Set size using the + if sz is too large to be represented in TAR header. Set size using the OLDGNU size extension to allow for unlimited file sizes. The first byte must be 0x80 followed by 0s and then the size in binary. */ @@ -240,30 +240,30 @@ fd_tar_read( void * reader, 2. Write out file data with fd_tar_writer_write_file_data( writer, data, data_sz ). This can be done as many times as you want. 3. Finish the current file with fd_tar_writer_fini_file( writer ). - - When you are done, call fd_tar_writer_delete( writer ) to write out the + + When you are done, call fd_tar_writer_delete( writer ) to write out the tar archive trailer and close otu the file descriptor. - If you want to reserve space for an existing file and write back to it + If you want to reserve space for an existing file and write back to it at some point in the future see the below comments for fd_tar_writer_{make,fill}_space(). - + */ struct fd_tar_writer { int fd; /* The file descriptor for the tar archive. */ ulong header_pos; /* The position in the file for the current files header. - If there is no current file that is being streamed out, + If there is no current file that is being streamed out, the header_pos will be equal to ULONG_MAX. */ ulong data_sz; /* The size of the current files data. If there is no current file that is being streamed out, the data_sz will be equal to ULONG_MAX. 
*/ ulong wb_pos; /* If this value is not equal to ULONG_MAX that means that - this is the position at which to write back to with a + this is the position at which to write back to with a call to fd_tar_writer_fill_space. */ - /* TODO: Right now, the stream to the tar writer just uses fd_io_write. + /* TODO: Right now, the stream to the tar writer just uses fd_io_write. This can eventually be abstracted to use write callbacks that use - fd_io streaming under the hood. This adds some additional complexity + fd_io streaming under the hood. This adds some additional complexity that's related to writing back into the header: if the header is still in the ostream buf, modify the buffer. Otherwise, read the header directly from the file. */ @@ -333,15 +333,15 @@ fd_tar_writer_fini_file( fd_tar_writer_t * writer ); /* fd_tar_writer_make_space and fd_tar_writer_fill_space, allow for writing back to a specific place in the tar stream. This can be used by first making a call to fd_tar_write_new_file, fd_tar_writer_make_space, and - fd_tar_writer_fini_file. This will populate the header and write out + fd_tar_writer_fini_file. This will populate the header and write out random bytes. The start of this data file will be saved by the tar writer. - Up to n data files can be appended to the tar archive before a call to + Up to n data files can be appended to the tar archive before a call to fd_tar_writer_fill_space. fd_tar_writer_fill_space should only be called after an unpaired call to fd_tar_writer_make_space and it requires a valid fd_tar_writer_t handle. It allows the user to write back to the point at which they made space. _make_space and _fill_space should be paired together. There can only be one oustanding call to make_space at a time. - + TODO: This can be extended to support multiple write backs. 
*/ int From 787bd815f86e5031d32feef97607849859b5b674 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Sat, 10 May 2025 00:42:20 +0000 Subject: [PATCH 08/34] fdctl: switch tiles to topo-provided funk --- src/discof/rpc/fd_rpcserv_tile.c | 33 +++++++----------------------- src/discof/writer/fd_writer_tile.c | 20 ++---------------- src/funk/fd_funk.c | 4 ++-- src/funk/fd_funk.h | 4 ++-- 4 files changed, 13 insertions(+), 48 deletions(-) diff --git a/src/discof/rpc/fd_rpcserv_tile.c b/src/discof/rpc/fd_rpcserv_tile.c index ea73ce0e6f..cdec7fdc99 100644 --- a/src/discof/rpc/fd_rpcserv_tile.c +++ b/src/discof/rpc/fd_rpcserv_tile.c @@ -14,7 +14,6 @@ #include "../../disco/fd_disco.h" #include "../../disco/shred/fd_stake_ci.h" #include "../../disco/topo/fd_pod_format.h" -#include "../../funk/fd_funk_filemap.h" #include "../../disco/keyguard/fd_keyload.h" #include @@ -26,9 +25,6 @@ struct fd_rpcserv_tile_ctx { fd_rpcserver_args_t args; - char funk_file[ PATH_MAX ]; - - int activated; fd_rpc_ctx_t * ctx; @@ -85,12 +81,7 @@ before_credit( fd_rpcserv_tile_ctx_t * ctx, fd_stem_context_t * stem, int * charge_busy ) { (void)stem; - - if( FD_UNLIKELY( !ctx->activated ) ) { - *charge_busy = 0; - } else { - *charge_busy = fd_rpc_ws_poll( ctx->ctx ); - } + *charge_busy = fd_rpc_ws_poll( ctx->ctx ); } static void @@ -138,20 +129,7 @@ after_frag( fd_rpcserv_tile_ctx_t * ctx, (void)stem; if( FD_LIKELY( in_idx==REPLAY_NOTIF_IDX ) ) { - if( FD_UNLIKELY( !ctx->activated ) ) { - fd_rpcserver_args_t * args = &ctx->args; - fd_funk_t * funk = fd_funk_open_file( - args->funk, ctx->funk_file, 1, 0, 0, 0, 0, FD_FUNK_READ_WRITE, NULL ); - if( FD_UNLIKELY( !funk ) ) { - FD_LOG_ERR(( "failed to join a funky" )); - } - - ctx->activated = 1; - fd_rpc_start_service( args, ctx->ctx ); - } - fd_rpc_replay_after_frag( ctx->ctx, &ctx->replay_notif_in_state ); - } else if( FD_UNLIKELY( in_idx==STAKE_CI_IN_IDX ) ) { fd_rpc_stake_after_frag( ctx->ctx, ctx->args.stake_ci ); @@ -189,7 +167,6 @@ 
privileged_init( fd_topo_t * topo, args->stake_ci = fd_stake_ci_join( fd_stake_ci_new( stake_ci_mem, ctx->identity_key ) ); - strncpy( ctx->funk_file, tile->replay.funk_file, sizeof(ctx->funk_file) ); /* Open funk after replay tile is booted */ /* Blockstore setup */ @@ -251,8 +228,6 @@ unprivileged_init( fd_topo_t * topo, FD_TEST( ( !!smem ) & ( !!fmem ) ); fd_scratch_attach( smem, fmem, FD_RPC_SCRATCH_MAX, FD_RPC_SCRATCH_DEPTH ); - ctx->activated = 0; - fd_topo_link_t * replay_notif_in_link = &topo->links[ tile->in_link_id[ REPLAY_NOTIF_IDX ] ]; ctx->replay_notif_in_mem = topo->workspaces[ topo->objs[ replay_notif_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->replay_notif_in_chunk0 = fd_dcache_compact_chunk0( ctx->replay_notif_in_mem, replay_notif_in_link->dcache ); @@ -262,6 +237,12 @@ unprivileged_init( fd_topo_t * topo, ctx->stake_ci_in_mem = topo->workspaces[ topo->objs[ stake_ci_in_link->dcache_obj_id ].wksp_id ].wksp; ctx->stake_ci_in_chunk0 = fd_dcache_compact_chunk0( ctx->stake_ci_in_mem, stake_ci_in_link->dcache ); ctx->stake_ci_in_wmark = fd_dcache_compact_wmark ( ctx->stake_ci_in_mem, stake_ci_in_link->dcache, stake_ci_in_link->mtu ); + + fd_rpcserver_args_t * args = &ctx->args; + if( FD_UNLIKELY( !fd_funk_join( args->funk, fd_topo_obj_laddr( topo, tile->rpcserv.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + fd_rpc_start_service( args, ctx->ctx ); } static ulong diff --git a/src/discof/writer/fd_writer_tile.c b/src/discof/writer/fd_writer_tile.c index f2eeca4895..7e01e438be 100644 --- a/src/discof/writer/fd_writer_tile.c +++ b/src/discof/writer/fd_writer_tile.c @@ -9,7 +9,6 @@ #include "../../flamenco/runtime/fd_executor.h" #include "../../funk/fd_funk.h" -#include "../../funk/fd_funk_filemap.h" struct fd_writer_tile_in_ctx { fd_wksp_t * mem; @@ -31,7 +30,6 @@ struct fd_writer_tile_ctx { /* Local join of Funk. R/W. */ fd_funk_t funk[1]; - fd_wksp_t * funk_wksp; /* Link management. 
*/ fd_writer_tile_in_ctx_t exec_writer_in[ FD_PACK_MAX_BANK_TILES ]; @@ -339,23 +337,9 @@ unprivileged_init( fd_topo_t * topo, /* Funk */ /********************************************************************/ - FD_LOG_DEBUG(( "Trying to join funk at file=%s", tile->writer.funk_file )); - fd_funk_txn_start_write( NULL ); - int funk_join_ok = !!fd_funk_open_file( ctx->funk, - tile->writer.funk_file, - 1UL, - 0UL, - 0UL, - 0UL, - 0UL, - FD_FUNK_READ_WRITE, - NULL ); - fd_funk_txn_end_write( NULL ); - ctx->funk_wksp = fd_funk_wksp( ctx->funk ); - if( FD_UNLIKELY( !funk_join_ok ) ) { - FD_LOG_CRIT(( "Failed to join funk" )); + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->writer.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); } - FD_LOG_DEBUG(( "Just joined funk at file=%s", tile->writer.funk_file )); /********************************************************************/ /* Setup fseq */ diff --git a/src/funk/fd_funk.c b/src/funk/fd_funk.c index 8966633899..9dd9ce51a3 100644 --- a/src/funk/fd_funk.c +++ b/src/funk/fd_funk.c @@ -132,8 +132,8 @@ fd_funk_new( void * shmem, } fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ) { +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ) { if( FD_UNLIKELY( !shfunk ) ) { FD_LOG_WARNING(( "NULL shfunk" )); return NULL; diff --git a/src/funk/fd_funk.h b/src/funk/fd_funk.h index 3a13a0ade3..140003634f 100644 --- a/src/funk/fd_funk.h +++ b/src/funk/fd_funk.h @@ -327,8 +327,8 @@ fd_funk_new( void * shmem, (joins are local to a thread group). */ fd_funk_t * -fd_funk_join( void * ljoin, - void * shfunk ); +fd_funk_join( fd_funk_t * ljoin, + void * shfunk ); /* fd_funk_leave leaves a funk join. 
Returns the memory region used for join on success (caller has ownership on return and the caller is no From e28e99c713a75c25457077d05064d0d5395243ba Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 19:33:05 +0000 Subject: [PATCH 09/34] remove unrelated bundle files --- src/disco/bundle/fd_bundle_client.c | 277 -------------------------- src/disco/bundle/test_bundle_client.c | 76 ------- 2 files changed, 353 deletions(-) delete mode 100644 src/disco/bundle/fd_bundle_client.c delete mode 100644 src/disco/bundle/test_bundle_client.c diff --git a/src/disco/bundle/fd_bundle_client.c b/src/disco/bundle/fd_bundle_client.c deleted file mode 100644 index 3fb16c6422..0000000000 --- a/src/disco/bundle/fd_bundle_client.c +++ /dev/null @@ -1,277 +0,0 @@ -#include "fd_bundle_client_private.h" -#include "../../waltz/h2/fd_h2_rbuf_ossl.h" -#include "../../waltz/grpc/fd_grpc.h" -#include -#include -#include - -/* Forward declarations */ - -static fd_h2_callbacks_t const fd_bundle_h2_callbacks; - -ulong -fd_bundle_client_align( void ) { - return alignof(fd_bundle_client_t); -} - -ulong -fd_bundle_client_footprint( void ) { - ulong l = FD_LAYOUT_INIT; - l = FD_LAYOUT_APPEND( l, alignof(fd_bundle_client_t), sizeof(fd_bundle_client_t) ); - l = FD_LAYOUT_APPEND( l, alignof(fd_bundle_client_bufs_t), sizeof(fd_bundle_client_bufs_t) ); - l = FD_LAYOUT_APPEND( l, fd_bundle_h2_stream_pool_align(), fd_bundle_h2_stream_pool_footprint( FD_BUNDLE_CLIENT_MAX_STREAMS ) ); - return FD_LAYOUT_FINI( l, fd_bundle_client_align() ); -} - -fd_bundle_client_t * -fd_bundle_client_new( void * mem, - SSL * ssl, - fd_bundle_client_metrics_t * metrics ) { - FD_SCRATCH_ALLOC_INIT( l, mem ); - void * client_mem = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_bundle_client_t), sizeof(fd_bundle_client_t) ); - void * bufs_mem = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_bundle_client_bufs_t), sizeof(fd_bundle_client_bufs_t) ); - void * stream_pool_mem = FD_SCRATCH_ALLOC_APPEND( l, 
fd_bundle_h2_stream_pool_align(), fd_bundle_h2_stream_pool_footprint( FD_BUNDLE_CLIENT_MAX_STREAMS ) ); - FD_SCRATCH_ALLOC_FINI( l, fd_bundle_client_align() ); - - fd_bundle_client_t * client = client_mem; - fd_bundle_client_bufs_t * bufs = bufs_mem; - - fd_bundle_h2_stream_t * stream_pool = - fd_bundle_h2_stream_pool_join( fd_bundle_h2_stream_pool_new( stream_pool_mem, FD_BUNDLE_CLIENT_MAX_STREAMS ) ); - if( FD_UNLIKELY( !stream_pool ) ) FD_LOG_CRIT(( "Failed to create stream pool" )); /* unreachable */ - - *client = (fd_bundle_client_t){ - .ssl = ssl, - .stream_pool = stream_pool, - .nanopb_rx = bufs->nanopb_rx, - .nanopb_tx = bufs->nanopb_tx, - .frame_scratch = bufs->frame_scratch, - .metrics = metrics - }; - fd_h2_rbuf_init( client->frame_rx, bufs->frame_rx_buf, sizeof(bufs->frame_rx_buf) ); - fd_h2_rbuf_init( client->frame_tx, bufs->frame_tx_buf, sizeof(bufs->frame_tx_buf) ); - - fd_h2_conn_init_client( client->conn ); - client->conn->ctx = client; - - /* Don't memset bufs for better performance */ - - return client; -} - -void * -fd_bundle_client_delete( fd_bundle_client_t * client ) { - return client; -} - -static int -fd_ossl_log_error( char const * str, - ulong len, - void * ctx ) { - (void)ctx; - FD_LOG_WARNING(( "%.*s", (int)len, str )); - return 0; -} - -void -fd_bundle_client_rxtx( fd_bundle_client_t * client ) { - SSL * ssl = client->ssl; - if( FD_UNLIKELY( !client->ssl_hs_done ) ) { - int res = SSL_do_handshake( ssl ); - if( res<=0 ) { - int error = SSL_get_error( ssl, res ); - if( FD_LIKELY( error==SSL_ERROR_WANT_READ || error==SSL_ERROR_WANT_WRITE ) ) return; - ERR_print_errors_cb( fd_ossl_log_error, NULL ); - client->failed = 1; - return; - } else { - client->ssl_hs_done = 1; - } - } - - fd_h2_conn_t * conn = client->conn; - fd_h2_rbuf_ssl_read( client->frame_rx, ssl ); - if( FD_UNLIKELY( conn->flags ) ) fd_h2_tx_control( conn, client->frame_tx ); - fd_h2_rx( conn, client->frame_rx, client->frame_tx, client->frame_scratch, FD_BUNDLE_CLIENT_BUFSZ, 
&fd_bundle_h2_callbacks ); - fd_h2_rbuf_ssl_write( client->frame_tx, ssl ); -} - -/* fd_bundle_client_request continue attempts to write a request data - frame. */ - -static int -fd_bundle_client_request_continue1( fd_bundle_client_t * client ) { - fd_h2_stream_t * stream = client->request_stream; - fd_h2_tx_op_copy( client->conn, stream, client->frame_tx, client->request_tx_op ); - if( FD_UNLIKELY( client->request_tx_op->chunk_sz ) ) return 0; - if( FD_UNLIKELY( stream->state != FD_H2_STREAM_STATE_CLOSING_TX ) ) return 0; - /* Request finished */ - client->request_stream = NULL; - return 1; -} - -static int -fd_bundle_client_request_continue( fd_bundle_client_t * client ) { - if( FD_UNLIKELY( client->conn->flags & FD_H2_CONN_FLAGS_DEAD ) ) return 0; - if( FD_UNLIKELY( !client->request_stream ) ) return 0; - if( FD_UNLIKELY( !client->request_tx_op->chunk_sz ) ) return 0; - return fd_bundle_client_request_continue1( client ); -} - -/* fd_bundle_client_stream_acquire grabs a new stream ID and a stream - object. */ - -static inline int -fd_bundle_client_stream_acquire_is_safe( fd_bundle_client_t * client ) { - /* Sufficient quota to start a stream? */ - if( FD_UNLIKELY( client->conn->stream_active_cnt[1]+1 <= client->conn->peer_settings.max_concurrent_streams ) ) return 0; - - /* Free stream object available? 
*/ - if( FD_UNLIKELY( !fd_bundle_h2_stream_pool_free( client->stream_pool ) ) ) return 0; - if( FD_UNLIKELY( client->stream_cnt >= FD_BUNDLE_CLIENT_MAX_STREAMS ) ) return 0; - - return 1; -} - -static fd_h2_stream_t * -fd_bundle_client_stream_acquire( fd_bundle_client_t * client ) { - if( FD_UNLIKELY( client->stream_cnt >= FD_BUNDLE_CLIENT_MAX_STREAMS ) ) { - FD_LOG_CRIT(( "stream pool exhausted" )); - } - - fd_h2_conn_t * conn = client->conn; - uint const stream_id = client->conn->rx_stream_next; - conn->rx_stream_next += 2U; - - fd_bundle_h2_stream_t * stream_node = fd_bundle_h2_stream_pool_ele_acquire( client->stream_pool ); - - fd_h2_stream_t * stream = fd_h2_stream_open( fd_h2_stream_init( &stream_node->s ), conn, stream_id ); - client->request_stream = stream; - client->stream_ids[ stream_id ] = stream_id; - client->stream_cnt++; - return stream; -} - -static void -fd_bundle_client_stream_release( fd_bundle_client_t * client, - fd_h2_stream_t * stream ) { - if( FD_UNLIKELY( !client->stream_cnt ) ) FD_LOG_CRIT(( "stream map corrupt" )); /* unreachable */ - - /* Deallocate tx_op */ - if( FD_UNLIKELY( stream == client->request_stream ) ) { - client->request_stream = NULL; - *client->request_tx_op = (fd_h2_tx_op_t){0}; - } - - /* Remove stream from map */ - int map_idx = -1; - for( int i=0UL; istream_ids[ i ] == stream->stream_id ) { - map_idx = i; - } - } - if( FD_UNLIKELY( map_idx<0 ) ) FD_LOG_CRIT(( "stream map corrupt" )); /* unreachable */ - if( (ulong)map_idx+1 < client->stream_cnt ) { - client->stream_ids[ map_idx ] = client->stream_ids[ client->stream_cnt-1 ]; - client->streams [ map_idx ] = client->streams [ client->stream_cnt-1 ]; - client->stream_cnt--; - } - - fd_bundle_h2_stream_t * stream_node = (void *)( (ulong)stream - offsetof(fd_bundle_h2_stream_t, s) ); - fd_bundle_h2_stream_pool_ele_release( client->stream_pool, stream_node ); -} - -int -fd_bundle_client_request_start( - fd_bundle_client_t * client, - char const * path, - ulong path_len, - 
pb_msgdesc_t const * fields, - void const * message, - char const * auth_token, - ulong auth_token_sz -) { - /* Sanity check conn */ - if( FD_UNLIKELY( client->conn->flags & FD_H2_CONN_FLAGS_DEAD ) ) return 0; - if( FD_UNLIKELY( !fd_h2_rbuf_is_empty( client->frame_tx ) ) ) return 0; - if( FD_UNLIKELY( !fd_bundle_client_stream_acquire_is_safe( client ) ) ) return 0; - - /* Encode message */ - FD_STATIC_ASSERT( sizeof((fd_bundle_client_bufs_t *)0)->nanopb_rx == sizeof(fd_grpc_hdr_t)+FD_BUNDLE_CLIENT_MSG_SZ_MAX, sz ); - uchar * proto_buf = client->nanopb_rx + sizeof(fd_grpc_hdr_t); - pb_ostream_t ostream = pb_ostream_from_buffer( proto_buf, FD_BUNDLE_CLIENT_MSG_SZ_MAX ); - if( FD_UNLIKELY( !pb_encode( &ostream, fields, message ) ) ) { - FD_LOG_WARNING(( "Failed to encode Protobuf message (%.*s). This is a bug (insufficient buffer space?)", (int)path_len, path )); - return 0; - } - ulong const serialized_sz = ostream.bytes_written; - - /* Create gRPC length prefix */ - fd_grpc_hdr_t hdr = { .compressed=0, .msg_sz=(uint)serialized_sz }; - memcpy( client->nanopb_rx, &hdr, sizeof(fd_grpc_hdr_t) ); - ulong const payload_sz = serialized_sz + sizeof(fd_grpc_hdr_t); - - /* Allocate stream descriptor */ - fd_h2_stream_t * stream = fd_bundle_client_stream_acquire( client ); - uint const stream_id = stream->stream_id; - - /* Write HTTP/2 request headers */ - fd_h2_tx_prepare( client->conn, client->frame_tx, FD_H2_FRAME_TYPE_HEADERS, FD_H2_FLAG_END_HEADERS, stream_id ); - fd_grpc_req_hdrs_t req_meta = { - .path = path, - .path_len = path_len, - .https = 1, /* bundle_client assumes TLS encryption for now */ - - .bearer_auth = auth_token, - .bearer_auth_len = auth_token_sz - }; - if( FD_UNLIKELY( !fd_grpc_h2_gen_request_hdrs( &req_meta, client->frame_tx ) ) ) { - FD_LOG_WARNING(( "Failed to generate gRPC request headers (%.*s). 
This is a bug", (int)path_len, path )); - return 0; - } - fd_h2_tx_commit( client->conn, client->frame_tx ); - - /* Queue request payload for send - (Protobuf message might have to be fragmented into multiple HTTP/2 - DATA frames if the client gets blocked) */ - fd_h2_tx_op_init( client->request_tx_op, client->nanopb_rx, payload_sz, FD_H2_FLAG_END_STREAM ); - fd_bundle_client_request_continue1( client ); - client->metrics->requests_sent++; - - FD_LOG_DEBUG(( "gRPC request path=%.*s sz=%lu", (int)path_len, path, serialized_sz )); - - return 1; -} - -/* A HTTP/2 flow control change might unblock a queued request send op */ - -void -fd_bundle_h2_window_update( fd_h2_conn_t * conn, - uint increment ) { - (void)increment; - fd_bundle_client_request_continue( conn->ctx ); -} - -void -fd_bundle_h2_stream_window_update( fd_h2_conn_t * conn, - fd_h2_stream_t * stream, - uint increment ) { - (void)stream; (void)increment; - fd_bundle_client_request_continue( conn->ctx ); -} - -/* fd_bundle_h2_callbacks specifies h2->bundle_client callbacks. - Stored in .rodata for security. Must be kept in sync with fd_h2 to - avoid NULL pointers. */ - -static fd_h2_callbacks_t const fd_bundle_h2_callbacks = { - .stream_create = fd_h2_noop_stream_create, - .stream_query = fd_bundle_h2_stream_query, - .conn_established = fd_h2_noop_conn_established, - .conn_final = fd_h2_noop_conn_final, - .headers = fd_bundle_h2_cb_headers, - .data = fd_bundle_h2_cb_data, - .rst_stream = fd_bundle_h2_rst_stream, - .window_update = fd_bundle_h2_window_update, - .stream_window_update = fd_bundle_h2_stream_window_update, -}; diff --git a/src/disco/bundle/test_bundle_client.c b/src/disco/bundle/test_bundle_client.c deleted file mode 100644 index 853892ee5f..0000000000 --- a/src/disco/bundle/test_bundle_client.c +++ /dev/null @@ -1,76 +0,0 @@ -/* test_bundle_client.c creates a gRPC connection and fetches auth - tokens. 
*/ - -#include "fd_bundle_client.h" - -#include -#include -#include -#include -#include - -int -main( int argc, - char ** argv ) { - fd_boot( &argc, &argv ); - - ulong cpu_idx = fd_tile_cpu_id( fd_tile_idx() ); - if( cpu_idx>=fd_shmem_cpu_cnt() ) cpu_idx = 0UL; - - char const * endpoint = fd_env_strip_cmdline_cstr ( &argc, &argv, "--endpoint", NULL, NULL ); - char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); - ulong page_cnt = fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); - ulong numa_idx = fd_env_strip_cmdline_ulong( &argc, &argv, "--numa-idx", NULL, fd_shmem_numa_idx(cpu_idx) ); - - if( FD_UNLIKELY( !endpoint ) ) FD_LOG_ERR(( "Missing --endpoint" )); - ulong page_sz = fd_cstr_to_shmem_page_sz( _page_sz ); - if( FD_UNLIKELY( !page_sz ) ) FD_LOG_ERR(( "unsupported --page-sz" )); - - FD_LOG_NOTICE(( "Creating workspace with --page-cnt %lu --page-sz %s pages on --numa-idx %lu", page_cnt, _page_sz, numa_idx )); - fd_wksp_t * wksp = fd_wksp_new_anonymous( page_sz, page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL ); - FD_TEST( wksp ); - - SSL_library_init(); - SSL_load_error_strings(); - - SSL_CTX * ssl_ctx = SSL_CTX_new( TLS_client_method() ); - if( FD_UNLIKELY( !ssl_ctx ) ) { - FD_LOG_ERR(( "SSL_CTX_new failed" )); - } - - if( FD_UNLIKELY( !SSL_CTX_set_mode( ssl_ctx, SSL_MODE_ENABLE_PARTIAL_WRITE|SSL_MODE_AUTO_RETRY ) ) ) { - FD_LOG_ERR(( "SSL_CTX_set_mode failed" )); - } - - if( FD_UNLIKELY( !SSL_CTX_set_min_proto_version( ssl_ctx, TLS1_3_VERSION ) ) ) { - FD_LOG_ERR(( "SSL_CTX_set_min_proto_version(ssl_ctx,TLS1_3_VERSION) failed" )); - } - - BIO * bio = BIO_new_ssl_connect( ssl_ctx ); - if( FD_UNLIKELY( !bio ) ) FD_LOG_ERR(( "BIO_new_ssl_connect failed" )); - - BIO_set_conn_hostname( bio, endpoint ); - BIO_set_nbio( bio, 1 ); - - SSL * ssl = NULL; - BIO_get_ssl( bio, &ssl ); - if( FD_UNLIKELY( !ssl ) ) FD_LOG_ERR(( "BIO_get_ssl failed" )); - - void * client_mem = fd_wksp_alloc_laddr( wksp, 
fd_bundle_client_align(), fd_bundle_client_footprint(), 1UL ); - if( FD_UNLIKELY( !client_mem ) ) FD_LOG_ERR(( "Failed to alloc bundle client" )); - static fd_bundle_client_metrics_t metrics[1]; - fd_bundle_client_t * client = fd_bundle_client_new( client_mem, ssl, metrics ); - - for(;;) - fd_bundle_client_rxtx( client ); - - fd_wksp_free_laddr( fd_bundle_client_delete( client ) ); - - BIO_free_all( bio ); - SSL_CTX_free( ssl_ctx ); - - fd_wksp_delete_anonymous( wksp ); - - fd_halt(); - return 0; -} From 961aad26a9bd9d514d6f74db99485499810f8b95 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 19:34:52 +0000 Subject: [PATCH 10/34] revert unrelated changes --- src/app/firedancer/commands/configure/funk.c | 59 -------------------- src/app/rpcserver/Local.mk | 2 +- src/discof/geyser/Local.mk | 6 +- src/discof/geyser/fd_geyser.c | 4 ++ 4 files changed, 8 insertions(+), 63 deletions(-) delete mode 100644 src/app/firedancer/commands/configure/funk.c diff --git a/src/app/firedancer/commands/configure/funk.c b/src/app/firedancer/commands/configure/funk.c deleted file mode 100644 index ece80b79bb..0000000000 --- a/src/app/firedancer/commands/configure/funk.c +++ /dev/null @@ -1,59 +0,0 @@ -#include "../../../shared/commands/configure/configure.h" -#include "../../../../funk/fd_funk_filemap.h" - -#define NAME "funk" - -static int -enabled( config_t const * config ) { - (void)config; - return 1; -} - -static void -funk_init_file( config_t const * config ) { - -} - -static void -funk_init_mem( config_t const * config ) { - -} - -static void -init( config_t const * config ) { - if( config->firedancer.funk.filemap.enabled ) funk_init_file( config ); - else funk_init_mem ( config ); -} - -static void -fini( config_t const * config, - int pre_init ) { - (void)pre_init; -} - -static void -funk_check_file( config_t const * config ) { - fd_funk_open_file( funk, funk_path, 1UL, ) -} - -static void -funk_check_mem( config_t const * config ) { - -} - -static 
configure_result_t -check( config_t const * config ) { - if( config->firedancer.funk.filemap.enabled ) funk_check_file( config ); - else funk_check_mem ( config ); -} - -configure_stage_t fd_cfg_stage_funk = { - .name = NAME, - .always_recreate = 0, - .enabled = enabled, - .init = init, - .fini = fini, - .check = check, -}; - -#undef NAME diff --git a/src/app/rpcserver/Local.mk b/src/app/rpcserver/Local.mk index cad6b8e792..11457119ca 100644 --- a/src/app/rpcserver/Local.mk +++ b/src/app/rpcserver/Local.mk @@ -1,7 +1,7 @@ ifdef FD_HAS_HOSTED ifdef FD_HAS_INT128 ifdef FD_HAS_SSE -#$(call make-bin,fd_rpcserver,main,fd_discof fd_disco fd_flamenco fd_reedsol fd_ballet fd_funk fd_tango fd_choreo fd_waltz fd_util, $(SECP256K1_LIBS)) +$(call make-bin,fd_rpcserver,main,fd_discof fd_disco fd_flamenco fd_reedsol fd_ballet fd_funk fd_tango fd_choreo fd_waltz fd_util, $(SECP256K1_LIBS)) endif endif endif diff --git a/src/discof/geyser/Local.mk b/src/discof/geyser/Local.mk index e51cceb687..9da0729a19 100644 --- a/src/discof/geyser/Local.mk +++ b/src/discof/geyser/Local.mk @@ -1,7 +1,7 @@ ifdef FD_HAS_INT128 ifdef FD_HAS_SSE -#$(call add-hdrs,fd_geyser.h) -#$(call add-objs,fd_geyser,fd_discof) -#$(call make-unit-test,test_geyser,test_geyser,fd_reedsol fd_discof fd_disco fd_flamenco fd_ballet fd_funk fd_tango fd_choreo fd_waltz fd_util) +$(call add-hdrs,fd_geyser.h) +$(call add-objs,fd_geyser,fd_discof) +$(call make-unit-test,test_geyser,test_geyser,fd_reedsol fd_discof fd_disco fd_flamenco fd_ballet fd_funk fd_tango fd_choreo fd_waltz fd_util) endif endif diff --git a/src/discof/geyser/fd_geyser.c b/src/discof/geyser/fd_geyser.c index f38c4bacbb..b9ffefab2a 100644 --- a/src/discof/geyser/fd_geyser.c +++ b/src/discof/geyser/fd_geyser.c @@ -1,7 +1,11 @@ #include "fd_geyser.h" +#include "../../funk/fd_funk_filemap.h" +#include "../../tango/mcache/fd_mcache.h" #include "../../flamenco/runtime/fd_acc_mgr.h" +#include "../../util/wksp/fd_wksp_private.h" +#include 
"../../disco/topo/fd_topo.h" #include #include From 752cd2c999b804df00480a7e08947459ecaee1c2 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 19:40:15 +0000 Subject: [PATCH 11/34] remove unused file --- src/discof/restore/fd_restore_manifest.c | 261 ----------------------- 1 file changed, 261 deletions(-) delete mode 100644 src/discof/restore/fd_restore_manifest.c diff --git a/src/discof/restore/fd_restore_manifest.c b/src/discof/restore/fd_restore_manifest.c deleted file mode 100644 index 28d916cd33..0000000000 --- a/src/discof/restore/fd_restore_manifest.c +++ /dev/null @@ -1,261 +0,0 @@ -/* fd_restore_manifest.c implements streaming decode of a Solana - snapshot "manifest" file. The "manifest" is an abomination of - variable-length bincode structures. When deserializing everything - upfront, the scratch memory use is hard to control (potentially - unbounded). */ - -#include "../../flamenco/types/fd_types.h" - -/* Decode steps */ - -#define FD_MANIFEST_PT_1_1 0x01 /* bank / blockhash queue */ -#define FD_MANIFEST_PT_1_2 0x02 /* bank / blockhash queue / last hash */ -#define FD_MANIFEST_PT_1_3 0x03 /* bank / blockhash queue */ -#define FD_MANIFEST_PT_3 0x05 /* bank */ -#define FD_MANIFEST_PT_4 0x06 /* ancestors list */ -#define FD_MANIFEST_PT_5 0x07 /* bank */ -#define FD_MANIFEST_PT_6 0x08 /* hard forks list */ -#define FD_MANIFEST_PT_7 0x09 /* bank */ -#define FD_MANIFEST_PT_8 0x0a /* "hashes per tick" */ -#define FD_MANIFEST_PT_9 0x0b /* bank */ -#define FD_MANIFEST_PT_10_1 0x0c /* vote account header */ -#define FD_MANIFEST_PT_10_2 0x0d /* vote account data */ -#define FD_MANIFEST_PT_10_3 0x0e /* vote account trailer */ -#define FD_MANIFEST_PT_11 0x0f /* bank / stakes */ -#define FD_MANIFEST_PT_12 0x10 /* stake delegations */ -#define FD_MANIFEST_PT_13 0x11 /* bank / stakes */ -#define FD_MANIFEST_PT_14 0x12 /* stake history */ -#define FD_MANIFEST_PT_15 0x13 /* bank */ -#define FD_MANIFEST_PT_16 0x14 /* unused account */ -#define 
FD_MANIFEST_PT_17 0x15 /* bank */ -#define FD_MANIFEST_PT_18_1 0x16 /* epoch stakes */ -#define FD_MANIFEST_PT_18_1_1 0x17 /* vote account header */ -#define FD_MANIFEST_PT_18_1_2 0x18 /* vote account data */ -#define FD_MANIFEST_PT_18_1_3 0x19 /* vote account trailer */ -#define FD_MANIFEST_PT_18_2 0x1a /* epoch stakes / stakes */ -#define FD_MANIFEST_PT_18_3 0x1b /* epoch stakes / stake delegations */ -#define FD_MANIFEST_PT_18_4 0x1c /* epoch stakes / stakes */ -#define FD_MANIFEST_PT_18_5 0x1d /* epoch stakes / stake history */ -#define FD_MANIFEST_PT_18_6 0x1e /* epoch stakes */ -#define FD_MANIFEST_PT_18_7_1 0x1f /* epoch stakes / node id mapping */ -#define FD_MANIFEST_PT_18_7_2 0x20 /* epoch stakes / node id mapping */ -#define FD_MANIFEST_PT_18_7_3 0x21 /* epoch stakes / node id mapping */ -#define FD_MANIFEST_PT_18_8 0x22 /* epoch stakes */ -#define FD_MANIFEST_PT_18_9 0x23 /* epoch stakes */ -#define FD_MANIFEST_PT_19 0x24 /* bank, db */ -#define FD_MANIFEST_PT_20 0x25 /* db / storages */ - -/* Data structures */ - -struct __attribute__((packed)) fd_manifest_pt1_1 { - ulong bhq_last_hash_index; - uchar bhq_last_hash_present; -}; -typedef struct fd_manifest_pt1 fd_manifest_pt1_t; - -struct __attribute__((packed)) fd_manifest_pt1_3 { - ulong bhq_last_hash_index; - uchar bhq_last_hash_present; -}; -typedef struct fd_manifest_pt1 fd_manifest_pt1_t; - -struct __attribute__((packed)) fd_manifest_pt1 { - ulong bhq_last_hash_index; - uchar bhq_last_hash_present; -}; -typedef struct fd_manifest_pt1 fd_manifest_pt1_t; - -struct __attribute__((packed)) fd_manifest_pt3 { - ulong bhq_max_age; - ulong ancestors_len; -}; -typedef struct fd_manifest_pt3 fd_manifest_pt3_t; - -struct __attribute__((packed)) fd_manifest_pt5 { - fd_hash_t hash; - fd_hash_t parent_hash; - ulong parent_slot; - ulong hard_forks_len; -}; -typedef struct fd_manifest_pt5 fd_manifest_pt5_t; - -struct __attribute__((packed)) fd_manifest_pt7 { - ulong transaction_count; - ulong tick_height; - ulong 
signature_count; - ulong capitalization; - ulong max_tick_height; - uchar hashes_per_tick_present; -}; -typedef struct fd_manifest_pt7 fd_manifest_pt7_t; - -struct __attribute__((packed)) fd_manifest_pt9 { - ulong ticks_per_slot; - ulong ns_per_slot_lo; - ulong ns_per_slot_hi; - ulong genesis_creation_time; - double slots_per_year; - ulong accounts_data_len; - - ulong slot; - ulong epoch; - ulong block_height; - - fd_pubkey_t collector_id; - ulong collector_fees; - fd_fee_calculator_t fee_calculator; - fd_fee_rate_governor_t fee_rate_governor; - ulong collected_rent; - fd_rent_collector_t rent_collector; - fd_inflation_t inflation; - - ulong vote_accounts_len; -}; -typedef struct fd_manifest_pt9 fd_manifest_pt9_t; - -struct __attribute__((packed)) fd_manifest_pt10_1 { - fd_pubkey_t key; - ulong stake; - ulong lamports; - ulong data_len; -}; -typedef struct fd_manifest_pt10_1 fd_manifest_pt10_1_t; - -struct __attribute__((packed)) fd_manifest_pt10_3 { - uchar executable; - ulong rent_epoch; -}; -typedef struct fd_manifest_pt10_3 fd_manifest_pt10_3_t; - -struct __attribute__((packed)) fd_manifest_pt13 { - ulong unused; - ulong epoch; - ulong stake_history_len; -}; -typedef struct fd_manifest_pt13 fd_manifest_pt13_t; - -struct __attribute__((packed)) fd_manifest_pt15 { - fd_pubkey_t unused1; - fd_pubkey_t unused2; - ulong unused3_len; -}; -typedef struct fd_manifest_pt15 fd_manifest_pt15_t; - -struct __attribute__((packed)) fd_manifest_pt18_1 { - ulong key; - ulong vote_accounts_len; -}; -typedef struct fd_manifest_pt18_1 fd_manifest_pt18_1_t; - -struct __attribute__((packed)) fd_manifest_pt18_6 { - ulong total_stake; - ulong node_id_mapping_len; -}; -typedef struct fd_manifest_pt18_6 fd_manifest_pt18_6_t; - -struct __attribute__((packed)) fd_manifest_pt18_7_1 { - fd_pubkey_t pubkey; - ulong vote_accounts_len; -}; -typedef struct fd_manifest_pt18_7_1 fd_manifest_pt18_7_1_t; - -struct __attribute__((packed)) fd_manifest_pt19 { - uchar is_delta; - ulong storages_len; 
-}; -typedef struct fd_manifest_pt19 fd_manifest_pt19_t; - -struct __attribute__((packed)) fd_manifest_pt20 { - -}; -typedef struct fd_manifest_pt20 fd_manifest_pt20_t; - -struct fd_restore_manifest_ctx { - uint state; - uchar * buf; - ulong buf_sz; - ulong buf_max; - - ulong statev[3]; -}; - -typedef struct fd_restore_manifest_ctx fd_restore_manifest_ctx_t; - -static void const * -buf_frag( fd_restore_manifest_ctx_t * ctx, - void const * frag, - ulong * p_frag_sz, - ulong want_sz ) { - FD_TEST( want_sz<=ctx->buf_max ); - ulong frag_sz = *p_frag_sz; - if( FD_UNLIKELY( frag_szbuf_sz < want_sz ); - ulong rem_sz = want_sz - ctx->buf_sz; - frag_sz = fd_ulong_min( frag_sz, rem_sz ); - fd_memcpy( ctx->buf + ctx->buf_sz, frag, frag_sz ); - ctx->buf_sz += frag_sz; - (*p_frag_sz) -= frag_sz; - if( ctx->buf_sz == want_sz ) { - ctx->buf_sz = 0UL; - return ctx->buf; - } - return NULL; - } - (*p_frag_sz) -= want_sz; - return frag; -} - -static ushort const -manifest_node_len[] = { - [ FD_MANIFEST_PT_1_1 ] = sizeof(fd_manifest_pt1_t), - [ FD_MANIFEST_PT_1_2 ] = sizeof(fd_pubkey_t), - [ FD_MANIFEST_PT_1_3 ] = FD_HASH_HASH_AGE_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_3 ] = sizeof(fd_manifest_pt3_t), - [ FD_MANIFEST_PT_4 ] = FD_SLOT_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_5 ] = sizeof(fd_manifest_pt5_t), - [ FD_MANIFEST_PT_6 ] = FD_SLOT_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_7 ] = sizeof(fd_manifest_pt7_t), - [ FD_MANIFEST_PT_8 ] = sizeof(uchar), - [ FD_MANIFEST_PT_9 ] = sizeof(fd_manifest_pt9_t), - [ FD_MANIFEST_PT_10_1 ] = sizeof(fd_manifest_pt10_1_t), - [ FD_MANIFEST_PT_10_2 ] = 0, - [ FD_MANIFEST_PT_10_3 ] = sizeof(fd_manifest_pt10_3_t), - [ FD_MANIFEST_PT_11 ] = sizeof(ulong), - [ FD_MANIFEST_PT_12 ] = FD_DELEGATION_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_13 ] = sizeof(fd_manifest_pt13_t), - [ FD_MANIFEST_PT_14 ] = FD_STAKE_HISTORY_ENTRY_FOOTPRINT, - [ FD_MANIFEST_PT_15 ] = sizeof(fd_manifest_pt15_t), - [ FD_MANIFEST_PT_16 ] = FD_PUBKEY_U64_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_17 ] = sizeof(ulong), 
- [ FD_MANIFEST_PT_18_1 ] = sizeof(fd_manifest_pt18_1_t), - [ FD_MANIFEST_PT_18_1_1 ] = sizeof(fd_manifest_pt10_1_t), - [ FD_MANIFEST_PT_18_1_2 ] = 0, - [ FD_MANIFEST_PT_18_1_3 ] = sizeof(fd_manifest_pt10_3_t), - [ FD_MANIFEST_PT_18_2 ] = sizeof(ulong), - [ FD_MANIFEST_PT_18_3 ] = FD_DELEGATION_PAIR_FOOTPRINT, - [ FD_MANIFEST_PT_18_4 ] = sizeof(fd_manifest_pt13_t), - [ FD_MANIFEST_PT_18_5 ] = FD_STAKE_HISTORY_ENTRY_FOOTPRINT, - [ FD_MANIFEST_PT_18_6 ] = sizeof(fd_manifest_pt18_6_t), - [ FD_MANIFEST_PT_18_7_1 ] = sizeof(fd_manifest_pt18_7_1_t), - [ FD_MANIFEST_PT_18_7_2 ] = sizeof(fd_pubkey_t), - [ FD_MANIFEST_PT_18_7_3 ] = sizeof(ulong), - [ FD_MANIFEST_PT_18_8 ] = sizeof(ulong), - [ FD_MANIFEST_PT_18_9 ] = FD_PUBKEY_PUBKEY_PAIR_FOOTPRINT, -}; - -ulong -fd_restore_manifest_frag( fd_restore_manifest_ctx_t * ctx, - void const * restrict frag, - ulong frag_sz ) { - switch( ctx->state ) { - case FD_MANIFEST_PT_1: { - fd_manifest_pt1_t const * pt1 = buf_frag( ctx, frag, &frag_sz, sizeof(fd_manifest_pt1_t) ); - (void)pt1; - ctx->state = FD_MANIFEST_PT_2; - ctx->statev[0] = pt1->bhq_last_hash_present; - return frag_sz; - } - case FD_MANIFEST_PT_2: - break; - } -} From e61027c0ae97cc5af07192047e46170bc2f0d23e Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Mon, 12 May 2025 19:57:35 +0000 Subject: [PATCH 12/34] fix initialization of cons_slow in stream_ctx --- src/discof/restore/stream/fd_stream_ctx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c index 82586929f5..65c54dceb7 100644 --- a/src/discof/restore/stream/fd_stream_ctx.c +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -37,6 +37,12 @@ fd_stream_ctx_init( fd_stream_ctx_t * ctx, fd_stream_ticks_init( ctx->ticks, ctx->event_map->event_cnt, 1e3L ); fd_stream_metrics_init( ctx->metrics ); FD_TEST( fd_rng_join( fd_rng_new( ctx->rng, 0, 0UL ) ) ); + + /* init metrics link for cons_slow */ + cons_idx = 0UL; + for( ; 
cons_idx<ctx->cons_cnt; cons_idx++ ) { + ctx->cons_slow[ cons_idx ] = (ulong *)(fd_metrics_link_out( fd_metrics_base_tl, cons_idx ) + FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF); + } } fd_stream_ctx_t * From 23a905023c281af2e679171b10f8a007f5733080 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 20:12:20 +0000 Subject: [PATCH 13/34] new topo system --- .../firedancer-dev/commands/snapshot_load.c | 20 +++++++++++++------ src/disco/topo/fd_topo.h | 4 ++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index e31e04784b..c244f98f62 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -1,4 +1,4 @@ -#include "../../shared/fd_config.h" +#include "../../firedancer/topology.h" #include "../../shared/commands/configure/configure.h" #include "../../shared/commands/run/run.h" #include "../../../disco/metrics/fd_metrics.h" @@ -23,6 +23,11 @@ snapshot_load_topo( config_t * config, fd_topob_new( &config->topo, config->name ); topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size ); + fd_topob_wksp( topo, "funk" ); + fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", + config->firedancer.funk.max_account_records, + config->firedancer.funk.max_database_transactions ); + static ushort tile_to_cpu[ FD_TILE_MAX ] = {0}; if( args->tile_cpus[0] ) { ulong cpu_cnt = fd_tile_private_cpus_parse( args->tile_cpus, tile_to_cpu ); @@ -31,10 +36,7 @@ snapshot_load_topo( config_t * config, fd_topob_wksp( topo, "metric_in" ); fd_topob_wksp( topo, "metric" ); - fd_topo_tile_t * metric_tile = fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); - if( FD_UNLIKELY( !fd_cstr_to_ip4_addr( config->tiles.metric.prometheus_listen_address, &metric_tile->metric.prometheus_listen_addr ) ) ) - FD_LOG_ERR(( "failed to parse prometheus listen address `%s`",
config->tiles.metric.prometheus_listen_address )); - metric_tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); fd_topob_wksp( topo, "FileRd" ); fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); @@ -52,7 +54,8 @@ snapshot_load_topo( config_t * config, fd_topob_wksp( topo, "ActAlc" ); fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[4], 0, 0 ); - (void)actalc_tile; + fd_topob_tile_uses( topo, actalc_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); + actalc_tile->actalc.funk_obj_id = funk_obj->id; fd_topob_wksp( topo, "snap_unzstd" ); fd_topob_wksp( topo, "snap_stream" ); @@ -83,6 +86,11 @@ snapshot_load_topo( config_t * config, fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_no_consumers = 1; fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); + for( ulong i=0UL; i<topo->tile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + fd_topo_configure_tile( tile, config ); + } + if( !args->tile_cpus[0] ) { fd_topob_auto_layout( topo, 0 ); } diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index ef7250b157..595474abba 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -437,6 +437,10 @@ typedef struct { ulong scratch_sz; } snapin; + struct { + ulong funk_obj_id; + } actalc; + }; } fd_topo_tile_t; From 5a9ec6400edba4dc39e58e326e177ff34c33f840 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 20:12:56 +0000 Subject: [PATCH 14/34] uncompressed tar support --- .../firedancer-dev/commands/snapshot_load.c | 85 ++++++++++++++----- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index c244f98f62..bd9562d773 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++
b/src/app/firedancer-dev/commands/snapshot_load.c @@ -8,6 +8,7 @@ #include #include #include +#include #define NAME "snapshot-load" @@ -16,9 +17,33 @@ extern fd_topo_obj_callbacks_t * CALLBACKS[]; fd_topo_run_tile_t fdctl_tile_run( fd_topo_tile_t const * tile ); +/* _is_zstd returns 1 if given file handle points to the beginning of a + zstd stream, otherwise zero. */ + +static int +_is_zstd( char const * path ) { + FILE * file = fopen( path, "r" ); + FD_TEST( file ); + uint magic; + ulong n = fread( &magic, 1UL, 4UL, file ); + if( FD_UNLIKELY( feof( file ) ) ) { + clearerr( file ); + fseek( file, -(long)n, SEEK_CUR ); + fclose( file ); + return 0; + } + int err = ferror( file ); + if( FD_UNLIKELY( err ) ) + FD_LOG_ERR(( "fread() failed (%d-%s)", err, strerror( err ) )); + fseek( file, -4L, SEEK_CUR ); + fclose( file ); + return ( magic==0xFD2FB528UL ); +} static void snapshot_load_topo( config_t * config, args_t const * args ) { + int is_zstd = _is_zstd( args->snapshot_load.snapshot_path ); + fd_topo_t * topo = &config->topo; fd_topob_new( &config->topo, config->name ); topo->max_page_size = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size ); @@ -38,15 +63,53 @@ snapshot_load_topo( config_t * config, fd_topob_wksp( topo, "metric" ); fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); + /* read() tile */ fd_topob_wksp( topo, "FileRd" ); fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); fd_memcpy( filerd_tile->filerd.file_path, args->snapshot_load.snapshot_path, PATH_MAX ); FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); - fd_topob_wksp( topo, "Unzstd" ); - fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); - (void)unzstd_tile; + /* Uncompressed data stream */ + fd_topob_wksp( topo, 
"snap_stream" ); + fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); + fd_topo_obj_t * snapin_dcache = fd_topob_obj( topo, "dcache", "snap_stream" ); + snapin_link->dcache_obj_id = snapin_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); + + if( is_zstd ) { /* .tar.zst file */ + + /* "unzstd": Zstandard decompress tile */ + fd_topob_wksp( topo, "Unzstd" ); + fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); + (void)unzstd_tile; + + /* Compressed data stream */ + fd_topob_wksp( topo, "snap_zstd" ); + fd_topo_link_t * zstd_link = fd_topob_link( topo, "snap_zstd", "snap_zstd", 512UL, 0UL, 0UL ); + fd_topo_obj_t * zstd_dcache = fd_topob_obj( topo, "dcache", "snap_zstd"); + zstd_link->dcache_obj_id = zstd_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", zstd_dcache->id ) ); + + /* filerd tile -> compressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_zstd", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + /* compressed stream -> unzstd tile */ + fd_topob_tile_in( topo, "Unzstd", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, unzstd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* unzstd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } else { /* .tar file */ + + /* filerd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } fd_topob_wksp( topo, "SnapIn" ); fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[3], 0, 0 ); @@ -57,23 
+120,7 @@ snapshot_load_topo( config_t * config, fd_topob_tile_uses( topo, actalc_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); actalc_tile->actalc.funk_obj_id = funk_obj->id; - fd_topob_wksp( topo, "snap_unzstd" ); - fd_topob_wksp( topo, "snap_stream" ); - fd_topo_link_t * unzstd_link = fd_topob_link( topo, "snap_unzstd", "snap_unzstd", 512UL, 0UL, 0UL ); - fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); - fd_topo_obj_t * snapin_dcache = fd_topob_obj( topo, "dcache", "snap_stream" ); - fd_topo_obj_t * unzstd_dcache = fd_topob_obj( topo, "dcache", "snap_unzstd"); - unzstd_link->dcache_obj_id = unzstd_dcache->id; - snapin_link->dcache_obj_id = snapin_dcache->id; - FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", unzstd_dcache->id ) ); - fd_topob_tile_out ( topo, "FileRd", 0UL, "snap_unzstd", 0UL ); - fd_topob_tile_in (topo, "Unzstd", 0UL, "metric_in", "snap_unzstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED); - fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - fd_topob_tile_uses( topo, filerd_tile, unzstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); - fd_topob_tile_uses( topo, unzstd_tile, unzstd_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); - fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); From 1b16a7f0c98cd0946a5343458b61a9841a960b60 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 12 May 2025 20:29:11 +0000 Subject: [PATCH 15/34] fix monitor segfault --- src/app/firedancer-dev/commands/snapshot_load.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 
deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index bd9562d773..23921161e0 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -191,13 +191,14 @@ snapshot_load_cmd_fn( args_t * args, fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; - fd_topo_tile_t * unzstd_tile = &topo->tiles[ fd_topo_find_tile( topo, "Unzstd", 0UL ) ]; + ulong zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); + fd_topo_tile_t * unzstd_tile = zstd_tile_idx!=ULONG_MAX ? &topo->tiles[ zstd_tile_idx ] : NULL; ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); - ulong volatile * unzstd_in_metrics = fd_metrics_tile( unzstd_tile->metrics ); + ulong volatile * unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; ulong goff_old = 0UL; ulong file_rd_backp_old = 0UL; @@ -209,8 +210,8 @@ snapshot_load_cmd_fn( args_t * args, ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); - ulong unzstd_status = FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); - if( FD_UNLIKELY( filerd_status==2UL || snapin_status==2UL || unzstd_status==2UL ) ) { + ulong unzstd_status = unzstd_in_metrics ? 
FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; + if( FD_UNLIKELY( filerd_status==2UL && snapin_status==2UL && unzstd_status==2UL ) ) { FD_LOG_NOTICE(( "Done" )); break; } From 15ea41dc0a9bc4e17fc2f15b15e44507ae0b9994 Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Mon, 12 May 2025 20:50:20 +0000 Subject: [PATCH 16/34] use wksp addr for shared dcache --- src/discof/restore/fd_snapin_tile.c | 13 ++++++------- src/discof/restore/stream/fd_stream_writer.c | 1 + src/discof/restore/stream/fd_stream_writer.h | 3 ++- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index cda85f662e..901e5b30b2 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -577,16 +577,15 @@ unprivileged_init( fd_topo_t * topo, /* Join stream input */ - uchar const * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ); - ctx->in_base = out_dcache; - ctx->in_skip = 0UL; + ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp;; + ctx->in_skip = 0UL; /* Join frame buffer */ - ctx->buf = scratch_mem; - ctx->buf_sz = 0UL; - ctx->buf_ctr = 0UL; - ctx->buf_max = tile->snapin.scratch_sz; + ctx->buf = scratch_mem; + ctx->buf_sz = 0UL; + ctx->buf_ctr = 0UL; + ctx->buf_max = tile->snapin.scratch_sz; /* Join snapshot file parser */ diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index bd7076ac92..200468075a 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -30,6 +30,7 @@ fd_stream_writer_new( void * mem, self->out_mcache = out_mcache; self->buf = dcache; + self->buf_base = (ulong)dcache - (ulong)fd_wksp_containing( dcache ); self->buf_off = 0UL; self->buf_sz = fd_dcache_data_sz( dcache ); self->goff = 0UL; 
diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index 679317b41f..32ce466e22 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -11,6 +11,7 @@ struct fd_stream_writer { fd_stream_frag_meta_t * out_mcache; /* frag producer mcache */ uchar * buf; /* laddr of shared dcache buffer */ + ulong buf_base; /* offset to the dcache buffer from wksp */ /* dcache buffer state */ ulong buf_off; /* local write offset into dcache buffer */ @@ -117,7 +118,7 @@ fd_stream_writer_get_avail_bytes( fd_stream_writer_t * writer ) { static inline void fd_stream_writer_publish( fd_stream_writer_t * writer, ulong frag_sz ) { - ulong loff = writer->stream_off; + ulong loff = writer->buf_base + writer->stream_off; fd_mcache_publish_stream( writer->out_mcache, fd_mcache_depth( writer->out_mcache->f ), writer->out_seq, From 412472b1da5c4cce9bd6b21a05ba42f3278ba840 Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Tue, 13 May 2025 00:22:22 +0000 Subject: [PATCH 17/34] fix snapin shutdown --- .../firedancer-dev/commands/snapshot_load.c | 2 +- src/discof/restore/fd_filerd_tile.c | 2 +- src/discof/restore/fd_snapin_tile.c | 59 ++++++++++--------- src/discof/restore/fd_unzstd_tile.c | 14 ++--- src/discof/restore/stream/fd_stream_writer.c | 1 + src/discof/restore/stream/fd_stream_writer.h | 10 +++- 6 files changed, 49 insertions(+), 39 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 23921161e0..69153f0671 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -211,7 +211,7 @@ snapshot_load_cmd_fn( args_t * args, ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong unzstd_status = unzstd_in_metrics ? 
FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; - if( FD_UNLIKELY( filerd_status==2UL && snapin_status==2UL && unzstd_status==2UL ) ) { + if( FD_UNLIKELY( filerd_status==2UL && unzstd_status==2UL && snapin_status == 2UL ) ) { FD_LOG_NOTICE(( "Done" )); break; } diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c index 1ea69b9d21..b3b63b5414 100644 --- a/src/discof/restore/fd_filerd_tile.c +++ b/src/discof/restore/fd_filerd_tile.c @@ -83,7 +83,7 @@ fd_filerd_shutdown( fd_filerd_tile_t * ctx, } ctx->fd = -1; FD_MGAUGE_SET( TILE, STATUS, 2UL ); - FD_VOLATILE( ctx->out_sync[ 3 ] ) = seq_final; + FD_VOLATILE( ctx->out_sync[ 2 ] ) = seq_final; FD_COMPILER_MFENCE(); FD_LOG_INFO(( "Reached end of file" )); diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index 901e5b30b2..9c33116304 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -74,7 +74,6 @@ struct fd_snapin_tile { ulong buf_ctr; /* number of bytes allocated in buffer */ ulong buf_sz; /* target buffer size (buf_ctrshutdown_signal ); + /* wait for zstd tile to set shutdown sequence number */ + while ( in_seq_max == 0 ) { + in_seq_max = FD_VOLATILE_CONST( *ctx->shutdown_signal ); + FD_SPIN_PAUSE(); + } + + /* FIXME set final sequence number */ + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + FD_TEST( in_seq_max == ctx->seq+1 && in_seq_max != 0 ); + FD_COMPILER_MFENCE(); + FD_LOG_WARNING(( "Finished parsing snapshot" )); + + for(;;) pause(); +} + static void fd_snapshot_restore_discard_buf( fd_snapin_tile_t * self ) { self->buf_ctr = 0UL; @@ -598,6 +617,7 @@ unprivileged_init( fd_topo_t * topo, ctx->out_seq_max = 0UL; ctx->out_seq = 0UL; ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); + ctx->shutdown_signal = fd_mcache_seq_laddr_const( topo->links[ tile->in_link_id[ 0 ] ].mcache ) + 2; } @@ -627,8 +647,11 @@ tar_process_hdr( fd_snapin_tile_t * reader, int not_zero=0; for( ulong 
i=0UL; ibuf[ i ]; - if( !not_zero ) return; - + if( !not_zero ) { + cur += sizeof(fd_tar_meta_t); + fd_snapin_shutdown( reader ); + return; + } /* Not an EOF, so must be a protocol error */ ulong goff = (ulong)cur - reader->goff_translate - sizeof(fd_tar_meta_t); FD_LOG_WARNING(( "Invalid tar header magic at goff=0x%lx", goff )); @@ -657,12 +680,11 @@ tar_read_hdr( fd_snapin_tile_t * reader, uchar const * end = cur+bufsz; /* Skip padding */ - if( reader->pad_sz==0UL ) { + if( reader->buf_ctr==0UL ) { ulong goff = (ulong)cur - reader->goff_translate; - reader->pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; - ulong pad_sz_cur = fd_ulong_min( reader->pad_sz, (ulong)( end-cur ) ); - reader->pad_sz -= pad_sz_cur; - cur += pad_sz_cur; + ulong pad_sz = fd_ulong_align_up( goff, 512UL ) - goff; + pad_sz = fd_ulong_min( pad_sz, (ulong)( end-cur ) ); + cur += pad_sz; } /* Determine number of bytes to read */ @@ -804,16 +826,6 @@ fd_snapin_in_update( fd_snapin_in_t * in ) { accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; } -// __attribute__((noreturn)) static void -// fd_snapin_shutdown( void ) { -// FD_MGAUGE_SET( TILE, STATUS, 2UL ); -// /* FIXME set final sequence number */ -// FD_COMPILER_MFENCE(); -// FD_LOG_INFO(( "Finished parsing snapshot" )); - -// for(;;) pause(); -// } - __attribute__((noinline)) static void fd_snapin_run1( fd_snapin_tile_t * ctx, @@ -864,7 +876,6 @@ fd_snapin_run1( } FD_TEST( in_cnt==1 ); - // ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( in[0].mcache->f ) + 3; /* out frag stream init */ @@ -923,7 +934,6 @@ fd_snapin_run1( ulong housekeeping_ticks = 0UL; if( FD_UNLIKELY( (now-then)>=0L ) ) { ulong event_idx = (ulong)event_map[ event_seq ]; - if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ - /* Send flow control credits and drain flow control diagnostics. */ ulong in_idx = event_idx - cons_cnt - 1UL; fd_snapin_in_update( &in[ in_idx ] ); - /* Input tile finished? 
*/ - // ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); - // FD_LOG_WARNING(("snapin: in_seq_max is %lu", in_seq_max)); - // if( FD_UNLIKELY( in_seq_max == in[ 0 ].seq ) ) { - // fd_snapin_shutdown(); - // } - } else { /* event_idx==cons_cnt, housekeeping event */ /* Send synchronization info */ @@ -1092,6 +1094,7 @@ fd_snapin_run1( this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + ctx->seq = this_in->seq; } diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 34ca93fd9d..4bd9b352ba 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -139,23 +139,23 @@ fd_unzstd_in_update( fd_stream_reader_t * in ) { } __attribute__((noreturn)) static void -fd_unzstd_shutdown( void ) { +fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { FD_MGAUGE_SET( TILE, STATUS, 2UL ); - /* FIXME set final sequence number */ + fd_stream_writer_notify_shutdown( ctx->writer ); FD_COMPILER_MFENCE(); - FD_LOG_INFO(( "Finished parsing snapshot" )); for(;;) pause(); } static void fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, - ulong const volatile * shutdown_signal ) { + fd_unzstd_tile_t * ctx, + ulong const volatile * shutdown_signal ) { ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); if( FD_UNLIKELY( in_seq_max == stream_ctx->in[ 0 ].base.seq && in_seq_max != 0) ) { FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", in_seq_max, stream_ctx->in[0].base.seq)); - fd_unzstd_shutdown(); + fd_unzstd_shutdown( ctx ); } } @@ -167,7 +167,7 @@ fd_unzstd_run1( FD_LOG_INFO(( "Running unzstd tile" )); /* run loop init */ - ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 3; + ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 2; fd_stream_writer_init_flow_control_credits( ctx->writer ); fd_stream_ctx_init_run_loop( stream_ctx ); @@ -181,7 +181,7 @@ fd_unzstd_run1( /* Receive flow control credits from this out. */ fd_stream_writer_receive_flow_control_credits( ctx->writer, cons_idx ); - fd_unzstd_poll_shutdown( stream_ctx, shutdown_signal ); + fd_unzstd_poll_shutdown( stream_ctx, ctx, shutdown_signal ); } else if( event_idx>stream_ctx->cons_cnt) { /* send credits */ ulong in_idx = event_idx - stream_ctx->cons_cnt - 1UL; diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index 200468075a..dea9aeb027 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -49,6 +49,7 @@ fd_stream_writer_new( void * mem, self->cons_cnt = cons_cnt; self->cons_seq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), EXPECTED_FSEQ_CNT_PER_CONS*cons_cnt*sizeof(ulong) ); self->cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); + self->out_sync = fd_mcache_seq_laddr( topo->links[ tile->out_link_id[ link_id ] ].mcache ); /* Set up consumer fseq pointer array. We keep track of 2 fseqs per consumer to manage stream flow control. 
diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index 32ce466e22..a08bd2a0af 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -32,6 +32,7 @@ struct fd_stream_writer { ulong cons_cnt; /* number of consumers */ ulong * cons_seq; /* consumer fseq values */ ulong ** cons_fseq; /* consumer fseq pointers */ + ulong * out_sync; /* out fseq */ }; typedef struct fd_stream_writer fd_stream_writer_t; @@ -88,8 +89,8 @@ fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer, ulong cr_byte_avail = writer->cr_byte_max; ulong cr_frag_avail = writer->cr_frag_max; for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { - ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)writer->cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ 2*cons_idx+1 ] ), 0L ), 0L ); - ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)writer->cr_frag_max-fd_long_max( fd_seq_diff( writer->out_seq, writer->cons_seq[ 2*cons_idx ] ), 0L ), 0L ); + ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)writer->cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 ] ), 0L ), 0L ); + ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)writer->cr_frag_max-fd_long_max( fd_seq_diff( writer->out_seq, writer->cons_seq[ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] ), 0L ), 0L ); slowest_cons = fd_ulong_if( cons_cr_byte_availcr_byte_availburst_byte || writer->cr_frag_availburst_frag; } +static inline void +fd_stream_writer_notify_shutdown( fd_stream_writer_t * writer ) { + FD_VOLATILE( writer->out_sync[ EXPECTED_FSEQ_CNT_PER_CONS * writer->cons_cnt ] ) = writer->out_seq; +} + static inline void * fd_stream_writer_delete( fd_stream_writer_t * writer ) { fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); From ec0dec69d368d23a3e1425f8f6200d9e2c5f7beb Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 2025 
06:06:29 +0000 Subject: [PATCH 18/34] snapshot load improve output --- .../firedancer-dev/commands/snapshot_load.c | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 69153f0671..66ae1cc635 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -189,25 +189,29 @@ snapshot_load_cmd_fn( args_t * args, double ns_per_tick = 1.0/tick_per_ns; fd_topo_run_single_process( topo, 2, config->uid, config->gid, fdctl_tile_run, NULL ); - fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; - fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; + fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; + fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; ulong zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); - fd_topo_tile_t * unzstd_tile = zstd_tile_idx!=ULONG_MAX ? &topo->tiles[ zstd_tile_idx ] : NULL; + fd_topo_tile_t * unzstd_tile = zstd_tile_idx!=ULONG_MAX ? 
&topo->tiles[ zstd_tile_idx ] : NULL; + fd_topo_tile_t * actalc_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActAlc", 0UL ) ]; - ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; - ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); - ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); - ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); + ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; + ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); + ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); + ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); ulong volatile * unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; + ulong volatile * actalc_metrics = fd_metrics_tile( actalc_tile->metrics ); ulong goff_old = 0UL; ulong file_rd_backp_old = 0UL; + ulong snap_in_backp_old = 0UL; ulong snap_in_wait_old = 0UL; + ulong actalc_wait_old = 0UL; ulong acc_cnt_old = 0UL; ulong frag_cnt_old = 0UL; - for(;;) { sleep( 1 ); - + FD_LOG_NOTICE(( "---------------backp=(file,snap) busy=(snap,alc )-------------------------------" )); + for(;;) { ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong unzstd_status = unzstd_in_metrics ? 
FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; @@ -218,21 +222,29 @@ snapshot_load_cmd_fn( args_t * args, ulong goff = FD_VOLATILE_CONST( snap_in_fseq[ 1 ] ); ulong file_rd_backp = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong snap_in_backp = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); + ulong actalc_wait = FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); ulong frag_cnt = FD_VOLATILE_CONST( snap_accs_sync[0] ); ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); - FD_LOG_NOTICE(( "rate=%4.2g GB/s back=%3.0f%% parser_busy=%3.0f%% acc=%8.3g/s frag=%8.3g/s", + FD_LOG_NOTICE(( "rate=%4.2g GB/s backp=(%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%) acc=%8.3g/s frag=%8.3g/s", (double)( goff-goff_old )/1e9, ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, + ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7, + ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7, (double)( acc_cnt -acc_cnt_old ), (double)( frag_cnt-frag_cnt_old ) ) ); goff_old = goff; file_rd_backp_old = file_rd_backp; + snap_in_backp_old = snap_in_backp; snap_in_wait_old = snap_in_wait; + actalc_wait_old = actalc_wait; acc_cnt_old = acc_cnt; frag_cnt_old = frag_cnt; + sleep( 1 ); } FD_LOG_NOTICE(( "Loaded %g accounts", (double)FD_VOLATILE_CONST( snap_accs_sync[1] ) )); From 26df3d20660a103ad496a7c7202e0a765d1833dd Mon Sep 17 00:00:00 2001 From: Richard Patel Date: 
Tue, 13 May 2025 09:44:01 +0000 Subject: [PATCH 19/34] Fix funk wksp sizing --- src/app/firedancer-dev/commands/backtest.c | 3 ++- src/app/firedancer-dev/commands/snapshot_load.c | 3 ++- src/app/firedancer/topology.c | 15 +++++++++------ src/app/firedancer/topology.h | 3 ++- src/app/shared/fd_obj_callbacks.c | 14 +++++++++++--- 5 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/app/firedancer-dev/commands/backtest.c b/src/app/firedancer-dev/commands/backtest.c index 3f2cdf3739..b5684b656d 100644 --- a/src/app/firedancer-dev/commands/backtest.c +++ b/src/app/firedancer-dev/commands/backtest.c @@ -71,7 +71,8 @@ backtest_topo( config_t * config ) { fd_topob_wksp( topo, "funk" ); fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", config->firedancer.funk.max_account_records, - config->firedancer.funk.max_database_transactions ); + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); fd_topob_tile_uses( topo, replay_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 66ae1cc635..98b828b5f2 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -51,7 +51,8 @@ snapshot_load_topo( config_t * config, fd_topob_wksp( topo, "funk" ); fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", config->firedancer.funk.max_account_records, - config->firedancer.funk.max_database_transactions ); + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); static ushort tile_to_cpu[ FD_TILE_MAX ] = {0}; if( args->tile_cpus[0] ) { diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 6454147492..c7c14c78c4 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -79,11 +79,13 @@ fd_topo_obj_t * setup_topo_funk( fd_topo_t * topo, char const * wksp_name, ulong 
max_account_records, - ulong max_database_transactions ) { + ulong max_database_transactions, + ulong heap_size_gib ) { fd_topo_obj_t * obj = fd_topob_obj( topo, "funk", wksp_name ); FD_TEST( fd_pod_insert_ulong( topo->props, "funk", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_account_records, "obj.%lu.rec_max", obj->id ) ); - FD_TEST( fd_pod_insertf_ulong( topo->props, max_database_transactions, "obj.%lu.txn_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_account_records, "obj.%lu.rec_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, max_database_transactions, "obj.%lu.txn_max", obj->id ) ); + FD_TEST( fd_pod_insertf_ulong( topo->props, heap_size_gib<<30, "obj.%lu.heap_max", obj->id ) ); ulong funk_footprint = fd_funk_footprint( max_database_transactions, max_account_records ); if( FD_UNLIKELY( !funk_footprint ) ) FD_LOG_ERR(( "Invalid [funk] parameters" )); @@ -91,8 +93,8 @@ setup_topo_funk( fd_topo_t * topo, ulong wksp_idx = fd_topo_find_wksp( topo, wksp_name ); FD_TEST( wksp_idx!=ULONG_MAX ); fd_topo_wksp_t * wksp = &topo->workspaces[ wksp_idx ]; - ulong part_max = fd_wksp_part_max_est( funk_footprint, 1U<<18U ); - if( FD_UNLIKELY( !part_max ) ) FD_LOG_ERR(( "fd_wksp_part_max_est(%lu,256KiB) failed", funk_footprint )); + ulong part_max = fd_wksp_part_max_est( funk_footprint, 1U<<14U ); + if( FD_UNLIKELY( !part_max ) ) FD_LOG_ERR(( "fd_wksp_part_max_est(%lu,16KiB) failed", funk_footprint )); wksp->part_max += part_max; return obj; @@ -471,7 +473,8 @@ fd_topo_initialize( config_t * config ) { fd_topo_obj_t * funk_obj = setup_topo_funk( topo, "funk", config->firedancer.funk.max_account_records, - config->firedancer.funk.max_database_transactions ); + config->firedancer.funk.max_database_transactions, + config->firedancer.funk.heap_size_gib ); /* */ fd_topob_tile_uses( topo, batch_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); FOR(exec_tile_cnt) fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, 
"exec", i ) ], funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); diff --git a/src/app/firedancer/topology.h b/src/app/firedancer/topology.h index 3f213b1eba..73a2d827e9 100644 --- a/src/app/firedancer/topology.h +++ b/src/app/firedancer/topology.h @@ -39,7 +39,8 @@ fd_topo_obj_t * setup_topo_funk( fd_topo_t * topo, char const * wksp_name, ulong max_account_records, - ulong max_database_transactions ); + ulong max_database_transactions, + ulong heap_size_gib ); int fd_topo_configure_tile( fd_topo_tile_t * tile, diff --git a/src/app/shared/fd_obj_callbacks.c b/src/app/shared/fd_obj_callbacks.c index 009392f6e8..18fed6cad5 100644 --- a/src/app/shared/fd_obj_callbacks.c +++ b/src/app/shared/fd_obj_callbacks.c @@ -277,6 +277,13 @@ fd_topo_obj_callbacks_t fd_obj_cb_keyswitch = { .new = keyswitch_new, }; +static ulong +funk_align( fd_topo_t const * topo, + fd_topo_obj_t const * obj ) { + (void)topo; (void)obj; + return fd_funk_align(); +} + static ulong funk_footprint( fd_topo_t const * topo, fd_topo_obj_t const * obj ) { @@ -285,10 +292,10 @@ funk_footprint( fd_topo_t const * topo, } static ulong -funk_align( fd_topo_t const * topo, +funk_loose( fd_topo_t const * topo, fd_topo_obj_t const * obj ) { - (void)topo; (void)obj; - return fd_funk_align(); + (void)topo; + return VAL("heap_max"); } static void @@ -303,6 +310,7 @@ funk_new( fd_topo_t const * topo, fd_topo_obj_callbacks_t fd_obj_cb_funk = { .name = "funk", .footprint = funk_footprint, + .loose = funk_loose, .align = funk_align, .new = funk_new, }; From fa278201fc7c26833d836c0f7412ba6e23b207b2 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 2025 09:46:52 +0000 Subject: [PATCH 20/34] Fix snapin backpressure --- src/discof/restore/fd_actalc_tile.c | 2 + src/discof/restore/fd_snapin_tile.c | 74 +++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index bc45df5101..25224ec3d0 100644 --- 
a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -55,6 +55,7 @@ unprivileged_init( fd_topo_t * topo, /* FIXME check link names */ fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + memset( ctx, 0, sizeof(fd_actalc_tile_t) ); /* Join account output */ @@ -372,6 +373,7 @@ fd_actalc_run1( this_in_seq = fd_seq_inc( this_in_seq, 1UL ); this_in->seq = this_in_seq; + this_in->goff = meta.goff + meta.sz; this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index 9c33116304..d83d0647bd 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -2,6 +2,7 @@ #include "../../disco/topo/fd_topo.h" #include "../../disco/metrics/fd_metrics.h" #include "../../util/archive/fd_tar.h" +#include "../../flamenco/runtime/fd_acc_mgr.h" /* FD_ACC_SZ_MAX */ #include "../../flamenco/types/fd_types.h" #include #include @@ -65,7 +66,6 @@ struct fd_snapin_tile { uchar const * in_base; ulong goff_translate; - ulong loff_translate; ulong in_skip; /* Frame buffer */ @@ -308,6 +308,9 @@ snapshot_read_is_complete( fd_snapin_tile_t const * restore ) { static int snapshot_restore_account_hdr( fd_snapin_tile_t * restore ) { fd_solana_account_hdr_t const * hdr = fd_type_pun_const( restore->buf ); + if( FD_UNLIKELY( hdr->meta.data_len > FD_ACC_SZ_MAX ) ) { + FD_LOG_ERR(( "account data size (%lu) exceeds max (%lu) (possible memory corruption?)", hdr->meta.data_len, FD_ACC_SZ_MAX )); + } ulong data_sz = hdr->meta.data_len; restore->acc_sz = data_sz; @@ -344,7 +347,7 @@ snapshot_read_account_hdr_chunk( fd_snapin_tile_t * restore, int som = restore->buf_ctr == 0UL; ulong frag_goff = (ulong)buf - restore->goff_translate; - ulong frag_loff = (ulong)buf - restore->loff_translate; + ulong frag_loff = (ulong)buf - (ulong)restore->in_base; 
uchar const * buf_next = snapshot_read_buffered( restore, buf, bufsz ); ulong hdr_read = (ulong)(buf_next-buf); @@ -365,13 +368,14 @@ snapshot_read_account_hdr_chunk( fd_snapin_tile_t * restore, If data was included, skip ahead. (Combining header+data into the same fragment reduces the amount of descriptor frags published.) */ + ulong const frag_sz = hdr_read + peek_sz; fd_mcache_publish_stream( restore->out_mcache, restore->out_depth, restore->out_seq, frag_goff, frag_loff, - hdr_read + peek_sz, + frag_sz, fd_frag_meta_ctl( 0UL, som, eom, 0 ) ); restore->out_seq = fd_seq_inc( restore->out_seq, 1UL ); @@ -401,7 +405,7 @@ snapshot_read_account_chunk( fd_snapin_tile_t * restore, restore->out_depth, restore->out_seq, (ulong)buf - restore->goff_translate, - (ulong)buf - restore->loff_translate, + (ulong)buf - (ulong)restore->in_base, chunk_sz, fd_frag_meta_ctl( 0UL, 0, eom, 0 ) ); @@ -596,7 +600,8 @@ unprivileged_init( fd_topo_t * topo, /* Join stream input */ - ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp;; + FD_TEST( fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ) ); + ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; ctx->in_skip = 0UL; /* Join frame buffer */ @@ -776,7 +781,6 @@ on_stream_frag( fd_snapin_tile_t * ctx, uchar const * cur = start; ctx->goff_translate = (ulong)chunk0 - frag->goff; - ctx->loff_translate = (ulong)chunk0 - frag->loff; int consume_frag = 1; for(;;) { @@ -789,7 +793,8 @@ on_stream_frag( fd_snapin_tile_t * ctx, if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) { FD_LOG_ERR(( "Failed to restore snapshot" )); } - FD_LOG_ERR(( "blocked" )); + } + if( FD_UNLIKELY( ctx->out_seq >= ctx->out_seq_max ) ) { consume_frag = 0; /* retry this frag */ ulong consumed_sz = (uint)( cur-start ); ctx->in_skip += consumed_sz; @@ -807,10 +812,30 @@ 
on_stream_frag( fd_snapin_tile_t * ctx, credits back to the stream producer. Also updates link in metrics. */ static void -fd_snapin_in_update( fd_snapin_in_t * in ) { +fd_snapin_in_update( fd_snapin_tile_t * ctx, + fd_snapin_in_t * in, + ulong const * restrict cons_seq ) { + int const downstream_active = !!ctx->manifest_done; + ulong const downstream_seq = cons_seq[ 0 ]; + ulong const downstream_goff = cons_seq[ 1 ]; + + /* Defend against buggy consumer */ + if( FD_UNLIKELY( fd_seq_gt( downstream_seq, ctx->out_seq ) | + fd_seq_gt( downstream_goff, in->goff ) ) ) { + FD_LOG_CRIT(( "Consumer skipped ahead of me: self=(%lu,%lu) consumer=(%lu,%lu)", + ctx->out_seq, in->goff, + downstream_seq, downstream_goff )); + } + FD_COMPILER_MFENCE(); FD_VOLATILE( in->fseq[0] ) = in->seq; - FD_VOLATILE( in->fseq[1] ) = in->goff; + if( !downstream_active ) { + /* Initially, just send this tile's progress */ + FD_VOLATILE( in->fseq[1] ) = in->goff; + } else { + /* Once downstream tiles are active, forward backpressure signals */ + FD_VOLATILE( in->fseq[1] ) = downstream_goff; + } FD_COMPILER_MFENCE(); ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); @@ -834,13 +859,11 @@ fd_snapin_run1( ulong out_cnt, fd_frag_meta_t ** out_mcache, /* [out_cnt] */ ulong * out_depth, /* [out_cnt] */ - ulong * out_seq, /* [out_cnt] */ ulong cons_cnt, ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ - ulong * cons_out, /* [cons_cnt] */ ulong ** cons_fseq, /* [cons_cnt] */ ulong volatile ** restrict cons_slow, /* [cons_cnt] */ - ulong * restrict cons_seq, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [2*cons_cnt] */ long lazy, fd_rng_t * rng ) { @@ -889,15 +912,15 @@ fd_snapin_run1( if( FD_UNLIKELY( !out_mcache[ out_idx ] ) ) FD_LOG_ERR(( "NULL out_mcache[%lu]", out_idx )); out_depth[ out_idx ] = fd_mcache_depth( out_mcache[ out_idx ] ); - out_seq[ out_idx ] = 0UL; cr_max = fd_ulong_min( cr_max, out_depth[ out_idx ] ); } for( ulong cons_idx=0UL; cons_idxcons_cnt ) ) 
{ /* in fctl for in in_idx */ + /* Send flow control credits and drain flow control diagnostics. */ ulong in_idx = event_idx - cons_cnt - 1UL; - fd_snapin_in_update( &in[ in_idx ] ); + fd_snapin_in_update( ctx, &in[ in_idx ], cons_seq ); } else { /* event_idx==cons_cnt, housekeeping event */ @@ -968,7 +996,7 @@ fd_snapin_run1( ulong slowest_cons = ULONG_MAX; cr_avail = cr_max; for( ulong cons_idx=0UL; cons_idxout_seq, cons_seq[ cons_idx ] ), 0L ), 0L ); slowest_cons = fd_ulong_if( cons_cr_availout_seq; int consumed_frag = on_stream_frag( ctx, this_in, &meta, &sz ); + ulong const out_seq1 = ctx->out_seq; + ulong const frags_published = out_seq1-out_seq0; + if( FD_UNLIKELY( frags_published>cr_avail ) ) FD_LOG_CRIT(( "frags_published (%lu) > cr_avail (%lu)", frags_published, cr_avail )); + cr_avail -= frags_published; this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz; @@ -1126,12 +1160,10 @@ fd_snapin_run( fd_topo_t * topo, fd_frag_meta_t * out_mcache[ tile->out_cnt ]; ulong out_depth [ tile->out_cnt ]; - ulong out_seq [ tile->out_cnt ]; for( ulong i=0UL; iout_cnt; i++ ) { out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; FD_TEST( out_mcache[ i ] ); out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); - out_seq [ i ] = 0UL; } ulong reliable_cons_cnt = 0UL; @@ -1178,7 +1210,7 @@ fd_snapin_run( fd_topo_t * topo, ushort event_map[ 1+reliable_cons_cnt ]; ulong volatile * cons_slow[ reliable_cons_cnt ]; ulong cons_seq [ reliable_cons_cnt ]; - fd_snapin_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); + fd_snapin_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); } #ifndef FD_TILE_TEST From 37888df1bbb5b5caa58fadf4e9a2d283ba4421f6 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 
2025 09:48:54 +0000 Subject: [PATCH 21/34] Implement ActAlc tile --- .../firedancer-dev/commands/snapshot_load.c | 21 ++- src/discof/restore/fd_actalc_tile.c | 133 ++++++++++++++---- 2 files changed, 121 insertions(+), 33 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 98b828b5f2..ad46e60a9d 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -112,24 +112,31 @@ snapshot_load_topo( config_t * config, } + /* "SnapIn": Snapshot parser tile */ fd_topob_wksp( topo, "SnapIn" ); fd_topo_tile_t * snapin_tile = fd_topob_tile( topo, "SnapIn", "SnapIn", "SnapIn", tile_to_cpu[3], 0, 0 ); snapin_tile->snapin.scratch_sz = (3UL<<30); + /* uncompressed stream -> snapin tile */ + fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* snapin tile -> account frags */ + fd_topob_wksp( topo, "snap_frags" ); + fd_topo_link_t * snap_frags_link = fd_topob_link( topo, "snap_frags", "snap_frags", 512UL, 0UL, 0UL ); + snap_frags_link->dcache_obj_id = snapin_dcache->id; + fd_topob_tile_out( topo, "SnapIn", 0UL, "snap_frags", 0UL ); + + /* "ActAlc": Account allocator tile */ fd_topob_wksp( topo, "ActAlc" ); fd_topo_tile_t * actalc_tile = fd_topob_tile( topo, "ActAlc", "ActAlc", "ActAlc", tile_to_cpu[4], 0, 0 ); fd_topob_tile_uses( topo, actalc_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); actalc_tile->actalc.funk_obj_id = funk_obj->id; - fd_topob_tile_in ( topo, "SnapIn", 0UL, "metric_in", "snap_stream", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - fd_topob_tile_uses( topo, snapin_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + /* account frags -> "ActAlc" tile */ + fd_topob_tile_in( topo, "ActAlc", 0UL, "metric_in", "snap_frags", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); 
fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); - fd_topob_wksp( topo, "snap_frags" ); - fd_topob_link( topo, "snap_frags", "snap_frags", 65536UL, 0UL, 0UL ); - fd_topob_tile_out( topo, "SnapIn", 0UL, "snap_frags", 0UL ); - fd_topob_tile_in ( topo, "ActAlc", 0UL, "metric_in", "snap_frags", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - fd_topob_wksp( topo, "snap_descs" ); fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_no_consumers = 1; fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index 25224ec3d0..4f0fa9f296 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -1,14 +1,28 @@ #include "fd_restore_base.h" #include "../../disco/topo/fd_topo.h" #include "../../disco/metrics/fd_metrics.h" +#include "../../flamenco/runtime/fd_acc_mgr.h" /* FD_ACC_SZ_MAX */ #include "../../flamenco/types/fd_types.h" +#include "../../funk/fd_funk.h" -#define LINK_IN_MAX 2UL +#define LINK_IN_MAX 1UL #define BURST 1UL struct fd_actalc_tile { fd_solana_account_stored_meta_t acc_meta; + /* Stream input */ + + uchar const * in_base; + uchar in_buf[ 16 ]; + ulong in_skip; + + /* Funk database */ + + fd_funk_t funk[1]; + fd_alloc_t * alloc; + uint db_full : 1; + /* Account output */ fd_stream_frag_meta_t * out_mcache; @@ -17,6 +31,13 @@ struct fd_actalc_tile { ulong out_seq; ulong out_cnt; ulong out_depth; + + /* Metrics */ + + struct { + ulong alloc_cnt; + ulong cum_alloc_sz; + } metrics; }; typedef struct fd_actalc_tile fd_actalc_tile_t; @@ -57,6 +78,19 @@ unprivileged_init( fd_topo_t * topo, fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); memset( ctx, 0, sizeof(fd_actalc_tile_t) ); + /* Join stream input */ + + FD_TEST( fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ) ); + ctx->in_base = (uchar const *)topo->workspaces[ 
topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; + ctx->in_skip = 0UL; + + /* Join funk database */ + + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actalc.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + ctx->alloc = fd_funk_alloc( ctx->funk ); + /* Join account output */ ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); @@ -75,14 +109,66 @@ metrics_write( fd_actalc_tile_t * ctx ) { (void)ctx; } -static int +static void +allocate_account( fd_actalc_tile_t * ctx, + ulong seq, + ulong acc_data_sz ) { + (void)seq; + if( FD_UNLIKELY( acc_data_sz > FD_ACC_SZ_MAX ) ) { + FD_LOG_ERR(( "account data size (%lu) exceeds max (%lu) (possible memory corruption?)", acc_data_sz, FD_ACC_SZ_MAX )); + } + + ulong rec_sz = sizeof(fd_account_meta_t)+acc_data_sz; + + void * buf = fd_alloc_malloc( ctx->alloc, 1UL, rec_sz ); + if( FD_UNLIKELY( !buf ) ) { + FD_LOG_WARNING(( "Database full after inserting %lu records totalling %.3f GiB: fd_alloc_malloc(align=1,sz=%lu) failed", + ctx->metrics.alloc_cnt, (double)ctx->metrics.cum_alloc_sz/(double)(1UL<<30), rec_sz )); + ctx->db_full = 1; + return; + } + ctx->metrics.alloc_cnt++; + ctx->metrics.cum_alloc_sz += rec_sz; +} + +#include + +static void on_stream_frag( fd_actalc_tile_t * ctx, - fd_actalc_in_t * in, - fd_stream_frag_meta_t const * frag, - ulong * read_sz ) { - (void)ctx; (void)in; (void)frag; (void)read_sz; - // FD_LOG_NOTICE(( "frag" )); - return 1; + fd_stream_frag_meta_t const * meta ) { + ulong const seq = meta->seq; + ulong const loff = meta->loff; + ulong const sz = meta->sz; + ulong const ctl = meta->ctl; + int const som = fd_frag_meta_ctl_som( ctl ); /* first frag of account? */ + + /* Are we already done with this account? 
*/ + if( FD_UNLIKELY( !som && !ctx->in_skip ) ) return; + + /* Read bytes 8-16 of each account header (data_len) */ + ulong const want_sz = 16UL; + uchar const * frag = ctx->in_base + loff; + + /* Unfragmented fast path */ + if( FD_LIKELY( som && sz>=want_sz ) ) { + ulong const acc_data_sz = FD_LOAD( ulong, frag+8 ); + allocate_account( ctx, seq, acc_data_sz ); + return; + } + + /* Slow path: Recover from fragmentation */ + if( FD_UNLIKELY( ctx->in_skip >= want_sz ) ) FD_LOG_CRIT(( "invariant violation: in_skip (%lu) > want_sz (%lu)", ctx->in_skip, want_sz )); + ulong const chunk0 = ctx->in_skip; + ulong const rem_sz = want_sz-chunk0; + ulong const read_sz = fd_ulong_min( rem_sz, sz ); + if( FD_UNLIKELY( !read_sz ) ) return; + fd_memcpy( ctx->in_buf+chunk0, frag, read_sz ); + ctx->in_skip += read_sz; + if( FD_LIKELY( ctx->in_skip == want_sz ) ) { + ulong const acc_data_sz = FD_LOAD( ulong, ctx->in_buf+8 ); + allocate_account( ctx, seq, acc_data_sz ); + ctx->in_skip = 0UL; + } } /* fd_actalc_in_update gets called periodically synchronize flow control @@ -297,7 +383,7 @@ fd_actalc_run1( /* Check if we are backpressured. 
*/ - if( FD_UNLIKELY( cr_availdb_full || cr_availaccum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz; + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)meta.sz; - if( FD_LIKELY( consumed_frag ) ) { - - ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); - if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { - FD_LOG_ERR(( "Overrun while reading from input %lu", in_seq )); - } - - /* Windup for the next in poll and accumulate diagnostics */ + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu: seq_found=%lu seq_test=%lu", in_seq, seq_found, seq_test )); + } - this_in_seq = fd_seq_inc( this_in_seq, 1UL ); - this_in->seq = this_in_seq; - this_in->goff = meta.goff + meta.sz; - this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + /* Windup for the next in poll and accumulate diagnostics */ - this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->goff = meta.goff + meta.sz; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); - } + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; metric_regime_ticks[1] += housekeeping_ticks; metric_regime_ticks[4] += prefrag_ticks; From 4c3ebd3ee6feddea8e2ce2badcdfe916fe4d0635 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 2025 10:22:36 +0000 Subject: [PATCH 22/34] Improve monitor --- .../firedancer-dev/commands/snapshot_load.c | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index ad46e60a9d..c495745ee6 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -212,13 
+212,20 @@ snapshot_load_cmd_fn( args_t * args, ulong goff_old = 0UL; ulong file_rd_backp_old = 0UL; + ulong file_rd_wait_old = 0UL; ulong snap_in_backp_old = 0UL; ulong snap_in_wait_old = 0UL; ulong actalc_wait_old = 0UL; ulong acc_cnt_old = 0UL; - ulong frag_cnt_old = 0UL; - sleep( 1 ); - FD_LOG_NOTICE(( "---------------backp=(file,snap) busy=(snap,alc )-------------------------------" )); + sleep( 1 ); + puts( "" ); + puts( "Columns:" ); + puts( "- bw: Uncompressed bandwidth" ); + puts( "- backp: Backpressured by downstream tile" ); + puts( "- stall: Waiting on upstream tile" ); + puts( "- acc: Number of accounts" ); + puts( "" ); + puts( "-------------backp=(file,snap) busy=(file,snap,alc )---------------" ); for(;;) { ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); @@ -230,28 +237,32 @@ snapshot_load_cmd_fn( args_t * args, ulong goff = FD_VOLATILE_CONST( snap_in_fseq[ 1 ] ); ulong file_rd_backp = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong file_rd_wait = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + file_rd_backp; ulong snap_in_backp = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + - FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + snap_in_backp; ulong actalc_wait = FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, 
REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); - ulong frag_cnt = FD_VOLATILE_CONST( snap_accs_sync[0] ); ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); - FD_LOG_NOTICE(( "rate=%4.2g GB/s backp=(%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%) acc=%8.3g/s frag=%8.3g/s", - (double)( goff-goff_old )/1e9, - ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, - ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, - ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7, - ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7, - (double)( acc_cnt -acc_cnt_old ), - (double)( frag_cnt-frag_cnt_old ) ) ); + printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", + (double)( goff-goff_old )/1e9, + ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, + ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, + 100-( ( (double)( file_rd_wait -file_rd_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7 ), + (double)( acc_cnt -acc_cnt_old ) ); + fflush( stdout ); goff_old = goff; file_rd_backp_old = file_rd_backp; + file_rd_wait_old = file_rd_wait; snap_in_backp_old = snap_in_backp; snap_in_wait_old = snap_in_wait; actalc_wait_old = actalc_wait; acc_cnt_old = acc_cnt; - frag_cnt_old = frag_cnt; sleep( 1 ); } From fbd2761cc11412dd721930b15da2249df01cccd1 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 2025 11:26:33 +0000 Subject: [PATCH 23/34] progress --- snapload.toml | 3 + .../firedancer-dev/commands/snapshot_load.c | 14 +++- src/disco/topo/fd_topob.c | 8 +- src/discof/restore/fd_actalc_tile.c | 79 +++++++++++++------ src/discof/restore/fd_actcpy_tile.c | 0 src/discof/restore/fd_restore_base.h | 25 +++++- 
src/discof/restore/stream/fd_stream_reader.h | 4 +- src/funk/fd_funk_base.h | 18 +++++ src/funk/fd_funk_filemap.c | 2 +- 9 files changed, 118 insertions(+), 35 deletions(-) delete mode 100644 src/discof/restore/fd_actcpy_tile.c diff --git a/snapload.toml b/snapload.toml index 38945b3cef..bc20a2cf84 100644 --- a/snapload.toml +++ b/snapload.toml @@ -1,6 +1,9 @@ [hugetlbfs] max_page_size = "huge" +[funk] +heap_size_gib = 64 + [log] level_stderr = "INFO" level_logfile = "INFO" diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index c495745ee6..10a33791aa 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -133,10 +133,11 @@ snapshot_load_topo( config_t * config, fd_topob_tile_uses( topo, actalc_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); actalc_tile->actalc.funk_obj_id = funk_obj->id; - /* account frags -> "ActAlc" tile */ + /* account frags -> actalc tile */ fd_topob_tile_in( topo, "ActAlc", 0UL, "metric_in", "snap_frags", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); fd_topob_tile_uses( topo, actalc_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + /* actalc tile -> record pointers */ fd_topob_wksp( topo, "snap_descs" ); fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_no_consumers = 1; fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); @@ -215,6 +216,7 @@ snapshot_load_cmd_fn( args_t * args, ulong file_rd_wait_old = 0UL; ulong snap_in_backp_old = 0UL; ulong snap_in_wait_old = 0UL; + ulong actalc_backp_old = 0UL; ulong actalc_wait_old = 0UL; ulong acc_cnt_old = 0UL; sleep( 1 ); @@ -225,7 +227,7 @@ snapshot_load_cmd_fn( args_t * args, puts( "- stall: Waiting on upstream tile" ); puts( "- acc: Number of accounts" ); puts( "" ); - puts( "-------------backp=(file,snap) busy=(file,snap,alc )---------------" ); + puts( "-------------backp=(file,snap,alc ) busy=(file,snap,alc )---------------" ); for(;;) { 
ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); @@ -244,13 +246,16 @@ snapshot_load_cmd_fn( args_t * args, ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + snap_in_backp; + ulong actalc_backp = FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); ulong actalc_wait = FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + - FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ); + FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + actalc_backp; ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); - printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", + printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", (double)( goff-goff_old )/1e9, ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, + ( (double)( actalc_backp -actalc_backp_old )*ns_per_tick )/1e7, 100-( ( (double)( file_rd_wait -file_rd_wait_old )*ns_per_tick )/1e7 ), 100-( ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7 ), 100-( ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7 ), @@ -261,6 +266,7 @@ snapshot_load_cmd_fn( args_t * args, file_rd_wait_old = file_rd_wait; snap_in_backp_old = snap_in_backp; snap_in_wait_old = snap_in_wait; + actalc_backp_old = actalc_backp; actalc_wait_old = actalc_wait; acc_cnt_old = acc_cnt; sleep( 1 ); diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c 
index d26d4b0949..ae2b6347ae 100644 --- a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -519,13 +519,13 @@ initialize_numa_assignments( fd_topo_t * topo ) { int found_lazy = 0; for( ulong j=0UL; jtile_cnt; j++ ) { fd_topo_tile_t * tile = &topo->tiles[ j ]; - if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx!=ULONG_MAX ) ) { + if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idxworkspaces[ i ].numa_idx = fd_numa_node_idx( tile->cpu_idx ); FD_TEST( topo->workspaces[ i ].numa_idx!=ULONG_MAX ); found_strict = 1; found_lazy = 1; break; - } else if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx==ULONG_MAX ) ) { + } else if( FD_UNLIKELY( tile->tile_obj_id==max_obj && tile->cpu_idx>=FD_TILE_MAX ) ) { topo->workspaces[ i ].numa_idx = 0; found_lazy = 1; break; @@ -536,12 +536,12 @@ initialize_numa_assignments( fd_topo_t * topo ) { for( ulong j=0UL; jtile_cnt; j++ ) { fd_topo_tile_t * tile = &topo->tiles[ j ]; for( ulong k=0UL; kuses_obj_cnt; k++ ) { - if( FD_LIKELY( tile->uses_obj_id[ k ]==max_obj && tile->cpu_idx!=ULONG_MAX ) ) { + if( FD_LIKELY( tile->uses_obj_id[ k ]==max_obj && tile->cpu_idxworkspaces[ i ].numa_idx = fd_numa_node_idx( tile->cpu_idx ); FD_TEST( topo->workspaces[ i ].numa_idx!=ULONG_MAX ); found_lazy = 1; break; - } else if( FD_UNLIKELY( tile->uses_obj_id[ k ]==max_obj ) && tile->cpu_idx==ULONG_MAX ) { + } else if( FD_UNLIKELY( tile->uses_obj_id[ k ]==max_obj ) && tile->cpu_idx>=FD_TILE_MAX ) { topo->workspaces[ i ].numa_idx = 0; found_lazy = 1; /* Don't break, keep looking -- a tile with a CPU assignment diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index 4f0fa9f296..b12ecd924b 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -9,25 +9,27 @@ #define BURST 1UL struct fd_actalc_tile { - fd_solana_account_stored_meta_t acc_meta; - /* Stream input */ uchar const * in_base; - uchar in_buf[ 16 ]; - ulong in_skip; + union { + 
fd_solana_account_hdr_t acc_meta; + uchar in_buf[ 136 ]; + }; + ulong in_skip; + ulong acc_seq0; /* Funk database */ fd_funk_t funk[1]; fd_alloc_t * alloc; + ulong funk_seed; uint db_full : 1; /* Account output */ - fd_stream_frag_meta_t * out_mcache; + fd_account_frag_meta_t * out_mcache; - ulong out_seq_max; ulong out_seq; ulong out_cnt; ulong out_depth; @@ -89,12 +91,12 @@ unprivileged_init( fd_topo_t * topo, if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actalc.funk_obj_id ) ) ) ) { FD_LOG_ERR(( "Failed to join database cache" )); } - ctx->alloc = fd_funk_alloc( ctx->funk ); + ctx->alloc = fd_funk_alloc( ctx->funk ); + ctx->funk_seed = fd_funk_seed( ctx->funk ); /* Join account output */ ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); - ctx->out_seq_max = 0UL; ctx->out_seq = 0UL; ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); } @@ -110,17 +112,19 @@ metrics_write( fd_actalc_tile_t * ctx ) { } static void -allocate_account( fd_actalc_tile_t * ctx, - ulong seq, - ulong acc_data_sz ) { - (void)seq; +allocate_account( fd_actalc_tile_t * ctx, + ulong acc_seq0, + fd_solana_account_hdr_t const * hdr ) { + ulong const acc_data_sz = hdr->meta.data_len; + if( FD_UNLIKELY( acc_data_sz > FD_ACC_SZ_MAX ) ) { FD_LOG_ERR(( "account data size (%lu) exceeds max (%lu) (possible memory corruption?)", acc_data_sz, FD_ACC_SZ_MAX )); } - ulong rec_sz = sizeof(fd_account_meta_t)+acc_data_sz; + /* Allocate account */ - void * buf = fd_alloc_malloc( ctx->alloc, 1UL, rec_sz ); + ulong rec_sz = sizeof(fd_account_meta_t)+acc_data_sz; + void * buf = fd_alloc_malloc( ctx->alloc, 1UL, rec_sz ); if( FD_UNLIKELY( !buf ) ) { FD_LOG_WARNING(( "Database full after inserting %lu records totalling %.3f GiB: fd_alloc_malloc(align=1,sz=%lu) failed", ctx->metrics.alloc_cnt, (double)ctx->metrics.cum_alloc_sz/(double)(1UL<<30), rec_sz )); @@ -129,9 +133,40 @@ allocate_account( fd_actalc_tile_t * ctx, } ctx->metrics.alloc_cnt++; 
ctx->metrics.cum_alloc_sz += rec_sz; -} -#include + /* Calculate funk hash */ + + ulong const funk_hash = fd_funk_rec_key_hash1( hdr->meta.pubkey, FD_FUNK_KEY_TYPE_ACC, ctx->funk_seed ); + + /* Copy account metadata */ + + fd_account_meta_t * meta = buf; + *meta = (fd_account_meta_t) { + .magic = FD_ACCOUNT_META_MAGIC, + .hlen = sizeof(fd_account_meta_t), + .dlen = acc_data_sz, + .slot = hdr->meta.write_version_obsolete, /* ??? */ + .info = { + .lamports = hdr->info.lamports, + .rent_epoch = hdr->info.rent_epoch, + .executable = hdr->info.executable + } + }; + memcpy( meta->info.owner, hdr->info.owner, sizeof(fd_pubkey_t) ); + + /* Publish account descriptor */ + + ulong const rec_gaddr = (ulong)buf - (ulong)ctx->funk->shmem; + fd_mcache_publish_account( + ctx->out_mcache, + ctx->out_depth, + ctx->out_seq, + funk_hash, + rec_gaddr, + acc_seq0 + ); + ctx->out_seq = fd_seq_inc( ctx->out_seq, 1UL ); +} static void on_stream_frag( fd_actalc_tile_t * ctx, @@ -146,17 +181,17 @@ on_stream_frag( fd_actalc_tile_t * ctx, if( FD_UNLIKELY( !som && !ctx->in_skip ) ) return; /* Read bytes 8-16 of each account header (data_len) */ - ulong const want_sz = 16UL; + ulong const want_sz = sizeof(fd_solana_account_hdr_t); uchar const * frag = ctx->in_base + loff; /* Unfragmented fast path */ if( FD_LIKELY( som && sz>=want_sz ) ) { - ulong const acc_data_sz = FD_LOAD( ulong, frag+8 ); - allocate_account( ctx, seq, acc_data_sz ); + allocate_account( ctx, seq, (fd_solana_account_hdr_t const *)frag ); return; } /* Slow path: Recover from fragmentation */ + if( som ) ctx->acc_seq0 = seq; if( FD_UNLIKELY( ctx->in_skip >= want_sz ) ) FD_LOG_CRIT(( "invariant violation: in_skip (%lu) > want_sz (%lu)", ctx->in_skip, want_sz )); ulong const chunk0 = ctx->in_skip; ulong const rem_sz = want_sz-chunk0; @@ -165,8 +200,7 @@ on_stream_frag( fd_actalc_tile_t * ctx, fd_memcpy( ctx->in_buf+chunk0, frag, read_sz ); ctx->in_skip += read_sz; if( FD_LIKELY( ctx->in_skip == want_sz ) ) { - ulong const 
acc_data_sz = FD_LOAD( ulong, ctx->in_buf+8 ); - allocate_account( ctx, seq, acc_data_sz ); + allocate_account( ctx, ctx->acc_seq0, &ctx->acc_meta ); ctx->in_skip = 0UL; } } @@ -271,7 +305,7 @@ fd_actalc_run1( /* housekeeping init */ // if( lazy<=0L ) lazy = fd_tempo_lazy_default( cr_max ); - lazy = 1e3L; + lazy = 10e3L; FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy )); /* Initial event sequence */ @@ -340,7 +374,6 @@ fd_actalc_run1( slowest_cons = fd_ulong_if( cons_cr_availout_seq_max = ctx->out_seq + cr_avail; if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { FD_COMPILER_MFENCE(); diff --git a/src/discof/restore/fd_actcpy_tile.c b/src/discof/restore/fd_actcpy_tile.c deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index 6533d98175..0bb1b5b74b 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -23,8 +23,8 @@ union fd_account_frag_meta { ulong seq; ulong rec_hash; + ulong gaddr; ulong frag_seq; - ulong rec_goff; }; @@ -37,4 +37,27 @@ typedef union fd_account_frag_meta fd_account_frag_meta_t; FD_STATIC_ASSERT( alignof(fd_account_frag_meta_t)==32, abi ); FD_STATIC_ASSERT( sizeof (fd_account_frag_meta_t)==32, abi ); +FD_PROTOTYPES_BEGIN + +static inline void +fd_mcache_publish_account( fd_account_frag_meta_t * mcache, + ulong depth, + ulong seq, + ulong rec_hash, + ulong gaddr, + ulong frag_seq ) { + fd_account_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + meta->rec_hash = rec_hash; + meta->gaddr = gaddr; + meta->frag_seq = frag_seq; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + +FD_PROTOTYPES_END + #endif /* HEADER_fd_src_discof_restore_fd_restore_base_h */ diff --git a/src/discof/restore/stream/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h index e562f1f6a3..e3e2312a91 
100644 --- a/src/discof/restore/stream/fd_stream_reader.h +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -11,11 +11,11 @@ union fd_stream_frag_meta { struct { ulong seq; /* frag sequence number */ - ulong goff; /* stream offset */ - uint sz; ushort unused; ushort ctl; + + ulong goff; /* stream offset */ ulong loff; /* dcache offset */ }; diff --git a/src/funk/fd_funk_base.h b/src/funk/fd_funk_base.h index 3d3decc5ac..866cdc8e0e 100644 --- a/src/funk/fd_funk_base.h +++ b/src/funk/fd_funk_base.h @@ -162,6 +162,24 @@ fd_xxh3_mix16b( ulong i0, ulong i1, return fd_xxh3_mul128_fold64( i0 ^ (s0 + seed), i1 ^ (s1 - seed) ); } +FD_FN_PURE static inline ulong +fd_funk_rec_key_hash1( uchar const key[ 32 ], + ulong rec_type, + ulong seed ) { + seed ^= rec_type; + ulong k0 = FD_LOAD( ulong, key+ 0 ); + ulong k1 = FD_LOAD( ulong, key+ 8 ); + ulong k2 = FD_LOAD( ulong, key+16 ); + ulong k3 = FD_LOAD( ulong, key+24 ); + ulong acc = 32 * 0x9E3779B185EBCA87ULL; + acc += fd_xxh3_mix16b( k0, k1, 0xbe4ba423396cfeb8UL, 0x1cad21f72c81017cUL, seed ); + acc += fd_xxh3_mix16b( k2, k3, 0xdb979083e96dd4deUL, 0x1f67b3b7a4a44072UL, seed ); + acc = acc ^ (acc >> 37); + acc *= 0x165667919E3779F9ULL; + acc = acc ^ (acc >> 32); + return acc; +} + FD_FN_PURE static inline ulong fd_funk_rec_key_hash( fd_funk_rec_key_t const * k, ulong seed ) { diff --git a/src/funk/fd_funk_filemap.c b/src/funk/fd_funk_filemap.c index 1ff9405e9e..61ffd5bda2 100644 --- a/src/funk/fd_funk_filemap.c +++ b/src/funk/fd_funk_filemap.c @@ -180,7 +180,7 @@ fd_funk_open_file( void * ljoin, ulong part_max = fd_wksp_part_max_est( total_sz, 1U<<18U ); if( FD_UNLIKELY( !part_max ) ) { - FD_LOG_WARNING(( "fd_wksp_part_max_est(%lu,64KiB) failed", total_sz )); + FD_LOG_WARNING(( "fd_wksp_part_max_est(%lu,256KiB) failed", total_sz )); munmap( shmem, total_sz ); close( fd ); return NULL; From c8080fc98061e6005123bc7604210957ba00d08c Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 13 May 2025 11:58:49 +0000 Subject: 
[PATCH 24/34] Allocate funk records --- src/discof/restore/fd_actalc_tile.c | 59 ++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index b12ecd924b..d56064f752 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -8,6 +8,11 @@ #define LINK_IN_MAX 1UL #define BURST 1UL +/* The ActAlc tile has the following responsibilities: + - Bump allocate funk records + - Heap allocate account datas + - Copy account data */ + struct fd_actalc_tile { /* Stream input */ @@ -21,10 +26,16 @@ struct fd_actalc_tile { /* Funk database */ - fd_funk_t funk[1]; - fd_alloc_t * alloc; - ulong funk_seed; - uint db_full : 1; + fd_funk_t funk[1]; + void * funk_base; + + fd_alloc_t * alloc; + ulong funk_seed; + + fd_funk_rec_t * rec_next; + ulong rec1_laddr; + + uint db_full : 1; /* Account output */ @@ -91,9 +102,15 @@ unprivileged_init( fd_topo_t * topo, if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actalc.funk_obj_id ) ) ) ) { FD_LOG_ERR(( "Failed to join database cache" )); } + + ctx->funk_base = fd_wksp_containing( ctx->funk->shmem ); ctx->alloc = fd_funk_alloc( ctx->funk ); ctx->funk_seed = fd_funk_seed( ctx->funk ); + fd_funk_rec_map_t const * rec_map = fd_funk_rec_map( ctx->funk ); + ctx->rec_next = rec_map->ele; + ctx->rec1_laddr = (ulong)( rec_map->ele + rec_map->ele_max ); + /* Join account output */ ctx->out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ 0 ] ].mcache ); @@ -123,16 +140,18 @@ allocate_account( fd_actalc_tile_t * ctx, /* Allocate account */ - ulong rec_sz = sizeof(fd_account_meta_t)+acc_data_sz; - void * buf = fd_alloc_malloc( ctx->alloc, 1UL, rec_sz ); + ulong const buf_min = sizeof(fd_account_meta_t)+acc_data_sz; + ulong buf_max = 0UL; + void * const buf = fd_alloc_malloc_at_least( ctx->alloc, 1UL, buf_min, &buf_max ); if( FD_UNLIKELY( !buf ) ) { FD_LOG_WARNING(( "Database full 
after inserting %lu records totalling %.3f GiB: fd_alloc_malloc(align=1,sz=%lu) failed", - ctx->metrics.alloc_cnt, (double)ctx->metrics.cum_alloc_sz/(double)(1UL<<30), rec_sz )); + ctx->metrics.alloc_cnt, (double)ctx->metrics.cum_alloc_sz/(double)(1UL<<30), buf_min )); ctx->db_full = 1; return; } + ulong const buf_gaddr = (ulong)buf - (ulong)ctx->funk_base; ctx->metrics.alloc_cnt++; - ctx->metrics.cum_alloc_sz += rec_sz; + ctx->metrics.cum_alloc_sz += buf_min; /* Calculate funk hash */ @@ -154,9 +173,22 @@ allocate_account( fd_actalc_tile_t * ctx, }; memcpy( meta->info.owner, hdr->info.owner, sizeof(fd_pubkey_t) ); + /* Allocate funk record */ + + fd_funk_rec_t * rec = ctx->rec_next; + ulong const rec_gaddr = (ulong)rec - (ulong)ctx->funk_base; + memset( rec, 0, sizeof(fd_funk_rec_t) ); + + memcpy( rec->pair.key->uc, hdr->meta.pubkey, sizeof(fd_pubkey_t) ); + rec->pair.key->ul[ 4 ] = FD_FUNK_KEY_TYPE_ACC; + + rec->map_hash = funk_hash; + rec->val_sz = (uint)buf_min; + rec->val_max = (uint)buf_max; + rec->val_gaddr = buf_gaddr; + /* Publish account descriptor */ - ulong const rec_gaddr = (ulong)buf - (ulong)ctx->funk->shmem; fd_mcache_publish_account( ctx->out_mcache, ctx->out_depth, @@ -165,7 +197,16 @@ allocate_account( fd_actalc_tile_t * ctx, rec_gaddr, acc_seq0 ); + + /* Wind up for next publish */ + ctx->out_seq = fd_seq_inc( ctx->out_seq, 1UL ); + ctx->rec_next++; + if( FD_UNLIKELY( (ulong)ctx->rec_next >= ctx->rec1_laddr ) ) { + FD_LOG_WARNING(( "Funk record map full" )); + ctx->db_full = 1; + return; + } } static void From 16debe0f5ff68ea8be426b0c2a1462561fa54f2b Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Wed, 14 May 2025 12:16:17 +0000 Subject: [PATCH 25/34] Add ActIdx tile --- snapload.toml | 1 + .../firedancer-dev/commands/snapshot_load.c | 51 +- src/app/firedancer-dev/main.c | 6 +- src/disco/topo/fd_topo.h | 4 + src/discof/restore/Local.mk | 4 +- src/discof/restore/fd_actalc_tile.c | 76 ++- src/discof/restore/fd_actidx_tile.c | 472 
++++++++++++++++++ src/discof/restore/fd_restore_base.h | 2 + src/discof/restore/fd_snapin_tile.c | 9 +- src/discof/restore/test_snapin_tile.c | 8 + 10 files changed, 561 insertions(+), 72 deletions(-) create mode 100644 src/discof/restore/fd_actidx_tile.c diff --git a/snapload.toml b/snapload.toml index bc20a2cf84..2038d361b0 100644 --- a/snapload.toml +++ b/snapload.toml @@ -2,6 +2,7 @@ max_page_size = "huge" [funk] +max_account_records = 100_000_000 heap_size_gib = 64 [log] diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index 10a33791aa..b63a9b6591 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -57,7 +57,7 @@ snapshot_load_topo( config_t * config, static ushort tile_to_cpu[ FD_TILE_MAX ] = {0}; if( args->tile_cpus[0] ) { ulong cpu_cnt = fd_tile_private_cpus_parse( args->tile_cpus, tile_to_cpu ); - if( FD_UNLIKELY( cpu_cnt<4UL ) ) FD_LOG_ERR(( "--tile-cpus specifies %lu CPUs, but need at least 4", cpu_cnt )); + if( FD_UNLIKELY( cpu_cnt<6UL ) ) FD_LOG_ERR(( "--tile-cpus specifies %lu CPUs, but need at least 6", cpu_cnt )); } fd_topob_wksp( topo, "metric_in" ); @@ -142,6 +142,14 @@ snapshot_load_topo( config_t * config, fd_topob_link( topo, "snap_descs", "snap_descs", 512UL, 0UL, 0UL )->permit_no_consumers = 1; fd_topob_tile_out( topo, "ActAlc", 0UL, "snap_descs", 0UL ); + /* "ActIdx": Account indexer tile */ + fd_topob_wksp( topo, "ActIdx" ); + fd_topo_tile_t * actidx_tile = fd_topob_tile( topo, "ActIdx", "ActIdx", "ActIdx", tile_to_cpu[5], 0, 0 ); + actidx_tile->actidx.funk_obj_id = funk_obj->id; + + /* record pointers -> actidx tile */ + fd_topob_tile_in( topo, "ActIdx", 0UL, "metric_in", "snap_descs", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * tile = &topo->tiles[ i ]; fd_topo_configure_tile( tile, config ); @@ -198,18 +206,20 @@ snapshot_load_cmd_fn( args_t * args, double 
ns_per_tick = 1.0/tick_per_ns; fd_topo_run_single_process( topo, 2, config->uid, config->gid, fdctl_tile_run, NULL ); - fd_topo_tile_t * file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; - fd_topo_tile_t * snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; - ulong zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); - fd_topo_tile_t * unzstd_tile = zstd_tile_idx!=ULONG_MAX ? &topo->tiles[ zstd_tile_idx ] : NULL; - fd_topo_tile_t * actalc_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActAlc", 0UL ) ]; - - ulong * snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; - ulong * snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); - ulong volatile * file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); - ulong volatile * snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); - ulong volatile * unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; - ulong volatile * actalc_metrics = fd_metrics_tile( actalc_tile->metrics ); + fd_topo_tile_t * const file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; + fd_topo_tile_t * const snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; + ulong const zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); + fd_topo_tile_t * const unzstd_tile = zstd_tile_idx!=ULONG_MAX ? 
&topo->tiles[ zstd_tile_idx ] : NULL; + fd_topo_tile_t * const actalc_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActAlc", 0UL ) ]; + fd_topo_tile_t * const actidx_tile = &topo->tiles[ fd_topo_find_tile( topo, "ActIdx", 0UL ) ]; + + ulong * const snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; + ulong * const snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); + ulong volatile * const file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); + ulong volatile * const snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); + ulong volatile * const unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; + ulong volatile * const actalc_metrics = fd_metrics_tile( actalc_tile->metrics ); + ulong volatile * const actidx_metrics = fd_metrics_tile( actidx_tile->metrics ); ulong goff_old = 0UL; ulong file_rd_backp_old = 0UL; @@ -218,6 +228,7 @@ snapshot_load_cmd_fn( args_t * args, ulong snap_in_wait_old = 0UL; ulong actalc_backp_old = 0UL; ulong actalc_wait_old = 0UL; + ulong actidx_wait_old = 0UL; ulong acc_cnt_old = 0UL; sleep( 1 ); puts( "" ); @@ -227,7 +238,7 @@ snapshot_load_cmd_fn( args_t * args, puts( "- stall: Waiting on upstream tile" ); puts( "- acc: Number of accounts" ); puts( "" ); - puts( "-------------backp=(file,snap,alc ) busy=(file,snap,alc )---------------" ); + puts( "-------------backp=(file,snap,alc ) busy=(file,snap,alc ,idx )---------------" ); for(;;) { ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); @@ -247,11 +258,15 @@ snapshot_load_cmd_fn( args_t * args, FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + snap_in_backp; ulong actalc_backp = FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); - ulong actalc_wait = 
FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + - FD_VOLATILE_CONST( actalc_metrics [ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + ulong actalc_wait = FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( actalc_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + actalc_backp; + ulong actidx_backp = FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); + ulong actidx_wait = FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( actidx_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + + actidx_backp; ulong acc_cnt = FD_VOLATILE_CONST( snap_accs_sync[1] ); - printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", + printf( "bw=%4.2g GB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%,%3.0f%%) acc=%8.3g/s\n", (double)( goff-goff_old )/1e9, ( (double)( file_rd_backp-file_rd_backp_old )*ns_per_tick )/1e7, ( (double)( snap_in_backp-snap_in_backp_old )*ns_per_tick )/1e7, @@ -259,6 +274,7 @@ snapshot_load_cmd_fn( args_t * args, 100-( ( (double)( file_rd_wait -file_rd_wait_old )*ns_per_tick )/1e7 ), 100-( ( (double)( snap_in_wait -snap_in_wait_old )*ns_per_tick )/1e7 ), 100-( ( (double)( actalc_wait -actalc_wait_old )*ns_per_tick )/1e7 ), + 100-( ( (double)( actidx_wait -actidx_wait_old )*ns_per_tick )/1e7 ), (double)( acc_cnt -acc_cnt_old ) ); fflush( stdout ); goff_old = goff; @@ -268,6 +284,7 @@ snapshot_load_cmd_fn( args_t * args, snap_in_wait_old = snap_in_wait; actalc_backp_old = actalc_backp; actalc_wait_old = actalc_wait; + actidx_wait_old = actidx_wait; acc_cnt_old = acc_cnt; sleep( 1 ); } diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 
72c3aa6fad..0e2bf5728e 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -100,9 +100,10 @@ extern fd_topo_run_tile_t fd_tile_archiver_playback; extern fd_topo_run_tile_t fd_tile_archiver_backtest; extern fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; extern fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn; extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc; -extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActIdx; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -139,9 +140,10 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_archiver_playback, &fd_tile_archiver_backtest, &fd_tile_snapshot_restore_FileRd, + &fd_tile_snapshot_restore_Unzstd, &fd_tile_snapshot_restore_SnapIn, &fd_tile_snapshot_restore_ActAlc, - &fd_tile_snapshot_restore_Unzstd, + &fd_tile_snapshot_restore_ActIdx, NULL, }; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 595474abba..0ed7896ee3 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -441,6 +441,10 @@ typedef struct { ulong funk_obj_id; } actalc; + struct { + ulong funk_obj_id; + } actidx; + }; } fd_topo_tile_t; diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index ea659cac49..61ce3329ae 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -1,7 +1,9 @@ $(call add-objs,fd_filerd_tile,fd_discof) +$(call add-objs,fd_unzstd_tile,fd_discof) $(call add-objs,fd_snapin_tile,fd_discof) $(call add-objs,fd_actalc_tile,fd_discof) -$(call add-objs,fd_unzstd_tile,fd_discof) +$(call add-objs,fd_actidx_tile,fd_discof) $(call add-objs,stream/fd_stream_writer,fd_discof) $(call add-objs,stream/fd_event_map,fd_discof) $(call add-objs,stream/fd_stream_ctx,fd_discof) +$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_util) diff --git a/src/discof/restore/fd_actalc_tile.c 
b/src/discof/restore/fd_actalc_tile.c index d56064f752..b685c480ae 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -5,6 +5,7 @@ #include "../../flamenco/types/fd_types.h" #include "../../funk/fd_funk.h" +#define NAME "ActAlc" #define LINK_IN_MAX 1UL #define BURST 1UL @@ -42,6 +43,7 @@ struct fd_actalc_tile { fd_account_frag_meta_t * out_mcache; ulong out_seq; + ulong out_seq_max; ulong out_cnt; ulong out_depth; @@ -84,8 +86,8 @@ unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `ActAlc` tile" )); - if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `FileRd` has %lu ins, expected 1", tile->in_cnt )); - if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `FileRd` has %lu outs, expected 1", tile->out_cnt )); + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); /* FIXME check link names */ fd_actalc_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); @@ -221,7 +223,7 @@ on_stream_frag( fd_actalc_tile_t * ctx, /* Are we already done with this account? 
*/ if( FD_UNLIKELY( !som && !ctx->in_skip ) ) return; - /* Read bytes 8-16 of each account header (data_len) */ + /* Read account header */ ulong const want_sz = sizeof(fd_solana_account_hdr_t); uchar const * frag = ctx->in_base + loff; @@ -274,13 +276,9 @@ fd_actalc_run1( fd_actalc_tile_t * ctx, ulong in_cnt, fd_actalc_in_t * in, /* [in_cnt] */ - ulong out_cnt, - fd_frag_meta_t ** out_mcache, /* [out_cnt] */ - ulong * out_depth, /* [out_cnt] */ - ulong * out_seq, /* [out_cnt] */ + fd_frag_meta_t * out_mcache, ulong cons_cnt, ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ - ulong * cons_out, /* [cons_cnt] */ ulong ** cons_fseq, /* [cons_cnt] */ ulong volatile ** restrict cons_slow, /* [cons_cnt] */ ulong * restrict cons_seq, /* [cons_cnt] */ @@ -290,9 +288,6 @@ fd_actalc_run1( /* in frag stream state */ ulong in_seq; - /* out flow control state */ - ulong cr_avail; - /* housekeeping state */ ulong event_cnt; ulong event_seq; @@ -320,20 +315,12 @@ fd_actalc_run1( /* out frag stream init */ - cr_avail = 0UL; - ulong const burst = BURST; - ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); - - for( ulong out_idx=0UL; out_idxout_seq, cons_seq[ cons_idx ] ), 0L ), 0L ); + slowest_cons = fd_ulong_if( cons_cr_availout_seq_max = ctx->out_seq + cr_avail; + + if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { + FD_COMPILER_MFENCE(); + (*cons_slow[ slowest_cons ]) += metric_in_backp; + FD_COMPILER_MFENCE(); } during_housekeeping( ctx ); @@ -457,7 +443,7 @@ fd_actalc_run1( /* Check if we are backpressured. 
*/ - if( FD_UNLIKELY( ctx->db_full || cr_availdb_full || ctx->out_seq+burst > ctx->out_seq_max ) ) { metric_backp_cnt += (ulong)!metric_in_backp; metric_in_backp = 1UL; FD_SPIN_PAUSE(); @@ -561,15 +547,9 @@ fd_actalc_run( fd_topo_t * topo, } FD_TEST( polled_in_cnt<=LINK_IN_MAX ); - fd_frag_meta_t * out_mcache[ tile->out_cnt ]; - ulong out_depth [ tile->out_cnt ]; - ulong out_seq [ tile->out_cnt ]; - for( ulong i=0UL; iout_cnt; i++ ) { - out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; - FD_TEST( out_mcache[ i ] ); - out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); - out_seq [ i ] = 0UL; - } + FD_TEST( tile->out_cnt==1UL ); + fd_frag_meta_t * const out_mcache = topo->links[ tile->out_link_id[ 0 ] ].mcache; + FD_TEST( out_mcache ); ulong reliable_cons_cnt = 0UL; ulong cons_out[ FD_TOPO_MAX_LINKS ]; @@ -615,7 +595,7 @@ fd_actalc_run( fd_topo_t * topo, ushort event_map[ 1+reliable_cons_cnt ]; ulong volatile * cons_slow[ reliable_cons_cnt ]; ulong cons_seq [ reliable_cons_cnt ]; - fd_actalc_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); + fd_actalc_run1( ctx, polled_in_cnt, polled_in, out_mcache, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); } #ifndef FD_TILE_TEST diff --git a/src/discof/restore/fd_actidx_tile.c b/src/discof/restore/fd_actidx_tile.c new file mode 100644 index 0000000000..26203d1939 --- /dev/null +++ b/src/discof/restore/fd_actidx_tile.c @@ -0,0 +1,472 @@ +#include "fd_restore_base.h" +#include "../../disco/topo/fd_topo.h" +#include "../../funk/fd_funk.h" + +#define NAME "ActIdx" +#define LINK_IN_MAX 1UL +#define BURST 1UL + +typedef fd_funk_rec_map_shmem_private_chain_t fd_funk_rec_chain_t; + +struct fd_actidx_tile { + /* Stream input */ + + uchar const * in_base; + + /* Funk database */ + + fd_funk_t funk[1]; + void * funk_base; + fd_funk_rec_t * rec0; + 
fd_funk_rec_chain_t * chain0; + ulong chain_mask; +}; + +typedef struct fd_actidx_tile fd_actidx_tile_t; + +struct fd_actidx_in { + fd_account_frag_meta_t const * mcache; + uint depth; + uint idx; + ulong seq; + fd_account_frag_meta_t const * mline; + ulong volatile * restrict fseq; + uint accum[6]; +}; + +typedef struct fd_actidx_in fd_actidx_in_t; + +static ulong +scratch_align( void ) { + return alignof(fd_actidx_tile_t); +} + +static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + return sizeof(fd_actidx_tile_t); +} + +static void +during_housekeeping( fd_actidx_tile_t * ctx ) { + (void)ctx; +} + +static void +metrics_write( fd_actidx_tile_t * ctx ) { + (void)ctx; +} + +static void +on_account_frag( fd_actidx_tile_t * ctx, + fd_account_frag_meta_t const * meta ) { + ulong const chain_mask = ctx->chain_mask; + ulong const rec_hash = meta->rec_hash; + ulong const rec_gaddr = meta->gaddr; + + fd_funk_rec_t * const rec = (fd_funk_rec_t *)( (ulong)ctx->funk_base + rec_gaddr ); + ulong const rec_idx = (ulong)( rec - ctx->rec0 ) / sizeof(fd_funk_rec_t); + ulong const chain_idx = rec_hash & chain_mask; + fd_funk_rec_chain_t * const chain = ctx->chain0 + chain_idx; + + ulong ver_cnt = chain->ver_cnt; + ulong version = fd_funk_rec_map_private_vcnt_ver( ver_cnt ); + ulong ele_cnt = fd_funk_rec_map_private_vcnt_cnt( ver_cnt ); + + uint old_head = chain->head_cidx; + uint new_head = (uint)rec_idx; + rec->map_next = old_head; + + chain->head_cidx = new_head; + chain->ver_cnt = fd_funk_rec_map_private_vcnt( version, ele_cnt+1UL ); +} + +static void +fd_actidx_in_update( fd_actidx_in_t * in ) { + FD_COMPILER_MFENCE(); + FD_VOLATILE( in->fseq[0] ) = in->seq; + FD_COMPILER_MFENCE(); + + ulong volatile * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + + uint * accum = in->accum; + ulong a0 = accum[0]; ulong a1 = accum[1]; ulong a2 = accum[2]; + ulong a3 = accum[3]; ulong a4 = accum[4]; ulong a5 = accum[5]; + FD_COMPILER_MFENCE(); + 
metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; + FD_COMPILER_MFENCE(); + accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; + accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `ActIdx` tile" )); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + //if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + + fd_actidx_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + memset( ctx, 0, sizeof(fd_actidx_tile_t) ); + + /* Join funk database */ + + if( FD_UNLIKELY( !fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->actidx.funk_obj_id ) ) ) ) { + FD_LOG_ERR(( "Failed to join database cache" )); + } + ctx->funk_base = fd_wksp_containing( ctx->funk->shmem ); + + fd_funk_rec_map_t const * rec_map = fd_funk_rec_map( ctx->funk ); + ctx->rec0 = rec_map->ele; + ctx->chain0 = fd_funk_rec_map_shmem_private_chain( rec_map->map, 0UL ); + ctx->chain_mask = rec_map->map->chain_cnt-1UL; +} + +__attribute__((noinline)) static void +fd_actidx_run1( + fd_actidx_tile_t * ctx, + ulong in_cnt, + fd_actidx_in_t * in, /* [in_cnt] */ + ulong out_cnt, + fd_frag_meta_t ** out_mcache, /* [out_cnt] */ + ulong * out_depth, /* [out_cnt] */ + ulong * out_seq, /* [out_cnt] */ + ulong cons_cnt, + ushort * restrict event_map, /* [1+in_cnt+cons_cnt] */ + ulong * cons_out, /* [cons_cnt] */ + ulong ** cons_fseq, /* [cons_cnt] */ + ulong volatile ** restrict cons_slow, /* [cons_cnt] */ + ulong * restrict cons_seq, /* [cons_cnt] */ + long lazy, + fd_rng_t * rng +) { + /* in frag stream state */ + ulong in_seq; + + /* out flow control state */ + ulong cr_avail; + + /* housekeeping state */ + ulong event_cnt; + ulong event_seq; + ulong async_min; + + /* performance 
metrics */ + ulong metric_in_backp; + ulong metric_backp_cnt; + ulong metric_regime_ticks[9]; + + metric_in_backp = 1UL; + metric_backp_cnt = 0UL; + memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); + + /* in frag stream init */ + + in_seq = 0UL; /* First in to poll */ + + ulong min_in_depth = (ulong)LONG_MAX; + for( ulong in_idx=0UL; in_idxmcache->f ); + min_in_depth = fd_ulong_min( min_in_depth, depth ); + } + + /* out frag stream init */ + + cr_avail = 0UL; + + ulong const burst = BURST; + + ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX ); + + for( ulong out_idx=0UL; out_idx=0L ) ) { + ulong event_idx = (ulong)event_map[ event_seq ]; + + if( FD_LIKELY( event_idxcons_cnt ) ) { /* in fctl for in in_idx */ + + /* Send flow control credits and drain flow control diagnostics. */ + ulong in_idx = event_idx - cons_cnt - 1UL; + fd_actidx_in_update( &in[ in_idx ] ); + + } else { /* event_idx==cons_cnt, housekeeping event */ + + /* Update metrics counters to external viewers */ + FD_COMPILER_MFENCE(); + FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now ); + FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp ); + FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt ); + FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks ); + metrics_write( ctx ); + FD_COMPILER_MFENCE(); + metric_backp_cnt = 0UL; + + /* Receive flow control credits */ + if( FD_LIKELY( cr_avail=event_cnt ) ) { + event_seq = 0UL; + + ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); + ushort map_tmp = event_map[ swap_idx ]; + event_map[ swap_idx ] = event_map[ 0 ]; + event_map[ 0 ] = map_tmp; + + if( FD_LIKELY( in_cnt>1UL ) ) { + swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt ); + fd_actidx_in_t in_tmp; + in_tmp = in[ swap_idx ]; + in[ swap_idx ] = in[ 0 ]; + in[ 0 ] = in_tmp; + } + } + + /* Reload housekeeping timer */ + then = now + (long)fd_tempo_async_reload( rng, async_min ); + long next = fd_tickcount(); + housekeeping_ticks = (ulong)(next 
- now); + now = next; + } + + /* Check if we are backpressured. */ + + if( FD_UNLIKELY( cr_avail=in_cnt ) in_seq = 0UL; /* cmov */ + + /* Check if this in has any new fragments to mux */ + + ulong this_in_seq = this_in->seq; + fd_account_frag_meta_t const * this_in_mline = this_in->mline; + + ulong seq_found = fd_frag_meta_seq_query( this_in_mline->f ); + + long diff = fd_seq_diff( this_in_seq, seq_found ); + if( FD_UNLIKELY( diff ) ) { + ulong * housekeeping_regime = &metric_regime_ticks[0]; + ulong * prefrag_regime = &metric_regime_ticks[3]; + ulong * finish_regime = &metric_regime_ticks[6]; + if( FD_UNLIKELY( diff<0L ) ) { + this_in->seq = seq_found; + housekeeping_regime = &metric_regime_ticks[1]; + prefrag_regime = &metric_regime_ticks[4]; + finish_regime = &metric_regime_ticks[7]; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff); + } + + /* Don't bother with spin as polling multiple locations */ + *housekeeping_regime += housekeeping_ticks; + *prefrag_regime += prefrag_ticks; + long next = fd_tickcount(); + *finish_regime += (ulong)(next - now); + now = next; + continue; + } + + FD_COMPILER_MFENCE(); + fd_account_frag_meta_t meta = FD_VOLATILE_CONST( *this_in_mline ); + on_account_frag( ctx, &meta ); + + ulong seq_test = fd_frag_meta_seq_query( this_in_mline->f ); + if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { + FD_LOG_ERR(( "Overrun while reading from input %lu: seq_found=%lu seq_test=%lu", in_seq, seq_found, seq_test )); + } + + /* Windup for the next in poll and accumulate diagnostics */ + + this_in_seq = fd_seq_inc( this_in_seq, 1UL ); + this_in->seq = this_in_seq; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); + + this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++; + + metric_regime_ticks[1] += housekeeping_ticks; + metric_regime_ticks[4] += prefrag_ticks; + long next = 
fd_tickcount(); + metric_regime_ticks[7] += (ulong)(next - now); + now = next; + } +} + +static void +fd_actidx_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_account_frag_meta_t * in_mcache[ LINK_IN_MAX ]; + ulong * in_fseq [ LINK_IN_MAX ]; + + ulong polled_in_cnt = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + in_mcache[ polled_in_cnt ] = fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ); + FD_TEST( in_mcache[ polled_in_cnt ] ); + in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ]; + FD_TEST( in_fseq[ polled_in_cnt ] ); + polled_in_cnt += 1; + } + FD_TEST( polled_in_cnt<=LINK_IN_MAX ); + + fd_frag_meta_t * out_mcache[ tile->out_cnt ]; + ulong out_depth [ tile->out_cnt ]; + ulong out_seq [ tile->out_cnt ]; + for( ulong i=0UL; iout_cnt; i++ ) { + out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache; + FD_TEST( out_mcache[ i ] ); + out_depth [ i ] = fd_mcache_depth( out_mcache[ i ] ); + out_seq [ i ] = 0UL; + } + + ulong reliable_cons_cnt = 0UL; + ulong cons_out[ FD_TOPO_MAX_LINKS ]; + ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; + for( ulong i=0UL; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0UL; jin_cnt; j++ ) { + for( ulong k=0UL; kout_cnt; k++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { + cons_out[ reliable_cons_cnt ] = k; + cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; + FD_TEST( cons_fseq[ reliable_cons_cnt ] ); + reliable_cons_cnt++; + FD_TEST( reliable_cons_cntmcache = in_mcache[ i ]; + this_in->fseq = in_fseq [ i ]; + + ulong depth = fd_mcache_depth( this_in->mcache->f ); + if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", i )); + this_in->depth = (uint)depth; + this_in->idx = (uint)i; + this_in->seq = 0UL; + this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth ); + + 
this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U; + this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U; + } + + fd_actidx_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ushort event_map[ 1+reliable_cons_cnt ]; + ulong volatile * cons_slow[ reliable_cons_cnt ]; + ulong cons_seq [ reliable_cons_cnt ]; + fd_actidx_run1( ctx, polled_in_cnt, polled_in, reliable_cons_cnt, out_mcache, out_depth, out_seq, reliable_cons_cnt, event_map, cons_out, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); +} + +#ifndef FD_TILE_TEST +fd_topo_run_tile_t fd_tile_snapshot_restore_ActIdx = { + .name = "ActIdx", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .unprivileged_init = unprivileged_init, + .run = fd_actidx_run, +}; +#endif diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index 0bb1b5b74b..8851058f85 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -30,6 +30,8 @@ union fd_account_frag_meta { fd_frag_meta_t f[1]; + fd_stream_frag_meta_t acc[1]; + }; typedef union fd_account_frag_meta fd_account_frag_meta_t; diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index d83d0647bd..185b37aaef 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -9,6 +9,7 @@ #include #include +#define NAME "SnapIn" #define LINK_IN_MAX 1UL #define BURST 16UL @@ -578,10 +579,10 @@ scratch_footprint( fd_topo_tile_t const * tile ) { static void unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { - if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `FileRd` tile" )); + if( FD_UNLIKELY( tile->kind_id ) ) FD_LOG_ERR(( "There can only be one `" NAME "` tile" )); - if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `FileRd` has %lu ins, expected 1", tile->in_cnt )); - if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile 
`FileRd` has %lu outs, expected 1", tile->out_cnt )); + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); /* FIXME check link names */ if( FD_UNLIKELY( !tile->snapin.scratch_sz ) ) FD_LOG_ERR(( "scratch_sz param not set" )); @@ -1140,7 +1141,7 @@ fd_snapin_run1( } } -static void +FD_FN_UNUSED static void fd_snapin_run( fd_topo_t * topo, fd_topo_tile_t * tile ) { fd_stream_frag_meta_t * in_mcache[ LINK_IN_MAX ]; diff --git a/src/discof/restore/test_snapin_tile.c b/src/discof/restore/test_snapin_tile.c index 43ff1bb075..9cf0b60274 100644 --- a/src/discof/restore/test_snapin_tile.c +++ b/src/discof/restore/test_snapin_tile.c @@ -1,3 +1,4 @@ +#define FD_TILE_TEST #include "fd_snapin_tile.c" int @@ -12,6 +13,13 @@ main( int argc, fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "Unable to attach to wksp" )); + fd_topo_tile_t topo_tile = { + .name = "snapin", + }; + + uchar * tile_scratch = fd_wksp_alloc_laddr( wksp, scratch_align(), scratch_footprint( &topo_tile ), 1UL ); + FD_TEST( tile_scratch ); + fd_wksp_delete_anonymous( wksp ); FD_LOG_NOTICE(( "pass" )); From 6e00c1b155bc3ee059911c9a18d48180de05c1f3 Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Thu, 15 May 2025 11:55:38 -0500 Subject: [PATCH 26/34] working httpdl tile and more generic stream ctx (#5133) --- .../firedancer-dev/commands/snapshot_load.c | 123 ++++++-- src/app/firedancer-dev/main.c | 4 + src/app/shared/fd_action.h | 1 + src/disco/topo/fd_topo.h | 9 + src/discof/restore/Local.mk | 2 + src/discof/restore/fd_actalc_tile.c | 2 - src/discof/restore/fd_actidx_tile.c | 1 + src/discof/restore/fd_httpdl_tile.c | 180 ++++++++++++ src/discof/restore/fd_restore_base.h | 61 +++- src/discof/restore/fd_snapin_tile.c | 2 - 
src/discof/restore/fd_unzstd_tile.c | 134 ++++----- src/discof/restore/stream/fd_event_map.c | 6 +- src/discof/restore/stream/fd_event_map.h | 14 +- src/discof/restore/stream/fd_stream_ctx.c | 36 +-- src/discof/restore/stream/fd_stream_ctx.h | 267 ++++++++++++++---- src/discof/restore/stream/fd_stream_metrics.h | 33 --- src/discof/restore/stream/fd_stream_reader.h | 48 +--- src/discof/restore/stream/fd_stream_ticks.h | 6 - src/discof/restore/stream/fd_stream_writer.c | 7 +- src/discof/restore/stream/fd_stream_writer.h | 31 +- src/flamenco/snapshot/fd_snapshot_http.c | 24 +- src/flamenco/snapshot/fd_snapshot_http.h | 19 ++ 22 files changed, 690 insertions(+), 320 deletions(-) create mode 100644 src/discof/restore/fd_httpdl_tile.c diff --git a/src/app/firedancer-dev/commands/snapshot_load.c b/src/app/firedancer-dev/commands/snapshot_load.c index b63a9b6591..bc2063f117 100644 --- a/src/app/firedancer-dev/commands/snapshot_load.c +++ b/src/app/firedancer-dev/commands/snapshot_load.c @@ -5,6 +5,7 @@ #include "../../../disco/topo/fd_topob.h" #include "../../../disco/topo/fd_pod_format.h" #include "../../../util/tile/fd_tile_private.h" +#include "../../../flamenco/snapshot/fd_snapshot_loader.h" #include #include #include @@ -39,10 +40,14 @@ _is_zstd( char const * path ) { fclose( file ); return ( magic==0xFD2FB528UL ); } + static void snapshot_load_topo( config_t * config, args_t const * args ) { - int is_zstd = _is_zstd( args->snapshot_load.snapshot_path ); + fd_snapshot_src_t src[1]; + char snapshot_path_copy[4096]; + memcpy( snapshot_path_copy, args->snapshot_load.snapshot_path, sizeof(snapshot_path_copy) ); + fd_snapshot_src_parse_type_unknown( src, snapshot_path_copy ); fd_topo_t * topo = &config->topo; fd_topob_new( &config->topo, config->name ); @@ -64,13 +69,6 @@ snapshot_load_topo( config_t * config, fd_topob_wksp( topo, "metric" ); fd_topob_tile( topo, "metric", "metric", "metric_in", tile_to_cpu[0], 0, 0 ); - /* read() tile */ - fd_topob_wksp( topo, "FileRd" ); 
- fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); - fd_memcpy( filerd_tile->filerd.file_path, args->snapshot_load.snapshot_path, PATH_MAX ); - FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); - FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); - /* Uncompressed data stream */ fd_topob_wksp( topo, "snap_stream" ); fd_topo_link_t * snapin_link = fd_topob_link( topo, "snap_stream", "snap_stream", 512UL, 0UL, 0UL ); @@ -78,7 +76,62 @@ snapshot_load_topo( config_t * config, snapin_link->dcache_obj_id = snapin_dcache->id; FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", snapin_dcache->id ) ); - if( is_zstd ) { /* .tar.zst file */ + if( src->type==FD_SNAPSHOT_SRC_FILE ) { + + int is_zstd = _is_zstd( args->snapshot_load.snapshot_path ); + + /* read() tile */ + fd_topob_wksp( topo, "FileRd" ); + fd_topo_tile_t * filerd_tile = fd_topob_tile( topo, "FileRd", "FileRd", "FileRd", tile_to_cpu[1], 0, 0 ); + fd_memcpy( filerd_tile->filerd.file_path, args->snapshot_load.snapshot_path, PATH_MAX ); + FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==sizeof(args->snapshot_load.snapshot_path), abi ); + FD_STATIC_ASSERT( sizeof(filerd_tile->filerd.file_path)==PATH_MAX, abi ); + + if( is_zstd ) { /* .tar.zst file */ + + /* "unzstd": Zstandard decompress tile */ + fd_topob_wksp( topo, "Unzstd" ); + fd_topo_tile_t * unzstd_tile = fd_topob_tile( topo, "Unzstd", "Unzstd", "Unzstd", tile_to_cpu[2], 0, 0 ); + (void)unzstd_tile; + + /* Compressed data stream */ + fd_topob_wksp( topo, "snap_zstd" ); + fd_topo_link_t * zstd_link = fd_topob_link( topo, "snap_zstd", "snap_zstd", 512UL, 0UL, 0UL ); + fd_topo_obj_t * zstd_dcache = fd_topob_obj( topo, "dcache", "snap_zstd"); + zstd_link->dcache_obj_id = zstd_dcache->id; + FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", zstd_dcache->id ) ); + + /* filerd 
tile -> compressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_zstd", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + /* compressed stream -> unzstd tile */ + fd_topob_tile_in( topo, "Unzstd", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + fd_topob_tile_uses( topo, unzstd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY ); + + /* unzstd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } else { /* .tar file */ + + /* filerd tile -> uncompressed stream */ + fd_topob_tile_out( topo, "FileRd", 0UL, "snap_stream", 0UL ); + fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + + } + } + else if ( src->type==FD_SNAPSHOT_SRC_HTTP ) { + + /* httpdl() tile */ + fd_topob_wksp( topo, "HttpDl" ); + fd_topo_tile_t * httpdl_tile = fd_topob_tile( topo, "HttpDl", "HttpDl", "HttpDl", tile_to_cpu[1], 0, 0 ); + fd_memcpy( httpdl_tile->httpdl.path, src->http.path, PATH_MAX ); + fd_memcpy( httpdl_tile->httpdl.snapshot_dir, args->snapshot_load.snapshot_dir, PATH_MAX ); + fd_memcpy( httpdl_tile->httpdl.dest, src->http.dest, sizeof(src->http.dest) ); + httpdl_tile->httpdl.ip4 = src->http.ip4; + httpdl_tile->httpdl.path_len = src->http.path_len; + httpdl_tile->httpdl.port = src->http.port; /* "unzstd": Zstandard decompress tile */ fd_topob_wksp( topo, "Unzstd" ); @@ -93,8 +146,8 @@ snapshot_load_topo( config_t * config, FD_TEST( fd_pod_insertf_ulong( topo->props, (16UL<<20), "obj.%lu.data_sz", zstd_dcache->id ) ); /* filerd tile -> compressed stream */ - fd_topob_tile_out( topo, "FileRd", 0UL, "snap_zstd", 0UL ); - fd_topob_tile_uses( topo, filerd_tile, zstd_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); + fd_topob_tile_out( topo, "HttpDl", 0UL, "snap_zstd", 0UL ); + fd_topob_tile_uses( topo, httpdl_tile, zstd_dcache /* buffer of snap_zstd, the link HttpDl publishes to */, 
FD_SHMEM_JOIN_MODE_READ_WRITE ); /* compressed stream -> unzstd tile */ fd_topob_tile_in( topo, "Unzstd", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); @@ -103,13 +156,6 @@ snapshot_load_topo( config_t * config, /* unzstd tile -> uncompressed stream */ fd_topob_tile_out( topo, "Unzstd", 0UL, "snap_stream", 0UL ); fd_topob_tile_uses( topo, unzstd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); - - } else { /* .tar file */ - - /* filerd tile -> uncompressed stream */ - fd_topob_tile_out( topo, "FileRd", 0UL, "snap_stream", 0UL ); - fd_topob_tile_uses( topo, filerd_tile, snapin_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE ); - } /* "SnapIn": Snapshot parser tile */ @@ -166,8 +212,9 @@ static void snapshot_load_cmd_args( int * pargc, char *** pargv, args_t * args ) { - char const * tile_cpus = fd_env_strip_cmdline_cstr( pargc, pargv, "--tile-cpus", "FD_TILE_CPUS", NULL ); - char const * snapshot_file = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot", NULL, NULL ); + char const * tile_cpus = fd_env_strip_cmdline_cstr( pargc, pargv, "--tile-cpus", "FD_TILE_CPUS", NULL ); + char const * snapshot_src = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot", NULL, NULL ); + char const * snapshot_dir = fd_env_strip_cmdline_cstr( pargc, pargv, "--snapshot-dir", NULL, NULL ); if( tile_cpus ) { ulong tile_cpus_strlen = strlen( tile_cpus ); @@ -175,10 +222,17 @@ snapshot_load_cmd_args( int * pargc, fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->tile_cpus ), tile_cpus, tile_cpus_strlen ) ); } - if( FD_UNLIKELY( !snapshot_file ) ) FD_LOG_ERR(( "Missing --snapshot flag" )); - ulong snapshot_file_strlen = strlen( snapshot_file ); + if( FD_UNLIKELY( !snapshot_src ) ) FD_LOG_ERR(( "Missing --snapshot flag" )); + ulong snapshot_file_strlen = strlen( snapshot_src ); if( FD_UNLIKELY( snapshot_file_strlen>=sizeof(args->snapshot_load.snapshot_path) ) ) FD_LOG_ERR(( "--snapshot: path too long" )); - fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( 
args->snapshot_load.snapshot_path ), snapshot_file, snapshot_file_strlen ) ); + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->snapshot_load.snapshot_path ), snapshot_src, snapshot_file_strlen ) ); + + /* FIXME: check if we need the snapshot dir argument (parse the snapshot input src to see if it's http)*/ + if( snapshot_dir!=NULL ) { + ulong snapshot_dir_strlen = strlen( snapshot_dir ); + if( FD_UNLIKELY( snapshot_dir_strlen>=sizeof(args->snapshot_load.snapshot_dir) ) ) FD_LOG_ERR(( "--snapshot-dir: dir too long" )); /* bound the dir string's own length, not the --snapshot path length */ + fd_cstr_fini( fd_cstr_append_text( fd_cstr_init( args->snapshot_load.snapshot_dir ), snapshot_dir, snapshot_dir_strlen ) ); + } } static void @@ -206,7 +260,10 @@ snapshot_load_cmd_fn( args_t * args, double ns_per_tick = 1.0/tick_per_ns; fd_topo_run_single_process( topo, 2, config->uid, config->gid, fdctl_tile_run, NULL ); - fd_topo_tile_t * const file_rd_tile = &topo->tiles[ fd_topo_find_tile( topo, "FileRd", 0UL ) ]; + ulong httpdl_tile_idx = fd_topo_find_tile( topo, "HttpDl", 0UL ); + ulong filerd_tile_idx = fd_topo_find_tile( topo, "FileRd", 0UL ); + fd_topo_tile_t * http_dl_tile = httpdl_tile_idx!=ULONG_MAX ? &topo->tiles[ httpdl_tile_idx ] : NULL; + fd_topo_tile_t * file_rd_tile = filerd_tile_idx!=ULONG_MAX ? &topo->tiles[ filerd_tile_idx ] : NULL; fd_topo_tile_t * const snap_in_tile = &topo->tiles[ fd_topo_find_tile( topo, "SnapIn", 0UL ) ]; ulong const zstd_tile_idx = fd_topo_find_tile( topo, "Unzstd", 0UL ); fd_topo_tile_t * const unzstd_tile = zstd_tile_idx!=ULONG_MAX ? &topo->tiles[ zstd_tile_idx ] : NULL; @@ -215,7 +272,8 @@ snapshot_load_cmd_fn( args_t * args, ulong * const snap_in_fseq = snap_in_tile->in_link_fseq[ 0 ]; ulong * const snap_accs_sync = fd_mcache_seq_laddr( topo->links[ fd_topo_find_link( topo, "snap_frags", 0UL ) ].mcache ); - ulong volatile * const file_rd_metrics = fd_metrics_tile( file_rd_tile->metrics ); + ulong volatile * file_rd_metrics = file_rd_tile ? 
fd_metrics_tile( file_rd_tile->metrics ) : NULL; + ulong volatile * http_dl_metrics = http_dl_tile ? fd_metrics_tile( http_dl_tile->metrics ) : NULL; ulong volatile * const snap_in_metrics = fd_metrics_tile( snap_in_tile->metrics ); ulong volatile * const unzstd_in_metrics = unzstd_tile ? fd_metrics_tile( unzstd_tile->metrics ) : NULL; ulong volatile * const actalc_metrics = fd_metrics_tile( actalc_tile->metrics ); @@ -240,19 +298,22 @@ snapshot_load_cmd_fn( args_t * args, puts( "" ); puts( "-------------backp=(file,snap,alc ) busy=(file,snap,alc ,idx )---------------" ); for(;;) { - ulong filerd_status = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); + ulong filerd_status = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; + ulong httpdl_status = http_dl_metrics ? FD_VOLATILE_CONST( http_dl_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; ulong snapin_status = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ); ulong unzstd_status = unzstd_in_metrics ? FD_VOLATILE_CONST( unzstd_in_metrics[ MIDX( GAUGE, TILE, STATUS ) ] ) : 2UL; - if( FD_UNLIKELY( filerd_status==2UL && unzstd_status==2UL && snapin_status == 2UL ) ) { + if( FD_UNLIKELY( httpdl_status==2UL && filerd_status==2UL && unzstd_status==2UL && snapin_status == 2UL ) ) { FD_LOG_NOTICE(( "Done" )); break; } ulong goff = FD_VOLATILE_CONST( snap_in_fseq[ 1 ] ); - ulong file_rd_backp = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); - ulong file_rd_wait = FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + - FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + - file_rd_backp; + ulong file_rd_backp = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ) : + http_dl_metrics ? 
FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ) : 0UL; + ulong file_rd_wait = file_rd_metrics ? FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( file_rd_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + file_rd_backp : + http_dl_metrics ? FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + + FD_VOLATILE_CONST( http_dl_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + file_rd_backp :0UL; ulong snap_in_backp = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ] ); ulong snap_in_wait = FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] ) + FD_VOLATILE_CONST( snap_in_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] ) + diff --git a/src/app/firedancer-dev/main.c b/src/app/firedancer-dev/main.c index 0e2bf5728e..2e4732223d 100644 --- a/src/app/firedancer-dev/main.c +++ b/src/app/firedancer-dev/main.c @@ -104,6 +104,8 @@ extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; extern fd_topo_run_tile_t fd_tile_snapshot_restore_SnapIn; extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActAlc; extern fd_topo_run_tile_t fd_tile_snapshot_restore_ActIdx; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd; +extern fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl; fd_topo_run_tile_t * TILES[] = { &fd_tile_net, @@ -144,6 +146,8 @@ fd_topo_run_tile_t * TILES[] = { &fd_tile_snapshot_restore_SnapIn, &fd_tile_snapshot_restore_ActAlc, &fd_tile_snapshot_restore_ActIdx, + &fd_tile_snapshot_restore_Unzstd, + &fd_tile_snapshot_restore_HttpDl, NULL, }; diff --git a/src/app/shared/fd_action.h b/src/app/shared/fd_action.h index cd30022936..c111ef0304 100644 --- a/src/app/shared/fd_action.h +++ 
b/src/app/shared/fd_action.h @@ -92,6 +92,7 @@ struct fdctl_args { struct { char snapshot_path[ PATH_MAX ]; + char snapshot_dir[ PATH_MAX ]; } snapshot_load; }; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 0ed7896ee3..38a4aba0bc 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -433,6 +433,15 @@ typedef struct { char file_path[ PATH_MAX ]; } filerd; + struct { + char dest[128]; + uint ip4; + ushort port; + char path[ PATH_MAX ]; + ulong path_len; + char snapshot_dir[ PATH_MAX ]; + } httpdl; + struct { ulong scratch_sz; } snapin; diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index 61ce3329ae..bb1366c801 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -3,6 +3,8 @@ $(call add-objs,fd_unzstd_tile,fd_discof) $(call add-objs,fd_snapin_tile,fd_discof) $(call add-objs,fd_actalc_tile,fd_discof) $(call add-objs,fd_actidx_tile,fd_discof) +$(call add-objs,fd_unzstd_tile,fd_discof) +$(call add-objs,fd_httpdl_tile,fd_discof) $(call add-objs,stream/fd_stream_writer,fd_discof) $(call add-objs,stream/fd_event_map,fd_discof) $(call add-objs,stream/fd_stream_ctx,fd_discof) diff --git a/src/discof/restore/fd_actalc_tile.c b/src/discof/restore/fd_actalc_tile.c index b685c480ae..1f17b0390a 100644 --- a/src/discof/restore/fd_actalc_tile.c +++ b/src/discof/restore/fd_actalc_tile.c @@ -552,14 +552,12 @@ fd_actalc_run( fd_topo_t * topo, FD_TEST( out_mcache ); ulong reliable_cons_cnt = 0UL; - ulong cons_out[ FD_TOPO_MAX_LINKS ]; ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; for( ulong j=0UL; jin_cnt; j++ ) { for( ulong k=0UL; kout_cnt; k++ ) { if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { - cons_out[ reliable_cons_cnt ] = k; cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; FD_TEST( cons_fseq[ reliable_cons_cnt ] ); 
reliable_cons_cnt++; diff --git a/src/discof/restore/fd_actidx_tile.c b/src/discof/restore/fd_actidx_tile.c index 26203d1939..f58fa243c2 100644 --- a/src/discof/restore/fd_actidx_tile.c +++ b/src/discof/restore/fd_actidx_tile.c @@ -1,6 +1,7 @@ #include "fd_restore_base.h" #include "../../disco/topo/fd_topo.h" #include "../../funk/fd_funk.h" +#include "../../disco/metrics/fd_metrics.h" #define NAME "ActIdx" #define LINK_IN_MAX 1UL diff --git a/src/discof/restore/fd_httpdl_tile.c b/src/discof/restore/fd_httpdl_tile.c new file mode 100644 index 0000000000..59bdb483be --- /dev/null +++ b/src/discof/restore/fd_httpdl_tile.c @@ -0,0 +1,180 @@ +#include "../../disco/topo/fd_topo.h" +#include "../../flamenco/snapshot/fd_snapshot_http.h" +#include "stream/fd_stream_writer.h" +#include "stream/fd_stream_ctx.h" +#include + +#define NAME "http" +#define HTTP_CHUNK_SZ (8UL*1024UL*1024UL) /* 8 MiB; parenthesized so the macro always expands as a single operand */ + +struct fd_httpdl_tile { + fd_snapshot_http_t * http; + fd_stream_writer_t * writer; +}; +typedef struct fd_httpdl_tile fd_httpdl_tile_t; + +FD_FN_PURE static ulong +scratch_align( void ) { + return fd_ulong_max( alignof(fd_httpdl_tile_t), + fd_ulong_max( fd_snapshot_http_align(), fd_stream_writer_align() ) ); +} + +FD_FN_PURE static ulong +scratch_footprint( fd_topo_tile_t const * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_httpdl_tile_t), sizeof(fd_httpdl_tile_t) ); + l = FD_LAYOUT_APPEND( l, fd_snapshot_http_align(), fd_snapshot_http_footprint() ); + return FD_LAYOUT_FINI( l, scratch_align() ); +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + fd_httpdl_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_httpdl_tile_t), sizeof(fd_httpdl_tile_t) ); + void * http_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_http_align(), fd_snapshot_http_footprint() ); + + fd_memset( ctx, 0, sizeof(fd_httpdl_tile_t) ); + + if( FD_UNLIKELY( 
!tile->httpdl.dest[0] ) ) { + FD_LOG_ERR(( "http dest not set" )); + } + + /* TODO: is null ok for the name? */ + ctx->http = fd_snapshot_http_new( http_mem, + tile->httpdl.dest, + tile->httpdl.ip4, + tile->httpdl.port, + tile->httpdl.snapshot_dir, + NULL ); + + fd_snapshot_http_privileged_init( ctx->http ); +} + +static void +fd_httpdl_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + + /* There's only one writer */ + ctx->writer = &stream_ctx->writers[0]; + fd_stream_writer_set_read_max( ctx->writer, HTTP_CHUNK_SZ ); +} + +__attribute__((noreturn)) FD_FN_UNUSED static void +fd_httpdl_shutdown( fd_httpdl_tile_t * ctx ) { + fd_snapshot_http_cleanup_fds( ctx->http ); + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + fd_stream_writer_notify_shutdown( ctx->writer ); + FD_COMPILER_MFENCE(); + FD_LOG_WARNING(("Done downloading snapshot")); + + for(;;) pause(); +} + +__attribute__((unused)) static void +after_credit_chunk( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + (void)stream_ctx; + ulong downloaded_sz = 0UL; + + /* Don't do anything if backpressured */ + if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { + return; + } + + for(;;) { + if( downloaded_sz >= HTTP_CHUNK_SZ ) { + fd_stream_writer_publish( ctx->writer, downloaded_sz ); + break; + } + /* get write pointers into dcache buffer */ + uchar * out = fd_stream_writer_get_write_ptr( ctx->writer ); + ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); + ulong sz = 0UL; + + if( dst_max==0 ) { + fd_stream_writer_publish( ctx->writer, downloaded_sz ); + break; + } + + int err = fd_io_istream_snapshot_http_read( ctx->http, out, dst_max, &sz ); + if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); + else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); + + if( sz ) { + fd_stream_writer_advance( ctx->writer, sz ); + downloaded_sz += sz; + } + } +} + 
+__attribute__((unused)) static void +after_credit_stream( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + (void)stream_ctx; + + /* Don't do anything if backpressured */ + if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { + return; + } + + /* get write pointers into dcache buffer */ + uchar * out = fd_stream_writer_get_write_ptr( ctx->writer ); + ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); + ulong sz = 0UL; + + int err = fd_io_istream_snapshot_http_read( ctx->http, out, dst_max, &sz ); + if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); + else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); + + if( sz ) { + fd_stream_writer_advance( ctx->writer, sz ); + fd_stream_writer_publish( ctx->writer, sz ); + } +} + +__attribute__((noinline)) static void +fd_httpdl_run1( + fd_httpdl_tile_t * ctx, + fd_stream_ctx_t * stream_ctx ) { + + FD_LOG_INFO(( "Running httpdl tile" )); + + fd_stream_ctx_run( stream_ctx, + ctx, + fd_httpdl_init_from_stream_ctx, + NULL, + NULL, + NULL, + after_credit_stream, + NULL ); +} + +static void +fd_httpdl_run( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + fd_httpdl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); + ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong out_cnt = tile->out_cnt; + + void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, out_cnt ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, out_cnt ); + fd_httpdl_run1( ctx, stream_ctx ); +} + +fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl = { + .name = "HttpDl", + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .run = fd_httpdl_run, +}; + +#undef NAME + + + diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index 8851058f85..808fe5bf6b 100644 --- 
a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -3,15 +3,32 @@ #include "../../tango/mcache/fd_mcache.h" #include "../../disco/topo/fd_topo.h" -#include "stream/fd_stream_reader.h" -struct fd_stream_frag_meta_ctx { - uchar const * in_buf; - ulong goff_translate; - ulong loff_translate; - ulong in_skip; +/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for + stream I/O. */ + +union fd_stream_frag_meta { + +struct { + + ulong seq; /* frag sequence number */ + uint sz; + ushort unused; + ushort ctl; + + ulong goff; /* stream offset */ + ulong loff; /* dcache offset */ + }; -typedef struct fd_stream_frag_meta_ctx fd_stream_frag_meta_ctx_t; + +fd_frag_meta_t f[1]; + +}; + +typedef union fd_stream_frag_meta fd_stream_frag_meta_t; + +FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); +FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); /* fd_account_frag_meta_t is a variation of fd_frag_meta_t optimized for accounts. */ @@ -39,8 +56,38 @@ typedef union fd_account_frag_meta fd_account_frag_meta_t; FD_STATIC_ASSERT( alignof(fd_account_frag_meta_t)==32, abi ); FD_STATIC_ASSERT( sizeof (fd_account_frag_meta_t)==32, abi ); +/* fd_stream_frag_meta_ctx_t tracks receiving state from a stream */ +struct fd_stream_frag_meta_ctx { + uchar const * in_buf; + ulong goff_translate; + ulong loff_translate; + ulong in_skip; +}; +typedef struct fd_stream_frag_meta_ctx fd_stream_frag_meta_ctx_t; + FD_PROTOTYPES_BEGIN +static inline void +fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, + ulong depth, + ulong seq, + ulong goff, + ulong loff, + ulong sz, + ulong ctl ) { + fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); + FD_COMPILER_MFENCE(); + meta->seq = fd_seq_dec( seq, 1UL ); + FD_COMPILER_MFENCE(); + meta->goff = goff; + meta->sz = (uint)sz; + meta->ctl = (ushort)ctl; + meta->loff = loff; + FD_COMPILER_MFENCE(); + meta->seq = seq; + FD_COMPILER_MFENCE(); +} + static inline void 
fd_mcache_publish_account( fd_account_frag_meta_t * mcache, ulong depth, diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index 185b37aaef..055531e279 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -1168,14 +1168,12 @@ fd_snapin_run( fd_topo_t * topo, } ulong reliable_cons_cnt = 0UL; - ulong cons_out[ FD_TOPO_MAX_LINKS ]; ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; for( ulong i=0UL; itile_cnt; i++ ) { fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; for( ulong j=0UL; jin_cnt; j++ ) { for( ulong k=0UL; kout_cnt; k++ ) { if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { - cons_out[ reliable_cons_cnt ] = k; cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; FD_TEST( cons_fseq[ reliable_cons_cnt ] ); reliable_cons_cnt++; diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 4bd9b352ba..3d0c641d99 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -15,8 +15,8 @@ struct fd_unzstd_tile { fd_stream_frag_meta_ctx_t in_state; /* input mcache context */ fd_zstd_dstream_t * dstream; /* zstd decompress reader */ fd_stream_writer_t * writer; /* stream writer object */ + ulong const volatile * shutdown_signal; }; - typedef struct fd_unzstd_tile fd_unzstd_tile_t; FD_FN_PURE static ulong @@ -30,8 +30,7 @@ scratch_footprint( fd_topo_tile_t const * tile ) { ulong l = FD_LAYOUT_INIT; l = FD_LAYOUT_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); l = FD_LAYOUT_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); - l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint() ); - return l; + return FD_LAYOUT_FINI( l, scratch_align() ); } static void @@ -40,7 +39,6 @@ unprivileged_init( fd_topo_t * topo, FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); 
fd_unzstd_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); void * zstd_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); - void * writer_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint() ); void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] ].dcache_obj_id ) ); FD_TEST( out_dcache ); @@ -49,14 +47,46 @@ unprivileged_init( fd_topo_t * topo, ctx->in_state.in_buf = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; ctx->dstream = fd_zstd_dstream_new( zstd_mem, ZSTD_WINDOW_SZ ); - ctx->writer = fd_stream_writer_new( writer_mem, topo, tile, 0, ZSTD_WINDOW_SZ, 512UL, 2UL ); fd_zstd_dstream_reset( ctx->dstream ); } static void -during_housekeeping( fd_unzstd_tile_t * ctx ) { - (void)ctx; +fd_unzstd_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + + /* There's only one writer */ + ctx->writer = &stream_ctx->writers[0]; + fd_stream_writer_set_read_max( ctx->writer, ZSTD_FRAME_SZ ); + ctx->shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 2; +} + +__attribute__((noreturn)) static void +fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { + FD_MGAUGE_SET( TILE, STATUS, 2UL ); + fd_stream_writer_notify_shutdown( ctx->writer ); + FD_COMPILER_MFENCE(); + + for(;;) pause(); +} + +static void +fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, + fd_unzstd_tile_t * ctx ) { + ulong const in_seq_max = FD_VOLATILE_CONST( *ctx->shutdown_signal ); + if( FD_UNLIKELY( in_seq_max == stream_ctx->in[ 0 ].base.seq && in_seq_max != 0) ) { + FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", + in_seq_max, stream_ctx->in[0].base.seq)); + fd_unzstd_shutdown( ctx ); + } +} + +static void +during_housekeeping( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + fd_unzstd_poll_shutdown( stream_ctx, ctx ); } static int @@ -65,6 +95,12 @@ on_stream_frag( void * _ctx, fd_stream_frag_meta_t const * frag, ulong * sz ) { fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); + + /* Don't do anything if backpressured */ + if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { + return 0; + } + uchar const * chunk0 = ctx->in_state.in_buf + frag->loff; uchar const * chunk_start = chunk0 + ctx->in_state.in_skip; uchar const * chunk_end = chunk0 + frag->sz; @@ -95,7 +131,6 @@ on_stream_frag( void * _ctx, break; } - /* fd_zstd_dstream_read updates chunk_start and out */ int zstd_err = fd_zstd_dstream_read( ctx->dstream, &cur, chunk_end, &out, out_end, NULL ); if( FD_UNLIKELY( zstd_err>0) ) { @@ -138,27 +173,6 @@ fd_unzstd_in_update( fd_stream_reader_t * in ) { accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; } -__attribute__((noreturn)) static void -fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { - FD_MGAUGE_SET( TILE, STATUS, 2UL ); - fd_stream_writer_notify_shutdown( ctx->writer ); - FD_COMPILER_MFENCE(); - - for(;;) pause(); -} - -static void -fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, - fd_unzstd_tile_t * ctx, - ulong const volatile * shutdown_signal ) { - ulong const in_seq_max = FD_VOLATILE_CONST( *shutdown_signal ); - if( FD_UNLIKELY( in_seq_max == stream_ctx->in[ 0 ].base.seq && in_seq_max != 0) ) { - FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", - in_seq_max, stream_ctx->in[0].base.seq)); - fd_unzstd_shutdown( ctx ); - } -} - __attribute__((noinline)) static void fd_unzstd_run1( fd_unzstd_tile_t * ctx, @@ -166,52 +180,14 @@ fd_unzstd_run1( FD_LOG_INFO(( "Running unzstd tile" )); - /* run loop init */ - ulong const volatile * restrict shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 2; - fd_stream_writer_init_flow_control_credits( ctx->writer ); - fd_stream_ctx_init_run_loop( stream_ctx ); - - for(;;) { - if( FD_UNLIKELY( fd_stream_ticks_is_housekeeping_time( stream_ctx->ticks ) ) ) { - ulong event_idx = fd_event_map_get_event( stream_ctx->event_map ); - - if( FD_LIKELY( event_idxcons_cnt ) ) { /* receive credits */ - ulong cons_idx = event_idx; - - /* Receive flow control credits from this out. */ - fd_stream_writer_receive_flow_control_credits( ctx->writer, cons_idx ); - - fd_unzstd_poll_shutdown( stream_ctx, ctx, shutdown_signal ); - - } else if( event_idx>stream_ctx->cons_cnt) { /* send credits */ - ulong in_idx = event_idx - stream_ctx->cons_cnt - 1UL; - fd_unzstd_in_update( &stream_ctx->in[ in_idx ] ); - } - else { /* event_idx==cons_cnt, housekeeping event */ - - /* Update metrics counters to external viewers */ - fd_stream_metrics_update_external( stream_ctx->metrics, - stream_ctx->ticks->now, - NULL, - ctx ); - /* Recalculate flow control credits */ - ulong slowest_cons = ULONG_MAX; - fd_stream_writer_update_flow_control_credits( ctx->writer, - &slowest_cons ); - fd_stream_ctx_update_cons_slow( stream_ctx, - slowest_cons ); - during_housekeeping( ctx ); - } - fd_stream_ctx_housekeeping_advance( stream_ctx ); - } - - /* Check if we are backpressured, otherwise poll */ - if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { - fd_stream_ctx_process_backpressure( stream_ctx ); - } else { - fd_stream_ctx_poll( stream_ctx, ctx, on_stream_frag ); - } - } + fd_stream_ctx_run( stream_ctx, + ctx, + 
fd_unzstd_init_from_stream_ctx, + fd_unzstd_in_update, + during_housekeeping, + NULL, + NULL, + on_stream_frag ); } static void @@ -219,10 +195,10 @@ fd_unzstd_run( fd_topo_t * topo, fd_topo_tile_t * tile ) { fd_unzstd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); - ulong cons_cnt = fd_topo_tile_reliable_consumer_cnt( topo, tile ); + ulong out_cnt = tile->out_cnt; - void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, cons_cnt ) ); - fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, cons_cnt ); + void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, out_cnt ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, out_cnt ); fd_unzstd_run1( ctx, stream_ctx ); } @@ -236,3 +212,5 @@ fd_topo_run_tile_t fd_tile_snapshot_restore_Unzstd = { .run = fd_unzstd_run, }; #endif + +#undef NAME diff --git a/src/discof/restore/stream/fd_event_map.c b/src/discof/restore/stream/fd_event_map.c index 8d02c60df5..9eddf5f0c5 100644 --- a/src/discof/restore/stream/fd_event_map.c +++ b/src/discof/restore/stream/fd_event_map.c @@ -3,7 +3,7 @@ fd_event_map_t * fd_event_map_new( void * mem, ulong in_cnt, - ulong cons_cnt ) { + ulong out_cnt ) { if( FD_UNLIKELY( !mem ) ) { FD_LOG_WARNING(( "NULL mem" )); return NULL; @@ -17,13 +17,13 @@ fd_event_map_new( void * mem, FD_SCRATCH_ALLOC_INIT( l, mem ); fd_event_map_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); - ulong event_cnt = 1UL + in_cnt + cons_cnt; + ulong event_cnt = 1UL + in_cnt + out_cnt; self->event_map = FD_SCRATCH_ALLOC_APPEND( l, alignof(ushort), sizeof(ushort)*event_cnt ); self->event_cnt = event_cnt; self->event_seq = 0UL; /* init event map */ - fd_event_map_init(self, in_cnt, cons_cnt ); + fd_event_map_init(self, in_cnt, out_cnt ); return self; } diff --git 
a/src/discof/restore/stream/fd_event_map.h b/src/discof/restore/stream/fd_event_map.h index eaa89b9739..13b3fc0d08 100644 --- a/src/discof/restore/stream/fd_event_map.h +++ b/src/discof/restore/stream/fd_event_map.h @@ -22,8 +22,8 @@ fd_event_map_align( void ) { FD_FN_CONST static inline ulong fd_event_map_footprint( ulong in_cnt, - ulong cons_cnt ) { - ulong event_cnt = 1UL + in_cnt + cons_cnt; + ulong out_cnt ) { + ulong event_cnt = 1UL + in_cnt + out_cnt; ulong l = FD_LAYOUT_INIT; l = FD_LAYOUT_APPEND(l, alignof(fd_event_map_t), sizeof(fd_event_map_t) ); l = FD_LAYOUT_APPEND(l, alignof(ushort), sizeof(ushort)*event_cnt ); @@ -33,17 +33,17 @@ fd_event_map_footprint( ulong in_cnt, fd_event_map_t * fd_event_map_new( void * mem, ulong in_cnt, - ulong cons_cnt ); + ulong out_cnt ); static inline void fd_event_map_init( fd_event_map_t * map, ulong in_cnt, - ulong cons_cnt ) { + ulong out_cnt ) { ulong idx = 0UL; - map->event_map[ idx++ ] = (ushort)cons_cnt; + map->event_map[ idx++ ] = (ushort)out_cnt; for( ulong in_idx=0UL; in_idxevent_map[ idx++ ] = (ushort)(in_idx+cons_cnt+1UL); - for( ulong cons_idx=0UL; cons_idxevent_map[ idx++ ] = (ushort)(in_idx+out_cnt+1UL); + for( ulong cons_idx=0UL; cons_idxevent_map[ idx++ ] = (ushort)cons_idx; } diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c index 65c54dceb7..b6c95a6f34 100644 --- a/src/discof/restore/stream/fd_stream_ctx.c +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -21,28 +21,19 @@ fd_stream_ctx_init( fd_stream_ctx_t * ctx, ctx->in_ptrs[ i ] = &ctx->in[ i ]; } - /* init cons_fseq */ - ulong cons_idx = 0UL; - for( ulong i=0UL; itile_cnt; i++ ) { - fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; - for( ulong j=0UL; jin_cnt; j++ ) { - for( ulong k=0UL; kout_cnt; k++ ) { - if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) { - ctx->cons_fseq[ cons_idx ] = consumer_tile->in_link_fseq[ j ]; - } - } - } 
+ /* init writers */ + for( ulong i=0UL; iout_cnt; i++ ) { + fd_stream_writer_new( &ctx->writers[i], + topo, + tile, + i, + 512UL, + 2UL ); } fd_stream_ticks_init( ctx->ticks, ctx->event_map->event_cnt, 1e3L ); fd_stream_metrics_init( ctx->metrics ); FD_TEST( fd_rng_join( fd_rng_new( ctx->rng, 0, 0UL ) ) ); - - /* init metrics link for cons_slow */ - cons_idx = 0UL; - for( ; cons_idxcons_cnt; cons_idx++ ) { - ctx->cons_slow[ cons_idx ] = (ulong *)(fd_metrics_link_out( fd_metrics_base_tl, cons_idx ) + FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF); - } } fd_stream_ctx_t * @@ -50,7 +41,7 @@ fd_stream_ctx_new( void * mem, fd_topo_t * topo, fd_topo_tile_t * tile, ulong in_cnt, - ulong cons_cnt ) { + ulong out_cnt ) { if( FD_UNLIKELY( !mem ) ) { FD_LOG_WARNING(( "NULL mem" )); return NULL; @@ -66,14 +57,13 @@ fd_stream_ctx_new( void * mem, self->in = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); self->in_ptrs = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); - self->cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) ); - self->cons_slow = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); - void * event_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, cons_cnt ) ); + void * event_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, out_cnt ) ); + self->writers = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), sizeof(fd_stream_writer_t)*out_cnt ); self->in_cnt = in_cnt; - self->cons_cnt = cons_cnt; + self->out_cnt = out_cnt; - self->event_map = fd_event_map_new( event_map_mem, in_cnt, cons_cnt ); + self->event_map = fd_event_map_new( event_map_mem, in_cnt, out_cnt ); fd_stream_ctx_init( self, topo, tile ); self->in_seq = 0UL; diff --git a/src/discof/restore/stream/fd_stream_ctx.h b/src/discof/restore/stream/fd_stream_ctx.h index 
a8bf3522b6..e5d2f089ab 100644 --- a/src/discof/restore/stream/fd_stream_ctx.h +++ b/src/discof/restore/stream/fd_stream_ctx.h @@ -3,22 +3,42 @@ #include "../../../disco/topo/fd_topo.h" #include "fd_stream_reader.h" +#include "fd_stream_writer.h" #include "fd_event_map.h" #include "fd_stream_ticks.h" #include "fd_stream_metrics.h" +struct fd_stream_ctx; +typedef struct fd_stream_ctx fd_stream_ctx_t; + +typedef void fd_tile_ctx_init_run_loop_fn_t( void * ctx, + fd_stream_ctx_t * stream_ctx ); +typedef void fd_tile_update_in_fn_t( fd_stream_reader_t * reader ); +typedef void fd_tile_housekeeping_fn_t( void * ctx, + fd_stream_ctx_t * stream_ctx ); +typedef void fd_tile_metrics_write_fn_t( void * ctx ); +typedef void fd_tile_run_fn_t( void * ctx, fd_stream_ctx_t * stream_ctx ); +typedef int fd_tile_on_stream_frag_fn_t( void * ctx, + fd_stream_reader_t * reader, + fd_stream_frag_meta_t const * frag, + ulong * sz ); + struct fd_stream_ctx { - fd_stream_reader_t * in; - fd_stream_reader_t ** in_ptrs; - ulong ** cons_fseq; - ulong ** cons_slow; - fd_event_map_t * event_map; - ulong in_cnt; - ulong cons_cnt; - ulong in_seq; - fd_rng_t rng[1]; - fd_stream_ticks_t ticks[1]; - fd_stream_metrics_t metrics[1]; + fd_stream_reader_t * in; + fd_stream_reader_t ** in_ptrs; + fd_event_map_t * event_map; + ulong in_cnt; + ulong out_cnt; + ulong in_seq; + fd_rng_t rng[1]; + fd_stream_ticks_t ticks[1]; + fd_stream_metrics_t metrics[1]; + fd_stream_writer_t * writers; + fd_tile_update_in_fn_t * tile_update_in; + fd_tile_housekeeping_fn_t * tile_housekeeping; + fd_tile_metrics_write_fn_t * tile_metrics_write; + fd_tile_run_fn_t * tile_run; + fd_tile_on_stream_frag_fn_t * tile_on_stream_frag; }; typedef struct fd_stream_ctx fd_stream_ctx_t; @@ -31,14 +51,12 @@ fd_stream_ctx_scratch_align( void ) { FD_FN_PURE static inline ulong fd_stream_ctx_scratch_footprint( ulong in_cnt, - ulong cons_cnt ) { + ulong out_cnt ) { ulong l = FD_LAYOUT_INIT; l = FD_LAYOUT_APPEND( l, alignof(fd_stream_ctx_t), 
sizeof(fd_stream_ctx_t) ); l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); /* in */ l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); /* in_ptrs */ - l = FD_LAYOUT_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) ); /* cons_fseq */ - l = FD_LAYOUT_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); /* cons_slow */ - l = FD_LAYOUT_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, cons_cnt ) ); /* event_map */ + l = FD_LAYOUT_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, out_cnt ) ); /* event_map */ return FD_LAYOUT_FINI( l, fd_stream_ctx_scratch_align() ); } @@ -47,7 +65,7 @@ fd_stream_ctx_new( void * mem, fd_topo_t * topo, fd_topo_tile_t * tile, ulong in_cnt, - ulong cons_cnt ); + ulong out_cnt ); void fd_stream_ctx_init( fd_stream_ctx_t * ctx, @@ -55,19 +73,46 @@ fd_stream_ctx_init( fd_stream_ctx_t * ctx, fd_topo_tile_t * tile ); static inline void -fd_stream_ctx_update_cons_slow( fd_stream_ctx_t * ctx, - ulong slowest_cons ) { -if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) { - FD_COMPILER_MFENCE(); - (*ctx->cons_slow[ slowest_cons ]) += ctx->metrics->in_backp; - FD_COMPILER_MFENCE(); +fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t * tile_init_run_loop, + fd_tile_update_in_fn_t * tile_update_in, + fd_tile_housekeeping_fn_t * tile_housekeeping, + fd_tile_metrics_write_fn_t * tile_metrics_write, + fd_tile_run_fn_t * tile_run, + fd_tile_on_stream_frag_fn_t * tile_on_stream_frag ) { + if( ctx->in_cnt && !tile_update_in ) { + FD_LOG_ERR(( "tile_update_in function cannot be null if there are producers to this tile!" )); } -} -static inline void -fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx ) { + if( ctx->in_cnt && !tile_on_stream_frag ) { + FD_LOG_ERR(( "tile_on_stream_frag function cannot be null if there are producers to this tile!" 
)); + } + + ctx->tile_update_in = tile_update_in; + ctx->tile_housekeeping = tile_housekeeping; + ctx->tile_metrics_write = tile_metrics_write; + ctx->tile_run = tile_run; + ctx->tile_on_stream_frag = tile_on_stream_frag; + FD_MGAUGE_SET( TILE, STATUS, 1UL ); fd_stream_ticks_init_timer( ctx->ticks ); + + for( ulong i=0UL; iout_cnt; i++ ) { + fd_stream_writer_init_flow_control_credits( &ctx->writers[ i ] ); + } + + if( tile_init_run_loop ) { + tile_init_run_loop( tile_ctx, ctx ); + } +} + +static inline void +fd_stream_ctx_update_flow_control_credits( fd_stream_ctx_t * ctx ) { + /* Recalculate flow control credits */ + for( ulong i=0UL; iout_cnt; i++ ) { + fd_stream_writer_update_flow_control_credits( &ctx->writers[i] ); + } } static inline void @@ -84,58 +129,127 @@ fd_stream_ctx_housekeeping_advance( fd_stream_ctx_t * ctx ) { ctx->rng); } +static inline void +fd_stream_ctx_do_housekeeping( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + if( FD_UNLIKELY( fd_stream_ticks_is_housekeeping_time( ctx->ticks ) ) ) { + ulong event_idx = fd_event_map_get_event( ctx->event_map ); + + if( FD_LIKELY( event_idxout_cnt ) ) { /* receive credits */ + ulong out_idx = event_idx; + + /* Receive flow control credits from this out. 
*/ + fd_stream_writer_receive_flow_control_credits( &ctx->writers[ out_idx ] ); + + } else if( event_idx>ctx->out_cnt) { /* send credits */ + ulong in_idx = event_idx - ctx->out_cnt - 1UL; + ctx->tile_update_in( &ctx->in[ in_idx ] ); + + } else { /* event_idx==out_cnt, housekeeping event */ + + /* Update metrics counters to external viewers */ + fd_stream_metrics_update_external( ctx->metrics, + ctx->ticks->now, + ctx->tile_metrics_write, + ctx ); + fd_stream_ctx_update_flow_control_credits( ctx ); + + if( ctx->tile_housekeeping ) { + ctx->tile_housekeeping( tile_ctx, ctx ); + } + } + + fd_stream_ctx_housekeeping_advance( ctx ); + } +} + static inline void fd_stream_ctx_process_backpressure( fd_stream_ctx_t * ctx ) { - fd_stream_metrics_update_backpressure( ctx->metrics, - ctx->ticks->housekeeping_ticks ); - fd_stream_ticks_reload_backpressure( ctx->ticks ); + ctx->metrics->backp_cnt += (ulong)!ctx->metrics->in_backp; + ctx->metrics->in_backp = 1UL; + FD_SPIN_PAUSE(); + ctx->metrics->regime_ticks[2] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[5] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; } -typedef int fd_on_stream_frag_fn_t( void * ctx, - fd_stream_reader_t * reader, - fd_stream_frag_meta_t const * frag, - ulong * sz ); +static inline int +fd_stream_ctx_is_backpressured( fd_stream_ctx_t * ctx ) { + int backpressured = 1UL; + for( ulong i=0UL; iout_cnt; i++ ) { + backpressured &= fd_stream_writer_is_backpressured( &ctx->writers[i] ); + } + return backpressured; +} + +static inline void +fd_stream_ctx_advance_poll_empty( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[0] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[3] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} static inline void -fd_stream_ctx_poll( fd_stream_ctx_t * stream_ctx, - void * ctx, - fd_on_stream_frag_fn_t * on_stream_frag ) { - stream_ctx->metrics->in_backp = 
0UL; - stream_ctx->ticks->prefrag_ticks = 0UL; +fd_stream_ctx_advance_poll( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[1] += ctx->ticks->housekeeping_ticks; + ctx->metrics->regime_ticks[4] += ctx->ticks->prefrag_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[7] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_advance_poll_idle( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[0] += ctx->ticks->housekeeping_ticks; + ctx->metrics->regime_ticks[3] += ctx->ticks->prefrag_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[6] += (ulong)(next - ctx->ticks->now); + ctx->ticks->now = next; +} + +static inline void +fd_stream_ctx_poll( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + ctx->metrics->in_backp = 0UL; + + if( FD_UNLIKELY( !ctx->in_cnt ) ) { + fd_stream_ctx_advance_poll_empty( ctx ); + return; + } + + ctx->ticks->prefrag_ticks = 0UL; /* select input to poll */ - fd_stream_reader_t * this_in = &stream_ctx->in[ stream_ctx->in_seq ]; - stream_ctx->in_seq++; - if( stream_ctx->in_seq>=stream_ctx->in_cnt ) { - stream_ctx->in_seq = 0UL; /* cmov */ + fd_stream_reader_t * this_in = &ctx->in[ ctx->in_seq ]; + ctx->in_seq++; + if( ctx->in_seq>=ctx->in_cnt ) { + ctx->in_seq = 0UL; /* cmov */ } fd_frag_reader_consume_ctx_t consume_ctx; long diff = fd_stream_reader_poll_frag( this_in, - stream_ctx->in_seq, + ctx->in_seq, &consume_ctx ); if( FD_UNLIKELY( diff<0L ) ) { - fd_stream_metrics_update_poll( stream_ctx->metrics, - stream_ctx->ticks->housekeeping_ticks, - stream_ctx->ticks->prefrag_ticks, - &stream_ctx->ticks->now); + /* overrun case, technically impossible with reliable streams */ + fd_stream_ctx_advance_poll( ctx ); fd_stream_reader_process_overrun( this_in, &consume_ctx, diff ); } else if ( FD_UNLIKELY( diff ) ) { - fd_stream_metrics_update_poll_idle( stream_ctx->metrics, - stream_ctx->ticks->housekeeping_ticks, - stream_ctx->ticks->prefrag_ticks, - 
&stream_ctx->ticks->now ); + /* nothing new to poll */ + fd_stream_ctx_advance_poll_idle( ctx ); } else { FD_COMPILER_MFENCE(); ulong sz = 0U; fd_stream_frag_meta_t const * frag = fd_type_pun_const( consume_ctx.mline ); - int consumed_frag = on_stream_frag( ctx, this_in, frag, &sz ); + int consumed_frag = ctx->tile_on_stream_frag( tile_ctx, this_in, frag, &sz ); fd_stream_reader_consume_bytes( this_in, sz ); @@ -144,13 +258,51 @@ fd_stream_ctx_poll( fd_stream_ctx_t * stream_ctx, &consume_ctx ); } - fd_stream_metrics_update_poll( stream_ctx->metrics, - stream_ctx->ticks->housekeeping_ticks, - stream_ctx->ticks->prefrag_ticks, - &stream_ctx->ticks->now ); + fd_stream_ctx_advance_poll( ctx ); } } +static inline void +fd_stream_ctx_run_loop( fd_stream_ctx_t * ctx, + void * tile_ctx ) { + for(;;) { + fd_stream_ctx_do_housekeeping( ctx, tile_ctx ); + + if( FD_UNLIKELY( fd_stream_ctx_is_backpressured( ctx ) ) ) { + fd_stream_ctx_process_backpressure( ctx ); + continue; + } + + /* equivalent of after credit */ + if( ctx->tile_run ) { + ctx->tile_run( tile_ctx, ctx ); + } + + fd_stream_ctx_poll( ctx, tile_ctx ); + } +} + +static inline void +fd_stream_ctx_run( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t * tile_init_run_loop, + fd_tile_update_in_fn_t * tile_update_in, + fd_tile_housekeeping_fn_t * tile_housekeeping, + fd_tile_metrics_write_fn_t * tile_metrics_write, + fd_tile_run_fn_t * tile_run, + fd_tile_on_stream_frag_fn_t * tile_on_stream_frag ) { + fd_stream_ctx_init_run_loop( ctx, + tile_ctx, + tile_init_run_loop, + tile_update_in, + tile_housekeeping, + tile_metrics_write, + tile_run, + tile_on_stream_frag ); + + fd_stream_ctx_run_loop( ctx, tile_ctx ); +} + static inline void * fd_stream_ctx_delete( fd_stream_ctx_t * ctx ) { for( ulong i=0UL; iin_cnt; i++ ) { @@ -158,11 +310,6 @@ fd_stream_ctx_delete( fd_stream_ctx_t * ctx ) { ctx->in_ptrs[ i ] = NULL; } - for( ulong i=0UL; icons_cnt; i++ ) { - ctx->cons_fseq[ i ] = NULL; - 
ctx->cons_slow[ i ] = NULL; - } - fd_event_map_delete( ctx->event_map ); fd_memset(ctx, 0, sizeof(fd_stream_ctx_t) ); return (void *)ctx; diff --git a/src/discof/restore/stream/fd_stream_metrics.h b/src/discof/restore/stream/fd_stream_metrics.h index 031e5500e2..04eb728287 100644 --- a/src/discof/restore/stream/fd_stream_metrics.h +++ b/src/discof/restore/stream/fd_stream_metrics.h @@ -41,39 +41,6 @@ fd_stream_metrics_update_external( fd_stream_metrics_t * metrics, metrics->backp_cnt = 0UL; } -static inline void -fd_stream_metrics_update_backpressure( fd_stream_metrics_t * metrics, - ulong housekeeping_ticks ) { - metrics->backp_cnt += (ulong)!metrics->in_backp; - metrics->in_backp = 1UL; - FD_SPIN_PAUSE(); - metrics->regime_ticks[2] += housekeeping_ticks; -} - -static inline void -fd_stream_metrics_update_poll( fd_stream_metrics_t * metrics, - ulong housekeeping_ticks, - ulong prefrag_ticks, - long * now) { - metrics->regime_ticks[1] += housekeeping_ticks; - metrics->regime_ticks[4] += prefrag_ticks; - long next = fd_tickcount(); - metrics->regime_ticks[7] += (ulong)(next - *now); - *now = next; -} - -static inline void -fd_stream_metrics_update_poll_idle( fd_stream_metrics_t * metrics, - ulong housekeeping_ticks, - ulong prefrag_ticks, - long * now) { - metrics->regime_ticks[0] += housekeeping_ticks; - metrics->regime_ticks[3] += prefrag_ticks; - long next = fd_tickcount(); - metrics->regime_ticks[6] += (ulong)(next - *now); - *now = next; -} - FD_PROTOTYPES_END #endif /* HEADER_fd_src_discof_restore_stream_fd_stream_metrics_h */ diff --git a/src/discof/restore/stream/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h index e3e2312a91..3d9610dcfe 100644 --- a/src/discof/restore/stream/fd_stream_reader.h +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -2,32 +2,7 @@ #define HEADER_fd_src_discof_restore_stream_fd_stream_reader_h #include "fd_frag_reader.h" - -/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for - stream I/O. 
*/ - -union fd_stream_frag_meta { - -struct { - - ulong seq; /* frag sequence number */ - uint sz; - ushort unused; - ushort ctl; - - ulong goff; /* stream offset */ - ulong loff; /* dcache offset */ - -}; - -fd_frag_meta_t f[1]; - -}; - -typedef union fd_stream_frag_meta fd_stream_frag_meta_t; - -FD_STATIC_ASSERT( alignof(fd_stream_frag_meta_t)==32, abi ); -FD_STATIC_ASSERT( sizeof (fd_stream_frag_meta_t)==32, abi ); +#include "../../restore/fd_restore_base.h" struct fd_stream_reader { union { @@ -49,27 +24,6 @@ typedef struct fd_stream_reader fd_stream_reader_t; FD_PROTOTYPES_BEGIN -static inline void -fd_mcache_publish_stream( fd_stream_frag_meta_t * mcache, - ulong depth, - ulong seq, - ulong goff, - ulong loff, - ulong sz, - ulong ctl ) { - fd_stream_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth ); - FD_COMPILER_MFENCE(); - meta->seq = fd_seq_dec( seq, 1UL ); - FD_COMPILER_MFENCE(); - meta->goff = goff; - meta->sz = (uint)sz; - meta->ctl = (ushort)ctl; - meta->loff = loff; - FD_COMPILER_MFENCE(); - meta->seq = seq; - FD_COMPILER_MFENCE(); -} - FD_FN_CONST static inline ulong fd_stream_reader_align( void ) { return alignof(fd_stream_reader_t); diff --git a/src/discof/restore/stream/fd_stream_ticks.h b/src/discof/restore/stream/fd_stream_ticks.h index 9db346cfac..ac181fadb1 100644 --- a/src/discof/restore/stream/fd_stream_ticks.h +++ b/src/discof/restore/stream/fd_stream_ticks.h @@ -46,10 +46,4 @@ fd_stream_ticks_reload_housekeeping( fd_stream_ticks_t * ticks, fd_rng_t * rng ) ticks->now = next; } -static inline void -fd_stream_ticks_reload_backpressure( fd_stream_ticks_t * ticks ) { - long next = fd_tickcount(); - ticks->now = next; -} - #endif /* HEADER_fd_src_discof_restore_stream_fd_stream_ticks_h */ diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index dea9aeb027..69e654d03d 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ 
-7,7 +7,6 @@ fd_stream_writer_new( void * mem, fd_topo_t * topo, fd_topo_tile_t * tile, ulong link_id, - ulong read_max, ulong burst_byte, ulong burst_frag ) { if( FD_UNLIKELY( !mem ) ) { @@ -34,7 +33,7 @@ fd_stream_writer_new( void * mem, self->buf_off = 0UL; self->buf_sz = fd_dcache_data_sz( dcache ); self->goff = 0UL; - self->read_max = read_max; + self->read_max = 0UL; /* this should be set by the tile via fd_stream_writer_set_read_max */ self->stream_off = 0UL; self->goff_start = 0UL; self->out_seq = 0UL; @@ -61,7 +60,9 @@ fd_stream_writer_new( void * mem, for( ulong j=0UL; jin_cnt; j++ ) { if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ link_id ] && consumer_tile->in_link_reliable[ j ] ) ) { self->cons_fseq[ cons_idx ] = consumer_tile->in_link_fseq[ j ]; - if( FD_UNLIKELY( !self->cons_fseq[ cons_idx ] ) ) FD_LOG_ERR(( "NULL cons_fseq[%lu]", cons_idx )); + if( FD_UNLIKELY( !self->cons_fseq[ cons_idx ] ) ) { + FD_LOG_ERR(( "NULL cons_fseq[%lu] for out_link=%lu", cons_idx, tile->out_link_id[ link_id ] )); + } cons_idx++; } } diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index a08bd2a0af..174e515d6b 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -27,8 +27,8 @@ struct fd_stream_writer { ulong cr_frag_avail; /* frags available in the slowest consumer */ ulong cr_byte_max; /* max dcache buffer credits (size of dcache buffer)*/ ulong cr_frag_max; /* max mcache frag credits */ - ulong burst_byte; - ulong burst_frag; + ulong burst_byte; /* dcache backpressure threshold */ + ulong burst_frag; /* mcache backpressure threshold */ ulong cons_cnt; /* number of consumers */ ulong * cons_seq; /* consumer fseq values */ ulong ** cons_fseq; /* consumer fseq pointers */ @@ -60,7 +60,6 @@ fd_stream_writer_new( void * mem, fd_topo_t * topo, fd_topo_tile_t * tile, ulong link_id, - ulong read_max, ulong burst_byte, ulong burst_frag ); @@ 
-73,17 +72,23 @@ fd_stream_writer_init_flow_control_credits( fd_stream_writer_t * writer ) { } static inline void -fd_stream_writer_receive_flow_control_credits( fd_stream_writer_t * writer, - ulong cons_idx) { - FD_COMPILER_MFENCE(); - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][0] ); - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][1] ); - FD_COMPILER_MFENCE(); +fd_stream_writer_set_read_max( fd_stream_writer_t * writer, + ulong read_max ) { + writer->read_max = read_max; } static inline void -fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer, - ulong * slowest_cons_out ) { +fd_stream_writer_receive_flow_control_credits( fd_stream_writer_t * writer ) { + for( ulong i=0UL; icons_cnt; i++ ) { + FD_COMPILER_MFENCE(); + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*i ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][0] ); + writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*i+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][1] ); + FD_COMPILER_MFENCE(); + } +} + +static inline void +fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer ) { ulong slowest_cons = ULONG_MAX; if( FD_LIKELY( writer->cr_byte_availcr_byte_max || writer->cr_frag_availcr_frag_max ) ) { ulong cr_byte_avail = writer->cr_byte_max; @@ -99,10 +104,6 @@ fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer, writer->cr_byte_avail = cr_byte_avail; writer->cr_frag_avail = cr_frag_avail; } - - if( slowest_cons_out ) { - *slowest_cons_out = slowest_cons; - } } static inline ulong diff --git a/src/flamenco/snapshot/fd_snapshot_http.c b/src/flamenco/snapshot/fd_snapshot_http.c index bf62bee2e6..4ddb9a6342 100644 --- a/src/flamenco/snapshot/fd_snapshot_http.c +++ b/src/flamenco/snapshot/fd_snapshot_http.c @@ -122,14 +122,18 @@ fd_snapshot_http_new( void * mem, return this; } -static void +void fd_snapshot_http_cleanup_fds( 
fd_snapshot_http_t * this ) { if( this->snapshot_fd!=-1 ) { - close( this->snapshot_fd ); + if( FD_UNLIKELY( close( this->snapshot_fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } this->snapshot_fd = -1; } if( this->socket_fd!=-1 ) { - close( this->socket_fd ); + if( FD_UNLIKELY( close( this->socket_fd ) ) ) { + FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); + } this->socket_fd = -1; } } @@ -192,6 +196,18 @@ fd_snapshot_http_init( fd_snapshot_http_t * this ) { return 0; } +/* for http tile use */ +void +fd_snapshot_http_privileged_init( fd_snapshot_http_t * this ) { + fd_snapshot_http_init( this ); + + /* open snapshot fd for writing to snapshot file */ + this->snapshot_fd = open( this->snapshot_path, O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR ); + if( this->snapshot_fd<0 ) { + FD_LOG_ERR(( "open(%s) failed (%d-%s)", this->snapshot_path, errno, fd_io_strerror( errno ) )); + } +} + /* fd_snapshot_http_req writes out the request. */ static int @@ -652,6 +668,8 @@ fd_io_istream_snapshot_http_read( void * _this, return fd_snapshot_http_dl( this, dst, dst_max, dst_sz ); case FD_SNAPSHOT_HTTP_STATE_READ: return fd_snapshot_http_read( this, dst, dst_max, dst_sz ); + case FD_SNAPSHOT_HTTP_STATE_DONE: + return 1; } /* Not yet ready to read at this point. 
*/ diff --git a/src/flamenco/snapshot/fd_snapshot_http.h b/src/flamenco/snapshot/fd_snapshot_http.h index 20b36aeea4..481e27b80d 100644 --- a/src/flamenco/snapshot/fd_snapshot_http.h +++ b/src/flamenco/snapshot/fd_snapshot_http.h @@ -98,6 +98,19 @@ struct fd_snapshot_http { typedef struct fd_snapshot_http fd_snapshot_http_t; +FD_FN_PURE static inline ulong +fd_snapshot_http_align( void ) { + return fd_ulong_max( alignof(fd_snapshot_http_t), alignof(fd_snapshot_name_t) ); +} + +FD_FN_PURE static inline ulong +fd_snapshot_http_footprint( void ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_snapshot_http_t), sizeof(fd_snapshot_http_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_snapshot_name_t), sizeof(fd_snapshot_name_t) ); + return FD_LAYOUT_FINI( l, fd_snapshot_http_align() ); +} + fd_snapshot_http_t * fd_snapshot_http_new( void * mem, const char * dst_str, @@ -106,6 +119,9 @@ fd_snapshot_http_new( void * mem, const char * snapshot_dir, fd_snapshot_name_t * name_out ); +void +fd_snapshot_http_privileged_init( fd_snapshot_http_t * this ); + void * fd_snapshot_http_delete( fd_snapshot_http_t * this ); @@ -142,6 +158,9 @@ fd_io_istream_snapshot_http_virtual( fd_snapshot_http_t * this ) { }; } +void +fd_snapshot_http_cleanup_fds( fd_snapshot_http_t * this ); + FD_PROTOTYPES_END #endif /* HEADER_fd_src_flamenco_snapshot_fd_snapshot_http_h */ From d78e7ff91768c489ac0c7d1df6c791ff972bcca4 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Fri, 16 May 2025 03:25:37 +0000 Subject: [PATCH 27/34] remove unnecessary dependency --- src/discof/restore/fd_restore_base.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index 808fe5bf6b..bdf011481f 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -2,7 +2,6 @@ #define HEADER_fd_src_discof_restore_fd_restore_base_h #include "../../tango/mcache/fd_mcache.h" -#include "../../disco/topo/fd_topo.h" 
/* fd_stream_frag_meta_t is a variation of fd_frag_meta_t optimized for stream I/O. */ From 90fe1e6f7eb4cf8abf20b123b73123b5e655e4d4 Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Fri, 16 May 2025 09:01:15 -0500 Subject: [PATCH 28/34] cleaned up filerd tile (#5139) --- src/discof/restore/fd_filerd_tile.c | 306 ++++--------------- src/discof/restore/fd_httpdl_tile.c | 34 ++- src/discof/restore/fd_unzstd_tile.c | 12 +- src/discof/restore/stream/fd_stream_ctx.c | 3 + src/discof/restore/stream/fd_stream_ctx.h | 20 +- src/discof/restore/stream/fd_stream_reader.h | 15 +- 6 files changed, 117 insertions(+), 273 deletions(-) diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c index b3b63b5414..f64b3c93bf 100644 --- a/src/discof/restore/fd_filerd_tile.c +++ b/src/discof/restore/fd_filerd_tile.c @@ -1,4 +1,5 @@ #include "fd_restore_base.h" +#include "stream/fd_stream_ctx.h" #include "../../disco/topo/fd_topo.h" #include "../../disco/metrics/fd_metrics.h" #include @@ -7,18 +8,11 @@ #include #define NAME "FileRd" +#define FILE_READ_MAX 8UL<<20 struct fd_filerd_tile { - int fd; - - uchar * buf; /* dcache */ - ulong buf_base; - ulong buf_off; - ulong buf_sz; - ulong goff; - ulong read_max; - - ulong * out_sync; /* mcache seq sync */ + fd_stream_writer_t * writer; + int fd; }; typedef struct fd_filerd_tile fd_filerd_tile_t; @@ -48,42 +42,29 @@ privileged_init( fd_topo_t * topo, static void unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { - fd_filerd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); - + (void)topo; if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 0", tile->in_cnt )); if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); - - void * out_dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ 0 ] ].dcache_obj_id ) ); - FD_TEST( out_dcache ); - - ctx->buf = out_dcache; - 
ctx->buf_base = (ulong)out_dcache - (ulong)fd_wksp_containing( out_dcache ); - ctx->buf_off = 0UL; - ctx->buf_sz = fd_dcache_data_sz( out_dcache ); - ctx->goff = 0UL; - ctx->read_max = (8UL<<20); - ctx->out_sync = fd_mcache_seq_laddr( topo->links[ tile->out_link_id[ 0 ] ].mcache ); } static void -during_housekeeping( fd_filerd_tile_t * ctx ) { - (void)ctx; -} +fd_filerd_init_from_stream_ctx( void * _ctx, + fd_stream_ctx_t * stream_ctx ) { + fd_filerd_tile_t * ctx = fd_type_pun(_ctx); -static void -metrics_write( fd_filerd_tile_t * ctx ) { - (void)ctx; + /* TODO: this should be a join */ + ctx->writer = &stream_ctx->writers[0]; + fd_stream_writer_set_read_max( ctx->writer, FILE_READ_MAX ); } __attribute__((noreturn)) FD_FN_UNUSED static void -fd_filerd_shutdown( fd_filerd_tile_t * ctx, - ulong seq_final ) { +fd_filerd_shutdown( fd_filerd_tile_t * ctx ) { if( FD_UNLIKELY( close( ctx->fd ) ) ) { FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); } ctx->fd = -1; FD_MGAUGE_SET( TILE, STATUS, 2UL ); - FD_VOLATILE( ctx->out_sync[ 2 ] ) = seq_final; + fd_stream_writer_notify_shutdown( ctx->writer ); FD_COMPILER_MFENCE(); FD_LOG_INFO(( "Reached end of file" )); @@ -91,28 +72,30 @@ fd_filerd_shutdown( fd_filerd_tile_t * ctx, } static void -after_credit( fd_filerd_tile_t * ctx, - fd_stream_frag_meta_t * out_mcache, - ulong const out_depth, - ulong * restrict out_seq, - ulong * restrict cr_frag_avail, - ulong * restrict cr_byte_avail, - int * restrict charge_busy_after ) { - /* Assumes *cr_frag_avail>=2 */ +after_credit( void * _ctx, + fd_stream_ctx_t * stream_ctx, + int * poll_in FD_PARAM_UNUSED ) { + fd_filerd_tile_t * ctx = fd_type_pun(_ctx); + (void)stream_ctx; + + /* technically, this is not needed because fd_stream_ctx_run_loop + checks for backpresure on all outgoing links and there is only one + outgoing link anyways. But, it is added for clarity that + callbacks should handle backpressure for their out links. 
*/ + if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { + return; + } + int fd = ctx->fd; if( FD_UNLIKELY( fd<0 ) ) return; - if( FD_UNLIKELY( ctx->buf_off >= ctx->buf_sz ) ) { - FD_LOG_CRIT(( "Buffer overflow (buf_off=%lu buf_sz=%lu)", ctx->buf_off, ctx->buf_sz )); - } - - ulong const read_max = fd_ulong_min( *cr_byte_avail, ctx->read_max ); - ulong const read_sz = fd_ulong_min( read_max, ctx->buf_sz - ctx->buf_off ); + uchar * out = fd_stream_writer_get_write_ptr( ctx->writer ); + ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); - long res = read( fd, ctx->buf + ctx->buf_off, read_sz ); + long res = read( fd, out, dst_max ); if( FD_UNLIKELY( res<=0L ) ) { if( FD_UNLIKELY( res==0 ) ) { - fd_filerd_shutdown( ctx, out_seq[0] ); + fd_filerd_shutdown( ctx ); return; } if( FD_LIKELY( errno==EAGAIN ) ) return; @@ -121,220 +104,37 @@ after_credit( fd_filerd_tile_t * ctx, } ulong sz = (ulong)res; - cr_byte_avail[0] -= sz; - *charge_busy_after = 1; - - ulong frag_sz = fd_ulong_min( read_sz, sz ); - - ulong loff = ctx->buf_base + ctx->buf_off; - fd_mcache_publish_stream( out_mcache, out_depth, out_seq[0], ctx->goff, loff, frag_sz, 0 ); - out_seq[0] = fd_seq_inc( out_seq[0], 1UL ); - cr_frag_avail[0]--; - ctx->goff += frag_sz; - ctx->buf_off += frag_sz; - if( ctx->buf_off >= ctx->buf_sz ) ctx->buf_off = 0UL; /* cmov */ + if( FD_LIKELY( sz ) ) { + fd_stream_writer_advance( ctx->writer, sz ); + fd_stream_writer_publish( ctx->writer, sz ); + } } -/* run/run1 are a custom run loop based on fd_stem.c. 
*/ - __attribute__((noinline)) static void -fd_filerd_run1( - fd_filerd_tile_t * ctx, - fd_stream_frag_meta_t * out_mcache, - void * out_dcache, - ulong cons_cnt, - ushort * restrict event_map, /* cnt=1+cons_cnt */ - ulong ** restrict cons_fseq, /* cnt= cons_cnt points to each consumer's fseq */ - ulong volatile ** restrict cons_slow, /* cnt= cons_cnt points to 'slow' metrics */ - ulong * restrict cons_seq, /* cnt=2*cons_cnt cache of recent fseq observations */ - long lazy, - fd_rng_t * rng -) { - - /* out flow control state */ - ulong cr_byte_avail; /* byte burst quota */ - ulong cr_frag_avail; /* frag burst quota */ - - /* housekeeping state */ - ulong event_cnt; - ulong event_seq; - ulong async_min; /* min number of ticks between a housekeeping event */ - - /* performance metrics */ - ulong metric_in_backp; - ulong metric_backp_cnt; - ulong metric_regime_ticks[9]; - - metric_in_backp = 1UL; - metric_backp_cnt = 0UL; - memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) ); - - /* out frag stream init */ - - cr_byte_avail = 0UL; - cr_frag_avail = 0UL; - - ulong const out_depth = fd_mcache_depth( out_mcache->f ); - ulong out_seq = 0UL; - - ulong const out_bufsz = fd_dcache_data_sz( out_dcache ); - - ulong const cr_byte_max = out_bufsz; - ulong const cr_frag_max = out_depth; - - ulong const burst_byte = 512UL; /* don't producing frags smaller than this */ - ulong const burst_frag = 2UL; - - for( ulong cons_idx=0UL; cons_idx=0L ) ) { - ulong event_idx = (ulong)event_map[ event_seq ]; - - if( FD_LIKELY( event_idxgoff, cons_seq[ 2*cons_idx+1 ] ), 0L ), 0L ); - slowest_cons = fd_ulong_if( cons_cr_byte_avail=event_cnt ) ) { - event_seq = 0UL; - ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt ); - ushort map_tmp = event_map[ swap_idx ]; - event_map[ swap_idx ] = event_map[ 0 ]; - event_map[ 0 ] = map_tmp; - } - - /* Reload housekeeping timer */ - then = now + (long)fd_tempo_async_reload( rng, async_min ); - long next = fd_tickcount(); - 
housekeeping_ticks = (ulong)(next - now); - now = next; - } - - /* Check if we are backpressured. */ - - if( FD_UNLIKELY( cr_byte_availlinks[ tile->out_link_id[ 0 ] ].mcache ); - FD_TEST( out_mcache ); - - ulong reliable_cons_cnt = 0UL; - ulong * cons_fseq[ FD_TOPO_MAX_LINKS ]; - for( ulong i=0UL; itile_cnt; i++ ) { - fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; - for( ulong j=0UL; jin_cnt; j++ ) { - if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[0] && consumer_tile->in_link_reliable[ j ] ) ) { - cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ]; - FD_TEST( cons_fseq[ reliable_cons_cnt ] ); - reliable_cons_cnt++; - FD_TEST( reliable_cons_cnttile_obj_id ); - ushort event_map[ 1+reliable_cons_cnt ]; - ulong volatile * cons_slow[ reliable_cons_cnt ]; - ulong cons_seq [ 2*reliable_cons_cnt ]; - fd_filerd_run1( ctx, out_mcache, ctx->buf, reliable_cons_cnt, event_map, cons_fseq, cons_slow, cons_seq, (ulong)10e3, rng ); + ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong out_cnt = tile->out_cnt; + + void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, out_cnt ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, out_cnt ); + fd_filerd_run1( ctx, stream_ctx ); } fd_topo_run_tile_t fd_tile_snapshot_restore_FileRd = { diff --git a/src/discof/restore/fd_httpdl_tile.c b/src/discof/restore/fd_httpdl_tile.c index 59bdb483be..6631f4365c 100644 --- a/src/discof/restore/fd_httpdl_tile.c +++ b/src/discof/restore/fd_httpdl_tile.c @@ -4,7 +4,7 @@ #include "stream/fd_stream_ctx.h" #include -#define NAME "http" +#define NAME "HttpDl" #define HTTP_CHUNK_SZ 8 * 1024 * 1024UL struct fd_httpdl_tile { @@ -53,11 +53,20 @@ privileged_init( fd_topo_t * topo, } static void -fd_httpdl_init_from_stream_ctx( void * _ctx, +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile ) { + (void)topo; + if( FD_UNLIKELY( tile->in_cnt !=0UL ) ) FD_LOG_ERR(( "tile `" NAME "` 
has %lu ins, expected 0", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); +} + +static void +fd_httpdl_init_from_stream_ctx( void * _ctx, fd_stream_ctx_t * stream_ctx ) { fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); - /* There's only one writer */ + /* There's only one writer. Since fd_stream_ctx_t owns the + stream writer, we just assign the pointer here. */ ctx->writer = &stream_ctx->writers[0]; fd_stream_writer_set_read_max( ctx->writer, HTTP_CHUNK_SZ ); } @@ -75,7 +84,8 @@ fd_httpdl_shutdown( fd_httpdl_tile_t * ctx ) { __attribute__((unused)) static void after_credit_chunk( void * _ctx, - fd_stream_ctx_t * stream_ctx ) { + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in FD_PARAM_UNUSED ) { fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); (void)stream_ctx; ulong downloaded_sz = 0UL; @@ -113,7 +123,8 @@ after_credit_chunk( void * _ctx, __attribute__((unused)) static void after_credit_stream( void * _ctx, - fd_stream_ctx_t * stream_ctx ) { + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in FD_PARAM_UNUSED ) { fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); (void)stream_ctx; @@ -131,7 +142,7 @@ after_credit_stream( void * _ctx, if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); - if( sz ) { + if( FD_LIKELY( sz ) ) { fd_stream_writer_advance( ctx->writer, sz ); fd_stream_writer_publish( ctx->writer, sz ); } @@ -167,11 +178,12 @@ fd_httpdl_run( fd_topo_t * topo, } fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl = { - .name = "HttpDl", - .scratch_align = scratch_align, - .scratch_footprint = scratch_footprint, - .privileged_init = privileged_init, - .run = fd_httpdl_run, + .name = NAME, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, + .run = fd_httpdl_run, }; #undef NAME diff --git 
a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 3d0c641d99..86d09b3955 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -15,7 +15,6 @@ struct fd_unzstd_tile { fd_stream_frag_meta_ctx_t in_state; /* input mcache context */ fd_zstd_dstream_t * dstream; /* zstd decompress reader */ fd_stream_writer_t * writer; /* stream writer object */ - ulong const volatile * shutdown_signal; }; typedef struct fd_unzstd_tile fd_unzstd_tile_t; @@ -37,6 +36,10 @@ static void unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); + + if( FD_UNLIKELY( tile->in_cnt !=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu ins, expected 1", tile->in_cnt )); + if( FD_UNLIKELY( tile->out_cnt!=1UL ) ) FD_LOG_ERR(( "tile `" NAME "` has %lu outs, expected 1", tile->out_cnt )); + fd_unzstd_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_unzstd_tile_t), sizeof(fd_unzstd_tile_t) ); void * zstd_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_zstd_dstream_align(), fd_zstd_dstream_footprint( ZSTD_WINDOW_SZ ) ); @@ -59,7 +62,6 @@ fd_unzstd_init_from_stream_ctx( void * _ctx, /* There's only one writer */ ctx->writer = &stream_ctx->writers[0]; fd_stream_writer_set_read_max( ctx->writer, ZSTD_FRAME_SZ ); - ctx->shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 2; } __attribute__((noreturn)) static void @@ -74,10 +76,10 @@ fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { static void fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, fd_unzstd_tile_t * ctx ) { - ulong const in_seq_max = FD_VOLATILE_CONST( *ctx->shutdown_signal ); - if( FD_UNLIKELY( in_seq_max == stream_ctx->in[ 0 ].base.seq && in_seq_max != 0) ) { + ulong shutdown_seq = fd_stream_reader_poll_shutdown( stream_ctx->in_ptrs[0] ); + if( FD_UNLIKELY( shutdown_seq ) ) { FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", - in_seq_max, stream_ctx->in[0].base.seq)); + shutdown_seq, stream_ctx->in[0].base.seq)); fd_unzstd_shutdown( ctx ); } } diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c index b6c95a6f34..6d94775035 100644 --- a/src/discof/restore/stream/fd_stream_ctx.c +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -22,6 +22,7 @@ fd_stream_ctx_init( fd_stream_ctx_t * ctx, } /* init writers */ + /* FIXME: make burst_byte and burst_frag configurable */ for( ulong i=0UL; iout_cnt; i++ ) { fd_stream_writer_new( &ctx->writers[i], topo, @@ -33,6 +34,8 @@ fd_stream_ctx_init( fd_stream_ctx_t * ctx, fd_stream_ticks_init( ctx->ticks, ctx->event_map->event_cnt, 1e3L ); fd_stream_metrics_init( ctx->metrics ); + + /* FIXME: rng seed should not be 0 */ FD_TEST( fd_rng_join( fd_rng_new( ctx->rng, 0, 0UL ) ) ); } diff --git a/src/discof/restore/stream/fd_stream_ctx.h b/src/discof/restore/stream/fd_stream_ctx.h index e5d2f089ab..c812703684 100644 --- a/src/discof/restore/stream/fd_stream_ctx.h +++ b/src/discof/restore/stream/fd_stream_ctx.h @@ -17,7 +17,9 @@ typedef void fd_tile_update_in_fn_t( fd_stream_reader_t * reader ); typedef void fd_tile_housekeeping_fn_t( void * ctx, fd_stream_ctx_t * stream_ctx ); typedef void fd_tile_metrics_write_fn_t( void * ctx ); -typedef void fd_tile_run_fn_t( void * ctx, fd_stream_ctx_t * stream_ctx ); +typedef void fd_tile_run_fn_t( void * ctx, + fd_stream_ctx_t * stream_ctx, + int * opt_poll_in ); typedef int fd_tile_on_stream_frag_fn_t( void * ctx, fd_stream_reader_t * reader, fd_stream_frag_meta_t const * frag, @@ -209,6 +211,14 @@ fd_stream_ctx_advance_poll_idle( fd_stream_ctx_t * ctx ) { ctx->ticks->now = next; } +static inline void +fd_stream_ctx_advance_skip_poll( fd_stream_ctx_t * ctx ) { + ctx->metrics->regime_ticks[1] += ctx->ticks->housekeeping_ticks; + long next = fd_tickcount(); + ctx->metrics->regime_ticks[4] += (ulong)(next - ctx->ticks->now); + 
ctx->ticks->now = next; +} + static inline void fd_stream_ctx_poll( fd_stream_ctx_t * ctx, void * tile_ctx ) { @@ -275,7 +285,13 @@ fd_stream_ctx_run_loop( fd_stream_ctx_t * ctx, /* equivalent of after credit */ if( ctx->tile_run ) { - ctx->tile_run( tile_ctx, ctx ); + int poll_in = 1; + ctx->tile_run( tile_ctx, ctx, &poll_in ); + + if( FD_UNLIKELY( !poll_in ) ) { + fd_stream_ctx_advance_skip_poll( ctx ); + continue; + } } fd_stream_ctx_poll( ctx, tile_ctx ); diff --git a/src/discof/restore/stream/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h index 3d9610dcfe..8b322594f5 100644 --- a/src/discof/restore/stream/fd_stream_reader.h +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -18,7 +18,9 @@ struct fd_stream_reader { fd_frag_reader_t r[1]; } base; - ulong goff; + + ulong goff; + ulong const volatile * shutdown_signal; }; typedef struct fd_stream_reader fd_stream_reader_t; @@ -35,12 +37,14 @@ fd_stream_reader_footprint( void ) { } static inline void -fd_stream_reader_init( fd_stream_reader_t * reader, +fd_stream_reader_init( fd_stream_reader_t * reader, fd_frag_meta_t const * mcache, ulong * fseq, ulong in_idx ) { fd_frag_reader_init( reader->base.r, mcache, fseq, in_idx ); reader->goff = 0UL; + /* shutdown signal is located at fseq 2 */ + reader->shutdown_signal = fd_mcache_seq_laddr_const( reader->base.mcache->f ) + 2; } static inline fd_stream_reader_t * @@ -93,6 +97,13 @@ fd_stream_reader_consume_frag( fd_stream_reader_t * reader, fd_frag_reader_consume_frag( reader->base.r, ctx ); } +static inline ulong +fd_stream_reader_poll_shutdown( fd_stream_reader_t * reader ) { + ulong const in_seq_max = FD_VOLATILE_CONST( *reader->shutdown_signal ); + return in_seq_max == reader->base.seq && in_seq_max != 0 ? 
+ in_seq_max : 0UL; +} + static inline void * fd_stream_reader_delete( fd_stream_reader_t * reader ) { fd_frag_reader_delete( reader->base.r ); From 79715211c90eb1d21077ee5322d8d37fd1fa344a Mon Sep 17 00:00:00 2001 From: cali-jumptrading Date: Fri, 16 May 2025 15:37:20 +0000 Subject: [PATCH 29/34] rename fd_stream_writer fields, add stream writer join api --- src/discof/restore/fd_filerd_tile.c | 5 ++- src/discof/restore/fd_httpdl_tile.c | 7 ++--- src/discof/restore/fd_unzstd_tile.c | 6 ++-- src/discof/restore/stream/fd_stream_writer.c | 21 +++++++------ src/discof/restore/stream/fd_stream_writer.h | 32 ++++++++++++++------ 5 files changed, 41 insertions(+), 30 deletions(-) diff --git a/src/discof/restore/fd_filerd_tile.c b/src/discof/restore/fd_filerd_tile.c index f64b3c93bf..23fc148d84 100644 --- a/src/discof/restore/fd_filerd_tile.c +++ b/src/discof/restore/fd_filerd_tile.c @@ -52,9 +52,8 @@ fd_filerd_init_from_stream_ctx( void * _ctx, fd_stream_ctx_t * stream_ctx ) { fd_filerd_tile_t * ctx = fd_type_pun(_ctx); - /* TODO: this should be a join */ - ctx->writer = &stream_ctx->writers[0]; - fd_stream_writer_set_read_max( ctx->writer, FILE_READ_MAX ); + ctx->writer = fd_stream_writer_join( &stream_ctx->writers[0] ); + fd_stream_writer_set_frag_sz_max( ctx->writer, FILE_READ_MAX ); } __attribute__((noreturn)) FD_FN_UNUSED static void diff --git a/src/discof/restore/fd_httpdl_tile.c b/src/discof/restore/fd_httpdl_tile.c index 6631f4365c..0a90af60f3 100644 --- a/src/discof/restore/fd_httpdl_tile.c +++ b/src/discof/restore/fd_httpdl_tile.c @@ -65,10 +65,9 @@ fd_httpdl_init_from_stream_ctx( void * _ctx, fd_stream_ctx_t * stream_ctx ) { fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); - /* There's only one writer. Since fd_stream_ctx_t owns the - stream writer, we just assign the pointer here. 
*/ - ctx->writer = &stream_ctx->writers[0]; - fd_stream_writer_set_read_max( ctx->writer, HTTP_CHUNK_SZ ); + /* join writer */ + ctx->writer = fd_stream_writer_join( &stream_ctx->writers[0] ); + fd_stream_writer_set_frag_sz_max( ctx->writer, HTTP_CHUNK_SZ ); } __attribute__((noreturn)) FD_FN_UNUSED static void diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 86d09b3955..d1f44bbdef 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -59,9 +59,9 @@ fd_unzstd_init_from_stream_ctx( void * _ctx, fd_stream_ctx_t * stream_ctx ) { fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); - /* There's only one writer */ - ctx->writer = &stream_ctx->writers[0]; - fd_stream_writer_set_read_max( ctx->writer, ZSTD_FRAME_SZ ); + /* join writer */ + ctx->writer = fd_stream_writer_join( &stream_ctx->writers[0] ); + fd_stream_writer_set_frag_sz_max( ctx->writer, ZSTD_FRAME_SZ ); } __attribute__((noreturn)) static void diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index 69e654d03d..cdc28b1131 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -27,16 +27,17 @@ fd_stream_writer_new( void * mem, fd_stream_frag_meta_t * out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ link_id ] ].mcache ); ulong cons_cnt = fd_topo_link_reliable_consumer_cnt( topo, link ); - self->out_mcache = out_mcache; - self->buf = dcache; - self->buf_base = (ulong)dcache - (ulong)fd_wksp_containing( dcache ); - self->buf_off = 0UL; - self->buf_sz = fd_dcache_data_sz( dcache ); - self->goff = 0UL; - self->read_max = 0UL; /* this should be set by the tile via fd_stream_writer_set_read_max */ - self->stream_off = 0UL; - self->goff_start = 0UL; - self->out_seq = 0UL; + self->magic = FD_STREAM_WRITER_MAGIC; + self->out_mcache = out_mcache; + self->dcache = dcache; + self->base = (ulong)dcache - (ulong)fd_wksp_containing( 
dcache ); + self->buf_off = 0UL; + self->buf_sz = fd_dcache_data_sz( dcache ); + self->goff = 0UL; + self->frag_sz_max = 0UL; /* this should be set by the tile via fd_stream_writer_set_frag_sz_max */ + self->stream_off = 0UL; + self->goff_start = 0UL; + self->out_seq = 0UL; /* Set up flow control state */ self->cr_byte_avail = 0UL; diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index 174e515d6b..5099427b74 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -8,16 +8,17 @@ /* A shared stream has a single producer and multiple consumers. fd_stream_writer implements the producer APIs of the shared stream */ struct fd_stream_writer { + ulong magic; /* magic */ fd_stream_frag_meta_t * out_mcache; /* frag producer mcache */ - uchar * buf; /* laddr of shared dcache buffer */ - ulong buf_base; /* offset to the dcache buffer from wksp */ + uchar * dcache; /* laddr of shared dcache buffer */ + ulong base; /* offset to the dcache buffer from wksp */ /* dcache buffer state */ ulong buf_off; /* local write offset into dcache buffer */ ulong buf_sz; /* dcache buffer size */ ulong goff; /* global offset into byte stream */ - ulong read_max; /* max chunk size */ + ulong frag_sz_max; /* max frag size (controls the size of a single write into dcache)*/ ulong stream_off; /* start of published stream */ ulong goff_start; /* start of goff in stream */ ulong out_seq; /* current sequence number */ @@ -36,6 +37,7 @@ struct fd_stream_writer { }; typedef struct fd_stream_writer fd_stream_writer_t; +#define FD_STREAM_WRITER_MAGIC (0xFD57337717E736C0UL) #define EXPECTED_FSEQ_CNT_PER_CONS 2 FD_PROTOTYPES_BEGIN @@ -50,9 +52,19 @@ fd_stream_writer_footprint( void ) { return sizeof(fd_stream_writer_t); } +static inline fd_stream_writer_t * +fd_stream_writer_join( void * _writer ) { + fd_stream_writer_t * writer = (fd_stream_writer_t *)_writer; + if( FD_UNLIKELY( !writer ) ) return 
NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)writer, fd_stream_writer_align() ) ) ) return NULL; + if( FD_UNLIKELY( writer->magic!=FD_STREAM_WRITER_MAGIC ) ) return NULL; + + return writer; +} + static inline uchar * fd_stream_writer_get_write_ptr( fd_stream_writer_t * writer ) { - return writer->buf + writer->buf_off; + return writer->dcache + writer->buf_off; } fd_stream_writer_t * @@ -72,9 +84,9 @@ fd_stream_writer_init_flow_control_credits( fd_stream_writer_t * writer ) { } static inline void -fd_stream_writer_set_read_max( fd_stream_writer_t * writer, - ulong read_max ) { - writer->read_max = read_max; +fd_stream_writer_set_frag_sz_max( fd_stream_writer_t * writer, + ulong frag_sz_max ) { + writer->frag_sz_max = frag_sz_max; } static inline void @@ -113,14 +125,14 @@ fd_stream_writer_get_avail_bytes( fd_stream_writer_t * writer ) { return 0; } - ulong const read_max = fd_ulong_min( writer->cr_byte_avail, writer->read_max ); - return fd_ulong_min( read_max, writer->buf_sz - writer->buf_off ); + ulong const frag_sz_max = fd_ulong_min( writer->cr_byte_avail, writer->frag_sz_max ); + return fd_ulong_min( frag_sz_max, writer->buf_sz - writer->buf_off ); } static inline void fd_stream_writer_publish( fd_stream_writer_t * writer, ulong frag_sz ) { - ulong loff = writer->buf_base + writer->stream_off; + ulong loff = writer->base + writer->stream_off; fd_mcache_publish_stream( writer->out_mcache, fd_mcache_depth( writer->out_mcache->f ), writer->out_seq, From b0199849f6d934d02c5dd8695816311ee10c5fbc Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Fri, 16 May 2025 18:49:05 +0000 Subject: [PATCH 30/34] belt sanding stream_writer --- src/disco/topo/fd_topo.h | 6 +- src/discof/restore/Local.mk | 2 +- src/discof/restore/fd_httpdl_tile.c | 75 ++-- src/discof/restore/fd_restore_base.h | 18 +- src/discof/restore/fd_snapin_tile.c | 32 +- src/discof/restore/fd_unzstd_tile.c | 79 ++--- src/discof/restore/stream/fd_stream_ctx.c | 118 ++++--- 
src/discof/restore/stream/fd_stream_ctx.h | 124 ++++--- src/discof/restore/stream/fd_stream_writer.c | 185 +++++++--- src/discof/restore/stream/fd_stream_writer.h | 338 ++++++++++++------- src/discof/restore/test_snapin_tile.c | 82 ++++- 11 files changed, 656 insertions(+), 403 deletions(-) diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index 38a4aba0bc..6a693b14d8 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -89,7 +89,7 @@ typedef struct { All input links will be automatically polled by the tile infrastructure, and output links will automatically source and manage credits from consumers. */ -typedef struct { +struct fd_topo_tile { ulong id; /* The ID of this tile. Indexed from [0, tile_cnt). When placed in a topology, the ID must be the index of the tile in the tiles list. */ char name[ 7UL ]; /* The name of this tile. There can be multiple of each tile name in a topology. */ ulong kind_id; /* The ID of this tile within its name. If there are n tile of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. 
*/ @@ -455,7 +455,9 @@ typedef struct { } actidx; }; -} fd_topo_tile_t; +}; + +typedef struct fd_topo_tile fd_topo_tile_t; typedef struct { ulong id; diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index bb1366c801..386678881e 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -8,4 +8,4 @@ $(call add-objs,fd_httpdl_tile,fd_discof) $(call add-objs,stream/fd_stream_writer,fd_discof) $(call add-objs,stream/fd_event_map,fd_discof) $(call add-objs,stream/fd_stream_ctx,fd_discof) -$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_util) +$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_tango fd_util) diff --git a/src/discof/restore/fd_httpdl_tile.c b/src/discof/restore/fd_httpdl_tile.c index 59bdb483be..71cdb5deea 100644 --- a/src/discof/restore/fd_httpdl_tile.c +++ b/src/discof/restore/fd_httpdl_tile.c @@ -58,15 +58,15 @@ fd_httpdl_init_from_stream_ctx( void * _ctx, fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); /* There's only one writer */ - ctx->writer = &stream_ctx->writers[0]; - fd_stream_writer_set_read_max( ctx->writer, HTTP_CHUNK_SZ ); + ctx->writer = stream_ctx->writers[0]; + fd_stream_writer_set_frag_sz_max( ctx->writer, HTTP_CHUNK_SZ ); } __attribute__((noreturn)) FD_FN_UNUSED static void fd_httpdl_shutdown( fd_httpdl_tile_t * ctx ) { fd_snapshot_http_cleanup_fds( ctx->http ); FD_MGAUGE_SET( TILE, STATUS, 2UL ); - fd_stream_writer_notify_shutdown( ctx->writer ); + fd_stream_writer_close( ctx->writer ); FD_COMPILER_MFENCE(); FD_LOG_WARNING(("Done downloading snapshot")); @@ -76,39 +76,23 @@ fd_httpdl_shutdown( fd_httpdl_tile_t * ctx ) { __attribute__((unused)) static void after_credit_chunk( void * _ctx, fd_stream_ctx_t * stream_ctx ) { - fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); + fd_httpdl_tile_t * ctx = _ctx; (void)stream_ctx; - ulong downloaded_sz = 0UL; - /* Don't do anything if backpressured */ - if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { - 
return; - } + /* Output */ + uchar * const out = fd_stream_writer_prepare( ctx->writer ); + uchar * const out_end = out + fd_stream_writer_publish_sz_max( ctx->writer ); + uchar * out_cur = out; - for(;;) { - if( downloaded_sz >= HTTP_CHUNK_SZ ) { - fd_stream_writer_publish( ctx->writer, downloaded_sz ); - break; - } - /* get write pointers into dcache buffer */ - uchar * out = fd_stream_writer_get_write_ptr( ctx->writer ); - ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); - ulong sz = 0UL; - - if( dst_max==0 ) { - fd_stream_writer_publish( ctx->writer, downloaded_sz ); - break; - } - - int err = fd_io_istream_snapshot_http_read( ctx->http, out, dst_max, &sz ); + while( out_curhttp, out_cur, (ulong)out_cur-(ulong)out, &chunk_sz ); if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", err )); - - if( sz ) { - fd_stream_writer_advance( ctx->writer, sz ); - downloaded_sz += sz; - } + out_cur += chunk_sz; } + + fd_stream_writer_publish( ctx->writer, (ulong)out_cur-(ulong)out, 0UL ); } __attribute__((unused)) static void @@ -117,24 +101,16 @@ after_credit_stream( void * _ctx, fd_httpdl_tile_t * ctx = fd_type_pun(_ctx); (void)stream_ctx; - /* Don't do anything if backpressured */ - if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { - return; - } - - /* get write pointers into dcache buffer */ - uchar * out = fd_stream_writer_get_write_ptr( ctx->writer ); - ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); - ulong sz = 0UL; + /* Output */ + uchar * const out = fd_stream_writer_prepare( ctx->writer ); + ulong const out_max = fd_stream_writer_publish_sz_max( ctx->writer ); - int err = fd_io_istream_snapshot_http_read( ctx->http, out, dst_max, &sz ); + ulong chunk_sz; + int err = fd_io_istream_snapshot_http_read( ctx->http, out, out_max, &chunk_sz ); if( FD_UNLIKELY( err==1 ) ) fd_httpdl_shutdown( ctx ); else if( FD_UNLIKELY( err ) ) FD_LOG_ERR(( "http err: %d", 
err )); - if( sz ) { - fd_stream_writer_advance( ctx->writer, sz ); - fd_stream_writer_publish( ctx->writer, sz ); - } + fd_stream_writer_publish( ctx->writer, chunk_sz, 0UL ); } __attribute__((noinline)) static void @@ -158,11 +134,9 @@ static void fd_httpdl_run( fd_topo_t * topo, fd_topo_tile_t * tile ) { fd_httpdl_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); - ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); - ulong out_cnt = tile->out_cnt; - - void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_scratch_footprint( in_cnt, out_cnt ) ); - fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, out_cnt ); + void * ctx_mem = fd_alloca_check( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_footprint( topo, tile ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile ); + FD_TEST( stream_ctx ); fd_httpdl_run1( ctx, stream_ctx ); } @@ -175,6 +149,3 @@ fd_topo_run_tile_t fd_tile_snapshot_restore_HttpDl = { }; #undef NAME - - - diff --git a/src/discof/restore/fd_restore_base.h b/src/discof/restore/fd_restore_base.h index bdf011481f..0dd55ecc26 100644 --- a/src/discof/restore/fd_restore_base.h +++ b/src/discof/restore/fd_restore_base.h @@ -8,19 +8,19 @@ union fd_stream_frag_meta { -struct { + struct { - ulong seq; /* frag sequence number */ - uint sz; - ushort unused; - ushort ctl; + ulong seq; /* frag sequence number */ + uint sz; + ushort unused; + ushort ctl; - ulong goff; /* stream offset */ - ulong loff; /* dcache offset */ + ulong goff; /* stream offset */ + ulong loff; /* dcache offset */ -}; + }; -fd_frag_meta_t f[1]; + fd_frag_meta_t f[1]; }; diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index 9369f37550..d9b0dbcc3a 100644 --- a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -576,6 +576,25 @@ scratch_footprint( fd_topo_tile_t const * tile ) { return l; } +static fd_snapin_tile_t * +scratch_init( void * mem, + 
fd_topo_tile_t const * tile ) { + if( FD_UNLIKELY( !mem ) ) return NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, scratch_align() ) ) ) return NULL; + + FD_SCRATCH_ALLOC_INIT( l, mem ); + fd_snapin_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); + void * accv_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); + void * scratch_mem = FD_SCRATCH_ALLOC_APPEND( l, 16UL, tile->snapin.scratch_sz ); + + fd_memset( ctx, 0, sizeof(fd_snapin_tile_t) ); + ctx->accv_map = fd_snapshot_accv_map_join( fd_snapshot_accv_map_new( accv_map_mem ) ); + FD_TEST( ctx->accv_map ); + ctx->buf = scratch_mem; + + return ctx; +} + FD_FN_UNUSED static void unprivileged_init( fd_topo_t * topo, fd_topo_tile_t * tile ) { @@ -587,11 +606,8 @@ unprivileged_init( fd_topo_t * topo, if( FD_UNLIKELY( !tile->snapin.scratch_sz ) ) FD_LOG_ERR(( "scratch_sz param not set" )); - FD_SCRATCH_ALLOC_INIT( l, fd_topo_obj_laddr( topo, tile->tile_obj_id ) ); - fd_snapin_tile_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_snapin_tile_t), sizeof(fd_snapin_tile_t) ); - void * accv_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_snapshot_accv_map_align(), fd_snapshot_accv_map_footprint() ); - void * scratch_mem = FD_SCRATCH_ALLOC_APPEND( l, 16UL, tile->snapin.scratch_sz ); - fd_memset( ctx, 0, sizeof(fd_snapin_tile_t) ); + fd_snapin_tile_t * ctx = scratch_init( fd_topo_obj_laddr( topo, tile->tile_obj_id ), tile ); + if( FD_UNLIKELY( !ctx ) ) FD_LOG_ERR(( "scratch_init failed" )); /* Init state */ @@ -607,16 +623,10 @@ unprivileged_init( fd_topo_t * topo, /* Join frame buffer */ - ctx->buf = scratch_mem; ctx->buf_sz = 0UL; ctx->buf_ctr = 0UL; ctx->buf_max = tile->snapin.scratch_sz; - /* Join snapshot file parser */ - - ctx->accv_map = fd_snapshot_accv_map_join( fd_snapshot_accv_map_new( accv_map_mem ) ); - FD_TEST( ctx->accv_map ); - /* Join account output */ ctx->out_mcache = fd_type_pun( topo->links[ 
tile->out_link_id[ 0 ] ].mcache ); diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 3d0c641d99..4839012c04 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -3,12 +3,10 @@ #include "fd_restore_base.h" #include "stream/fd_stream_ctx.h" #include "stream/fd_stream_writer.h" -#include -#include +#include /* pause */ #define NAME "unzstd" #define ZSTD_WINDOW_SZ (33554432UL) -#define ZSTD_FRAME_SZ 16384UL #define LINK_IN_MAX 1 struct fd_unzstd_tile { @@ -57,15 +55,14 @@ fd_unzstd_init_from_stream_ctx( void * _ctx, fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); /* There's only one writer */ - ctx->writer = &stream_ctx->writers[0]; - fd_stream_writer_set_read_max( ctx->writer, ZSTD_FRAME_SZ ); + ctx->writer = stream_ctx->writers[0]; ctx->shutdown_signal = fd_mcache_seq_laddr_const( stream_ctx->in[0].base.mcache->f ) + 2; } __attribute__((noreturn)) static void fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { FD_MGAUGE_SET( TILE, STATUS, 2UL ); - fd_stream_writer_notify_shutdown( ctx->writer ); + fd_stream_writer_close( ctx->writer ); FD_COMPILER_MFENCE(); for(;;) pause(); @@ -96,61 +93,44 @@ on_stream_frag( void * _ctx, ulong * sz ) { fd_unzstd_tile_t * ctx = fd_type_pun(_ctx); - /* Don't do anything if backpressured */ - if( FD_UNLIKELY( fd_stream_writer_is_backpressured( ctx->writer ) ) ) { - return 0; - } + /* Input */ + uchar const * in_chunk0 = ctx->in_state.in_buf + frag->loff; + uchar const * in_chunk_start = in_chunk0 + ctx->in_state.in_skip; + uchar const * in_chunk_end = in_chunk0 + frag->sz; + uchar const * in_cur = in_chunk_start; + int in_consume = 0; - uchar const * chunk0 = ctx->in_state.in_buf + frag->loff; - uchar const * chunk_start = chunk0 + ctx->in_state.in_skip; - uchar const * chunk_end = chunk0 + frag->sz; - uchar const * cur = chunk_start; - ulong total_decompressed = 0UL; - int consume_frag = 0; + /* Output */ + uchar * const out = fd_stream_writer_prepare( 
ctx->writer ); + uchar * const out_end = out + fd_stream_writer_publish_sz_max( ctx->writer ); + uchar * out_cur = out; - for(;;) { - uchar const * prev = cur; + while( out_curwriter, total_decompressed ); ctx->in_state.in_skip = 0UL; - consume_frag = 1; - break; - } - - /* get write pointers into dcache buffer */ - uchar * buf_write_start = fd_stream_writer_get_write_ptr( ctx->writer ); - uchar * out = buf_write_start; - ulong dst_max = fd_stream_writer_get_avail_bytes( ctx->writer ); - uchar * out_end = buf_write_start + dst_max; - - if( dst_max==0 ) { - /* we are blocked by downstream */ - fd_stream_writer_publish( ctx->writer, total_decompressed ); + in_consume = 1; break; } /* fd_zstd_dstream_read updates chunk_start and out */ - int zstd_err = fd_zstd_dstream_read( ctx->dstream, &cur, chunk_end, &out, out_end, NULL ); - if( FD_UNLIKELY( zstd_err>0) ) { + int zstd_err = fd_zstd_dstream_read( ctx->dstream, &in_cur, in_chunk_end, &out_cur, out_end, NULL ); + if( FD_UNLIKELY( zstd_err>0 ) ) { FD_LOG_ERR(( "fd_zstd_dstream_read failed" )); break; } - /* accumulate decompressed bytes */ - ulong decompress_sz = (ulong)out - (ulong)buf_write_start; - total_decompressed += decompress_sz; - /* accumulate consumed bytes */ - ulong consumed_sz = (ulong)cur - (ulong)prev; + ulong consumed_sz = (ulong)in_cur - (ulong)in_prev; ctx->in_state.in_skip += consumed_sz; - - fd_stream_writer_advance( ctx->writer, decompress_sz ); } - *sz = (ulong)cur - (ulong)chunk_start; - return consume_frag; + fd_stream_writer_publish( ctx->writer, (ulong)out_cur-(ulong)out, 0UL ); + + *sz = (ulong)in_cur - (ulong)in_chunk_start; + return in_consume; } static void @@ -194,13 +174,10 @@ static void fd_unzstd_run( fd_topo_t * topo, fd_topo_tile_t * tile ) { fd_unzstd_tile_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id ); - ulong in_cnt = fd_topo_tile_producer_cnt( topo, tile ); - ulong out_cnt = tile->out_cnt; - - void * ctx_mem = fd_alloca( FD_STEM_SCRATCH_ALIGN, 
fd_stream_ctx_scratch_footprint( in_cnt, out_cnt ) ); - fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile, in_cnt, out_cnt ); - fd_unzstd_run1( ctx, - stream_ctx ); + void * ctx_mem = fd_alloca_check( FD_STEM_SCRATCH_ALIGN, fd_stream_ctx_footprint( topo, tile ) ); + fd_stream_ctx_t * stream_ctx = fd_stream_ctx_new( ctx_mem, topo, tile ); + FD_TEST( stream_ctx ); + fd_unzstd_run1( ctx, stream_ctx ); } #ifndef FD_TILE_TEST diff --git a/src/discof/restore/stream/fd_stream_ctx.c b/src/discof/restore/stream/fd_stream_ctx.c index b6c95a6f34..162394318c 100644 --- a/src/discof/restore/stream/fd_stream_ctx.c +++ b/src/discof/restore/stream/fd_stream_ctx.c @@ -1,71 +1,111 @@ #include "fd_stream_ctx.h" +#include "fd_stream_writer.h" -void -fd_stream_ctx_init( fd_stream_ctx_t * ctx, - fd_topo_t * topo, - fd_topo_tile_t * tile ) { - /* init in */ - ulong in_idx = 0UL; - for( ulong i=0UL; iin_cnt; i++ ) { - if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; - - fd_stream_reader_init( &ctx->in[ in_idx ], - fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ), - tile->in_link_fseq[ i ], - in_idx ); - in_idx++; - } +FD_FN_PURE ulong +fd_stream_ctx_align( void ) { + return 128UL; +} - /* init in_ptrs */ - for( ulong i=0UL; iin_cnt; i++ ) { - ctx->in_ptrs[ i ] = &ctx->in[ i ]; - } +ulong +fd_stream_ctx_footprint( fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { + ulong const in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong const out_cnt = tile->out_cnt; - /* init writers */ - for( ulong i=0UL; iout_cnt; i++ ) { - fd_stream_writer_new( &ctx->writers[i], - topo, - tile, - i, - 512UL, - 2UL ); + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_event_map_t), 
fd_event_map_footprint( in_cnt, out_cnt ) ); + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_writer_t *), out_cnt*sizeof(fd_stream_writer_t *) ); + for( ulong i=0UL; iout_cnt; i++ ) { + fd_topo_link_t const * link = &topo->links[ tile->out_link_id[ i ] ]; + ulong writer_fp = fd_stream_writer_footprint( fd_topo_link_reliable_consumer_cnt( topo, link ) ); + FD_TEST( writer_fp ); + l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), writer_fp ); } - - fd_stream_ticks_init( ctx->ticks, ctx->event_map->event_cnt, 1e3L ); - fd_stream_metrics_init( ctx->metrics ); - FD_TEST( fd_rng_join( fd_rng_new( ctx->rng, 0, 0UL ) ) ); + return FD_LAYOUT_FINI( l, fd_stream_ctx_align() ); } fd_stream_ctx_t * -fd_stream_ctx_new( void * mem, - fd_topo_t * topo, - fd_topo_tile_t * tile, - ulong in_cnt, - ulong out_cnt ) { +fd_stream_ctx_new( void * mem, + fd_topo_t const * topo, + fd_topo_tile_t const * tile ) { if( FD_UNLIKELY( !mem ) ) { FD_LOG_WARNING(( "NULL mem" )); return NULL; } - if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_ctx_scratch_align() ) ) ) { + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_ctx_align() ) ) ) { FD_LOG_WARNING(( "unaligned mem" )); return NULL; } FD_SCRATCH_ALLOC_INIT( l, mem ); fd_stream_ctx_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); + fd_memset( self, 0, sizeof(fd_stream_ctx_t) ); + + ulong const in_cnt = fd_topo_tile_producer_cnt( topo, tile ); + ulong const out_cnt = tile->out_cnt; self->in = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); self->in_ptrs = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); void * event_map_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, out_cnt ) ); - self->writers = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), sizeof(fd_stream_writer_t)*out_cnt ); + self->writers = FD_SCRATCH_ALLOC_APPEND( l, 
alignof(fd_stream_writer_t *), out_cnt*sizeof(fd_stream_writer_t *) ); + + for( ulong i=0UL; iout_cnt; i++ ) { + fd_topo_link_t const * link = &topo->links[ tile->out_link_id[ i ] ]; + ulong const cons_cnt = fd_topo_link_reliable_consumer_cnt( topo, link ); + void * writer = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( cons_cnt ) ); + + self->writers[ i ] = fd_stream_writer_new_topo( + writer, + fd_topo_link_reliable_consumer_cnt( topo, link ), + topo, + tile, + i + ); + if( FD_UNLIKELY( !self->writers[ i ] ) ) return NULL; /* logs warning */ + } self->in_cnt = in_cnt; self->out_cnt = out_cnt; self->event_map = fd_event_map_new( event_map_mem, in_cnt, out_cnt ); - fd_stream_ctx_init( self, topo, tile ); self->in_seq = 0UL; + /* init in */ + ulong in_idx = 0UL; + for( ulong i=0UL; iin_cnt; i++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue; + + fd_stream_reader_init( &self->in[ in_idx ], + fd_type_pun( topo->links[ tile->in_link_id[ i ] ].mcache ), + tile->in_link_fseq[ i ], + in_idx ); + in_idx++; + } + + /* init in_ptrs */ + for( ulong i=0UL; iin_cnt; i++ ) { + self->in_ptrs[ i ] = &self->in[ i ]; + } + + /* init writers */ + for( ulong i=0UL; iout_cnt; i++ ) { + fd_stream_writer_new_topo( + self->writers[i], + self->out_cnt, + topo, + tile, + i + ); + } + + fd_stream_ticks_init( self->ticks, self->event_map->event_cnt, 1e3L ); + fd_stream_metrics_init( self->metrics ); + FD_TEST( fd_rng_join( fd_rng_new( self->rng, 0, 0UL ) ) ); + + FD_SCRATCH_ALLOC_FINI( l, fd_stream_ctx_align() ); return self; } diff --git a/src/discof/restore/stream/fd_stream_ctx.h b/src/discof/restore/stream/fd_stream_ctx.h index e5d2f089ab..c8bf356150 100644 --- a/src/discof/restore/stream/fd_stream_ctx.h +++ b/src/discof/restore/stream/fd_stream_ctx.h @@ -11,17 +11,29 @@ struct fd_stream_ctx; typedef struct fd_stream_ctx fd_stream_ctx_t; -typedef void fd_tile_ctx_init_run_loop_fn_t( void * ctx, - fd_stream_ctx_t * stream_ctx ); -typedef 
void fd_tile_update_in_fn_t( fd_stream_reader_t * reader ); -typedef void fd_tile_housekeeping_fn_t( void * ctx, - fd_stream_ctx_t * stream_ctx ); -typedef void fd_tile_metrics_write_fn_t( void * ctx ); -typedef void fd_tile_run_fn_t( void * ctx, fd_stream_ctx_t * stream_ctx ); -typedef int fd_tile_on_stream_frag_fn_t( void * ctx, - fd_stream_reader_t * reader, - fd_stream_frag_meta_t const * frag, - ulong * sz ); +typedef void +(* fd_tile_ctx_init_run_loop_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx ); + +typedef void +(* fd_tile_update_in_fn_t)( fd_stream_reader_t * reader ); + +typedef void +(* fd_tile_housekeeping_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx ); + +typedef void +(* fd_tile_metrics_write_fn_t)( void * ctx ); + +typedef void +(* fd_tile_run_fn_t)( void * ctx, + fd_stream_ctx_t * stream_ctx ); + +typedef int +(* fd_tile_on_stream_frag_fn_t)( void * ctx, + fd_stream_reader_t * reader, + fd_stream_frag_meta_t const * frag, + ulong * sz ); struct fd_stream_ctx { fd_stream_reader_t * in; @@ -33,54 +45,38 @@ struct fd_stream_ctx { fd_rng_t rng[1]; fd_stream_ticks_t ticks[1]; fd_stream_metrics_t metrics[1]; - fd_stream_writer_t * writers; - fd_tile_update_in_fn_t * tile_update_in; - fd_tile_housekeeping_fn_t * tile_housekeeping; - fd_tile_metrics_write_fn_t * tile_metrics_write; - fd_tile_run_fn_t * tile_run; - fd_tile_on_stream_frag_fn_t * tile_on_stream_frag; + fd_stream_writer_t ** writers; + fd_tile_update_in_fn_t tile_update_in; + fd_tile_housekeeping_fn_t tile_housekeeping; + fd_tile_metrics_write_fn_t tile_metrics_write; + fd_tile_run_fn_t tile_run; + fd_tile_on_stream_frag_fn_t tile_on_stream_frag; }; typedef struct fd_stream_ctx fd_stream_ctx_t; FD_PROTOTYPES_BEGIN -FD_FN_PURE static inline ulong -fd_stream_ctx_scratch_align( void ) { - return FD_STEM_SCRATCH_ALIGN; -} +FD_FN_PURE ulong +fd_stream_ctx_align( void ); -FD_FN_PURE static inline ulong -fd_stream_ctx_scratch_footprint( ulong in_cnt, - ulong out_cnt ) { - ulong l = 
FD_LAYOUT_INIT; - l = FD_LAYOUT_APPEND( l, alignof(fd_stream_ctx_t), sizeof(fd_stream_ctx_t) ); - l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t), in_cnt*sizeof(fd_stream_reader_t) ); /* in */ - l = FD_LAYOUT_APPEND( l, alignof(fd_stream_reader_t *), in_cnt*sizeof(fd_stream_reader_t *) ); /* in_ptrs */ - l = FD_LAYOUT_APPEND( l, fd_event_map_align(), fd_event_map_footprint( in_cnt, out_cnt ) ); /* event_map */ - return FD_LAYOUT_FINI( l, fd_stream_ctx_scratch_align() ); -} +ulong +fd_stream_ctx_footprint( fd_topo_t const * topo, + fd_topo_tile_t const * tile ); fd_stream_ctx_t * -fd_stream_ctx_new( void * mem, - fd_topo_t * topo, - fd_topo_tile_t * tile, - ulong in_cnt, - ulong out_cnt ); - -void -fd_stream_ctx_init( fd_stream_ctx_t * ctx, - fd_topo_t * topo, - fd_topo_tile_t * tile ); +fd_stream_ctx_new( void * mem, + fd_topo_t const * topo, + fd_topo_tile_t const * tile ); static inline void -fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx, - void * tile_ctx, - fd_tile_ctx_init_run_loop_fn_t * tile_init_run_loop, - fd_tile_update_in_fn_t * tile_update_in, - fd_tile_housekeeping_fn_t * tile_housekeeping, - fd_tile_metrics_write_fn_t * tile_metrics_write, - fd_tile_run_fn_t * tile_run, - fd_tile_on_stream_frag_fn_t * tile_on_stream_frag ) { +fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t tile_init_run_loop, + fd_tile_update_in_fn_t tile_update_in, + fd_tile_housekeeping_fn_t tile_housekeeping, + fd_tile_metrics_write_fn_t tile_metrics_write, + fd_tile_run_fn_t tile_run, + fd_tile_on_stream_frag_fn_t tile_on_stream_frag ) { if( ctx->in_cnt && !tile_update_in ) { FD_LOG_ERR(( "tile_update_in function cannot be null if there are producers to this tile!" 
)); } @@ -98,20 +94,16 @@ fd_stream_ctx_init_run_loop( fd_stream_ctx_t * ctx, FD_MGAUGE_SET( TILE, STATUS, 1UL ); fd_stream_ticks_init_timer( ctx->ticks ); - for( ulong i=0UL; iout_cnt; i++ ) { - fd_stream_writer_init_flow_control_credits( &ctx->writers[ i ] ); - } - if( tile_init_run_loop ) { tile_init_run_loop( tile_ctx, ctx ); } } static inline void -fd_stream_ctx_update_flow_control_credits( fd_stream_ctx_t * ctx ) { +fd_stream_ctx_calculate_backpressure( fd_stream_ctx_t * ctx ) { /* Recalculate flow control credits */ for( ulong i=0UL; iout_cnt; i++ ) { - fd_stream_writer_update_flow_control_credits( &ctx->writers[i] ); + fd_stream_writer_calculate_backpressure( ctx->writers[i] ); } } @@ -139,7 +131,7 @@ fd_stream_ctx_do_housekeeping( fd_stream_ctx_t * ctx, ulong out_idx = event_idx; /* Receive flow control credits from this out. */ - fd_stream_writer_receive_flow_control_credits( &ctx->writers[ out_idx ] ); + fd_stream_writer_receive_flow_control_credits( ctx->writers[ out_idx ] ); } else if( event_idx>ctx->out_cnt) { /* send credits */ ulong in_idx = event_idx - ctx->out_cnt - 1UL; @@ -152,7 +144,7 @@ fd_stream_ctx_do_housekeeping( fd_stream_ctx_t * ctx, ctx->ticks->now, ctx->tile_metrics_write, ctx ); - fd_stream_ctx_update_flow_control_credits( ctx ); + fd_stream_ctx_calculate_backpressure( ctx ); if( ctx->tile_housekeeping ) { ctx->tile_housekeeping( tile_ctx, ctx ); @@ -178,7 +170,7 @@ static inline int fd_stream_ctx_is_backpressured( fd_stream_ctx_t * ctx ) { int backpressured = 1UL; for( ulong i=0UL; iout_cnt; i++ ) { - backpressured &= fd_stream_writer_is_backpressured( &ctx->writers[i] ); + backpressured &= !fd_stream_writer_publish_sz_max( ctx->writers[i] ); } return backpressured; } @@ -283,14 +275,14 @@ fd_stream_ctx_run_loop( fd_stream_ctx_t * ctx, } static inline void -fd_stream_ctx_run( fd_stream_ctx_t * ctx, - void * tile_ctx, - fd_tile_ctx_init_run_loop_fn_t * tile_init_run_loop, - fd_tile_update_in_fn_t * tile_update_in, - 
fd_tile_housekeeping_fn_t * tile_housekeeping, - fd_tile_metrics_write_fn_t * tile_metrics_write, - fd_tile_run_fn_t * tile_run, - fd_tile_on_stream_frag_fn_t * tile_on_stream_frag ) { +fd_stream_ctx_run( fd_stream_ctx_t * ctx, + void * tile_ctx, + fd_tile_ctx_init_run_loop_fn_t tile_init_run_loop, + fd_tile_update_in_fn_t tile_update_in, + fd_tile_housekeeping_fn_t tile_housekeeping, + fd_tile_metrics_write_fn_t tile_metrics_write, + fd_tile_run_fn_t tile_run, + fd_tile_on_stream_frag_fn_t tile_on_stream_frag ) { fd_stream_ctx_init_run_loop( ctx, tile_ctx, tile_init_run_loop, diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index 69e654d03d..377fa89923 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -1,76 +1,157 @@ #include "fd_stream_writer.h" #include "../../../util/log/fd_log.h" #include "../../../tango/dcache/fd_dcache.h" +#include "../../../disco/topo/fd_topo.h" fd_stream_writer_t * fd_stream_writer_new( void * mem, - fd_topo_t * topo, - fd_topo_tile_t * tile, - ulong link_id, - ulong burst_byte, - ulong burst_frag ) { + ulong cons_max, + fd_stream_frag_meta_t * mcache, + uchar * dcache ) { if( FD_UNLIKELY( !mem ) ) { FD_LOG_WARNING(( "NULL mem" )); return NULL; } if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, fd_stream_writer_align() ) ) ) { - FD_LOG_WARNING(( "unaligned mem" )); + FD_LOG_WARNING(( "misaligned mem" )); return NULL; } FD_SCRATCH_ALLOC_INIT( l, mem ); - fd_stream_writer_t * self = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); - - fd_topo_link_t const * link = &topo->links[ tile->out_link_id[ link_id ] ]; - void * dcache = fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->out_link_id[ link_id ] ].dcache_obj_id ) ); - fd_stream_frag_meta_t * out_mcache = fd_type_pun( topo->links[ tile->out_link_id[ link_id ] ].mcache ); - ulong cons_cnt = fd_topo_link_reliable_consumer_cnt( 
topo, link ); - - self->out_mcache = out_mcache; - self->buf = dcache; - self->buf_base = (ulong)dcache - (ulong)fd_wksp_containing( dcache ); - self->buf_off = 0UL; - self->buf_sz = fd_dcache_data_sz( dcache ); - self->goff = 0UL; - self->read_max = 0UL; /* this should be set by the tile via fd_stream_writer_set_read_max */ - self->stream_off = 0UL; - self->goff_start = 0UL; - self->out_seq = 0UL; - - /* Set up flow control state */ - self->cr_byte_avail = 0UL; - self->cr_frag_avail = 0UL; - self->cr_byte_max = fd_dcache_data_sz( dcache ); - self->cr_frag_max = fd_mcache_depth( self->out_mcache->f ); - self->burst_byte = burst_byte; - self->burst_frag = burst_frag; - self->cons_cnt = cons_cnt; - self->cons_seq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), EXPECTED_FSEQ_CNT_PER_CONS*cons_cnt*sizeof(ulong) ); - self->cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); - self->out_sync = fd_mcache_seq_laddr( topo->links[ tile->out_link_id[ link_id ] ].mcache ); - - /* Set up consumer fseq pointer array. - We keep track of 2 fseqs per consumer to manage stream flow control. - The first fseq tracks the consumer's mcache sequence number. - The second fseq tracks the consumer's global read offset into stream. 
*/ - ulong cons_idx = 0UL; + fd_stream_writer_t * writer = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); + ulong * cons_seq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), cons_max*sizeof(ulong)*FD_STREAM_WRITER_CONS_SEQ_STRIDE ); + ulong volatile ** cons_fseq = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_max*sizeof(ulong *) ); + FD_SCRATCH_ALLOC_FINI( l, fd_stream_writer_align() ); + + fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); + + writer->mcache = mcache; + writer->out_sync = fd_mcache_seq_laddr( mcache->f ); + writer->seq = fd_mcache_seq_query( writer->out_sync ); + writer->depth = fd_mcache_depth( mcache->f ); + + writer->data = dcache; + writer->data_max = fd_dcache_data_sz( dcache ); + writer->data_cur = 0UL; + writer->base = (uchar *)fd_wksp_containing( dcache ); /* FIXME impure */ + writer->goff = 0UL; + + writer->cr_byte_avail = 0UL; + writer->cr_frag_avail = 0UL; + writer->cons_seq = cons_seq; + writer->cons_fseq = cons_fseq; + + writer->frag_sz_max = writer->data_max; + + writer->cons_cnt = 0UL; + writer->cons_max = cons_max; + /* writer->out_sync already set */ + + return writer; +} + +void * +fd_stream_writer_delete( fd_stream_writer_t * writer ) { + fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); + return writer; +} + +ulong * +fd_stream_writer_register_consumer( + fd_stream_writer_t * writer, + ulong * fseq_join +) { + if( FD_UNLIKELY( writer->cons_cnt >= writer->cons_max ) ) { + FD_LOG_WARNING(( "Can't register consumer, cons_max %lu exceeded", writer->cons_max )); + return NULL; + } + + ulong const cons_idx = writer->cons_cnt++; + ulong * seq = writer->cons_seq + ( cons_idx*FD_STREAM_WRITER_CONS_SEQ_STRIDE+0 ); + writer->cons_fseq[ cons_idx ] = fd_type_pun( fseq_join ); + seq[ 0 ] = FD_VOLATILE_CONST( fseq_join[ 0 ] ); + seq[ 1 ] = FD_VOLATILE_CONST( fseq_join[ 1 ] ); + return seq; +} + +fd_stream_writer_t * +fd_stream_writer_new_topo( + void * mem, + ulong cons_max, + fd_topo_t const * 
topo, + fd_topo_tile_t const * tile, + ulong out_link_idx +) { + ulong const out_link_id = tile->out_link_id[ out_link_idx ]; + fd_topo_link_t const * out_link = &topo->links[ out_link_id ]; + fd_stream_frag_meta_t * mcache = fd_type_pun( out_link->mcache ); + void * dcache = fd_dcache_join( fd_topo_obj_laddr( topo, out_link->dcache_obj_id ) ); + ulong cons_cnt = fd_topo_link_reliable_consumer_cnt( topo, out_link ); + if( FD_UNLIKELY( !mcache ) ) { + FD_LOG_WARNING(( "NULL mcache" )); + return NULL; + } + if( FD_UNLIKELY( !dcache ) ) { + FD_LOG_WARNING(( "NULL dcache" )); + return NULL; + } + if( FD_UNLIKELY( cons_cnt>cons_max ) ) { + FD_LOG_WARNING(( "cons_cnt is %lu but cons_max is only %lu", cons_cnt, cons_max )); + } + + fd_stream_writer_t * writer = fd_stream_writer_new( mem, cons_max, mcache, dcache ); + if( FD_UNLIKELY( !writer ) ) return NULL; /* logs warning */ + for( ulong i=0UL; itile_cnt; i++ ) { - fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + fd_topo_tile_t const * consumer_tile = &topo->tiles[ i ]; for( ulong j=0UL; jin_cnt; j++ ) { - if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ link_id ] && consumer_tile->in_link_reliable[ j ] ) ) { - self->cons_fseq[ cons_idx ] = consumer_tile->in_link_fseq[ j ]; - if( FD_UNLIKELY( !self->cons_fseq[ cons_idx ] ) ) { - FD_LOG_ERR(( "NULL cons_fseq[%lu] for out_link=%lu", cons_idx, tile->out_link_id[ link_id ] )); - } - cons_idx++; + if( consumer_tile->in_link_id[ j ]!=out_link_id ) continue; + if( !consumer_tile->in_link_reliable[ j ] ) continue; + + ulong * fseq = consumer_tile->in_link_fseq[ j ]; + if( FD_UNLIKELY( !fseq ) ) { + FD_LOG_WARNING(( "NULL fseq for consumer tile=%s:%lu in_link_idx=%lu", + consumer_tile->name, consumer_tile->kind_id, j )); + } + if( FD_UNLIKELY( !fd_stream_writer_register_consumer( writer, fseq ) ) ) { + return NULL; /* logs warning */ } } } - fd_memset(self->cons_seq, 0, EXPECTED_FSEQ_CNT_PER_CONS*cons_cnt*sizeof(ulong) ); - /* make sure we're not tripping 
*/ - FD_TEST( cons_idx==cons_cnt ); + return writer; +} + +void +fd_stream_writer_set_frag_sz_max( fd_stream_writer_t * writer, + ulong frag_sz_max ) { + writer->frag_sz_max = fd_ulong_min( writer->data_max, frag_sz_max ); +} + +void +fd_stream_writer_copy( fd_stream_writer_t * writer, + void const * data, + ulong data_sz, + ulong const ctl_mask ) { + if( FD_UNLIKELY( ( data_sz > writer->cr_byte_avail ) | + ( data_sz > writer->data_max ) ) ) { + FD_LOG_ERR(( "invalid data_sz %lu (cr_byte_avail=%lu data_max=%lu)", + data_sz, writer->cr_byte_avail, writer->data_max )); + } + + ulong const frag_sz_max = writer->frag_sz_max; + int som = 1; + for(;;) { + ulong const op_sz = fd_ulong_min( data_sz, frag_sz_max ); + ulong const next_sz = data_sz-op_sz; + int const eom = next_sz==0UL; + ulong const ctl = ctl_mask & fd_frag_meta_ctl( FD_FRAG_META_ORIG_MAX-1, som, eom, 1 ); - return self; + fd_memcpy( fd_stream_writer_prepare( writer ), data, op_sz ); + fd_stream_writer_publish( writer, op_sz, ctl ); + + som = 0; + data_sz = next_sz; + } } diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index 174e515d6b..2b78941239 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -1,169 +1,279 @@ #ifndef HEADER_fd_src_discof_restore_stream_fd_stream_writer_h #define HEADER_fd_src_discof_restore_stream_fd_stream_writer_h -#include "../../../util/fd_util_base.h" -#include "../../../disco/topo/fd_topo.h" -#include "fd_stream_reader.h" - -/* A shared stream has a single producer and multiple consumers. 
- fd_stream_writer implements the producer APIs of the shared stream */ -struct fd_stream_writer { - fd_stream_frag_meta_t * out_mcache; /* frag producer mcache */ - - uchar * buf; /* laddr of shared dcache buffer */ - ulong buf_base; /* offset to the dcache buffer from wksp */ - - /* dcache buffer state */ - ulong buf_off; /* local write offset into dcache buffer */ - ulong buf_sz; /* dcache buffer size */ - ulong goff; /* global offset into byte stream */ - ulong read_max; /* max chunk size */ - ulong stream_off; /* start of published stream */ - ulong goff_start; /* start of goff in stream */ - ulong out_seq; /* current sequence number */ - - /* flow control */ - ulong cr_byte_avail; /* bytes available in the slowest consumer */ - ulong cr_frag_avail; /* frags available in the slowest consumer */ - ulong cr_byte_max; /* max dcache buffer credits (size of dcache buffer)*/ - ulong cr_frag_max; /* max mcache frag credits */ - ulong burst_byte; /* dcache backpressure threshold */ - ulong burst_frag; /* mcache backpressure threshold */ - ulong cons_cnt; /* number of consumers */ - ulong * cons_seq; /* consumer fseq values */ - ulong ** cons_fseq; /* consumer fseq pointers */ - ulong * out_sync; /* out fseq */ +/* fd_stream_writer.h provides an API to publish data to SPMC shared + memory byte streams. */ + +#include "../fd_restore_base.h" + +/* fd_stream_writer_t holds stream producer state. 
*/ + +struct __attribute__((aligned(16))) fd_stream_writer { + /* Fragment descriptor output */ + fd_stream_frag_meta_t * mcache; /* frag producer mcache */ + ulong seq; /* next sequence number */ + ulong depth; /* mcache depth */ + + /* Data buffer (dcache) output */ + uchar * data; /* points to first byte of dcache data region (dcache join) */ + ulong data_max; /* dcache data region size */ + ulong data_cur; /* next dcache data offset in [0,data_sz) */ + uchar * base; /* workspace base address */ + ulong goff; /* byte stream offset */ + + /* This point is 16-byte aligned */ + + /* Backpressure */ + ulong cr_byte_avail; /* byte publish count before slowest consumer overrun */ + ulong cr_frag_avail; /* frag publish count before slowest consumer overrun */ + ulong * cons_seq; /* cons_seq[ 2*cons_idx+i ] caches cons_fseq[ cons_idx ][i] */ + ulong volatile ** cons_fseq; /* cons_fseq[ cons_idx ] points to consumer fseq */ + /* Each consumer reports a 'frag sequence number' and the 'stream offset' */ +# define FD_STREAM_WRITER_CONS_SEQ_STRIDE 2UL + + /* Fragmentation */ + ulong frag_sz_max; /* max data sz for each frag descriptor */ + + /* Cold data */ + ulong cons_cnt; /* number of consumers */ + ulong cons_max; /* max number of consumers */ + ulong * out_sync; /* points to mcache 'sync' field (last published seq no) */ + + /* variable length data follows */ }; + typedef struct fd_stream_writer fd_stream_writer_t; -#define EXPECTED_FSEQ_CNT_PER_CONS 2 +/* Forward declarations */ + +typedef struct fd_topo fd_topo_t; +typedef struct fd_topo_tile fd_topo_tile_t; FD_PROTOTYPES_BEGIN +/* Constructor API ****************************************************/ + +/* fd_stream_writer_{align,footprint} describe a memory region suitable + to hold a stream_writer. 
*/ + FD_FN_CONST static inline ulong fd_stream_writer_align( void ) { return alignof(fd_stream_writer_t); } FD_FN_CONST static inline ulong -fd_stream_writer_footprint( void ) { - return sizeof(fd_stream_writer_t); +fd_stream_writer_footprint( ulong cons_max ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof(fd_stream_writer_t), sizeof(fd_stream_writer_t) ); + l = FD_LAYOUT_APPEND( l, alignof(ulong), cons_max*sizeof(ulong)*FD_STREAM_WRITER_CONS_SEQ_STRIDE ); + l = FD_LAYOUT_APPEND( l, alignof(ulong *), cons_max*sizeof(ulong *) ); + return FD_LAYOUT_FINI( l, fd_stream_writer_align() ); } -static inline uchar * -fd_stream_writer_get_write_ptr( fd_stream_writer_t * writer ) { - return writer->buf + writer->buf_off; -} +/* fd_stream_writer_new initializes the memory region at mem as a + stream_writer object. mcache_join is a local join to an mcache + (frag_meta or similar pointer) to which frags will be published. + dcache_join is a local join to a dcache into which data is written. + Returns writer object in mem on success, and NULL on failure. Logs + reason for failure. */ fd_stream_writer_t * fd_stream_writer_new( void * mem, - fd_topo_t * topo, - fd_topo_tile_t * tile, - ulong link_id, - ulong burst_byte, - ulong burst_frag ); + ulong cons_max, + fd_stream_frag_meta_t * mcache_join, + uchar * dcache_join ); -static inline void -fd_stream_writer_init_flow_control_credits( fd_stream_writer_t * writer ) { - for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][0] ); - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ cons_idx ][1] ); - } -} +/* fd_stream_writer_delete releases the memory region backing a + stream_writer. Returns a pointer to the memory region originally + provided to fd_stream_writer_new. 
*/ + +void * +fd_stream_writer_delete( fd_stream_writer_t * writer ); + +/* fd_stream_writer_join_topo constructs a stream writer for a topology + definition. Calls new() and register_consumer() under the hood. + tile is the actor that will be writing stream frags in topo. + out_link_idx is the index of the output link for that tile. */ + +fd_stream_writer_t * +fd_stream_writer_new_topo( + void * mem, + ulong cons_max, + fd_topo_t const * topo, + fd_topo_tile_t const * tile, + ulong out_link_idx +); + +/* Control API ********************************************************/ + +/* fd_stream_writer_register_consumer registers a consumer of the + stream to the writer. fseq_join is a local join to that consumer's + fseq (points to the fseq's seq[0] field). Future backpressure checks + will include this consumer. Returns a pointer to this consumer's + seq cache field, or NULL on if cons_max exceeded (logs warning). */ + +ulong * +fd_stream_writer_register_consumer( + fd_stream_writer_t * writer, + ulong * fseq_join +); + +/* fd_stream_writer_close marks the stream as closed. */ static inline void -fd_stream_writer_set_read_max( fd_stream_writer_t * writer, - ulong read_max ) { - writer->read_max = read_max; +fd_stream_writer_close( fd_stream_writer_t * writer ) { + FD_VOLATILE( writer->out_sync[ 0 ] ) = writer->seq; + FD_VOLATILE( writer->out_sync[ 1 ] ) = writer->goff; + FD_COMPILER_MFENCE(); + FD_VOLATILE( writer->out_sync[ 2 ] ) = 1; } +/* Flow control API ***************************************************/ + +/* fd_stream_writer_set_frag_sz_max puts an upper bound on the fragment + sizes produced to the stream. This helps reduce latency. */ + +void +fd_stream_writer_set_frag_sz_max( fd_stream_writer_t * writer, + ulong frag_sz_max ); + +/* fd_stream_writer_receive_flow_control_credits updates cached consumer + progress from the consumers' fseq objects. 
+ + FIXME Provide an API to round-robin update ins temporally spaced apart */ + static inline void fd_stream_writer_receive_flow_control_credits( fd_stream_writer_t * writer ) { + ulong const stride = FD_STREAM_WRITER_CONS_SEQ_STRIDE; for( ulong i=0UL; icons_cnt; i++ ) { + /* FIXME could be SSE aligned copy */ FD_COMPILER_MFENCE(); - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*i ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][0] ); - writer->cons_seq [ EXPECTED_FSEQ_CNT_PER_CONS*i+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][1] ); + writer->cons_seq[ stride*i ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][0] ); + writer->cons_seq[ stride*i+1 ] = FD_VOLATILE_CONST( writer->cons_fseq[ i ][1] ); FD_COMPILER_MFENCE(); } } +/* fd_stream_writer_calculate_backpressure updates fragment and stream + backpressure from cached consumer progress. */ + static inline void -fd_stream_writer_update_flow_control_credits( fd_stream_writer_t * writer ) { - ulong slowest_cons = ULONG_MAX; - if( FD_LIKELY( writer->cr_byte_availcr_byte_max || writer->cr_frag_availcr_frag_max ) ) { - ulong cr_byte_avail = writer->cr_byte_max; - ulong cr_frag_avail = writer->cr_frag_max; - for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { - ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)writer->cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx+1 ] ), 0L ), 0L ); - ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)writer->cr_frag_max-fd_long_max( fd_seq_diff( writer->out_seq, writer->cons_seq[ EXPECTED_FSEQ_CNT_PER_CONS*cons_idx ] ), 0L ), 0L ); - slowest_cons = fd_ulong_if( cons_cr_byte_availcr_byte_avail = cr_byte_avail; - writer->cr_frag_avail = cr_frag_avail; +fd_stream_writer_calculate_backpressure( fd_stream_writer_t * writer ) { + ulong const cr_byte_max = writer->data_max; + ulong const cr_frag_max = writer->depth; + + ulong cr_byte_avail = ULONG_MAX; + ulong cr_frag_avail = ULONG_MAX; + ulong const stride = 
FD_STREAM_WRITER_CONS_SEQ_STRIDE; + for( ulong cons_idx=0UL; cons_idxcons_cnt; cons_idx++ ) { + ulong cons_cr_byte_avail = (ulong)fd_long_max( (long)cr_byte_max-fd_long_max( fd_seq_diff( writer->goff, writer->cons_seq[ stride*cons_idx+1 ] ), 0L ), 0L ); + ulong cons_cr_frag_avail = (ulong)fd_long_max( (long)cr_frag_max-fd_long_max( fd_seq_diff( writer->seq, writer->cons_seq[ stride*cons_idx ] ), 0L ), 0L ); + cr_byte_avail = fd_ulong_min( cons_cr_byte_avail, cr_byte_avail ); + cr_frag_avail = fd_ulong_min( cons_cr_frag_avail, cr_frag_avail ); } + + writer->cr_byte_avail = cr_byte_avail; + writer->cr_frag_avail = cr_frag_avail; } -static inline ulong -fd_stream_writer_get_avail_bytes( fd_stream_writer_t * writer ) { - if( FD_UNLIKELY( writer->buf_off > writer->buf_sz ) ) { - FD_LOG_CRIT(( "Buffer overflow (buf_off=%lu buf_sz=%lu)", writer->buf_off, writer->buf_sz )); +/* In-place publish API ************************************************ + + Example usage: + + void * p = fd_stream_writer_prepare( w ); + ulong sz = fd_stream_writer_publish_sz_max( w ) + fd_memcpy( p, src, sz ); + src += sz; + fd_stream_writer_publish( w, sz ); */ + +/* fd_stream_writer_prepare prepares the caller for a frag publish. + Returns a pointer to a memory region of publish_sz_max() bytes, into + which the caller can write data. A subsequent publish() call makes + the data visible to consumers. U.B. return value if + publish_sz_max()==0. */ + +static inline void * +fd_stream_writer_prepare( fd_stream_writer_t * writer ) { + if( FD_UNLIKELY( writer->data_cur > writer->data_max ) ) { + FD_LOG_CRIT(( "Out-of-bounds data_cur (data_cur=%lu data_max=%lu)", writer->data_cur, writer->data_max )); return 0; } + return writer->data + writer->data_cur; +} + +/* fd_stream_writer_publish_sz_max returns the max amount of bytes that + can be published in the next fragment. 
*/ - ulong const read_max = fd_ulong_min( writer->cr_byte_avail, writer->read_max ); - return fd_ulong_min( read_max, writer->buf_sz - writer->buf_off ); +static inline ulong +fd_stream_writer_publish_sz_max( fd_stream_writer_t * writer ) { + ulong const data_backp = writer->cr_byte_avail; + ulong const frag_backp = fd_ulong_if( !!writer->cr_frag_avail, writer->frag_sz_max, 0UL ); + ulong const buf_avail = writer->data_max - writer->data_cur; + return fd_ulong_min( fd_ulong_min( data_backp, frag_backp ), buf_avail ); } +/* fd_stream_writer_publish completes a publish operation. Writes a + fragment descriptor out to the mcache if frag_sz>0. */ + static inline void fd_stream_writer_publish( fd_stream_writer_t * writer, - ulong frag_sz ) { - ulong loff = writer->buf_base + writer->stream_off; - fd_mcache_publish_stream( writer->out_mcache, - fd_mcache_depth( writer->out_mcache->f ), - writer->out_seq, - writer->goff_start, - loff, - frag_sz, - 0 ); - writer->out_seq = fd_seq_inc( writer->out_seq, 1UL ); + ulong frag_sz, + ulong ctl ) { + if( FD_UNLIKELY( !frag_sz ) ) return; + + uchar * const data = writer->data + writer->data_cur; + ulong const loff = (ulong)data - (ulong)writer->base; + + fd_mcache_publish_stream( + writer->mcache, + writer->depth, + writer->seq, + writer->goff, + loff, + frag_sz, + ctl + ); + + /* Advance fragment descriptor stream */ + writer->seq = fd_seq_inc( writer->seq, 1UL ); writer->cr_frag_avail -= 1; - /* rewind buf_off to start of buffer */ - if( writer->buf_off >= writer->buf_sz ) { - writer->buf_off = 0UL; + /* Advance buffer */ + writer->data_cur += frag_sz; + writer->goff += frag_sz; + if( FD_UNLIKELY( writer->data_cur > writer->data_max ) ) { + FD_LOG_CRIT(( "Out-of-bounds data_cur (data_cur=%lu data_max=%lu)", writer->data_cur, writer->data_max )); + return; + } + if( writer->data_cur == writer->data_max ) { + writer->data_cur = 0UL; /* cmov */ } - - /* update stream_off and goff_start to current values - of buf_off and goff */ - 
writer->stream_off = writer->buf_off; - writer->goff_start = writer->goff; } -static inline void -fd_stream_writer_advance( fd_stream_writer_t * writer, - ulong sz ) { - writer->goff += sz; - writer->buf_off += sz; - writer->cr_byte_avail -= sz; -} +/* Copy publish API ***************************************************/ -static inline int -fd_stream_writer_is_backpressured( fd_stream_writer_t * writer ) { - return writer->cr_byte_availburst_byte || writer->cr_frag_availburst_frag; -} +/* fd_stream_writer_copy publishes the given chunk to the stream as a + sequence of stream frags. data points to the first byte of the chunk + to send. data_sz is the number of bytes (<=copy_max()). + ctl specifies how to set the 'ctl' field. All ctl bits are copied as + is, except for 'som' and 'eom', which act as a mask: + Use 'fd_frag_meta_ctl( ..., som=1, eom=1, ... )' to set the 'som' + bit on the first frag and the 'eom' bit on the last flag. Pass + 'fd_frag_meta_ctl( ..., som=0, eom=0, ... )' or just '0UL' to leave + fragmentation bits cleared on published frags. 
*/ -static inline void -fd_stream_writer_notify_shutdown( fd_stream_writer_t * writer ) { - FD_VOLATILE( writer->out_sync[ EXPECTED_FSEQ_CNT_PER_CONS * writer->cons_cnt ] ) = writer->out_seq; -} +void +fd_stream_writer_copy( fd_stream_writer_t * writer, + void const * data, + ulong data_sz, + ulong ctl ); -static inline void * -fd_stream_writer_delete( fd_stream_writer_t * writer ) { - fd_memset( writer, 0, sizeof(fd_stream_writer_t) ); - return (void *)writer; +static inline ulong +fd_stream_writer_copy_max( fd_stream_writer_t * writer ) { + ulong const data_backp = writer->cr_byte_avail; + ulong const frag_backp = fd_ulong_sat_mul( writer->cr_frag_avail, writer->frag_sz_max ); + ulong const buf_avail = writer->data_max - writer->data_cur; + return fd_ulong_min( fd_ulong_min( data_backp, frag_backp ), buf_avail ); } FD_PROTOTYPES_END diff --git a/src/discof/restore/test_snapin_tile.c b/src/discof/restore/test_snapin_tile.c index 9cf0b60274..b7348cbda2 100644 --- a/src/discof/restore/test_snapin_tile.c +++ b/src/discof/restore/test_snapin_tile.c @@ -1,5 +1,76 @@ #define FD_TILE_TEST #include "fd_snapin_tile.c" +#include "stream/fd_stream_writer.h" + +static ulong +mock_stream_align( void ) { + return fd_ulong_max( fd_ulong_max( fd_stream_writer_align(), fd_mcache_align() ), fd_dcache_align() ); +} + +static ulong +mock_stream_footprint( ulong depth, + ulong dcache_data_sz ) { + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( 0UL ) ); + l = FD_LAYOUT_APPEND( l, fd_mcache_align(), fd_mcache_footprint( depth, 0uL ) ); + l = FD_LAYOUT_APPEND( l, fd_dcache_align(), fd_dcache_footprint( dcache_data_sz, 0UL ) ); + return l; +} + +static fd_stream_writer_t * +mock_stream_init( void * mem, + ulong depth, + ulong dcache_data_sz ) { + if( FD_UNLIKELY( !mem ) ) return NULL; + if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, mock_stream_align() ) ) ) return NULL; + + FD_SCRATCH_ALLOC_INIT( l, mem ); + void * 
writer_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_stream_writer_align(), fd_stream_writer_footprint( 0UL ) ); + void * mcache_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_mcache_align(), fd_mcache_footprint( depth, 0uL ) ); + void * dcache_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_dcache_align(), fd_dcache_footprint( dcache_data_sz, 0UL ) ); + + fd_frag_meta_t * mcache = fd_mcache_join( fd_mcache_new( mcache_mem, depth, 0UL, 0UL ) ); + uchar * dcache = fd_dcache_join( fd_dcache_new( dcache_mem, dcache_data_sz, 0UL ) ); + + return fd_stream_writer_new( writer_mem, 0UL, fd_type_pun( mcache ), dcache ); +} + +static void * +mock_stream_delete( fd_stream_writer_t * writer ) { + fd_dcache_delete( fd_dcache_leave( writer->data ) ); + fd_mcache_delete( fd_mcache_leave( fd_type_pun( writer->mcache ) ) ); + return fd_stream_writer_delete( writer ); +} + +/* Feed in snapshot stream frags and validate the resulting account + frags are sane. This variant tests handwritten edge cases. */ + +static void +test_account_frags( fd_wksp_t * wksp ) { + /* Create a snapin context */ + fd_topo_tile_t topo_tile = { + .name = "snapin", + .snapin = { + .scratch_sz = 4096UL + } + }; + void * tile_scratch = fd_wksp_alloc_laddr( wksp, scratch_align(), scratch_footprint( &topo_tile ), 1UL ); + FD_TEST( tile_scratch ); + fd_snapin_tile_t * ctx = scratch_init( tile_scratch, &topo_tile ); + FD_TEST( ctx ); + ctx->state = SNAP_STATE_ACCOUNT_HDR; + ctx->accv_sz = UINT_MAX; + + /* Create an input */ + void * in_stream_mem = fd_wksp_alloc_laddr( wksp, mock_stream_align(), mock_stream_footprint( 128UL, 4096UL ), 1UL ); + fd_stream_writer_t * in_stream = mock_stream_init( in_stream_mem, 128UL, 4096UL ); + FD_TEST( in_stream ); + + /* An empty account */ + + fd_wksp_free_laddr( mock_stream_delete( in_stream ) ); + fd_wksp_free_laddr( tile_scratch ); +} int main( int argc, @@ -9,18 +80,17 @@ main( int argc, char const * _page_sz = fd_env_strip_cmdline_cstr ( &argc, &argv, "--page-sz", NULL, "gigantic" ); ulong page_cnt = 
fd_env_strip_cmdline_ulong( &argc, &argv, "--page-cnt", NULL, 1UL ); ulong near_cpu = fd_env_strip_cmdline_ulong( &argc, &argv, "--near-cpu", NULL, fd_log_cpu_id() ); + uint rng_seed = fd_env_strip_cmdline_uint ( &argc, &argv, "--rng-seed", NULL, 0U ); + + fd_rng_t _rng[1]; fd_rng_t * rng = fd_rng_join( fd_rng_new( _rng, rng_seed, 0UL ) ); fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( _page_sz ), page_cnt, near_cpu, "wksp", 0UL ); if( FD_UNLIKELY( !wksp ) ) FD_LOG_ERR(( "Unable to attach to wksp" )); - fd_topo_tile_t topo_tile = { - .name = "snapin", - }; - - uchar * tile_scratch = fd_wksp_alloc_laddr( wksp, scratch_align(), scratch_footprint( &topo_tile ), 1UL ); - FD_TEST( tile_scratch ); + test_account_frags( wksp ); fd_wksp_delete_anonymous( wksp ); + fd_rng_delete( fd_rng_leave( rng ) ); FD_LOG_NOTICE(( "pass" )); fd_halt(); From e24efa39b2d1ff2ca93563e6da9728fa4bff8740 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Mon, 19 May 2025 16:34:18 +0000 Subject: [PATCH 31/34] snapin test --- src/discof/restore/Local.mk | 2 +- src/discof/restore/fd_snapin_tile.c | 8 ++--- src/discof/restore/stream/fd_stream_writer.c | 11 +++++-- src/discof/restore/test_snapin_tile.c | 33 ++++++++++++++++++-- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index 386678881e..cc9935b4b6 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -8,4 +8,4 @@ $(call add-objs,fd_httpdl_tile,fd_discof) $(call add-objs,stream/fd_stream_writer,fd_discof) $(call add-objs,stream/fd_event_map,fd_discof) $(call add-objs,stream/fd_stream_ctx,fd_discof) -$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_tango fd_util) +$(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_disco fd_flamenco fd_tango fd_ballet fd_util) diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index d9b0dbcc3a..549dff91c0 100644 --- 
a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -285,8 +285,8 @@ restore_file( void * restore_, static uchar const * snapshot_read_buffered( fd_snapin_tile_t * restore, - uchar const * buf, - ulong bufsz ) { + uchar const * buf, + ulong bufsz ) { /* Should not be called if read is complete */ FD_TEST( restore->buf_ctr < restore->buf_sz ); @@ -363,7 +363,7 @@ snapshot_read_account_hdr_chunk( fd_snapin_tile_t * restore, peek_sz = fd_ulong_min( restore->acc_rem, bufsz ); } - int eom = bufsz > restore->acc_rem; + int eom = bufsz >= restore->acc_rem; /* Publish header-only fragment or header+data fragment. If data was included, skip ahead. (Combining header+data into the @@ -805,7 +805,7 @@ on_stream_frag( fd_snapin_tile_t * ctx, FD_LOG_ERR(( "Failed to restore snapshot" )); } } - if( FD_UNLIKELY( ctx->out_seq >= ctx->out_seq_max ) ) { + if( FD_UNLIKELY( fd_seq_ge( ctx->out_seq, ctx->out_seq_max ) ) ) { consume_frag = 0; /* retry this frag */ ulong consumed_sz = (uint)( cur-start ); ctx->in_skip += consumed_sz; diff --git a/src/discof/restore/stream/fd_stream_writer.c b/src/discof/restore/stream/fd_stream_writer.c index 0ee8d03a26..f3fbf668ea 100644 --- a/src/discof/restore/stream/fd_stream_writer.c +++ b/src/discof/restore/stream/fd_stream_writer.c @@ -37,8 +37,8 @@ fd_stream_writer_new( void * mem, writer->base = (uchar *)fd_wksp_containing( dcache ); /* FIXME impure */ writer->goff = 0UL; - writer->cr_byte_avail = 0UL; - writer->cr_frag_avail = 0UL; + writer->cr_byte_avail = ULONG_MAX; + writer->cr_frag_avail = ULONG_MAX; writer->cons_seq = cons_seq; writer->cons_fseq = cons_fseq; @@ -68,6 +68,8 @@ fd_stream_writer_register_consumer( FD_LOG_WARNING(( "Can't register consumer, cons_max %lu exceeded", writer->cons_max )); return NULL; } + writer->cr_byte_avail = 0UL; + writer->cr_frag_avail = 0UL; ulong const cons_idx = writer->cons_cnt++; ulong * seq = writer->cons_seq + ( cons_idx*FD_STREAM_WRITER_CONS_SEQ_STRIDE ); @@ -143,8 
+145,11 @@ fd_stream_writer_copy( fd_stream_writer_t * writer, } ulong const frag_sz_max = writer->frag_sz_max; + if( FD_UNLIKELY( !frag_sz_max ) ) { + FD_LOG_ERR(( "zero frag_sz_max" )); + } int som = 1; - for(;;) { + while( data_sz ) { ulong const op_sz = fd_ulong_min( data_sz, frag_sz_max ); ulong const next_sz = data_sz-op_sz; int const eom = next_sz==0UL; diff --git a/src/discof/restore/test_snapin_tile.c b/src/discof/restore/test_snapin_tile.c index b7348cbda2..8e084a85ad 100644 --- a/src/discof/restore/test_snapin_tile.c +++ b/src/discof/restore/test_snapin_tile.c @@ -58,15 +58,44 @@ test_account_frags( fd_wksp_t * wksp ) { FD_TEST( tile_scratch ); fd_snapin_tile_t * ctx = scratch_init( tile_scratch, &topo_tile ); FD_TEST( ctx ); - ctx->state = SNAP_STATE_ACCOUNT_HDR; - ctx->accv_sz = UINT_MAX; + + void * out_mcache_mem = fd_wksp_alloc_laddr( wksp, fd_mcache_align(), fd_mcache_footprint( 128UL, 0UL ), 1UL ); + ctx->out_mcache = fd_type_pun( fd_mcache_join( fd_mcache_new( out_mcache_mem, 128UL, 0UL, 0UL ) ) ); + FD_TEST( ctx->out_mcache ); + ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); + ctx->out_seq_max = UINT_MAX; + + ctx->tar_file_rem = ULONG_MAX; + ctx->accv_sz = ULONG_MAX; + fd_snapshot_expect_account_hdr( ctx ); + uchar scratch_buf[ 256 ]; + ctx->buf = scratch_buf; + ctx->buf_max = sizeof(scratch_buf); /* Create an input */ void * in_stream_mem = fd_wksp_alloc_laddr( wksp, mock_stream_align(), mock_stream_footprint( 128UL, 4096UL ), 1UL ); fd_stream_writer_t * in_stream = mock_stream_init( in_stream_mem, 128UL, 4096UL ); FD_TEST( in_stream ); + fd_snapin_in_t in = { + .mcache = in_stream->mcache, + .depth = (uint)in_stream->depth, + .idx = 0U, + .seq = 0UL, + .goff = 0UL, + .mline = in_stream->mcache + }; + ctx->in_base = (uchar *)wksp; /* An empty account */ + fd_solana_account_hdr_t const acc1 = { .hash={ .uc={ 1,2,3 } } }; + fd_stream_writer_copy( in_stream, &acc1, sizeof(fd_solana_account_hdr_t), fd_frag_meta_ctl( 0, 1, 1, 0 ) ); + ulong 
read_sz; + FD_TEST( on_stream_frag( ctx, &in, in_stream->mcache+0, &read_sz )==1 ); + FD_TEST( ctx->out_mcache[ 0 ].seq==0UL ); + FD_TEST( ctx->out_mcache[ 0 ].sz==sizeof(fd_solana_account_hdr_t) ); + FD_TEST( ctx->out_mcache[ 0 ].ctl==fd_frag_meta_ctl( 0, 1, 1, 0 ) ); + FD_TEST( ctx->out_mcache[ 0 ].goff==0UL ); + FD_TEST( fd_memeq( ctx->in_base+ctx->out_mcache[ 0 ].loff, &acc1, sizeof(fd_solana_account_hdr_t) ) ); fd_wksp_free_laddr( mock_stream_delete( in_stream ) ); fd_wksp_free_laddr( tile_scratch ); From b67ca71ec5c6ad01fb21393bc89f9995d7f384f5 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 20 May 2025 08:06:21 +0000 Subject: [PATCH 32/34] fix --- src/discof/restore/stream/fd_stream_writer.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/discof/restore/stream/fd_stream_writer.h b/src/discof/restore/stream/fd_stream_writer.h index 188d1799fb..4d3243d6db 100644 --- a/src/discof/restore/stream/fd_stream_writer.h +++ b/src/discof/restore/stream/fd_stream_writer.h @@ -251,8 +251,9 @@ fd_stream_writer_publish( fd_stream_writer_t * writer, writer->cr_frag_avail -= 1; /* Advance buffer */ - writer->data_cur += frag_sz; - writer->goff += frag_sz; + writer->data_cur += frag_sz; + writer->goff += frag_sz; + writer->cr_byte_avail -= frag_sz; if( FD_UNLIKELY( writer->data_cur > writer->data_max ) ) { FD_LOG_CRIT(( "Out-of-bounds data_cur (data_cur=%lu data_max=%lu)", writer->data_cur, writer->data_max )); return; From b25fc75cba83d074f5499483d7997f8237b280f6 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 20 May 2025 08:06:29 +0000 Subject: [PATCH 33/34] shutdown improvement --- src/discof/restore/fd_snapin_tile.c | 28 +++++++++++--------- src/discof/restore/fd_unzstd_tile.c | 16 +++++------ src/discof/restore/stream/fd_stream_reader.h | 13 +++------ 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/discof/restore/fd_snapin_tile.c b/src/discof/restore/fd_snapin_tile.c index 549dff91c0..e14eb3bf2f 100644 --- 
a/src/discof/restore/fd_snapin_tile.c +++ b/src/discof/restore/fd_snapin_tile.c @@ -57,6 +57,7 @@ typedef struct fd_snapshot_accv_map fd_snapshot_accv_map_t; #define SNAP_FLAG_FAILED 1 #define SNAP_FLAG_BLOCKED 2 +#define SNAP_FLAG_DONE 4 struct fd_snapin_tile { uchar state; @@ -68,6 +69,7 @@ struct fd_snapin_tile { uchar const * in_base; ulong goff_translate; ulong in_skip; + ulong const * in_sync; /* Frame buffer */ @@ -102,7 +104,6 @@ struct fd_snapin_tile { ulong out_cnt; ulong out_depth; ulong seq; - ulong const volatile * shutdown_signal; }; typedef struct fd_snapin_tile fd_snapin_tile_t; @@ -120,21 +121,20 @@ struct fd_snapin_in { typedef struct fd_snapin_in fd_snapin_in_t; -__attribute__((noreturn)) static void +static void fd_snapin_shutdown( fd_snapin_tile_t * ctx ) { - ulong in_seq_max = FD_VOLATILE_CONST( *ctx->shutdown_signal ); - /* wait for zstd tile to set shutdown sequence number */ - while ( in_seq_max == 0 ) { - in_seq_max = FD_VOLATILE_CONST( *ctx->shutdown_signal ); - FD_SPIN_PAUSE(); - } + ctx->flags = SNAP_FLAG_DONE; - /* FIXME set final sequence number */ FD_MGAUGE_SET( TILE, STATUS, 2UL ); - FD_TEST( in_seq_max == ctx->seq+1 && in_seq_max != 0 ); - FD_COMPILER_MFENCE(); FD_LOG_WARNING(( "Finished parsing snapshot" )); + /* Send synchronization info */ + ulong volatile * out_sync = fd_mcache_seq_laddr( ctx->out_mcache->f ); + FD_COMPILER_MFENCE(); + FD_VOLATILE( out_sync[0] ) = ctx->out_seq; + FD_VOLATILE( out_sync[2] ) = 1; + FD_COMPILER_MFENCE(); + for(;;) pause(); } @@ -620,6 +620,7 @@ unprivileged_init( fd_topo_t * topo, FD_TEST( fd_dcache_join( fd_topo_obj_laddr( topo, topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ) ) ); ctx->in_base = (uchar const *)topo->workspaces[ topo->objs[ topo->links[ tile->in_link_id[ 0 ] ].dcache_obj_id ].wksp_id ].wksp; ctx->in_skip = 0UL; + ctx->in_sync = fd_mcache_seq_laddr_const( topo->links[ tile->in_link_id[ 0 ] ].mcache ); /* Join frame buffer */ @@ -633,7 +634,6 @@ unprivileged_init( fd_topo_t * 
topo, ctx->out_seq_max = 0UL; ctx->out_seq = 0UL; ctx->out_depth = fd_mcache_depth( ctx->out_mcache->f ); - ctx->shutdown_signal = fd_mcache_seq_laddr_const( topo->links[ tile->in_link_id[ 0 ] ].mcache ) + 2; } @@ -782,6 +782,10 @@ on_stream_frag( fd_snapin_tile_t * ctx, ulong * read_sz ) { if( FD_UNLIKELY( ctx->flags ) ) { if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_FAILED ) ) FD_LOG_ERR(( "Failed to restore snapshot" )); + if( FD_UNLIKELY( ctx->flags & SNAP_FLAG_DONE ) ) { + *read_sz = frag->sz; + return 1; + } return 0; } diff --git a/src/discof/restore/fd_unzstd_tile.c b/src/discof/restore/fd_unzstd_tile.c index 69883021c3..a22b9e061b 100644 --- a/src/discof/restore/fd_unzstd_tile.c +++ b/src/discof/restore/fd_unzstd_tile.c @@ -74,14 +74,14 @@ fd_unzstd_shutdown( fd_unzstd_tile_t * ctx ) { } static void -fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, - fd_unzstd_tile_t * ctx ) { - ulong shutdown_seq = fd_stream_reader_poll_shutdown( stream_ctx->in_ptrs[0] ); - if( FD_UNLIKELY( shutdown_seq ) ) { - FD_LOG_WARNING(( "zstd shutting down! in_seq_max is %lu in[0].base.seq is %lu", - shutdown_seq, stream_ctx->in[0].base.seq)); - fd_unzstd_shutdown( ctx ); - } +fd_unzstd_poll_shutdown( fd_stream_ctx_t * stream_ctx, + fd_unzstd_tile_t * ctx ) { + ulong const volatile * in_sync = stream_ctx->in_ptrs[ 0 ]->in_sync; + if( FD_LIKELY( !FD_VOLATILE_CONST( in_sync[ 2 ] ) ) ) return; + + FD_LOG_WARNING(( "zstd shutting down! 
in_seq_max is %lu in[0].base.seq is %lu", + FD_VOLATILE_CONST( in_sync[ 0 ] ), stream_ctx->in[0].base.seq )); + fd_unzstd_shutdown( ctx ); } static void diff --git a/src/discof/restore/stream/fd_stream_reader.h b/src/discof/restore/stream/fd_stream_reader.h index 8b322594f5..bf40a9787d 100644 --- a/src/discof/restore/stream/fd_stream_reader.h +++ b/src/discof/restore/stream/fd_stream_reader.h @@ -16,11 +16,11 @@ struct fd_stream_reader { uint accum[6]; }; - fd_frag_reader_t r[1]; + fd_frag_reader_t r[1]; /* FIXME strict aliasing violation on mcache */ } base; ulong goff; - ulong const volatile * shutdown_signal; + ulong const volatile * in_sync; }; typedef struct fd_stream_reader fd_stream_reader_t; @@ -43,8 +43,7 @@ fd_stream_reader_init( fd_stream_reader_t * reader, ulong in_idx ) { fd_frag_reader_init( reader->base.r, mcache, fseq, in_idx ); reader->goff = 0UL; - /* shutdown signal is located at fseq 2 */ - reader->shutdown_signal = fd_mcache_seq_laddr_const( reader->base.mcache->f ) + 2; + reader->in_sync = fd_mcache_seq_laddr_const( reader->base.mcache->f ); } static inline fd_stream_reader_t * @@ -97,12 +96,6 @@ fd_stream_reader_consume_frag( fd_stream_reader_t * reader, fd_frag_reader_consume_frag( reader->base.r, ctx ); } -static inline ulong -fd_stream_reader_poll_shutdown( fd_stream_reader_t * reader ) { - ulong const in_seq_max = FD_VOLATILE_CONST( *reader->shutdown_signal ); - return in_seq_max == reader->base.seq && in_seq_max != 0 ? 
- in_seq_max : 0UL; -} static inline void * fd_stream_reader_delete( fd_stream_reader_t * reader ) { From 3989fb341b225931bf1e0520622fdcd042a9acb1 Mon Sep 17 00:00:00 2001 From: Richard Patel Date: Tue, 20 May 2025 08:10:34 +0000 Subject: [PATCH 34/34] guard fixes --- src/discof/restore/Local.mk | 7 ++++++- src/funk/fd_funk_base.h | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/discof/restore/Local.mk b/src/discof/restore/Local.mk index cc9935b4b6..334526f06b 100644 --- a/src/discof/restore/Local.mk +++ b/src/discof/restore/Local.mk @@ -1,11 +1,16 @@ $(call add-objs,fd_filerd_tile,fd_discof) +ifdef FD_HAS_ZSTD $(call add-objs,fd_unzstd_tile,fd_discof) +endif +ifdef FD_HAS_INT128 $(call add-objs,fd_snapin_tile,fd_discof) $(call add-objs,fd_actalc_tile,fd_discof) +endif $(call add-objs,fd_actidx_tile,fd_discof) -$(call add-objs,fd_unzstd_tile,fd_discof) $(call add-objs,fd_httpdl_tile,fd_discof) $(call add-objs,stream/fd_stream_writer,fd_discof) $(call add-objs,stream/fd_event_map,fd_discof) $(call add-objs,stream/fd_stream_ctx,fd_discof) +ifdef FD_HAS_INT128 $(call make-unit-test,test_snapin_tile,test_snapin_tile,fd_discof fd_disco fd_flamenco fd_tango fd_ballet fd_util) +endif diff --git a/src/funk/fd_funk_base.h b/src/funk/fd_funk_base.h index 866cdc8e0e..068616fc5b 100644 --- a/src/funk/fd_funk_base.h +++ b/src/funk/fd_funk_base.h @@ -200,13 +200,22 @@ fd_funk_rec_key_hash( fd_funk_rec_key_t const * k, FIXME This version is vulnerable to HashDoS */ +FD_FN_PURE static inline ulong +fd_funk_rec_key_hash1( uchar const key[ 32 ], + ulong rec_type, + ulong seed ) { + seed ^= rec_type; + /* tons of ILP */ + return (fd_ulong_hash( seed ^ (1UL<<0) ^ FD_LOAD( ulong, key+ 0 ) ) ^ + fd_ulong_hash( seed ^ (1UL<<1) ^ FD_LOAD( ulong, key+ 8 ) ) ) ^ + (fd_ulong_hash( seed ^ (1UL<<2) ^ FD_LOAD( ulong, key+16 ) ) ^ + fd_ulong_hash( seed ^ (1UL<<3) ^ FD_LOAD( ulong, key+24 ) ) ); +} + FD_FN_PURE static inline ulong fd_funk_rec_key_hash( 
fd_funk_rec_key_t const * k, ulong seed ) { - seed ^= k->ul[4]; - /* tons of ILP */ - return (fd_ulong_hash( seed ^ (1UL<<0) ^ k->ul[0] ) ^ fd_ulong_hash( seed ^ (1UL<<1) ^ k->ul[1] ) ) ^ - (fd_ulong_hash( seed ^ (1UL<<2) ^ k->ul[2] ) ^ fd_ulong_hash( seed ^ (1UL<<3) ^ k->ul[3] ) ); + return fd_funk_rec_key_hash1( k->uc, k->ul[4], seed ); } #endif /* FD_HAS_INT128 */