Skip to content
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f71f674
feat(container-loader): add captureContainerPendingState free function
anthony-murphy Apr 20, 2026
d7a03b1
feat(container-loader): inline attachment blob contents in captureCon…
anthony-murphy Apr 20, 2026
49df699
feat(container-loader): make captureContainerPendingState reference-a…
anthony-murphy Apr 20, 2026
220a846
test(container-loader): unit-test captureReferencedContents helpers
anthony-murphy Apr 20, 2026
ebe5f7f
refactor(container-loader): rename captureContainerPendingState to ca…
anthony-murphy Apr 20, 2026
73d1df0
style(local-server-tests): biome auto-format captureFullContainerStat…
anthony-murphy Apr 20, 2026
8a748cc
fix(container-loader): address PR review feedback on captureFullConta…
anthony-murphy Apr 20, 2026
8d89e8e
fix(container-loader): bound concurrency in captureGroupIdSnapshots too
anthony-murphy Apr 20, 2026
2e5c3f7
fix(container-loader): preserve `this` in captureGroupIdSnapshots, sk…
anthony-murphy Apr 21, 2026
57a1a76
Merge remote-tracking branch 'origin/main' into full-container-state
markfields May 1, 2026
807f73f
build fixes
markfields May 5, 2026
aa4ce9c
Revert support for loading groups
markfields May 5, 2026
2e7f66c
TMP - working notes
markfields May 5, 2026
81d04de
Review feedback items
markfields May 5, 2026
a6bb56f
Revert mc addition
markfields May 5, 2026
e4065af
changeset
markfields May 5, 2026
1d5a51b
Merge branch 'main' into full-container-state
markfields May 5, 2026
64c6cc4
lint
markfields May 5, 2026
7739827
nit: mitigate theoretical V8 scale limitation
markfields May 5, 2026
cdda9ba
Fix utf-8 encoding bug and some nits
markfields May 5, 2026
1a4bcfa
formatting
markfields May 6, 2026
d095ae5
Fix trailing blobattach ops
markfields May 7, 2026
6e66f7b
PR feedback
markfields May 7, 2026
f4621c2
CI fix and note follow-up
markfields May 7, 2026
0062623
Remove MD file used while PR in progress
markfields May 7, 2026
c21964d
Merge branch 'main' into full-container-state
markfields May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/wide-foxes-behave.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@fluidframework/container-loader": minor
"@fluidframework/container-runtime": minor
"__section": feature
---
Add a new free function `captureFullContainerState` for serializing a container without loading it, so it can be rehydrated into a future session.

Check failure on line 6 in .changeset/wide-foxes-behave.md

View workflow job for this annotation

GitHub Actions / vale

[vale] reported by reviewdog 🐶 [Vale.Spelling] Did you really mean 'rehydrating'? Raw Output: {"message": "[Vale.Spelling] Did you really mean 'rehydrating'?", "location": {"path": ".changeset/wide-foxes-behave.md", "range": {"start": {"line": 6, "column": 103}}}, "severity": "ERROR"}

318 changes: 318 additions & 0 deletions full-container-state-review-notes.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
// @alpha @legacy
export function asLegacyAlpha(base: IContainer): ContainerAlpha;

// @alpha @legacy
export function captureFullContainerState(input: ICaptureFullContainerStateProps): Promise<string>;

// @public
export enum ConnectionState {
CatchingUp = 1,
Expand Down Expand Up @@ -44,6 +47,14 @@ export interface IBaseProtocolHandler {
snapshot(): IQuorumSnapshot;
}

// @alpha @legacy
export interface ICaptureFullContainerStateProps {
readonly documentServiceFactory: IDocumentServiceFactory;
readonly logger?: ITelemetryBaseLogger | undefined;
readonly request: IRequest;
readonly urlResolver: IUrlResolver;
}

// @beta @deprecated @legacy (undocumented)
export interface ICodeDetailsLoader extends Partial<IProvideFluidCodeDetailsComparer> {
load(source: IFluidCodeDetails): Promise<IFluidModuleWithDetails>;
Expand Down
339 changes: 339 additions & 0 deletions packages/loader/container-loader/src/captureReferencedContents.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,339 @@
/*!
* Copyright (c) Microsoft Corporation and contributors. All rights reserved.
* Licensed under the MIT License.
*/

import { bufferToString } from "@fluid-internal/client-utils";
import type {
IDocumentStorageService,
ISnapshot,
ISnapshotTree,
} from "@fluidframework/driver-definitions/internal";
import { readAndParse } from "@fluidframework/driver-utils/internal";

import type {
IBase64BlobContents,
ISerializableBlobContents,
} from "./containerStorageAdapter.js";

/**
* Wire-format constants this module needs to walk and filter snapshots.
* Authoritative definitions live in `container-runtime` and
* `runtime-definitions`; the values are duplicated here to avoid a
* loader → runtime layering dependency. A contract test in
* `packages/test/local-server-tests` asserts these match the authoritative
* values; do not change them in isolation.
*
* Authoritative sources:
* - `blobsTreeName`, `redirectTableBlobName`: `packages/runtime/container-runtime/src/blobManager/blobManagerSnapSum.ts`
* - `blobManagerBasePath`: `packages/runtime/container-runtime/src/blobManager/blobManager.ts`
* - `gcTreeKey`, `gcBlobPrefix`, `gcTombstoneBlobKey`, `gcDeletedBlobKey`: `packages/runtime/runtime-definitions/src/garbageCollectionDefinitions.ts`
*
* @internal
*/
export const wireFormatConstants = {
blobsTreeName: ".blobs",
redirectTableBlobName: ".redirectTable",
blobManagerBasePath: "_blobs",
gcTreeKey: "gc",
gcBlobPrefix: "__gc",
gcTombstoneBlobKey: "__tombstones",
gcDeletedBlobKey: "__deletedNodes",
} as const;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deep Review: Loader↔runtime constant duplication — gatekeeper nod ask, no code change.

This PR introduces a wireFormatConstants block in the loader mirroring seven runtime-side constants (.blobs, .redirectTable, _blobs, gc, __gc, __tombstones, __deletedNodes) and a cross-package deepStrictEqual contract test (wireFormatConstants.spec.ts). PR #21729 (anthony-murphy, 2024-07-02) was the deliberate refactor that produced these constants in their current form, with markfields reviewing for path-format discipline. This PR is the first cross-layer consumer; the chosen pattern (duplicate + contract test) sets precedent for any future shared wire-format constant.

Tag anthony-murphy for an explicit nod on the duplicate+contract-test layering choice, or merge with a clear note that AB#72934 will revisit (per thread #3192306888). No code change expected.


const {
blobsTreeName,
redirectTableBlobName,
blobManagerBasePath,
gcTreeKey,
gcBlobPrefix,
gcTombstoneBlobKey,
gcDeletedBlobKey,
} = wireFormatConstants;

/**
 * Per-node GC data as persisted inside a `__gc*` blob. Only the fields this
 * module reads are modeled here.
 */
interface IGcNodeData {
	// Routes (node paths) this node references.
	outboundRoutes: string[];
	// Present when GC has marked the node unreferenced; consumed by
	// collectUnreferencedBlobLocalIds to filter attachment blobs.
	unreferencedTimestampMs?: number;
}

/**
 * The gc-state portion of a GC snapshot: a map of node path → node data.
 */
interface IGcState {
	gcNodes: { [id: string]: IGcNodeData };
}

/**
 * The parsed subset of the `gc` subtree that drives reachability decisions.
 * Each field is `undefined` when the corresponding blob was absent from the
 * snapshot (see parseGcSnapshotData).
 */
export interface IGcSnapshotData {
	gcState: IGcState | undefined;
	tombstones: string[] | undefined;
	deletedNodes: string[] | undefined;
}

/** Reader that returns a blob's contents for a given storage id. */
type BlobReader = (id: string) => Promise<ArrayBufferLike>;

/**
 * Upper bound on concurrent `readBlob` calls. Driver/service back-pressure is
 * real for large documents, and unbounded `Promise.all` can trigger throttling
 * or spike memory. The value is a pragmatic middle ground — high enough to
 * keep a typical driver's request pipeline full, low enough to avoid storms.
 */
const maxReadConcurrency = 32;

/**
 * Runs `fn` over `items` with at most `limit` promises in flight. Preserves
 * input order on output (not that any caller depends on it today).
 *
 * @param items - Inputs to map. Elements that are `undefined` are mapped like
 * any other value.
 * @param limit - Maximum number of in-flight `fn` calls; values below 1 are
 * clamped to 1 so the map always makes progress.
 * @param fn - Async mapper applied to each element.
 * @returns Results in the same order as `items`.
 */
async function mapWithConcurrency<T, R>(
	items: readonly T[],
	limit: number,
	fn: (item: T) => Promise<R>,
): Promise<R[]> {
	// `new Array<R>` keeps the element type honest (Array.from({length})
	// infers unknown[] and relied on an unsound assignment).
	const results = new Array<R>(items.length);
	let cursor = 0;
	// Clamp to at least one worker: a non-positive limit previously spawned
	// zero workers and silently resolved with an array of holes.
	const workerCount = Math.min(Math.max(1, limit), items.length);
	const workers = Array.from({ length: workerCount }, async () => {
		while (cursor < items.length) {
			// No await between the bounds check and the increment, so each
			// index is claimed by exactly one worker.
			const index = cursor++;
			// `index` is in range by construction, so call `fn` unconditionally.
			// The old `item !== undefined` guard silently skipped elements that
			// were legitimately `undefined`, leaving holes in the result.
			// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
			results[index] = await fn(items[index]!);
		}
	});
	await Promise.all(workers);
	return results;
}

/**
* Parses the `gc` subtree of a base snapshot. Returns `undefined` if the
* snapshot has no GC tree (GC disabled or pre-GC document).
*/
export async function parseGcSnapshotData(
baseSnapshot: ISnapshotTree,
storage: Pick<IDocumentStorageService, "readBlob">,
): Promise<IGcSnapshotData | undefined> {
const gcSnapshotTree: ISnapshotTree | undefined = baseSnapshot.trees[gcTreeKey];
if (gcSnapshotTree === undefined) {
return undefined;
}
let gcState: IGcState | undefined;
let tombstones: string[] | undefined;
let deletedNodes: string[] | undefined;
for (const [key, blobId] of Object.entries(gcSnapshotTree.blobs)) {
if (key === gcDeletedBlobKey) {
deletedNodes = await readAndParse<string[]>(storage, blobId);
} else if (key === gcTombstoneBlobKey) {
tombstones = await readAndParse<string[]>(storage, blobId);
} else if (key.startsWith(gcBlobPrefix)) {
const partial = await readAndParse<IGcState>(storage, blobId);
if (gcState === undefined) {
gcState = { gcNodes: { ...partial.gcNodes } };
} else {
for (const [nodeId, nodeData] of Object.entries(partial.gcNodes)) {
gcState.gcNodes[nodeId] ??= nodeData;
}
}
}
}
return { gcState, tombstones, deletedNodes };
}

/**
 * Walks a snapshot and inlines the contents of every blob reachable without
 * crossing an `unreferenced` subtree boundary. Subtrees flagged
 * `unreferenced: true` are skipped entirely — the summarizer sets that flag
 * from GC state, so honoring it filters out dead subtrees without a separate
 * GC-path traversal.
 *
 * The root-level `.blobs` subtree is special-cased: only its `.redirectTable`
 * blob is read, because attachment blob contents are captured separately via
 * {@link captureReferencedAttachmentBlobs}.
 */
export async function readReferencedSnapshotBlobs(
	snapshot: ISnapshot | ISnapshotTree,
	storage: Pick<IDocumentStorageService, "readBlob">,
): Promise<ISerializableBlobContents> {
	const { tree, read } = toTreeAndReader(snapshot, storage);
	const referencedIds = new Set<string>();
	collectReferencedBlobIds(tree, true, referencedIds);
	const contents: ISerializableBlobContents = {};
	await mapWithConcurrency([...referencedIds], maxReadConcurrency, async (blobId) => {
		contents[blobId] = bufferToString(await read(blobId), "utf8");
	});
	return contents;
}

/**
* Synchronously walks the snapshot tree and gathers the set of blob ids that
* should be inlined. Subtrees flagged `unreferenced: true` are skipped
* entirely. The root-level `.blobs` subtree is special-cased: only its
* `.redirectTable` id is collected, because attachment blob contents are
* captured separately via {@link captureReferencedAttachmentBlobs}.
*/
function collectReferencedBlobIds(
tree: ISnapshotTree,
isRoot: boolean,
ids: Set<string>,
): void {
if (tree.unreferenced === true) {
return;
}
for (const blobId of Object.values(tree.blobs)) {
ids.add(blobId);
}
for (const [key, subTree] of Object.entries(tree.trees)) {
if (isRoot && key === blobsTreeName) {
const tableBlobId = subTree.blobs[redirectTableBlobName];
if (tableBlobId !== undefined) {
ids.add(tableBlobId);
}
} else {
collectReferencedBlobIds(subTree, false, ids);
}
Comment thread
markfields marked this conversation as resolved.
}
}

/**
 * Normalizes either snapshot flavor into a bare tree plus a blob reader.
 * For a full `ISnapshot`, blobs already present in `blobContents` are served
 * from memory with `storage.readBlob` as the fallback; a bare `ISnapshotTree`
 * always reads from storage.
 */
function toTreeAndReader(
	snapshot: ISnapshot | ISnapshotTree,
	storage: Pick<IDocumentStorageService, "readBlob">,
): { tree: ISnapshotTree; read: BlobReader } {
	if (!("snapshotTree" in snapshot)) {
		return { tree: snapshot, read: async (blobId) => storage.readBlob(blobId) };
	}
	const { snapshotTree, blobContents } = snapshot;
	const read: BlobReader = async (blobId) =>
		blobContents.get(blobId) ?? storage.readBlob(blobId);
	return { tree: snapshotTree, read };
}

/**
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deep Review: UTF-8/base64 encoding split — process sign-off ask, no code change.

captureReferencedAttachmentBlobs writes base64 (bufferToString(buffer, "base64")) into a new IBase64BlobContents; readReferencedSnapshotBlobs writes UTF-8 into ISerializableBlobContents (serializedStateManager.ts:284-299, decode in containerStorageAdapter.ts:515-527). The split is technically correct and aligns with the runtime's existing pending-blob serializer (blobManager.ts:340-343, :951-960); the non-UTF-8 round-trip e2e test at captureFullContainerState.spec.ts:351-427 covers the binary case.

Process concern: vladsud's "we should stick to one format across layers for blobs" position from PR #20507 (with anthony-murphy explicitly looped in) hasn't been revisited on the record. vladsud is already requested; please also tag anthony-murphy so the area's "one format" stance is consciously superseded rather than silently bypassed. The binary-safety argument for the split is sound — no code change expected.

* Fetches attachment blob contents from a snapshot, filtered by GC
* reachability. Blobs GC has explicitly marked unreferenced, tombstoned, or
* deleted are skipped. Blobs absent from the GC graph are kept — GC state
* lags behind recent attachments and dropping them would lose live data.
* If `gcData` is `undefined`, every attachment blob is returned.
*
* The returned map is keyed by attachment blob storage id. Values are the
* raw bytes encoded as **base64** strings — attachment blobs may carry
* arbitrary binary payloads (images, encrypted data, etc.) and a
* UTF-8 round-trip would silently corrupt non-UTF-8 byte sequences with
* replacement characters. The runtime's own pending-blob serializer uses
* base64 for the same reason. This diverges from the structural-blob path
* in {@link readReferencedSnapshotBlobs}, which encodes UTF-8 because those
* blobs are JSON or other text the runtime authored. Callers must keep the
* two encodings on separate fields of the pending state so the load side
* can decode each correctly.
*/
export async function captureReferencedAttachmentBlobs(
baseSnapshot: ISnapshotTree,
storage: Pick<IDocumentStorageService, "readBlob">,
gcData: IGcSnapshotData | undefined,
): Promise<IBase64BlobContents> {
const blobsTree: ISnapshotTree | undefined = baseSnapshot.trees[blobsTreeName];
if (blobsTree === undefined) {
return {};
}
const localIdToStorageId = await readRedirectTable(blobsTree, storage);
if (localIdToStorageId.size === 0) {
return {};
}

const unreferencedLocalIds =
gcData === undefined ? undefined : collectUnreferencedBlobLocalIds(gcData);

const storageIdsToFetch = new Set<string>();
for (const [localId, storageId] of localIdToStorageId) {
if (unreferencedLocalIds?.has(localId) !== true) {
storageIdsToFetch.add(storageId);
}
}

const contents: IBase64BlobContents = {};
await mapWithConcurrency([...storageIdsToFetch], maxReadConcurrency, async (storageId) => {
Comment thread
markfields marked this conversation as resolved.
const buffer = await storage.readBlob(storageId);
contents[storageId] = bufferToString(buffer, "base64");
});
return contents;
}

/**
* Reconstructs the BlobManager's redirect table from a `.blobs` subtree.
* Mirrors `toRedirectTable` in blobManagerSnapSum.ts.
*/
async function readRedirectTable(
blobsTree: ISnapshotTree,
storage: Pick<IDocumentStorageService, "readBlob">,
): Promise<Map<string, string>> {
const redirectTable = new Map<string, string>();
const tableBlobId: string | undefined = blobsTree.blobs[redirectTableBlobName];
if (tableBlobId !== undefined) {
const entries = await readAndParse<[string, string][]>(storage, tableBlobId);
for (const [localId, storageId] of entries) {
redirectTable.set(localId, storageId);
}
}
for (const [key, storageId] of Object.entries(blobsTree.blobs)) {
if (key !== redirectTableBlobName) {
// Identity mapping: storage ids referenced directly in handles (legacy).
redirectTable.set(storageId, storageId);
Comment thread
markfields marked this conversation as resolved.
}
}
return redirectTable;
}

/**
 * Collects the blob localIds that GC has explicitly ruled out: gc nodes under
 * the blob manager's path carrying an `unreferencedTimestampMs`, plus
 * anything tombstoned or deleted. Tombstone/deleted lists are honored even
 * when `gcState` itself is absent — they are authoritative on their own and
 * must not be silently dropped.
 */
function collectUnreferencedBlobLocalIds(gcData: IGcSnapshotData): Set<string> {
	const prefix = `/${blobManagerBasePath}/`;
	// Returns the localId when the path lives under the blob manager's
	// subtree, otherwise undefined.
	const localIdOf = (nodePath: string): string | undefined =>
		nodePath.startsWith(prefix) ? nodePath.slice(prefix.length) : undefined;

	const unreferenced = new Set<string>();
	for (const [nodePath, nodeData] of Object.entries(gcData.gcState?.gcNodes ?? {})) {
		const localId = localIdOf(nodePath);
		if (localId !== undefined && nodeData.unreferencedTimestampMs !== undefined) {
			unreferenced.add(localId);
		}
	}
	for (const nodePath of [...(gcData.tombstones ?? []), ...(gcData.deletedNodes ?? [])]) {
		const localId = localIdOf(nodePath);
		if (localId !== undefined) {
			unreferenced.add(localId);
		}
	}
	return unreferenced;
}

/**
 * Returns true if any referenced subtree of `baseSnapshot` declares a
 * `loadingGroupId`. Subtrees flagged `unreferenced` are ignored — a dead
 * subtree's groupId would not be loaded by the runtime either.
 *
 * `captureFullContainerState` does not yet support loading groups:
 * prefetching per-group snapshots adds a code path that has no end-to-end
 * coverage and no known production consumer. Callers use this to fail fast
 * with a `UsageError` rather than silently producing a pending state that
 * omits group data.
 */
export function snapshotHasLoadingGroups(baseSnapshot: ISnapshotTree): boolean {
	if (baseSnapshot.unreferenced === true) {
		return false;
	}
	return (
		baseSnapshot.groupId !== undefined ||
		Object.values(baseSnapshot.trees).some((subtree) => snapshotHasLoadingGroups(subtree))
	);
}
Loading
Loading