-
Notifications
You must be signed in to change notification settings - Fork 576
feat(container-loader): add captureFullContainerState free function #27220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 20 commits
f71f674
d7a03b1
49df699
220a846
ebe5f7f
73d1df0
8a748cc
8d89e8e
2e5c3f7
57a1a76
807f73f
aa4ce9c
2e7f66c
81d04de
a6bb56f
e4065af
1d5a51b
64c6cc4
7739827
cdda9ba
1a4bcfa
d095ae5
6e66f7b
f4621c2
0062623
c21964d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| --- | ||
| "@fluidframework/container-loader": minor | ||
| "@fluidframework/container-runtime": minor | ||
| "__section": feature | ||
| --- | ||
| Add a new free function `captureFullContainerState` for serializing a container without loading it, so that the captured state can be rehydrated into a future session. | ||
|
Check failure on line 6 in .changeset/wide-foxes-behave.md
|
||
|
|
||
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,339 @@ | ||
| /*! | ||
| * Copyright (c) Microsoft Corporation and contributors. All rights reserved. | ||
| * Licensed under the MIT License. | ||
| */ | ||
|
|
||
| import { bufferToString } from "@fluid-internal/client-utils"; | ||
| import type { | ||
| IDocumentStorageService, | ||
| ISnapshot, | ||
| ISnapshotTree, | ||
| } from "@fluidframework/driver-definitions/internal"; | ||
| import { readAndParse } from "@fluidframework/driver-utils/internal"; | ||
|
|
||
| import type { | ||
| IBase64BlobContents, | ||
| ISerializableBlobContents, | ||
| } from "./containerStorageAdapter.js"; | ||
|
|
||
| /** | ||
| * Wire-format constants this module needs to walk and filter snapshots. | ||
| * Authoritative definitions live in `container-runtime` and | ||
| * `runtime-definitions`; the values are duplicated here to avoid a | ||
| * loader → runtime layering dependency. A contract test in | ||
| * `packages/test/local-server-tests` asserts these match the authoritative | ||
| * values; do not change them in isolation. | ||
| * | ||
| * Authoritative sources: | ||
| * - `blobsTreeName`, `redirectTableBlobName`: `packages/runtime/container-runtime/src/blobManager/blobManagerSnapSum.ts` | ||
| * - `blobManagerBasePath`: `packages/runtime/container-runtime/src/blobManager/blobManager.ts` | ||
| * - `gcTreeKey`, `gcBlobPrefix`, `gcTombstoneBlobKey`, `gcDeletedBlobKey`: `packages/runtime/runtime-definitions/src/garbageCollectionDefinitions.ts` | ||
| * | ||
| * @internal | ||
| */ | ||
| export const wireFormatConstants = { | ||
| blobsTreeName: ".blobs", | ||
| redirectTableBlobName: ".redirectTable", | ||
| blobManagerBasePath: "_blobs", | ||
| gcTreeKey: "gc", | ||
| gcBlobPrefix: "__gc", | ||
| gcTombstoneBlobKey: "__tombstones", | ||
| gcDeletedBlobKey: "__deletedNodes", | ||
| } as const; | ||
|
|
||
| const { | ||
| blobsTreeName, | ||
| redirectTableBlobName, | ||
| blobManagerBasePath, | ||
| gcTreeKey, | ||
| gcBlobPrefix, | ||
| gcTombstoneBlobKey, | ||
| gcDeletedBlobKey, | ||
| } = wireFormatConstants; | ||
|
|
||
| interface IGcNodeData { | ||
| outboundRoutes: string[]; | ||
| unreferencedTimestampMs?: number; | ||
| } | ||
|
|
||
| interface IGcState { | ||
| gcNodes: { [id: string]: IGcNodeData }; | ||
| } | ||
|
|
||
| /** | ||
| * The parsed subset of the `gc` subtree that drives reachability decisions. | ||
| */ | ||
| export interface IGcSnapshotData { | ||
| gcState: IGcState | undefined; | ||
| tombstones: string[] | undefined; | ||
| deletedNodes: string[] | undefined; | ||
| } | ||
|
|
||
| /** Reader that returns a blob's contents for a given storage id. */ | ||
| type BlobReader = (id: string) => Promise<ArrayBufferLike>; | ||
|
|
||
| /** | ||
| * Upper bound on concurrent `readBlob` calls. Driver/service back-pressure is | ||
| * real for large documents, and unbounded `Promise.all` can trigger throttling | ||
| * or spike memory. The value is a pragmatic middle ground — high enough to | ||
| * keep a typical driver's request pipeline full, low enough to avoid storms. | ||
| */ | ||
| const maxReadConcurrency = 32; | ||
|
|
||
| /** | ||
| * Runs `fn` over `items` with at most `limit` promises in flight. Preserves | ||
| * input order on output (not that any caller depends on it today). | ||
| */ | ||
| async function mapWithConcurrency<T, R>( | ||
| items: readonly T[], | ||
| limit: number, | ||
| fn: (item: T) => Promise<R>, | ||
| ): Promise<R[]> { | ||
| const results: R[] = Array.from({ length: items.length }); | ||
| let cursor = 0; | ||
| const workerCount = Math.min(limit, items.length); | ||
| const workers = Array.from({ length: workerCount }, async () => { | ||
| while (cursor < items.length) { | ||
| const index = cursor++; | ||
| const item = items[index]; | ||
| if (item !== undefined) { | ||
| results[index] = await fn(item); | ||
| } | ||
| } | ||
| }); | ||
| await Promise.all(workers); | ||
| return results; | ||
| } | ||
|
|
||
| /** | ||
| * Parses the `gc` subtree of a base snapshot. Returns `undefined` if the | ||
| * snapshot has no GC tree (GC disabled or pre-GC document). | ||
| */ | ||
| export async function parseGcSnapshotData( | ||
| baseSnapshot: ISnapshotTree, | ||
| storage: Pick<IDocumentStorageService, "readBlob">, | ||
| ): Promise<IGcSnapshotData | undefined> { | ||
| const gcSnapshotTree: ISnapshotTree | undefined = baseSnapshot.trees[gcTreeKey]; | ||
| if (gcSnapshotTree === undefined) { | ||
| return undefined; | ||
| } | ||
| let gcState: IGcState | undefined; | ||
| let tombstones: string[] | undefined; | ||
| let deletedNodes: string[] | undefined; | ||
| for (const [key, blobId] of Object.entries(gcSnapshotTree.blobs)) { | ||
| if (key === gcDeletedBlobKey) { | ||
| deletedNodes = await readAndParse<string[]>(storage, blobId); | ||
| } else if (key === gcTombstoneBlobKey) { | ||
| tombstones = await readAndParse<string[]>(storage, blobId); | ||
| } else if (key.startsWith(gcBlobPrefix)) { | ||
| const partial = await readAndParse<IGcState>(storage, blobId); | ||
| if (gcState === undefined) { | ||
| gcState = { gcNodes: { ...partial.gcNodes } }; | ||
| } else { | ||
| for (const [nodeId, nodeData] of Object.entries(partial.gcNodes)) { | ||
| gcState.gcNodes[nodeId] ??= nodeData; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| return { gcState, tombstones, deletedNodes }; | ||
| } | ||
|
|
||
| /** | ||
| * Walks a snapshot and inlines the contents of every blob reachable without | ||
| * crossing an `unreferenced` subtree boundary. Subtrees flagged | ||
| * `unreferenced: true` are skipped entirely — the summarizer sets that flag | ||
| * from GC state, so honouring it filters out dead subtrees without a | ||
| * separate GC-path traversal. | ||
| * | ||
| * The root-level `.blobs` subtree is special-cased: only its `.redirectTable` | ||
| * blob is read, because attachment blob contents are captured separately via | ||
| * {@link captureReferencedAttachmentBlobs}. | ||
| */ | ||
| export async function readReferencedSnapshotBlobs( | ||
| snapshot: ISnapshot | ISnapshotTree, | ||
| storage: Pick<IDocumentStorageService, "readBlob">, | ||
| ): Promise<ISerializableBlobContents> { | ||
| const { tree, read } = toTreeAndReader(snapshot, storage); | ||
| const ids = new Set<string>(); | ||
| collectReferencedBlobIds(tree, true, ids); | ||
| const blobs: ISerializableBlobContents = {}; | ||
| await mapWithConcurrency([...ids], maxReadConcurrency, async (id) => { | ||
| const data = await read(id); | ||
| blobs[id] = bufferToString(data, "utf8"); | ||
| }); | ||
| return blobs; | ||
| } | ||
|
|
||
| /** | ||
| * Synchronously walks the snapshot tree and gathers the set of blob ids that | ||
| * should be inlined. Subtrees flagged `unreferenced: true` are skipped | ||
| * entirely. The root-level `.blobs` subtree is special-cased: only its | ||
| * `.redirectTable` id is collected, because attachment blob contents are | ||
| * captured separately via {@link captureReferencedAttachmentBlobs}. | ||
| */ | ||
| function collectReferencedBlobIds( | ||
| tree: ISnapshotTree, | ||
| isRoot: boolean, | ||
| ids: Set<string>, | ||
| ): void { | ||
| if (tree.unreferenced === true) { | ||
| return; | ||
| } | ||
| for (const blobId of Object.values(tree.blobs)) { | ||
| ids.add(blobId); | ||
| } | ||
| for (const [key, subTree] of Object.entries(tree.trees)) { | ||
| if (isRoot && key === blobsTreeName) { | ||
| const tableBlobId = subTree.blobs[redirectTableBlobName]; | ||
| if (tableBlobId !== undefined) { | ||
| ids.add(tableBlobId); | ||
| } | ||
| } else { | ||
| collectReferencedBlobIds(subTree, false, ids); | ||
| } | ||
|
markfields marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| function toTreeAndReader( | ||
| snapshot: ISnapshot | ISnapshotTree, | ||
| storage: Pick<IDocumentStorageService, "readBlob">, | ||
| ): { tree: ISnapshotTree; read: BlobReader } { | ||
| if ("snapshotTree" in snapshot) { | ||
| const blobContents = snapshot.blobContents; | ||
| return { | ||
| tree: snapshot.snapshotTree, | ||
| read: async (id) => blobContents.get(id) ?? storage.readBlob(id), | ||
| }; | ||
| } | ||
| return { tree: snapshot, read: async (id) => storage.readBlob(id) }; | ||
| } | ||
|
|
||
| /** | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deep Review: UTF-8/base64 encoding split — process sign-off ask, no code change.
Process concern: vladsud's "we should stick to one format across layers for blobs" position from PR #20507 (with anthony-murphy explicitly looped in) hasn't been revisited on the record. vladsud is already requested; please also tag anthony-murphy so the area's "one format" stance is consciously superseded rather than silently bypassed. The binary-safety argument for the split is sound — no code change expected. |
||
| * Fetches attachment blob contents from a snapshot, filtered by GC | ||
| * reachability. Blobs GC has explicitly marked unreferenced, tombstoned, or | ||
| * deleted are skipped. Blobs absent from the GC graph are kept — GC state | ||
| * lags behind recent attachments and dropping them would lose live data. | ||
| * If `gcData` is `undefined`, every attachment blob is returned. | ||
| * | ||
| * The returned map is keyed by attachment blob storage id. Values are the | ||
| * raw bytes encoded as **base64** strings — attachment blobs may carry | ||
| * arbitrary binary payloads (images, encrypted data, etc.) and a | ||
| * UTF-8 round-trip would silently corrupt non-UTF-8 byte sequences with | ||
| * replacement characters. The runtime's own pending-blob serializer uses | ||
| * base64 for the same reason. This diverges from the structural-blob path | ||
| * in {@link readReferencedSnapshotBlobs}, which encodes UTF-8 because those | ||
| * blobs are JSON or other text the runtime authored. Callers must keep the | ||
| * two encodings on separate fields of the pending state so the load side | ||
| * can decode each correctly. | ||
| */ | ||
| export async function captureReferencedAttachmentBlobs( | ||
| baseSnapshot: ISnapshotTree, | ||
| storage: Pick<IDocumentStorageService, "readBlob">, | ||
| gcData: IGcSnapshotData | undefined, | ||
| ): Promise<IBase64BlobContents> { | ||
| const blobsTree: ISnapshotTree | undefined = baseSnapshot.trees[blobsTreeName]; | ||
| if (blobsTree === undefined) { | ||
| return {}; | ||
| } | ||
| const localIdToStorageId = await readRedirectTable(blobsTree, storage); | ||
| if (localIdToStorageId.size === 0) { | ||
| return {}; | ||
| } | ||
|
|
||
| const unreferencedLocalIds = | ||
| gcData === undefined ? undefined : collectUnreferencedBlobLocalIds(gcData); | ||
|
|
||
| const storageIdsToFetch = new Set<string>(); | ||
| for (const [localId, storageId] of localIdToStorageId) { | ||
| if (unreferencedLocalIds?.has(localId) !== true) { | ||
| storageIdsToFetch.add(storageId); | ||
| } | ||
| } | ||
|
|
||
| const contents: IBase64BlobContents = {}; | ||
| await mapWithConcurrency([...storageIdsToFetch], maxReadConcurrency, async (storageId) => { | ||
|
markfields marked this conversation as resolved.
|
||
| const buffer = await storage.readBlob(storageId); | ||
| contents[storageId] = bufferToString(buffer, "base64"); | ||
| }); | ||
| return contents; | ||
| } | ||
|
|
||
| /** | ||
| * Reconstructs the BlobManager's redirect table from a `.blobs` subtree. | ||
| * Mirrors `toRedirectTable` in blobManagerSnapSum.ts. | ||
| */ | ||
| async function readRedirectTable( | ||
| blobsTree: ISnapshotTree, | ||
| storage: Pick<IDocumentStorageService, "readBlob">, | ||
| ): Promise<Map<string, string>> { | ||
| const redirectTable = new Map<string, string>(); | ||
| const tableBlobId: string | undefined = blobsTree.blobs[redirectTableBlobName]; | ||
| if (tableBlobId !== undefined) { | ||
| const entries = await readAndParse<[string, string][]>(storage, tableBlobId); | ||
| for (const [localId, storageId] of entries) { | ||
| redirectTable.set(localId, storageId); | ||
| } | ||
| } | ||
| for (const [key, storageId] of Object.entries(blobsTree.blobs)) { | ||
| if (key !== redirectTableBlobName) { | ||
| // Identity mapping: storage ids referenced directly in handles (legacy). | ||
| redirectTable.set(storageId, storageId); | ||
|
markfields marked this conversation as resolved.
|
||
| } | ||
| } | ||
| return redirectTable; | ||
| } | ||
|
|
||
| /** | ||
| * Collects the set of blob localIds that GC has explicitly marked as | ||
| * unreferenced (via `unreferencedTimestampMs` on a gc node), tombstoned, or | ||
| * deleted. Tombstones and deletedNodes are applied regardless of whether | ||
| * `gcState` is present — they are authoritative on their own and must not | ||
| * be silently dropped when gc state is absent but tombstone/deleted lists | ||
| * exist. | ||
| */ | ||
| function collectUnreferencedBlobLocalIds(gcData: IGcSnapshotData): Set<string> { | ||
| const blobPathPrefix = `/${blobManagerBasePath}/`; | ||
| const unreferenced = new Set<string>(); | ||
| if (gcData.gcState !== undefined) { | ||
| for (const [nodePath, nodeData] of Object.entries(gcData.gcState.gcNodes)) { | ||
| if ( | ||
| nodePath.startsWith(blobPathPrefix) && | ||
| nodeData.unreferencedTimestampMs !== undefined | ||
| ) { | ||
| unreferenced.add(nodePath.slice(blobPathPrefix.length)); | ||
| } | ||
| } | ||
| } | ||
| for (const nodePath of [...(gcData.tombstones ?? []), ...(gcData.deletedNodes ?? [])]) { | ||
| if (nodePath.startsWith(blobPathPrefix)) { | ||
| unreferenced.add(nodePath.slice(blobPathPrefix.length)); | ||
| } | ||
| } | ||
| return unreferenced; | ||
| } | ||
|
|
||
| /** | ||
| * Returns true if any referenced subtree of `baseSnapshot` declares a | ||
| * `loadingGroupId`. Subtrees flagged `unreferenced` are skipped — a dead | ||
|
markfields marked this conversation as resolved.
Outdated
|
||
| * subtree's groupId would not be loaded by the runtime either. | ||
| * | ||
| * `captureFullContainerState` does not yet support loading groups: prefetching | ||
| * per-group snapshots adds a code path that has no end-to-end coverage and no | ||
| * known production consumer. Callers use this to fail fast with a `UsageError` | ||
| * rather than silently producing a pending state that omits group data. | ||
| */ | ||
| export function snapshotHasLoadingGroups(baseSnapshot: ISnapshotTree): boolean { | ||
| if (baseSnapshot.unreferenced === true) { | ||
| return false; | ||
| } | ||
| if (baseSnapshot.groupId !== undefined) { | ||
| return true; | ||
| } | ||
| for (const child of Object.values(baseSnapshot.trees)) { | ||
| if (snapshotHasLoadingGroups(child)) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deep Review: Loader↔runtime constant duplication — gatekeeper nod ask, no code change.
This PR introduces a `wireFormatConstants` block in the loader mirroring seven runtime-side constants (`.blobs`, `.redirectTable`, `_blobs`, `gc`, `__gc`, `__tombstones`, `__deletedNodes`) and a cross-package `deepStrictEqual` contract test (`wireFormatConstants.spec.ts`). PR #21729 (anthony-murphy, 2024-07-02) was the deliberate refactor that produced these constants in their current form, with markfields reviewing for path-format discipline. This PR is the first cross-layer consumer; the chosen pattern (duplicate + contract test) sets precedent for any future shared wire-format constant. Tag anthony-murphy for an explicit nod on the duplicate+contract-test layering choice, or merge with a clear note that AB#72934 will revisit (per thread #3192306888). No code change expected.