-
Notifications
You must be signed in to change notification settings - Fork 576
feat(tree): internals for field batch format with specialized node shapes #27200
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -40,14 +40,17 @@ import { | |||||||
| import type { IncrementalDecoder } from "./codecs.js"; | ||||||||
| import { | ||||||||
| type EncodedAnyShape, | ||||||||
| type EncodedChunkShapeV1OrV2, | ||||||||
| type EncodedChunkShape, | ||||||||
| type EncodedChunkShapeV2, | ||||||||
| type EncodedChunkShapeVTextExperimental, | ||||||||
| type EncodedFieldBatchV1OrV2, | ||||||||
| type EncodedFieldBatchV2, | ||||||||
| type EncodedFieldShape, | ||||||||
| type EncodedIncrementalChunkShape, | ||||||||
| type EncodedInlineArrayShape, | ||||||||
| type EncodedNestedArrayShape, | ||||||||
| type EncodedNodeShape, | ||||||||
| type EncodedSpecializedNodeShape, | ||||||||
| type EncodedValueShape, | ||||||||
| SpecialField, | ||||||||
| supportsIncrementalEncoding, | ||||||||
|
|
@@ -76,9 +79,120 @@ export function decode( | |||||||
| ); | ||||||||
| } | ||||||||
|
|
||||||||
| /** | ||||||||
| * Resolves `shapeIndex` to a fully-merged {@link EncodedNodeShape}, normalizing away any | ||||||||
| * specialized node shapes (`f`) along the way by applying their overlays via | ||||||||
| * {@link applySpecialization} until a concrete node shape is reached. | ||||||||
| * | ||||||||
| * @remarks | ||||||||
| * Exported for testing. | ||||||||
| */ | ||||||||
| export function normalizeToNodeShape( | ||||||||
| shapeIndex: number, | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For clarity, use our ShapeIndex type alias for number here and in the set below. |
||||||||
| context: DecoderContext<EncodedChunkShape>, | ||||||||
| visited: Set<number> = new Set(), | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extending the API docs to cover parameters, and noting why this exists and how most callers don't need to provide it would be good. Maybe a more specific name for it would help too: our codec does have logic to traverse and visit all shape which this might be confused with. |
||||||||
| ): EncodedNodeShape { | ||||||||
| assert(!visited.has(shapeIndex), "cyclic specialized node shape chain"); | ||||||||
| visited.add(shapeIndex); | ||||||||
| const encoded = context.shapes[shapeIndex]; | ||||||||
| assert(encoded !== undefined, "shape index out of bounds"); | ||||||||
| // Persisted shape variants are discriminated by single-letter property names (see | ||||||||
| // `EncodedChunkShape`): `c` is a concrete node shape, `f` is a specialized node shape that | ||||||||
| // derives from another shape via overrides. Other variants (`a`/`b`/`d`/`e`) are | ||||||||
| // non-node shapes and cannot appear in a specialization chain. | ||||||||
| if (encoded.c !== undefined) { | ||||||||
|
justus-camp-microsoft marked this conversation as resolved.
Outdated
|
||||||||
| return encoded.c; | ||||||||
| } | ||||||||
| assert( | ||||||||
| "f" in encoded && encoded.f !== undefined, | ||||||||
| "shape in specialization chain must be a node shape (c) or specialized node shape (f)", | ||||||||
| ); | ||||||||
| return applySpecialization( | ||||||||
| normalizeToNodeShape(encoded.f.base, context, visited), | ||||||||
| encoded.f, | ||||||||
| context, | ||||||||
| ); | ||||||||
|
justus-camp-microsoft marked this conversation as resolved.
|
||||||||
| } | ||||||||
|
|
||||||||
| /** | ||||||||
| * Produces a merged {@link EncodedNodeShape} by overlaying `spec` onto `base`. | ||||||||
| * | ||||||||
| * `type` is always inherited from `base`. `value` and `extraFields` are inherited from `base` | ||||||||
| * unless `spec` includes them as own properties (regardless of value), in which case `spec`'s | ||||||||
| * value wins. | ||||||||
| * | ||||||||
| * For `fields`, override entries whose key matches a key in `base.fields` replace that entry's | ||||||||
| * shape index in place; override entries with keys not present in `base` are appended at the end | ||||||||
| * in `spec.fields` order. Field order from `base` is preserved. | ||||||||
| * | ||||||||
| * @remarks | ||||||||
| * Exported for testing. | ||||||||
| */ | ||||||||
| export function applySpecialization( | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. EncodedSpecializedNodeShape documents the merge rules: this should refer to/link that and that should likely link here as well. Also I noticed an issue with those documented rules:
These types are supposed to be JSON compatible since they get json serialized. JSON does not preserve undefined properties, so we should not have them be significant or it won't work with real encoded data (I'm not sure if this is a docs bug or a real format issue)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also that doc calls these rules "Merge rules": given we have a bunch of logic about merges in this codebase which is about a different thing, calling them specialization rules or override rules would be more clear I think. |
||||||||
| base: EncodedNodeShape, | ||||||||
| spec: EncodedSpecializedNodeShape, | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "spec" is an ambiguous abbreviation (and we try to avoid abbreviations in the first place). Most people would assume this means specification, but here it might mean specialization. Renaming this to "overrides" (and "overrides" below to "fieldOverrides") would be more clear. |
||||||||
| context: DecoderContext<EncodedChunkShape>, | ||||||||
| ): EncodedNodeShape { | ||||||||
| const overrides = new Map<FieldKey, number>(); | ||||||||
| for (const [keyEncoded, shapeIndex] of spec.fields ?? []) { | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This algorithm with its three loops seems like it could be simplified, and made a bit mor intuitive if you refactored it. Copy the base fields array: I feel like that approach is more obviously a correct implementation of the state merge policy. |
||||||||
| const key = context.identifier<FieldKey>(keyEncoded); | ||||||||
| assert(!overrides.has(key), "duplicate field key in specialized node shape"); | ||||||||
| overrides.set(key, shapeIndex); | ||||||||
| } | ||||||||
|
|
||||||||
| const mergedFields: EncodedFieldShape[] = []; | ||||||||
| const overriddenKeys = new Set<FieldKey>(); | ||||||||
| const baseKeys = new Set<FieldKey>(); | ||||||||
| for (const [keyEncoded, shapeIndex] of base.fields ?? []) { | ||||||||
| const key = context.identifier<FieldKey>(keyEncoded); | ||||||||
| assert(!baseKeys.has(key), "duplicate field key in base node shape"); | ||||||||
| baseKeys.add(key); | ||||||||
| const overrideShape = overrides.get(key); | ||||||||
| if (overrideShape === undefined) { | ||||||||
| mergedFields.push([keyEncoded, shapeIndex]); | ||||||||
| } else { | ||||||||
| overriddenKeys.add(key); | ||||||||
| mergedFields.push([keyEncoded, overrideShape]); | ||||||||
| } | ||||||||
| } | ||||||||
| for (const [keyEncoded, shapeIndex] of spec.fields ?? []) { | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A code comment on these loops would be helpful. This one could be something like:
Suggested change
|
||||||||
| if (!overriddenKeys.has(context.identifier<FieldKey>(keyEncoded))) { | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could just use overrides instead of overriddenKeys, then overriddenKeys can be entirely removed. |
||||||||
| mergedFields.push([keyEncoded, shapeIndex]); | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| return { | ||||||||
| type: base.type, | ||||||||
| value: "value" in spec ? spec.value : base.value, | ||||||||
| fields: mergedFields.length > 0 ? mergedFields : undefined, | ||||||||
| extraFields: "extraFields" in spec ? spec.extraFields : base.extraFields, | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is going to break due to JSON not persisting undefined fields if trying to remove optional fields in an override. I think you might need to store some other value (like 0, false or null: 0 is the shortest of those so maybe pick that) to encode that case. The fact you missed this implies that there might be a gap in test coverage, either for this case and/or not testing with the JSON round tripping. |
||||||||
| }; | ||||||||
| } | ||||||||
|
|
||||||||
| /** | ||||||||
| * Decoder for {@link EncodedSpecializedNodeShape}s. | ||||||||
| * Merges the specialization's field overrides with the resolved base node shape, then delegates | ||||||||
| * to a {@link NodeDecoder} built from the merged shape. | ||||||||
| */ | ||||||||
| export class SpecializedNodeDecoder implements ChunkDecoder { | ||||||||
| private readonly inner: NodeDecoder; | ||||||||
| public constructor( | ||||||||
| shape: EncodedSpecializedNodeShape, | ||||||||
| context: DecoderContext<EncodedChunkShape>, | ||||||||
| ) { | ||||||||
| this.inner = new NodeDecoder( | ||||||||
| applySpecialization(normalizeToNodeShape(shape.base, context), shape, context), | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can't you replace this with a single call to normalizeToNodeShape on shape? (Might be work tweaking the signature for normalizeToNodeShape to enable that) |
||||||||
| context, | ||||||||
| ); | ||||||||
| } | ||||||||
| public decode(decoders: readonly ChunkDecoder[], stream: StreamCursor): TreeChunk { | ||||||||
| return this.inner.decode(decoders, stream); | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| const decoderLibrary = new DiscriminatedUnionDispatcher< | ||||||||
| EncodedChunkShapeV1OrV2, | ||||||||
| [context: DecoderContext<EncodedChunkShapeV1OrV2>], | ||||||||
| EncodedChunkShapeVTextExperimental, | ||||||||
| [context: DecoderContext<EncodedChunkShape>], | ||||||||
| ChunkDecoder | ||||||||
| >({ | ||||||||
| a(shape: EncodedNestedArrayShape, context): ChunkDecoder { | ||||||||
|
|
@@ -99,6 +213,9 @@ const decoderLibrary = new DiscriminatedUnionDispatcher< | |||||||
| ): ChunkDecoder { | ||||||||
| return new IncrementalChunkDecoder(context); | ||||||||
| }, | ||||||||
| f(shape: EncodedSpecializedNodeShape, context): ChunkDecoder { | ||||||||
| return new SpecializedNodeDecoder(shape, context); | ||||||||
| }, | ||||||||
| }); | ||||||||
|
|
||||||||
| /** | ||||||||
|
|
@@ -300,7 +417,7 @@ type BasicFieldDecoder = ( | |||||||
| * Get a decoder for fields of a provided (via `shape` and `context`). | ||||||||
| */ | ||||||||
| function fieldDecoder( | ||||||||
| context: DecoderContext<EncodedChunkShapeV1OrV2>, | ||||||||
| context: DecoderContext<EncodedChunkShape>, | ||||||||
| key: FieldKey, | ||||||||
| shape: number, | ||||||||
| ): BasicFieldDecoder { | ||||||||
|
|
@@ -319,7 +436,7 @@ export class NodeDecoder implements ChunkDecoder { | |||||||
| private readonly fieldDecoders: readonly BasicFieldDecoder[]; | ||||||||
| public constructor( | ||||||||
| private readonly shape: EncodedNodeShape, | ||||||||
| private readonly context: DecoderContext<EncodedChunkShapeV1OrV2>, | ||||||||
| private readonly context: DecoderContext<EncodedChunkShape>, | ||||||||
| ) { | ||||||||
| this.type = shape.type === undefined ? undefined : context.identifier(shape.type); | ||||||||
|
|
||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| /*! | ||
| * Copyright (c) Microsoft Corporation and contributors. All rights reserved. | ||
| * Licensed under the MIT License. | ||
| */ | ||
|
|
||
| import { type Static, Type } from "@sinclair/typebox"; | ||
|
|
||
| import { unionOptions } from "../../../../codec/index.js"; | ||
|
|
||
| import { ShapeIndex } from "./formatGeneric.js"; | ||
| import { EncodedFieldShape, EncodedValueShape } from "./formatV1.js"; | ||
| import { shapesV2 } from "./formatV2.js"; | ||
|
|
||
| /** | ||
| * A node shape that derives from another node shape by overlaying property-level overrides. | ||
| * | ||
| * @remarks | ||
| * Compresses runs of node shapes that differ only in a few properties: a base node shape | ||
| * defines the structural skeleton, and the specialization narrows specific properties. | ||
| * | ||
| * For example, a base `FormatNode` with a variable-boolean `bold` field can be specialized | ||
| * to a shape that pins `bold` to a constant `true` — every node decoded with the | ||
| * specialization contributes zero stream tokens for `bold`. | ||
| * | ||
| * Merge rules: `type` is always inherited from the resolved base. `fields`, `value`, and | ||
| * `extraFields` are inherited unless the specialization sets them as own properties — to | ||
| * inherit, the property must be omitted; setting it explicitly (even to `false` or | ||
| * `undefined`) is treated as an override. | ||
| */ | ||
| export type EncodedSpecializedNodeShape = Static<typeof EncodedSpecializedNodeShape>; | ||
| export const EncodedSpecializedNodeShape = Type.Object( | ||
| { | ||
| /** | ||
| * Index into the enclosing batch's shapes array of the shape this specializes. | ||
| * Must resolve to either an {@link EncodedNodeShape} or another | ||
| * `EncodedSpecializedNodeShape`; chains are followed transitively until a node shape | ||
| * is reached. This restriction is enforced at runtime, not by the schema. | ||
| */ | ||
| base: ShapeIndex, | ||
| /** | ||
| * Field-level overrides applied to the resolved base's `fields`. Entries whose key | ||
| * matches a base field replace that field's shape index in place; entries with new | ||
| * keys are appended after all base fields, in the order given here. Base field order | ||
| * is preserved — this is the stream consumption order at decode time, so encoders | ||
| * must serialize per-field tokens in the merged order, not in this list's order. | ||
| */ | ||
| fields: Type.Optional(Type.Array(EncodedFieldShape)), | ||
| /** | ||
| * If present, replaces the resolved base's value shape. | ||
| */ | ||
| value: Type.Optional(EncodedValueShape), | ||
| /** | ||
| * If present, replaces the resolved base's extraFields shape. | ||
| */ | ||
| extraFields: Type.Optional(ShapeIndex), | ||
| }, | ||
| { additionalProperties: false }, | ||
| ); | ||
|
|
||
| /** | ||
| * Experimental extension of {@link EncodedChunkShapeV2}. | ||
| * @remarks | ||
| * See {@link DiscriminatedUnionDispatcher} for more information on this pattern. | ||
| */ | ||
| export type EncodedChunkShapeVTextExperimental = Static< | ||
| typeof EncodedChunkShapeVTextExperimental | ||
| >; | ||
| export const EncodedChunkShapeVTextExperimental = Type.Object( | ||
| { | ||
| ...shapesV2, | ||
| f: Type.Optional(EncodedSpecializedNodeShape), | ||
| }, | ||
| unionOptions, | ||
| ); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should either:
OR
OR