From b27e5d3267d9b9529f7a6d62f825842835c7c8ec Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Mon, 1 Dec 2025 16:19:21 -0500 Subject: [PATCH 01/34] feat(iavl): initialize disk layout --- iavlx/branch_layout.go | 28 +++++++++++++++++++++++++ iavlx/leaf_layout.go | 22 ++++++++++++++++++++ iavlx/node_id.go | 46 ++++++++++++++++++++++++++++++++++++++++++ iavlx/node_id_test.go | 33 ++++++++++++++++++++++++++++++ 4 files changed, 129 insertions(+) create mode 100644 iavlx/branch_layout.go create mode 100644 iavlx/leaf_layout.go create mode 100644 iavlx/node_id.go create mode 100644 iavlx/node_id_test.go diff --git a/iavlx/branch_layout.go b/iavlx/branch_layout.go new file mode 100644 index 000000000000..c1b9ddcb6c0a --- /dev/null +++ b/iavlx/branch_layout.go @@ -0,0 +1,28 @@ +package iavlx + +import ( + "fmt" + "unsafe" +) + +func init() { + if unsafe.Sizeof(BranchLayout{}) != SizeBranch { + panic(fmt.Sprintf("invalid BranchLayout size: got %d, want %d", unsafe.Sizeof(BranchLayout{}), SizeBranch)) + } +} + +const ( + SizeBranch = 76 +) + +type BranchLayout struct { + ID NodeID + Left NodeID + Right NodeID + LeftOffset uint32 // absolute offset + RightOffset uint32 // absolute offset + KeyOffset uint32 + Height uint8 + Size uint32 // TODO 5 bytes? (there are 3 bytes of padding here) + Hash [32]byte +} diff --git a/iavlx/leaf_layout.go b/iavlx/leaf_layout.go new file mode 100644 index 000000000000..b53add1e83b1 --- /dev/null +++ b/iavlx/leaf_layout.go @@ -0,0 +1,22 @@ +package iavlx + +import ( + "fmt" + "unsafe" +) + +func init() { + if unsafe.Sizeof(LeafLayout{}) != SizeLeaf { + panic(fmt.Sprintf("invalid LeafLayout size: got %d, want %d", unsafe.Sizeof(LeafLayout{}), SizeLeaf)) + } +} + +const ( + SizeLeaf = 44 +) + +type LeafLayout struct { + ID NodeID + Hash [32]byte + KeyOffset uint32 +} diff --git a/iavlx/node_id.go b/iavlx/node_id.go new file mode 100644 index 000000000000..7467199e26a6 --- /dev/null +++ b/iavlx/node_id.go @@ -0,0 +1,46 @@ +package iavlx + +import "fmt" + +// NodeID is a stable identifier for a node in the IAVL tree. +type NodeID struct { + Version uint32 + FlagIndex NodeFlagIndex +} + +// NodeFlagIndex is the index of an IAVL node in the tree plus a flag indicating whether this is a branch or leaf node. +// For leaf nodes, the index value is the 1-based in-order index of the leaf node with reference to other leaf nodes in this version. +// For branch nodes, the index value is the 1-based post-order traversal index of the node within this version. +// Bit 31 indicates whether this is a branch or leaf node (0 for branch, 1 for leaf). +type NodeFlagIndex uint32 + +func NewNodeID(isLeaf bool, version, index uint32) NodeID { + return NodeID{ + Version: version, + FlagIndex: NewNodeFlagIndex(isLeaf, index), + } +} + +func (id NodeID) IsLeaf() bool { + return id.FlagIndex.IsLeaf() +} + +func NewNodeFlagIndex(isLeaf bool, index uint32) NodeFlagIndex { + idx := NodeFlagIndex(index) + if isLeaf { + idx |= 1 << 31 + } + return idx +} + +func (index NodeFlagIndex) IsLeaf() bool { + return index&(1<<31) != 0 +} + +func (index NodeFlagIndex) Index() uint32 { + return uint32(index) & 0x7FFFFFFF +} + +func (id NodeID) String() string { + return fmt.Sprintf("NodeID{leaf:%t, version:%d, index:%d}", id.IsLeaf(), id.Version, id.FlagIndex.Index()) +} diff --git a/iavlx/node_id_test.go b/iavlx/node_id_test.go new file mode 100644 index 000000000000..fbfffb87a918 --- /dev/null +++ b/iavlx/node_id_test.go @@ -0,0 +1,33 @@ +package iavlx + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNodeID(t *testing.T) { + tests := []struct { + name string + leaf bool + version uint32 + index uint32 + }{ + { + "leaf1_1", + true, 1, 1, + }, + { + "branch2_3", + false, 2, 3, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + id := NewNodeID(test.leaf, test.version, test.index) + require.Equal(t, test.leaf, id.IsLeaf()) + require.Equal(t, test.index, id.FlagIndex.Index()) + require.Equal(t, test.version, id.Version) + }) + } +} From 35990101e2978466b0e55759d47a9da78fc172db Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Mon, 1 Dec 2025 17:56:02 -0500 Subject: [PATCH 02/34] change package --- {iavlx => iavl/internal}/branch_layout.go | 2 +- {iavlx => iavl/internal}/leaf_layout.go | 2 +- {iavlx => iavl/internal}/node_id.go | 2 +- {iavlx => iavl/internal}/node_id_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename {iavlx => iavl/internal}/branch_layout.go (96%) rename {iavlx => iavl/internal}/leaf_layout.go (94%) rename {iavlx => iavl/internal}/node_id.go (98%) rename {iavlx => iavl/internal}/node_id_test.go (96%) diff --git a/iavlx/branch_layout.go b/iavl/internal/branch_layout.go similarity index 96% rename from iavlx/branch_layout.go rename to iavl/internal/branch_layout.go index c1b9ddcb6c0a..78a578fb75b1 100644 --- a/iavlx/branch_layout.go +++ b/iavl/internal/branch_layout.go @@ -1,4 +1,4 @@ -package iavlx +package internal import ( "fmt" diff --git a/iavlx/leaf_layout.go b/iavl/internal/leaf_layout.go similarity index 94% rename from iavlx/leaf_layout.go rename to iavl/internal/leaf_layout.go index b53add1e83b1..12a496891092 100644 --- a/iavlx/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -1,4 +1,4 @@ -package iavlx +package internal import ( "fmt" diff --git a/iavlx/node_id.go b/iavl/internal/node_id.go similarity index 98% rename from iavlx/node_id.go rename to iavl/internal/node_id.go index 7467199e26a6..317cec53bdc4 100644 --- a/iavlx/node_id.go +++ b/iavl/internal/node_id.go @@ -1,4 +1,4 @@ -package iavlx +package internal import "fmt" diff --git a/iavlx/node_id_test.go b/iavl/internal/node_id_test.go similarity index 96% rename from iavlx/node_id_test.go rename to iavl/internal/node_id_test.go index fbfffb87a918..dc4f6c261f5a 100644 --- a/iavlx/node_id_test.go +++ b/iavl/internal/node_id_test.go @@ -1,4 +1,4 @@ -package iavlx +package internal import ( "testing" From e5812829a8c861fbb99ac80fd11609d946e19517 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 11:00:39 -0500 Subject: [PATCH 03/34] switch size to uint40, update docs and tests --- iavl/internal/branch_layout.go | 62 ++++++++++++++++++++++++++-------- iavl/internal/leaf_layout.go | 22 +++++++----- iavl/internal/node_id.go | 11 +++++- iavl/internal/node_id_test.go | 4 +++ iavl/internal/uint40.go | 25 ++++++++++++++ iavl/internal/uint40_test.go | 49 +++++++++++++++++++++++++++ 6 files changed, 149 insertions(+), 24 deletions(-) create mode 100644 iavl/internal/uint40.go create mode 100644 iavl/internal/uint40_test.go diff --git a/iavl/internal/branch_layout.go b/iavl/internal/branch_layout.go index 78a578fb75b1..3e14ea7e8768 100644 --- a/iavl/internal/branch_layout.go +++ b/iavl/internal/branch_layout.go @@ -5,24 +5,56 @@ import ( "unsafe" ) +const ( + sizeBranch = 76 +) + func init() { - if unsafe.Sizeof(BranchLayout{}) != SizeBranch { - panic(fmt.Sprintf("invalid BranchLayout size: got %d, want %d", unsafe.Sizeof(BranchLayout{}), SizeBranch)) + // Verify the size of BranchLayout is what we expect it to be at runtime. + if unsafe.Sizeof(BranchLayout{}) != sizeBranch { + panic(fmt.Sprintf("invalid BranchLayout size: got %d, want %d", unsafe.Sizeof(BranchLayout{}), sizeBranch)) } } -const ( - SizeBranch = 76 -) - +// BranchLayout is the on-disk layout of a branch node. +// NOTE: changes to this struct will affect on-disk compatibility. type BranchLayout struct { - ID NodeID - Left NodeID - Right NodeID - LeftOffset uint32 // absolute offset - RightOffset uint32 // absolute offset - KeyOffset uint32 - Height uint8 - Size uint32 // TODO 5 bytes? (there are 3 bytes of padding here) - Hash [32]byte + // ID is the NodeID of this branch node. + ID NodeID + + // Left is the NodeID of the left child node. + Left NodeID + + // Right is the NodeID of the right child node. + Right NodeID + + // NOTE: Left and right offsets are included for performance and take up an extra 8 bytes of storage for each branch node. + // In an alternate design we stored only NodeID or offset for left and right depending on whether they are local + // to this changeset or in a different changeset. + // This saved 8 bytes of storage per branch node but made the implementation significantly more complex. + // For now, we are including both the left and right IDs and offsets, but if storage space becomes a problem + // we can revisit the earlier design and have an 8-byte NodeIDOrOffset type for Left and Right. + + // LeftOffset is the 1-based offset of the left child node if it is in this changeset, 0 otherwise. + // The Left NodeID will indicate whether this is a branch or leaf node. + LeftOffset uint32 + + // RightOffset is the 1-based offset of the right child node if it is in this changeset, 0 otherwise. + // The Right NodeID will indicate whether this is a branch or leaf node. + RightOffset uint32 + + // KeyOffset is the offset the key data for this node in the key value data file. + KeyOffset uint32 + + // Height is the height of this branch node in the tree. + Height uint8 + + // NOTE: there are two bytes of padding here that could be used for something else in the future if needed + // such as an extra byte to allow for 40-bit key offsets. + + // Size is the number of leaf nodes in the subtree rooted at this branch node. + Size Uint40 + + // Hash is the hash of this branch node. + Hash [32]byte } diff --git a/iavl/internal/leaf_layout.go b/iavl/internal/leaf_layout.go index 12a496891092..e780a24bf352 100644 --- a/iavl/internal/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -5,18 +5,24 @@ import ( "unsafe" ) +const ( + sizeLeaf = 44 +) + func init() { - if unsafe.Sizeof(LeafLayout{}) != SizeLeaf { - panic(fmt.Sprintf("invalid LeafLayout size: got %d, want %d", unsafe.Sizeof(LeafLayout{}), SizeLeaf)) + // Verify the size of LeafLayout is what we expect it to be at runtime. + if unsafe.Sizeof(LeafLayout{}) != sizeLeaf { + panic(fmt.Sprintf("invalid LeafLayout size: got %d, want %d", unsafe.Sizeof(LeafLayout{}), sizeLeaf)) } } -const ( - SizeLeaf = 44 -) - +// LeafLayout is the on-disk layout of a leaf node. +// NOTE: changes to this struct will affect on-disk compatibility. type LeafLayout struct { - ID NodeID - Hash [32]byte + // ID is the NodeID of this leaf node. + ID NodeID + // KeyOffset is the offset the key data for this node in the key value data file. KeyOffset uint32 + // Hash is the hash of this leaf node. + Hash [32]byte } diff --git a/iavl/internal/node_id.go b/iavl/internal/node_id.go index 317cec53bdc4..bf45c29fab63 100644 --- a/iavl/internal/node_id.go +++ b/iavl/internal/node_id.go @@ -4,7 +4,10 @@ import "fmt" // NodeID is a stable identifier for a node in the IAVL tree. type NodeID struct { - Version uint32 + // Version is the version of the tree at which this node was created. + Version uint32 + + // FlagIndex indicates whether this is a branch or leaf node and stores its index in the tree. FlagIndex NodeFlagIndex } @@ -14,6 +17,7 @@ type NodeID struct { // Bit 31 indicates whether this is a branch or leaf node (0 for branch, 1 for leaf). type NodeFlagIndex uint32 +// NewNodeID creates a new NodeID. func NewNodeID(isLeaf bool, version, index uint32) NodeID { return NodeID{ Version: version, @@ -21,10 +25,12 @@ func NewNodeID(isLeaf bool, version, index uint32) NodeID { } } +// IsLeaf returns true if the node is a leaf node. func (id NodeID) IsLeaf() bool { return id.FlagIndex.IsLeaf() } +// NewNodeFlagIndex creates a new NodeFlagIndex. func NewNodeFlagIndex(isLeaf bool, index uint32) NodeFlagIndex { idx := NodeFlagIndex(index) if isLeaf { @@ -33,14 +39,17 @@ func NewNodeFlagIndex(isLeaf bool, index uint32) NodeFlagIndex { return idx } +// IsLeaf returns true if the node is a leaf node. func (index NodeFlagIndex) IsLeaf() bool { return index&(1<<31) != 0 } +// Index returns the index of the node in the tree. func (index NodeFlagIndex) Index() uint32 { return uint32(index) & 0x7FFFFFFF } +// String returns a string representation of the NodeID. func (id NodeID) String() string { return fmt.Sprintf("NodeID{leaf:%t, version:%d, index:%d}", id.IsLeaf(), id.Version, id.FlagIndex.Index()) } diff --git a/iavl/internal/node_id_test.go b/iavl/internal/node_id_test.go index dc4f6c261f5a..558530b0e33d 100644 --- a/iavl/internal/node_id_test.go +++ b/iavl/internal/node_id_test.go @@ -12,14 +12,17 @@ func TestNodeID(t *testing.T) { leaf bool version uint32 index uint32 + str string }{ { "leaf1_1", true, 1, 1, + "NodeID{leaf:true, version:1, index:1}", }, { "branch2_3", false, 2, 3, + "NodeID{leaf:false, version:2, index:3}", }, } for _, test := range tests { @@ -28,6 +31,7 @@ func TestNodeID(t *testing.T) { require.Equal(t, test.leaf, id.IsLeaf()) require.Equal(t, test.index, id.FlagIndex.Index()) require.Equal(t, test.version, id.Version) + require.Equal(t, test.str, id.String()) }) } } diff --git a/iavl/internal/uint40.go b/iavl/internal/uint40.go new file mode 100644 index 000000000000..4597be0dfa30 --- /dev/null +++ b/iavl/internal/uint40.go @@ -0,0 +1,25 @@ +package internal + +import "fmt" + +// Uint40 is a 40-bit unsigned integer. +type Uint40 [5]byte + +// NewUint40 creates a new Uint40 from a uint64. +func NewUint40(v uint64) Uint40 { + if v>>40 != 0 { + panic(fmt.Sprintf("value %d overflows Uint40", v)) + } + var u Uint40 + u[0] = byte(v) + u[1] = byte(v >> 8) + u[2] = byte(v >> 16) + u[3] = byte(v >> 24) + u[4] = byte(v >> 32) + return u +} + +// ToUint64 converts the Uint40 to a uint64. +func (u Uint40) ToUint64() uint64 { + return uint64(u[0]) | uint64(u[1])<<8 | uint64(u[2])<<16 | uint64(u[3])<<24 | uint64(u[4])<<32 +} diff --git a/iavl/internal/uint40_test.go b/iavl/internal/uint40_test.go new file mode 100644 index 000000000000..1883f1bc657c --- /dev/null +++ b/iavl/internal/uint40_test.go @@ -0,0 +1,49 @@ +package internal + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestUint40(t *testing.T) { + tests := []struct { + name string + value uint64 + expectPanic bool + }{ + { + "zero", + 0, + false, + }, + { + "max", + 1<<40 - 1, + false, + }, + { + "arbitrary", + 109951162777, + false, + }, + { + "overflow", + 1 << 40, + true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.expectPanic { + require.Panics(t, func() { + _ = NewUint40(tt.value) + }) + } else { + u := NewUint40(tt.value) + got := u.ToUint64() + require.Equal(t, tt.value, got) + } + }) + } +} From abf0976a2a3bbc9dc3d5c087acb32a889c3f4430 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 11:33:42 -0500 Subject: [PATCH 04/34] update docs --- iavl/internal/branch_layout.go | 4 ++++ iavl/internal/leaf_layout.go | 8 ++++++++ iavl/internal/node_id.go | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/iavl/internal/branch_layout.go b/iavl/internal/branch_layout.go index 3e14ea7e8768..7ed33d440352 100644 --- a/iavl/internal/branch_layout.go +++ b/iavl/internal/branch_layout.go @@ -44,6 +44,10 @@ type BranchLayout struct { RightOffset uint32 // KeyOffset is the offset the key data for this node in the key value data file. + // NOTE: that a 32-bit offset means that the key data file can be at most 4GB in size. + // This doesn't limit the size of the overall tree, it just limits the size of individual key/value data files. + // If we want to support larger key/value data files in the future, we can change this to a 40-bit offset, + // and an additional byte of padding is already reserved below for this purpose. KeyOffset uint32 // Height is the height of this branch node in the tree. diff --git a/iavl/internal/leaf_layout.go b/iavl/internal/leaf_layout.go index e780a24bf352..95e67ca3a938 100644 --- a/iavl/internal/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -21,8 +21,16 @@ func init() { type LeafLayout struct { // ID is the NodeID of this leaf node. ID NodeID + // KeyOffset is the offset the key data for this node in the key value data file. + // NOTE: that a 32-bit offset means that the key data file can be at most 4GB in size. + // If we want to support larger key/value data files in the future, we can change this to a 40-bit offset. + // However, this would require changing the size of this struct from 44 bytes to 48 bytes which would break + // on-disk compatibility. + // Such an upgrade could be made by introducing a "wide changeset" format that lives alongside + // this existing "compact" format. KeyOffset uint32 + // Hash is the hash of this leaf node. Hash [32]byte } diff --git a/iavl/internal/node_id.go b/iavl/internal/node_id.go index bf45c29fab63..e3b0be6aa534 100644 --- a/iavl/internal/node_id.go +++ b/iavl/internal/node_id.go @@ -3,6 +3,13 @@ package internal import "fmt" // NodeID is a stable identifier for a node in the IAVL tree. +// A NodeID allows for a 32-bit version and a 31-bit index within that version, +// with 1 bit used to indicate whether the node is a leaf or branch. +// A 32-bit version should allow for 136 years of 1-second blocks. +// If block production significantly speeds up, we can increase the width of the version field in the future. +// This sort of change can be done without any major on-disk migration because we can simply create a "wide changeset" +// format that lives alongside the existing "compact" format. +// Because the cost of migration is low, we have decided to keep things simple and compact for now. type NodeID struct { // Version is the version of the tree at which this node was created. Version uint32 From a716e2fba236ce454423924ff634a40bdabdca03 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 11:47:37 -0500 Subject: [PATCH 05/34] reorder code --- iavl/internal/node_id.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/iavl/internal/node_id.go b/iavl/internal/node_id.go index e3b0be6aa534..7eff3ae50184 100644 --- a/iavl/internal/node_id.go +++ b/iavl/internal/node_id.go @@ -37,6 +37,11 @@ func (id NodeID) IsLeaf() bool { return id.FlagIndex.IsLeaf() } +// String returns a string representation of the NodeID. +func (id NodeID) String() string { + return fmt.Sprintf("NodeID{leaf:%t, version:%d, index:%d}", id.IsLeaf(), id.Version, id.FlagIndex.Index()) +} + // NewNodeFlagIndex creates a new NodeFlagIndex. func NewNodeFlagIndex(isLeaf bool, index uint32) NodeFlagIndex { idx := NodeFlagIndex(index) @@ -55,8 +60,3 @@ func (index NodeFlagIndex) IsLeaf() bool { func (index NodeFlagIndex) Index() uint32 { return uint32(index) & 0x7FFFFFFF } - -// String returns a string representation of the NodeID. -func (id NodeID) String() string { - return fmt.Sprintf("NodeID{leaf:%t, version:%d, index:%d}", id.IsLeaf(), id.Version, id.FlagIndex.Index()) -} From cccd14b4cf99ef761c7036cc41df4ac06b93f67e Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 13:10:26 -0500 Subject: [PATCH 06/34] documented Uint40 endianness and added fmt.Stringer --- iavl/internal/uint40.go | 7 ++++++- iavl/internal/uint40_test.go | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/iavl/internal/uint40.go b/iavl/internal/uint40.go index 4597be0dfa30..d7b9b24f809a 100644 --- a/iavl/internal/uint40.go +++ b/iavl/internal/uint40.go @@ -2,7 +2,7 @@ package internal import "fmt" -// Uint40 is a 40-bit unsigned integer. +// Uint40 is a 40-bit unsigned integer stored in 5 bytes with little-endian encoding. type Uint40 [5]byte // NewUint40 creates a new Uint40 from a uint64. @@ -23,3 +23,8 @@ func NewUint40(v uint64) Uint40 { func (u Uint40) ToUint64() uint64 { return uint64(u[0]) | uint64(u[1])<<8 | uint64(u[2])<<16 | uint64(u[3])<<24 | uint64(u[4])<<32 } + +// String implements fmt.Stringer. +func (u Uint40) String() string { + return fmt.Sprintf("%d", u.ToUint64()) +} diff --git a/iavl/internal/uint40_test.go b/iavl/internal/uint40_test.go index 1883f1bc657c..05b51a8fc9b6 100644 --- a/iavl/internal/uint40_test.go +++ b/iavl/internal/uint40_test.go @@ -11,26 +11,31 @@ func TestUint40(t *testing.T) { name string value uint64 expectPanic bool + str string }{ { "zero", 0, false, + "0", }, { "max", 1<<40 - 1, false, + "1099511627775", }, { "arbitrary", 109951162777, false, + "109951162777", }, { "overflow", 1 << 40, true, + "", }, } for _, tt := range tests { @@ -43,6 +48,7 @@ func TestUint40(t *testing.T) { u := NewUint40(tt.value) got := u.ToUint64() require.Equal(t, tt.value, got) + require.Equal(t, tt.str, u.String()) } }) } From 07d0d5eb6fca908c42090243a84442f3c30c3f25 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 13:48:24 -0500 Subject: [PATCH 07/34] feat(iavl): add Node, MemNode, and NodePointer --- iavl/internal/branch_layout.go | 2 +- iavl/internal/changeset.go | 11 +++ iavl/internal/mem_node.go | 126 +++++++++++++++++++++++++++++++++ iavl/internal/node.go | 51 +++++++++++++ iavl/internal/node_pointer.go | 31 ++++++++ 5 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 iavl/internal/changeset.go create mode 100644 iavl/internal/mem_node.go create mode 100644 iavl/internal/node.go create mode 100644 iavl/internal/node_pointer.go diff --git a/iavl/internal/branch_layout.go b/iavl/internal/branch_layout.go index 7ed33d440352..82ddefe9e72b 100644 --- a/iavl/internal/branch_layout.go +++ b/iavl/internal/branch_layout.go @@ -50,7 +50,7 @@ type BranchLayout struct { // and an additional byte of padding is already reserved below for this purpose. KeyOffset uint32 - // Height is the height of this branch node in the tree. + // Height is the height of the subtree rooted at this branch node. Height uint8 // NOTE: there are two bytes of padding here that could be used for something else in the future if needed diff --git a/iavl/internal/changeset.go b/iavl/internal/changeset.go new file mode 100644 index 000000000000..a5c50f619ff5 --- /dev/null +++ b/iavl/internal/changeset.go @@ -0,0 +1,11 @@ +package internal + +import "fmt" + +// NOTE: This is a placeholder implementation. We will add the implementation in a future PR. + +type Changeset struct{} + +func (cs *Changeset) Resolve(id NodeID, fileIdx uint32) (Node, error) { + return nil, fmt.Errorf("not implemented") +} diff --git a/iavl/internal/mem_node.go b/iavl/internal/mem_node.go new file mode 100644 index 000000000000..20e63485dc03 --- /dev/null +++ b/iavl/internal/mem_node.go @@ -0,0 +1,126 @@ +package internal + +import ( + "bytes" + "fmt" +) + +// MemNode represents an in-memory node that has recently been created and may or may not have +// been serialized to disk yet. +type MemNode struct { + height uint8 + version uint32 + size int64 + key []byte + value []byte + left *NodePointer + right *NodePointer + hash []byte + nodeId NodeID // ID of this node, 0 if not yet assigned + keyOffset uint32 +} + +// ID implements the Node interface. +func (node *MemNode) ID() NodeID { + return node.nodeId +} + +// Height implements the Node interface. +func (node *MemNode) Height() uint8 { + return node.height +} + +// Size implements the Node interface. +func (node *MemNode) Size() int64 { + return node.size +} + +// Version implements the Node interface. +func (node *MemNode) Version() uint32 { + return node.version +} + +// Key implements the Node interface. +func (node *MemNode) Key() ([]byte, error) { + return node.key, nil +} + +// Value implements the Node interface. +func (node *MemNode) Value() ([]byte, error) { + return node.value, nil +} + +// Left implements the Node interface. +func (node *MemNode) Left() *NodePointer { + return node.left +} + +// Right implements the Node interface. +func (node *MemNode) Right() *NodePointer { + return node.right +} + +// Hash implements the Node interface. +func (node *MemNode) Hash() []byte { + return node.hash +} + +// MutateBranch implements the Node interface. +func (node *MemNode) MutateBranch(version uint32) (*MemNode, error) { + n := *node + n.version = version + n.hash = nil + return &n, nil +} + +// Get implements the Node interface. +func (node *MemNode) Get(key []byte) (value []byte, index int64, err error) { + if node.IsLeaf() { + switch bytes.Compare(node.key, key) { + case -1: + return nil, 1, nil + case 1: + return nil, 0, nil + default: + return node.value, 0, nil + } + } + + if bytes.Compare(key, node.key) < 0 { + leftNode, err := node.left.Resolve() + if err != nil { + return nil, 0, err + } + + return leftNode.Get(key) + } + + rightNode, err := node.right.Resolve() + if err != nil { + return nil, 0, err + } + + value, index, err = rightNode.Get(key) + if err != nil { + return nil, 0, err + } + + index += node.size - rightNode.Size() + return value, index, nil +} + +// IsLeaf implements the Node interface. +func (node *MemNode) IsLeaf() bool { + return node.height == 0 +} + +// String implements the fmt.Stringer interface. +func (node *MemNode) String() string { + if node.IsLeaf() { + return fmt.Sprintf("MemNode{key:%x, version:%d, size:%d, value:%x}", node.key, node.version, node.size, node.value) + } else { + return fmt.Sprintf("MemNode{key:%x, version:%d, size:%d, height:%d, left:%s, right:%s}", node.key, node.version, node.size, node.height, node.left, node.right) + } +} + +var _ Node = (*MemNode)(nil) diff --git a/iavl/internal/node.go b/iavl/internal/node.go new file mode 100644 index 000000000000..894195adfc3c --- /dev/null +++ b/iavl/internal/node.go @@ -0,0 +1,51 @@ +package internal + +import "fmt" + +// Node represents a traversable node in the IAVL tree. +type Node interface { + // ID returns the unique identifier of the node. + // If the node has not been assigned an ID yet, it returns the zero value of NodeID. + ID() NodeID + + // IsLeaf indicates whether this node is a leaf node. + IsLeaf() bool + + // Key returns the key of this node. + Key() ([]byte, error) + + // Value returns the value of this node. It is an error to call this method on non-leaf nodes. + Value() ([]byte, error) + + // Left returns a pointer to the left child node. + // If this is called on a leaf node, it returns nil. + Left() *NodePointer + + // Right returns a pointer to the right child node. + // If this is called on a leaf node, it returns nil. + Right() *NodePointer + + // Hash returns the hash of this node. + // Hash may or may not have been computed yet. + Hash() []byte + + // Height returns the height of the subtree rooted at this node. + Height() uint8 + + // Size returns the number of leaf nodes in the subtree rooted at this node. + Size() int64 + + // Version returns the version at which this node was created. + Version() uint32 + + // Get traverses this subtree to find the value associated with the given key. + Get(key []byte) (value []byte, index int64, err error) + + // MutateBranch creates a mutable copy of this branch node created at the specified version. + // Since this is an immutable tree, whenever we need to modify a branch node, we should call this method + // to create a mutable copy of it with its version updated. + // This method should only be called on branch nodes; calling it on leaf nodes will result in an error. + MutateBranch(version uint32) (*MemNode, error) + + fmt.Stringer +} diff --git a/iavl/internal/node_pointer.go b/iavl/internal/node_pointer.go new file mode 100644 index 000000000000..fc14400faed3 --- /dev/null +++ b/iavl/internal/node_pointer.go @@ -0,0 +1,31 @@ +package internal + +import ( + "fmt" + "sync/atomic" +) + +type NodePointer struct { + mem atomic.Pointer[MemNode] + changeset *Changeset + fileIdx uint32 // absolute index in file, 1-based, zero means we don't have an offset + id NodeID +} + +func NewNodePointer(memNode *MemNode) *NodePointer { + n := &NodePointer{} + n.mem.Store(memNode) + return n +} + +func (p *NodePointer) Resolve() (Node, error) { + mem := p.mem.Load() + if mem != nil { + return mem, nil + } + return p.changeset.Resolve(p.id, p.fileIdx) +} + +func (p *NodePointer) String() string { + return fmt.Sprintf("NodePointer{id: %s, fileIdx: %d}", p.id.String(), p.fileIdx) +} From a65f1c640758656071294f7075a70f735ae218a8 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 13:49:23 -0500 Subject: [PATCH 08/34] switch table tests to key: value struct init --- iavl/internal/node_id_test.go | 11 +++++------ iavl/internal/uint40_test.go | 27 +++++++++++---------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/iavl/internal/node_id_test.go b/iavl/internal/node_id_test.go index 558530b0e33d..a99f44c37d21 100644 --- a/iavl/internal/node_id_test.go +++ b/iavl/internal/node_id_test.go @@ -15,14 +15,13 @@ func TestNodeID(t *testing.T) { str string }{ { - "leaf1_1", - true, 1, 1, - "NodeID{leaf:true, version:1, index:1}", + name: "leaf1_1", + leaf: true, version: 1, index: 1, + str: "NodeID{leaf:true, version:1, index:1}", }, { - "branch2_3", - false, 2, 3, - "NodeID{leaf:false, version:2, index:3}", + name: "branch2_3", version: 2, index: 3, + str: "NodeID{leaf:false, version:2, index:3}", }, } for _, test := range tests { diff --git a/iavl/internal/uint40_test.go b/iavl/internal/uint40_test.go index 05b51a8fc9b6..0a41a54e910e 100644 --- a/iavl/internal/uint40_test.go +++ b/iavl/internal/uint40_test.go @@ -14,28 +14,23 @@ func TestUint40(t *testing.T) { str string }{ { - "zero", - 0, - false, - "0", + name: "zero", + str: "0", }, { - "max", - 1<<40 - 1, - false, - "1099511627775", + name: "max", + value: 1<<40 - 1, + str: "1099511627775", }, { - "arbitrary", - 109951162777, - false, - "109951162777", + name: "arbitrary", + value: 109951162777, + str: "109951162777", }, { - "overflow", - 1 << 40, - true, - "", + name: "overflow", + value: 1 << 40, + expectPanic: true, }, } for _, tt := range tests { From b212f96bddc17be6ede105375f4f112edc40b3eb Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 14:06:42 -0500 Subject: [PATCH 09/34] add basic mem node getter tests --- iavl/internal/node_pointer.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/iavl/internal/node_pointer.go b/iavl/internal/node_pointer.go index fc14400faed3..fa0afa130ae1 100644 --- a/iavl/internal/node_pointer.go +++ b/iavl/internal/node_pointer.go @@ -5,6 +5,7 @@ import ( "sync/atomic" ) +// NodePointer is a pointer to a Node, which may be either in-memory, on-disk or both. type NodePointer struct { mem atomic.Pointer[MemNode] changeset *Changeset @@ -12,12 +13,14 @@ type NodePointer struct { id NodeID } +// NewNodePointer creates a new NodePointer pointing to the given in-memory node. func NewNodePointer(memNode *MemNode) *NodePointer { n := &NodePointer{} n.mem.Store(memNode) return n } +// Resolve resolves the NodePointer to a Node, loading from memory or disk as necessary. func (p *NodePointer) Resolve() (Node, error) { mem := p.mem.Load() if mem != nil { @@ -26,6 +29,7 @@ func (p *NodePointer) Resolve() (Node, error) { return p.changeset.Resolve(p.id, p.fileIdx) } +// String implements the fmt.Stringer interface. func (p *NodePointer) String() string { return fmt.Sprintf("NodePointer{id: %s, fileIdx: %d}", p.id.String(), p.fileIdx) } From e6ccd8d8e925207f65b74582fe89362c5676832d Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 17:54:42 -0500 Subject: [PATCH 10/34] adding mutation, hash, verification code, basic tests --- iavl/internal/commit_tree.go | 14 ++ iavl/internal/commit_tree_test.go | 238 ++++++++++++++++++ iavl/internal/node_hash.go | 118 +++++++++ iavl/internal/node_id.go | 5 + iavl/internal/node_update.go | 386 ++++++++++++++++++++++++++++++ iavl/internal/verify.go | 120 ++++++++++ 6 files changed, 881 insertions(+) create mode 100644 iavl/internal/commit_tree.go create mode 100644 iavl/internal/commit_tree_test.go create mode 100644 iavl/internal/node_hash.go create mode 100644 iavl/internal/node_update.go create mode 100644 iavl/internal/verify.go diff --git a/iavl/internal/commit_tree.go b/iavl/internal/commit_tree.go new file mode 100644 index 000000000000..3df7687c8e73 --- /dev/null +++ b/iavl/internal/commit_tree.go @@ -0,0 +1,14 @@ +package internal + +import ( + "log/slog" + "sync" + "sync/atomic" +) + +type CommitTree struct { + latest atomic.Pointer[NodePointer] + root *NodePointer + writeMutex sync.Mutex + logger *slog.Logger +} diff --git a/iavl/internal/commit_tree_test.go b/iavl/internal/commit_tree_test.go new file mode 100644 index 000000000000..d0ff8caf4c08 --- /dev/null +++ b/iavl/internal/commit_tree_test.go @@ -0,0 +1,238 @@ +package internal + +import ( + "bytes" + "fmt" + "os" + "runtime/debug" + "testing" + + corestore "cosmossdk.io/core/store" + sdklog "cosmossdk.io/log" + storetypes "cosmossdk.io/store/types" + iavl1 "github.com/cosmos/iavl" + dbm "github.com/cosmos/iavl/db" + "github.com/stretchr/testify/require" + "golang.org/x/exp/maps" + "pgregory.net/rapid" +) + +func TestIAVLXSims(t *testing.T) { + rapid.Check(t, testIAVLXSims) +} + +func FuzzIAVLX(f *testing.F) { + f.Fuzz(rapid.MakeFuzz(testIAVLXSims)) +} + +func testIAVLXSims(t *rapid.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("panic recovered: %v\nStack trace:\n%s", r, debug.Stack()) + } + }() + // logger := sdklog.NewTestLogger(t) + logger := sdklog.NewNopLogger() + dbV1 := dbm.NewMemDB() + treeV1 := iavl1.NewMutableTree(dbV1, 500000, true, logger) + + tempDir, err := os.MkdirTemp("", "iavlx") + require.NoError(t, err, "failed to create temp directory") + defer os.RemoveAll(tempDir) + simMachine := &SimMachine{ + treeV1: treeV1, + dirV2: tempDir, + existingKeys: map[string][]byte{}, + } + simMachine.openV2Tree(t) + + // TODO switch from StateMachineActions to manually setting up the actions map, this is going to be too magical for other maintainers otherwise + t.Repeat(map[string]func(*rapid.T){ + "": simMachine.Check, + "UpdateN": simMachine.UpdateN, + "GetN": simMachine.GetN, + "Iterate": simMachine.Iterate, + "Commit": simMachine.Commit, + }) + + require.NoError(t, treeV1.Close(), "failed to close iavl tree") + require.NoError(t, simMachine.treeV2.Close(), "failed to close iavlx tree") +} + +type SimMachine struct { + treeV1 *iavl1.MutableTree + treeV2 *CommitTree + dirV2 string + // existingKeys keeps track of keys that have been set in the tree or deleted. Deleted keys are retained as nil values. + existingKeys map[string][]byte +} + +func (s *SimMachine) openV2Tree(t interface { + require.TestingT + sdklog.TestingT +}) { + var err error + s.treeV2, err = NewCommitTree(s.dirV2, Options{ + WriteWAL: true, + CompactWAL: true, + DisableCompaction: true, + ZeroCopy: false, + EvictDepth: 0, + CompactionOrphanRatio: 0, + CompactionOrphanAge: 0, + RetainVersions: 0, + MinCompactionSeconds: 0, + ChangesetMaxTarget: 1, + CompactAfterVersions: 0, + ReaderUpdateInterval: 1, + }, sdklog.NewTestLogger(t)) + require.NoError(t, err, "failed to create iavlx tree") +} + +func (s *SimMachine) Check(t *rapid.T) { + // after every operation verify the iavlx tree + // after every operation we check that both trees are identical + s.compareIterators(t, nil, nil, true) +} + +func (s *SimMachine) UpdateN(t *rapid.T) { + n := rapid.IntRange(1, 5000).Draw(t, "n") + for i := 0; i < n; i++ { + del := rapid.Bool().Draw(t, "del") + if del { + s.delete(t) + } else { + s.set(t) + } + } +} + +func (s *SimMachine) GetN(t *rapid.T) { + n := rapid.IntRange(1, 5000).Draw(t, "n") + for i := 0; i < n; i++ { + s.get(t) + } +} + +func (s *SimMachine) set(t *rapid.T) { + // choose either a new or an existing key + key := s.selectKey(t) + value := rapid.SliceOfN(rapid.Byte(), 0, 10).Draw(t, "value") + // set in both trees + updated, errV1 := s.treeV1.Set(key, value) + require.NoError(t, errV1, "failed to set key in V1 tree") + branch := s.treeV2.CacheWrap().(storetypes.CacheKVStore) + branch.Set(key, value) + branch.Write() + // require.Equal(t, updated, updatedV2, "update status mismatch between V1 and V2 trees") + if updated { + require.NotNil(t, s.existingKeys[string(key)], "key shouldn't have been marked as updated") + } else { + existing, found := s.existingKeys[string(key)] + if found { + require.Nil(t, existing, value, "marked as not an update but existin key is nil") + } + } + s.existingKeys[string(key)] = value // mark as existing +} + +func (s *SimMachine) get(t *rapid.T) { + key := s.selectKey(t) + valueV1, errV1 := s.treeV1.Get(key) + require.NoError(t, errV1, "failed to get key from V1 tree") + valueV2 := s.treeV2.CacheWrap().(storetypes.CacheKVStore).Get(key) + require.Equal(t, valueV1, valueV2, "value mismatch between V1 and V2 trees") + expectedValue, found := s.existingKeys[string(key)] + if found { + require.Equal(t, expectedValue, valueV1, "expected value mismatch for key %s", key) + } else { + require.Nil(t, valueV1, "expected nil value for non-existing key %s", key) + } +} + +func (s *SimMachine) selectKey(t *rapid.T) []byte { + if len(s.existingKeys) > 0 && rapid.Bool().Draw(t, "existingKey") { + return []byte(rapid.SampledFrom(maps.Keys(s.existingKeys)).Draw(t, "key")) + } else { + // TODO consider testing longer keys + return rapid.SliceOfN(rapid.Byte(), 1, 10).Draw(t, "key") + } +} + +func (s *SimMachine) delete(t *rapid.T) { + key := s.selectKey(t) + existingValue, found := s.existingKeys[string(key)] + exists := found && existingValue != nil + // delete in both trees + _, removedV1, errV1 := s.treeV1.Remove(key) + require.NoError(t, errV1, "failed to remove key from V1 tree") + branch := s.treeV2.CacheWrap().(storetypes.CacheKVStore) + branch.Delete(key) + branch.Write() + // require.Equal(t, removedV1, removedV2, "removed status mismatch between V1 and V2 trees") + // TODO v1 & v2 have slightly different behaviors for the value returned on removal. We should re-enable this and check. + //if valueV1 == nil || len(valueV1) == 0 { + // require.Empty(t, valueV2, "value should be empty for removed key in V2 tree") + //} else { + // require.Equal(t, valueV1, valueV2, "value mismatch between V1 and V2 trees") + //} + require.Equal(t, exists, removedV1, "removed status should match existence of key") + s.existingKeys[string(key)] = nil // mark as deleted +} + +func (s *SimMachine) Iterate(t *rapid.T) { + start := s.selectKey(t) + end := s.selectKey(t) + // make sure end is after start + if string(end) <= string(start) { + temp := start + start = end + end = temp + } + + // TODO add cases where we nudge start or end up or down a little + + // ascending := rapid.Bool().Draw(t, "ascending") + + // s.compareIterators(t, start, end, ascending) +} + +func (s *SimMachine) Commit(t *rapid.T) { + hash1, _, err := s.treeV1.SaveVersion() + require.NoError(t, err, "failed to save version in V1 tree") + commitId2 := s.treeV2.Commit() + require.NoError(t, err, "failed to save version in V2 tree") + err = VerifyTree(s.treeV2) + require.NoError(t, err, "failed to verify V2 tree") + require.Equal(t, hash1, commitId2.Hash, "hash mismatch between V1 and V2 trees") + closeReopen := rapid.Bool().Draw(t, "closeReopen") + if closeReopen { + require.NoError(t, s.treeV2.Close()) + s.openV2Tree(t) + } +} + +func (s *SimMachine) debugDump(t *rapid.T) { + version := s.treeV1.Version() + t.Logf("Dumping trees at version %d", version) + graph1 := &bytes.Buffer{} + iavl.WriteDOTGraph(graph1, s.treeV1.ImmutableTree, nil) + t.Logf("V1 tree:\n%s", graph1.String()) + // renderTree(t, s.treeV2.Branch()) + iter2 := s.treeV2.CacheWrap().(storetypes.CacheKVStore).Iterator(nil, nil) + s.debugDumpTree(t, iter2) +} + +func (s *SimMachine) debugDumpTree(t *rapid.T, iter corestore.Iterator) { + dumpStr := "Tree dump:" + defer func() { + require.NoError(t, iter.Close(), "failed to close iterator") + }() + for iter.Valid() { + key := iter.Key() + value := iter.Value() + dumpStr += fmt.Sprintf("\n\tKey: %X, Value: %X", key, value) + iter.Next() + } + t.Log(dumpStr) +} diff --git a/iavl/internal/node_hash.go b/iavl/internal/node_hash.go new file mode 100644 index 000000000000..e7011008817c --- /dev/null +++ b/iavl/internal/node_hash.go @@ -0,0 +1,118 @@ +package internal + +import ( + "crypto/sha256" + "encoding/binary" + "fmt" + "hash" + "io" + "sync" +) + +func computeAndSetHash(node *MemNode, leftHash, rightHash []byte) ([]byte, error) { + h, err := computeHash(node, leftHash, rightHash) + if err != nil { + return nil, err + } + node.hash = h + + return h, nil +} + +var hasherPool = sync.Pool{ + New: func() any { + return sha256.New() + }, +} + +func putBackHasher(h hash.Hash) { + h.Reset() + hasherPool.Put(h) +} + +func computeHash(node Node, leftHash, rightHash []byte) ([]byte, error) { + hasher := hasherPool.Get().(hash.Hash) + defer putBackHasher(hasher) + if err := writeHashBytes(node, leftHash, rightHash, hasher); err != nil { + return nil, err + } + return hasher.Sum(nil), nil +} + +var emptyHash = sha256.New().Sum(nil) + +func shaSum256(bz []byte) []byte { + hasher := hasherPool.Get().(hash.Hash) + defer putBackHasher(hasher) + hasher.Write(bz) + var sum [sha256.Size]byte + hasher.Sum(sum[:0]) + return sum[:] +} + +// Writes the node's hash to the given `io.Writer`. This function recursively calls +// children to update hashes. +func writeHashBytes(node Node, leftHash, rightHash []byte, w io.Writer) error { + var ( + n int + buf [binary.MaxVarintLen64]byte + ) + + n = binary.PutVarint(buf[:], int64(node.Height())) + if _, err := w.Write(buf[0:n]); err != nil { + return fmt.Errorf("writing height, %w", err) + } + n = binary.PutVarint(buf[:], node.Size()) + if _, err := w.Write(buf[0:n]); err != nil { + return fmt.Errorf("writing size, %w", err) + } + n = binary.PutVarint(buf[:], int64(node.Version())) + if _, err := w.Write(buf[0:n]); err != nil { + return fmt.Errorf("writing version, %w", err) + } + + // Key is not written for inner nodes, unlike writeBytes. + + if node.IsLeaf() { + key, err := node.Key() + if err != nil { + return fmt.Errorf("getting key, %w", err) + } + + if err := encodeVarintPrefixedBytes(w, key); err != nil { + return fmt.Errorf("writing key, %w", err) + } + + value, err := node.Value() + if err != nil { + return fmt.Errorf("getting value, %w", err) + } + + // Indirection needed to provide proofs without values. + // (e.g. ProofLeafNode.ValueHash) + if err := encodeVarintPrefixedBytes(w, shaSum256(value)); err != nil { + return fmt.Errorf("writing value, %w", err) + } + } else { + if err := encodeVarintPrefixedBytes(w, leftHash); err != nil { + return fmt.Errorf("writing left hash, %w", err) + } + if err := encodeVarintPrefixedBytes(w, rightHash); err != nil { + return fmt.Errorf("writing right hash, %w", err) + } + } + + return nil +} + +// encodeVarintPrefixedBytes writes a varint length-prefixed byte slice to the writer, +// it's used for hash computation, must be compactible with the official IAVL implementation. +func encodeVarintPrefixedBytes(w io.Writer, bz []byte) error { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], uint64(len(bz))) + if _, err := w.Write(buf[0:n]); err != nil { + return err + } + _, err := w.Write(bz) + return err +} diff --git a/iavl/internal/node_id.go b/iavl/internal/node_id.go index 7eff3ae50184..a470e4f44f5f 100644 --- a/iavl/internal/node_id.go +++ b/iavl/internal/node_id.go @@ -37,6 +37,11 @@ func (id NodeID) IsLeaf() bool { return id.FlagIndex.IsLeaf() } +// IsEmpty returns true if the NodeID is the zero value. +func (id NodeID) IsEmpty() bool { + return id.Version == 0 && id.FlagIndex == 0 +} + // String returns a string representation of the NodeID. func (id NodeID) String() string { return fmt.Sprintf("NodeID{leaf:%t, version:%d, index:%d}", id.IsLeaf(), id.Version, id.FlagIndex.Index()) diff --git a/iavl/internal/node_update.go b/iavl/internal/node_update.go new file mode 100644 index 000000000000..7441c93a9273 --- /dev/null +++ b/iavl/internal/node_update.go @@ -0,0 +1,386 @@ +package internal + +import "bytes" + +func newLeafNode(key, value []byte, version uint32) *MemNode { + return &MemNode{ + height: 0, + size: 1, + version: version, + key: key, + value: value, + } +} + +// setRecursive do set operation. +// it always do modification and return new `MemNode`, even if the value is the same. +// also returns if it's an update or insertion, if update, the tree height and balance is not changed. +func setRecursive(nodePtr *NodePointer, leafNode *MemNode, ctx *mutationContext) (*NodePointer, bool, error) { + if nodePtr == nil { + return NewNodePointer(leafNode), true, nil + } + + node, err := nodePtr.Resolve() + if err != nil { + return nil, false, err + } + + nodeKey, err := node.Key() + if err != nil { + return nil, false, err + } + if node.IsLeaf() { + leafNodePtr := NewNodePointer(leafNode) + cmp := bytes.Compare(leafNode.key, nodeKey) + if cmp == 0 { + ctx.AddOrphan(nodePtr.id) + return leafNodePtr, true, nil + } + n := &MemNode{ + height: 1, + size: 2, + version: ctx.version, + } + switch cmp { + case -1: + n.left = leafNodePtr + n.right = nodePtr + n.key = nodeKey + // n._keyRef = node + case 1: + n.left = nodePtr + n.right = leafNodePtr + n.key = leafNode.key + // n._keyRef = leafNode + default: + panic("unreachable") + } + return NewNodePointer(n), false, nil + } else { + var ( + newChildPtr *NodePointer + newNode *MemNode + updated bool + err error + ) + if bytes.Compare(leafNode.key, nodeKey) == -1 { + newChildPtr, updated, err = setRecursive(node.Left(), leafNode, ctx) + if err != nil { + return nil, false, err + } + newNode, err = ctx.mutateBranch(node) + if err != nil { + return nil, false, err + } + newNode.left = newChildPtr + } else { + newChildPtr, updated, err = setRecursive(node.Right(), leafNode, ctx) + if err != nil { + return nil, false, err + } + newNode, err = ctx.mutateBranch(node) + if err != nil { + return nil, false, err + } + newNode.right = newChildPtr + } + + if !updated { + err = newNode.updateHeightSize() + if err != nil { + return nil, false, err + } + + newNode, err = newNode.reBalance(ctx) + if err != nil { + return nil, false, err + } + } + + return NewNodePointer(newNode), updated, nil + } +} + +type newKeyWrapper struct { + key []byte + // keyRef keyRefLink +} + +// removeRecursive returns: +// - (nil, origNode, nil) -> nothing changed in subtree +// - (value, nil, nil) -> leaf node is removed +// - (value, new node, newKey) -> subtree changed +func removeRecursive(nodePtr *NodePointer, key []byte, ctx *mutationContext) (value []byte, newNodePtr *NodePointer, newKey *newKeyWrapper, err error) { + if nodePtr == nil { + return nil, nil, nil, nil + } + + node, err := nodePtr.Resolve() + if err != nil { + return nil, nil, nil, err + } + + nodeKey, err := node.Key() + if err != nil { + return nil, nil, nil, err + } + + if node.IsLeaf() { + if bytes.Equal(nodeKey, key) { + ctx.AddOrphan(nodePtr.id) + value, err := node.Value() + return value, nil, nil, err + } + return nil, nodePtr, nil, nil + } + + if bytes.Compare(key, nodeKey) == -1 { + value, newLeft, newKey, err := removeRecursive(node.Left(), key, ctx) + if err != nil { + return nil, nil, nil, err + } + + if value == nil { + return nil, nodePtr, nil, nil + } + + if newLeft == nil { + ctx.AddOrphan(nodePtr.id) + return value, node.Right(), &newKeyWrapper{ + key: nodeKey, + // keyRef: nodePtr, + }, nil + } + + newNode, err := ctx.mutateBranch(node) + if err != nil { + return nil, nil, nil, err + } + newNode.left = newLeft + err = newNode.updateHeightSize() + if err != nil { + return nil, nil, nil, err + } + newNode, err = newNode.reBalance(ctx) + if err != nil { + return nil, nil, nil, err + } + + return value, NewNodePointer(newNode), newKey, nil + } + + value, newRight, newKey, err := removeRecursive(node.Right(), key, ctx) + if err != nil { + return nil, nil, nil, err + } + + if value == nil { + return nil, nodePtr, nil, nil + } + + if newRight == nil { + ctx.AddOrphan(nodePtr.id) + return value, node.Left(), nil, nil + } + + newNode, err := ctx.mutateBranch(node) + if err != nil { + return nil, nil, nil, err + } + + newNode.right = newRight + if newKey != nil { + newNode.key = newKey.key + // newNode._keyRef = newKey.keyRef + } + + err = newNode.updateHeightSize() + if err != nil { + return nil, nil, nil, err + } + + newNode, err = newNode.reBalance(ctx) + if err != nil { + return nil, nil, nil, err + } + + return value, NewNodePointer(newNode), nil, nil +} + +// IMPORTANT: nodes called with this method must be new or copies first. +// Code reviewers should use find usages to ensure that all callers follow this rule! +func (node *MemNode) updateHeightSize() error { + leftNode, err := node.left.Resolve() + if err != nil { + return err + } + + rightNode, err := node.right.Resolve() + if err != nil { + return err + } + + node.height = maxUint8(leftNode.Height(), rightNode.Height()) + 1 + node.size = leftNode.Size() + rightNode.Size() + return nil +} + +func maxUint8(a, b uint8) uint8 { + if a > b { + return a + } + return b +} + +// IMPORTANT: nodes called with this method must be new or copies first. +// Code reviewers should use find usages to ensure that all callers follow this rule! +func (node *MemNode) reBalance(ctx *mutationContext) (*MemNode, error) { + balance, err := calcBalance(node) + if err != nil { + return nil, err + } + switch { + case balance > 1: + left, err := node.left.Resolve() + if err != nil { + return nil, err + } + + leftBalance, err := calcBalance(left) + if err != nil { + return nil, err + } + + if leftBalance >= 0 { + // left left + return node.rotateRight(ctx) + } + + // left right + newLeft, err := ctx.mutateBranch(left) + if err != nil { + return nil, err + } + newLeft, err = newLeft.rotateLeft(ctx) + if err != nil { + return nil, err + } + node.left = NewNodePointer(newLeft) + return node.rotateRight(ctx) + case balance < -1: + right, err := node.right.Resolve() + if err != nil { + return nil, err + } + + rightBalance, err := calcBalance(right) + if err != nil { + return nil, err + } + + if rightBalance <= 0 { + // right right + return node.rotateLeft(ctx) + } + + // right left + newRight, err := ctx.mutateBranch(right) + if err != nil { + return nil, err + } + newRight, err = newRight.rotateRight(ctx) + node.right = NewNodePointer(newRight) + return node.rotateLeft(ctx) + default: + // nothing changed + return node, err + } +} + +func calcBalance(node Node) (int, error) { + leftNode, err := node.Left().Resolve() + if err != nil { + return 0, err + } + + rightNode, err := node.Right().Resolve() + if err != nil { + return 0, err + } + + return int(leftNode.Height()) - int(rightNode.Height()), nil +} + +// IMPORTANT: nodes called with this method must be new or copies first. +// Code reviewers should use find usages to ensure that all callers follow this rule! +func (node *MemNode) rotateRight(ctx *mutationContext) (*MemNode, error) { + left, err := node.left.Resolve() + if err != nil { + return nil, err + } + newSelf, err := ctx.mutateBranch(left) + if err != nil { + return nil, err + } + node.left = left.Right() + newSelf.right = NewNodePointer(node) + + err = node.updateHeightSize() + if err != nil { + return nil, err + } + err = newSelf.updateHeightSize() + if err != nil { + return nil, err + } + + return newSelf, nil +} + +// IMPORTANT: nodes called with this method must be new or copies first. +// Code reviewers should use find usages to ensure that all callers follow this rule! +func (node *MemNode) rotateLeft(ctx *mutationContext) (*MemNode, error) { + right, err := node.right.Resolve() + if err != nil { + return nil, err + } + + newSelf, err := ctx.mutateBranch(right) + if err != nil { + return nil, err + } + + node.right = right.Left() + newSelf.left = NewNodePointer(node) + + err = node.updateHeightSize() + if err != nil { + return nil, err + } + + err = newSelf.updateHeightSize() + if err != nil { + return nil, err + } + + return newSelf, nil +} + +type mutationContext struct { + version uint32 + orphans []NodeID +} + +func (ctx *mutationContext) mutateBranch(node Node) (*MemNode, error) { + id := node.ID() + if !id.IsEmpty() { + ctx.orphans = append(ctx.orphans, id) + } + return node.MutateBranch(ctx.version) +} + +func (ctx *mutationContext) AddOrphan(id NodeID) { + if !id.IsEmpty() { + ctx.orphans = append(ctx.orphans, id) + } +} diff --git a/iavl/internal/verify.go b/iavl/internal/verify.go new file mode 100644 index 000000000000..fbf7fa97e420 --- /dev/null +++ b/iavl/internal/verify.go @@ -0,0 +1,120 @@ +package internal + +import ( + "bytes" + "fmt" +) + +func verifyNode(np *NodePointer) error { + node, err := np.Resolve() + if err != nil { + return fmt.Errorf("resolve node %s: %w", np.id, err) + } + + if node.Version() != uint32(np.id.Version) { + return fmt.Errorf("node %s has version %d, expected %d", np.id, node.Version(), np.id.Version) + } + + if node.IsLeaf() { + if node.Height() != 0 { + return fmt.Errorf("leaf node %s has height %d", np.id, node.Height()) + } + + if node.Size() != 1 { + return fmt.Errorf("leaf node %s has size %d, expected 1", np.id, node.Size()) + } + + if node.Left() != nil { + return fmt.Errorf("leaf node %s has non-nil left child", np.id) + } + + if node.Right() != nil { + return fmt.Errorf("leaf node %s has non-nil right child", np.id) + } + + hash, err := computeHash(node, nil, nil) + if err != nil { + return fmt.Errorf("compute hash for leaf node %s: %w", np.id, err) + } + + if !bytes.Equal(hash, node.Hash()) { + return fmt.Errorf("leaf node %s has invalid hash", np.id) + } + } else { + leftPtr := node.Left() + if leftPtr == nil { + return fmt.Errorf("branch node %s has nil left child", np.id) + } + + rightPtr := node.Right() + if rightPtr == nil { + return fmt.Errorf("branch node %s has nil right child", np.id) + } + + left, err := leftPtr.Resolve() + if err != nil { + return fmt.Errorf("resolve left child of node %s: %w", np.id, err) + } + + right, err := rightPtr.Resolve() + if err != nil { + return fmt.Errorf("resolve right child of node %s: %w", np.id, err) + } + + key, err := node.Key() + if err != nil { + return fmt.Errorf("get key of node %s: %w", np.id, err) + } + + leftKey, err := left.Key() + if err != nil { + return fmt.Errorf("get key of left child of node %s: %w", np.id, err) + } + + rightKey, err := right.Key() + if err != nil { + return fmt.Errorf("get key of right child of node %s: %w", np.id, err) + } + + if bytes.Compare(leftKey, key) >= 0 { + return fmt.Errorf("branch node %s with id %s has key %x, but left child %s, has key %x", node, np.id, key, left, leftKey) + } + + if bytes.Compare(rightKey, key) < 0 { + return fmt.Errorf("branch node %s with id %s has key %x, but right child %s, has key %x", node, np.id, key, right, rightKey) + } + + if left.Size()+right.Size() != node.Size() { + return fmt.Errorf("branch node %s has size %d, but children sizes are %d and %d", np.id, node.Size(), left.Size(), right.Size()) + } + + expectedHeight := maxUint8(left.Height(), right.Height()) + 1 + if node.Height() != expectedHeight { + return fmt.Errorf("branch node %s has height %d, expected %d, left height %d, right height %d", np.id, node.Height(), expectedHeight, left.Height(), right.Height()) + } + + // ensure balanced + balance := int(left.Height()) - int(right.Height()) + if balance < -1 || balance > 1 { + return fmt.Errorf("branch node %s is unbalanced: left height %d, right height %d", np.id, left.Height(), right.Height()) + } + + hash, err := computeHash(node, left.Hash(), right.Hash()) + if err != nil { + return fmt.Errorf("compute hash for branch node %s: %w", np.id, err) + } + + if !bytes.Equal(hash, node.Hash()) { + return fmt.Errorf("branch node %s has invalid hash", np.id) + } + + if err := verifyNode(leftPtr); err != nil { + return err + } + + if err := verifyNode(rightPtr); err != nil { + return err + } + } + return nil +} From d19f0e716be0d661915cc429e489160ab481ca03 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Tue, 2 Dec 2025 19:03:15 -0500 Subject: [PATCH 11/34] add tests --- iavl/internal/mem_node_test.go | 132 +++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 iavl/internal/mem_node_test.go diff --git a/iavl/internal/mem_node_test.go b/iavl/internal/mem_node_test.go new file mode 100644 index 000000000000..2df32bc0597c --- /dev/null +++ b/iavl/internal/mem_node_test.go @@ -0,0 +1,132 @@ +package internal + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMemNode_Getters(t *testing.T) { + left := NewNodePointer(&MemNode{}) + right := NewNodePointer(&MemNode{}) + nodeId := NewNodeID(true, 5, 10) + + node := &MemNode{ + height: 3, + version: 7, + size: 42, + key: []byte("testkey"), + value: []byte("testvalue"), + hash: []byte("testhash"), + left: left, + right: right, + nodeId: nodeId, + keyOffset: 100, + } + + require.Equal(t, uint8(3), node.Height()) + require.Equal(t, uint32(7), node.Version()) + require.Equal(t, int64(42), node.Size()) + require.Equal(t, left, node.Left()) + require.Equal(t, right, node.Right()) + require.Equal(t, []byte("testhash"), node.Hash()) + require.Equal(t, nodeId, node.ID()) + + key, err := node.Key() + require.NoError(t, err) + require.Equal(t, []byte("testkey"), key) + + value, err := node.Value() + require.NoError(t, err) + require.Equal(t, []byte("testvalue"), value) +} + +func TestMemNode_IsLeaf(t *testing.T) { + tests := []struct { + name string + height uint8 + want bool + }{ + {name: "leaf", height: 0, want: true}, + {name: "branch height 1", height: 1, want: false}, + {name: "branch height 5", height: 5, want: false}, + {name: "branch max height", height: 255, want: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &MemNode{height: tt.height} + require.Equal(t, tt.want, node.IsLeaf()) + }) + } +} + +func TestMemNode_String(t *testing.T) { + tests := []struct { + name string + node *MemNode + want string + }{ + { + name: "leaf node", + node: &MemNode{ + height: 0, + version: 1, + size: 1, + key: []byte{0xab, 0xcd}, + value: []byte{0x12, 0x34}, + }, + want: "MemNode{key:abcd, version:1, size:1, value:1234}", + }, + { + name: "branch node", + node: &MemNode{ + height: 2, + version: 5, + size: 10, + key: []byte{0xff}, + left: &NodePointer{id: NewNodeID(true, 1, 1)}, + right: &NodePointer{id: NewNodeID(true, 1, 2)}, + }, + want: "MemNode{key:ff, version:5, size:10, height:2, left:NodePointer{id: NodeID{leaf:true, version:1, index:1}, fileIdx: 0}, right:NodePointer{id: NodeID{leaf:true, version:1, index:2}, fileIdx: 0}}", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, tt.node.String()) + }) + } +} + +func TestMemNode_MutateBranch(t *testing.T) { + original := &MemNode{ + height: 2, + version: 5, + size: 10, + key: []byte("key"), + hash: []byte("oldhash"), + left: NewNodePointer(&MemNode{}), + right: NewNodePointer(&MemNode{}), + } + + mutated, err := original.MutateBranch(12) + require.NoError(t, err) + + // Version updated, hash cleared + require.Equal(t, uint32(12), mutated.Version()) + require.Nil(t, mutated.Hash()) + + // Other fields preserved + require.Equal(t, original.Height(), mutated.Height()) + require.Equal(t, original.Size(), mutated.Size()) + key, _ := mutated.Key() + require.Equal(t, []byte("key"), key) + require.Equal(t, original.Left(), mutated.Left()) + require.Equal(t, original.Right(), mutated.Right()) + + // Is a copy, not same pointer + require.NotSame(t, original, mutated) + + // Original unchanged + require.Equal(t, uint32(5), original.Version()) + require.Equal(t, []byte("oldhash"), original.Hash()) +} From 90bad5670dd05e5bcd2609c3dba3453b0150b949 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 11:35:04 -0500 Subject: [PATCH 12/34] reduce PR size --- iavl/internal/commit_tree.go | 14 -- iavl/internal/commit_tree_test.go | 238 ------------------ iavl/internal/node_hash.go | 118 --------- iavl/internal/node_update.go | 386 ------------------------------ iavl/internal/verify.go | 120 ---------- 5 files changed, 876 deletions(-) delete mode 100644 iavl/internal/commit_tree.go delete mode 100644 iavl/internal/commit_tree_test.go delete mode 100644 iavl/internal/node_hash.go delete mode 100644 iavl/internal/node_update.go delete mode 100644 iavl/internal/verify.go diff --git a/iavl/internal/commit_tree.go b/iavl/internal/commit_tree.go deleted file mode 100644 index 3df7687c8e73..000000000000 --- a/iavl/internal/commit_tree.go +++ /dev/null @@ -1,14 +0,0 @@ -package internal - -import ( - "log/slog" - "sync" - "sync/atomic" -) - -type CommitTree struct { - latest atomic.Pointer[NodePointer] - root *NodePointer - writeMutex sync.Mutex - logger *slog.Logger -} diff --git a/iavl/internal/commit_tree_test.go b/iavl/internal/commit_tree_test.go deleted file mode 100644 index d0ff8caf4c08..000000000000 --- a/iavl/internal/commit_tree_test.go +++ /dev/null @@ -1,238 +0,0 @@ -package internal - -import ( - "bytes" - "fmt" - "os" - "runtime/debug" - "testing" - - corestore "cosmossdk.io/core/store" - sdklog "cosmossdk.io/log" - storetypes "cosmossdk.io/store/types" - iavl1 "github.com/cosmos/iavl" - dbm "github.com/cosmos/iavl/db" - "github.com/stretchr/testify/require" - "golang.org/x/exp/maps" - "pgregory.net/rapid" -) - -func TestIAVLXSims(t *testing.T) { - rapid.Check(t, testIAVLXSims) -} - -func FuzzIAVLX(f *testing.F) { - f.Fuzz(rapid.MakeFuzz(testIAVLXSims)) -} - -func testIAVLXSims(t *rapid.T) { - defer func() { - if r := recover(); r != nil { - t.Fatalf("panic recovered: %v\nStack trace:\n%s", r, debug.Stack()) - } - }() - // logger := sdklog.NewTestLogger(t) - logger := sdklog.NewNopLogger() - dbV1 := dbm.NewMemDB() - treeV1 := iavl1.NewMutableTree(dbV1, 500000, true, logger) - - tempDir, err := os.MkdirTemp("", "iavlx") - require.NoError(t, err, "failed to create temp directory") - defer os.RemoveAll(tempDir) - simMachine := &SimMachine{ - treeV1: treeV1, - dirV2: tempDir, - existingKeys: map[string][]byte{}, - } - simMachine.openV2Tree(t) - - // TODO switch from StateMachineActions to manually setting up the actions map, this is going to be too magical for other maintainers otherwise - t.Repeat(map[string]func(*rapid.T){ - "": simMachine.Check, - "UpdateN": simMachine.UpdateN, - "GetN": simMachine.GetN, - "Iterate": simMachine.Iterate, - "Commit": simMachine.Commit, - }) - - require.NoError(t, treeV1.Close(), "failed to close iavl tree") - require.NoError(t, simMachine.treeV2.Close(), "failed to close iavlx tree") -} - -type SimMachine struct { - treeV1 *iavl1.MutableTree - treeV2 *CommitTree - dirV2 string - // existingKeys keeps track of keys that have been set in the tree or deleted. Deleted keys are retained as nil values. - existingKeys map[string][]byte -} - -func (s *SimMachine) openV2Tree(t interface { - require.TestingT - sdklog.TestingT -}) { - var err error - s.treeV2, err = NewCommitTree(s.dirV2, Options{ - WriteWAL: true, - CompactWAL: true, - DisableCompaction: true, - ZeroCopy: false, - EvictDepth: 0, - CompactionOrphanRatio: 0, - CompactionOrphanAge: 0, - RetainVersions: 0, - MinCompactionSeconds: 0, - ChangesetMaxTarget: 1, - CompactAfterVersions: 0, - ReaderUpdateInterval: 1, - }, sdklog.NewTestLogger(t)) - require.NoError(t, err, "failed to create iavlx tree") -} - -func (s *SimMachine) Check(t *rapid.T) { - // after every operation verify the iavlx tree - // after every operation we check that both trees are identical - s.compareIterators(t, nil, nil, true) -} - -func (s *SimMachine) UpdateN(t *rapid.T) { - n := rapid.IntRange(1, 5000).Draw(t, "n") - for i := 0; i < n; i++ { - del := rapid.Bool().Draw(t, "del") - if del { - s.delete(t) - } else { - s.set(t) - } - } -} - -func (s *SimMachine) GetN(t *rapid.T) { - n := rapid.IntRange(1, 5000).Draw(t, "n") - for i := 0; i < n; i++ { - s.get(t) - } -} - -func (s *SimMachine) set(t *rapid.T) { - // choose either a new or an existing key - key := s.selectKey(t) - value := rapid.SliceOfN(rapid.Byte(), 0, 10).Draw(t, "value") - // set in both trees - updated, errV1 := s.treeV1.Set(key, value) - require.NoError(t, errV1, "failed to set key in V1 tree") - branch := s.treeV2.CacheWrap().(storetypes.CacheKVStore) - branch.Set(key, value) - branch.Write() - // require.Equal(t, updated, updatedV2, "update status mismatch between V1 and V2 trees") - if updated { - require.NotNil(t, s.existingKeys[string(key)], "key shouldn't have been marked as updated") - } else { - existing, found := s.existingKeys[string(key)] - if found { - require.Nil(t, existing, value, "marked as not an update but existin key is nil") - } - } - s.existingKeys[string(key)] = value // mark as existing -} - -func (s *SimMachine) get(t *rapid.T) { - key := s.selectKey(t) - valueV1, errV1 := s.treeV1.Get(key) - require.NoError(t, errV1, "failed to get key from V1 tree") - valueV2 := s.treeV2.CacheWrap().(storetypes.CacheKVStore).Get(key) - require.Equal(t, valueV1, valueV2, "value mismatch between V1 and V2 trees") - expectedValue, found := s.existingKeys[string(key)] - if found { - require.Equal(t, expectedValue, valueV1, "expected value mismatch for key %s", key) - } else { - require.Nil(t, valueV1, "expected nil value for non-existing key %s", key) - } -} - -func (s *SimMachine) selectKey(t *rapid.T) []byte { - if len(s.existingKeys) > 0 && rapid.Bool().Draw(t, "existingKey") { - return []byte(rapid.SampledFrom(maps.Keys(s.existingKeys)).Draw(t, "key")) - } else { - // TODO consider testing longer keys - return rapid.SliceOfN(rapid.Byte(), 1, 10).Draw(t, "key") - } -} - -func (s *SimMachine) delete(t *rapid.T) { - key := s.selectKey(t) - existingValue, found := s.existingKeys[string(key)] - exists := found && existingValue != nil - // delete in both trees - _, removedV1, errV1 := s.treeV1.Remove(key) - require.NoError(t, errV1, "failed to remove key from V1 tree") - branch := s.treeV2.CacheWrap().(storetypes.CacheKVStore) - branch.Delete(key) - branch.Write() - // require.Equal(t, removedV1, removedV2, "removed status mismatch between V1 and V2 trees") - // TODO v1 & v2 have slightly different behaviors for the value returned on removal. We should re-enable this and check. - //if valueV1 == nil || len(valueV1) == 0 { - // require.Empty(t, valueV2, "value should be empty for removed key in V2 tree") - //} else { - // require.Equal(t, valueV1, valueV2, "value mismatch between V1 and V2 trees") - //} - require.Equal(t, exists, removedV1, "removed status should match existence of key") - s.existingKeys[string(key)] = nil // mark as deleted -} - -func (s *SimMachine) Iterate(t *rapid.T) { - start := s.selectKey(t) - end := s.selectKey(t) - // make sure end is after start - if string(end) <= string(start) { - temp := start - start = end - end = temp - } - - // TODO add cases where we nudge start or end up or down a little - - // ascending := rapid.Bool().Draw(t, "ascending") - - // s.compareIterators(t, start, end, ascending) -} - -func (s *SimMachine) Commit(t *rapid.T) { - hash1, _, err := s.treeV1.SaveVersion() - require.NoError(t, err, "failed to save version in V1 tree") - commitId2 := s.treeV2.Commit() - require.NoError(t, err, "failed to save version in V2 tree") - err = VerifyTree(s.treeV2) - require.NoError(t, err, "failed to verify V2 tree") - require.Equal(t, hash1, commitId2.Hash, "hash mismatch between V1 and V2 trees") - closeReopen := rapid.Bool().Draw(t, "closeReopen") - if closeReopen { - require.NoError(t, s.treeV2.Close()) - s.openV2Tree(t) - } -} - -func (s *SimMachine) debugDump(t *rapid.T) { - version := s.treeV1.Version() - t.Logf("Dumping trees at version %d", version) - graph1 := &bytes.Buffer{} - iavl.WriteDOTGraph(graph1, s.treeV1.ImmutableTree, nil) - t.Logf("V1 tree:\n%s", graph1.String()) - // renderTree(t, s.treeV2.Branch()) - iter2 := s.treeV2.CacheWrap().(storetypes.CacheKVStore).Iterator(nil, nil) - s.debugDumpTree(t, iter2) -} - -func (s *SimMachine) debugDumpTree(t *rapid.T, iter corestore.Iterator) { - dumpStr := "Tree dump:" - defer func() { - require.NoError(t, iter.Close(), "failed to close iterator") - }() - for iter.Valid() { - key := iter.Key() - value := iter.Value() - dumpStr += fmt.Sprintf("\n\tKey: %X, Value: %X", key, value) - iter.Next() - } - t.Log(dumpStr) -} diff --git a/iavl/internal/node_hash.go b/iavl/internal/node_hash.go deleted file mode 100644 index e7011008817c..000000000000 --- a/iavl/internal/node_hash.go +++ /dev/null @@ -1,118 +0,0 @@ -package internal - -import ( - "crypto/sha256" - "encoding/binary" - "fmt" - "hash" - "io" - "sync" -) - -func computeAndSetHash(node *MemNode, leftHash, rightHash []byte) ([]byte, error) { - h, err := computeHash(node, leftHash, rightHash) - if err != nil { - return nil, err - } - node.hash = h - - return h, nil -} - -var hasherPool = sync.Pool{ - New: func() any { - return sha256.New() - }, -} - -func putBackHasher(h hash.Hash) { - h.Reset() - hasherPool.Put(h) -} - -func computeHash(node Node, leftHash, rightHash []byte) ([]byte, error) { - hasher := hasherPool.Get().(hash.Hash) - defer putBackHasher(hasher) - if err := writeHashBytes(node, leftHash, rightHash, hasher); err != nil { - return nil, err - } - return hasher.Sum(nil), nil -} - -var emptyHash = sha256.New().Sum(nil) - -func shaSum256(bz []byte) []byte { - hasher := hasherPool.Get().(hash.Hash) - defer putBackHasher(hasher) - hasher.Write(bz) - var sum [sha256.Size]byte - hasher.Sum(sum[:0]) - return sum[:] -} - -// Writes the node's hash to the given `io.Writer`. This function recursively calls -// children to update hashes. -func writeHashBytes(node Node, leftHash, rightHash []byte, w io.Writer) error { - var ( - n int - buf [binary.MaxVarintLen64]byte - ) - - n = binary.PutVarint(buf[:], int64(node.Height())) - if _, err := w.Write(buf[0:n]); err != nil { - return fmt.Errorf("writing height, %w", err) - } - n = binary.PutVarint(buf[:], node.Size()) - if _, err := w.Write(buf[0:n]); err != nil { - return fmt.Errorf("writing size, %w", err) - } - n = binary.PutVarint(buf[:], int64(node.Version())) - if _, err := w.Write(buf[0:n]); err != nil { - return fmt.Errorf("writing version, %w", err) - } - - // Key is not written for inner nodes, unlike writeBytes. - - if node.IsLeaf() { - key, err := node.Key() - if err != nil { - return fmt.Errorf("getting key, %w", err) - } - - if err := encodeVarintPrefixedBytes(w, key); err != nil { - return fmt.Errorf("writing key, %w", err) - } - - value, err := node.Value() - if err != nil { - return fmt.Errorf("getting value, %w", err) - } - - // Indirection needed to provide proofs without values. - // (e.g. ProofLeafNode.ValueHash) - if err := encodeVarintPrefixedBytes(w, shaSum256(value)); err != nil { - return fmt.Errorf("writing value, %w", err) - } - } else { - if err := encodeVarintPrefixedBytes(w, leftHash); err != nil { - return fmt.Errorf("writing left hash, %w", err) - } - if err := encodeVarintPrefixedBytes(w, rightHash); err != nil { - return fmt.Errorf("writing right hash, %w", err) - } - } - - return nil -} - -// encodeVarintPrefixedBytes writes a varint length-prefixed byte slice to the writer, -// it's used for hash computation, must be compactible with the official IAVL implementation. -func encodeVarintPrefixedBytes(w io.Writer, bz []byte) error { - var buf [binary.MaxVarintLen64]byte - n := binary.PutUvarint(buf[:], uint64(len(bz))) - if _, err := w.Write(buf[0:n]); err != nil { - return err - } - _, err := w.Write(bz) - return err -} diff --git a/iavl/internal/node_update.go b/iavl/internal/node_update.go deleted file mode 100644 index 7441c93a9273..000000000000 --- a/iavl/internal/node_update.go +++ /dev/null @@ -1,386 +0,0 @@ -package internal - -import "bytes" - -func newLeafNode(key, value []byte, version uint32) *MemNode { - return &MemNode{ - height: 0, - size: 1, - version: version, - key: key, - value: value, - } -} - -// setRecursive do set operation. -// it always do modification and return new `MemNode`, even if the value is the same. -// also returns if it's an update or insertion, if update, the tree height and balance is not changed. -func setRecursive(nodePtr *NodePointer, leafNode *MemNode, ctx *mutationContext) (*NodePointer, bool, error) { - if nodePtr == nil { - return NewNodePointer(leafNode), true, nil - } - - node, err := nodePtr.Resolve() - if err != nil { - return nil, false, err - } - - nodeKey, err := node.Key() - if err != nil { - return nil, false, err - } - if node.IsLeaf() { - leafNodePtr := NewNodePointer(leafNode) - cmp := bytes.Compare(leafNode.key, nodeKey) - if cmp == 0 { - ctx.AddOrphan(nodePtr.id) - return leafNodePtr, true, nil - } - n := &MemNode{ - height: 1, - size: 2, - version: ctx.version, - } - switch cmp { - case -1: - n.left = leafNodePtr - n.right = nodePtr - n.key = nodeKey - // n._keyRef = node - case 1: - n.left = nodePtr - n.right = leafNodePtr - n.key = leafNode.key - // n._keyRef = leafNode - default: - panic("unreachable") - } - return NewNodePointer(n), false, nil - } else { - var ( - newChildPtr *NodePointer - newNode *MemNode - updated bool - err error - ) - if bytes.Compare(leafNode.key, nodeKey) == -1 { - newChildPtr, updated, err = setRecursive(node.Left(), leafNode, ctx) - if err != nil { - return nil, false, err - } - newNode, err = ctx.mutateBranch(node) - if err != nil { - return nil, false, err - } - newNode.left = newChildPtr - } else { - newChildPtr, updated, err = setRecursive(node.Right(), leafNode, ctx) - if err != nil { - return nil, false, err - } - newNode, err = ctx.mutateBranch(node) - if err != nil { - return nil, false, err - } - newNode.right = newChildPtr - } - - if !updated { - err = newNode.updateHeightSize() - if err != nil { - return nil, false, err - } - - newNode, err = newNode.reBalance(ctx) - if err != nil { - return nil, false, err - } - } - - return NewNodePointer(newNode), updated, nil - } -} - -type newKeyWrapper struct { - key []byte - // keyRef keyRefLink -} - -// removeRecursive returns: -// - (nil, origNode, nil) -> nothing changed in subtree -// - (value, nil, nil) -> leaf node is removed -// - (value, new node, newKey) -> subtree changed -func removeRecursive(nodePtr *NodePointer, key []byte, ctx *mutationContext) (value []byte, newNodePtr *NodePointer, newKey *newKeyWrapper, err error) { - if nodePtr == nil { - return nil, nil, nil, nil - } - - node, err := nodePtr.Resolve() - if err != nil { - return nil, nil, nil, err - } - - nodeKey, err := node.Key() - if err != nil { - return nil, nil, nil, err - } - - if node.IsLeaf() { - if bytes.Equal(nodeKey, key) { - ctx.AddOrphan(nodePtr.id) - value, err := node.Value() - return value, nil, nil, err - } - return nil, nodePtr, nil, nil - } - - if bytes.Compare(key, nodeKey) == -1 { - value, newLeft, newKey, err := removeRecursive(node.Left(), key, ctx) - if err != nil { - return nil, nil, nil, err - } - - if value == nil { - return nil, nodePtr, nil, nil - } - - if newLeft == nil { - ctx.AddOrphan(nodePtr.id) - return value, node.Right(), &newKeyWrapper{ - key: nodeKey, - // keyRef: nodePtr, - }, nil - } - - newNode, err := ctx.mutateBranch(node) - if err != nil { - return nil, nil, nil, err - } - newNode.left = newLeft - err = newNode.updateHeightSize() - if err != nil { - return nil, nil, nil, err - } - newNode, err = newNode.reBalance(ctx) - if err != nil { - return nil, nil, nil, err - } - - return value, NewNodePointer(newNode), newKey, nil - } - - value, newRight, newKey, err := removeRecursive(node.Right(), key, ctx) - if err != nil { - return nil, nil, nil, err - } - - if value == nil { - return nil, nodePtr, nil, nil - } - - if newRight == nil { - ctx.AddOrphan(nodePtr.id) - return value, node.Left(), nil, nil - } - - newNode, err := ctx.mutateBranch(node) - if err != nil { - return nil, nil, nil, err - } - - newNode.right = newRight - if newKey != nil { - newNode.key = newKey.key - // newNode._keyRef = newKey.keyRef - } - - err = newNode.updateHeightSize() - if err != nil { - return nil, nil, nil, err - } - - newNode, err = newNode.reBalance(ctx) - if err != nil { - return nil, nil, nil, err - } - - return value, NewNodePointer(newNode), nil, nil -} - -// IMPORTANT: nodes called with this method must be new or copies first. -// Code reviewers should use find usages to ensure that all callers follow this rule! -func (node *MemNode) updateHeightSize() error { - leftNode, err := node.left.Resolve() - if err != nil { - return err - } - - rightNode, err := node.right.Resolve() - if err != nil { - return err - } - - node.height = maxUint8(leftNode.Height(), rightNode.Height()) + 1 - node.size = leftNode.Size() + rightNode.Size() - return nil -} - -func maxUint8(a, b uint8) uint8 { - if a > b { - return a - } - return b -} - -// IMPORTANT: nodes called with this method must be new or copies first. -// Code reviewers should use find usages to ensure that all callers follow this rule! -func (node *MemNode) reBalance(ctx *mutationContext) (*MemNode, error) { - balance, err := calcBalance(node) - if err != nil { - return nil, err - } - switch { - case balance > 1: - left, err := node.left.Resolve() - if err != nil { - return nil, err - } - - leftBalance, err := calcBalance(left) - if err != nil { - return nil, err - } - - if leftBalance >= 0 { - // left left - return node.rotateRight(ctx) - } - - // left right - newLeft, err := ctx.mutateBranch(left) - if err != nil { - return nil, err - } - newLeft, err = newLeft.rotateLeft(ctx) - if err != nil { - return nil, err - } - node.left = NewNodePointer(newLeft) - return node.rotateRight(ctx) - case balance < -1: - right, err := node.right.Resolve() - if err != nil { - return nil, err - } - - rightBalance, err := calcBalance(right) - if err != nil { - return nil, err - } - - if rightBalance <= 0 { - // right right - return node.rotateLeft(ctx) - } - - // right left - newRight, err := ctx.mutateBranch(right) - if err != nil { - return nil, err - } - newRight, err = newRight.rotateRight(ctx) - node.right = NewNodePointer(newRight) - return node.rotateLeft(ctx) - default: - // nothing changed - return node, err - } -} - -func calcBalance(node Node) (int, error) { - leftNode, err := node.Left().Resolve() - if err != nil { - return 0, err - } - - rightNode, err := node.Right().Resolve() - if err != nil { - return 0, err - } - - return int(leftNode.Height()) - int(rightNode.Height()), nil -} - -// IMPORTANT: nodes called with this method must be new or copies first. -// Code reviewers should use find usages to ensure that all callers follow this rule! -func (node *MemNode) rotateRight(ctx *mutationContext) (*MemNode, error) { - left, err := node.left.Resolve() - if err != nil { - return nil, err - } - newSelf, err := ctx.mutateBranch(left) - if err != nil { - return nil, err - } - node.left = left.Right() - newSelf.right = NewNodePointer(node) - - err = node.updateHeightSize() - if err != nil { - return nil, err - } - err = newSelf.updateHeightSize() - if err != nil { - return nil, err - } - - return newSelf, nil -} - -// IMPORTANT: nodes called with this method must be new or copies first. -// Code reviewers should use find usages to ensure that all callers follow this rule! -func (node *MemNode) rotateLeft(ctx *mutationContext) (*MemNode, error) { - right, err := node.right.Resolve() - if err != nil { - return nil, err - } - - newSelf, err := ctx.mutateBranch(right) - if err != nil { - return nil, err - } - - node.right = right.Left() - newSelf.left = NewNodePointer(node) - - err = node.updateHeightSize() - if err != nil { - return nil, err - } - - err = newSelf.updateHeightSize() - if err != nil { - return nil, err - } - - return newSelf, nil -} - -type mutationContext struct { - version uint32 - orphans []NodeID -} - -func (ctx *mutationContext) mutateBranch(node Node) (*MemNode, error) { - id := node.ID() - if !id.IsEmpty() { - ctx.orphans = append(ctx.orphans, id) - } - return node.MutateBranch(ctx.version) -} - -func (ctx *mutationContext) AddOrphan(id NodeID) { - if !id.IsEmpty() { - ctx.orphans = append(ctx.orphans, id) - } -} diff --git a/iavl/internal/verify.go b/iavl/internal/verify.go deleted file mode 100644 index fbf7fa97e420..000000000000 --- a/iavl/internal/verify.go +++ /dev/null @@ -1,120 +0,0 @@ -package internal - -import ( - "bytes" - "fmt" -) - -func verifyNode(np *NodePointer) error { - node, err := np.Resolve() - if err != nil { - return fmt.Errorf("resolve node %s: %w", np.id, err) - } - - if node.Version() != uint32(np.id.Version) { - return fmt.Errorf("node %s has version %d, expected %d", np.id, node.Version(), np.id.Version) - } - - if node.IsLeaf() { - if node.Height() != 0 { - return fmt.Errorf("leaf node %s has height %d", np.id, node.Height()) - } - - if node.Size() != 1 { - return fmt.Errorf("leaf node %s has size %d, expected 1", np.id, node.Size()) - } - - if node.Left() != nil { - return fmt.Errorf("leaf node %s has non-nil left child", np.id) - } - - if node.Right() != nil { - return fmt.Errorf("leaf node %s has non-nil right child", np.id) - } - - hash, err := computeHash(node, nil, nil) - if err != nil { - return fmt.Errorf("compute hash for leaf node %s: %w", np.id, err) - } - - if !bytes.Equal(hash, node.Hash()) { - return fmt.Errorf("leaf node %s has invalid hash", np.id) - } - } else { - leftPtr := node.Left() - if leftPtr == nil { - return fmt.Errorf("branch node %s has nil left child", np.id) - } - - rightPtr := node.Right() - if rightPtr == nil { - return fmt.Errorf("branch node %s has nil right child", np.id) - } - - left, err := leftPtr.Resolve() - if err != nil { - return fmt.Errorf("resolve left child of node %s: %w", np.id, err) - } - - right, err := rightPtr.Resolve() - if err != nil { - return fmt.Errorf("resolve right child of node %s: %w", np.id, err) - } - - key, err := node.Key() - if err != nil { - return fmt.Errorf("get key of node %s: %w", np.id, err) - } - - leftKey, err := left.Key() - if err != nil { - return fmt.Errorf("get key of left child of node %s: %w", np.id, err) - } - - rightKey, err := right.Key() - if err != nil { - return fmt.Errorf("get key of right child of node %s: %w", np.id, err) - } - - if bytes.Compare(leftKey, key) >= 0 { - return fmt.Errorf("branch node %s with id %s has key %x, but left child %s, has key %x", node, np.id, key, left, leftKey) - } - - if bytes.Compare(rightKey, key) < 0 { - return fmt.Errorf("branch node %s with id %s has key %x, but right child %s, has key %x", node, np.id, key, right, rightKey) - } - - if left.Size()+right.Size() != node.Size() { - return fmt.Errorf("branch node %s has size %d, but children sizes are %d and %d", np.id, node.Size(), left.Size(), right.Size()) - } - - expectedHeight := maxUint8(left.Height(), right.Height()) + 1 - if node.Height() != expectedHeight { - return fmt.Errorf("branch node %s has height %d, expected %d, left height %d, right height %d", np.id, node.Height(), expectedHeight, left.Height(), right.Height()) - } - - // ensure balanced - balance := int(left.Height()) - int(right.Height()) - if balance < -1 || balance > 1 { - return fmt.Errorf("branch node %s is unbalanced: left height %d, right height %d", np.id, left.Height(), right.Height()) - } - - hash, err := computeHash(node, left.Hash(), right.Hash()) - if err != nil { - return fmt.Errorf("compute hash for branch node %s: %w", np.id, err) - } - - if !bytes.Equal(hash, node.Hash()) { - return fmt.Errorf("branch node %s has invalid hash", np.id) - } - - if err := verifyNode(leftPtr); err != nil { - return err - } - - if err := verifyNode(rightPtr); err != nil { - return err - } - } - return nil -} From e37410b89760ea48454c1c767dd8869d44995368 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 11:44:25 -0500 Subject: [PATCH 13/34] add get tests and update docs --- iavl/internal/mem_node_test.go | 178 +++++++++++++++++++++++++++++++++ iavl/internal/node.go | 5 + 2 files changed, 183 insertions(+) diff --git a/iavl/internal/mem_node_test.go b/iavl/internal/mem_node_test.go index 2df32bc0597c..aa97143d2213 100644 --- a/iavl/internal/mem_node_test.go +++ b/iavl/internal/mem_node_test.go @@ -130,3 +130,181 @@ func TestMemNode_MutateBranch(t *testing.T) { require.Equal(t, uint32(5), original.Version()) require.Equal(t, []byte("oldhash"), original.Hash()) } + +func TestMemNode_Get_Leaf(t *testing.T) { + tests := []struct { + name string + nodeKey string + nodeValue string + searchKey string + wantValue []byte + wantIndex int64 + }{ + { + name: "exact match", + nodeKey: "b", + nodeValue: "val_b", + searchKey: "b", + wantValue: []byte("val_b"), + wantIndex: 0, + }, + { + name: "search key less than node key", + nodeKey: "b", + nodeValue: "val_b", + searchKey: "a", + wantValue: nil, + wantIndex: 0, + }, + { + name: "search key greater than node key", + nodeKey: "b", + nodeValue: "val_b", + searchKey: "c", + wantValue: nil, + wantIndex: 1, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &MemNode{ + height: 0, + size: 1, + key: []byte(tt.nodeKey), + value: []byte(tt.nodeValue), + } + val, idx, err := node.Get([]byte(tt.searchKey)) + require.NoError(t, err) + require.Equal(t, tt.wantValue, val) + require.Equal(t, tt.wantIndex, idx) + }) + } +} + +func TestMemNode_Get_Branch(t *testing.T) { + // Hand-construct a simple tree: + // + // [b] <- branch, key="b", size=2 + // / \ + // [a] [b] <- leaves + // + // In IAVL, branch key = smallest key in right subtree + + leftLeaf := &MemNode{ + height: 0, + size: 1, + key: []byte("a"), + value: []byte("val_a"), + } + rightLeaf := &MemNode{ + height: 0, + size: 1, + key: []byte("b"), + value: []byte("val_b"), + } + root := &MemNode{ + height: 1, + size: 2, + key: []byte("b"), // smallest key in right subtree + left: NewNodePointer(leftLeaf), + right: NewNodePointer(rightLeaf), + } + + tests := []struct { + name string + searchKey string + wantValue []byte + wantIndex int64 + }{ + { + name: "find in left subtree", + searchKey: "a", + wantValue: []byte("val_a"), + wantIndex: 0, + }, + { + name: "find in right subtree", + searchKey: "b", + wantValue: []byte("val_b"), + wantIndex: 1, + }, + { + name: "key not found - less than all", + searchKey: "0", + wantValue: nil, + wantIndex: 0, + }, + { + name: "key not found - greater than all", + searchKey: "z", + wantValue: nil, + wantIndex: 2, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + val, idx, err := root.Get([]byte(tt.searchKey)) + require.NoError(t, err) + require.Equal(t, tt.wantValue, val) + require.Equal(t, tt.wantIndex, idx) + }) + } +} + +func TestMemNode_Get_DeeperTree(t *testing.T) { + // Hand-construct a 3-level tree: + // + // [c] <- root, size=4 + // / \ + // [b] [d] <- branches, size=2 each + // / \ / \ + // [a] [b] [c] [d] <- leaves + // + // Sorted keys: a=0, b=1, c=2, d=3 + + leafA := &MemNode{height: 0, size: 1, key: []byte("a"), value: []byte("val_a")} + leafB := &MemNode{height: 0, size: 1, key: []byte("b"), value: []byte("val_b")} + leafC := &MemNode{height: 0, size: 1, key: []byte("c"), value: []byte("val_c")} + leafD := &MemNode{height: 0, size: 1, key: []byte("d"), value: []byte("val_d")} + + branchLeft := &MemNode{ + height: 1, + size: 2, + key: []byte("b"), + left: NewNodePointer(leafA), + right: NewNodePointer(leafB), + } + branchRight := &MemNode{ + height: 1, + size: 2, + key: []byte("d"), + left: NewNodePointer(leafC), + right: NewNodePointer(leafD), + } + root := &MemNode{ + height: 2, + size: 4, + key: []byte("c"), // smallest key in right subtree + left: NewNodePointer(branchLeft), + right: NewNodePointer(branchRight), + } + + tests := []struct { + searchKey string + wantValue []byte + wantIndex int64 + }{ + {"a", []byte("val_a"), 0}, + {"b", []byte("val_b"), 1}, + {"c", []byte("val_c"), 2}, + {"d", []byte("val_d"), 3}, + } + for _, tt := range tests { + t.Run(tt.searchKey, func(t *testing.T) { + val, idx, err := root.Get([]byte(tt.searchKey)) + require.NoError(t, err) + require.Equal(t, tt.wantValue, val) + require.Equal(t, tt.wantIndex, idx) + }) + } +} diff --git a/iavl/internal/node.go b/iavl/internal/node.go index 894195adfc3c..19b45ff6899e 100644 --- a/iavl/internal/node.go +++ b/iavl/internal/node.go @@ -39,6 +39,11 @@ type Node interface { Version() uint32 // Get traverses this subtree to find the value associated with the given key. + // If the key is found, value contains the associated value. + // If the key is not found, value is nil (not an error). + // The index is the 0-based position where the key exists or would be inserted + // in sorted order among all leaf keys in this subtree. This is useful for + // range queries and determining a key's position even when it doesn't exist. Get(key []byte) (value []byte, index int64, err error) // MutateBranch creates a mutable copy of this branch node created at the specified version. From 0564bdb050eb17a11c1cb2344adc4630b6aae750 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 11:46:51 -0500 Subject: [PATCH 14/34] add more test explanations --- iavl/internal/mem_node_test.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/iavl/internal/mem_node_test.go b/iavl/internal/mem_node_test.go index aa97143d2213..ca948f070103 100644 --- a/iavl/internal/mem_node_test.go +++ b/iavl/internal/mem_node_test.go @@ -132,6 +132,11 @@ func TestMemNode_MutateBranch(t *testing.T) { } func TestMemNode_Get_Leaf(t *testing.T) { + // When Get is called on a leaf node: + // - If key matches: returns (value, 0, nil) + // - If key not found: returns (nil, index, nil) where index is the insertion point + // - key < nodeKey: index=0 (would insert before this leaf) + // - key > nodeKey: index=1 (would insert after this leaf) tests := []struct { name string nodeKey string @@ -154,7 +159,7 @@ func TestMemNode_Get_Leaf(t *testing.T) { nodeValue: "val_b", searchKey: "a", wantValue: nil, - wantIndex: 0, + wantIndex: 0, // "a" would be inserted before "b" }, { name: "search key greater than node key", @@ -162,7 +167,7 @@ func TestMemNode_Get_Leaf(t *testing.T) { nodeValue: "val_b", searchKey: "c", wantValue: nil, - wantIndex: 1, + wantIndex: 1, // "c" would be inserted after "b" }, } for _, tt := range tests { @@ -186,9 +191,13 @@ func TestMemNode_Get_Branch(t *testing.T) { // // [b] <- branch, key="b", size=2 // / \ - // [a] [b] <- leaves + // [a] [b] <- leaves (index 0, index 1) // // In IAVL, branch key = smallest key in right subtree + // + // Index is the 0-based position in sorted leaf order: + // - "a" is at index 0, "b" is at index 1 + // - Keys not found return the insertion point leftLeaf := &MemNode{ height: 0, @@ -232,13 +241,13 @@ func TestMemNode_Get_Branch(t *testing.T) { name: "key not found - less than all", searchKey: "0", wantValue: nil, - wantIndex: 0, + wantIndex: 0, // "0" would be inserted at position 0 }, { name: "key not found - greater than all", searchKey: "z", wantValue: nil, - wantIndex: 2, + wantIndex: 2, // "z" would be inserted at position 2 (after both leaves) }, } for _, tt := range tests { From f4607bf5212f373146b3c9d92152035d7616f6aa Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 11:53:02 -0500 Subject: [PATCH 15/34] update doc, add missing test --- iavl/internal/node.go | 3 ++- iavl/internal/node_id_test.go | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/iavl/internal/node.go b/iavl/internal/node.go index 19b45ff6899e..f449ccdf45db 100644 --- a/iavl/internal/node.go +++ b/iavl/internal/node.go @@ -14,7 +14,8 @@ type Node interface { // Key returns the key of this node. Key() ([]byte, error) - // Value returns the value of this node. It is an error to call this method on non-leaf nodes. + // Value returns the value of this node. + // Calling this on a non-leaf node will return nil and possibly an error. Value() ([]byte, error) // Left returns a pointer to the left child node. diff --git a/iavl/internal/node_id_test.go b/iavl/internal/node_id_test.go index a99f44c37d21..dffccb205c9b 100644 --- a/iavl/internal/node_id_test.go +++ b/iavl/internal/node_id_test.go @@ -34,3 +34,8 @@ func TestNodeID(t *testing.T) { }) } } + +func TestNodeID_IsEmpty(t *testing.T) { + require.True(t, NodeID{}.IsEmpty()) + require.False(t, NewNodeID(true, 1, 1).IsEmpty()) +} From 78faae576c8e1f342c0999f3ededc4c45b90c869 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 16:03:03 -0500 Subject: [PATCH 16/34] feat(iavl): define KV data format --- iavl/internal/file_writer.go | 38 +++++++++++ iavl/internal/kvdata.go | 17 +++++ iavl/internal/kvdata_writer.go | 114 +++++++++++++++++++++++++++++++++ iavl/internal/mem_node.go | 21 +++--- iavl/internal/mem_node_test.go | 20 +++--- iavl/internal/update.go | 16 +++++ 6 files changed, 206 insertions(+), 20 deletions(-) create mode 100644 iavl/internal/file_writer.go create mode 100644 iavl/internal/kvdata.go create mode 100644 iavl/internal/kvdata_writer.go create mode 100644 iavl/internal/update.go diff --git a/iavl/internal/file_writer.go b/iavl/internal/file_writer.go new file mode 100644 index 000000000000..afe33bb1c3a9 --- /dev/null +++ b/iavl/internal/file_writer.go @@ -0,0 +1,38 @@ +package internal + +import ( + "bufio" + "fmt" + "io" + "os" +) + +type FileWriter struct { + writer *bufio.Writer + written int +} + +func NewFileWriter(file *os.File) *FileWriter { + return &FileWriter{ + writer: bufio.NewWriterSize(file, 512*1024 /* 512kb */), // TODO: maybe we can have this as a config option? + } +} + +func (f *FileWriter) Write(p []byte) (n int, err error) { + n, err = f.writer.Write(p) + f.written += n + return n, err +} + +func (f *FileWriter) Flush() error { + if err := f.writer.Flush(); err != nil { + return fmt.Errorf("failed to flush writer: %w", err) + } + return nil +} + +func (f *FileWriter) Size() int { + return f.written +} + +var _ io.Writer = (*FileWriter)(nil) diff --git a/iavl/internal/kvdata.go b/iavl/internal/kvdata.go new file mode 100644 index 000000000000..fd083e7ae322 --- /dev/null +++ b/iavl/internal/kvdata.go @@ -0,0 +1,17 @@ +package internal + +// KVDataEntryType represents the type of entry in the KV data file. +type KVDataEntryType byte + +const ( + // KVDataEntryTypeWALStart is the first entry in an uncompacted KV data file which indicates that this KV data file + // can be used for WAL replay restoration. It must immediately be followed by the varint-encoded version number + // corresponding to the first version in this changeset. + KVDataEntryTypeWALStart KVDataEntryType = iota + // KVDataEntryTypeWALSet indicates a set operation for a key-value pair. + KVDataEntryTypeWALSet = iota + KVDataEntryTypeWALDelete + KVDataEntryTypeWALCommit + KVDataEntryTypeExtraK + KVDataEntryTypeExtraKV +) diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go new file mode 100644 index 000000000000..a0de9a2a34c9 --- /dev/null +++ b/iavl/internal/kvdata_writer.go @@ -0,0 +1,114 @@ +package internal + +import ( + "encoding/binary" + "fmt" + "math" + "os" +) + +type KVDataWriter struct { + *FileWriter +} + +func NewKVDataWriter(file *os.File) *KVDataWriter { + fw := NewFileWriter(file) + return &KVDataWriter{ + FileWriter: fw, + } +} + +func (kvs *KVDataWriter) WriteK(key []byte) (offset uint64, err error) { + _, err = kvs.Write([]byte{KVDataEntryTypeExtraK}) + if err != nil { + return offset, err + } + + return kvs.writeLenPrefixedBytes(key) +} + +func (kvs *KVDataWriter) WriteKV(key, value []byte) (offset uint32, err error) { + _, err = kvs.Write([]byte{KVDataEntryTypeExtraKV}) + if err != nil { + return offset, err + } + + offset, err = kvs.writeLenPrefixedBytes(key) + if err != nil { + return 0, err + } + _, err = kvs.writeLenPrefixedBytes(value) + return offset, err +} + +func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { + for _, update := range updates { + if deleteKey := update.DeleteKey; deleteKey != nil { + _, err := kvs.Write([]byte{KVDataEntryTypeDelete}) + if err != nil { + return err + } + _, err = kvs.writeLenPrefixedBytes(deleteKey) + if err != nil { + return err + } + } else if memNode := update.SetNode; memNode != nil { + _, err := kvs.Write([]byte{KVDataEntryTypeSet}) + if err != nil { + return err + } + offset, err := kvs.writeLenPrefixedBytes(memNode.key) + if err != nil { + return err + } + memNode.kvOffset = offset + + _, err = kvs.writeLenPrefixedBytes(memNode.value) + if err != nil { + return err + } + } else { + return fmt.Errorf("invalid update: neither SetNode nor DeleteKey is set") + } + } + return nil +} + +func (kvs *KVDataWriter) WriteCommit(version uint32) error { + _, err := kvs.Write([]byte{KVDataEntryTypeCommit}) + if err != nil { + return err + } + + return kvs.writeLEU32(version) +} + +func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint64, err error) { + lenKey := len(key) + if lenKey > math.MaxUint32 { + return 0, fmt.Errorf("key too large: %d bytes", lenKey) + } + + offset = uint64(kvs.Size()) + + // write little endian uint32 length prefix + err = kvs.writeLEU32(uint32(lenKey)) + if err != nil { + return offset, err + } + + // write key bytes + _, err = kvs.Write(key) + if err != nil { + return offset, err + } + + return offset, nil +} + +func (kvs *KVDataWriter) writeLEU32(x uint32) error { + var buf [4]byte + binary.LittleEndian.PutUint32(buf[:], x) + _, err := kvs.Write(buf[:]) + return err +} diff --git a/iavl/internal/mem_node.go b/iavl/internal/mem_node.go index 20e63485dc03..5c4e56ea9949 100644 --- a/iavl/internal/mem_node.go +++ b/iavl/internal/mem_node.go @@ -8,16 +8,17 @@ import ( // MemNode represents an in-memory node that has recently been created and may or may not have // been serialized to disk yet. type MemNode struct { - height uint8 - version uint32 - size int64 - key []byte - value []byte - left *NodePointer - right *NodePointer - hash []byte - nodeId NodeID // ID of this node, 0 if not yet assigned - keyOffset uint32 + height uint8 + version uint32 + size int64 + key []byte + value []byte + left *NodePointer + right *NodePointer + hash []byte + nodeId NodeID // ID of this node, 0 if not yet assigned + // kvOffset is the offset of the key or key/value data in the KV data file. + kvOffset uint64 } // ID implements the Node interface. diff --git a/iavl/internal/mem_node_test.go b/iavl/internal/mem_node_test.go index ca948f070103..a2f19000c19e 100644 --- a/iavl/internal/mem_node_test.go +++ b/iavl/internal/mem_node_test.go @@ -12,16 +12,16 @@ func TestMemNode_Getters(t *testing.T) { nodeId := NewNodeID(true, 5, 10) node := &MemNode{ - height: 3, - version: 7, - size: 42, - key: []byte("testkey"), - value: []byte("testvalue"), - hash: []byte("testhash"), - left: left, - right: right, - nodeId: nodeId, - keyOffset: 100, + height: 3, + version: 7, + size: 42, + key: []byte("testkey"), + value: []byte("testvalue"), + hash: []byte("testhash"), + left: left, + right: right, + nodeId: nodeId, + kvOffset: 100, } require.Equal(t, uint8(3), node.Height()) diff --git a/iavl/internal/update.go b/iavl/internal/update.go new file mode 100644 index 000000000000..04720074121b --- /dev/null +++ b/iavl/internal/update.go @@ -0,0 +1,16 @@ +package internal + +// KVUpdate represents either a set or delete operation for a key-value pair. +// If SetNode is non-nil, it indicates a set operation. +// If DeleteKey is non-nil, it indicates a delete operation. +type KVUpdate struct { + // SetNode uses a MemNode to represent a set operation. + // If non-nil, it indicates that the key-value pair should be set/updated. + // We use a *MemNode directly here because when KV data is serialized, we track + // the KV offset inside the MemNode.kvOffset field which is later used + // to reference the KV data in serialized tree nodes. + SetNode *MemNode + + // DeleteKey uses a byte slice to represent the key being deleted. + DeleteKey []byte +} From e8cfa9300d0fd2c1b32e0a7caec6349923cbc9ab Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 17:20:34 -0500 Subject: [PATCH 17/34] WIP on kv data design --- iavl/internal/kvdata.go | 43 +++++++++++---- iavl/internal/kvdata_writer.go | 97 +++++++++++++++++++++++++--------- 2 files changed, 106 insertions(+), 34 deletions(-) diff --git a/iavl/internal/kvdata.go b/iavl/internal/kvdata.go index fd083e7ae322..39ddd089be3f 100644 --- a/iavl/internal/kvdata.go +++ b/iavl/internal/kvdata.go @@ -1,17 +1,40 @@ package internal -// KVDataEntryType represents the type of entry in the KV data file. -type KVDataEntryType byte +// KVEntryType represents the type of entry in the KV data file. +type KVEntryType byte const ( - // KVDataEntryTypeWALStart is the first entry in an uncompacted KV data file which indicates that this KV data file + // KVEntryWALStart is the first entry in an uncompacted KV data file which indicates that this KV data file // can be used for WAL replay restoration. It must immediately be followed by the varint-encoded version number // corresponding to the first version in this changeset. - KVDataEntryTypeWALStart KVDataEntryType = iota - // KVDataEntryTypeWALSet indicates a set operation for a key-value pair. - KVDataEntryTypeWALSet = iota - KVDataEntryTypeWALDelete - KVDataEntryTypeWALCommit - KVDataEntryTypeExtraK - KVDataEntryTypeExtraKV + KVEntryWALStart KVEntryType = 0x0 + + // KVEntryWALSet indicates a set operation for a key-value pair. + // This must be followed by: + // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set + // - varint length-prefixed value + KVEntryWALSet KVEntryType = 0x1 + + // KVEntryWALDelete indicates a delete operation for a key. + // This must be followed by: + // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set + KVEntryWALDelete KVEntryType = 0x2 + + // KVEntryWALCommit indicates the end of a batch of operations for a specific version. + // This must be followed by a varint-encoded version number. + KVEntryWALCommit KVEntryType = 0x3 + + // KVEntryKeyData indicates a key entry in the KV data file. + // This must be followed by: + // - 16-bit little-endian length-prefixed key + KVEntryKeyData KVEntryType = 0x4 + + // KVEntryKeyValueData indicates a value entry in the KV data file. + // This must be followed by: + // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set + // - varint length-prefixed value + KVEntryKeyValueData KVEntryType = 0x5 + + // KVFlagCachedKey indicates that the key is stored as a 32-bit offset to a cached key entry. + KVFlagCachedKey KVEntryType = 0x80 ) diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index a0de9a2a34c9..b5547eb162a1 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -9,48 +9,37 @@ import ( type KVDataWriter struct { *FileWriter + keyCache map[string]uint32 } func NewKVDataWriter(file *os.File) *KVDataWriter { fw := NewFileWriter(file) return &KVDataWriter{ FileWriter: fw, + keyCache: make(map[string]uint32), } } -func (kvs *KVDataWriter) WriteK(key []byte) (offset uint64, err error) { - _, err = kvs.Write([]byte{KVDataEntryTypeExtraK}) +func (kvs *KVDataWriter) StartWAL(version uint64) error { + err := kvs.writeType(KVEntryWALStart) if err != nil { - return offset, err - } - - return kvs.writeLenPrefixedBytes(key) -} - -func (kvs *KVDataWriter) WriteKV(key, value []byte) (offset uint32, err error) { - _, err = kvs.Write([]byte{KVDataEntryTypeExtraKV}) - if err != nil { - return offset, err - } - - offset, err = kvs.writeLenPrefixedBytes(key) - if err != nil { - return 0, err + return err } - _, err = kvs.writeLenPrefixedBytes(value) - return offset, err + return kvs.writeVarUint(version) } func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { for _, update := range updates { if deleteKey := update.DeleteKey; deleteKey != nil { - _, err := kvs.Write([]byte{KVDataEntryTypeDelete}) + cachedOffset, cached := kvs.keyCache[string(deleteKey)] + err := kvs.writeType(KVEntryWALDelete | KVFlagCachedKey) if err != nil { return err } - _, err = kvs.writeLenPrefixedBytes(deleteKey) - if err != nil { - return err + + if cached { + } else { + } } else if memNode := update.SetNode; memNode != nil { _, err := kvs.Write([]byte{KVDataEntryTypeSet}) @@ -83,6 +72,29 @@ func (kvs *KVDataWriter) WriteCommit(version uint32) error { return kvs.writeLEU32(version) } +func (kvs *KVDataWriter) WriteK(key []byte) (offset uint64, err error) { + _, err = kvs.Write([]byte{KVDataEntryTypeExtraK}) + if err != nil { + return offset, err + } + + return kvs.writeLenPrefixedBytes(key) +} + +func (kvs *KVDataWriter) WriteKV(key, value []byte) (offset uint32, err error) { + _, err = kvs.Write([]byte{KVDataEntryTypeExtraKV}) + if err != nil { + return offset, err + } + + offset, err = kvs.writeLenPrefixedBytes(key) + if err != nil { + return 0, err + } + _, err = kvs.writeLenPrefixedBytes(value) + return offset, err +} + func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint64, err error) { lenKey := len(key) if lenKey > math.MaxUint32 { @@ -106,7 +118,44 @@ func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint64, err e return offset, nil } -func (kvs *KVDataWriter) writeLEU32(x uint32) error { +func (kvs *KVDataWriter) writeKey(typ KVEntryType, key []byte) error { + cachedOffset, cached := kvs.keyCache[string(key)] + if cached { + typ |= KVFlagCachedKey + } + if err := kvs.writeType(typ); err != nil { + return err + } + if cached { + return kvs.writeLEU32(uint32(cachedOffset)) + } else { + + } +} + +func (kvs *KVDataWriter) writeType(x KVEntryType) error { + _, err := kvs.Write([]byte{byte(x)}) + return err +} + +func (kvs *KVDataWriter) writeVarUint(x uint64) error { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], x) + _, err := kvs.Write(buf[0:n]) + return err +} + +func (kvs *KVDataWriter) writeLEU16(x uint64) error { + if x > math.MaxUint16 { + return fmt.Errorf("value overflows uint16: %d", x) + } +} + +func (kvs *KVDataWriter) writeLEU32(x uint64) error { + if x > math.MaxUint32 { + return fmt.Errorf("value overflows uint32: %d", x) + } + var buf [4]byte binary.LittleEndian.PutUint32(buf[:], x) _, err := kvs.Write(buf[:]) From 3ff4da508f319fdb0a85d1f163dd011b40e90271 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 17:32:17 -0500 Subject: [PATCH 18/34] WIP on kv data design --- iavl/internal/kvdata.go | 30 +++++++++++++----------------- iavl/internal/leaf_layout.go | 4 ++++ iavl/internal/mem_node.go | 22 +++++++++++----------- iavl/internal/mem_node_test.go | 21 +++++++++++---------- 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/iavl/internal/kvdata.go b/iavl/internal/kvdata.go index 39ddd089be3f..a2f0c8a092e4 100644 --- a/iavl/internal/kvdata.go +++ b/iavl/internal/kvdata.go @@ -10,31 +10,27 @@ const ( KVEntryWALStart KVEntryType = 0x0 // KVEntryWALSet indicates a set operation for a key-value pair. - // This must be followed by: - // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set - // - varint length-prefixed value + // This should be followed by a varint-encoded length and the raw bytes OR + // if the KVFlagCachedKey flag is set, a 32-bit little-endian offset referencing a cached key, + // AND then a varint-encoded length and the value bytes. KVEntryWALSet KVEntryType = 0x1 // KVEntryWALDelete indicates a delete operation for a key. - // This must be followed by: - // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set + // This should be followed by a varint-encoded length and the key bytes OR + // if the KVFlagCachedKey flag is set, a 32-bit little-endian offset referencing a cached key. KVEntryWALDelete KVEntryType = 0x2 - // KVEntryWALCommit indicates the end of a batch of operations for a specific version. + // KVEntryWALCommit indicates the commit operation for a version. // This must be followed by a varint-encoded version number. KVEntryWALCommit KVEntryType = 0x3 - // KVEntryKeyData indicates a key entry in the KV data file. - // This must be followed by: - // - 16-bit little-endian length-prefixed key - KVEntryKeyData KVEntryType = 0x4 + // KVEntryBlob indicates a blob entry storing raw key-value data. + // This should be followed by a varint-encoded length and the raw bytes. + // This entry type is used for compacted (non-WAL) KV data files or + // for branch keys that aren't otherwise cached. + KVEntryBlob KVEntryType = 0x4 - // KVEntryKeyValueData indicates a value entry in the KV data file. - // This must be followed by: - // - 16-bit little-endian length-prefixed key or 32-bit offset to cached key if KVFlagCachedKey is set - // - varint length-prefixed value - KVEntryKeyValueData KVEntryType = 0x5 - - // KVFlagCachedKey indicates that the key is stored as a 32-bit offset to a cached key entry. + // KVFlagCachedKey indicates that the key for this entry is cached and should be referenced by + // a 32-bit little-endian offset instead of being stored inline. KVFlagCachedKey KVEntryType = 0x80 ) diff --git a/iavl/internal/leaf_layout.go b/iavl/internal/leaf_layout.go index 95e67ca3a938..9a880c2a37a4 100644 --- a/iavl/internal/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -31,6 +31,10 @@ type LeafLayout struct { // this existing "compact" format. KeyOffset uint32 + // ValueOffset is the offset the value data for this node in the key value data file. + // The same size considerations apply here as for KeyOffset. + ValueOffset uint32 + // Hash is the hash of this leaf node. Hash [32]byte } diff --git a/iavl/internal/mem_node.go b/iavl/internal/mem_node.go index 5c4e56ea9949..02dd485f43fe 100644 --- a/iavl/internal/mem_node.go +++ b/iavl/internal/mem_node.go @@ -8,17 +8,17 @@ import ( // MemNode represents an in-memory node that has recently been created and may or may not have // been serialized to disk yet. type MemNode struct { - height uint8 - version uint32 - size int64 - key []byte - value []byte - left *NodePointer - right *NodePointer - hash []byte - nodeId NodeID // ID of this node, 0 if not yet assigned - // kvOffset is the offset of the key or key/value data in the KV data file. - kvOffset uint64 + height uint8 + version uint32 + size int64 + key []byte + value []byte + left *NodePointer + right *NodePointer + hash []byte + nodeId NodeID // ID of this node, 0 if not yet assigned + keyOffset uint32 + valueOffset uint32 } // ID implements the Node interface. diff --git a/iavl/internal/mem_node_test.go b/iavl/internal/mem_node_test.go index a2f19000c19e..916141824ea2 100644 --- a/iavl/internal/mem_node_test.go +++ b/iavl/internal/mem_node_test.go @@ -12,16 +12,17 @@ func TestMemNode_Getters(t *testing.T) { nodeId := NewNodeID(true, 5, 10) node := &MemNode{ - height: 3, - version: 7, - size: 42, - key: []byte("testkey"), - value: []byte("testvalue"), - hash: []byte("testhash"), - left: left, - right: right, - nodeId: nodeId, - kvOffset: 100, + height: 3, + version: 7, + size: 42, + key: []byte("testkey"), + value: []byte("testvalue"), + hash: []byte("testhash"), + left: left, + right: right, + nodeId: nodeId, + keyOffset: 100, + valueOffset: 200, } require.Equal(t, uint8(3), node.Height()) From c75dde05804ca6f057ed2310eaa0d60973df504f Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 17:43:44 -0500 Subject: [PATCH 19/34] WIP on kv data writer --- iavl/internal/kvdata.go | 19 +++-- iavl/internal/kvdata_writer.go | 149 +++++++++++++++++++-------------- 2 files changed, 97 insertions(+), 71 deletions(-) diff --git a/iavl/internal/kvdata.go b/iavl/internal/kvdata.go index a2f0c8a092e4..7d6a9ab98087 100644 --- a/iavl/internal/kvdata.go +++ b/iavl/internal/kvdata.go @@ -10,24 +10,25 @@ const ( KVEntryWALStart KVEntryType = 0x0 // KVEntryWALSet indicates a set operation for a key-value pair. - // This should be followed by a varint-encoded length and the raw bytes OR - // if the KVFlagCachedKey flag is set, a 32-bit little-endian offset referencing a cached key, - // AND then a varint-encoded length and the value bytes. + // This should be followed by: + // - varint key length + key bytes, OR if KVFlagCachedKey is set, a 32-bit LE offset to a cached key + // - varint value length + value bytes + // Offsets point to the start of the varint length field, not the type byte. KVEntryWALSet KVEntryType = 0x1 // KVEntryWALDelete indicates a delete operation for a key. - // This should be followed by a varint-encoded length and the key bytes OR - // if the KVFlagCachedKey flag is set, a 32-bit little-endian offset referencing a cached key. + // This should be followed by: + // - varint key length + key bytes, OR if KVFlagCachedKey is set, a 32-bit LE offset to a cached key + // Offsets point to the start of the varint length field, not the type byte. KVEntryWALDelete KVEntryType = 0x2 // KVEntryWALCommit indicates the commit operation for a version. // This must be followed by a varint-encoded version number. KVEntryWALCommit KVEntryType = 0x3 - // KVEntryBlob indicates a blob entry storing raw key-value data. - // This should be followed by a varint-encoded length and the raw bytes. - // This entry type is used for compacted (non-WAL) KV data files or - // for branch keys that aren't otherwise cached. + // KVEntryBlob indicates a standalone blob entry (key or value data). + // This should be followed by varint length + raw bytes. + // Used for compacted (non-WAL) data or branch keys not already cached. KVEntryBlob KVEntryType = 0x4 // KVFlagCachedKey indicates that the key for this entry is cached and should be referenced by diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index b5547eb162a1..45f8057f67c8 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -5,6 +5,7 @@ import ( "fmt" "math" "os" + "unsafe" ) type KVDataWriter struct { @@ -31,31 +32,58 @@ func (kvs *KVDataWriter) StartWAL(version uint64) error { func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { for _, update := range updates { if deleteKey := update.DeleteKey; deleteKey != nil { - cachedOffset, cached := kvs.keyCache[string(deleteKey)] - err := kvs.writeType(KVEntryWALDelete | KVFlagCachedKey) + cachedOffset, cached := kvs.keyCache[unsafeBytesToString(deleteKey)] + typ := KVEntryWALDelete + if cached { + typ |= KVFlagCachedKey + } + err := kvs.writeType(typ) if err != nil { return err } if cached { + err = kvs.writeLEU32(cachedOffset) + if err != nil { + return err + } } else { - + _, err := kvs.writeLenPrefixedBytes(deleteKey) + if err != nil { + return err + } } } else if memNode := update.SetNode; memNode != nil { - _, err := kvs.Write([]byte{KVDataEntryTypeSet}) - if err != nil { - return err + key := memNode.key + cachedOffset, cached := kvs.keyCache[unsafeBytesToString(key)] + typ := KVEntryWALSet + if cached { + typ |= KVFlagCachedKey } - offset, err := kvs.writeLenPrefixedBytes(memNode.key) + err := kvs.writeType(typ) if err != nil { return err } - memNode.kvOffset = offset - _, err = kvs.writeLenPrefixedBytes(memNode.value) + if cached { + err = kvs.writeLEU32(cachedOffset) + if err != nil { + return err + } + } else { + keyOffset, err := kvs.writeLenPrefixedBytes(key) + if err != nil { + return err + } + memNode.keyOffset = keyOffset + kvs.keyCache[unsafeBytesToString(key)] = keyOffset + } + + valueOffset, err := kvs.writeLenPrefixedBytes(memNode.value) if err != nil { return err } + memNode.valueOffset = valueOffset } else { return fmt.Errorf("invalid update: neither SetNode nor DeleteKey is set") } @@ -63,79 +91,82 @@ func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { return nil } -func (kvs *KVDataWriter) WriteCommit(version uint32) error { - _, err := kvs.Write([]byte{KVDataEntryTypeCommit}) +func (kvs *KVDataWriter) WriteCommit(version uint64) error { + err := kvs.writeType(KVEntryWALCommit) if err != nil { return err } - return kvs.writeLEU32(version) + return kvs.writeVarUint(version) } -func (kvs *KVDataWriter) WriteK(key []byte) (offset uint64, err error) { - _, err = kvs.Write([]byte{KVDataEntryTypeExtraK}) +func (kvs *KVDataWriter) WriteKey(key []byte) (offset uint32, err error) { + if offset, found := kvs.keyCache[unsafeBytesToString(key)]; found { + return offset, nil + } + + offset, err = kvs.writeBlob(key) if err != nil { - return offset, err + return 0, err } - return kvs.writeLenPrefixedBytes(key) + kvs.keyCache[unsafeBytesToString(key)] = offset + + return offset, nil } -func (kvs *KVDataWriter) WriteKV(key, value []byte) (offset uint32, err error) { - _, err = kvs.Write([]byte{KVDataEntryTypeExtraKV}) +func (kvs *KVDataWriter) WriteKeyValue(key, value []byte) (keyOffset, branchOffset uint32, err error) { + keyOffset, err = kvs.WriteKey(key) if err != nil { - return offset, err + return 0, 0, err } - offset, err = kvs.writeLenPrefixedBytes(key) + branchOffset, err = kvs.writeBlob(value) if err != nil { - return 0, err - } - _, err = kvs.writeLenPrefixedBytes(value) - return offset, err -} - -func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint64, err error) { - lenKey := len(key) - if lenKey > math.MaxUint32 { - return 0, fmt.Errorf("key too large: %d bytes", lenKey) + return 0, 0, err } - offset = uint64(kvs.Size()) + return keyOffset, branchOffset, nil +} - // write little endian uint32 length prefix - err = kvs.writeLEU32(uint32(lenKey)) +func (kvs *KVDataWriter) writeBlob(bz []byte) (offset uint32, err error) { + err = kvs.writeType(KVEntryBlob) if err != nil { - return offset, err + return 0, err } - - // write key bytes - _, err = kvs.Write(key) + offset, err = kvs.writeLenPrefixedBytes(bz) if err != nil { - return offset, err + return 0, err } return offset, nil } -func (kvs *KVDataWriter) writeKey(typ KVEntryType, key []byte) error { - cachedOffset, cached := kvs.keyCache[string(key)] - if cached { - typ |= KVFlagCachedKey +func (kvs *KVDataWriter) writeType(x KVEntryType) error { + _, err := kvs.Write([]byte{byte(x)}) + return err +} + +func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint32, err error) { + lenKey := len(key) + err = kvs.writeVarUint(uint64(lenKey)) + if err != nil { + return 0, err } - if err := kvs.writeType(typ); err != nil { - return err + + sz := kvs.Size() + if sz > math.MaxUint32 { + return 0, fmt.Errorf("file size overflows uint32: %d", sz) } - if cached { - return kvs.writeLEU32(uint32(cachedOffset)) - } else { + offset = uint32(sz) + // write key bytes + _, err = kvs.Write(key) + if err != nil { + return offset, err } -} -func (kvs *KVDataWriter) writeType(x KVEntryType) error { - _, err := kvs.Write([]byte{byte(x)}) - return err + return offset, nil } func (kvs *KVDataWriter) writeVarUint(x uint64) error { @@ -145,19 +176,13 @@ func (kvs *KVDataWriter) writeVarUint(x uint64) error { return err } -func (kvs *KVDataWriter) writeLEU16(x uint64) error { - if x > math.MaxUint16 { - return fmt.Errorf("value overflows uint16: %d", x) - } -} - -func (kvs *KVDataWriter) writeLEU32(x uint64) error { - if x > math.MaxUint32 { - return fmt.Errorf("value overflows uint32: %d", x) - } - +func (kvs *KVDataWriter) writeLEU32(x uint32) error { var buf [4]byte binary.LittleEndian.PutUint32(buf[:], x) _, err := kvs.Write(buf[:]) return err } + +func unsafeBytesToString(b []byte) string { + return unsafe.String(unsafe.SliceData(b), len(b)) +} From aea0c865d863533873ff3a6bde80f3aa17408d47 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Wed, 3 Dec 2025 17:52:27 -0500 Subject: [PATCH 20/34] WIP on kv data writer --- iavl/internal/kvdata_writer.go | 40 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 45f8057f67c8..8671627aadda 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -48,14 +48,16 @@ func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { return err } } else { - _, err := kvs.writeLenPrefixedBytes(deleteKey) + keyOffset, err := kvs.writeLenPrefixedBytes(deleteKey) if err != nil { return err } + + kvs.keyCache[unsafeBytesToString(deleteKey)] = keyOffset } } else if memNode := update.SetNode; memNode != nil { key := memNode.key - cachedOffset, cached := kvs.keyCache[unsafeBytesToString(key)] + keyOffset, cached := kvs.keyCache[unsafeBytesToString(key)] typ := KVEntryWALSet if cached { typ |= KVFlagCachedKey @@ -66,16 +68,16 @@ func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { } if cached { - err = kvs.writeLEU32(cachedOffset) + err = kvs.writeLEU32(keyOffset) if err != nil { return err } } else { - keyOffset, err := kvs.writeLenPrefixedBytes(key) + var err error + keyOffset, err = kvs.writeLenPrefixedBytes(key) if err != nil { return err } - memNode.keyOffset = keyOffset kvs.keyCache[unsafeBytesToString(key)] = keyOffset } @@ -83,6 +85,8 @@ func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { if err != nil { return err } + + memNode.keyOffset = keyOffset memNode.valueOffset = valueOffset } else { return fmt.Errorf("invalid update: neither SetNode nor DeleteKey is set") @@ -115,18 +119,18 @@ func (kvs *KVDataWriter) WriteKey(key []byte) (offset uint32, err error) { return offset, nil } -func (kvs *KVDataWriter) WriteKeyValue(key, value []byte) (keyOffset, branchOffset uint32, err error) { +func (kvs *KVDataWriter) WriteKeyValue(key, value []byte) (keyOffset, valueOffset uint32, err error) { keyOffset, err = kvs.WriteKey(key) if err != nil { return 0, 0, err } - branchOffset, err = kvs.writeBlob(value) + valueOffset, err = kvs.writeBlob(value) if err != nil { return 0, 0, err } - return keyOffset, branchOffset, nil + return keyOffset, valueOffset, nil } func (kvs *KVDataWriter) writeBlob(bz []byte) (offset uint32, err error) { @@ -147,23 +151,23 @@ func (kvs *KVDataWriter) writeType(x KVEntryType) error { return err } -func (kvs *KVDataWriter) writeLenPrefixedBytes(key []byte) (offset uint32, err error) { - lenKey := len(key) - err = kvs.writeVarUint(uint64(lenKey)) - if err != nil { - return 0, err - } - +func (kvs *KVDataWriter) writeLenPrefixedBytes(bz []byte) (offset uint32, err error) { sz := kvs.Size() if sz > math.MaxUint32 { return 0, fmt.Errorf("file size overflows uint32: %d", sz) } offset = uint32(sz) - // write key bytes - _, err = kvs.Write(key) + lenKey := len(bz) + err = kvs.writeVarUint(uint64(lenKey)) if err != nil { - return offset, err + return 0, err + } + + // write bytes + _, err = kvs.Write(bz) + if err != nil { + return 0, err } return offset, nil From 8a74b7aab6ff8332ffd85075c5d2dabec8a5ec0a Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Thu, 4 Dec 2025 10:50:20 -0500 Subject: [PATCH 21/34] update leaf size, add missing ChangesetInfo size check --- iavl/internal/changeset_info.go | 20 +++++++++++++++----- iavl/internal/leaf_layout.go | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/iavl/internal/changeset_info.go b/iavl/internal/changeset_info.go index 12c6ad63799e..21e7fcebce42 100644 --- a/iavl/internal/changeset_info.go +++ b/iavl/internal/changeset_info.go @@ -7,6 +7,17 @@ import ( "unsafe" ) +const ( + sizeChangesetInfo = 32 +) + +func init() { + // Verify the size of ChangesetInfo is what we expect it to be at runtime. + if unsafe.Sizeof(ChangesetInfo{}) != sizeChangesetInfo { + panic(fmt.Sprintf("invalid ChangesetInfo size: got %d, want %d", unsafe.Sizeof(ChangesetInfo{}), sizeChangesetInfo)) + } +} + // ChangesetInfo holds metadata about a changeset. // This mainly tracks the start and end version of the changeset and also contains statistics about orphans in the // changeset so that compaction can be efficiently scheduled. @@ -34,7 +45,7 @@ type ChangesetInfo struct { // RewriteChangesetInfo rewrites the info file with the given changeset info. // This method is okay to call the first time the file is created as well. func RewriteChangesetInfo(file *os.File, info *ChangesetInfo) error { - data := unsafe.Slice((*byte)(unsafe.Pointer(info)), int(unsafe.Sizeof(*info))) + data := unsafe.Slice((*byte)(unsafe.Pointer(info)), sizeChangesetInfo) if _, err := file.WriteAt(data, 0); err != nil { return fmt.Errorf("failed to write changeset info: %w", err) } @@ -45,8 +56,7 @@ func RewriteChangesetInfo(file *os.File, info *ChangesetInfo) error { // ReadChangesetInfo reads changeset info from a file. It returns an empty default struct if file is empty. func ReadChangesetInfo(file *os.File) (*ChangesetInfo, error) { var info ChangesetInfo - size := int(unsafe.Sizeof(info)) - data := unsafe.Slice((*byte)(unsafe.Pointer(&info)), size) + data := unsafe.Slice((*byte)(unsafe.Pointer(&info)), sizeChangesetInfo) n, err := file.ReadAt(data, 0) if err == io.EOF && n == 0 { @@ -55,8 +65,8 @@ func ReadChangesetInfo(file *os.File) (*ChangesetInfo, error) { if err != nil && err != io.EOF { return nil, fmt.Errorf("failed to read changeset info: %w", err) } - if n != size { - return nil, fmt.Errorf("info file has unexpected size: %d, expected %d", n, size) + if n != sizeChangesetInfo { + return nil, fmt.Errorf("info file has unexpected size: %d, expected %d", n, sizeChangesetInfo) } return &info, nil diff --git a/iavl/internal/leaf_layout.go b/iavl/internal/leaf_layout.go index 9a880c2a37a4..1081c123ca59 100644 --- a/iavl/internal/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -6,7 +6,7 @@ import ( ) const ( - sizeLeaf = 44 + sizeLeaf = 48 ) func init() { From 9b4a3962c8da35705296ad1cd8ca6e21d7f986b8 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Thu, 4 Dec 2025 11:14:42 -0500 Subject: [PATCH 22/34] add Mmap --- iavl/internal/mmap.go | 73 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 iavl/internal/mmap.go diff --git a/iavl/internal/mmap.go b/iavl/internal/mmap.go new file mode 100644 index 000000000000..9fab6489cd91 --- /dev/null +++ b/iavl/internal/mmap.go @@ -0,0 +1,73 @@ +package internal + +import ( + "fmt" + "io" + "os" +) +import "github.com/edsrzf/mmap-go" + +// Mmap represents a read-only memory map into a file. +type Mmap struct { + handle mmap.MMap +} + +// NewMmap creates a new read-only Mmap for the given file. +func NewMmap(file *os.File) (*Mmap, error) { + // Check file size + fi, err := file.Stat() + if err != nil { + return nil, fmt.Errorf("failed to stat file: %w", err) + } + + // Empty files are valid - just don't mmap them + if fi.Size() == 0 { + return &Mmap{}, nil + } + + handle, err := mmap.Map(file, mmap.RDONLY, 0) + if err != nil { + return nil, fmt.Errorf("failed to mmap file: %w", err) + } + + return &Mmap{handle: handle}, nil +} + +// UnsafeSlice returns a byte slice pointing to the mmap-ed data at the given offset and size. +// If the offset and size exceed the mapped data, an error is returned. +// WARNING: The returned byte slice is unsafe and should not be used after the mmap is closed. +func (m Mmap) UnsafeSlice(offset, size int) ([]byte, error) { + if offset+size > len(m.handle) { + return nil, fmt.Errorf("trying to read beyond mapped data: %d + %d >= %d", offset, size, len(m.handle)) + } + bz := m.handle[offset : offset+size] + return bz, nil +} + +// UnsafeSliceVar returns a byte slice pointing to the mmap-ed data at the given offset with a maximum size. +// If the offset exceeds the mapped data, an error is returned. +// If the requested size exceeds the mapped data, it is truncated to fit within the mapped data. +// The number of bytes read is also returned. +// WARNING: The returned byte slice is unsafe and should not be used after the mmap is closed. +func (m Mmap) UnsafeSliceVar(offset, maxSize int) (int, []byte, error) { + if offset >= len(m.handle) { + return 0, nil, fmt.Errorf("trying to read beyond mapped data: %d >= %d", offset, len(m.handle)) + } + if offset+maxSize > len(m.handle) { + maxSize = len(m.handle) - offset + } + data := m.handle[offset : offset+maxSize] + return maxSize, data, nil +} + +// Len returns the length of the mmap-ed data. +func (m Mmap) Len() int { + return len(m.handle) +} + +// Close unmaps the memory-mapped file but does not close the underlying file. +func (m Mmap) Close() error { + return m.handle.Unmap() +} + +var _ io.Closer = (*Mmap)(nil) From b6b140d3e1d5dd1fad4342b15a182b9f8faf6ec4 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Thu, 4 Dec 2025 13:20:12 -0500 Subject: [PATCH 23/34] implement KVDataReader --- iavl/internal/{kvdata.go => kvdata_entry.go} | 11 +- iavl/internal/kvdata_reader.go | 242 +++++++++++++++++++ iavl/internal/kvdata_writer.go | 149 +++++++----- iavl/internal/mmap.go | 6 + 4 files changed, 348 insertions(+), 60 deletions(-) rename iavl/internal/{kvdata.go => kvdata_entry.go} (76%) create mode 100644 iavl/internal/kvdata_reader.go diff --git a/iavl/internal/kvdata.go b/iavl/internal/kvdata_entry.go similarity index 76% rename from iavl/internal/kvdata.go rename to iavl/internal/kvdata_entry.go index 7d6a9ab98087..b55402221a04 100644 --- a/iavl/internal/kvdata.go +++ b/iavl/internal/kvdata_entry.go @@ -26,10 +26,17 @@ const ( // This must be followed by a varint-encoded version number. KVEntryWALCommit KVEntryType = 0x3 - // KVEntryBlob indicates a standalone blob entry (key or value data). + // KVEntryKeyBlob indicates a standalone key data entry. // This should be followed by varint length + raw bytes. // Used for compacted (non-WAL) data or branch keys not already cached. - KVEntryBlob KVEntryType = 0x4 + KVEntryKeyBlob KVEntryType = 0x4 + + // KVEntryValueBlob indicates a standalone blob value data entry. + // This should be followed by varint length + raw bytes. + // Used for compacted (non-WAL) data or branch keys not already cached. + // The main difference between KVEntryKeyBlob and KVEntryValueBlob is that key + // entries may be cached for faster access, while value entries are not cached. + KVEntryValueBlob KVEntryType = 0x5 // KVFlagCachedKey indicates that the key for this entry is cached and should be referenced by // a 32-bit little-endian offset instead of being stored inline. diff --git a/iavl/internal/kvdata_reader.go b/iavl/internal/kvdata_reader.go new file mode 100644 index 000000000000..481ffbaf3ded --- /dev/null +++ b/iavl/internal/kvdata_reader.go @@ -0,0 +1,242 @@ +package internal + +import ( + "encoding/binary" + "fmt" + "os" +) + +// KVDataReader reads data from the key-value data file which can serve as a write-ahead log (WAL) +// and blob storage for keys and values. +type KVDataReader struct { + *Mmap +} + +// NewKVDataReader creates a new KVDataReader. +func NewKVDataReader(file *os.File) (*KVDataReader, error) { + mmap, err := NewMmap(file) + if err != nil { + return nil, err + } + return &KVDataReader{ + Mmap: mmap, + }, nil +} + +// HasWAL checks if the KV data starts with a valid WAL start entry. +// It returns true and the start version if a valid WAL start entry is found. +// If not, it returns false and zero. +func (kvr *KVDataReader) HasWAL() (ok bool, startVersion uint64) { + var err error + ok, startVersion, _, err = kvr.hasWAL() + if err != nil { + return false, 0 + } + return ok, startVersion +} + +func (kvr *KVDataReader) hasWAL() (ok bool, startVersion uint64, bytesRead int, err error) { + if kvr.At(0) != byte(KVEntryWALStart) { + return false, 0, 0, nil + } + startVersion, bytesRead, err = kvr.readVarint(1) + if err != nil { + return false, 0, 0, fmt.Errorf("failed to read WAL start version: %w", err) + } + return true, startVersion, bytesRead + 1, nil +} + +// ReadWAL returns a WALReader to read WAL entries from the KV data. +// If the data does not start with a valid WAL start entry, an error is returned. +func (kvr *KVDataReader) ReadWAL() (*WALReader, error) { + haveWal, startVersion, bytesRead, err := kvr.hasWAL() + if !haveWal { + return nil, fmt.Errorf("data does not contain a valid WAL start entry") + } + if err != nil { + return nil, err + } + return &WALReader{ + rdr: kvr, + offset: 1 + bytesRead, + Version: startVersion, + }, nil +} + +// UnsafeReadBlob reads a blob from the KV data at the given offset. +// It is expected that the blob is prefixed with its size as a varint. +// This function can be used to read any sort of key or value blob whether or not it is part of a WAL entry. +// However, this function doesn't do any checking to ensure that the offset does indeed point to a valid blob. +// The returned byte slice is unsafe and should not be used after the underlying mmap is closed. +// If it is to be retained longer, it should be copied first. +func (kvr *KVDataReader) UnsafeReadBlob(offset int) ([]byte, error) { + bz, _, err := kvr.unsafeReadBlob(offset) + return bz, err +} + +func (kvr *KVDataReader) unsafeReadBlob(offset int) ([]byte, int, error) { + // Read size prefix + size, bytesRead, err := kvr.readVarint(offset) + + // Read blob data + offset += bytesRead + bz, err := kvr.UnsafeSlice(offset, int(size)) + if err != nil { + return nil, 0, err + } + + return bz, int(size) + bytesRead, nil +} + +func (kvr *KVDataReader) readVarint(offset int) (varint uint64, bytesRead int, err error) { + _, bz, err := kvr.UnsafeSliceVar(offset, binary.MaxVarintLen64) + if err != nil { + return 0, 0, err + } + varint, bytesRead = binary.Uvarint(bz) + if bytesRead <= 0 { + return 0, 0, fmt.Errorf("failed to read varint at offset %d", offset) + } + return varint, bytesRead, nil +} + +func (kvr *KVDataReader) readLEU32(offset int) (uint32, error) { + bz, err := kvr.UnsafeSlice(offset, 4) + if err != nil { + return 0, err + } + return binary.LittleEndian.Uint32(bz), nil +} + +// WALReader reads WAL entries from a KVDataReader. +// Call Next() to read the next entry and read the Key, Value and Version fields directly as needed. +type WALReader struct { + rdr *KVDataReader + offset int + keyMappings map[int][]byte + + // Version is the version of the WAL entries currently being read. + Version uint64 + // Key is the key of the current WAL entry. This is valid for Set and Delete entries. + Key []byte + + // Value is the value of the current WAL entry. + // This is only valid for Set entries. + Value []byte +} + +// Next reads the next WAL entry, skipping over any blob entries. +// It returns the entry type, a boolean indicating if an entry was read and an error if any. +// If no more entries are available, ok will be false. +// It should only be expected that Set, Delete and Commit entries are returned. +func (wr *WALReader) Next() (entryType KVEntryType, ok bool, err error) { + for { + entryType, ok, err = wr.next() + if !ok || err != nil { + return entryType, ok, err + } + + // skip over all blob entries, otherwise return + switch entryType { + case KVEntryKeyBlob, KVEntryValueBlob: + continue + default: + return entryType, ok, err + } + } +} + +func (wr *WALReader) next() (entryType KVEntryType, ok bool, err error) { + entryType = KVEntryType(wr.rdr.At(wr.offset)) + wr.offset++ + switch entryType { + case KVEntryWALSet: + err := wr.readKey() + if err != nil { + return 0, false, err + } + + err = wr.readValue() + if err != nil { + return 0, false, err + } + case KVEntryWALSet | KVFlagCachedKey: + err := wr.readCachedKey() + if err != nil { + return 0, false, err + } + + err = wr.readValue() + if err != nil { + return 0, false, err + } + case KVEntryWALDelete: + err := wr.readKey() + if err != nil { + return 0, false, err + } + case KVEntryWALDelete | KVFlagCachedKey: + err := wr.readCachedKey() + if err != nil { + return 0, false, err + } + case KVEntryWALCommit: + var bytesRead int + wr.Version, bytesRead, err = wr.rdr.readVarint(wr.offset) + if err != nil { + return 0, false, fmt.Errorf("failed to read WAL commit version at offset %d: %w", wr.offset, err) + } + wr.offset += bytesRead + case KVEntryKeyBlob: + err = wr.readKey() + if err != nil { + return 0, false, fmt.Errorf("failed to read key blob at offset %d: %w", wr.offset, err) + } + case KVEntryValueBlob: + _, err = wr.rdr.UnsafeReadBlob(wr.offset) + if err != nil { + return 0, false, fmt.Errorf("failed to read blob at offset %d: %w", wr.offset, err) + } + default: + return 0, false, fmt.Errorf("invalid KV entry type %d at offset %d", entryType, wr.offset) + } + return entryType, true, nil +} + +func (wr *WALReader) readKey() error { + var n int + var err error + wr.Key, n, err = wr.rdr.unsafeReadBlob(wr.offset) + if err != nil { + return fmt.Errorf("failed to read WAL key at offset %d: %w", wr.offset, err) + } + // cache the key + wr.keyMappings[wr.offset] = wr.Key + wr.offset += n + return nil +} + +func (wr *WALReader) readCachedKey() error { + cachedKeyOffset, err := wr.rdr.readLEU32(wr.offset) + if err != nil { + return fmt.Errorf("failed to read cached key offset at %d: %w", wr.offset, err) + } + wr.offset += 4 + var ok bool + wr.Key, ok = wr.keyMappings[int(cachedKeyOffset)] + if !ok { + return fmt.Errorf("cached key not found at offset %d", cachedKeyOffset) + } + return nil +} + +func (wr *WALReader) readValue() error { + var n int + var err error + wr.Value, n, err = wr.rdr.unsafeReadBlob(wr.offset) + if err != nil { + return fmt.Errorf("failed to read WAL value at offset %d: %w", wr.offset, err) + } + wr.offset += n + return nil +} diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 8671627aadda..7633660dbdf9 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -8,11 +8,14 @@ import ( "unsafe" ) +// KVDataWriter writes data to the key-value data file which can serve as a write-ahead log (WAL) +// and blob storage for keys and values. type KVDataWriter struct { *FileWriter keyCache map[string]uint32 } +// NewKVDataWriter creates a new KVDataWriter. func NewKVDataWriter(file *os.File) *KVDataWriter { fw := NewFileWriter(file) return &KVDataWriter{ @@ -21,7 +24,10 @@ func NewKVDataWriter(file *os.File) *KVDataWriter { } } -func (kvs *KVDataWriter) StartWAL(version uint64) error { +func (kvs *KVDataWriter) WriteStartWAL(version uint64) error { + if kvs.Size() != 0 { + return fmt.Errorf("cannot write WAL start to non-empty file") + } err := kvs.writeType(KVEntryWALStart) if err != nil { return err @@ -29,63 +35,18 @@ func (kvs *KVDataWriter) StartWAL(version uint64) error { return kvs.writeVarUint(version) } -func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { +func (kvs *KVDataWriter) WriteWALUpdates(updates []KVUpdate) error { for _, update := range updates { if deleteKey := update.DeleteKey; deleteKey != nil { - cachedOffset, cached := kvs.keyCache[unsafeBytesToString(deleteKey)] - typ := KVEntryWALDelete - if cached { - typ |= KVFlagCachedKey - } - err := kvs.writeType(typ) + err := kvs.WriteWALDelete(deleteKey) if err != nil { return err } - - if cached { - err = kvs.writeLEU32(cachedOffset) - if err != nil { - return err - } - } else { - keyOffset, err := kvs.writeLenPrefixedBytes(deleteKey) - if err != nil { - return err - } - - kvs.keyCache[unsafeBytesToString(deleteKey)] = keyOffset - } } else if memNode := update.SetNode; memNode != nil { - key := memNode.key - keyOffset, cached := kvs.keyCache[unsafeBytesToString(key)] - typ := KVEntryWALSet - if cached { - typ |= KVFlagCachedKey - } - err := kvs.writeType(typ) - if err != nil { - return err - } - - if cached { - err = kvs.writeLEU32(keyOffset) - if err != nil { - return err - } - } else { - var err error - keyOffset, err = kvs.writeLenPrefixedBytes(key) - if err != nil { - return err - } - kvs.keyCache[unsafeBytesToString(key)] = keyOffset - } - - valueOffset, err := kvs.writeLenPrefixedBytes(memNode.value) + keyOffset, valueOffset, err := kvs.WriteWALSet(memNode.key, memNode.value) if err != nil { return err } - memNode.keyOffset = keyOffset memNode.valueOffset = valueOffset } else { @@ -95,7 +56,68 @@ func (kvs *KVDataWriter) WriteUpdates(updates []KVUpdate) error { return nil } -func (kvs *KVDataWriter) WriteCommit(version uint64) error { +func (kvs *KVDataWriter) WriteWALSet(key, value []byte) (keyOffset, valueOffset uint32, err error) { + keyOffset, cached := kvs.keyCache[unsafeBytesToString(key)] + typ := KVEntryWALSet + if cached { + typ |= KVFlagCachedKey + } + err = kvs.writeType(typ) + if err != nil { + return 0, 0, err + } + + if cached { + err = kvs.writeLEU32(keyOffset) + if err != nil { + return 0, 0, err + } + } else { + var err error + keyOffset, err = kvs.writeLenPrefixedBytes(key) + if err != nil { + return 0, 0, err + } + kvs.addKeyToCache(key, keyOffset) + } + + valueOffset, err = kvs.writeLenPrefixedBytes(value) + if err != nil { + return 0, 0, err + } + + return keyOffset, valueOffset, nil +} + +func (kvs *KVDataWriter) WriteWALDelete(key []byte) error { + cachedOffset, cached := kvs.keyCache[unsafeBytesToString(key)] + typ := KVEntryWALDelete + if cached { + typ |= KVFlagCachedKey + } + err := kvs.writeType(typ) + if err != nil { + return err + } + + if cached { + err = kvs.writeLEU32(cachedOffset) + if err != nil { + return err + } + } else { + keyOffset, err := kvs.writeLenPrefixedBytes(key) + if err != nil { + return err + } + + kvs.addKeyToCache(key, keyOffset) + } + + return nil +} + +func (kvs *KVDataWriter) WriteWALCommit(version uint64) error { err := kvs.writeType(KVEntryWALCommit) if err != nil { return err @@ -104,28 +126,28 @@ func (kvs *KVDataWriter) WriteCommit(version uint64) error { return kvs.writeVarUint(version) } -func (kvs *KVDataWriter) WriteKey(key []byte) (offset uint32, err error) { +func (kvs *KVDataWriter) WriteKeyBlob(key []byte) (offset uint32, err error) { if offset, found := kvs.keyCache[unsafeBytesToString(key)]; found { return offset, nil } - offset, err = kvs.writeBlob(key) + offset, err = kvs.writeBlob(KVEntryKeyBlob, key) if err != nil { return 0, err } - kvs.keyCache[unsafeBytesToString(key)] = offset + kvs.addKeyToCache(key, offset) return offset, nil } -func (kvs *KVDataWriter) WriteKeyValue(key, value []byte) (keyOffset, valueOffset uint32, err error) { - keyOffset, err = kvs.WriteKey(key) +func (kvs *KVDataWriter) WriteKeyValueBlobs(key, value []byte) (keyOffset, valueOffset uint32, err error) { + keyOffset, err = kvs.WriteKeyBlob(key) if err != nil { return 0, 0, err } - valueOffset, err = kvs.writeBlob(value) + valueOffset, err = kvs.writeBlob(KVEntryKeyBlob, value) if err != nil { return 0, 0, err } @@ -133,8 +155,8 @@ func (kvs *KVDataWriter) WriteKeyValue(key, value []byte) (keyOffset, valueOffse return keyOffset, valueOffset, nil } -func (kvs *KVDataWriter) writeBlob(bz []byte) (offset uint32, err error) { - err = kvs.writeType(KVEntryBlob) +func (kvs *KVDataWriter) writeBlob(blobType KVEntryType, bz []byte) (offset uint32, err error) { + err = kvs.writeType(blobType) if err != nil { return 0, err } @@ -146,12 +168,23 @@ func (kvs *KVDataWriter) writeBlob(bz []byte) (offset uint32, err error) { return offset, nil } +func (kvs *KVDataWriter) addKeyToCache(key []byte, offset uint32) { + if len(key) < 4 { + // don't cache very small keys + return + } + kvs.keyCache[unsafeBytesToString(key)] = offset +} + func (kvs *KVDataWriter) writeType(x KVEntryType) error { _, err := kvs.Write([]byte{byte(x)}) return err } func (kvs *KVDataWriter) writeLenPrefixedBytes(bz []byte) (offset uint32, err error) { + // TODO: should we limit the max size of bz? + // for keys we should probably never have anything bigger than 2^16 bytes, + // and for values maybe 2^24 bytes? sz := kvs.Size() if sz > math.MaxUint32 { return 0, fmt.Errorf("file size overflows uint32: %d", sz) diff --git a/iavl/internal/mmap.go b/iavl/internal/mmap.go index 9fab6489cd91..59e288a67e8d 100644 --- a/iavl/internal/mmap.go +++ b/iavl/internal/mmap.go @@ -33,6 +33,12 @@ func NewMmap(file *os.File) (*Mmap, error) { return &Mmap{handle: handle}, nil } +// At returns the byte at the given index in the mmap-ed data. +// If the index is out of bounds, it panics. +func (m Mmap) At(i int) byte { + return m.handle[i] +} + // UnsafeSlice returns a byte slice pointing to the mmap-ed data at the given offset and size. // If the offset and size exceed the mapped data, an error is returned. // WARNING: The returned byte slice is unsafe and should not be used after the mmap is closed. From 8f726732cf24282889bafdfee9839fe8cb27e61f Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Thu, 4 Dec 2025 13:27:36 -0500 Subject: [PATCH 24/34] fixes to KVDataReader --- iavl/internal/kvdata_entry.go | 6 +++--- iavl/internal/kvdata_reader.go | 34 +++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/iavl/internal/kvdata_entry.go b/iavl/internal/kvdata_entry.go index b55402221a04..4c635640d763 100644 --- a/iavl/internal/kvdata_entry.go +++ b/iavl/internal/kvdata_entry.go @@ -28,12 +28,12 @@ const ( // KVEntryKeyBlob indicates a standalone key data entry. // This should be followed by varint length + raw bytes. - // Used for compacted (non-WAL) data or branch keys not already cached. + // Used for compacted (non-WAL) leaf or branch keys not already cached. KVEntryKeyBlob KVEntryType = 0x4 - // KVEntryValueBlob indicates a standalone blob value data entry. + // KVEntryValueBlob indicates a standalone value data entry. // This should be followed by varint length + raw bytes. - // Used for compacted (non-WAL) data or branch keys not already cached. + // Used for compacted (non-WAL) leaf values. // The main difference between KVEntryKeyBlob and KVEntryValueBlob is that key // entries may be cached for faster access, while value entries are not cached. KVEntryValueBlob KVEntryType = 0x5 diff --git a/iavl/internal/kvdata_reader.go b/iavl/internal/kvdata_reader.go index 481ffbaf3ded..169617f61594 100644 --- a/iavl/internal/kvdata_reader.go +++ b/iavl/internal/kvdata_reader.go @@ -50,16 +50,17 @@ func (kvr *KVDataReader) hasWAL() (ok bool, startVersion uint64, bytesRead int, // If the data does not start with a valid WAL start entry, an error is returned. func (kvr *KVDataReader) ReadWAL() (*WALReader, error) { haveWal, startVersion, bytesRead, err := kvr.hasWAL() - if !haveWal { - return nil, fmt.Errorf("data does not contain a valid WAL start entry") - } if err != nil { return nil, err } + if !haveWal { + return nil, fmt.Errorf("data does not contain a valid WAL start entry") + } return &WALReader{ - rdr: kvr, - offset: 1 + bytesRead, - Version: startVersion, + rdr: kvr, + offset: bytesRead, + Version: startVersion, + keyMappings: make(map[int][]byte), }, nil } @@ -77,6 +78,9 @@ func (kvr *KVDataReader) UnsafeReadBlob(offset int) ([]byte, error) { func (kvr *KVDataReader) unsafeReadBlob(offset int) ([]byte, int, error) { // Read size prefix size, bytesRead, err := kvr.readVarint(offset) + if err != nil { + return nil, 0, err + } // Read blob data offset += bytesRead @@ -147,6 +151,11 @@ func (wr *WALReader) Next() (entryType KVEntryType, ok bool, err error) { } func (wr *WALReader) next() (entryType KVEntryType, ok bool, err error) { + // check for end of data + if wr.offset >= wr.rdr.Len() { + return 0, false, nil + } + entryType = KVEntryType(wr.rdr.At(wr.offset)) wr.offset++ switch entryType { @@ -193,26 +202,29 @@ func (wr *WALReader) next() (entryType KVEntryType, ok bool, err error) { return 0, false, fmt.Errorf("failed to read key blob at offset %d: %w", wr.offset, err) } case KVEntryValueBlob: - _, err = wr.rdr.UnsafeReadBlob(wr.offset) + var bytesRead int + _, bytesRead, err = wr.rdr.unsafeReadBlob(wr.offset) if err != nil { return 0, false, fmt.Errorf("failed to read blob at offset %d: %w", wr.offset, err) } + + wr.offset += bytesRead default: - return 0, false, fmt.Errorf("invalid KV entry type %d at offset %d", entryType, wr.offset) + return 0, false, fmt.Errorf("invalid KV entry type %d at offset %d", entryType, wr.offset-1) } return entryType, true, nil } func (wr *WALReader) readKey() error { - var n int + var bytesRead int var err error - wr.Key, n, err = wr.rdr.unsafeReadBlob(wr.offset) + wr.Key, bytesRead, err = wr.rdr.unsafeReadBlob(wr.offset) if err != nil { return fmt.Errorf("failed to read WAL key at offset %d: %w", wr.offset, err) } // cache the key wr.keyMappings[wr.offset] = wr.Key - wr.offset += n + wr.offset += bytesRead return nil } From 647f9321beecee291aaf7a2a7f7161c610398110 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Thu, 4 Dec 2025 13:29:43 -0500 Subject: [PATCH 25/34] fixes to KVDataReader --- iavl/internal/kvdata_reader.go | 2 +- iavl/internal/kvdata_writer.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/iavl/internal/kvdata_reader.go b/iavl/internal/kvdata_reader.go index 169617f61594..f375c4325022 100644 --- a/iavl/internal/kvdata_reader.go +++ b/iavl/internal/kvdata_reader.go @@ -36,7 +36,7 @@ func (kvr *KVDataReader) HasWAL() (ok bool, startVersion uint64) { } func (kvr *KVDataReader) hasWAL() (ok bool, startVersion uint64, bytesRead int, err error) { - if kvr.At(0) != byte(KVEntryWALStart) { + if kvr.Len() == 0 || kvr.At(0) != byte(KVEntryWALStart) { return false, 0, 0, nil } startVersion, bytesRead, err = kvr.readVarint(1) diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 7633660dbdf9..9b16af81b030 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -147,7 +147,7 @@ func (kvs *KVDataWriter) WriteKeyValueBlobs(key, value []byte) (keyOffset, value return 0, 0, err } - valueOffset, err = kvs.writeBlob(KVEntryKeyBlob, value) + valueOffset, err = kvs.writeBlob(KVEntryValueBlob, value) if err != nil { return 0, 0, err } From f42a7afa03a1a9d203370b832fc6c28d47e6c38d Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 10:17:36 -0500 Subject: [PATCH 26/34] add tests, update go.mod's --- go.mod | 1 + go.sum | 1 + iavl/internal/kvdata_test.go | 689 +++++++++++++++++++++++++++++++++++ 3 files changed, 691 insertions(+) create mode 100644 iavl/internal/kvdata_test.go diff --git a/go.mod b/go.mod index 975f6e3f1608..d37c26844ed0 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/cosmos/gogoproto v1.7.2 github.com/cosmos/ledger-cosmos-go v0.16.0 github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.0 + github.com/edsrzf/mmap-go v1.0.0 github.com/golang/protobuf v1.5.4 github.com/google/go-cmp v0.7.0 github.com/google/gofuzz v1.2.0 diff --git a/go.sum b/go.sum index 7e9358fa5c50..638a300944b8 100644 --- a/go.sum +++ b/go.sum @@ -274,6 +274,7 @@ github.com/dvsekhvalnov/jose2go v1.7.0/go.mod h1:QsHjhyTlD/lAVqn/NSbVZmSCGeDehTB github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= +github.com/edsrzf/mmap-go v1.0.0 h1:CEBF7HpRnUCSJgGUb5h1Gm7e3VkmVDrR8lvWVLtrOFw= github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M= github.com/emicklei/dot v1.8.0 h1:HnD60yAKFAevNeT+TPYr9pb8VB9bqdeSo0nzwIW6IOI= github.com/emicklei/dot v1.8.0/go.mod h1:DeV7GvQtIw4h2u73RKBkkFdvVAz0D9fzeJrgPW6gy/s= diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go new file mode 100644 index 000000000000..6d7f71c462d6 --- /dev/null +++ b/iavl/internal/kvdata_test.go @@ -0,0 +1,689 @@ +package internal + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestKVData_WALStart(t *testing.T) { + tests := []struct { + name string + version uint64 + }{ + {"version 0", 0}, + {"version 1", 1}, + {"version 100", 100}, + {"large version", 1<<32 + 12345}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(tt.version) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + // Verify HasWAL + hasWAL, startVersion := r.HasWAL() + require.True(t, hasWAL) + require.Equal(t, tt.version, startVersion) + + // Verify ReadWAL + wr, err := r.ReadWAL() + require.NoError(t, err) + require.Equal(t, tt.version, wr.Version) + }) + } +} + +func TestKVData_WALStart_NonEmptyFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write some data first + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + + // Try to write WAL start again - should fail + err = w.WriteStartWAL(2) + require.Error(t, err) + require.Contains(t, err.Error(), "non-empty") + + require.NoError(t, f.Close()) +} + +func TestKVData_WALStart_EmptyFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Create empty file + f, err := os.Create(path) + require.NoError(t, err) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + // HasWAL should return false for empty file + hasWAL, startVersion := r.HasWAL() + require.False(t, hasWAL) + require.Equal(t, uint64(0), startVersion) + + // ReadWAL should return error + _, err = r.ReadWAL() + require.Error(t, err) +} + +func TestKVData_WALSet(t *testing.T) { + tests := []struct { + name string + key []byte + value []byte + }{ + {"simple kv", []byte("hello"), []byte("world")}, + {"empty value", []byte("key"), []byte{}}, + {"binary key", []byte{0x00, 0x01, 0x02}, []byte("value")}, + {"medium key", []byte(strings.Repeat("k", 100)), []byte(strings.Repeat("v", 200))}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + keyOffset, valueOffset, err := w.WriteWALSet(tt.key, tt.value) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + // Verify via WALReader + wr, err := r.ReadWAL() + require.NoError(t, err) + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, tt.key, wr.Key) + require.Equal(t, tt.value, wr.Value) + + // Verify offsets via UnsafeReadBlob + keyRead, err := r.UnsafeReadBlob(int(keyOffset)) + require.NoError(t, err) + require.Equal(t, tt.key, keyRead) + + valueRead, err := r.UnsafeReadBlob(int(valueOffset)) + require.NoError(t, err) + require.Equal(t, tt.value, valueRead) + + // No more entries + _, ok, err = wr.Next() + require.NoError(t, err) + require.False(t, ok) + }) + } +} + +func TestKVData_WALDelete(t *testing.T) { + tests := []struct { + name string + key []byte + }{ + {"simple key", []byte("deleteMe")}, + {"binary key", []byte{0xFF, 0xFE, 0xFD}}, + {"medium key", []byte(strings.Repeat("d", 50))}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + err = w.WriteWALDelete(tt.key) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALDelete, entryType) + require.Equal(t, tt.key, wr.Key) + + // No more entries + _, ok, err = wr.Next() + require.NoError(t, err) + require.False(t, ok) + }) + } +} + +func TestKVData_WALCommit(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) + require.NoError(t, err) + err = w.WriteWALCommit(1) + require.NoError(t, err) + _, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) + require.NoError(t, err) + err = w.WriteWALCommit(2) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + require.Equal(t, uint64(1), wr.Version) // Start version + + // Entry 1: Set + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte("key1"), wr.Key) + + // Entry 2: Commit - version should update + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + require.Equal(t, uint64(1), wr.Version) + + // Entry 3: Set + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte("key2"), wr.Key) + + // Entry 4: Commit + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + require.Equal(t, uint64(2), wr.Version) + + // No more entries + _, ok, err = wr.Next() + require.NoError(t, err) + require.False(t, ok) +} + +func TestKVData_KeyCaching(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + key := []byte("cachedKey") // > 4 bytes, will be cached + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + + // First write - should be inline + keyOffset1, _, err := w.WriteWALSet(key, []byte("value1")) + require.NoError(t, err) + + // Second write of same key - should use cached offset + keyOffset2, _, err := w.WriteWALSet(key, []byte("value2")) + require.NoError(t, err) + + // Both should have the same key offset + require.Equal(t, keyOffset1, keyOffset2) + + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read raw bytes to verify cached flag is set + data, err := os.ReadFile(path) + require.NoError(t, err) + + // Find the second WALSet entry - it should have the cached flag (0x81) + // First entry is at offset after WALStart + foundCached := false + for i := 0; i < len(data); i++ { + if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { + foundCached = true + break + } + } + require.True(t, foundCached, "second WALSet should have cached key flag") + + // Read and verify both entries work correctly + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + + // First entry + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, key, wr.Key) + require.Equal(t, []byte("value1"), wr.Value) + + // Second entry - should also resolve key correctly via cache + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet|KVFlagCachedKey, entryType) + require.Equal(t, key, wr.Key) + require.Equal(t, []byte("value2"), wr.Value) +} + +func TestKVData_KeyCaching_ShortKeys(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + shortKey := []byte("abc") // 3 bytes, should NOT be cached + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + + // First write + _, _, err = w.WriteWALSet(shortKey, []byte("value1")) + require.NoError(t, err) + + // Second write - should NOT use cache (key too short) + _, _, err = w.WriteWALSet(shortKey, []byte("value2")) + require.NoError(t, err) + + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read raw bytes - should NOT find cached flag for short keys + data, err := os.ReadFile(path) + require.NoError(t, err) + + cachedCount := 0 + for i := 0; i < len(data); i++ { + if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { + cachedCount++ + } + } + require.Equal(t, 0, cachedCount, "short keys should not be cached") +} + +func TestKVData_WALReplay(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write a realistic WAL sequence + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(10) + require.NoError(t, err) + + // Version 10 operations + _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) + require.NoError(t, err) + _, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) + require.NoError(t, err) + err = w.WriteWALDelete([]byte("oldKey")) + require.NoError(t, err) + err = w.WriteWALCommit(10) + require.NoError(t, err) + + // Version 11 operations + _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1_updated")) + require.NoError(t, err) + _, _, err = w.WriteWALSet([]byte("key3"), []byte("val3")) + require.NoError(t, err) + err = w.WriteWALCommit(11) + require.NoError(t, err) + + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Replay + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + require.Equal(t, uint64(10), wr.Version) + + expectedEntries := []struct { + entryType KVEntryType + key []byte + value []byte + version uint64 + }{ + {KVEntryWALSet, []byte("key1"), []byte("val1"), 10}, + {KVEntryWALSet, []byte("key2"), []byte("val2"), 10}, + {KVEntryWALDelete, []byte("oldKey"), nil, 10}, + {KVEntryWALCommit, nil, nil, 10}, + {KVEntryWALSet | KVFlagCachedKey, []byte("key1"), []byte("val1_updated"), 10}, // key1 cached + {KVEntryWALSet, []byte("key3"), []byte("val3"), 10}, + {KVEntryWALCommit, nil, nil, 11}, + } + + for i, exp := range expectedEntries { + entryType, ok, err := wr.Next() + require.NoError(t, err, "entry %d", i) + require.True(t, ok, "entry %d", i) + require.Equal(t, exp.entryType, entryType, "entry %d type", i) + + if exp.key != nil { + require.Equal(t, exp.key, wr.Key, "entry %d key", i) + } + if exp.value != nil { + require.Equal(t, exp.value, wr.Value, "entry %d value", i) + } + if entryType == KVEntryWALCommit { + require.Equal(t, exp.version, wr.Version, "entry %d version", i) + } + } + + // No more entries + _, ok, err := wr.Next() + require.NoError(t, err) + require.False(t, ok) +} + +func TestKVData_Blobs(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + key := []byte("blobKey") + value := []byte("blobValue") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + + // Write key blob + keyOffset, err := w.WriteKeyBlob(key) + require.NoError(t, err) + + // Write same key again - should return cached offset + keyOffset2, err := w.WriteKeyBlob(key) + require.NoError(t, err) + require.Equal(t, keyOffset, keyOffset2) + + // Write key/value blobs + keyOffset3, valueOffset, err := w.WriteKeyValueBlobs(key, value) + require.NoError(t, err) + require.Equal(t, keyOffset, keyOffset3) // same key, cached + + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + // Read key via offset + keyRead, err := r.UnsafeReadBlob(int(keyOffset)) + require.NoError(t, err) + require.Equal(t, key, keyRead) + + // Read value via offset + valueRead, err := r.UnsafeReadBlob(int(valueOffset)) + require.NoError(t, err) + require.Equal(t, value, valueRead) +} + +func TestKVData_Blobs_ValueType(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write phase + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + _, valueOffset, err := w.WriteKeyValueBlobs([]byte("key"), []byte("value")) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read raw bytes to verify value uses KVEntryValueBlob type + data, err := os.ReadFile(path) + require.NoError(t, err) + + // The byte before valueOffset should be the type byte + // valueOffset points to the varint length, type byte is 1 before + require.Equal(t, byte(KVEntryValueBlob), data[valueOffset-1]) +} + +func TestKVData_MixedEntries(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write WAL with blob entries interspersed + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + + // WAL set + _, _, err = w.WriteWALSet([]byte("walKey1"), []byte("walVal1")) + require.NoError(t, err) + + // Blob entry (non-WAL) + blobOffset, err := w.WriteKeyBlob([]byte("blobKey")) + require.NoError(t, err) + + // Another WAL set + _, _, err = w.WriteWALSet([]byte("walKey2"), []byte("walVal2")) + require.NoError(t, err) + + err = w.WriteWALCommit(1) + require.NoError(t, err) + + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read phase + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + + // WALReader.Next() should skip blob entries + // Entry 1: WAL Set + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte("walKey1"), wr.Key) + + // Entry 2: WAL Set (blob skipped) + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte("walKey2"), wr.Key) + + // Entry 3: Commit + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + + // Blob is still readable via offset + blobData, err := r.UnsafeReadBlob(int(blobOffset)) + require.NoError(t, err) + require.Equal(t, []byte("blobKey"), blobData) +} + +func TestKVData_EdgeCases_LargeKeys(t *testing.T) { + tests := []struct { + name string + keySize int + }{ + {"127 bytes (1-byte varint)", 127}, + {"128 bytes (2-byte varint)", 128}, + {"16383 bytes (2-byte varint max)", 16383}, + {"16384 bytes (3-byte varint)", 16384}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + key := make([]byte, tt.keySize) + for i := range key { + key[i] = byte(i % 256) + } + value := []byte("value") + + // Write + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + keyOffset, _, err := w.WriteWALSet(key, value) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + // Via WALReader + wr, err := r.ReadWAL() + require.NoError(t, err) + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, key, wr.Key) + require.Equal(t, value, wr.Value) + + // Via direct blob read + keyRead, err := r.UnsafeReadBlob(int(keyOffset)) + require.NoError(t, err) + require.Equal(t, key, keyRead) + }) + } +} + +func TestKVData_EdgeCases_EmptyKey(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "kv.dat") + + // Write + f, err := os.Create(path) + require.NoError(t, err) + w := NewKVDataWriter(f) + err = w.WriteStartWAL(1) + require.NoError(t, err) + _, _, err = w.WriteWALSet([]byte{}, []byte("value")) + require.NoError(t, err) + require.NoError(t, w.Flush()) + require.NoError(t, f.Close()) + + // Read + f, err = os.Open(path) + require.NoError(t, err) + defer f.Close() + r, err := NewKVDataReader(f) + require.NoError(t, err) + + wr, err := r.ReadWAL() + require.NoError(t, err) + entryType, ok, err := wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte{}, wr.Key) + require.Equal(t, []byte("value"), wr.Value) +} From 4156068491d1731195fe0597ca585588575dc9f7 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 13:19:35 -0500 Subject: [PATCH 27/34] WIP on tests, add WAL mode checks --- iavl/internal/kvdata_test.go | 1390 ++++++++++++++++++-------------- iavl/internal/kvdata_writer.go | 37 + 2 files changed, 843 insertions(+), 584 deletions(-) diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go index 6d7f71c462d6..9d42fbf5ca19 100644 --- a/iavl/internal/kvdata_test.go +++ b/iavl/internal/kvdata_test.go @@ -1,689 +1,911 @@ package internal import ( - "os" - "path/filepath" - "strings" "testing" "github.com/stretchr/testify/require" ) -func TestKVData_WALStart(t *testing.T) { - tests := []struct { - name string - version uint64 - }{ - {"version 0", 0}, - {"version 1", 1}, - {"version 100", 100}, - {"large version", 1<<32 + 12345}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(tt.version) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - - // Verify HasWAL - hasWAL, startVersion := r.HasWAL() - require.True(t, hasWAL) - require.Equal(t, tt.version, startVersion) - - // Verify ReadWAL - wr, err := r.ReadWAL() - require.NoError(t, err) - require.Equal(t, tt.version, wr.Version) - }) - } +type kvDataWriterHelper struct { + files *ChangesetFiles + *KVDataWriter } -func TestKVData_WALStart_NonEmptyFile(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write some data first - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) +func openTestKVDataWriter(t *testing.T) *kvDataWriterHelper { + files, err := CreateChangesetFiles(t.TempDir(), 1, 0) require.NoError(t, err) - - // Try to write WAL start again - should fail - err = w.WriteStartWAL(2) - require.Error(t, err) - require.Contains(t, err.Error(), "non-empty") - - require.NoError(t, f.Close()) + t.Cleanup(func() { + require.NoError(t, files.Close()) + }) + writer := NewKVDataWriter(files.KVDataFile()) + return &kvDataWriterHelper{ + files: files, + KVDataWriter: writer, + } } -func TestKVData_WALStart_EmptyFile(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Create empty file - f, err := os.Create(path) - require.NoError(t, err) - require.NoError(t, f.Close()) - - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) +func (h *kvDataWriterHelper) openReader(t *testing.T) *KVDataReader { + require.NoError(t, h.writer.Flush()) + rdr, err := NewKVDataReader(h.files.KVDataFile()) require.NoError(t, err) - - // HasWAL should return false for empty file - hasWAL, startVersion := r.HasWAL() - require.False(t, hasWAL) - require.Equal(t, uint64(0), startVersion) - - // ReadWAL should return error - _, err = r.ReadWAL() - require.Error(t, err) + t.Cleanup(func() { + require.NoError(t, rdr.Close()) + }) + return rdr } -func TestKVData_WALSet(t *testing.T) { - tests := []struct { - name string - key []byte - value []byte - }{ - {"simple kv", []byte("hello"), []byte("world")}, - {"empty value", []byte("key"), []byte{}}, - {"binary key", []byte{0x00, 0x01, 0x02}, []byte("value")}, - {"medium key", []byte(strings.Repeat("k", 100)), []byte(strings.Repeat("v", 200))}, +func TestKVData_WAL(t *testing.T) { + writer := openTestKVDataWriter(t) + + // Write WAL start + require.NoError(t, writer.WriteStartWAL(42)) + // Write WAL set with short key + shortKey := []byte("key") // short key, should not be cached + shortValue := []byte("value") + shortKeyOffset, shortValueOffset, err := writer.WriteWALSet(shortKey, shortValue) + require.NoError(t, err) + // Write WAL set with longer key + longerKey := []byte("longerKey") // longer key, should be cached + longerValue := []byte("longerValue") + longKeyOffset, longerValueOffset, err := writer.WriteWALSet(longerKey, longerValue) + require.NoError(t, err) + // Write WAL delete + oldKey := []byte("oldKey") + require.NoError(t, writer.WriteWALDelete(oldKey)) + // Write WAL commit + require.NoError(t, writer.WriteWALCommit(42)) + + // Write short key again to test caching + shortValue2 := []byte("value2") + shortKeyOffset2, shortValue2Offset, err := writer.WriteWALSet(shortKey, shortValue2) + // short key should NOT be cached + require.NotEqual(t, shortKeyOffset, shortKeyOffset2) + // Write longer key again to test caching + longerValue2 := []byte("longerValue2") + longKeyOffset2, longerValue2Offset, err := writer.WriteWALSet(longerKey, longerValue2) + // longer key should be cached + require.Equal(t, longKeyOffset, longKeyOffset2) + + // Write WAL Updates + memKey1 := []byte("memKey1") + memValue1 := []byte("memValue1") + memNode1 := &MemNode{ + key: memKey1, + value: memValue1, } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - keyOffset, valueOffset, err := w.WriteWALSet(tt.key, tt.value) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - - // Verify via WALReader - wr, err := r.ReadWAL() - require.NoError(t, err) - entryType, ok, err := wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, tt.key, wr.Key) - require.Equal(t, tt.value, wr.Value) - - // Verify offsets via UnsafeReadBlob - keyRead, err := r.UnsafeReadBlob(int(keyOffset)) - require.NoError(t, err) - require.Equal(t, tt.key, keyRead) - - valueRead, err := r.UnsafeReadBlob(int(valueOffset)) - require.NoError(t, err) - require.Equal(t, tt.value, valueRead) - - // No more entries - _, ok, err = wr.Next() - require.NoError(t, err) - require.False(t, ok) - }) + memValue2 := []byte("memValue2") + memNode2 := &MemNode{ + key: longerKey, // should use cached key offset + value: memValue2, } -} - -func TestKVData_WALDelete(t *testing.T) { - tests := []struct { - name string - key []byte - }{ - {"simple key", []byte("deleteMe")}, - {"binary key", []byte{0xFF, 0xFE, 0xFD}}, - {"medium key", []byte(strings.Repeat("d", 50))}, + reinsertedValue := []byte("valueReinserted") + memNode3 := &MemNode{ + key: oldKey, + value: reinsertedValue, } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - err = w.WriteWALDelete(tt.key) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - - wr, err := r.ReadWAL() - require.NoError(t, err) - entryType, ok, err := wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALDelete, entryType) - require.Equal(t, tt.key, wr.Key) - - // No more entries - _, ok, err = wr.Next() - require.NoError(t, err) - require.False(t, ok) - }) - } -} - -func TestKVData_WALCommit(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) - require.NoError(t, err) - err = w.WriteWALCommit(1) - require.NoError(t, err) - _, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) - require.NoError(t, err) - err = w.WriteWALCommit(2) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - + err = writer.WriteWALUpdates([]KVUpdate{ + { + SetNode: memNode1, + }, + { + DeleteKey: oldKey, + }, + { + SetNode: memNode2, + }, + { + SetNode: memNode3, + }, + }) + require.NoError(t, err) + require.NotZero(t, memNode1.keyOffset) + require.NotZero(t, memNode1.valueOffset) + require.NotZero(t, memNode2.keyOffset) + require.NotZero(t, memNode2.valueOffset) + // memNode2 should use cached key offset + require.Equal(t, longKeyOffset, memNode2.keyOffset) + + require.NoError(t, writer.WriteWALCommit(43)) + + // test caching again with some blobs + blobKeyOffset, err := writer.WriteKeyBlob([]byte("longerKey")) + require.NoError(t, err) + // should use cached offset + require.Equal(t, longKeyOffset, blobKeyOffset) + + blobKeyOffset2, err := writer.WriteKeyBlob([]byte("key")) + require.NoError(t, err) + // short key should NOT be cached + require.NotEqual(t, shortKeyOffset, blobKeyOffset2) + + // open reader + r := writer.openReader(t) + // Verify via WALReader wr, err := r.ReadWAL() require.NoError(t, err) - require.Equal(t, uint64(1), wr.Version) // Start version - - // Entry 1: Set + require.Equal(t, uint64(42), wr.Version) + // Entry 1: WAL Set short key entryType, ok, err := wr.Next() require.NoError(t, err) require.True(t, ok) require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, []byte("key1"), wr.Key) + require.Equal(t, []byte("key"), wr.Key) + require.Equal(t, []byte("value"), wr.Value) - // Entry 2: Commit - version should update + // Entry 2: WAL Set longer key entryType, ok, err = wr.Next() require.NoError(t, err) require.True(t, ok) - require.Equal(t, KVEntryWALCommit, entryType) - require.Equal(t, uint64(1), wr.Version) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, []byte("longerKey"), wr.Key) + require.Equal(t, []byte("longerValue"), wr.Value) - // Entry 3: Set + // Entry 3: WAL Delete entryType, ok, err = wr.Next() require.NoError(t, err) require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, []byte("key2"), wr.Key) + require.Equal(t, KVEntryWALDelete, entryType) + require.Equal(t, []byte("oldKey"), wr.Key) - // Entry 4: Commit + // Entry 4: WAL Commit entryType, ok, err = wr.Next() require.NoError(t, err) require.True(t, ok) require.Equal(t, KVEntryWALCommit, entryType) - require.Equal(t, uint64(2), wr.Version) - - // No more entries - _, ok, err = wr.Next() - require.NoError(t, err) - require.False(t, ok) -} - -func TestKVData_KeyCaching(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - key := []byte("cachedKey") // > 4 bytes, will be cached - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) + require.Equal(t, uint64(42), wr.Version) - // First write - should be inline - keyOffset1, _, err := w.WriteWALSet(key, []byte("value1")) - require.NoError(t, err) - - // Second write of same key - should use cached offset - keyOffset2, _, err := w.WriteWALSet(key, []byte("value2")) - require.NoError(t, err) - - // Both should have the same key offset - require.Equal(t, keyOffset1, keyOffset2) - - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read raw bytes to verify cached flag is set - data, err := os.ReadFile(path) - require.NoError(t, err) - - // Find the second WALSet entry - it should have the cached flag (0x81) - // First entry is at offset after WALStart - foundCached := false - for i := 0; i < len(data); i++ { - if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { - foundCached = true - break - } - } - require.True(t, foundCached, "second WALSet should have cached key flag") - - // Read and verify both entries work correctly - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - - wr, err := r.ReadWAL() - require.NoError(t, err) - - // First entry - entryType, ok, err := wr.Next() + // Entry 5: WAL Set short key again (not cached) + entryType, ok, err = wr.Next() require.NoError(t, err) require.True(t, ok) require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, key, wr.Key) - require.Equal(t, []byte("value1"), wr.Value) + require.Equal(t, []byte("key"), wr.Key) + require.Equal(t, []byte("value2"), wr.Value) - // Second entry - should also resolve key correctly via cache + // Entry 6: WAL Set longer key again (cached) entryType, ok, err = wr.Next() require.NoError(t, err) require.True(t, ok) require.Equal(t, KVEntryWALSet|KVFlagCachedKey, entryType) - require.Equal(t, key, wr.Key) - require.Equal(t, []byte("value2"), wr.Value) -} - -func TestKVData_KeyCaching_ShortKeys(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - shortKey := []byte("abc") // 3 bytes, should NOT be cached + require.Equal(t, []byte("longerKey"), wr.Key) + require.Equal(t, []byte("longerValue2"), wr.Value) - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - - // First write - _, _, err = w.WriteWALSet(shortKey, []byte("value1")) + // Entry 7-10: WAL Updates + for i, expected := range []struct { + entryType KVEntryType + key []byte + value []byte + }{ + {KVEntryWALSet, memKey1, memValue1}, + {KVEntryWALDelete | KVFlagCachedKey, oldKey, nil}, + {KVEntryWALSet | KVFlagCachedKey, longerKey, memValue2}, + {KVEntryWALSet | KVFlagCachedKey, oldKey, reinsertedValue}, + } { + entryType, ok, err = wr.Next() + require.NoError(t, err, "WAL Update entry %d", i) + require.True(t, ok, "WAL Update entry %d", i) + require.Equal(t, expected.entryType, entryType, "WAL Update entry %d type", i) + require.Equal(t, expected.key, wr.Key, "WAL Update entry %d key", i) + if expected.value != nil { + require.Equal(t, expected.value, wr.Value, "WAL Update entry %d value", i) + } + } + // Entry 11: WAL Commit + entryType, ok, err = wr.Next() require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + require.Equal(t, uint64(43), wr.Version) - // Second write - should NOT use cache (key too short) - _, _, err = w.WriteWALSet(shortKey, []byte("value2")) + // No more entries + _, ok, err = wr.Next() require.NoError(t, err) + require.False(t, ok) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read raw bytes - should NOT find cached flag for short keys - data, err := os.ReadFile(path) + // Check that all offsets are readable + shortKeyRead, err := r.UnsafeReadBlob(int(shortKeyOffset)) require.NoError(t, err) - - cachedCount := 0 - for i := 0; i < len(data); i++ { - if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { - cachedCount++ - } - } - require.Equal(t, 0, cachedCount, "short keys should not be cached") -} - -func TestKVData_WALReplay(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write a realistic WAL sequence - f, err := os.Create(path) + require.Equal(t, shortKey, shortKeyRead) + shortValueRead, err := r.UnsafeReadBlob(int(shortValueOffset)) require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(10) + require.Equal(t, shortValue, shortValueRead) + longKeyRead, err := r.UnsafeReadBlob(int(longKeyOffset)) require.NoError(t, err) - - // Version 10 operations - _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) + require.Equal(t, longerKey, longKeyRead) + longValueRead, err := r.UnsafeReadBlob(int(longerValueOffset)) require.NoError(t, err) - _, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) + require.Equal(t, longerValue, longValueRead) + shortKeyRead2, err := r.UnsafeReadBlob(int(shortKeyOffset2)) require.NoError(t, err) - err = w.WriteWALDelete([]byte("oldKey")) + require.Equal(t, shortKey, shortKeyRead2) + longValueRead2, err := r.UnsafeReadBlob(int(longerValue2Offset)) require.NoError(t, err) - err = w.WriteWALCommit(10) + require.Equal(t, longerValue2, longValueRead2) + shorterValue2Read, err := r.UnsafeReadBlob(int(shortValue2Offset)) require.NoError(t, err) - - // Version 11 operations - _, _, err = w.WriteWALSet([]byte("key1"), []byte("val1_updated")) + require.Equal(t, shortValue2, shorterValue2Read) + // also check all memNode offsets + memKey1Read, err := r.UnsafeReadBlob(int(memNode1.keyOffset)) require.NoError(t, err) - _, _, err = w.WriteWALSet([]byte("key3"), []byte("val3")) + require.Equal(t, memKey1, memKey1Read) + memValue1Read, err := r.UnsafeReadBlob(int(memNode1.valueOffset)) require.NoError(t, err) - err = w.WriteWALCommit(11) + require.Equal(t, memValue1, memValue1Read) + memKey2Read, err := r.UnsafeReadBlob(int(memNode2.keyOffset)) require.NoError(t, err) - - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Replay - f, err = os.Open(path) + require.Equal(t, longerKey, memKey2Read) + memValue2Read, err := r.UnsafeReadBlob(int(memNode2.valueOffset)) require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) + require.Equal(t, memValue2, memValue2Read) + memKey3Read, err := r.UnsafeReadBlob(int(memNode3.keyOffset)) require.NoError(t, err) - - wr, err := r.ReadWAL() + require.Equal(t, oldKey, memKey3Read) + memValue3Read, err := r.UnsafeReadBlob(int(memNode3.valueOffset)) require.NoError(t, err) - require.Equal(t, uint64(10), wr.Version) + require.Equal(t, reinsertedValue, memValue3Read) +} - expectedEntries := []struct { - entryType KVEntryType - key []byte - value []byte - version uint64 +func TestKVData_WALStart(t *testing.T) { + tests := []struct { + name string + version uint64 }{ - {KVEntryWALSet, []byte("key1"), []byte("val1"), 10}, - {KVEntryWALSet, []byte("key2"), []byte("val2"), 10}, - {KVEntryWALDelete, []byte("oldKey"), nil, 10}, - {KVEntryWALCommit, nil, nil, 10}, - {KVEntryWALSet | KVFlagCachedKey, []byte("key1"), []byte("val1_updated"), 10}, // key1 cached - {KVEntryWALSet, []byte("key3"), []byte("val3"), 10}, - {KVEntryWALCommit, nil, nil, 11}, - } - - for i, exp := range expectedEntries { - entryType, ok, err := wr.Next() - require.NoError(t, err, "entry %d", i) - require.True(t, ok, "entry %d", i) - require.Equal(t, exp.entryType, entryType, "entry %d type", i) - - if exp.key != nil { - require.Equal(t, exp.key, wr.Key, "entry %d key", i) - } - if exp.value != nil { - require.Equal(t, exp.value, wr.Value, "entry %d value", i) - } - if entryType == KVEntryWALCommit { - require.Equal(t, exp.version, wr.Version, "entry %d version", i) - } + {"version 0", 0}, + {"version 1", 1}, + {"version 100", 100}, + {"large version", 1<<32 + 12345}, } - // No more entries - _, ok, err := wr.Next() - require.NoError(t, err) - require.False(t, ok) -} - -func TestKVData_Blobs(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - key := []byte("blobKey") - value := []byte("blobValue") - - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - - // Write key blob - keyOffset, err := w.WriteKeyBlob(key) - require.NoError(t, err) - - // Write same key again - should return cached offset - keyOffset2, err := w.WriteKeyBlob(key) - require.NoError(t, err) - require.Equal(t, keyOffset, keyOffset2) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + writer := openTestKVDataWriter(t) - // Write key/value blobs - keyOffset3, valueOffset, err := w.WriteKeyValueBlobs(key, value) - require.NoError(t, err) - require.Equal(t, keyOffset, keyOffset3) // same key, cached + // Write WAL start + require.NoError(t, writer.WriteStartWAL(tt.version)) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) + r := writer.openReader(t) - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) + // Verify HasWAL + hasWAL, startVersion := r.HasWAL() + require.True(t, hasWAL) + require.Equal(t, tt.version, startVersion) - // Read key via offset - keyRead, err := r.UnsafeReadBlob(int(keyOffset)) - require.NoError(t, err) - require.Equal(t, key, keyRead) + // Verify ReadWAL + wr, err := r.ReadWAL() + require.NoError(t, err) + require.Equal(t, tt.version, wr.Version) + }) + } +} - // Read value via offset - valueRead, err := r.UnsafeReadBlob(int(valueOffset)) - require.NoError(t, err) - require.Equal(t, value, valueRead) +//func TestKVData_WALMode(t *testing.T) { +// tests := []struct { +// name string +// startInWalMode bool +// op func(t *testing.T, writer *KVDataWriter) +// }{ +// { +// name: "WAL", +// startInWalMode: true, +// op: func(t *testing.T, writer *KVDataWriter) { +// +// }, +// }, +// } +//} + +/*func TestKVData_WALStart_EmptyFile(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Create empty file +f, err := os.Create(path) +require.NoError(t, err) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +// HasWAL should return false for empty file +hasWAL, startVersion := r.HasWAL() +require.False(t, hasWAL) +require.Equal(t, uint64(0), startVersion) + +// ReadWAL should return error +_, err = r.ReadWAL() +require.Error(t, err) } -func TestKVData_Blobs_ValueType(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") +func TestKVData_WALSet(t *testing.T) { +tests := []struct { +name string +key []byte +value []byte +}{ +{"simple kv", []byte("hello"), []byte("world")}, +{"empty value", []byte("key"), []byte{}}, +{"binary key", []byte{0x00, 0x01, 0x02}, []byte("value")}, +{"medium key", []byte(strings.Repeat("k", 100)), []byte(strings.Repeat("v", 200))}, +} - // Write phase - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - _, valueOffset, err := w.WriteKeyValueBlobs([]byte("key"), []byte("value")) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) +for _, tt := range tests { +t.Run(tt.name, func (t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) +keyOffset, valueOffset, err := w.WriteWALSet(tt.key, tt.value) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +// Verify via WALReader +wr, err := r.ReadWAL() +require.NoError(t, err) +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, tt.key, wr.Key) +require.Equal(t, tt.value, wr.Value) + +// Verify offsets via UnsafeReadBlob +keyRead, err := r.UnsafeReadBlob(int(keyOffset)) +require.NoError(t, err) +require.Equal(t, tt.key, keyRead) + +valueRead, err := r.UnsafeReadBlob(int(valueOffset)) +require.NoError(t, err) +require.Equal(t, tt.value, valueRead) + +// No more entries +_, ok, err = wr.Next() +require.NoError(t, err) +require.False(t, ok) +}) +} +} - // Read raw bytes to verify value uses KVEntryValueBlob type - data, err := os.ReadFile(path) - require.NoError(t, err) +func TestKVData_WALDelete(t *testing.T) { +tests := []struct { +name string +key []byte +}{ +{"simple key", []byte("deleteMe")}, +{"binary key", []byte{0xFF, 0xFE, 0xFD}}, +{"medium key", []byte(strings.Repeat("d", 50))}, +} - // The byte before valueOffset should be the type byte - // valueOffset points to the varint length, type byte is 1 before - require.Equal(t, byte(KVEntryValueBlob), data[valueOffset-1]) +for _, tt := range tests { +t.Run(tt.name, func (t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) +err = w.WriteWALDelete(tt.key) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALDelete, entryType) +require.Equal(t, tt.key, wr.Key) + +// No more entries +_, ok, err = wr.Next() +require.NoError(t, err) +require.False(t, ok) +}) +} } -func TestKVData_MixedEntries(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") +func TestKVData_WALCommit(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) +_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) +require.NoError(t, err) +err = w.WriteWALCommit(1) +require.NoError(t, err) +_, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) +require.NoError(t, err) +err = w.WriteWALCommit(2) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) +require.Equal(t, uint64(1), wr.Version) // Start version + +// Entry 1: Set +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, []byte("key1"), wr.Key) + +// Entry 2: Commit - version should update +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALCommit, entryType) +require.Equal(t, uint64(1), wr.Version) + +// Entry 3: Set +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, []byte("key2"), wr.Key) + +// Entry 4: Commit +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALCommit, entryType) +require.Equal(t, uint64(2), wr.Version) + +// No more entries +_, ok, err = wr.Next() +require.NoError(t, err) +require.False(t, ok) +} - // Write WAL with blob entries interspersed - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) +func TestKVData_KeyCaching(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +key := []byte("cachedKey") // > 4 bytes, will be cached + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) + +// First write - should be inline +keyOffset1, _, err := w.WriteWALSet(key, []byte("value1")) +require.NoError(t, err) + +// Second write of same key - should use cached offset +keyOffset2, _, err := w.WriteWALSet(key, []byte("value2")) +require.NoError(t, err) + +// Both should have the same key offset +require.Equal(t, keyOffset1, keyOffset2) + +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read raw bytes to verify cached flag is set +data, err := os.ReadFile(path) +require.NoError(t, err) + +// Find the second WALSet entry - it should have the cached flag (0x81) +// First entry is at offset after WALStart +foundCached := false +for i := 0; i < len(data); i++ { +if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { +foundCached = true +break +} +} +require.True(t, foundCached, "second WALSet should have cached key flag") + +// Read and verify both entries work correctly +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) + +// First entry +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, key, wr.Key) +require.Equal(t, []byte("value1"), wr.Value) + +// Second entry - should also resolve key correctly via cache +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet|KVFlagCachedKey, entryType) +require.Equal(t, key, wr.Key) +require.Equal(t, []byte("value2"), wr.Value) +} - // WAL set - _, _, err = w.WriteWALSet([]byte("walKey1"), []byte("walVal1")) - require.NoError(t, err) +func TestKVData_KeyCaching_ShortKeys(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") - // Blob entry (non-WAL) - blobOffset, err := w.WriteKeyBlob([]byte("blobKey")) - require.NoError(t, err) +shortKey := []byte("abc") // 3 bytes, should NOT be cached - // Another WAL set - _, _, err = w.WriteWALSet([]byte("walKey2"), []byte("walVal2")) - require.NoError(t, err) +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) - err = w.WriteWALCommit(1) - require.NoError(t, err) +// First write +_, _, err = w.WriteWALSet(shortKey, []byte("value1")) +require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) +// Second write - should NOT use cache (key too short) +_, _, err = w.WriteWALSet(shortKey, []byte("value2")) +require.NoError(t, err) - // Read phase - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) - wr, err := r.ReadWAL() - require.NoError(t, err) +// Read raw bytes - should NOT find cached flag for short keys +data, err := os.ReadFile(path) +require.NoError(t, err) - // WALReader.Next() should skip blob entries - // Entry 1: WAL Set - entryType, ok, err := wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, []byte("walKey1"), wr.Key) +cachedCount := 0 +for i := 0; i < len(data); i++ { +if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { +cachedCount++ +} +} +require.Equal(t, 0, cachedCount, "short keys should not be cached") +} - // Entry 2: WAL Set (blob skipped) - entryType, ok, err = wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, []byte("walKey2"), wr.Key) +func TestKVData_WALReplay(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write a realistic WAL sequence +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(10) +require.NoError(t, err) + +// Version 10 operations +_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) +require.NoError(t, err) +_, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) +require.NoError(t, err) +err = w.WriteWALDelete([]byte("oldKey")) +require.NoError(t, err) +err = w.WriteWALCommit(10) +require.NoError(t, err) + +// Version 11 operations +_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1_updated")) +require.NoError(t, err) +_, _, err = w.WriteWALSet([]byte("key3"), []byte("val3")) +require.NoError(t, err) +err = w.WriteWALCommit(11) +require.NoError(t, err) + +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Replay +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) +require.Equal(t, uint64(10), wr.Version) + +expectedEntries := []struct { +entryType KVEntryType +key []byte +value []byte +version uint64 +}{ +{KVEntryWALSet, []byte("key1"), []byte("val1"), 10}, +{KVEntryWALSet, []byte("key2"), []byte("val2"), 10}, +{KVEntryWALDelete, []byte("oldKey"), nil, 10}, +{KVEntryWALCommit, nil, nil, 10}, +{KVEntryWALSet | KVFlagCachedKey, []byte("key1"), []byte("val1_updated"), 10}, // key1 cached +{KVEntryWALSet, []byte("key3"), []byte("val3"), 10}, +{KVEntryWALCommit, nil, nil, 11}, +} - // Entry 3: Commit - entryType, ok, err = wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALCommit, entryType) +for i, exp := range expectedEntries { +entryType, ok, err := wr.Next() +require.NoError(t, err, "entry %d", i) +require.True(t, ok, "entry %d", i) +require.Equal(t, exp.entryType, entryType, "entry %d type", i) - // Blob is still readable via offset - blobData, err := r.UnsafeReadBlob(int(blobOffset)) - require.NoError(t, err) - require.Equal(t, []byte("blobKey"), blobData) +if exp.key != nil { +require.Equal(t, exp.key, wr.Key, "entry %d key", i) +} +if exp.value != nil { +require.Equal(t, exp.value, wr.Value, "entry %d value", i) +} +if entryType == KVEntryWALCommit { +require.Equal(t, exp.version, wr.Version, "entry %d version", i) +} } -func TestKVData_EdgeCases_LargeKeys(t *testing.T) { - tests := []struct { - name string - keySize int - }{ - {"127 bytes (1-byte varint)", 127}, - {"128 bytes (2-byte varint)", 128}, - {"16383 bytes (2-byte varint max)", 16383}, - {"16384 bytes (3-byte varint)", 16384}, - } +// No more entries +_, ok, err := wr.Next() +require.NoError(t, err) +require.False(t, ok) +} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") +func TestKVData_Blobs(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +key := []byte("blobKey") +value := []byte("blobValue") + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) + +// Write key blob +keyOffset, err := w.WriteKeyBlob(key) +require.NoError(t, err) + +// Write same key again - should return cached offset +keyOffset2, err := w.WriteKeyBlob(key) +require.NoError(t, err) +require.Equal(t, keyOffset, keyOffset2) + +// Write key/value blobs +keyOffset3, valueOffset, err := w.WriteKeyValueBlobs(key, value) +require.NoError(t, err) +require.Equal(t, keyOffset, keyOffset3) // same key, cached + +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +// Read key via offset +keyRead, err := r.UnsafeReadBlob(int(keyOffset)) +require.NoError(t, err) +require.Equal(t, key, keyRead) + +// Read value via offset +valueRead, err := r.UnsafeReadBlob(int(valueOffset)) +require.NoError(t, err) +require.Equal(t, value, valueRead) +} - key := make([]byte, tt.keySize) - for i := range key { - key[i] = byte(i % 256) - } - value := []byte("value") +func TestKVData_Blobs_ValueType(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write phase +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +_, valueOffset, err := w.WriteKeyValueBlobs([]byte("key"), []byte("value")) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read raw bytes to verify value uses KVEntryValueBlob type +data, err := os.ReadFile(path) +require.NoError(t, err) + +// The byte before valueOffset should be the type byte +// valueOffset points to the varint length, type byte is 1 before +require.Equal(t, byte(KVEntryValueBlob), data[valueOffset-1]) +} - // Write - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - keyOffset, _, err := w.WriteWALSet(key, value) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) +func TestKVData_MixedEntries(t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write WAL with blob entries interspersed +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) + +// WAL set +_, _, err = w.WriteWALSet([]byte("walKey1"), []byte("walVal1")) +require.NoError(t, err) + +// Blob entry (non-WAL) +blobOffset, err := w.WriteKeyBlob([]byte("blobKey")) +require.NoError(t, err) + +// Another WAL set +_, _, err = w.WriteWALSet([]byte("walKey2"), []byte("walVal2")) +require.NoError(t, err) + +err = w.WriteWALCommit(1) +require.NoError(t, err) + +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read phase +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) + +// WALReader.Next() should skip blob entries +// Entry 1: WAL Set +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, []byte("walKey1"), wr.Key) + +// Entry 2: WAL Set (blob skipped) +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, []byte("walKey2"), wr.Key) + +// Entry 3: Commit +entryType, ok, err = wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALCommit, entryType) + +// Blob is still readable via offset +blobData, err := r.UnsafeReadBlob(int(blobOffset)) +require.NoError(t, err) +require.Equal(t, []byte("blobKey"), blobData) +} - // Read - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) +func TestKVData_EdgeCases_LargeKeys(t *testing.T) { +tests := []struct { +name string +keySize int +}{ +{"127 bytes (1-byte varint)", 127}, +{"128 bytes (2-byte varint)", 128}, +{"16383 bytes (2-byte varint max)", 16383}, +{"16384 bytes (3-byte varint)", 16384}, +} - // Via WALReader - wr, err := r.ReadWAL() - require.NoError(t, err) - entryType, ok, err := wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, key, wr.Key) - require.Equal(t, value, wr.Value) +for _, tt := range tests { +t.Run(tt.name, func (t *testing.T) { +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") - // Via direct blob read - keyRead, err := r.UnsafeReadBlob(int(keyOffset)) - require.NoError(t, err) - require.Equal(t, key, keyRead) - }) - } +key := make([]byte, tt.keySize) +for i := range key { +key[i] = byte(i % 256) +} +value := []byte("value") + +// Write +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) +keyOffset, _, err := w.WriteWALSet(key, value) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +// Via WALReader +wr, err := r.ReadWAL() +require.NoError(t, err) +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, key, wr.Key) +require.Equal(t, value, wr.Value) + +// Via direct blob read +keyRead, err := r.UnsafeReadBlob(int(keyOffset)) +require.NoError(t, err) +require.Equal(t, key, keyRead) +}) +} } func TestKVData_EdgeCases_EmptyKey(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "kv.dat") - - // Write - f, err := os.Create(path) - require.NoError(t, err) - w := NewKVDataWriter(f) - err = w.WriteStartWAL(1) - require.NoError(t, err) - _, _, err = w.WriteWALSet([]byte{}, []byte("value")) - require.NoError(t, err) - require.NoError(t, w.Flush()) - require.NoError(t, f.Close()) - - // Read - f, err = os.Open(path) - require.NoError(t, err) - defer f.Close() - r, err := NewKVDataReader(f) - require.NoError(t, err) - - wr, err := r.ReadWAL() - require.NoError(t, err) - entryType, ok, err := wr.Next() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, KVEntryWALSet, entryType) - require.Equal(t, []byte{}, wr.Key) - require.Equal(t, []byte("value"), wr.Value) +dir := t.TempDir() +path := filepath.Join(dir, "kv.dat") + +// Write +f, err := os.Create(path) +require.NoError(t, err) +w := NewKVDataWriter(f) +err = w.WriteStartWAL(1) +require.NoError(t, err) +_, _, err = w.WriteWALSet([]byte{}, []byte("value")) +require.NoError(t, err) +require.NoError(t, w.Flush()) +require.NoError(t, f.Close()) + +// Read +f, err = os.Open(path) +require.NoError(t, err) +defer f.Close() +r, err := NewKVDataReader(f) +require.NoError(t, err) + +wr, err := r.ReadWAL() +require.NoError(t, err) +entryType, ok, err := wr.Next() +require.NoError(t, err) +require.True(t, ok) +require.Equal(t, KVEntryWALSet, entryType) +require.Equal(t, []byte{}, wr.Key) +require.Equal(t, []byte("value"), wr.Value) } +*/ diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 9b16af81b030..21e59309103a 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -13,6 +13,7 @@ import ( type KVDataWriter struct { *FileWriter keyCache map[string]uint32 + walMode bool } // NewKVDataWriter creates a new KVDataWriter. @@ -24,10 +25,22 @@ func NewKVDataWriter(file *os.File) *KVDataWriter { } } +// IsInWALMode returns true if the writer is in WAL mode. +// This is set to true after WriteStartWAL is called. +// If the writer is not in WAL mode, only key-value blobs can be written. +func (kvs *KVDataWriter) IsInWALMode() bool { + return kvs.walMode +} + +// WriteStartWAL writes a WAL start entry with the given version. +// This method can ONLY be called on an empty file at the start of creating a KV data file. +// An error is returned if the file is not empty. +// This puts the writer into WAL mode. func (kvs *KVDataWriter) WriteStartWAL(version uint64) error { if kvs.Size() != 0 { return fmt.Errorf("cannot write WAL start to non-empty file") } + kvs.walMode = true err := kvs.writeType(KVEntryWALStart) if err != nil { return err @@ -35,6 +48,8 @@ func (kvs *KVDataWriter) WriteStartWAL(version uint64) error { return kvs.writeVarUint(version) } +// WriteWALUpdates writes a batch of WAL updates. +// This can ONLY be called when the writer is in WAL mode. func (kvs *KVDataWriter) WriteWALUpdates(updates []KVUpdate) error { for _, update := range updates { if deleteKey := update.DeleteKey; deleteKey != nil { @@ -56,7 +71,12 @@ func (kvs *KVDataWriter) WriteWALUpdates(updates []KVUpdate) error { return nil } +// WriteWALSet writes a WAL set entry for the given key and value and returns their offsets in the file. +// This can ONLY be called when the writer is in WAL mode. func (kvs *KVDataWriter) WriteWALSet(key, value []byte) (keyOffset, valueOffset uint32, err error) { + if !kvs.walMode { + return 0, 0, fmt.Errorf("cannot write WAL set entry when not in WAL mode") + } keyOffset, cached := kvs.keyCache[unsafeBytesToString(key)] typ := KVEntryWALSet if cached { @@ -89,7 +109,12 @@ func (kvs *KVDataWriter) WriteWALSet(key, value []byte) (keyOffset, valueOffset return keyOffset, valueOffset, nil } +// WriteWALDelete writes a WAL delete entry for the given key. +// This can ONLY be called when the writer is in WAL mode. func (kvs *KVDataWriter) WriteWALDelete(key []byte) error { + if !kvs.walMode { + return fmt.Errorf("cannot write WAL delete entry when not in WAL mode") + } cachedOffset, cached := kvs.keyCache[unsafeBytesToString(key)] typ := KVEntryWALDelete if cached { @@ -117,7 +142,12 @@ func (kvs *KVDataWriter) WriteWALDelete(key []byte) error { return nil } +// WriteWALCommit writes a WAL commit entry for the given version. +// This can ONLY be called when the writer is in WAL mode. func (kvs *KVDataWriter) WriteWALCommit(version uint64) error { + if !kvs.walMode { + return fmt.Errorf("cannot write WAL commit entry when not in WAL mode") + } err := kvs.writeType(KVEntryWALCommit) if err != nil { return err @@ -126,6 +156,8 @@ func (kvs *KVDataWriter) WriteWALCommit(version uint64) error { return kvs.writeVarUint(version) } +// WriteKeyBlob writes a key blob and returns its offset in the file. +// This should be used for writing keys outside of WAL entries to take advantage of key caching. func (kvs *KVDataWriter) WriteKeyBlob(key []byte) (offset uint32, err error) { if offset, found := kvs.keyCache[unsafeBytesToString(key)]; found { return offset, nil @@ -141,6 +173,8 @@ func (kvs *KVDataWriter) WriteKeyBlob(key []byte) (offset uint32, err error) { return offset, nil } +// WriteKeyValueBlobs writes a key blob and a value blob and returns their offsets in the file. +// This should be used for writing key-value pairs in changesets where the WAL has been dropped. func (kvs *KVDataWriter) WriteKeyValueBlobs(key, value []byte) (keyOffset, valueOffset uint32, err error) { keyOffset, err = kvs.WriteKeyBlob(key) if err != nil { @@ -220,6 +254,9 @@ func (kvs *KVDataWriter) writeLEU32(x uint32) error { return err } +// unsafeBytesToString converts a byte slice to a string without allocation. +// This should be used with caution and only when the byte slice is not modified. +// But generally when we are storing a byte slice as a key in a map, this is what we should use. func unsafeBytesToString(b []byte) string { return unsafe.String(unsafe.SliceData(b), len(b)) } From b76da6bf667dea1a4a8a5cb7a6bd01363d246579 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 13:32:15 -0500 Subject: [PATCH 28/34] WIP on tests --- iavl/internal/kvdata_test.go | 732 ++++----------------------------- iavl/internal/kvdata_writer.go | 3 + 2 files changed, 75 insertions(+), 660 deletions(-) diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go index 9d42fbf5ca19..aed657e2f105 100644 --- a/iavl/internal/kvdata_test.go +++ b/iavl/internal/kvdata_test.go @@ -58,11 +58,13 @@ func TestKVData_WAL(t *testing.T) { // Write short key again to test caching shortValue2 := []byte("value2") shortKeyOffset2, shortValue2Offset, err := writer.WriteWALSet(shortKey, shortValue2) + require.NoError(t, err) // short key should NOT be cached require.NotEqual(t, shortKeyOffset, shortKeyOffset2) // Write longer key again to test caching longerValue2 := []byte("longerValue2") longKeyOffset2, longerValue2Offset, err := writer.WriteWALSet(longerKey, longerValue2) + require.NoError(t, err) // longer key should be cached require.Equal(t, longKeyOffset, longKeyOffset2) @@ -108,18 +110,23 @@ func TestKVData_WAL(t *testing.T) { require.NoError(t, writer.WriteWALCommit(43)) // test caching again with some blobs - blobKeyOffset, err := writer.WriteKeyBlob([]byte("longerKey")) + blobKeyOffset, err := writer.WriteKeyBlob(longerKey) require.NoError(t, err) // should use cached offset require.Equal(t, longKeyOffset, blobKeyOffset) - blobKeyOffset2, err := writer.WriteKeyBlob([]byte("key")) + blobKeyOffset2, shortValueOffset2, err := writer.WriteKeyValueBlobs(shortKey, shortValue) require.NoError(t, err) // short key should NOT be cached require.NotEqual(t, shortKeyOffset, blobKeyOffset2) // open reader r := writer.openReader(t) + // Verify that the reader has a WAL + hasWal, startVersion := r.HasWAL() + require.True(t, hasWal) + require.Equal(t, uint64(42), startVersion) + // Verify via WALReader wr, err := r.ReadWAL() require.NoError(t, err) @@ -224,6 +231,15 @@ func TestKVData_WAL(t *testing.T) { shorterValue2Read, err := r.UnsafeReadBlob(int(shortValue2Offset)) require.NoError(t, err) require.Equal(t, shortValue2, shorterValue2Read) + blobKeyRead, err := r.UnsafeReadBlob(int(blobKeyOffset)) + require.NoError(t, err) + require.Equal(t, longerKey, blobKeyRead) + blobKeyRead2, err := r.UnsafeReadBlob(int(blobKeyOffset2)) + require.NoError(t, err) + require.Equal(t, shortKey, blobKeyRead2) + shortValueRead2, err := r.UnsafeReadBlob(int(shortValueOffset2)) + require.NoError(t, err) + require.Equal(t, shortValue, shortValueRead2) // also check all memNode offsets memKey1Read, err := r.UnsafeReadBlob(int(memNode1.keyOffset)) require.NoError(t, err) @@ -245,667 +261,63 @@ func TestKVData_WAL(t *testing.T) { require.Equal(t, reinsertedValue, memValue3Read) } -func TestKVData_WALStart(t *testing.T) { - tests := []struct { - name string - version uint64 - }{ - {"version 0", 0}, - {"version 1", 1}, - {"version 100", 100}, - {"large version", 1<<32 + 12345}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - writer := openTestKVDataWriter(t) - - // Write WAL start - require.NoError(t, writer.WriteStartWAL(tt.version)) - - r := writer.openReader(t) - - // Verify HasWAL - hasWAL, startVersion := r.HasWAL() - require.True(t, hasWAL) - require.Equal(t, tt.version, startVersion) - - // Verify ReadWAL - wr, err := r.ReadWAL() - require.NoError(t, err) - require.Equal(t, tt.version, wr.Version) - }) - } -} - -//func TestKVData_WALMode(t *testing.T) { -// tests := []struct { -// name string -// startInWalMode bool -// op func(t *testing.T, writer *KVDataWriter) -// }{ -// { -// name: "WAL", -// startInWalMode: true, -// op: func(t *testing.T, writer *KVDataWriter) { -// -// }, -// }, -// } -//} - -/*func TestKVData_WALStart_EmptyFile(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Create empty file -f, err := os.Create(path) -require.NoError(t, err) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -// HasWAL should return false for empty file -hasWAL, startVersion := r.HasWAL() -require.False(t, hasWAL) -require.Equal(t, uint64(0), startVersion) - -// ReadWAL should return error -_, err = r.ReadWAL() -require.Error(t, err) -} - -func TestKVData_WALSet(t *testing.T) { -tests := []struct { -name string -key []byte -value []byte -}{ -{"simple kv", []byte("hello"), []byte("world")}, -{"empty value", []byte("key"), []byte{}}, -{"binary key", []byte{0x00, 0x01, 0x02}, []byte("value")}, -{"medium key", []byte(strings.Repeat("k", 100)), []byte(strings.Repeat("v", 200))}, -} - -for _, tt := range tests { -t.Run(tt.name, func (t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) -keyOffset, valueOffset, err := w.WriteWALSet(tt.key, tt.value) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -// Verify via WALReader -wr, err := r.ReadWAL() -require.NoError(t, err) -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, tt.key, wr.Key) -require.Equal(t, tt.value, wr.Value) - -// Verify offsets via UnsafeReadBlob -keyRead, err := r.UnsafeReadBlob(int(keyOffset)) -require.NoError(t, err) -require.Equal(t, tt.key, keyRead) - -valueRead, err := r.UnsafeReadBlob(int(valueOffset)) -require.NoError(t, err) -require.Equal(t, tt.value, valueRead) - -// No more entries -_, ok, err = wr.Next() -require.NoError(t, err) -require.False(t, ok) -}) -} -} - -func TestKVData_WALDelete(t *testing.T) { -tests := []struct { -name string -key []byte -}{ -{"simple key", []byte("deleteMe")}, -{"binary key", []byte{0xFF, 0xFE, 0xFD}}, -{"medium key", []byte(strings.Repeat("d", 50))}, -} - -for _, tt := range tests { -t.Run(tt.name, func (t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) -err = w.WriteWALDelete(tt.key) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -wr, err := r.ReadWAL() -require.NoError(t, err) -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALDelete, entryType) -require.Equal(t, tt.key, wr.Key) - -// No more entries -_, ok, err = wr.Next() -require.NoError(t, err) -require.False(t, ok) -}) -} -} - -func TestKVData_WALCommit(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) -_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) -require.NoError(t, err) -err = w.WriteWALCommit(1) -require.NoError(t, err) -_, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) -require.NoError(t, err) -err = w.WriteWALCommit(2) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -wr, err := r.ReadWAL() -require.NoError(t, err) -require.Equal(t, uint64(1), wr.Version) // Start version - -// Entry 1: Set -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, []byte("key1"), wr.Key) - -// Entry 2: Commit - version should update -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALCommit, entryType) -require.Equal(t, uint64(1), wr.Version) - -// Entry 3: Set -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, []byte("key2"), wr.Key) - -// Entry 4: Commit -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALCommit, entryType) -require.Equal(t, uint64(2), wr.Version) - -// No more entries -_, ok, err = wr.Next() -require.NoError(t, err) -require.False(t, ok) -} - -func TestKVData_KeyCaching(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -key := []byte("cachedKey") // > 4 bytes, will be cached - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) - -// First write - should be inline -keyOffset1, _, err := w.WriteWALSet(key, []byte("value1")) -require.NoError(t, err) - -// Second write of same key - should use cached offset -keyOffset2, _, err := w.WriteWALSet(key, []byte("value2")) -require.NoError(t, err) - -// Both should have the same key offset -require.Equal(t, keyOffset1, keyOffset2) - -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read raw bytes to verify cached flag is set -data, err := os.ReadFile(path) -require.NoError(t, err) - -// Find the second WALSet entry - it should have the cached flag (0x81) -// First entry is at offset after WALStart -foundCached := false -for i := 0; i < len(data); i++ { -if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { -foundCached = true -break -} -} -require.True(t, foundCached, "second WALSet should have cached key flag") - -// Read and verify both entries work correctly -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -wr, err := r.ReadWAL() -require.NoError(t, err) - -// First entry -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, key, wr.Key) -require.Equal(t, []byte("value1"), wr.Value) - -// Second entry - should also resolve key correctly via cache -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet|KVFlagCachedKey, entryType) -require.Equal(t, key, wr.Key) -require.Equal(t, []byte("value2"), wr.Value) -} - -func TestKVData_KeyCaching_ShortKeys(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -shortKey := []byte("abc") // 3 bytes, should NOT be cached - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) - -// First write -_, _, err = w.WriteWALSet(shortKey, []byte("value1")) -require.NoError(t, err) - -// Second write - should NOT use cache (key too short) -_, _, err = w.WriteWALSet(shortKey, []byte("value2")) -require.NoError(t, err) - -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read raw bytes - should NOT find cached flag for short keys -data, err := os.ReadFile(path) -require.NoError(t, err) - -cachedCount := 0 -for i := 0; i < len(data); i++ { -if data[i] == byte(KVEntryWALSet|KVFlagCachedKey) { -cachedCount++ -} -} -require.Equal(t, 0, cachedCount, "short keys should not be cached") -} - -func TestKVData_WALReplay(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write a realistic WAL sequence -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(10) -require.NoError(t, err) - -// Version 10 operations -_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1")) -require.NoError(t, err) -_, _, err = w.WriteWALSet([]byte("key2"), []byte("val2")) -require.NoError(t, err) -err = w.WriteWALDelete([]byte("oldKey")) -require.NoError(t, err) -err = w.WriteWALCommit(10) -require.NoError(t, err) - -// Version 11 operations -_, _, err = w.WriteWALSet([]byte("key1"), []byte("val1_updated")) -require.NoError(t, err) -_, _, err = w.WriteWALSet([]byte("key3"), []byte("val3")) -require.NoError(t, err) -err = w.WriteWALCommit(11) -require.NoError(t, err) - -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Replay -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -wr, err := r.ReadWAL() -require.NoError(t, err) -require.Equal(t, uint64(10), wr.Version) - -expectedEntries := []struct { -entryType KVEntryType -key []byte -value []byte -version uint64 -}{ -{KVEntryWALSet, []byte("key1"), []byte("val1"), 10}, -{KVEntryWALSet, []byte("key2"), []byte("val2"), 10}, -{KVEntryWALDelete, []byte("oldKey"), nil, 10}, -{KVEntryWALCommit, nil, nil, 10}, -{KVEntryWALSet | KVFlagCachedKey, []byte("key1"), []byte("val1_updated"), 10}, // key1 cached -{KVEntryWALSet, []byte("key3"), []byte("val3"), 10}, -{KVEntryWALCommit, nil, nil, 11}, -} - -for i, exp := range expectedEntries { -entryType, ok, err := wr.Next() -require.NoError(t, err, "entry %d", i) -require.True(t, ok, "entry %d", i) -require.Equal(t, exp.entryType, entryType, "entry %d type", i) - -if exp.key != nil { -require.Equal(t, exp.key, wr.Key, "entry %d key", i) -} -if exp.value != nil { -require.Equal(t, exp.value, wr.Value, "entry %d value", i) -} -if entryType == KVEntryWALCommit { -require.Equal(t, exp.version, wr.Version, "entry %d version", i) -} -} - -// No more entries -_, ok, err := wr.Next() -require.NoError(t, err) -require.False(t, ok) -} - -func TestKVData_Blobs(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -key := []byte("blobKey") -value := []byte("blobValue") - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) - -// Write key blob -keyOffset, err := w.WriteKeyBlob(key) -require.NoError(t, err) - -// Write same key again - should return cached offset -keyOffset2, err := w.WriteKeyBlob(key) -require.NoError(t, err) -require.Equal(t, keyOffset, keyOffset2) - -// Write key/value blobs -keyOffset3, valueOffset, err := w.WriteKeyValueBlobs(key, value) -require.NoError(t, err) -require.Equal(t, keyOffset, keyOffset3) // same key, cached - -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -// Read key via offset -keyRead, err := r.UnsafeReadBlob(int(keyOffset)) -require.NoError(t, err) -require.Equal(t, key, keyRead) - -// Read value via offset -valueRead, err := r.UnsafeReadBlob(int(valueOffset)) -require.NoError(t, err) -require.Equal(t, value, valueRead) -} - -func TestKVData_Blobs_ValueType(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write phase -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -_, valueOffset, err := w.WriteKeyValueBlobs([]byte("key"), []byte("value")) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read raw bytes to verify value uses KVEntryValueBlob type -data, err := os.ReadFile(path) -require.NoError(t, err) - -// The byte before valueOffset should be the type byte -// valueOffset points to the varint length, type byte is 1 before -require.Equal(t, byte(KVEntryValueBlob), data[valueOffset-1]) -} - -func TestKVData_MixedEntries(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -// Write WAL with blob entries interspersed -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) - -// WAL set -_, _, err = w.WriteWALSet([]byte("walKey1"), []byte("walVal1")) -require.NoError(t, err) - -// Blob entry (non-WAL) -blobOffset, err := w.WriteKeyBlob([]byte("blobKey")) -require.NoError(t, err) - -// Another WAL set -_, _, err = w.WriteWALSet([]byte("walKey2"), []byte("walVal2")) -require.NoError(t, err) - -err = w.WriteWALCommit(1) -require.NoError(t, err) - -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read phase -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -wr, err := r.ReadWAL() -require.NoError(t, err) - -// WALReader.Next() should skip blob entries -// Entry 1: WAL Set -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, []byte("walKey1"), wr.Key) - -// Entry 2: WAL Set (blob skipped) -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, []byte("walKey2"), wr.Key) - -// Entry 3: Commit -entryType, ok, err = wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALCommit, entryType) - -// Blob is still readable via offset -blobData, err := r.UnsafeReadBlob(int(blobOffset)) -require.NoError(t, err) -require.Equal(t, []byte("blobKey"), blobData) -} - -func TestKVData_EdgeCases_LargeKeys(t *testing.T) { -tests := []struct { -name string -keySize int -}{ -{"127 bytes (1-byte varint)", 127}, -{"128 bytes (2-byte varint)", 128}, -{"16383 bytes (2-byte varint max)", 16383}, -{"16384 bytes (3-byte varint)", 16384}, -} - -for _, tt := range tests { -t.Run(tt.name, func (t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") - -key := make([]byte, tt.keySize) -for i := range key { -key[i] = byte(i % 256) -} -value := []byte("value") - -// Write -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) -keyOffset, _, err := w.WriteWALSet(key, value) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) - -// Read -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) - -// Via WALReader -wr, err := r.ReadWAL() -require.NoError(t, err) -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, key, wr.Key) -require.Equal(t, value, wr.Value) +func TestKVData_BlobStore(t *testing.T) { + writer := openTestKVDataWriter(t) + // Write some blobs + key1 := []byte("key") // short key, should not be cached + key2 := []byte("a much longer key 2 that should be cached") + value1 := []byte("value1") + value2 := []byte("value2") -// Via direct blob read -keyRead, err := r.UnsafeReadBlob(int(keyOffset)) -require.NoError(t, err) -require.Equal(t, key, keyRead) -}) -} -} + key1Offset, err := writer.WriteKeyBlob(key1) + require.NoError(t, err) + key2Offset, err := writer.WriteKeyBlob(key2) + require.NoError(t, err) + key1Offset2, value1Offset, err := writer.WriteKeyValueBlobs(key1, value1) + require.NoError(t, err) + key2Offset2, value2Offset, err := writer.WriteKeyValueBlobs(key2, value2) + require.NoError(t, err) + // key1 should NOT be cached + require.NotEqual(t, key1Offset, key1Offset2) + // key2 should be cached + require.Equal(t, key2Offset, key2Offset2) + + // verify we're not in WAL mode and that WAL operations fail + require.False(t, writer.IsInWALMode()) + require.Error(t, writer.WriteStartWAL(1)) + require.Error(t, writer.WriteWALDelete(key1)) + _, _, err = writer.WriteWALSet(key1, value1) + require.Error(t, err) + require.Error(t, writer.WriteWALUpdates([]KVUpdate{})) + require.Error(t, writer.WriteWALCommit(1)) -func TestKVData_EdgeCases_EmptyKey(t *testing.T) { -dir := t.TempDir() -path := filepath.Join(dir, "kv.dat") + // open reader + r := writer.openReader(t) -// Write -f, err := os.Create(path) -require.NoError(t, err) -w := NewKVDataWriter(f) -err = w.WriteStartWAL(1) -require.NoError(t, err) -_, _, err = w.WriteWALSet([]byte{}, []byte("value")) -require.NoError(t, err) -require.NoError(t, w.Flush()) -require.NoError(t, f.Close()) + // verify that the reader does not have a WAL + hasWal, _ := r.HasWAL() + require.False(t, hasWal) -// Read -f, err = os.Open(path) -require.NoError(t, err) -defer f.Close() -r, err := NewKVDataReader(f) -require.NoError(t, err) + _, err = r.ReadWAL() + require.Error(t, err) -wr, err := r.ReadWAL() -require.NoError(t, err) -entryType, ok, err := wr.Next() -require.NoError(t, err) -require.True(t, ok) -require.Equal(t, KVEntryWALSet, entryType) -require.Equal(t, []byte{}, wr.Key) -require.Equal(t, []byte("value"), wr.Value) + // check that all offsets are readable + key1Read, err := r.UnsafeReadBlob(int(key1Offset)) + require.NoError(t, err) + require.Equal(t, key1, key1Read) + key2Read, err := r.UnsafeReadBlob(int(key2Offset)) + require.NoError(t, err) + require.Equal(t, key2, key2Read) + key1Read2, err := r.UnsafeReadBlob(int(key1Offset2)) + require.NoError(t, err) + require.Equal(t, key1, key1Read2) + value1Read, err := r.UnsafeReadBlob(int(value1Offset)) + require.NoError(t, err) + require.Equal(t, value1, value1Read) + key2Read2, err := r.UnsafeReadBlob(int(key2Offset2)) + require.NoError(t, err) + require.Equal(t, key2, key2Read2) + value2Read, err := r.UnsafeReadBlob(int(value2Offset)) + require.NoError(t, err) + require.Equal(t, value2, value2Read) } -*/ diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 21e59309103a..42190d3d0094 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -51,6 +51,9 @@ func (kvs *KVDataWriter) WriteStartWAL(version uint64) error { // WriteWALUpdates writes a batch of WAL updates. // This can ONLY be called when the writer is in WAL mode. func (kvs *KVDataWriter) WriteWALUpdates(updates []KVUpdate) error { + if !kvs.walMode { + return fmt.Errorf("cannot write WAL updates when not in WAL mode") + } for _, update := range updates { if deleteKey := update.DeleteKey; deleteKey != nil { err := kvs.WriteWALDelete(deleteKey) From a34a1745634a5e20c8dad539c6a3fe467989e332 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 13:38:29 -0500 Subject: [PATCH 29/34] WIP on tests --- iavl/internal/kvdata_test.go | 23 +++++++++++++++++++++++ iavl/internal/kvdata_writer.go | 22 +++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go index aed657e2f105..15af7c9a8213 100644 --- a/iavl/internal/kvdata_test.go +++ b/iavl/internal/kvdata_test.go @@ -120,6 +120,23 @@ func TestKVData_WAL(t *testing.T) { // short key should NOT be cached require.NotEqual(t, shortKeyOffset, blobKeyOffset2) + // write invalid updates, should error + require.Error(t, writer.WriteWALUpdates([]KVUpdate{ + {}, + })) + require.Error(t, writer.WriteWALUpdates([]KVUpdate{ + { + DeleteKey: shortKey, + SetNode: &MemNode{ + key: shortKey, + value: shortValue, + }, + }, + })) + + // write an empty commit + require.NoError(t, writer.WriteWALCommit(44)) + // open reader r := writer.openReader(t) // Verify that the reader has a WAL @@ -203,6 +220,12 @@ func TestKVData_WAL(t *testing.T) { require.True(t, ok) require.Equal(t, KVEntryWALCommit, entryType) require.Equal(t, uint64(43), wr.Version) + // Entry 12: WAL Commit (empty) + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + require.Equal(t, uint64(44), wr.Version) // No more entries _, ok, err = wr.Next() diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index 42190d3d0094..fc12f2e01361 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -55,20 +55,28 @@ func (kvs *KVDataWriter) WriteWALUpdates(updates []KVUpdate) error { return fmt.Errorf("cannot write WAL updates when not in WAL mode") } for _, update := range updates { - if deleteKey := update.DeleteKey; deleteKey != nil { + deleteKey := update.DeleteKey + setNode := update.SetNode + if deleteKey != nil && setNode != nil { + return fmt.Errorf("invalid update: both SetNode and DeleteKey are set") + } + + if deleteKey == nil && setNode == nil { + return fmt.Errorf("invalid update: neither SetNode nor DeleteKey is set") + } + + if deleteKey != nil { err := kvs.WriteWALDelete(deleteKey) if err != nil { return err } - } else if memNode := update.SetNode; memNode != nil { - keyOffset, valueOffset, err := kvs.WriteWALSet(memNode.key, memNode.value) + } else { // setNode != nil + keyOffset, valueOffset, err := kvs.WriteWALSet(setNode.key, setNode.value) if err != nil { return err } - memNode.keyOffset = keyOffset - memNode.valueOffset = valueOffset - } else { - return fmt.Errorf("invalid update: neither SetNode nor DeleteKey is set") + setNode.keyOffset = keyOffset + setNode.valueOffset = valueOffset } } return nil From db113dc596044a2dd7db2a6879c11e3e6883dcc6 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 13:47:36 -0500 Subject: [PATCH 30/34] document FileWriter --- iavl/internal/file_writer.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/iavl/internal/file_writer.go b/iavl/internal/file_writer.go index afe33bb1c3a9..774dcaf58b73 100644 --- a/iavl/internal/file_writer.go +++ b/iavl/internal/file_writer.go @@ -7,23 +7,29 @@ import ( "os" ) +// FileWriter is a buffered writer that tracks the number of bytes written. type FileWriter struct { writer *bufio.Writer written int } +// NewFileWriter creates a new FileWriter. +// Currently, it uses a buffer size of 512kb. +// If we want to make that configurable, we can add a constructor with a buffer size parameter in the future. func NewFileWriter(file *os.File) *FileWriter { return &FileWriter{ writer: bufio.NewWriterSize(file, 512*1024 /* 512kb */), // TODO: maybe we can have this as a config option? } } +// Write writes data to the underlying buffered writer and updates the written byte count. func (f *FileWriter) Write(p []byte) (n int, err error) { n, err = f.writer.Write(p) f.written += n return n, err } +// Flush flushes the underlying buffered writer. func (f *FileWriter) Flush() error { if err := f.writer.Flush(); err != nil { return fmt.Errorf("failed to flush writer: %w", err) @@ -31,6 +37,7 @@ func (f *FileWriter) Flush() error { return nil } +// Size returns the total number of bytes written so far. func (f *FileWriter) Size() int { return f.written } From 493430b6fd5d0c7e14fe4e655c626877379a56b2 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 13:48:25 -0500 Subject: [PATCH 31/34] add empty key/value test cases --- iavl/internal/kvdata_test.go | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go index 15af7c9a8213..09e78ab920c3 100644 --- a/iavl/internal/kvdata_test.go +++ b/iavl/internal/kvdata_test.go @@ -137,6 +137,19 @@ func TestKVData_WAL(t *testing.T) { // write an empty commit require.NoError(t, writer.WriteWALCommit(44)) + // Test empty key/value edge cases via WriteWALUpdates + emptyKey := []byte{} + emptyValue := []byte{} + emptyMemNode := &MemNode{key: emptyKey, value: emptyValue} + err = writer.WriteWALUpdates([]KVUpdate{ + {DeleteKey: emptyKey}, + {SetNode: emptyMemNode}, + }) + require.NoError(t, err) + require.NotZero(t, emptyMemNode.keyOffset) + require.NotZero(t, emptyMemNode.valueOffset) + require.NoError(t, writer.WriteWALCommit(45)) + // open reader r := writer.openReader(t) // Verify that the reader has a WAL @@ -227,6 +240,28 @@ func TestKVData_WAL(t *testing.T) { require.Equal(t, KVEntryWALCommit, entryType) require.Equal(t, uint64(44), wr.Version) + // Entry 13: WAL Delete with empty key + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALDelete, entryType) + require.Equal(t, emptyKey, wr.Key) + + // Entry 14: WAL Set with empty key and empty value + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALSet, entryType) + require.Equal(t, emptyKey, wr.Key) + require.Equal(t, emptyValue, wr.Value) + + // Entry 15: WAL Commit + entryType, ok, err = wr.Next() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, KVEntryWALCommit, entryType) + require.Equal(t, uint64(45), wr.Version) + // No more entries _, ok, err = wr.Next() require.NoError(t, err) @@ -282,6 +317,13 @@ func TestKVData_WAL(t *testing.T) { memValue3Read, err := r.UnsafeReadBlob(int(memNode3.valueOffset)) require.NoError(t, err) require.Equal(t, reinsertedValue, memValue3Read) + // check empty memNode offsets + emptyKeyRead, err := r.UnsafeReadBlob(int(emptyMemNode.keyOffset)) + require.NoError(t, err) + require.Equal(t, emptyKey, emptyKeyRead) + emptyValueRead, err := r.UnsafeReadBlob(int(emptyMemNode.valueOffset)) + require.NoError(t, err) + require.Equal(t, emptyValue, emptyValueRead) } func TestKVData_BlobStore(t *testing.T) { @@ -305,6 +347,16 @@ func TestKVData_BlobStore(t *testing.T) { // key2 should be cached require.Equal(t, key2Offset, key2Offset2) + // Test empty key/value edge cases + emptyKey := []byte{} + emptyValue := []byte{} + emptyKeyOffset, err := writer.WriteKeyBlob(emptyKey) + require.NoError(t, err) + emptyKeyOffset2, emptyValueOffset, err := writer.WriteKeyValueBlobs(emptyKey, emptyValue) + require.NoError(t, err) + // empty key should NOT be cached (len < 4) + require.NotEqual(t, emptyKeyOffset, emptyKeyOffset2) + // verify we're not in WAL mode and that WAL operations fail require.False(t, writer.IsInWALMode()) require.Error(t, writer.WriteStartWAL(1)) @@ -343,4 +395,14 @@ func TestKVData_BlobStore(t *testing.T) { value2Read, err := r.UnsafeReadBlob(int(value2Offset)) require.NoError(t, err) require.Equal(t, value2, value2Read) + // check empty key/value offsets + emptyKeyRead, err := r.UnsafeReadBlob(int(emptyKeyOffset)) + require.NoError(t, err) + require.Equal(t, emptyKey, emptyKeyRead) + emptyKeyRead2, err := r.UnsafeReadBlob(int(emptyKeyOffset2)) + require.NoError(t, err) + require.Equal(t, emptyKey, emptyKeyRead2) + emptyValueRead, err := r.UnsafeReadBlob(int(emptyValueOffset)) + require.NoError(t, err) + require.Equal(t, emptyValue, emptyValueRead) } From 9db9564fa129d50c1bfee1706ceb1ad33a3cb7a5 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 14:10:09 -0500 Subject: [PATCH 32/34] add mmap tests, fix empty close bug, minor cleanups --- iavl/internal/file_writer.go | 3 +- iavl/internal/kvdata_writer.go | 4 +- iavl/internal/leaf_layout.go | 2 +- iavl/internal/mmap.go | 6 ++- iavl/internal/mmap_test.go | 75 ++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 iavl/internal/mmap_test.go diff --git a/iavl/internal/file_writer.go b/iavl/internal/file_writer.go index 774dcaf58b73..24c939e43e5f 100644 --- a/iavl/internal/file_writer.go +++ b/iavl/internal/file_writer.go @@ -17,8 +17,9 @@ type FileWriter struct { // Currently, it uses a buffer size of 512kb. // If we want to make that configurable, we can add a constructor with a buffer size parameter in the future. func NewFileWriter(file *os.File) *FileWriter { + const defaultBufferSize = 512 * 1024 // 512kb return &FileWriter{ - writer: bufio.NewWriterSize(file, 512*1024 /* 512kb */), // TODO: maybe we can have this as a config option? + writer: bufio.NewWriterSize(file, defaultBufferSize), } } diff --git a/iavl/internal/kvdata_writer.go b/iavl/internal/kvdata_writer.go index fc12f2e01361..d57112774630 100644 --- a/iavl/internal/kvdata_writer.go +++ b/iavl/internal/kvdata_writer.go @@ -104,7 +104,6 @@ func (kvs *KVDataWriter) WriteWALSet(key, value []byte) (keyOffset, valueOffset return 0, 0, err } } else { - var err error keyOffset, err = kvs.writeLenPrefixedBytes(key) if err != nil { return 0, 0, err @@ -214,7 +213,8 @@ func (kvs *KVDataWriter) writeBlob(blobType KVEntryType, bz []byte) (offset uint } func (kvs *KVDataWriter) addKeyToCache(key []byte, offset uint32) { - if len(key) < 4 { + const minCacheKeyLen = 4 // we choose 4 because offsets are uint32 (4 bytes) + if len(key) < minCacheKeyLen { // don't cache very small keys return } diff --git a/iavl/internal/leaf_layout.go b/iavl/internal/leaf_layout.go index 1081c123ca59..bf9dbd52560e 100644 --- a/iavl/internal/leaf_layout.go +++ b/iavl/internal/leaf_layout.go @@ -25,7 +25,7 @@ type LeafLayout struct { // KeyOffset is the offset the key data for this node in the key value data file. // NOTE: that a 32-bit offset means that the key data file can be at most 4GB in size. // If we want to support larger key/value data files in the future, we can change this to a 40-bit offset. - // However, this would require changing the size of this struct from 44 bytes to 48 bytes which would break + // However, this would require growing the size of this struct which would break // on-disk compatibility. // Such an upgrade could be made by introducing a "wide changeset" format that lives alongside // this existing "compact" format. diff --git a/iavl/internal/mmap.go b/iavl/internal/mmap.go index 59e288a67e8d..a0caf4a5dea7 100644 --- a/iavl/internal/mmap.go +++ b/iavl/internal/mmap.go @@ -4,8 +4,9 @@ import ( "fmt" "io" "os" + + "github.com/edsrzf/mmap-go" ) -import "github.com/edsrzf/mmap-go" // Mmap represents a read-only memory map into a file. type Mmap struct { @@ -73,6 +74,9 @@ func (m Mmap) Len() int { // Close unmaps the memory-mapped file but does not close the underlying file. func (m Mmap) Close() error { + if m.handle == nil { + return nil + } return m.handle.Unmap() } diff --git a/iavl/internal/mmap_test.go b/iavl/internal/mmap_test.go new file mode 100644 index 000000000000..628d5a44f00b --- /dev/null +++ b/iavl/internal/mmap_test.go @@ -0,0 +1,75 @@ +package internal + +import ( + "os" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMmap_ReadWrite(t *testing.T) { + f, err := os.CreateTemp(t.TempDir(), "test") + require.NoError(t, err) + defer f.Close() + _, err = f.Write([]byte("hello world")) + require.NoError(t, err) + + m, err := NewMmap(f) + require.NoError(t, err) + defer m.Close() + + require.Equal(t, 11, m.Len()) + require.Equal(t, byte('h'), m.At(0)) + require.Equal(t, byte('d'), m.At(10)) + + // UnsafeSlice - within bounds + bz, err := m.UnsafeSlice(0, 5) + require.NoError(t, err) + require.Equal(t, []byte("hello"), bz) + + // UnsafeSlice - within bounds + bz, err = m.UnsafeSlice(6, 5) + require.NoError(t, err) + require.Equal(t, []byte("world"), bz) + + // UnsafeSlice - out of bounds + _, err = m.UnsafeSlice(10, 5) + require.Error(t, err) + + // UnsafeSliceVar - more than available + n, bz, err := m.UnsafeSliceVar(0, 20) + require.NoError(t, err) + require.Equal(t, 11, n) + require.Equal(t, []byte("hello world"), bz) + + // UnsafeSliceVar - less than available + n, bz, err = m.UnsafeSliceVar(6, 3) + require.NoError(t, err) + require.Equal(t, 3, n) + require.Equal(t, []byte("wor"), bz) + + // UnsafeSliceVar - exactly available + n, bz, err = m.UnsafeSliceVar(0, 11) + require.NoError(t, err) + require.Equal(t, 11, n) + require.Equal(t, []byte("hello world"), bz) + + // UnsafeSliceVar - no data + n, bz, err = m.UnsafeSliceVar(11, 5) + require.Error(t, err) + + // UnsafeSliceVar - out of bounds + n, bz, err = m.UnsafeSliceVar(20, 5) + require.Error(t, err) +} + +func TestMmap_EmptyFile(t *testing.T) { + f, err := os.CreateTemp(t.TempDir(), "empty") + require.NoError(t, err) + defer f.Close() + + m, err := NewMmap(f) + require.NoError(t, err) + require.Equal(t, 0, m.Len()) + require.NoError(t, m.Close()) +} From 951c767060fa4b7456aa6cf7815ebd6fceca1048 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 14:13:12 -0500 Subject: [PATCH 33/34] lint fix --- iavl/internal/kvdata_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/iavl/internal/kvdata_test.go b/iavl/internal/kvdata_test.go index 09e78ab920c3..88ede8ed95e7 100644 --- a/iavl/internal/kvdata_test.go +++ b/iavl/internal/kvdata_test.go @@ -12,6 +12,8 @@ type kvDataWriterHelper struct { } func openTestKVDataWriter(t *testing.T) *kvDataWriterHelper { + t.Helper() + files, err := CreateChangesetFiles(t.TempDir(), 1, 0) require.NoError(t, err) t.Cleanup(func() { @@ -25,6 +27,8 @@ func openTestKVDataWriter(t *testing.T) *kvDataWriterHelper { } func (h *kvDataWriterHelper) openReader(t *testing.T) *KVDataReader { + t.Helper() + require.NoError(t, h.writer.Flush()) rdr, err := NewKVDataReader(h.files.KVDataFile()) require.NoError(t, err) From 938c3c346f21ac74c6d8cc6f99efe4d8ccf6aaf1 Mon Sep 17 00:00:00 2001 From: Aaron Craelius Date: Fri, 5 Dec 2025 15:43:46 -0500 Subject: [PATCH 34/34] fix lint --- iavl/internal/mmap_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iavl/internal/mmap_test.go b/iavl/internal/mmap_test.go index 628d5a44f00b..5bb5fa18f0de 100644 --- a/iavl/internal/mmap_test.go +++ b/iavl/internal/mmap_test.go @@ -55,11 +55,11 @@ func TestMmap_ReadWrite(t *testing.T) { require.Equal(t, []byte("hello world"), bz) // UnsafeSliceVar - no data - n, bz, err = m.UnsafeSliceVar(11, 5) + _, bz, err = m.UnsafeSliceVar(11, 5) require.Error(t, err) // UnsafeSliceVar - out of bounds - n, bz, err = m.UnsafeSliceVar(20, 5) + _, bz, err = m.UnsafeSliceVar(20, 5) require.Error(t, err) }