-
Notifications
You must be signed in to change notification settings - Fork 275
perf(block): sync.Map -> atomic bitmap for cache dirty tracking #2235
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
ec523da
b4b62cc
7238599
791950e
f8e5216
6b71c54
7fc43fc
be976c0
779e033
19a857a
3f0daf4
c65d2b2
433c5d4
cb37eb1
51ba221
86f31cb
43b647d
adf386d
3d1e86c
1c53aec
64b640a
4f9b0cb
3b4cd7d
75b52cc
e67b4ed
b77211a
e08ddaa
6f0fe99
d4542cd
60c54ff
0835f26
2840ad4
07387c8
ae1ff1e
a89f1ff
4cdb5f2
59563a6
7f3307f
dd98117
098d060
3764ee2
3fdbd41
dcfc4ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| // Package atomicbitset provides a fixed-size bitset with atomic set operations. | ||
| package atomicbitset | ||
|
|
||
| import ( | ||
| "iter" | ||
| "math" | ||
| "math/bits" | ||
| "sync/atomic" | ||
| ) | ||
|
|
||
| // Bitset is a fixed-size bitset backed by atomic uint64 words. | ||
| // SetRange uses atomic OR, so concurrent writers are safe without | ||
| // external locking. | ||
| // | ||
| // A Bitset must not be copied after first use (copies share the | ||
| // underlying array). | ||
| type Bitset struct { | ||
| words []atomic.Uint64 | ||
| n uint | ||
| } | ||
|
|
||
| // New returns a Bitset with capacity for n bits, all initially zero. | ||
| func New(n uint) Bitset { | ||
|
||
| return Bitset{ | ||
| words: make([]atomic.Uint64, (n+63)/64), | ||
| n: n, | ||
| } | ||
| } | ||
|
|
||
| // Len returns the capacity in bits. | ||
| func (b *Bitset) Len() uint { return b.n } | ||
|
|
||
| // Has reports whether bit i is set. Out-of-range returns false. | ||
| func (b *Bitset) Has(i uint) bool { | ||
levb marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if i >= b.n { | ||
| return false | ||
| } | ||
|
|
||
| return b.words[i/64].Load()&(1<<(i%64)) != 0 | ||
| } | ||
|
|
||
| // wordMask returns a bitmask covering bits [lo, hi) within a single uint64 word. | ||
| func wordMask(lo, hi uint) uint64 { | ||
| if hi-lo == 64 { | ||
| return math.MaxUint64 | ||
| } | ||
|
|
||
| return ((1 << (hi - lo)) - 1) << lo | ||
| } | ||
|
|
||
| // HasRange reports whether every bit in [lo, hi) is set. | ||
| // An empty range returns true. hi is capped to Len(). | ||
| // Returns false if lo is out of range and the range is non-empty. | ||
| func (b *Bitset) HasRange(lo, hi uint) bool { | ||
| if lo >= hi { | ||
| return true | ||
| } | ||
| if hi > b.n { | ||
| hi = b.n | ||
| } | ||
| if lo >= hi { | ||
| return false | ||
| } | ||
| for i := lo; i < hi; { | ||
| w := i / 64 | ||
| bit := i % 64 | ||
| top := min(hi-w*64, 64) | ||
| mask := wordMask(bit, top) | ||
|
|
||
| if b.words[w].Load()&mask != mask { | ||
| return false | ||
| } | ||
| i = (w + 1) * 64 | ||
| } | ||
|
|
||
| return true | ||
| } | ||
|
|
||
| // SetRange sets every bit in [lo, hi) using atomic OR. | ||
| // hi is capped to Len(). | ||
| func (b *Bitset) SetRange(lo, hi uint) { | ||
| if hi > b.n { | ||
| hi = b.n | ||
| } | ||
| if lo >= hi { | ||
| return | ||
| } | ||
| for i := lo; i < hi; { | ||
| w := i / 64 | ||
| bit := i % 64 | ||
| top := min(hi-w*64, 64) | ||
|
|
||
| b.words[w].Or(wordMask(bit, top)) | ||
|
|
||
| i = (w + 1) * 64 | ||
| } | ||
| } | ||
|
|
||
| // Iterator returns an iterator over the indices of set bits | ||
| // in ascending order. | ||
| func (b *Bitset) Iterator() iter.Seq[uint] { | ||
| return func(yield func(uint) bool) { | ||
| for wi := range b.words { | ||
| word := b.words[wi].Load() | ||
| base := uint(wi) * 64 | ||
| for word != 0 { | ||
| tz := uint(bits.TrailingZeros64(word)) | ||
| if !yield(base + tz) { | ||
| return | ||
| } | ||
| word &= word - 1 | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if we have a 100GB disk and we set a dirty block at the end, won't we use like 400KB of memory just for storing that one bit?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Won't this be 3.125MiB?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be fair here though, this is also a problem with the bitset we use now. Only the roaring bitmaps solve this, but they don't support the iteration/lookup we would ideally need, so for pause proccessing I would still like to use the current bitset (or implement similarly effective iterator over the bitmaps).
One other node—with the current NBD the cache map might not be under much contention (but might be relevant if we use that for the other caches).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
divided by 64 as you have 64 bits (chunks) for one array slot
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👌 My bad (not bad in the end).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I added 2 implementations now - roaring (by far the most compact, and plenty fast on its own, single threaded), and a sharded atomic uint64 array which is still quite compact and much (100x) faster when concurrency is involved.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The roaring implementation looks great. Unless we really need the the speed optimization, I'm for opting for now for the roaring approach only, leaving the sharded atomic optimization outside of this PR