1
1
// Copyright (c) 2023, DeepLink.
2
2
3
+ #include < cstddef>
3
4
#include < functional>
4
5
#include < memory>
5
6
#include < stack>
14
15
15
16
namespace dipu {
16
17
17
- // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
18
- const size_t kMaxExtendSize = get_env_or_default(" DIPU_MAX_EXTEND_SIZE" , 1024 )
19
- << 20U ;
20
-
18
/// Rounds `nbytes` up to the nearest multiple of `alignment_size`.
///
/// The previous bit-mask form `((n - 1) | (a - 1)) + 1` is only correct when
/// the alignment is a power of two. This file also passes non-power-of-two
/// alignments (e.g. kMiddleBlockSize = 20 MiB), so the rounding is done with
/// integer division instead; for power-of-two alignments the result is
/// identical to the bit-mask form.
///
/// @param nbytes          Requested size in bytes.
/// @param alignment_size  Granularity to round up to.
/// @return `alignment_size` when `nbytes` is 0 (an empty request still
///         occupies one unit); `nbytes` unchanged when `alignment_size` is
///         0 or 1; otherwise the smallest multiple of `alignment_size` that
///         is >= `nbytes`.
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  // size_t is unsigned, so "<= 0" can only ever mean "== 0".
  if (nbytes == 0) {
    return alignment_size;
  }
  // Nothing to align to; also avoids dividing by zero below.
  if (alignment_size <= 1) {
    return nbytes;
  }
  // Written as (n - 1) / a + 1 rather than (n + a - 1) / a so the
  // intermediate sum cannot overflow for large nbytes.
  return ((nbytes - 1) / alignment_size + 1) * alignment_size;
}
21
24
class BFCachingAllocatorImpl {
22
25
public:
23
26
using allocate_fn_t = std::function<void *(size_t )>;
@@ -32,9 +35,11 @@ class BFCachingAllocatorImpl {
32
35
static constexpr int kNumSubBins = 4 ;
33
36
static constexpr int kLogNumSubBins = 2 ;
34
37
// Allocation parameters
35
- static constexpr size_t kMinAllocationSize = 512 ;
36
- static constexpr size_t kMaxInternalFragmentation = 8U << 20U ; // 8MB
37
- static constexpr size_t kMinExtendSize = 8U << 20U ; // 8MB
38
+ static constexpr int kMinAllocationSize = 512 ;
39
+ static constexpr int kSmallBlockSize = 2 << 20 ;
40
+ static constexpr int kMiddleBlockSize = 20 << 20 ;
41
+ static constexpr int kLargeBlockSize = 200 << 20 ;
42
+ static constexpr int kLargeAlignSize = 1024 << 20 ;
38
43
39
44
size_t cachedBytes = 0 ;
40
45
size_t allocatedBytes = 0 ;
@@ -67,8 +72,6 @@ class BFCachingAllocatorImpl {
67
72
__uint128_t bits = 0 ;
68
73
// Virtual chunks which are the heads of the bins
69
74
std::array<int , static_cast <size_t >(kNumBigBins * kNumSubBins )> binHeads_{};
70
- // The extending size next time
71
- size_t currExtendSize_ = kMinExtendSize ;
72
75
73
76
explicit StreamSet (size_t id) : id(id) {}
74
77
@@ -140,7 +143,10 @@ class BFCachingAllocatorImpl {
140
143
mutable mutex_t mut_;
141
144
142
145
static size_t roundBytes (size_t nbytes) {
143
- return ((nbytes - 1 ) | (kMinAllocationSize - 1 )) + 1 ;
146
+ if (nbytes < kLargeBlockSize ) {
147
+ return round_up_to_alignment (nbytes, kMinAllocationSize );
148
+ }
149
+ return round_up_to_alignment (nbytes, kSmallBlockSize );
144
150
}
145
151
146
152
int newChunk (void * ptr, size_t size, size_t stream) {
@@ -293,20 +299,21 @@ class BFCachingAllocatorImpl {
293
299
294
300
int extend (size_t nbytes, StreamSetHandle& set) {
295
301
emptyCacheWithoutLock ();
296
- auto & extSize = set->currExtendSize_ ;
297
302
bool increased = false ;
298
- while (extSize < nbytes && extSize < kMaxExtendSize ) {
299
- extSize *= 2 ;
300
- increased = true ;
303
+ size_t allocateSize = nbytes;
304
+ if (nbytes < kSmallBlockSize ) {
305
+ allocateSize = kSmallBlockSize ;
306
+ } else if (nbytes < kMiddleBlockSize ) {
307
+ allocateSize = kMiddleBlockSize ;
308
+ } else if (nbytes < kLargeBlockSize ) {
309
+ allocateSize = round_up_to_alignment (nbytes, kMiddleBlockSize );
310
+ } else {
311
+ allocateSize = round_up_to_alignment (nbytes, kLargeAlignSize );
301
312
}
302
313
303
- size_t currBytes = std::max (nbytes, extSize );
314
+ size_t currBytes = std::max (nbytes, allocateSize );
304
315
void * ptr = allocateOnDevice (currBytes);
305
- if (ptr) {
306
- if (!increased && extSize < kMaxExtendSize ) {
307
- extSize *= 2 ;
308
- }
309
- } else {
316
+ if (!ptr) {
310
317
if (currBytes > nbytes) {
311
318
currBytes = nbytes;
312
319
ptr = allocateOnDevice (currBytes);
@@ -371,8 +378,17 @@ class BFCachingAllocatorImpl {
371
378
}
372
379
373
380
if (id) {
374
- if (chunks_[id].size >= nbytes * 2 ||
375
- chunks_[id].size >= nbytes + kMaxInternalFragmentation ) {
381
+ int internlalMaxFragnmentSize = 0 ;
382
+ const int chunk_size = static_cast <int >(chunks_[id].size );
383
+ if (chunk_size < kSmallBlockSize ) {
384
+ internlalMaxFragnmentSize = kMinAllocationSize ;
385
+ } else if (chunk_size < kLargeAlignSize ) {
386
+ internlalMaxFragnmentSize = kSmallBlockSize ;
387
+ } else {
388
+ internlalMaxFragnmentSize = kLargeAlignSize ;
389
+ }
390
+ if ((chunk_size >= (nbytes << 1 )) ||
391
+ (chunk_size > (nbytes + internlalMaxFragnmentSize))) {
376
392
id = split (id, nbytes);
377
393
}
378
394
chunks_[id].allocated = true ;
0 commit comments