// Copyright (c) 2023, DeepLink.

- #include <cstddef>
#include <functional>
#include <memory>
#include <stack>
#include <thread>
#include <utility>
#include <vector>

+ #include "csrc_dipu/utils/env.hpp"
+
#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"

namespace dipu {

- inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
-   return ((nbytes - 1) | (alignment_size - 1)) + 1;
- }
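+ // The DIPU_MAX_EXTEND_SIZE environment value is read in MiB (default 1024)
+ // and shifted into bytes; it caps the geometric growth of the per-stream
+ // extend size used in extend() below.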
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+ const size_t kMaxExtendSize = get_env_or_default("DIPU_MAX_EXTEND_SIZE", 1024)
+                               << 20U;

class BFCachingAllocatorImpl {
 public:
@@ -30,23 +31,10 @@ class BFCachingAllocatorImpl {
  // Number of second level bins (linearly)
  static constexpr int kNumSubBins = 4;
  static constexpr int kLogNumSubBins = 2;
-
  // Allocation parameters
-  static constexpr size_t kMinBlockSize =
-      512;  // all sizes are rounded to at least 512 bytes
-  static constexpr size_t kSmallSize =
-      1048576;  // largest "small" allocation is 1 MiB
-  static constexpr size_t kSmallBuffer =
-      2097152;  // "small" allocations are packed in 2 MiB blocks
-  static constexpr size_t kLargeBuffer =
-      20971520;  // "large" allocations may be packed in 20 MiB blocks
-  static constexpr size_t kMinLargeAlloc =
-      10485760;  // allocations between 1 and 10 MiB may use kLargeBuffer
-  static constexpr size_t kRoundLarge =
-      2097152;  // round up large allocations to 2 MiB
-  static constexpr size_t kMaxSplitableBlockSize =
-      200 << 20;  // To further reduce fragmentation, blocks >= 200MB are not
-                  // allowed to be split
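+  // kMaxInternalFragmentation bounds how many bytes a request may waste inside
+  // the chunk it is handed (larger leftovers trigger a split below), and
+  // kMinExtendSize is the initial per-stream extend size.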
+  static constexpr size_t kMinAllocationSize = 512;
+  static constexpr size_t kMaxInternalFragmentation = 8U << 20U;  // 8MB
+  static constexpr size_t kMinExtendSize = 8U << 20U;             // 8MB

  size_t cachedBytes = 0;
  size_t allocatedBytes = 0;
@@ -79,6 +67,8 @@ class BFCachingAllocatorImpl {
    __uint128_t bits = 0;
    // Virtual chunks which are the heads of the bins
    std::array<int, static_cast<size_t>(kNumBigBins * kNumSubBins)> binHeads_{};
+    // Size of the next device-memory extend for this stream; grows geometrically
+    size_t currExtendSize_ = kMinExtendSize;

    explicit StreamSet(size_t id) : id(id) {}

@@ -150,11 +140,7 @@ class BFCachingAllocatorImpl {
  mutable mutex_t mut_;

  static size_t roundBytes(size_t nbytes) {
-    if (nbytes <= kMinBlockSize) {
-      return kMinBlockSize;
-    }
-    int clz = __builtin_clzll(nbytes - 1);
-    return (1LU << (sizeof(int64_t) - clz));
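+    // Round up to the next multiple of kMinAllocationSize (a power of two):
+    // e.g. 1 -> 512, 512 -> 512, 513 -> 1024.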
+    return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
  }

  int newChunk(void* ptr, size_t size, size_t stream) {
@@ -177,7 +163,7 @@ class BFCachingAllocatorImpl {
    // Big bin range:
    // [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx`
    // Split big bin into `kNumSubBins` sub bins
-    size_t nBlocks = nbytes / kMinBlockSize;
+    size_t nBlocks = nbytes / kMinAllocationSize;
    constexpr int kMaxBinIdx = 63;
    int bigBinIdx = kMaxBinIdx - __builtin_clzll(nBlocks);
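    // bigBinIdx == floor(log2(nBlocks)) for nBlocks > 0 (63 minus the leading
    // zero count of a 64-bit value).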
    // If `nbytes` is so large, we just put it into the last
@@ -253,22 +239,16 @@ class BFCachingAllocatorImpl {
    return id;
  }

-  void shrink(StreamSetHandle& set, size_t try_release_size = 0) {
-    size_t released_size = 0;
+  void shrink(StreamSetHandle& set) {
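+    // Release every unsplit (mono-block) chunk cached for this stream set.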
    for (int binHead : set->binHeads_) {
      int k = chunks_[binHead].nextChunkInList;
      while (k) {
-        auto& chunk_k = chunks_[k];
-        if (chunk_k.isMonoBlock()) {
-          released_size += chunk_k.size;
-          releaseOnDevice(chunk_k.ptr, chunk_k.size);
+        if (chunks_[k].isMonoBlock()) {
+          releaseOnDevice(chunks_[k].ptr, chunks_[k].size);
          removeChunkFromBin(k);
          recycleIds_.push(k);
-          if (try_release_size > 0 && released_size >= try_release_size) {
-            break;
-          }
        }
-        k = chunk_k.nextChunkInList;
+        k = chunks_[k].nextChunkInList;
      }
    }
  }
@@ -311,39 +291,32 @@ class BFCachingAllocatorImpl {
    return id;
  }

-  size_t getAllocateSize(size_t nbytes) {
-    if (nbytes <= kSmallSize) {
-      return kSmallBuffer;
-    }
-    if (nbytes < kMinLargeAlloc) {
-      return kLargeBuffer;
-    }
-    return round_up_to_alignment(nbytes, kRoundLarge);
-  }
-
  int extend(size_t nbytes, StreamSetHandle& set) {
-    size_t allocateSize = getAllocateSize(nbytes);
-
-    void* ptr = allocateOnDevice(allocateSize);
-    if (!ptr) {
-      shrink(set, allocateSize);
-      ptr = allocateOnDevice(allocateSize);
-    }
-    if (!ptr) {
-      shrink(set);
-      ptr = allocateOnDevice(allocateSize);
-    }
-    if (!ptr) {
-      if (allocateSize > nbytes) {
-        allocateSize = nbytes;
-        ptr = allocateOnDevice(allocateSize);
+    emptyCacheWithoutLock();
+    auto& extSize = set->currExtendSize_;
+    bool increased = false;
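+    // Grow this stream's extend size geometrically until it covers the
+    // request, stopping once it reaches kMaxExtendSize.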
+    while (extSize < nbytes && extSize < kMaxExtendSize) {
+      extSize *= 2;
+      increased = true;
+    }
+
+    size_t currBytes = std::max(nbytes, extSize);
+    void* ptr = allocateOnDevice(currBytes);
+    if (ptr) {
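+      // The allocation succeeded without growing: double the next extend size
+      // for this stream (only while it is still below kMaxExtendSize).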
+      if (!increased && extSize < kMaxExtendSize) {
+        extSize *= 2;
+      }
+    } else {
+      if (currBytes > nbytes) {
+        currBytes = nbytes;
+        ptr = allocateOnDevice(currBytes);
      }
    }
    if (!ptr) {
      return 0;
    }

-    int id = newChunk(ptr, allocateSize, set->id);
+    int id = newChunk(ptr, currBytes, set->id);
    return id;
  }

@@ -398,7 +371,8 @@ class BFCachingAllocatorImpl {
    }

    if (id) {
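+      // Split off the tail when the chunk is at least twice the request or
+      // would otherwise waste more than kMaxInternalFragmentation bytes.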
-      if (chunks_[id].size >= (nbytes << 1)) {
+      if (chunks_[id].size >= nbytes * 2 ||
+          chunks_[id].size >= nbytes + kMaxInternalFragmentation) {
        id = split(id, nbytes);
      }
      chunks_[id].allocated = true;
@@ -532,9 +506,6 @@ class BFCachingAllocator : public CacheAllocator {
        : DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {}

    ~Context() {
-      if (size() <= 0) {
-        return;
-      }
      auto allocator_ = static_cast<const BFCachingAllocator*>(allocator());
      DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: add to async_mem_pool:"
                                  << ptr() << ", " << size() << " nbytes, id:"
@@ -544,22 +515,18 @@ class BFCachingAllocator : public CacheAllocator {
      if (ptr()) {
        allocator_->metrics_producer.deallocate(ptr());
        std::deque<DIPUEvent> events;
-        bool record_block = false;
        for (auto const& stream : streams()) {
          events.emplace_back();
          DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:"
                                      << stream.rawstream());
          events.back().record(stream);
-          record_block = true;
        }
        allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_),
                                          events);
        allocator_->set_memory_allocated(allocator_->memory_allocated() -
                                         nbytes_);
-        if (!record_block) {
-          allocator_->restore();
-        }
      }
+      allocator_->restore();
    } else {
      DIPU_DEBUG_ALLOCATOR(8,
                           "BFCachingAllocator:~Context: destory tensor "
@@ -570,12 +537,12 @@ class BFCachingAllocator : public CacheAllocator {

  friend class Context;

-  c10::DataPtr allocate(size_t origin_size) const override {
+  c10::DataPtr allocate(size_t size) const override {
    restore();
    if (async_mem_pool()->size() > kMaxAsyncResourcePoolLength) {
      try_empty_resource_pool();
    }
-    size_t size = getMemoryAlignmentStrategy()->roundBytes(origin_size);
+    size = getMemoryAlignmentStrategy()->roundBytes(size);
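+    // `size` now holds the rounded-up request that is passed to the impl.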
    std::tuple<void*, int, size_t> block = impl->allocateRaw(size);
    void* ptr = std::get<0>(block);
    if (ptr == nullptr && size > 0) {
@@ -601,8 +568,8 @@ class BFCachingAllocator : public CacheAllocator {
                         deleteBFContext, device());
    DIPU_DEBUG_ALLOCATOR(
        4, "BFCachingAllocator: malloc "
-               << nbytes << " ,requires " << origin_size
-               << " nbytes, ptr: " << ptr << " ,device:" << device()
+               << nbytes << " ,requires " << size << " nbytes, ptr: " << ptr
+               << " ,device:" << device()
               << " ,async_mempool.size:" << async_mem_pool()->size());
    c10::reportMemoryUsageToProfiler(
        ptr, static_cast<int64_t>(nbytes), memory_allocated(),