8
8
#include < utility>
9
9
#include < vector>
10
10
11
- #include " csrc_dipu/utils/env.hpp"
12
-
13
11
#include " DIPUCachingAllocator.h"
14
12
#include " DIPUSpinMutex.h"
15
13
16
14
namespace dipu {
17
15
18
16
// Rounds `nbytes` up to the nearest multiple of `alignment_size`.
//
// @param nbytes          requested size in bytes.
// @param alignment_size  granularity to round up to.
// @return the smallest multiple of `alignment_size` that is >= `nbytes`.
//         `nbytes == 0` yields 0 (the unsigned subtraction wraps around).
//
// Power-of-two alignments use the original bit trick. Any other alignment
// takes a remainder-based path, because the bit trick does not produce a
// multiple of the alignment in that case (e.g. 5 rounded to 3 gave 7).
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  const size_t low_bits_mask = alignment_size - 1;
  const bool is_power_of_two = (alignment_size & low_bits_mask) == 0;
  if (is_power_of_two) {
    // Fill every bit below the alignment, then step to the next multiple.
    return ((nbytes - 1) | low_bits_mask) + 1;
  }
  const size_t remainder = nbytes % alignment_size;
  if (remainder == 0) {
    return nbytes;
  }
  return nbytes + (alignment_size - remainder);
}
19
+
24
20
class BFCachingAllocatorImpl {
25
21
public:
26
22
using allocate_fn_t = std::function<void *(size_t )>;
@@ -34,12 +30,23 @@ class BFCachingAllocatorImpl {
34
30
// Number of second level bins (linearly)
35
31
static constexpr int kNumSubBins = 4 ;
36
32
static constexpr int kLogNumSubBins = 2 ;
33
+
37
34
// Allocation parameters
38
- static constexpr int kMinAllocationSize = 512 ;
39
- static constexpr int kSmallBlockSize = 2 << 20 ;
40
- static constexpr int kMiddleBlockSize = 20 << 20 ;
41
- static constexpr int kLargeBlockSize = 200 << 20 ;
42
- static constexpr int kLargeAlignSize = 1024 << 20 ;
35
+ static constexpr size_t kMinBlockSize =
36
+ 512 ; // all sizes are rounded to at least 512 bytes
37
+ static constexpr size_t kSmallSize =
38
+ 1048576 ; // largest "small" allocation is 1 MiB
39
+ static constexpr size_t kSmallBuffer =
40
+ 2097152 ; // "small" allocations are packed in 2 MiB blocks
41
+ static constexpr size_t kLargeBuffer =
42
+ 20971520 ; // "large" allocations may be packed in 20 MiB blocks
43
+ static constexpr size_t kMinLargeAlloc =
44
+ 10485760 ; // allocations between 1 and 10 MiB may use kLargeBuffer
45
+ static constexpr size_t kRoundLarge =
46
+ 2097152 ; // round up large allocations to 2 MiB
47
+ static constexpr size_t kMaxSplitableBlockSize =
48
+ 200 << 20 ; // To further reduce fragmentation, blocks >= 200MB are not
49
+ // allowed to be split
43
50
44
51
size_t cachedBytes = 0 ;
45
52
size_t allocatedBytes = 0 ;
@@ -143,10 +150,11 @@ class BFCachingAllocatorImpl {
143
150
mutable mutex_t mut_;
144
151
145
152
static size_t roundBytes (size_t nbytes) {
146
- if (nbytes < kLargeBlockSize ) {
147
- return round_up_to_alignment (nbytes, kMinAllocationSize ) ;
153
+ if (nbytes <= kMinBlockSize ) {
154
+ return kMinBlockSize ;
148
155
}
149
- return round_up_to_alignment (nbytes, kSmallBlockSize );
156
+ int clz = __builtin_clzll (nbytes - 1 );
157
+ return (1 << (sizeof (int64_t ) - clz));
150
158
}
151
159
152
160
int newChunk (void * ptr, size_t size, size_t stream) {
@@ -169,7 +177,7 @@ class BFCachingAllocatorImpl {
169
177
// Big bin range:
170
178
// [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx`
171
179
// Split big bin into `kNumSubBins` sub bins
172
- size_t nBlocks = nbytes / kMinAllocationSize ;
180
+ size_t nBlocks = nbytes / kMinBlockSize ;
173
181
constexpr int kMaxBinIdx = 63 ;
174
182
int bigBinIdx = kMaxBinIdx - __builtin_clzll (nBlocks);
175
183
// If `nbytes` is so large, we just put it into the last
@@ -245,16 +253,22 @@ class BFCachingAllocatorImpl {
245
253
return id;
246
254
}
247
255
248
- void shrink (StreamSetHandle& set) {
256
+ void shrink (StreamSetHandle& set, size_t try_release_size = 0 ) {
257
+ size_t released_size = 0 ;
249
258
for (int binHead : set->binHeads_ ) {
250
259
int k = chunks_[binHead].nextChunkInList ;
251
260
while (k) {
252
- if (chunks_[k].isMonoBlock ()) {
253
- releaseOnDevice (chunks_[k].ptr , chunks_[k].size );
261
+ auto & chunk_k = chunks_[k];
262
+ if (chunk_k.isMonoBlock ()) {
263
+ released_size += chunk_k.size ;
264
+ releaseOnDevice (chunk_k.ptr , chunk_k.size );
254
265
removeChunkFromBin (k);
255
266
recycleIds_.push (k);
267
+ if (try_release_size > 0 && released_size >= try_release_size) {
268
+ break ;
269
+ }
256
270
}
257
- k = chunks_[k] .nextChunkInList ;
271
+ k = chunk_k .nextChunkInList ;
258
272
}
259
273
}
260
274
}
@@ -297,33 +311,39 @@ class BFCachingAllocatorImpl {
297
311
return id;
298
312
}
299
313
300
- int extend (size_t nbytes, StreamSetHandle& set) {
301
- emptyCacheWithoutLock ();
302
- bool increased = false ;
303
- size_t allocateSize = nbytes;
304
- if (nbytes < kSmallBlockSize ) {
305
- allocateSize = kSmallBlockSize ;
306
- } else if (nbytes < kMiddleBlockSize ) {
307
- allocateSize = kMiddleBlockSize ;
308
- } else if (nbytes < kLargeBlockSize ) {
309
- allocateSize = round_up_to_alignment (nbytes, kMiddleBlockSize );
310
- } else {
311
- allocateSize = round_up_to_alignment (nbytes, kLargeAlignSize );
314
+ size_t getAllocateSize (size_t nbytes) {
315
+ if (nbytes <= kSmallSize ) {
316
+ return kSmallBuffer ;
312
317
}
318
+ if (nbytes < kMinLargeAlloc ) {
319
+ return kLargeBuffer ;
320
+ }
321
+ return round_up_to_alignment (nbytes, kRoundLarge );
322
+ }
313
323
314
- size_t currBytes = std::max (nbytes, allocateSize);
315
- void * ptr = allocateOnDevice (currBytes);
324
+ int extend (size_t nbytes, StreamSetHandle& set) {
325
+ size_t allocateSize = getAllocateSize (nbytes);
326
+
327
+ void * ptr = allocateOnDevice (allocateSize);
328
+ if (!ptr) {
329
+ shrink (set, allocateSize);
330
+ ptr = allocateOnDevice (allocateSize);
331
+ }
332
+ if (!ptr) {
333
+ shrink (set);
334
+ ptr = allocateOnDevice (allocateSize);
335
+ }
316
336
if (!ptr) {
317
- if (currBytes > nbytes) {
318
- currBytes = nbytes;
319
- ptr = allocateOnDevice (currBytes );
337
+ if (allocateSize > nbytes) {
338
+ allocateSize = nbytes;
339
+ ptr = allocateOnDevice (allocateSize );
320
340
}
321
341
}
322
342
if (!ptr) {
323
343
return 0 ;
324
344
}
325
345
326
- int id = newChunk (ptr, currBytes , set->id );
346
+ int id = newChunk (ptr, allocateSize , set->id );
327
347
return id;
328
348
}
329
349
@@ -378,17 +398,7 @@ class BFCachingAllocatorImpl {
378
398
}
379
399
380
400
if (id) {
381
- int internlalMaxFragnmentSize = 0 ;
382
- const int chunk_size = static_cast <int >(chunks_[id].size );
383
- if (chunk_size < kSmallBlockSize ) {
384
- internlalMaxFragnmentSize = kMinAllocationSize ;
385
- } else if (chunk_size < kLargeAlignSize ) {
386
- internlalMaxFragnmentSize = kSmallBlockSize ;
387
- } else {
388
- internlalMaxFragnmentSize = kLargeAlignSize ;
389
- }
390
- if ((chunk_size >= (nbytes << 1 )) ||
391
- (chunk_size > (nbytes + internlalMaxFragnmentSize))) {
401
+ if (chunks_[id].size >= (nbytes << 1 )) {
392
402
id = split (id, nbytes);
393
403
}
394
404
chunks_[id].allocated = true ;
@@ -522,6 +532,9 @@ class BFCachingAllocator : public CacheAllocator {
522
532
: DataPtrContextBase(allocator, ptr, size), id_(id), nbytes_(nbytes) {}
523
533
524
534
~Context () {
535
+ if (size () <= 0 ) {
536
+ return ;
537
+ }
525
538
auto allocator_ = static_cast <const BFCachingAllocator*>(allocator ());
526
539
DIPU_DEBUG_ALLOCATOR (8 , " BFCachingAllocator: add to async_mem_pool:"
527
540
<< ptr () << " , " << size () << " nbytes, id:"
@@ -531,16 +544,21 @@ class BFCachingAllocator : public CacheAllocator {
531
544
if (ptr ()) {
532
545
allocator_->metrics_producer .deallocate (ptr ());
533
546
std::deque<DIPUEvent> events;
547
+ bool record_block = false ;
534
548
for (auto const & stream : streams ()) {
535
549
events.emplace_back ();
536
550
DIPU_DEBUG_ALLOCATOR (8 , " BFCachingAllocator: record to stream:"
537
551
<< stream.rawstream ());
538
552
events.back ().record (stream);
553
+ record_block = true ;
539
554
}
540
555
allocator_->async_mem_pool ()->add (std::make_tuple (ptr (), id_),
541
556
events);
542
557
allocator_->set_memory_allocated (allocator_->memory_allocated () -
543
558
nbytes_);
559
+ if (!record_block) {
560
+ allocator_->restore ();
561
+ }
544
562
}
545
563
} else {
546
564
DIPU_DEBUG_ALLOCATOR (8 ,
@@ -552,12 +570,12 @@ class BFCachingAllocator : public CacheAllocator {
552
570
553
571
friend class Context ;
554
572
555
- c10::DataPtr allocate (size_t size ) const override {
573
+ c10::DataPtr allocate (size_t origin_size ) const override {
556
574
restore ();
557
575
if (async_mem_pool ()->size () > kMaxAsyncResourcePoolLength ) {
558
576
try_empty_resource_pool ();
559
577
}
560
- size = getMemoryAlignmentStrategy ()->roundBytes (size );
578
+ size_t size = getMemoryAlignmentStrategy ()->roundBytes (origin_size );
561
579
std::tuple<void *, int , size_t > block = impl->allocateRaw (size);
562
580
void * ptr = std::get<0 >(block);
563
581
if (ptr == nullptr && size > 0 ) {
@@ -583,7 +601,7 @@ class BFCachingAllocator : public CacheAllocator {
583
601
deleteBFContext, device ());
584
602
DIPU_DEBUG_ALLOCATOR (
585
603
4 , " BFCachingAllocator: malloc "
586
- << nbytes << " ,requires " << size << " nbytes, ptr:" << ptr
604
+ << nbytes << " ,requires " << origin_size << " nbytes, ptr:" << ptr
587
605
<< " ,device:" << device ()
588
606
<< " ,async_mempool.size:" << async_mem_pool ()->size ());
589
607
c10::reportMemoryUsageToProfiler (
0 commit comments