Skip to content

Commit 740b07c

Browse files
optimize bf allocator
1 parent 15ef81e commit 740b07c

File tree

3 files changed

+46
-24
lines changed

3 files changed

+46
-24
lines changed

dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright (c) 2023, DeepLink.
22

3+
#include <cstddef>
34
#include <functional>
45
#include <memory>
56
#include <stack>
@@ -14,10 +15,12 @@
1415

1516
namespace dipu {
1617

17-
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
18-
const size_t kMaxExtendSize = get_env_or_default("DIPU_MAX_EXTEND_SIZE", 1024)
19-
<< 20U;
20-
18+
// Rounds `nbytes` up to the nearest multiple of `alignment_size`.
//
// - nbytes == 0 returns `alignment_size`, so a zero-byte request still
//   occupies one aligned unit.
// - alignment_size == 0 means "no alignment" and returns `nbytes` unchanged.
// - Power-of-two alignments use the mask trick; any other alignment (for
//   example 20 MiB) uses division, because the mask trick does not produce a
//   multiple of a non-power-of-two alignment.
//
// The result is always >= nbytes (barring overflow near SIZE_MAX).
inline size_t round_up_to_alignment(size_t nbytes, size_t alignment_size) {
  // size_t is unsigned, so "empty request" can only mean exactly zero.
  if (nbytes == 0) {
    return alignment_size;
  }
  if (alignment_size == 0) {
    return nbytes;
  }
  const bool is_power_of_two = (alignment_size & (alignment_size - 1)) == 0;
  if (is_power_of_two) {
    return ((nbytes - 1) | (alignment_size - 1)) + 1;
  }
  // General case: ceil(nbytes / alignment_size) * alignment_size, written so
  // that the intermediate sum nbytes + alignment_size cannot overflow.
  return ((nbytes - 1) / alignment_size + 1) * alignment_size;
}
2124
class BFCachingAllocatorImpl {
2225
public:
2326
using allocate_fn_t = std::function<void*(size_t)>;
@@ -32,9 +35,11 @@ class BFCachingAllocatorImpl {
3235
static constexpr int kNumSubBins = 4;
3336
static constexpr int kLogNumSubBins = 2;
3437
// Allocation parameters
35-
static constexpr size_t kMinAllocationSize = 512;
36-
static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB
37-
static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB
38+
static constexpr int kMinAllocationSize = 512;
39+
static constexpr int kSmallBlockSize = 2 << 20;
40+
static constexpr int kMiddleBlockSize = 20 << 20;
41+
static constexpr int kLargeBlockSize = 200 << 20;
42+
static constexpr int kLargeAlignSize = 1024 << 20;
3843

3944
size_t cachedBytes = 0;
4045
size_t allocatedBytes = 0;
@@ -67,8 +72,6 @@ class BFCachingAllocatorImpl {
6772
__uint128_t bits = 0;
6873
// Virtual chunks which are the heads of the bins
6974
std::array<int, static_cast<size_t>(kNumBigBins* kNumSubBins)> binHeads_{};
70-
// The extending size next time
71-
size_t currExtendSize_ = kMinExtendSize;
7275

7376
explicit StreamSet(size_t id) : id(id) {}
7477

@@ -140,7 +143,10 @@ class BFCachingAllocatorImpl {
140143
mutable mutex_t mut_;
141144

142145
// Rounds a requested byte count up with round_up_to_alignment().
// In the lines added by this commit (gutter marked `+`), requests below
// kLargeBlockSize are rounded to kMinAllocationSize and all other requests
// are rounded to kSmallBlockSize.
// The line under the `143-` gutter is the pre-commit implementation, which
// always applied the kMinAllocationSize mask.
static size_t roundBytes(size_t nbytes) {
143-
return ((nbytes - 1) | (kMinAllocationSize - 1)) + 1;
146+
if (nbytes < kLargeBlockSize) {
147+
return round_up_to_alignment(nbytes, kMinAllocationSize);
148+
}
149+
return round_up_to_alignment(nbytes, kSmallBlockSize);
144150
}
145151

146152
int newChunk(void* ptr, size_t size, size_t stream) {
@@ -293,20 +299,21 @@ class BFCachingAllocatorImpl {
293299

294300
int extend(size_t nbytes, StreamSetHandle& set) {
295301
emptyCacheWithoutLock();
296-
auto& extSize = set->currExtendSize_;
297302
bool increased = false;
298-
while (extSize < nbytes && extSize < kMaxExtendSize) {
299-
extSize *= 2;
300-
increased = true;
303+
size_t allocateSize = nbytes;
304+
if (nbytes < kSmallBlockSize) {
305+
allocateSize = kSmallBlockSize;
306+
} else if (nbytes < kMiddleBlockSize) {
307+
allocateSize = kMiddleBlockSize;
308+
} else if (nbytes < kLargeBlockSize) {
309+
allocateSize = round_up_to_alignment(nbytes, kMiddleBlockSize);
310+
} else {
311+
allocateSize = round_up_to_alignment(nbytes, kLargeAlignSize);
301312
}
302313

303-
size_t currBytes = std::max(nbytes, extSize);
314+
size_t currBytes = std::max(nbytes, allocateSize);
304315
void* ptr = allocateOnDevice(currBytes);
305-
if (ptr) {
306-
if (!increased && extSize < kMaxExtendSize) {
307-
extSize *= 2;
308-
}
309-
} else {
316+
if (!ptr) {
310317
if (currBytes > nbytes) {
311318
currBytes = nbytes;
312319
ptr = allocateOnDevice(currBytes);
@@ -371,8 +378,17 @@ class BFCachingAllocatorImpl {
371378
}
372379

373380
if (id) {
374-
if (chunks_[id].size >= nbytes * 2 ||
375-
chunks_[id].size >= nbytes + kMaxInternalFragmentation) {
381+
int internlalMaxFragnmentSize = 0;
382+
const int chunk_size = static_cast<int>(chunks_[id].size);
383+
if (chunk_size < kSmallBlockSize) {
384+
internlalMaxFragnmentSize = kMinAllocationSize;
385+
} else if (chunk_size < kLargeAlignSize) {
386+
internlalMaxFragnmentSize = kSmallBlockSize;
387+
} else {
388+
internlalMaxFragnmentSize = kLargeAlignSize;
389+
}
390+
if ((chunk_size >= (nbytes << 1)) ||
391+
(chunk_size > (nbytes + internlalMaxFragnmentSize))) {
376392
id = split(id, nbytes);
377393
}
378394
chunks_[id].allocated = true;

dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ c10::DataPtr DIPURawDeviceAllocator::allocate(
5050
if (nbytes > 0) {
5151
devproxy::mallocDevice(&data, nbytes);
5252
DIPU_DEBUG_ALLOCATOR(1, "devproxy::mallocDevice: malloc "
53-
<< nbytes << " nbytes, ptr:" << data);
53+
<< nbytes << "(" << (nbytes >> 20) << "MB)"
54+
<< " nbytes, ptr:" << data);
5455
}
5556
return {data, data, &DIPURawDeviceAllocatorDeleter,
5657
c10::Device(dipu::DIPU_DEVICE_TYPE, device_index)};
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
#include <cstdint>
2+
13
#include "csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h"
4+
#include "csrc_dipu/utils/env.hpp"
25

36
// Immediately-invoked lambda: builds a function-local static
// MemoryAlignmentStrategy and registers it through
// dipu::setMemoryAlignmentStrategy(); the stored result is always 0.
// NOTE(review): the comments below describe 64-byte alignment, while the
// added lines pass the ASCEND_ALLOCATE_ALIGN value (default 512) as the first
// constructor argument -- presumably the alignment; confirm and update the
// comments accordingly.
const static int ascend_init = []() {
47
// - The memory size is rounded up to a multiple of 32, plus 32 bytes (m = ALIGN_UP[len, 32] + 32 bytes);
58
// - The memory start address must be 64-byte aligned (ALIGN_UP[m, 64]).
69
// nbytes = align_64(1 * nbytes + 32);
7-
static dipu::MemoryAlignmentStrategy memoryAlignStrategy(64, 1, 32);
10+
11+
int64_t align = dipu::get_env_or_default("ASCEND_ALLOCATE_ALIGN", 512);
12+
static dipu::MemoryAlignmentStrategy memoryAlignStrategy(align, 1, 32);
813
dipu::setMemoryAlignmentStrategy(&memoryAlignStrategy);
914
return 0;
1015
}();

0 commit comments

Comments
 (0)