Skip to content

Commit

Permalink
Merge pull request #10484 from michal-shalev/fix-memic-alloc-fail
Browse files (browse the repository at this point in the history)
TEST/GTEST/UCT: Retry when MEMIC allocation fails
  • Loading branch information
yosefe authored Feb 12, 2025
2 parents d44ff68 + 75df7ab commit 4a5d339
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 28 deletions.
2 changes: 1 addition & 1 deletion test/gtest/uct/test_atomic_key_reg_rdma_mem_type.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -31,7 +31,7 @@ UCS_TEST_SKIP_COND_P(uct_atomic_key_reg_rdma_mem_type, fadd64,
!check_rdma_memory())
{
mapped_buffer recvbuf(sizeof(uint64_t), receiver(), 0UL,
UCS_MEMORY_TYPE_RDMA);
UCS_MEMORY_TYPE_RDMA, UCT_MD_MEM_ACCESS_ALL, 10);
uint64_t add = rand64();

run_workers(static_cast<send_func_t>(
Expand Down
65 changes: 41 additions & 24 deletions test/gtest/uct/uct_test.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -931,11 +931,12 @@ uct_test::entity::entity(const resource& resource, uct_md_config_t *md_config,

void uct_test::entity::mem_alloc(size_t length, unsigned mem_flags,
uct_allocated_memory_t *mem,
ucs_memory_type_t mem_type) const
ucs_memory_type_t mem_type,
unsigned num_retries) const
{
void *address = NULL;
uct_md_h uct_md = md();
ucs_status_t status;
void *address = NULL;
uct_md_h uct_md = md();
ucs_status_t status = UCS_OK;
uct_mem_alloc_params_t params;

params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_FLAGS |
Expand All @@ -947,22 +948,38 @@ void uct_test::entity::mem_alloc(size_t length, unsigned mem_flags,
params.mem_type = mem_type;
params.address = address;

if ((md_attr().flags & (UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_REG)) &&
(mem_type == UCS_MEMORY_TYPE_HOST)) {
status = uct_iface_mem_alloc(m_iface, length, mem_flags, "uct_test",
mem);
ASSERT_UCS_OK(status);
} else {
uct_alloc_method_t alloc_methods[] = {UCT_ALLOC_METHOD_MMAP,
UCT_ALLOC_METHOD_MD};
params.field_mask |= UCT_MEM_ALLOC_PARAM_FIELD_MDS;
params.mds.mds = &uct_md;
params.mds.count = 1;
status = uct_mem_alloc(length, alloc_methods,
ucs_static_array_size(alloc_methods), &params,
mem);
ASSERT_UCS_OK(status);
for (unsigned i = 0; i <= num_retries; ++i) {
scoped_log_handler slh(wrap_errors_logger);
if ((md_attr().flags & (UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_REG)) &&
(mem_type == UCS_MEMORY_TYPE_HOST)) {
status = uct_iface_mem_alloc(m_iface, length, mem_flags, "uct_test",
mem);
} else {
uct_alloc_method_t alloc_methods[] = {UCT_ALLOC_METHOD_MMAP,
UCT_ALLOC_METHOD_MD};
params.field_mask |= UCT_MEM_ALLOC_PARAM_FIELD_MDS;
params.mds.mds = &uct_md;
params.mds.count = 1;
status = uct_mem_alloc(length, alloc_methods,
ucs_static_array_size(alloc_methods),
&params, mem);
}

if (status != UCS_ERR_NO_MEMORY) {
break;
}

if (i < num_retries) {
UCS_TEST_MESSAGE << "Retry " << (i + 1) << "/" << num_retries
<< ": Allocation failed - "
<< ucs_status_string(status);
/* Sleep only if there are more retries remaining */
usleep(ucs::rand() % 10000);
}
}

ASSERT_UCS_OK(status);

ucs_assert(mem->mem_type == mem_type);
}

Expand Down Expand Up @@ -1414,16 +1431,16 @@ void uct_test::mapped_buffer::reset()
uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed,
const entity &entity, size_t offset,
ucs_memory_type_t mem_type,
unsigned mem_flags) :
mapped_buffer(size, entity, offset, mem_type, mem_flags)
unsigned mem_flags, unsigned num_retries) :
mapped_buffer(size, entity, offset, mem_type, mem_flags, num_retries)
{
pattern_fill(seed);
}

uct_test::mapped_buffer::mapped_buffer(size_t size,
uct_test::mapped_buffer::mapped_buffer(size_t size,
const entity &entity, size_t offset,
ucs_memory_type_t mem_type,
unsigned mem_flags) :
unsigned mem_flags, unsigned num_retries) :
m_entity(entity)
{
if (size == 0) {
Expand All @@ -1433,7 +1450,7 @@ uct_test::mapped_buffer::mapped_buffer(size_t size,

size_t alloc_size = size + offset;
if ((mem_type == UCS_MEMORY_TYPE_HOST) || (mem_type == UCS_MEMORY_TYPE_RDMA)) {
m_entity.mem_alloc(alloc_size, mem_flags, &m_mem, mem_type);
m_entity.mem_alloc(alloc_size, mem_flags, &m_mem, mem_type, num_retries);
} else {
m_mem.method = UCT_ALLOC_METHOD_LAST;
m_mem.address = mem_buffer::allocate(alloc_size, mem_type);
Expand Down
10 changes: 7 additions & 3 deletions test/gtest/uct/uct_test.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -140,7 +140,8 @@ class uct_test : public testing::TestWithParam<const resource*>,

void mem_alloc(size_t length, unsigned mem_flags,
uct_allocated_memory_t *mem,
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST) const;
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
unsigned num_retries = 0) const;

void mem_free(const uct_allocated_memory_t *mem) const;

Expand Down Expand Up @@ -245,12 +246,15 @@ class uct_test : public testing::TestWithParam<const resource*>,
public:
mapped_buffer(size_t size, const entity &entity, size_t offset = 0,
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL);
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL,
unsigned num_retries = 0);

mapped_buffer(size_t size, uint64_t seed, const entity &entity,
size_t offset = 0,
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL);
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL,
unsigned num_retries = 0);

virtual ~mapped_buffer();

mapped_buffer(mapped_buffer &&other);
Expand Down

0 comments on commit 4a5d339

Please sign in to comment.