
Commit 002320b

prov/efa: Implement dmabuf try/fallback logic
feat: Add default dmabuf attempt with fallback for all efa_hmem_ifaces

Problem:
- Need to make dmabuf usage the default going forward
- Need a fallback mechanism for when dmabuf is not supported or fails

Solution:
- Modified initial PR from @jiaxiyan at 6aa6708
- Added a dmabuf_supported_by_device_b flag to struct efa_hmem_info in prov/efa/src/efa_hmem.h
- Updated dmabuf_supported_by_device_b detection in each fi_hmem_iface p2p_support function in prov/efa/src/efa_hmem.c
- Modified efa_mr_reg_ibv_mr() in prov/efa/src/efa_mr.c to key its dmabuf checks off mr_attr->iface
- Implemented try-dmabuf-first with fallback to ibv_reg_mr in efa_mr_reg_ibv_mr()
- Added environment variable control to enable/disable dmabuf per interface

Testing:
- Ran MPI perf tests on 2 nodes on p5en with dmabuf and fallback
- Ran MPI perf tests on 16 nodes on p5en with dmabuf and fallback
- Verified fallback works when dmabuf is unavailable

Sim Issue:
- N/A

Signed-off-by: Nick Mazzilli <[email protected]>
1 parent 80a80f3 commit 002320b

File tree

3 files changed
+165 -89 lines changed
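The core pattern across all three files is: obtain a dmabuf fd for the device buffer, attempt ibv_reg_dmabuf_mr(), and fall back to plain ibv_reg_mr() when either step fails. Stripped of EFA specifics, the probe logic reduces to the sketch below. This is not part of the commit: get_dmabuf_fd_for_buffer() is a hypothetical stand-in for ofi_hmem_get_dmabuf_fd(), it requires an rdma-core with dmabuf MR support, and the fallback-on-registration-failure mirrors the efa_hmem.c probes rather than the efa_mr.c path.

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <infiniband/verbs.h>

/* Hypothetical stand-in for ofi_hmem_get_dmabuf_fd(): returns 0 and fills
 * in a dmabuf fd plus the buffer's offset within it, or a negative errno. */
extern int get_dmabuf_fd_for_buffer(void *buf, size_t len, int *fd,
				    uint64_t *offset);

/* Try dmabuf registration first; fall back to ibv_reg_mr() when no fd can
 * be obtained or the dmabuf registration itself fails. */
static struct ibv_mr *reg_mr_dmabuf_first(struct ibv_pd *pd, void *buf,
					  size_t len, int access)
{
	struct ibv_mr *mr = NULL;
	uint64_t offset;
	int fd;

	if (get_dmabuf_fd_for_buffer(buf, len, &fd, &offset) == 0) {
		mr = ibv_reg_dmabuf_mr(pd, offset, len, (uintptr_t)buf,
				       fd, access);
		/* The MR keeps its own reference on the dma-buf, so the
		 * fd can be released immediately after registration. */
		close(fd);
	}
	if (!mr)	/* no fd available, or dmabuf registration failed */
		mr = ibv_reg_mr(pd, buf, len, access);
	return mr;
}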

prov/efa/src/efa_hmem.c

Lines changed: 95 additions & 21 deletions
@@ -7,6 +7,34 @@
 
 struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];
 
+/**
+ * @brief Check if DMABUF is enabled for a specific HMEM interface
+ *
+ * This function checks the environment variables to determine if DMABUF
+ * should be used for the specified HMEM interface. It respects both
+ * EFA-specific and core libfabric environment variables.
+ *
+ * @param[in] iface The HMEM interface to check
+ * @return true if DMABUF is enabled for the interface, false otherwise
+ */
+bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface)
+{
+	switch (iface) {
+	case FI_HMEM_SYSTEM:
+		return false;
+	case FI_HMEM_CUDA:
+		return cuda_is_dmabuf_requested();
+	case FI_HMEM_NEURON:
+		return neuron_is_dmabuf_requested();
+	case FI_HMEM_SYNAPSEAI:
+		return synapseai_is_dmabuf_requested();
+	case FI_HMEM_ROCR:
+		return rocr_is_dmabuf_requested();
+	default:
+		return false;
+	}
+}
+
 #if HAVE_CUDA || HAVE_NEURON
 static size_t efa_max_eager_msg_size_with_largest_header() {
 	int mtu_size;
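The per-interface *_is_dmabuf_requested() helpers dispatched to above live in libfabric's core hmem layer and are not part of this diff. As a rough sketch of what such a helper can look like, assuming a boolean fi_param toggle (the parameter name and default below are illustrative, not taken from the patch):

#include <stdbool.h>
#include <rdma/fabric.h>
#include <rdma/fi_ext.h>	/* fi_param_get_bool() */

/* Illustrative only: lazily reads a boolean toggle such as
 * FI_HMEM_CUDA_USE_DMABUF (assumed name; it must have been registered
 * with fi_param_define() elsewhere) and caches the result. */
bool cuda_is_dmabuf_requested(void)
{
	static int use_dmabuf = -1;	/* -1 = not read yet */

	if (use_dmabuf < 0) {
		/* fi_param_get_bool() returns 0 when the variable was
		 * defined and set; default to enabled otherwise. */
		if (fi_param_get_bool(NULL, "hmem_cuda_use_dmabuf",
				      &use_dmabuf))
			use_dmabuf = 1;
	}
	return use_dmabuf != 0;
}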
@@ -118,6 +146,8 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info)
 	int dmabuf_fd;
 	uint64_t dmabuf_offset;
 
+	info->dmabuf_supported_by_device_b = false;
+
 	cuda_ret = ofi_cudaMalloc(&ptr, len);
 	if (cuda_ret != cudaSuccess) {
 		info->initialized = false;
@@ -134,21 +164,28 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info)
 	}
 
 #if HAVE_EFA_DMABUF_MR
-	ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
-	if (ret == FI_SUCCESS) {
-		ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
-					   len, (uint64_t)ptr, dmabuf_fd, ibv_access);
-		(void)cuda_put_dmabuf_fd(dmabuf_fd);
-		if (!ibv_mr) {
+	if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_CUDA)) {
+		ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset);
+		if (ret == FI_SUCCESS) {
+			ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
+						   len, (uint64_t)ptr, dmabuf_fd, ibv_access);
+			(void)ofi_hmem_put_dmabuf_fd(FI_HMEM_CUDA, dmabuf_fd);
+			if (!ibv_mr) {
+				EFA_INFO(FI_LOG_CORE,
+					 "Unable to register CUDA device buffer via dmabuf: %s. "
+					 "Fall back to ibv_reg_mr\n", fi_strerror(-errno));
+				ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
+			} else {
+				info->dmabuf_supported_by_device_b = true;
+			}
+		} else {
 			EFA_INFO(FI_LOG_CORE,
-				"Unable to register CUDA device buffer via dmabuf: %s. "
-				"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
+				 "Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
+				 "Fall back to ibv_reg_mr\n", ret);
 			ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
 		}
 	} else {
-		EFA_INFO(FI_LOG_CORE,
-			"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
-			"Fall back to ibv_reg_mr\n", ret);
+		EFA_INFO(FI_LOG_CORE, "FI_HMEM_CUDA DMABUF disabled by environment variable\n");
 		ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
 	}
 #else
@@ -193,6 +230,8 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info)
 	uint64_t offset;
 	int ret;
 
+	info->dmabuf_supported_by_device_b = false;
+
 	if (g_efa_selected_device_list[0].device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ) {
 		ibv_access |= IBV_ACCESS_REMOTE_READ;
 	}
@@ -217,15 +256,29 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info)
 	}
 
 #if HAVE_EFA_DMABUF_MR
-	ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
-	if (ret == FI_SUCCESS) {
-		ibv_mr = ibv_reg_dmabuf_mr(
-			ibv_pd, offset,
-			len, (uint64_t)ptr, dmabuf_fd, ibv_access);
-	} else if (ret == -FI_EOPNOTSUPP) {
-		EFA_INFO(FI_LOG_MR,
-			"Unable to retrieve dmabuf fd of Neuron device buffer, "
-			"Fall back to ibv_reg_mr\n");
+	if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_NEURON)) {
+		ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_NEURON, ptr, (uint64_t)len, &dmabuf_fd, &offset);
+		if (ret == FI_SUCCESS) {
+			ibv_mr = ibv_reg_dmabuf_mr(
+				ibv_pd, offset,
+				len, (uint64_t)ptr, dmabuf_fd, ibv_access);
+			(void)ofi_hmem_put_dmabuf_fd(FI_HMEM_NEURON, dmabuf_fd);
+			if (ibv_mr) {
+				info->dmabuf_supported_by_device_b = true;
+			} else {
+				EFA_INFO(FI_LOG_CORE,
+					 "Unable to register Neuron device buffer via dmabuf: %s. "
+					 "Fall back to ibv_reg_mr\n", fi_strerror(-errno));
+				ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
+			}
+		} else {
+			EFA_INFO(FI_LOG_MR,
+				 "Unable to retrieve dmabuf fd of Neuron device buffer: %d. "
+				 "Fall back to ibv_reg_mr\n", ret);
+			ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
+		}
+	} else {
+		EFA_INFO(FI_LOG_CORE, "FI_HMEM_NEURON DMABUF disabled by environment variable\n");
 		ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
 	}
 #else
@@ -284,9 +337,30 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface)
 	}
 
 	info->initialized = true;
+	info->max_medium_msg_size = 0;
+	info->runt_size = 0;
+	info->min_read_msg_size = 0;
+	info->min_read_write_size = 0;
 
-	if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) {
+	if (iface == FI_HMEM_SYNAPSEAI) {
+		info->p2p_supported_by_device = true;
+		if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_SYNAPSEAI)) {
+			info->dmabuf_supported_by_device_b = true;
+		} else {
+			EFA_INFO(FI_LOG_CORE, "FI_HMEM_SYNAPSEAI DMABUF disabled by environment variable\n");
+			info->dmabuf_supported_by_device_b = false;
+		}
+	} else if (iface == FI_HMEM_SYSTEM) {
 		info->p2p_supported_by_device = true;
+		info->dmabuf_supported_by_device_b = false;
+	} else if (iface == FI_HMEM_ROCR) {
+		info->p2p_supported_by_device = true;
+		if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_ROCR)) {
+			info->dmabuf_supported_by_device_b = true;
+		} else {
+			EFA_INFO(FI_LOG_CORE, "FI_HMEM_ROCR DMABUF disabled by environment variable\n");
+			info->dmabuf_supported_by_device_b = false;
+		}
 	} else if (ofi_hmem_p2p_disabled()) {
 		info->p2p_supported_by_device = false;
 	} else {

prov/efa/src/efa_hmem.h

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = {
 struct efa_hmem_info {
 	bool initialized; /* do we support it at all */
 	bool p2p_supported_by_device; /* do we support p2p with this device */
+	bool dmabuf_supported_by_device_b; /* do we support dmabuf with this device */
 
 	size_t max_medium_msg_size;
 	size_t runt_size;
@@ -104,4 +105,5 @@
 
 ssize_t efa_copy_from_hmem_iov(void **desc, char *buff, size_t buff_size, const struct iovec *hmem_iov, size_t iov_count);
 ssize_t efa_copy_to_hmem_iov(void **desc, struct iovec *hmem_iov, size_t iov_count, char *buff, size_t buff_size);
+bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface);
 #endif

prov/efa/src/efa_mr.c

Lines changed: 68 additions & 68 deletions
@@ -506,77 +506,77 @@ struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
  * @param flags flags in fi_mr_reg/fi_mr_regattr
  * @return struct ibv_mr* the ptr to the registered MR
  */
-static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr,
-					int access, const uint64_t flags)
+static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr,
+					struct fi_mr_attr *mr_attr,
+					int access, const uint64_t flags)
 {
-	int dmabuf_fd;
-	uint64_t offset;
-	int ret;
-	struct ibv_mr *ibv_mr;
-
-	if (flags & FI_MR_DMABUF)
-		return efa_mr_reg_ibv_dmabuf_mr(
-			efa_mr->domain->ibv_pd,
-			mr_attr->dmabuf->offset,
-			mr_attr->dmabuf->len,
-			(uintptr_t) mr_attr->dmabuf->base_addr + mr_attr->dmabuf->offset,
-			mr_attr->dmabuf->fd,
-			access
-		);
-
-	if (efa_mr_is_synapseai(efa_mr)) {
-		ret = ofi_hmem_get_dmabuf_fd(efa_mr->peer.iface,
-					     mr_attr->mr_iov->iov_base,
-					     (uint64_t) mr_attr->mr_iov->iov_len,
-					     &dmabuf_fd, &offset);
-		if (ret != FI_SUCCESS) {
-			EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
-			return NULL;
-		}
-		return efa_mr_reg_ibv_dmabuf_mr(
-			efa_mr->domain->ibv_pd, offset,
-			mr_attr->mr_iov->iov_len,
-			(uint64_t)mr_attr->mr_iov->iov_base,
-			dmabuf_fd, access);
-	}
-
-	/*
-	 * TODO: need such fallback for cuda as well when
-	 * FI_CUDA_API_PERMITTED is true
-	 */
-	if (efa_mr_is_neuron(efa_mr)) {
-		ret = ofi_hmem_get_dmabuf_fd(
-			efa_mr->peer.iface,
-			mr_attr->mr_iov->iov_base,
-			mr_attr->mr_iov->iov_len,
-			&dmabuf_fd,
-			&offset);
-
-		if (ret == FI_SUCCESS) {
-			/* Success => invoke ibv_reg_dmabuf_mr */
-			ibv_mr = efa_mr_reg_ibv_dmabuf_mr(
-				efa_mr->domain->ibv_pd, 0,
-				mr_attr->mr_iov->iov_len,
-				(uint64_t)mr_attr->mr_iov->iov_base,
-				dmabuf_fd, access);
-			(void) ofi_hmem_put_dmabuf_fd(efa_mr->peer.iface, dmabuf_fd);
-			return ibv_mr;
-		} else if (ret == -FI_EOPNOTSUPP) {
-			/* Protocol not available => fallback */
-			EFA_INFO(FI_LOG_MR,
-				"Unable to get dmabuf fd for Neuron device buffer, "
-				"Fall back to ibv_reg_mr\n");
-			return ibv_reg_mr(
-				efa_mr->domain->ibv_pd,
-				(void *)mr_attr->mr_iov->iov_base,
-				mr_attr->mr_iov->iov_len, access);
-		}
-		return NULL;
+	int dmabuf_fd;
+	uint64_t offset;
+	int ret;
+
+	/* Explicit dmabuf registration */
+	if (flags & FI_MR_DMABUF) {
+		if (!mr_attr->dmabuf) {
+			EFA_WARN(FI_LOG_MR, "FI_MR_DMABUF set but mr_attr->dmabuf == NULL\n");
+			return NULL;
+		}
+		if (!g_efa_hmem_info[mr_attr->iface].dmabuf_supported_by_device_b) {
+			EFA_WARN(FI_LOG_MR,
+				 "Requested FI_MR_DMABUF, but dmabuf not supported for iface=%d\n",
+				 mr_attr->iface);
+			return NULL;
+		}
+
+		EFA_INFO(FI_LOG_MR,
+			 "FI_MR_DMABUF: fd=%d offset=%lu len=%zu\n",
+			 mr_attr->dmabuf->fd, mr_attr->dmabuf->offset,
+			 mr_attr->dmabuf->len);
+
+		return efa_mr_reg_ibv_dmabuf_mr(
+			efa_mr->domain->ibv_pd,
+			mr_attr->dmabuf->offset,
+			mr_attr->dmabuf->len,
+			(uintptr_t) mr_attr->dmabuf->base_addr + mr_attr->dmabuf->offset,
+			mr_attr->dmabuf->fd,
+			access);
+	}
+
+	/* Implicit VA path with dmabuf-first */
+	if (g_efa_hmem_info[mr_attr->iface].dmabuf_supported_by_device_b) {
+		ret = ofi_hmem_get_dmabuf_fd(
+			mr_attr->iface,
+			mr_attr->mr_iov->iov_base,
+			(uint64_t) mr_attr->mr_iov->iov_len,
+			&dmabuf_fd, &offset);
+
+		if (ret == FI_SUCCESS) {
+			struct ibv_mr *dmabuf_mr;
+			EFA_INFO(FI_LOG_MR,
+				 "Registering dmabuf MR: fd=%d offset=%lu len=%zu\n",
+				 dmabuf_fd, offset, mr_attr->mr_iov->iov_len);
+
+			dmabuf_mr = efa_mr_reg_ibv_dmabuf_mr(
+				efa_mr->domain->ibv_pd, offset,
+				mr_attr->mr_iov->iov_len,
+				(uint64_t)mr_attr->mr_iov->iov_base,
+				dmabuf_fd, access);
+
+			/* Close the dmabuf file descriptor - it's no longer needed after registration */
+			(void) ofi_hmem_put_dmabuf_fd(mr_attr->iface, dmabuf_fd);
+
+			return dmabuf_mr;
+		}
+
+		EFA_WARN(FI_LOG_MR,
+			 "ofi_hmem_get_dmabuf_fd failed for iface=%d: ret=%d (%s)\n",
+			 mr_attr->iface, ret, fi_strerror(-ret));
+	} else {
+		return ibv_reg_mr(efa_mr->domain->ibv_pd,
+				  (void *)mr_attr->mr_iov->iov_base,
+				  mr_attr->mr_iov->iov_len, access);
 	}
 
-	return ibv_reg_mr(efa_mr->domain->ibv_pd,
-			  (void *)mr_attr->mr_iov->iov_base,
-			  mr_attr->mr_iov->iov_len, access);
+	return NULL;
 }
 
 #if HAVE_CUDA
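On the application side, the explicit branch guarded at the top of the rewritten efa_mr_reg_ibv_mr() is reached via fi_mr_regattr() with the FI_MR_DMABUF flag (libfabric 1.20+). A hedged usage sketch; the domain, fd, base address, and length are assumed to come from the caller, and FI_HMEM_CUDA is only an example interface:

#include <rdma/fi_domain.h>

/* Register a device buffer that the owning runtime has already exported
 * as a dma-buf. Returns 0 or a negative fi_errno value. */
static int register_dmabuf_mr(struct fid_domain *domain, int dmabuf_fd,
			      void *base_addr, size_t len,
			      struct fid_mr **mr)
{
	struct fi_mr_dmabuf dmabuf = {
		.fd = dmabuf_fd,
		.offset = 0,		/* region offset inside the dma-buf */
		.len = len,
		.base_addr = base_addr,	/* device VA backing the dma-buf */
	};
	struct fi_mr_attr attr = {
		.dmabuf = &dmabuf,	/* FI_MR_DMABUF selects this over mr_iov */
		.iov_count = 1,
		.access = FI_SEND | FI_RECV,
		.iface = FI_HMEM_CUDA,	/* example interface */
	};

	return fi_mr_regattr(domain, &attr, FI_MR_DMABUF, mr);
}

With this commit, EFA forwards attr.dmabuf to ibv_reg_dmabuf_mr() only after confirming dmabuf_supported_by_device_b for attr.iface; otherwise the explicit registration fails with a warning rather than silently falling back.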
