Skip to content

Commit c1be6c7

Browse files
committed
prov/efa: dmabuf try / fallback logic
feat: Add a default dmabuf attempt with fallback logic for all efa_hmem_ifaces. Problem: - How to make dmabuf usage the default going forward while keeping a fallback mechanism when dmabuf is not supported. Solution: - Modified initial PR from @jiaxiyan at 6aa6708#diff-9b57a9410ed94ed1f1aea837412e68bbe9b49582edce813ba352fffb37dcc007 - Added dmabuf_supported_by_device_b flag to the efa_hmem_info structure in prov/efa/efa_hmem.h - Updated dmabuf_supported_by_device_b in each fi_hmem_iface type's p2p_support function in prov/efa/efa_hmem.c - Removed per-fi_hmem_iface-type checks in prov/efa/efa_mr_reg_ibv_mr.c Testing: - Ran MPI perf tests on 2 nodes on p5en with the dmabuf and fallback options hard set - Ran MPI perf tests on 16 nodes on p5en with the dmabuf and fallback options hard set Sim Issue: - N/A Signed-off-by: Nick Mazzilli <[email protected]>
1 parent 48e09ea commit c1be6c7

File tree

5 files changed

+119
-34
lines changed

5 files changed

+119
-34
lines changed

prov/efa/src/efa_env.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ struct efa_env efa_env = {
4141
.internal_rx_refill_threshold = 8,
4242
.use_data_path_direct = true,
4343
.implicit_av_size = 0,
44+
45+
/* HMEM DMABUF defaults */
46+
.hmem_system_use_dmabuf = 0,
47+
.hmem_cuda_use_dmabuf = 1,
48+
.hmem_neuron_use_dmabuf = 1,
49+
.hmem_synapseai_use_dmabuf = 1,
50+
.hmem_rocr_use_dmabuf = 1,
4451
};
4552

4653
/* @brief Read and store the FI_EFA_* environment variables.
@@ -141,6 +148,14 @@ void efa_env_param_get(void)
141148
}
142149
fi_param_get_bool(&efa_prov, "use_data_path_direct", &efa_env.use_data_path_direct);
143150

151+
/* HMEM DMABUF parameters */
152+
fi_param_get_bool(&efa_prov, "hmem_system_use_dmabuf", &efa_env.hmem_system_use_dmabuf);
153+
fi_param_get_bool(&efa_prov, "hmem_neuron_use_dmabuf", &efa_env.hmem_neuron_use_dmabuf);
154+
fi_param_get_bool(&efa_prov, "hmem_synapseai_use_dmabuf", &efa_env.hmem_synapseai_use_dmabuf);
155+
/* Use core libfabric parameters for CUDA and ROCR */
156+
fi_param_get_bool(NULL, "hmem_cuda_use_dmabuf", &efa_env.hmem_cuda_use_dmabuf);
157+
fi_param_get_bool(NULL, "hmem_rocr_use_dmabuf", &efa_env.hmem_rocr_use_dmabuf);
158+
144159
efa_fork_support_request_initialize();
145160
}
146161

@@ -184,6 +199,15 @@ void efa_env_define()
184199
"Set the maximum number of receive operations before the provider returns -FI_EAGAIN.");
185200
fi_param_define(&efa_prov, "rx_copy_unexp", FI_PARAM_BOOL,
186201
"Enables the use of a separate pool of bounce-buffers to copy unexpected messages out of the pre-posted receive buffers. (Default: 1)");
202+
203+
/* HMEM DMABUF environment variables */
204+
fi_param_define(&efa_prov, "hmem_system_use_dmabuf", FI_PARAM_BOOL,
205+
"Use DMABUF for system memory registration. (Default: 0)");
206+
fi_param_define(&efa_prov, "hmem_neuron_use_dmabuf", FI_PARAM_BOOL,
207+
"Use DMABUF for Neuron device memory registration. (Default: 1)");
208+
fi_param_define(&efa_prov, "hmem_synapseai_use_dmabuf", FI_PARAM_BOOL,
209+
"Use DMABUF for SynapseAI device memory registration. (Default: 1)");
210+
187211
fi_param_define(&efa_prov, "rx_copy_ooo", FI_PARAM_BOOL,
188212
"Enables the use of a separate pool of bounce-buffers to copy out-of-order RTM packets out of the pre-posted receive buffers. (Default: 1)");
189213
fi_param_define(&efa_prov, "max_timeout", FI_PARAM_INT,

prov/efa/src/efa_env.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ struct efa_env {
7575
* Value of 0 means there is no limit on the size.
7676
*/
7777
size_t implicit_av_size;
78+
79+
/* HMEM DMABUF settings */
80+
int hmem_system_use_dmabuf;
81+
int hmem_cuda_use_dmabuf;
82+
int hmem_neuron_use_dmabuf;
83+
int hmem_synapseai_use_dmabuf;
84+
int hmem_rocr_use_dmabuf;
7885
};
7986

8087
extern struct efa_env efa_env;

prov/efa/src/efa_hmem.c

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,32 @@
77

88
struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];
99

10+
/**
11+
* @brief Check if DMABUF is enabled for a specific HMEM interface
12+
*
13+
* This function checks the environment variables to determine if DMABUF
14+
* should be used for the specified HMEM interface. It respects both
15+
* EFA-specific and core libfabric environment variables.
16+
*
17+
* @param[in] iface The HMEM interface to check
18+
* @return true if DMABUF is enabled for the interface, false otherwise
19+
*/
20+
bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface)
21+
{
22+
switch (iface) {
23+
case FI_HMEM_SYSTEM:
24+
return efa_env.hmem_system_use_dmabuf;
25+
case FI_HMEM_CUDA:
26+
return efa_env.hmem_cuda_use_dmabuf;
27+
case FI_HMEM_NEURON:
28+
return efa_env.hmem_neuron_use_dmabuf;
29+
case FI_HMEM_SYNAPSEAI:
30+
return efa_env.hmem_synapseai_use_dmabuf;
31+
default:
32+
return false;
33+
}
34+
}
35+
1036
#if HAVE_CUDA || HAVE_NEURON
1137
static size_t efa_max_eager_msg_size_with_largest_header() {
1238
int mtu_size;
@@ -136,22 +162,28 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in
136162
}
137163

138164
#if HAVE_EFA_DMABUF_MR
139-
ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
140-
if (ret == FI_SUCCESS) {
141-
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
142-
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
143-
(void)cuda_put_dmabuf_fd(dmabuf_fd);
144-
if (!ibv_mr) {
165+
if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_CUDA)) {
166+
ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
167+
if (ret == FI_SUCCESS) {
168+
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
169+
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
170+
(void)cuda_put_dmabuf_fd(dmabuf_fd);
171+
if (!ibv_mr) {
172+
EFA_INFO(FI_LOG_CORE,
173+
"Unable to register CUDA device buffer via dmabuf: %s. "
174+
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
175+
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
176+
} else {
177+
info->dmabuf_supported_by_device_b = true;
178+
}
179+
} else {
145180
EFA_INFO(FI_LOG_CORE,
146-
"Unable to register CUDA device buffer via dmabuf: %s. "
147-
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
181+
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
182+
"Fall back to ibv_reg_mr\n", ret);
148183
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
149184
}
150-
info->dmabuf_supported_by_device_b = true;
151185
} else {
152-
EFA_INFO(FI_LOG_CORE,
153-
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
154-
"Fall back to ibv_reg_mr\n", ret);
186+
EFA_INFO(FI_LOG_CORE, "CUDA DMABUF disabled by environment variable\n");
155187
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
156188
}
157189
#else
@@ -222,16 +254,28 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *
222254
}
223255

224256
#if HAVE_EFA_DMABUF_MR
225-
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
226-
if (ret == FI_SUCCESS) {
227-
ibv_mr = ibv_reg_dmabuf_mr(
228-
ibv_pd, offset,
229-
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
230-
info->dmabuf_supported_by_device_b = true;
231-
} else if (ret == -FI_EOPNOTSUPP) {
232-
EFA_INFO(FI_LOG_MR,
233-
"Unable to retrieve dmabuf fd of Neuron device buffer, "
234-
"Fall back to ibv_reg_mr\n");
257+
if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_NEURON)) {
258+
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
259+
if (ret == FI_SUCCESS) {
260+
ibv_mr = ibv_reg_dmabuf_mr(
261+
ibv_pd, offset,
262+
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
263+
if (ibv_mr) {
264+
info->dmabuf_supported_by_device_b = true;
265+
} else {
266+
EFA_INFO(FI_LOG_CORE,
267+
"Unable to register Neuron device buffer via dmabuf: %s. "
268+
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
269+
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
270+
}
271+
} else {
272+
EFA_INFO(FI_LOG_MR,
273+
"Unable to retrieve dmabuf fd of Neuron device buffer: %d. "
274+
"Fall back to ibv_reg_mr\n", ret);
275+
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
276+
}
277+
} else {
278+
EFA_INFO(FI_LOG_CORE, "Neuron DMABUF disabled by environment variable\n");
235279
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
236280
}
237281
#else
@@ -295,9 +339,12 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface)
295339
info->min_read_msg_size = 0;
296340
info->min_read_write_size = 0;
297341

298-
if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) {
342+
if (iface == FI_HMEM_SYNAPSEAI) {
299343
info->p2p_supported_by_device = true;
300344
info->dmabuf_supported_by_device_b = true;
345+
} else if(iface == FI_HMEM_SYSTEM) {
346+
info->p2p_supported_by_device = true;
347+
info->dmabuf_supported_by_device_b = false;
301348
} else if (ofi_hmem_p2p_disabled()) {
302349
info->p2p_supported_by_device = false;
303350
} else {

prov/efa/src/efa_hmem.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,5 @@ static inline int efa_copy_to_hmem(void *desc, void *dest, const void *buff, siz
105105

106106
ssize_t efa_copy_from_hmem_iov(void **desc, char *buff, size_t buff_size, const struct iovec *hmem_iov, size_t iov_count);
107107
ssize_t efa_copy_to_hmem_iov(void **desc, struct iovec *hmem_iov, size_t iov_count, char *buff, size_t buff_size);
108+
bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface);
108109
#endif

prov/efa/src/efa_mr.c

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -550,38 +550,44 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr,
550550
&dmabuf_fd, &offset);
551551

552552
if (ret == FI_SUCCESS) {
553+
struct ibv_mr *dmabuf_mr;
553554
EFA_INFO(FI_LOG_MR,
554555
"Registering dmabuf MR: fd=%d offset=%lu len=%zu\n",
555556
dmabuf_fd, offset, mr_attr->mr_iov->iov_len);
556557

557-
return efa_mr_reg_ibv_dmabuf_mr(
558+
dmabuf_mr = efa_mr_reg_ibv_dmabuf_mr(
558559
efa_mr->domain->ibv_pd, offset,
559560
mr_attr->mr_iov->iov_len,
560561
(uint64_t)mr_attr->mr_iov->iov_base,
561562
dmabuf_fd, access);
563+
564+
/* Close the dmabuf file descriptor - it's no longer needed after registration */
565+
(void) ofi_hmem_put_dmabuf_fd(mr_attr->iface, dmabuf_fd);
566+
567+
if (dmabuf_mr) {
568+
return dmabuf_mr;
569+
}
570+
571+
EFA_INFO(FI_LOG_MR, "DMABUF registration failed, returning NULL\n");
562572
}
563573

564574
if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOSYS) {
565575
EFA_WARN(FI_LOG_MR,
566-
"dmabuf not supported at runtime for iface=%d, disabling\n",
576+
"dmabuf not supported at runtime for iface=%d \n",
567577
mr_attr->iface);
568-
g_efa_hmem_info[mr_attr->iface].dmabuf_supported_by_device_b = false;
569578
} else {
570579
EFA_WARN(FI_LOG_MR,
571580
"ofi_hmem_get_dmabuf_fd failed: ret=%d (%s)\n",
572581
ret, fi_strerror(-ret));
573582
}
574583
/* fall through to ibv_reg_mr */
575-
}
576-
577-
/* Fallback: plain ibv_reg_mr */
578-
EFA_WARN(FI_LOG_MR,
579-
"Fallback ibv_reg_mr: addr=%p len=%zu\n",
580-
mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len);
581-
582-
return ibv_reg_mr(efa_mr->domain->ibv_pd,
584+
} else {
585+
return ibv_reg_mr(efa_mr->domain->ibv_pd,
583586
(void *)mr_attr->mr_iov->iov_base,
584587
mr_attr->mr_iov->iov_len, access);
588+
}
589+
590+
return NULL;
585591
}
586592

587593
#if HAVE_CUDA

0 commit comments

Comments
 (0)