Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions include/ofi_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd,
uint64_t *offset);
int rocr_hmem_put_dmabuf_fd(int fd);
bool rocr_is_dmabuf_requested(void);

int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
Expand All @@ -193,6 +194,7 @@ bool cuda_is_ipc_enabled(void);
int cuda_get_ipc_handle_size(size_t *size);
bool cuda_is_gdrcopy_enabled(void);
bool cuda_is_dmabuf_supported(void);
bool cuda_is_dmabuf_requested(void);
int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
uint64_t *offset);
int cuda_put_dmabuf_fd(int fd);
Expand Down Expand Up @@ -257,6 +259,7 @@ void neuron_free(void **handle);
int neuron_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
uint64_t *offset);
int neuron_put_dmabuf_fd(int fd);
bool neuron_is_dmabuf_requested(void);

int synapseai_init(void);
int synapseai_cleanup(void);
Expand All @@ -270,6 +273,7 @@ bool synapseai_is_addr_valid(const void *addr, uint64_t *device,
uint64_t *flags);
int synapseai_host_register(void *ptr, size_t size);
int synapseai_host_unregister(void *ptr);
bool synapseai_is_dmabuf_requested(void);

static inline int ofi_memcpy(uint64_t device, void *dest, const void *src,
size_t size)
Expand Down
105 changes: 84 additions & 21 deletions prov/efa/src/efa_hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,34 @@

struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];

/**
* @brief Check if DMABUF is enabled for a specific HMEM interface
*
* This function checks the environment variables to determine if DMABUF
* should be used for the specified HMEM interface. It respects both
* EFA-specific and core libfabric environment variables.
*
* @param[in] iface The HMEM interface to check
* @return true if DMABUF is enabled for the interface, false otherwise
*/
bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add a method to the common hmem interface and define an ofi_hmem_is_dmabuf_requested function in src/hmem.c. That would make the code reusable for other providers. We would also need to add the function for FI_HMEM_ZE if going this route.

{
switch (iface) {
case FI_HMEM_SYSTEM:
return false;
case FI_HMEM_CUDA:
return cuda_is_dmabuf_requested();
case FI_HMEM_NEURON:
return neuron_is_dmabuf_requested();
case FI_HMEM_SYNAPSEAI:
return synapseai_is_dmabuf_requested();
case FI_HMEM_ROCR:
return rocr_is_dmabuf_requested();
default:
return false;
}
}

#if HAVE_CUDA || HAVE_NEURON
static size_t efa_max_eager_msg_size_with_largest_header() {
int mtu_size;
Expand Down Expand Up @@ -134,21 +162,28 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in
}

#if HAVE_EFA_DMABUF_MR
ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
(void)cuda_put_dmabuf_fd(dmabuf_fd);
if (!ibv_mr) {
if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_CUDA)) {
ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
(void)ofi_hmem_put_dmabuf_fd(FI_HMEM_CUDA, dmabuf_fd);
if (!ibv_mr) {
EFA_INFO(FI_LOG_CORE,
"Unable to register CUDA device buffer via dmabuf: %s. "
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
} else {
info->dmabuf_supported_by_device_b = true;
}
} else {
EFA_INFO(FI_LOG_CORE,
"Unable to register CUDA device buffer via dmabuf: %s. "
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
"Fall back to ibv_reg_mr\n", ret);
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
}
} else {
EFA_INFO(FI_LOG_CORE,
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
"Fall back to ibv_reg_mr\n", ret);
EFA_INFO(FI_LOG_CORE, "FI_HMEM_CUDA DMABUF disabled by environment variable\n");
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
}
#else
Expand Down Expand Up @@ -217,15 +252,29 @@ static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *
}

#if HAVE_EFA_DMABUF_MR
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(
ibv_pd, offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
} else if (ret == -FI_EOPNOTSUPP) {
EFA_INFO(FI_LOG_MR,
"Unable to retrieve dmabuf fd of Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_NEURON)) {
ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_NEURON, ptr, (uint64_t)len, &dmabuf_fd, &offset);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(
ibv_pd, offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
(void)ofi_hmem_put_dmabuf_fd(FI_HMEM_NEURON, dmabuf_fd);
if (!ibv_mr) {
EFA_INFO(FI_LOG_CORE,
"Unable to register Neuron device buffer via dmabuf: %s. "
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
} else {
info->dmabuf_supported_by_device_b = true;
}
} else {
EFA_INFO(FI_LOG_MR,
"Unable to retrieve dmabuf fd of Neuron device buffer: %d. "
"Fall back to ibv_reg_mr\n", ret);
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
}
} else {
EFA_INFO(FI_LOG_CORE, "FI_HMEM_NEURON DMABUF disabled by environment variable\n");
ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access);
}
#else
Expand Down Expand Up @@ -284,9 +333,23 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface)
}

info->initialized = true;
info->max_medium_msg_size = 0;
info->runt_size = 0;
info->min_read_msg_size = 0;
info->min_read_write_size = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These default to 0 and are set in efa_domain_hmem_info_init_protocol_thresholds.

info->dmabuf_supported_by_device_b = false;

if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) {
if (iface == FI_HMEM_SYNAPSEAI) {
info->p2p_supported_by_device = true;
if (efa_hmem_is_dmabuf_env_var_enabled(FI_HMEM_SYNAPSEAI)) {
info->dmabuf_supported_by_device_b = true;
} else {
EFA_INFO(FI_LOG_CORE, "FI_HMEM_SYNAPSEAI DMABUF disabled by environment variable\n");
info->dmabuf_supported_by_device_b = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to set it again here

}
} else if(iface == FI_HMEM_SYSTEM) {
info->p2p_supported_by_device = true;
info->dmabuf_supported_by_device_b = false;
} else if (ofi_hmem_p2p_disabled()) {
info->p2p_supported_by_device = false;
} else {
Expand Down
2 changes: 2 additions & 0 deletions prov/efa/src/efa_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = {
struct efa_hmem_info {
bool initialized; /* do we support it at all */
bool p2p_supported_by_device; /* do we support p2p with this device */
bool dmabuf_supported_by_device_b; /* do we support dmabuf with this device */

size_t max_medium_msg_size;
size_t runt_size;
Expand Down Expand Up @@ -104,4 +105,5 @@ static inline int efa_copy_to_hmem(void *desc, void *dest, const void *buff, siz

ssize_t efa_copy_from_hmem_iov(void **desc, char *buff, size_t buff_size, const struct iovec *hmem_iov, size_t iov_count);
ssize_t efa_copy_to_hmem_iov(void **desc, struct iovec *hmem_iov, size_t iov_count, char *buff, size_t buff_size);
bool efa_hmem_is_dmabuf_env_var_enabled(enum fi_hmem_iface iface);
#endif
136 changes: 68 additions & 68 deletions prov/efa/src/efa_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,77 +506,77 @@ struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
* @param flags flags in fi_mr_reg/fi_mr_regattr
* @return struct ibv_mr* the ptr to the registered MR
*/
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr,
int access, const uint64_t flags)
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr,
struct fi_mr_attr *mr_attr,
int access, const uint64_t flags)
{
int dmabuf_fd;
uint64_t offset;
int ret;
struct ibv_mr *ibv_mr;

if (flags & FI_MR_DMABUF)
return efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd,
mr_attr->dmabuf->offset,
mr_attr->dmabuf->len,
(uintptr_t) mr_attr->dmabuf->base_addr + mr_attr->dmabuf->offset,
mr_attr->dmabuf->fd,
access
);

if (efa_mr_is_synapseai(efa_mr)) {
ret = ofi_hmem_get_dmabuf_fd(efa_mr->peer.iface,
mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd, &offset);
if (ret != FI_SUCCESS) {
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
return NULL;
}
return efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd, offset,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
}

/*
* TODO: need such fallback for cuda as well when
* FI_CUDA_API_PERMITTED is true
*/
if (efa_mr_is_neuron(efa_mr)) {
ret = ofi_hmem_get_dmabuf_fd(
efa_mr->peer.iface,
mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len,
&dmabuf_fd,
&offset);

if (ret == FI_SUCCESS) {
/* Success => invoke ibv_reg_dmabuf_mr */
ibv_mr = efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd, 0,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
(void) ofi_hmem_put_dmabuf_fd(efa_mr->peer.iface, dmabuf_fd);
return ibv_mr;
} else if (ret == -FI_EOPNOTSUPP) {
/* Protocol not available => fallback */
EFA_INFO(FI_LOG_MR,
"Unable to get dmabuf fd for Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
return ibv_reg_mr(
efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}
return NULL;
int dmabuf_fd;
uint64_t offset;
int ret;

/* Explicit dmabuf registration */
if (flags & FI_MR_DMABUF) {
if (!mr_attr->dmabuf) {
EFA_WARN(FI_LOG_MR, "FI_MR_DMABUF set but mr_attr->dmabuf == NULL\n");
return NULL;
}
if (!g_efa_hmem_info[efa_mr->peer.iface].dmabuf_supported_by_device_b) {
EFA_WARN(FI_LOG_MR,
"Requested FI_MR_DMABUF, but dmabuf not supported for iface=%d\n",
efa_mr->peer.iface);
return NULL;
}

EFA_INFO(FI_LOG_MR,
"FI_MR_DMABUF: fd=%d offset=%lu len=%zu\n",
mr_attr->dmabuf->fd, mr_attr->dmabuf->offset,
mr_attr->dmabuf->len);

return efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd,
mr_attr->dmabuf->offset,
mr_attr->dmabuf->len,
(uintptr_t) mr_attr->dmabuf->base_addr + mr_attr->dmabuf->offset,
mr_attr->dmabuf->fd,
access);
}

/* Implicit VA path with dmabuf-first */
if (g_efa_hmem_info[efa_mr->peer.iface].dmabuf_supported_by_device_b) {
ret = ofi_hmem_get_dmabuf_fd(
efa_mr->peer.iface,
mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd, &offset);

if (ret == FI_SUCCESS) {
struct ibv_mr *dmabuf_mr;
EFA_INFO(FI_LOG_MR,
"Registering dmabuf MR: fd=%d offset=%lu len=%zu\n",
dmabuf_fd, offset, mr_attr->mr_iov->iov_len);

dmabuf_mr = efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd, 0,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);

/* Close the dmabuf file descriptor - it's no longer needed after registration */
(void) ofi_hmem_put_dmabuf_fd(efa_mr->peer.iface, dmabuf_fd);

return dmabuf_mr;
}

EFA_WARN(FI_LOG_MR,
"ofi_hmem_get_dmabuf_fd failed for iface=%d: ret=%d (%s)\n",
mr_attr->iface, ret, fi_strerror(-ret));
} else {
return ibv_reg_mr(efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}

return ibv_reg_mr(efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
return NULL;
}

#if HAVE_CUDA
Expand Down
16 changes: 12 additions & 4 deletions src/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ int cuda_hmem_init(void)
"this variable is not checked. (default: true)");

fi_param_define(NULL, "hmem_cuda_use_dmabuf", FI_PARAM_BOOL,
"Use dma-buf for sharing buffer with hardware. (default:true)");
"Use dma-buf for sharing buffer with hardware. (default: true)");

ret = cuda_hmem_dl_init();
if (ret != FI_SUCCESS)
Expand Down Expand Up @@ -959,13 +959,16 @@ bool cuda_is_gdrcopy_enabled(void)
return cuda_attr.use_gdrcopy;
}

bool cuda_is_dmabuf_supported(void)
/*
 * Report whether dma-buf usage has been requested for CUDA buffers.
 *
 * Looks up the boolean "hmem_cuda_use_dmabuf" parameter. The local flag is
 * seeded with 1, so the answer is true unless the lookup overwrites it with
 * zero (the parameter is documented as "default: true").
 *
 * NOTE(review): the return code of fi_param_get_bool() is deliberately not
 * inspected here; presumably it leaves the output untouched when the
 * parameter is unset - confirm against the fi_param_get documentation.
 *
 * Returns true when dma-buf is requested, false otherwise.
 */
bool cuda_is_dmabuf_requested(void)
{
	int requested = 1;

	fi_param_get_bool(NULL, "hmem_cuda_use_dmabuf", &requested);

	return requested ? true : false;
}

return use_dmabuf && cuda_attr.dmabuf_supported;
/*
 * Report whether dma-buf can actually be used for CUDA buffers.
 *
 * Two conditions must both hold: dma-buf has to be requested (see
 * cuda_is_dmabuf_requested()) and cuda_attr.dmabuf_supported has to be set.
 * The request check is evaluated first; when it fails, cuda_attr is not
 * consulted at all.
 */
bool cuda_is_dmabuf_supported(void)
{
	if (!cuda_is_dmabuf_requested())
		return false;

	return cuda_attr.dmabuf_supported ? true : false;
}

#else
Expand Down Expand Up @@ -1068,6 +1071,11 @@ bool cuda_is_dmabuf_supported(void)
return false;
}

/*
 * Stub variant of cuda_is_dmabuf_requested() that lives in the #else branch
 * of this file's conditional section: it unconditionally reports that
 * dma-buf is not requested.
 *
 * NOTE(review): presumably this is the build without CUDA support - confirm
 * against the matching #if, which is outside this view.
 */
bool cuda_is_dmabuf_requested(void)
{
	const bool requested = false;

	return requested;
}

int cuda_get_dmabuf_fd(const void *addr, uint64_t size, int *fd,
uint64_t *offset)
{
Expand Down
Loading