Skip to content

Commit b12149a

Browse files
authored
Merge pull request #13200 from lrbison/alltoall_disables
coll/han: disable alltoall for device buffers and MPI_IN_PLACE
2 parents 2ed4f4f + 7b7269d commit b12149a

File tree

1 file changed

+72
-26
lines changed

1 file changed

+72
-26
lines changed

ompi/mca/coll/han/coll_han_alltoall.c

Lines changed: 72 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,16 @@ int mca_coll_han_alltoall_using_smsc(
6969
{
7070

7171
mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
72+
opal_convertor_t convertor;
73+
int send_needs_bounce, have_device_buffer;
74+
size_t packed_size = 0;
75+
enum {
76+
BOUNCE_NOT_INITIALIZED = 0,
77+
BOUNCE_IS_FROM_RBUF = 1,
78+
BOUNCE_IS_FROM_FREELIST = 2,
79+
BOUNCE_IS_FROM_MALLOC = 3,
80+
};
81+
7282

7383
OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
7484
"Entering mca_coll_han_alltoall_using_smsc\n"));
@@ -82,6 +92,44 @@ int mca_coll_han_alltoall_using_smsc(
8292
comm, han_module->previous_alltoall_module);
8393
}
8494

95+
if (sbuf == MPI_IN_PLACE) {
96+
/* This is not an in-place algorithm */
97+
return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
98+
comm, han_module->previous_alltoall_module);
99+
}
100+
101+
OBJ_CONSTRUCT( &convertor, opal_convertor_t );
102+
send_needs_bounce = 0;
103+
have_device_buffer = 0;
104+
/* get converter for copying to one of the leader ranks, and get packed size: */
105+
opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
106+
have_device_buffer |= opal_convertor_on_device(&convertor);
107+
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
108+
opal_convertor_cleanup(&convertor);
109+
110+
opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
111+
have_device_buffer |= opal_convertor_on_device(&convertor);
112+
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
113+
opal_convertor_get_packed_size( &convertor, &packed_size );
114+
opal_convertor_cleanup(&convertor);
115+
116+
if (have_device_buffer) {
117+
/*
118+
Although this algorithm is functional for device buffers, it requires an
119+
extra copy through the bounce buffer, which makes it inefficient.
120+
Prefer another algorithm instead.
121+
122+
Note that Open MPI assumes that if one rank uses a device
123+
buffer in a collective, then all ranks will use device buffers, so there
124+
is no need to communicate before taking this branch.
125+
*/
126+
OBJ_DESTRUCT(&convertor);
127+
return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
128+
comm, han_module->previous_alltoall_module);
129+
}
130+
131+
132+
85133
/* Create the subcommunicators */
86134
if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
87135
opal_output_verbose(1, mca_coll_han_component.han_output,
@@ -107,12 +155,11 @@ int mca_coll_han_alltoall_using_smsc(
107155
comm, han_module->previous_alltoall_module);
108156
}
109157

110-
int rc, send_needs_bounce, ii_push_data;
158+
int rc, ii_push_data;
111159
size_t sndsize;
112160
MPI_Aint sextent, rextent, lb;
113-
char *send_bounce;
114-
opal_convertor_t convertor;
115-
size_t packed_size = 0, packed_size_tmp;
161+
char *send_bounce = NULL;
162+
size_t packed_size_tmp;
116163
int use_isend;
117164
void *gather_buf_in[4];
118165
int up_rank;
@@ -140,22 +187,6 @@ int mca_coll_han_alltoall_using_smsc(
140187
}
141188
if (fanout > up_size) { fanout = up_size; }
142189

143-
OBJ_CONSTRUCT( &convertor, opal_convertor_t );
144-
145-
146-
send_needs_bounce = 0;
147-
/* get converter for copying to one of the leader ranks, and get packed size: */
148-
opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
149-
send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
150-
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
151-
opal_convertor_cleanup(&convertor);
152-
153-
opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
154-
send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
155-
send_needs_bounce |= opal_convertor_need_buffers(&convertor);
156-
opal_convertor_get_packed_size( &convertor, &packed_size );
157-
opal_convertor_cleanup(&convertor);
158-
159190
/*
160191
Because push-mode needs extra synchronizations, we'd like to avoid it,
161192
however it might be necessary:
@@ -166,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
166197
167198
If the application buffer is device memory, we'll also need to exchange
168199
in push mode so that the process which has device registrations can
169-
perform the reads.
200+
perform the reads. (Device-buffer push mode is now disabled: calls with device buffers fall back to the previous alltoall module.)
170201
171202
In both of these cases, we'll need to use the bounce buffer too.
172203
*/
@@ -186,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
186217
inter_recv_reqs = malloc(sizeof(*inter_recv_reqs) * up_size );
187218
char **low_bufs = malloc(low_size * sizeof(*low_bufs));
188219
void **sbuf_map_ctx = malloc(low_size * sizeof(&sbuf_map_ctx));
220+
opal_free_list_item_t *send_fl_item = NULL;
189221

190222
const int nptrs_gather = 3;
191223
void **gather_buf_out = calloc(low_size*nptrs_gather, sizeof(void*));
192-
bool send_bounce_is_allocated = false;
224+
int send_bounce_status = BOUNCE_NOT_INITIALIZED;
193225

194226
do {
195227
start_allgather:
196228
if ( 0 == send_needs_bounce ) {
197229
send_bounce = (char*)rbuf + up_rank*send_bytes_per_fan;
230+
send_bounce_status = BOUNCE_IS_FROM_RBUF;
198231
} else {
199-
if (!send_bounce_is_allocated) {
200-
send_bounce = malloc(send_bytes_per_fan * fanout);
201-
send_bounce_is_allocated = true;
232+
if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF) {
233+
if (send_bytes_per_fan * fanout < mca_coll_han_component.han_packbuf_bytes) {
234+
send_fl_item = opal_free_list_get(&mca_coll_han_component.pack_buffers);
235+
if (send_fl_item) {
236+
send_bounce_status = BOUNCE_IS_FROM_FREELIST;
237+
send_bounce = send_fl_item->ptr;
238+
}
239+
}
240+
if (!send_fl_item) {
241+
send_bounce = malloc(send_bytes_per_fan * fanout);
242+
send_bounce_status = BOUNCE_IS_FROM_MALLOC;
243+
}
202244
}
203245
}
204246

@@ -384,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
384426
}
385427
}
386428
OBJ_DESTRUCT(&convertor);
387-
if (send_bounce_is_allocated) free(send_bounce);
429+
if (send_bounce_status == BOUNCE_IS_FROM_FREELIST) {
430+
opal_free_list_return(&mca_coll_han_component.pack_buffers, send_fl_item);
431+
} else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC) {
432+
free(send_bounce);
433+
}
388434
free(inter_send_reqs);
389435
free(inter_recv_reqs);
390436
free(sbuf_map_ctx);

0 commit comments

Comments
 (0)