Skip to content

Commit 7b7269d

Browse files
committed
coll/han: alltoall uses han freelist when possible
As an optimization, when the han alltoall algorithm is forced to allocate a bounce buffer and that buffer is small enough, take it from the han_component.pack_buffers free list instead of allocating it with malloc.

Signed-off-by: Luke Robison <[email protected]>
1 parent de71f5a commit 7b7269d

File tree

1 file changed

+27
-6
lines changed

1 file changed

+27
-6
lines changed

ompi/mca/coll/han/coll_han_alltoall.c

Lines changed: 27 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,12 @@ int mca_coll_han_alltoall_using_smsc(
7272
opal_convertor_t convertor;
7373
int send_needs_bounce, have_device_buffer;
7474
size_t packed_size = 0;
75+
enum {
76+
BOUNCE_NOT_INITIALIZED = 0,
77+
BOUNCE_IS_FROM_RBUF = 1,
78+
BOUNCE_IS_FROM_FREELIST = 2,
79+
BOUNCE_IS_FROM_MALLOC = 3,
80+
};
7581

7682

7783
OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
@@ -191,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
191197
192198
If the application buffer is device memory, we'll also need to exchange
193199
in push mode so that the process which has device registrations can
194-
perform the reads.
200+
perform the reads. (this mode has been disabled)
195201
196202
In both of these cases, we'll need to use the bounce buffer too.
197203
*/
@@ -211,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
211217
inter_recv_reqs = malloc(sizeof(*inter_recv_reqs) * up_size );
212218
char **low_bufs = malloc(low_size * sizeof(*low_bufs));
213219
void **sbuf_map_ctx = malloc(low_size * sizeof(&sbuf_map_ctx));
220+
opal_free_list_item_t *send_fl_item = NULL;
214221

215222
const int nptrs_gather = 3;
216223
void **gather_buf_out = calloc(low_size*nptrs_gather, sizeof(void*));
217-
bool send_bounce_is_allocated = false;
224+
int send_bounce_status = BOUNCE_NOT_INITIALIZED;
218225

219226
do {
220227
start_allgather:
221228
if ( 0 == send_needs_bounce ) {
222229
send_bounce = (char*)rbuf + up_rank*send_bytes_per_fan;
230+
send_bounce_status = BOUNCE_IS_FROM_RBUF;
223231
} else {
224-
if (!send_bounce_is_allocated) {
225-
send_bounce = malloc(send_bytes_per_fan * fanout);
226-
send_bounce_is_allocated = true;
232+
if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF) {
233+
if (send_bytes_per_fan * fanout < mca_coll_han_component.han_packbuf_bytes) {
234+
send_fl_item = opal_free_list_get(&mca_coll_han_component.pack_buffers);
235+
if (send_fl_item) {
236+
send_bounce_status = BOUNCE_IS_FROM_FREELIST;
237+
send_bounce = send_fl_item->ptr;
238+
}
239+
}
240+
if (!send_fl_item) {
241+
send_bounce = malloc(send_bytes_per_fan * fanout);
242+
send_bounce_status = BOUNCE_IS_FROM_MALLOC;
243+
}
227244
}
228245
}
229246

@@ -409,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
409426
}
410427
}
411428
OBJ_DESTRUCT(&convertor);
412-
if (send_bounce_is_allocated) free(send_bounce);
429+
if (send_bounce_status == BOUNCE_IS_FROM_FREELIST) {
430+
opal_free_list_return(&mca_coll_han_component.pack_buffers, send_fl_item);
431+
} else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC) {
432+
free(send_bounce);
433+
}
413434
free(inter_send_reqs);
414435
free(inter_recv_reqs);
415436
free(sbuf_map_ctx);

0 commit comments

Comments
 (0)