@@ -69,6 +69,16 @@ int mca_coll_han_alltoall_using_smsc(
69
69
{
70
70
71
71
mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
72
+ opal_convertor_t convertor ;
73
+ int send_needs_bounce , have_device_buffer ;
74
+ size_t packed_size = 0 ;
75
+ enum {
76
+ BOUNCE_NOT_INITIALIZED = 0 ,
77
+ BOUNCE_IS_FROM_RBUF = 1 ,
78
+ BOUNCE_IS_FROM_FREELIST = 2 ,
79
+ BOUNCE_IS_FROM_MALLOC = 3 ,
80
+ };
81
+
72
82
73
83
OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
74
84
"Entering mca_coll_han_alltoall_using_smsc\n" ));
@@ -82,6 +92,44 @@ int mca_coll_han_alltoall_using_smsc(
82
92
comm , han_module -> previous_alltoall_module );
83
93
}
84
94
95
+ if (sbuf == MPI_IN_PLACE ) {
96
+ /* This is not an in-place algorithm */
97
+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
98
+ comm , han_module -> previous_alltoall_module );
99
+ }
100
+
101
+ OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
102
+ send_needs_bounce = 0 ;
103
+ have_device_buffer = 0 ;
104
+ /* get converter for copying to one of the leader ranks, and get packed size: */
105
+ opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
106
+ have_device_buffer |= opal_convertor_on_device (& convertor );
107
+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
108
+ opal_convertor_cleanup (& convertor );
109
+
110
+ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
111
+ have_device_buffer |= opal_convertor_on_device (& convertor );
112
+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
113
+ opal_convertor_get_packed_size ( & convertor , & packed_size );
114
+ opal_convertor_cleanup (& convertor );
115
+
116
+ if (have_device_buffer ) {
117
+ /*
118
+ Although this algorithm is functional for device buffers, it requires an
119
+ extra copy through the bounce buffer, which makes it inefficient.
120
+ Prefer another algorithm instead.
121
+
122
+ Note that Open MPI assumes that if one rank uses a device
123
+ buffer in a collective, then all ranks will use device buffers, so there
124
+ is no need to communicate before taking this branch.
125
+ */
126
+ OBJ_DESTRUCT (& convertor );
127
+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
128
+ comm , han_module -> previous_alltoall_module );
129
+ }
130
+
131
+
132
+
85
133
/* Create the subcommunicators */
86
134
if ( OMPI_SUCCESS != mca_coll_han_comm_create_new (comm , han_module ) ) {
87
135
opal_output_verbose (1 , mca_coll_han_component .han_output ,
@@ -107,12 +155,11 @@ int mca_coll_han_alltoall_using_smsc(
107
155
comm , han_module -> previous_alltoall_module );
108
156
}
109
157
110
- int rc , send_needs_bounce , ii_push_data ;
158
+ int rc , ii_push_data ;
111
159
size_t sndsize ;
112
160
MPI_Aint sextent , rextent , lb ;
113
- char * send_bounce ;
114
- opal_convertor_t convertor ;
115
- size_t packed_size = 0 , packed_size_tmp ;
161
+ char * send_bounce = NULL ;
162
+ size_t packed_size_tmp ;
116
163
int use_isend ;
117
164
void * gather_buf_in [4 ];
118
165
int up_rank ;
@@ -140,22 +187,6 @@ int mca_coll_han_alltoall_using_smsc(
140
187
}
141
188
if (fanout > up_size ) { fanout = up_size ; }
142
189
143
- OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
144
-
145
-
146
- send_needs_bounce = 0 ;
147
- /* get converter for copying to one of the leader ranks, and get packed size: */
148
- opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
149
- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
150
- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
151
- opal_convertor_cleanup (& convertor );
152
-
153
- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
154
- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
155
- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
156
- opal_convertor_get_packed_size ( & convertor , & packed_size );
157
- opal_convertor_cleanup (& convertor );
158
-
159
190
/*
160
191
Because push-mode needs extra synchronizations, we'd like to avoid it,
161
192
however it might be necessary:
@@ -166,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
166
197
167
198
If the application buffer is device memory, we'll also need to exchange
168
199
in push mode so that the process which has device registrations can
169
- perform the reads.
200
+ perform the reads. (This case is currently disabled: device buffers fall back to the previous alltoall module.)
170
201
171
202
In both of these cases, we'll need to use the bounce buffer too.
172
203
*/
@@ -186,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
186
217
inter_recv_reqs = malloc (sizeof (* inter_recv_reqs ) * up_size );
187
218
char * * low_bufs = malloc (low_size * sizeof (* low_bufs ));
188
219
void * * sbuf_map_ctx = malloc (low_size * sizeof (& sbuf_map_ctx ));
220
+ opal_free_list_item_t * send_fl_item = NULL ;
189
221
190
222
const int nptrs_gather = 3 ;
191
223
void * * gather_buf_out = calloc (low_size * nptrs_gather , sizeof (void * ));
192
- bool send_bounce_is_allocated = false ;
224
+ int send_bounce_status = BOUNCE_NOT_INITIALIZED ;
193
225
194
226
do {
195
227
start_allgather :
196
228
if ( 0 == send_needs_bounce ) {
197
229
send_bounce = (char * )rbuf + up_rank * send_bytes_per_fan ;
230
+ send_bounce_status = BOUNCE_IS_FROM_RBUF ;
198
231
} else {
199
- if (!send_bounce_is_allocated ) {
200
- send_bounce = malloc (send_bytes_per_fan * fanout );
201
- send_bounce_is_allocated = true;
232
+ if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF ) {
233
+ if (send_bytes_per_fan * fanout < mca_coll_han_component .han_packbuf_bytes ) {
234
+ send_fl_item = opal_free_list_get (& mca_coll_han_component .pack_buffers );
235
+ if (send_fl_item ) {
236
+ send_bounce_status = BOUNCE_IS_FROM_FREELIST ;
237
+ send_bounce = send_fl_item -> ptr ;
238
+ }
239
+ }
240
+ if (!send_fl_item ) {
241
+ send_bounce = malloc (send_bytes_per_fan * fanout );
242
+ send_bounce_status = BOUNCE_IS_FROM_MALLOC ;
243
+ }
202
244
}
203
245
}
204
246
@@ -384,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
384
426
}
385
427
}
386
428
OBJ_DESTRUCT (& convertor );
387
- if (send_bounce_is_allocated ) free (send_bounce );
429
+ if (send_bounce_status == BOUNCE_IS_FROM_FREELIST ) {
430
+ opal_free_list_return (& mca_coll_han_component .pack_buffers , send_fl_item );
431
+ } else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC ) {
432
+ free (send_bounce );
433
+ }
388
434
free (inter_send_reqs );
389
435
free (inter_recv_reqs );
390
436
free (sbuf_map_ctx );
0 commit comments