Skip to content

Commit 60a7bc2

Browse files
author
Ralph Castain
committed
Enable the PMIx notification callback system. This currently is only supported by the pmix120 component, which is not selected by default. All other components will ignore error registration requests, and thus do not support debugger attach when launched via mpirun. Note that direct launched applications will support such attachment, but may not do so in a scalable fashion.
Fixes #1225
1 parent c18af0d commit 60a7bc2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+2430
-601
lines changed

ompi/mca/rte/orte/rte_orte.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
9191
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
9292

9393
/* Init and finalize objects and operations */
94-
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
94+
OMPI_DECLSPEC int ompi_rte_init(int *pargc, char ***pargv);
9595
#define ompi_rte_finalize() orte_finalize()
9696
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
9797

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,79 @@
5252

5353
extern ompi_rte_orte_component_t mca_rte_orte_component;
5454

55+
/*
 * Bookkeeping for one errhandler registration request.  The requester
 * sets "active" before issuing the request and register_cbfunc fills in
 * the remaining fields and clears "active" when it is invoked.
 */
typedef struct {
    volatile bool active;   /* true while the request is outstanding */
    int status;             /* status value handed to register_cbfunc */
    int errhandler;         /* errhandler value handed to register_cbfunc */
} errhandler_t;
60+
61+
static void register_cbfunc(int status, int errhndler, void *cbdata)
62+
{
63+
errhandler_t *cd = (errhandler_t*)cbdata;
64+
cd->status = status;
65+
cd->errhandler = errhndler;
66+
cd->active = false;
67+
}
68+
69+
/* stays true until notify_cbfunc is invoked */
static volatile bool wait_for_release = true;

/* errhandler value saved by ompi_rte_init; -1 until one is recorded */
static int errhandler = -1;
71+
72+
static void notify_cbfunc(int status,
73+
opal_list_t *procs,
74+
opal_list_t *info,
75+
opal_pmix_release_cbfunc_t cbfunc,
76+
void *cbdata)
77+
{
78+
if (NULL != cbfunc) {
79+
cbfunc(cbdata);
80+
}
81+
wait_for_release = false;
82+
}
83+
84+
85+
int ompi_rte_init(int *pargc, char ***pargv)
86+
{
87+
int rc;
88+
opal_list_t info;
89+
opal_value_t val;
90+
errhandler_t cd;
91+
92+
if (ORTE_SUCCESS != (rc = orte_init(pargc, pargv, ORTE_PROC_MPI))) {
93+
return rc;
94+
}
95+
96+
if (!orte_standalone_operation) {
97+
/* register to receive any debugger release */
98+
OBJ_CONSTRUCT(&info, opal_list_t);
99+
OBJ_CONSTRUCT(&val, opal_value_t);
100+
val.key = strdup(OPAL_PMIX_ERROR_NAME);
101+
val.type = OPAL_INT;
102+
val.data.integer = OPAL_ERR_DEBUGGER_RELEASE;
103+
opal_list_append(&info, &val.super);
104+
cd.status = ORTE_ERROR;
105+
cd.errhandler = -1;
106+
cd.active = true;
107+
108+
opal_pmix.register_errhandler(&info, notify_cbfunc, register_cbfunc, &cd);
109+
110+
/* let the MPI progress engine run while we wait for
111+
* registration to complete */
112+
OMPI_WAIT_FOR_COMPLETION(cd.active);
113+
/* safely deconstruct the list */
114+
opal_list_remove_first(&info);
115+
OBJ_DESTRUCT(&val);
116+
OBJ_DESTRUCT(&info);
117+
if (OPAL_SUCCESS != cd.status) {
118+
/* ouch - we are doomed */
119+
ORTE_ERROR_LOG(cd.status);
120+
return OMPI_ERROR;
121+
}
122+
errhandler = cd.errhandler;
123+
}
124+
125+
return OMPI_SUCCESS;
126+
}
127+
55128
void ompi_rte_abort(int error_code, char *fmt, ...)
56129
{
57130
va_list arglist;
@@ -100,10 +173,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
100173
* attaching debuggers -- see big comment in
101174
* orte/tools/orterun/debuggers.c explaining the two scenarios.
102175
*/
176+
103177
void ompi_rte_wait_for_debugger(void)
104178
{
105179
int debugger;
106-
orte_rml_recv_cb_t xfer;
107180

108181
/* See lengthy comment in orte/tools/orterun/debuggers.c about
109182
orte_in_parallel_debugger */
@@ -117,12 +190,12 @@ void ompi_rte_wait_for_debugger(void)
117190
/* if not, just return */
118191
return;
119192
}
120-
121193
/* if we are being debugged, then we need to find
122194
* the correct plug-ins
123195
*/
124196
ompi_debugger_setup_dlls();
125197

198+
/* wait for the debugger to attach */
126199
if (orte_standalone_operation) {
127200
/* spin until debugger attaches and releases us */
128201
while (MPIR_debug_gate == 0) {
@@ -133,23 +206,9 @@ void ompi_rte_wait_for_debugger(void)
133206
#endif
134207
}
135208
} else {
136-
/* only the rank=0 proc waits for either a message from the
137-
* HNP or for the debugger to attach - everyone else will just
138-
* spin in * the grpcomm barrier in ompi_mpi_init until rank=0
139-
* joins them.
140-
*/
141-
if (0 != ORTE_PROC_MY_NAME->vpid) {
142-
return;
143-
}
144-
145-
/* VPID 0 waits for a message from the HNP */
146-
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
147-
xfer.active = true;
148-
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
149-
ORTE_RML_TAG_DEBUGGER_RELEASE,
150-
ORTE_RML_NON_PERSISTENT,
151-
orte_rml_recv_callback, &xfer);
152-
/* let the MPI progress engine run while we wait */
153-
OMPI_WAIT_FOR_COMPLETION(xfer.active);
209+
/* now wait for the notification to occur */
210+
OMPI_WAIT_FOR_COMPLETION(wait_for_release);
211+
/* deregister the errhandler */
212+
opal_pmix.deregister_errhandler(errhandler, NULL, NULL);
154213
}
155214
}

opal/dss/dss_compare.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
1413
* Copyright (c) 2014-2016 Research Organization for Information Science
1514
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -449,3 +449,12 @@ int opal_dss_compare_jobid(opal_jobid_t *value1,
449449
return OPAL_EQUAL;
450450
}
451451

452+
/*
 * Compare two status values numerically.
 *
 * Returns OPAL_VALUE1_GREATER or OPAL_VALUE2_GREATER according to
 * which value is larger, or OPAL_EQUAL when they match.  The type
 * argument is not used.
 */
int opal_dss_compare_status(int *value1, int *value2, opal_data_type_t type)
{
    const int lhs = *value1;
    const int rhs = *value2;

    if (lhs == rhs) {
        return OPAL_EQUAL;
    }

    return (lhs > rhs) ? OPAL_VALUE1_GREATER : OPAL_VALUE2_GREATER;
}
460+

opal/dss/dss_copy.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1313
* Copyright (c) 2014-2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* $COPYRIGHT$
@@ -61,6 +61,7 @@ int opal_dss_std_copy(void **dest, void *src, opal_data_type_t type)
6161

6262
case OPAL_INT:
6363
case OPAL_UINT:
64+
case OPAL_STATUS:
6465
datasize = sizeof(int);
6566
break;
6667

opal/dss/dss_internal.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -332,6 +332,9 @@ int opal_dss_pack_jobid(opal_buffer_t *buffer, const void *src,
332332
int opal_dss_pack_vpid(opal_buffer_t *buffer, const void *src,
333333
int32_t num_vals, opal_data_type_t type);
334334

335+
int opal_dss_pack_status(opal_buffer_t *buffer, const void *src,
336+
int32_t num_vals, opal_data_type_t type);
337+
335338
/*
336339
* Internal unpack functions
337340
*/
@@ -401,6 +404,8 @@ int opal_dss_unpack_jobid(opal_buffer_t *buffer, void *dest,
401404
int opal_dss_unpack_vpid(opal_buffer_t *buffer, void *dest,
402405
int32_t *num_vals, opal_data_type_t type);
403406

407+
int opal_dss_unpack_status(opal_buffer_t *buffer, void *dest,
408+
int32_t *num_vals, opal_data_type_t type);
404409

405410
/*
406411
* Internal copy functions
@@ -497,6 +502,8 @@ int opal_dss_compare_jobid(opal_jobid_t *value1,
497502
opal_jobid_t *value2,
498503
opal_data_type_t type);
499504

505+
int opal_dss_compare_status(int *value1, int *value2, opal_data_type_t type);
506+
500507
/*
501508
* Internal print functions
502509
*/
@@ -536,6 +543,7 @@ int opal_dss_print_time(char **output, char *prefix, time_t *src, opal_data_type
536543
int opal_dss_print_name(char **output, char *prefix, opal_process_name_t *name, opal_data_type_t type);
537544
int opal_dss_print_jobid(char **output, char *prefix, opal_process_name_t *src, opal_data_type_t type);
538545
int opal_dss_print_vpid(char **output, char *prefix, opal_process_name_t *src, opal_data_type_t type);
546+
int opal_dss_print_status(char **output, char *prefix, int *src, opal_data_type_t type);
539547

540548

541549
/*

opal/dss/dss_open_close.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$
@@ -611,6 +611,17 @@ int opal_dss_open(void)
611611
return rc;
612612
}
613613

614+
615+
tmp = OPAL_STATUS;
616+
if (OPAL_SUCCESS != (rc = opal_dss.register_type(opal_dss_pack_status,
617+
opal_dss_unpack_status,
618+
(opal_dss_copy_fn_t)opal_dss_std_copy,
619+
(opal_dss_compare_fn_t)opal_dss_compare_status,
620+
(opal_dss_print_fn_t)opal_dss_print_status,
621+
OPAL_DSS_UNSTRUCTURED,
622+
"OPAL_STATUS", &tmp))) {
623+
return rc;
624+
}
614625
/* All done */
615626

616627
opal_dss_initialized = true;

opal/dss/dss_pack.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2014 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -1240,3 +1240,20 @@ int opal_dss_pack_vpid(opal_buffer_t *buffer, const void *src,
12401240
return ret;
12411241
}
12421242

1243+
/*
1244+
* STATUS
1245+
*/
1246+
/*
 * Pack num_vals status values from src into buffer by delegating to
 * opal_dss_pack_buffer with the OPAL_INT type.  A failure is logged and
 * its code is returned to the caller; the type argument is not used.
 */
int opal_dss_pack_status(opal_buffer_t *buffer, const void *src,
                         int32_t num_vals, opal_data_type_t type)
{
    int rc;

    /* Turn around and pack the real type */
    if (OPAL_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, OPAL_INT))) {
        OPAL_ERROR_LOG(rc);
    }

    return rc;
}
1259+

opal/dss/dss_print.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2014 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -25,6 +25,7 @@
2525
#include "opal_stdint.h"
2626
#include <stdio.h>
2727

28+
#include "opal/util/error.h"
2829
#include "opal/dss/dss_internal.h"
2930

3031
int opal_dss_print(char **output, char *prefix, void *src, opal_data_type_t type)
@@ -1060,3 +1061,29 @@ int opal_dss_print_vpid(char **output, char *prefix,
10601061

10611062
return OPAL_SUCCESS;
10621063
}
1064+
1065+
/*
 * Render a status value as a printable string.
 *
 * output - receives a newly allocated string; set to NULL when the
 *          string cannot be allocated
 * prefix - text placed at the start of the string; NULL is treated as " "
 * src    - status value to print via opal_strerror; when NULL, the
 *          string reports a NULL pointer instead
 * type   - not used
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int opal_dss_print_status(char **output, char *prefix,
                          int *src, opal_data_type_t type)
{
    /* a NULL prefix only needs a constant default, so nothing has to be
     * allocated (or freed) for it */
    const char *prefx = (NULL == prefix) ? " " : prefix;
    int rc;

    if (NULL == src) {
        /* no value to print - just report the data type */
        rc = asprintf(output, "%sData type: OPAL_STATUS\tValue: NULL pointer", prefx);
    } else {
        rc = asprintf(output, "%sData type: OPAL_STATUS\tValue: %s", prefx, opal_strerror(*src));
    }

    if (0 > rc) {
        /* asprintf leaves *output undefined on failure; hand back a
         * well-defined NULL rather than a garbage pointer */
        *output = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    return OPAL_SUCCESS;
}

opal/dss/dss_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights
1515
* reserved.
16-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
1716
* Copyright (c) 2014-2016 Research Organization for Information Science
1817
* and Technology (RIST). All rights reserved.
18+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -120,6 +120,8 @@ typedef struct {
120120
#define OPAL_NAME (opal_data_type_t) 50
121121
#define OPAL_JOBID (opal_data_type_t) 51
122122
#define OPAL_VPID (opal_data_type_t) 52
123+
#define OPAL_STATUS (opal_data_type_t) 53
124+
123125
/* OPAL Dynamic */
124126
#define OPAL_DSS_ID_DYNAMIC (opal_data_type_t) 100
125127

@@ -245,6 +247,7 @@ typedef struct {
245247
float fval;
246248
double dval;
247249
struct timeval tv;
250+
int status;
248251
opal_process_name_t name;
249252
opal_bool_array_t flag_array;
250253
opal_uint8_array_t byte_array;

opal/dss/dss_unpack.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012-2015 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014-2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$
@@ -1519,3 +1519,20 @@ int opal_dss_unpack_vpid(opal_buffer_t *buffer, void *dest,
15191519

15201520
return ret;
15211521
}
1522+
1523+
/*
1524+
* STATUS
1525+
*/
1526+
/*
 * Unpack status values from buffer into dest by delegating to
 * opal_dss_unpack_buffer with the OPAL_INT type; num_vals is passed
 * through unchanged.  A failure is logged and its code is returned to
 * the caller; the type argument is not used.
 */
int opal_dss_unpack_status(opal_buffer_t *buffer, void *dest,
                           int32_t *num_vals, opal_data_type_t type)
{
    int rc;

    /* Turn around and unpack the real type */
    if (OPAL_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, dest, num_vals, OPAL_INT))) {
        OPAL_ERROR_LOG(rc);
    }

    return rc;
}

0 commit comments

Comments
 (0)