Skip to content

Commit 142e38c

Browse files
author
rhc54
committed
Merge pull request #1358 from rhc54/topic/notification
Enable the PMIx notification callback system and fix debugger attach
2 parents c18af0d + 60a7bc2 commit 142e38c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+2430
-601
lines changed

ompi/mca/rte/orte/rte_orte.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
9191
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
9292

9393
/* Init and finalize objects and operations */
94-
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
94+
OMPI_DECLSPEC int ompi_rte_init(int *pargc, char ***pargv);
9595
#define ompi_rte_finalize() orte_finalize()
9696
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
9797

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,79 @@
5252

5353
extern ompi_rte_orte_component_t mca_rte_orte_component;
5454

55+
/*
 * Tracking object handed to the PMIx errhandler registration call as
 * its callback data. register_cbfunc() fills it in.
 */
typedef struct {
    /* set to true before registering; cleared by register_cbfunc()
     * once the registration result has been recorded */
    volatile bool active;
    /* status value delivered to the registration callback */
    int status;
    /* errhandler reference delivered to the registration callback */
    int errhandler;
} errhandler_t;
60+
61+
static void register_cbfunc(int status, int errhndler, void *cbdata)
62+
{
63+
errhandler_t *cd = (errhandler_t*)cbdata;
64+
cd->status = status;
65+
cd->errhandler = errhndler;
66+
cd->active = false;
67+
}
68+
69+
/* remains true until notify_cbfunc() has been invoked */
static volatile bool wait_for_release = true;

/* errhandler reference recorded by ompi_rte_init(); -1 until then */
static int errhandler = -1;
71+
72+
static void notify_cbfunc(int status,
73+
opal_list_t *procs,
74+
opal_list_t *info,
75+
opal_pmix_release_cbfunc_t cbfunc,
76+
void *cbdata)
77+
{
78+
if (NULL != cbfunc) {
79+
cbfunc(cbdata);
80+
}
81+
wait_for_release = false;
82+
}
83+
84+
85+
int ompi_rte_init(int *pargc, char ***pargv)
86+
{
87+
int rc;
88+
opal_list_t info;
89+
opal_value_t val;
90+
errhandler_t cd;
91+
92+
if (ORTE_SUCCESS != (rc = orte_init(pargc, pargv, ORTE_PROC_MPI))) {
93+
return rc;
94+
}
95+
96+
if (!orte_standalone_operation) {
97+
/* register to receive any debugger release */
98+
OBJ_CONSTRUCT(&info, opal_list_t);
99+
OBJ_CONSTRUCT(&val, opal_value_t);
100+
val.key = strdup(OPAL_PMIX_ERROR_NAME);
101+
val.type = OPAL_INT;
102+
val.data.integer = OPAL_ERR_DEBUGGER_RELEASE;
103+
opal_list_append(&info, &val.super);
104+
cd.status = ORTE_ERROR;
105+
cd.errhandler = -1;
106+
cd.active = true;
107+
108+
opal_pmix.register_errhandler(&info, notify_cbfunc, register_cbfunc, &cd);
109+
110+
/* let the MPI progress engine run while we wait for
111+
* registration to complete */
112+
OMPI_WAIT_FOR_COMPLETION(cd.active);
113+
/* safely deconstruct the list */
114+
opal_list_remove_first(&info);
115+
OBJ_DESTRUCT(&val);
116+
OBJ_DESTRUCT(&info);
117+
if (OPAL_SUCCESS != cd.status) {
118+
/* ouch - we are doomed */
119+
ORTE_ERROR_LOG(cd.status);
120+
return OMPI_ERROR;
121+
}
122+
errhandler = cd.errhandler;
123+
}
124+
125+
return OMPI_SUCCESS;
126+
}
127+
55128
void ompi_rte_abort(int error_code, char *fmt, ...)
56129
{
57130
va_list arglist;
@@ -100,10 +173,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
100173
* attaching debuggers -- see big comment in
101174
* orte/tools/orterun/debuggers.c explaining the two scenarios.
102175
*/
176+
103177
void ompi_rte_wait_for_debugger(void)
104178
{
105179
int debugger;
106-
orte_rml_recv_cb_t xfer;
107180

108181
/* See lengthy comment in orte/tools/orterun/debuggers.c about
109182
orte_in_parallel_debugger */
@@ -117,12 +190,12 @@ void ompi_rte_wait_for_debugger(void)
117190
/* if not, just return */
118191
return;
119192
}
120-
121193
/* if we are being debugged, then we need to find
122194
* the correct plug-ins
123195
*/
124196
ompi_debugger_setup_dlls();
125197

198+
/* wait for the debugger to attach */
126199
if (orte_standalone_operation) {
127200
/* spin until debugger attaches and releases us */
128201
while (MPIR_debug_gate == 0) {
@@ -133,23 +206,9 @@ void ompi_rte_wait_for_debugger(void)
133206
#endif
134207
}
135208
} else {
136-
/* only the rank=0 proc waits for either a message from the
137-
* HNP or for the debugger to attach - everyone else will just
138-
* spin in * the grpcomm barrier in ompi_mpi_init until rank=0
139-
* joins them.
140-
*/
141-
if (0 != ORTE_PROC_MY_NAME->vpid) {
142-
return;
143-
}
144-
145-
/* VPID 0 waits for a message from the HNP */
146-
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
147-
xfer.active = true;
148-
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
149-
ORTE_RML_TAG_DEBUGGER_RELEASE,
150-
ORTE_RML_NON_PERSISTENT,
151-
orte_rml_recv_callback, &xfer);
152-
/* let the MPI progress engine run while we wait */
153-
OMPI_WAIT_FOR_COMPLETION(xfer.active);
209+
/* now wait for the notification to occur */
210+
OMPI_WAIT_FOR_COMPLETION(wait_for_release);
211+
/* deregister the errhandler */
212+
opal_pmix.deregister_errhandler(errhandler, NULL, NULL);
154213
}
155214
}

opal/dss/dss_compare.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
1413
* Copyright (c) 2014-2016 Research Organization for Information Science
1514
* and Technology (RIST). All rights reserved.
15+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -449,3 +449,12 @@ int opal_dss_compare_jobid(opal_jobid_t *value1,
449449
return OPAL_EQUAL;
450450
}
451451

452+
/*
 * Compare two status values as plain integers.
 *
 * value1, value2 - pointers to the values to compare (dereferenced
 *                  unconditionally)
 * type           - not used by this function
 *
 * Returns OPAL_VALUE1_GREATER if *value1 > *value2,
 * OPAL_VALUE2_GREATER if *value2 > *value1, and OPAL_EQUAL otherwise.
 */
int opal_dss_compare_status(int *value1, int *value2, opal_data_type_t type)
{
    const int lhs = *value1;
    const int rhs = *value2;

    if (lhs == rhs) {
        return OPAL_EQUAL;
    }

    return (lhs > rhs) ? OPAL_VALUE1_GREATER : OPAL_VALUE2_GREATER;
}
460+

opal/dss/dss_copy.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1313
* Copyright (c) 2014-2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* $COPYRIGHT$
@@ -61,6 +61,7 @@ int opal_dss_std_copy(void **dest, void *src, opal_data_type_t type)
6161

6262
case OPAL_INT:
6363
case OPAL_UINT:
64+
case OPAL_STATUS:
6465
datasize = sizeof(int);
6566
break;
6667

opal/dss/dss_internal.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@@ -332,6 +332,9 @@ int opal_dss_pack_jobid(opal_buffer_t *buffer, const void *src,
332332
int opal_dss_pack_vpid(opal_buffer_t *buffer, const void *src,
333333
int32_t num_vals, opal_data_type_t type);
334334

335+
int opal_dss_pack_status(opal_buffer_t *buffer, const void *src,
336+
int32_t num_vals, opal_data_type_t type);
337+
335338
/*
336339
* Internal unpack functions
337340
*/
@@ -401,6 +404,8 @@ int opal_dss_unpack_jobid(opal_buffer_t *buffer, void *dest,
401404
int opal_dss_unpack_vpid(opal_buffer_t *buffer, void *dest,
402405
int32_t *num_vals, opal_data_type_t type);
403406

407+
int opal_dss_unpack_status(opal_buffer_t *buffer, void *dest,
408+
int32_t *num_vals, opal_data_type_t type);
404409

405410
/*
406411
* Internal copy functions
@@ -497,6 +502,8 @@ int opal_dss_compare_jobid(opal_jobid_t *value1,
497502
opal_jobid_t *value2,
498503
opal_data_type_t type);
499504

505+
int opal_dss_compare_status(int *value1, int *value2, opal_data_type_t type);
506+
500507
/*
501508
* Internal print functions
502509
*/
@@ -536,6 +543,7 @@ int opal_dss_print_time(char **output, char *prefix, time_t *src, opal_data_type
536543
int opal_dss_print_name(char **output, char *prefix, opal_process_name_t *name, opal_data_type_t type);
537544
int opal_dss_print_jobid(char **output, char *prefix, opal_process_name_t *src, opal_data_type_t type);
538545
int opal_dss_print_vpid(char **output, char *prefix, opal_process_name_t *src, opal_data_type_t type);
546+
int opal_dss_print_status(char **output, char *prefix, int *src, opal_data_type_t type);
539547

540548

541549
/*

opal/dss/dss_open_close.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$
@@ -611,6 +611,17 @@ int opal_dss_open(void)
611611
return rc;
612612
}
613613

614+
615+
tmp = OPAL_STATUS;
616+
if (OPAL_SUCCESS != (rc = opal_dss.register_type(opal_dss_pack_status,
617+
opal_dss_unpack_status,
618+
(opal_dss_copy_fn_t)opal_dss_std_copy,
619+
(opal_dss_compare_fn_t)opal_dss_compare_status,
620+
(opal_dss_print_fn_t)opal_dss_print_status,
621+
OPAL_DSS_UNSTRUCTURED,
622+
"OPAL_STATUS", &tmp))) {
623+
return rc;
624+
}
614625
/* All done */
615626

616627
opal_dss_initialized = true;

opal/dss/dss_pack.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2014 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -1240,3 +1240,20 @@ int opal_dss_pack_vpid(opal_buffer_t *buffer, const void *src,
12401240
return ret;
12411241
}
12421242

1243+
/*
1244+
* STATUS
1245+
*/
1246+
int opal_dss_pack_status(opal_buffer_t *buffer, const void *src,
1247+
int32_t num_vals, opal_data_type_t type)
1248+
{
1249+
int ret;
1250+
1251+
/* Turn around and pack the real type */
1252+
ret = opal_dss_pack_buffer(buffer, src, num_vals, OPAL_INT);
1253+
if (OPAL_SUCCESS != ret) {
1254+
OPAL_ERROR_LOG(ret);
1255+
}
1256+
1257+
return ret;
1258+
}
1259+

opal/dss/dss_print.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
13-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2014 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -25,6 +25,7 @@
2525
#include "opal_stdint.h"
2626
#include <stdio.h>
2727

28+
#include "opal/util/error.h"
2829
#include "opal/dss/dss_internal.h"
2930

3031
int opal_dss_print(char **output, char *prefix, void *src, opal_data_type_t type)
@@ -1060,3 +1061,29 @@ int opal_dss_print_vpid(char **output, char *prefix,
10601061

10611062
return OPAL_SUCCESS;
10621063
}
1064+
1065+
/*
 * Produce a printable description of a status value.
 *
 * output - receives a newly allocated string that the caller must
 *          free; set to NULL if the string cannot be allocated
 * prefix - text placed in front of the description; a single space is
 *          used when NULL
 * src    - pointer to the status value; when NULL the description
 *          reports a NULL pointer instead of a value
 * type   - not used by this function
 *
 * Returns OPAL_SUCCESS when the string was created, or
 * OPAL_ERR_OUT_OF_RESOURCE if asprintf() fails.
 */
int opal_dss_print_status(char **output, char *prefix,
                          int *src, opal_data_type_t type)
{
    /* deal with NULL prefix - no need to heap-allocate the default */
    const char *prefx = (NULL == prefix) ? " " : prefix;
    int rc;

    if (NULL == src) {
        /* if src is NULL, just print data type and return */
        rc = asprintf(output, "%sData type: OPAL_STATUS\tValue: NULL pointer", prefx);
    } else {
        rc = asprintf(output, "%sData type: OPAL_STATUS\tValue: %s", prefx, opal_strerror(*src));
    }

    if (rc < 0) {
        /* the contents of *output are not reliable after a failed
         * asprintf, so hand back a well-defined NULL */
        *output = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    return OPAL_SUCCESS;
}

opal/dss/dss_types.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights
1515
* reserved.
16-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
1716
* Copyright (c) 2014-2016 Research Organization for Information Science
1817
* and Technology (RIST). All rights reserved.
18+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -120,6 +120,8 @@ typedef struct {
120120
#define OPAL_NAME (opal_data_type_t) 50
121121
#define OPAL_JOBID (opal_data_type_t) 51
122122
#define OPAL_VPID (opal_data_type_t) 52
123+
#define OPAL_STATUS (opal_data_type_t) 53
124+
123125
/* OPAL Dynamic */
124126
#define OPAL_DSS_ID_DYNAMIC (opal_data_type_t) 100
125127

@@ -245,6 +247,7 @@ typedef struct {
245247
float fval;
246248
double dval;
247249
struct timeval tv;
250+
int status;
248251
opal_process_name_t name;
249252
opal_bool_array_t flag_array;
250253
opal_uint8_array_t byte_array;

opal/dss/dss_unpack.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012-2015 Los Alamos National Security, Inc. All rights reserved.
14-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2014-2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$
@@ -1519,3 +1519,20 @@ int opal_dss_unpack_vpid(opal_buffer_t *buffer, void *dest,
15191519

15201520
return ret;
15211521
}
1522+
1523+
/*
1524+
* STATUS
1525+
*/
1526+
int opal_dss_unpack_status(opal_buffer_t *buffer, void *dest,
1527+
int32_t *num_vals, opal_data_type_t type)
1528+
{
1529+
int ret;
1530+
1531+
/* Turn around and unpack the real type */
1532+
ret = opal_dss_unpack_buffer(buffer, dest, num_vals, OPAL_INT);
1533+
if (OPAL_SUCCESS != ret) {
1534+
OPAL_ERROR_LOG(ret);
1535+
}
1536+
1537+
return ret;
1538+
}

0 commit comments

Comments
 (0)