diff --git a/.github/workflows/builds.yaml b/.github/workflows/builds.yaml index d02be34cef..4e9c1424ec 100644 --- a/.github/workflows/builds.yaml +++ b/.github/workflows/builds.yaml @@ -22,8 +22,16 @@ jobs: - name: Build OpenPMIx run: | cd openpmix/master + # Homebrew doesn't install Libevent's (or libev's) header or + # library files into a default search location. Shrug. So + # use pkg-config to get the location and explicitly pass it to + # configure (quoted, so each value stays a single argument). + libevent_cppflags=$(pkg-config libevent --cflags) + libevent_ldflags=$(pkg-config libevent --libs | perl -pe 's/^.*(-L[^ ]+).*$/\1/') ./autogen.pl - ./configure --prefix=$RUNNER_TEMP/pmixinstall + ./configure --prefix=$RUNNER_TEMP/pmixinstall \ + CPPFLAGS="$libevent_cppflags" \ + LDFLAGS="$libevent_ldflags" make -j make install - name: Git clone PRRTE @@ -37,6 +45,11 @@ jobs: sphinx= if test "${{ matrix.sphinx }}" = sphinx; then + # The macos Github Action environment gets angry at us if + # we try to pip install into the global environment. So + # make a virtual environment and install sphinx into that. + python -m venv venv + . ./venv/bin/activate pip3 install -r docs/requirements.txt sphinx=--enable-sphinx fi @@ -48,7 +61,16 @@ jobs: c=../configure fi - $c --prefix=$RUNNER_TEMP/prteinstall --with-pmix=$RUNNER_TEMP/pmixinstall $sphinx + # Homebrew doesn't install Libevent's (or libev's) header or + # library files into a default search location. Shrug. So + # use pkg-config to get the location and explicitly pass it to + # configure (quoted, so each value stays a single argument). 
+ libevent_cppflags=$(pkg-config libevent --cflags) + libevent_ldflags=$(pkg-config libevent --libs | perl -pe 's/^.*(-L[^ ]+).*$/\1/') + + $c --prefix=$RUNNER_TEMP/prteinstall --with-pmix=$RUNNER_TEMP/pmixinstall $sphinx \ + CPPFLAGS="$libevent_cppflags" \ + LDFLAGS="$libevent_ldflags" make -j make install make uninstall diff --git a/.github/workflows/close-stale-issues.yaml b/.github/workflows/close-stale-issues.yaml deleted file mode 100644 index 5f57cac239..0000000000 --- a/.github/workflows/close-stale-issues.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# The idea behind this Action is to prevent the situation where a user -# files a Github Issue, someone asks for clarification / more -# information, but the original poster never provides the information. -# The issue then becomes forgotten and abondoned. -# -# Instead of that scenario, PMIx community members can assign a -# label to Github Issues indicating that we're waiting for the user to -# reply. If too much time elapses with no reply, mark the Issue as -# stale and emit a warning that we'll close the issue if we continue -# to receive no reply. If we timeout again with no reply after the -# warning, close the Issue and emit a comment explaining why. -# -# If the user *does* reply, the label is removed, and this bot won't -# touch the Issue. Specifically: this bot will never mark stale / -# close an Issue that doesn't have the specified label. -# -# Additionally, we are *only* marking stale / auto-closing Github -# Issues -- not Pull Requests. -# -# This is a cron-based Action that runs a few times a day, just so -# that we don't mark stale / close a bunch of issues all at once. -# -# While the actions/stale bot Action used here is capable of removing -# the label when a user replies to the Issue, we actually use a 2nd -# Action (removing-awaiting-user-info-label.yaml) to remove the label. -# We do this because that 2nd Action runs whenever a comment is -# created -- not via cron. 
Hence, the 2nd Action will remove the -# label effectively immediately when the user replies (vs. up to -# several hours later). - -name: Close stale issues -on: - schedule: - # Run it a few times a day so as not to necessarily mark stale / - # close a bunch of issues at once. - - cron: '0 1,5,9,13,17,21 * * *' - -jobs: - stale: - runs-on: ubuntu-latest - steps: - # https://github.com/marketplace/actions/close-stale-issues - - uses: actions/stale@v9 - with: - # If there are no replies for 14 days, mark the issue as - # "stale" (and emit a warning). - days-before-stale: 14 - # If there are no replies for 14 days after becoming stale, - # then close the issue (and emit a message explaining why). - days-before-close: 14 - - # Never close PRs - days-before-pr-close: -1 - - # We only close issues with this label - only-labels: "Awaiting response" - close-issue-label: Closed due to no reply - - # Messages that we put in comments on issues - stale-issue-message: | - It looks like this issue is expecting a response, but hasn't gotten one yet. If there are no responses in the next 2 weeks, we'll assume that the issue has been abandoned and will close it. - close-issue-message: | - Per the above comment, it has been a month with no reply on this issue. It looks like this issue has been abandoned. - - I'm going to close this issue. If I'm wrong and this issue is *not* abandoned, please feel free to re-open it. Thank you! diff --git a/.github/workflows/remove-awaiting-user-info-label.yaml b/.github/workflows/remove-awaiting-user-info-label.yaml deleted file mode 100644 index 592264e34b..0000000000 --- a/.github/workflows/remove-awaiting-user-info-label.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# This Action is run in conjunction with close-stale-issues.yaml. See -# that file for a more complete description of how they work together. 
- -name: 'Remove "Awaiting response" label when there has been a reply' -on: - issue_comment: - types: - - created - -jobs: - build: - runs-on: ubuntu-latest - # From - # https://github.com/marketplace/actions/close-issues-after-no-reply: - # only remove the label if someone replies to an issue who is not - # an owner or collaborator on the repo. - if: | - github.event.comment.author_association != 'OWNER' && - github.event.comment.author_association != 'COLLABORATOR' - steps: - - name: 'Remove "Awaiting response" label' - uses: octokit/request-action@v2.x - continue-on-error: true - with: - route: DELETE /repos/:repository/issues/:issue/labels/:label - repository: ${{ github.repository }} - issue: ${{ github.event.issue.number }} - label: "Awaiting response" - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/VERSION b/VERSION index 1c39e5efaf..3a648a4251 100644 --- a/VERSION +++ b/VERSION @@ -23,7 +23,7 @@ release=0 # List in x.y.z format. pmix_min_version=4.2.4 -hwloc_min_version=1.11.0 +hwloc_min_version=2.1.0 event_min_version=2.0.21 automake_min_version=1.13.4 autoconf_min_version=2.69.0 @@ -37,7 +37,7 @@ flex_min_version=2.5.4 # requirement is that it must be entirely printable ASCII characters # and have no white space. -greek=a1 +greek=ompi-a1 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" diff --git a/config/prte_check_cflags.m4 b/config/prte_check_cflags.m4 index bb032dad2f..1befbd8135 100644 --- a/config/prte_check_cflags.m4 +++ b/config/prte_check_cflags.m4 @@ -2,7 +2,7 @@ dnl -*- shell-script -*- dnl dnl Copyright (c) 2021 IBM Corporation. All rights reserved. dnl -dnl Copyright (c) 2021 Nanook Consulting. All rights reserved. +dnl Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -40,3 +40,15 @@ AC_MSG_CHECKING(if $CC supports ([$1])) AC_MSG_RESULT([yes]) fi ]) + + +AC_DEFUN([_PRTE_CHECK_LTO_FLAG], [ + chkflg=`echo $1 | grep -- -flto` + if test -n "$chkflg"; then + AC_MSG_WARN([Configure has detected the presence of the -flto]) + AC_MSG_WARN([compiler directive in $2. PRRTE does not currently]) + AC_MSG_WARN([support this flag as it conflicts with the]) + AC_MSG_WARN([plugin architecture of the PRRTE code base.]) + AC_MSG_ERROR([Please remove this directive and re-run configure.]) + fi +]) diff --git a/configure.ac b/configure.ac index bc14a8dab2..11937a15b4 100644 --- a/configure.ac +++ b/configure.ac @@ -931,6 +931,19 @@ CPPFLAGS="$CPP_INCLUDES $CPPFLAGS $PRTE_FINAL_CPPFLAGS" LDFLAGS="$LDFLAGS $PRTE_FINAL_LDFLAGS" LIBS="$LIBS $PRTE_FINAL_LIBS" + +# We do not currently support the "lto" optimizer as it +# aggregates all the headers from our plugins, resulting +# in a configuration that generates warnings/errors when +# passed through their optimizer phase. We therefore check +# for the flag, and if found, output a message explaining +# the situation and aborting configure +_PRTE_CHECK_LTO_FLAG($CPPFLAGS, CPPFLAGS) +_PRTE_CHECK_LTO_FLAG($CFLAGS, CFLAGS) +_PRTE_CHECK_LTO_FLAG($LDFLAGS, LDFLAGS) +_PRTE_CHECK_LTO_FLAG($LIBS, LIBS) + + # restore any user-provided Werror flags AS_IF([test ! -z "$PRTE_CFLAGS_cache"], [CFLAGS="$CFLAGS $PRTE_CFLAGS_cache"]) diff --git a/contrib/scaling/scaling.pl b/contrib/scaling/scaling.pl index 9db7cb63fa..f800ba0081 100755 --- a/contrib/scaling/scaling.pl +++ b/contrib/scaling/scaling.pl @@ -135,7 +135,7 @@ push @starters, $starter; $opt = $starteroptionlist[$idx] . " --npernode " . $ppn; if ($multiplier gt 1) { - $opt = $opt . " --mca rtc ^hwloc --mca ras_base_multiplier " . $multiplier; + $opt = $opt . " --bind-to none --mca ras_base_multiplier " . 
$multiplier; } push @starteroptions, $opt; } elsif ($useaprun && $starter eq "aprun") { @@ -294,7 +294,7 @@ () if ($starter eq "prun") { my $dvm = "prte-dvm --system-server"; if ($multiplier gt 1) { - $dvm = $dvm . " --mca rtc ^hwloc --mca ras_base_multiplier " . $multiplier; + $dvm = $dvm . " --bind-to none --mca ras_base_multiplier " . $multiplier; } # need to start it print "##DVM: Launching $dvm\n"; diff --git a/docs/conf.py b/docs/conf.py index 4f82d6435f..13e17472c1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,6 +86,15 @@ if key in os.environ and os.environ[key] == 'True': print("PRRTE: found ReadTheDocs build environment") + # Tell Jinja2 templates the build is running on Read the Docs + if "html_context" not in globals(): + html_context = {} + html_context["READTHEDOCS"] = True + + # Define the canonical URL if you are using a custom domain on + # Read the Docs + html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "") + rtd_v = os.environ['READTHEDOCS_VERSION'] if os.environ['READTHEDOCS_VERSION_TYPE'] == 'external': # Make "release" be shorter than the full "prte_ver" value. diff --git a/src/docs/show-help-files/build-dummy-ini-files.py b/src/docs/show-help-files/build-dummy-ini-files.py index 57781cb57a..e5ab6d9f55 100755 --- a/src/docs/show-help-files/build-dummy-ini-files.py +++ b/src/docs/show-help-files/build-dummy-ini-files.py @@ -2,7 +2,7 @@ # # Copyright 2023 Jeffrey M. Squyres. All rights reserved. # -# Copyright (c) 2023 Nanook Consulting. All rights reserved. +# Copyright (c) 2023-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -57,7 +57,7 @@ # Find all the "[section]" lines. 
sections = list() for line in src_rst: - match = re.search('\s*\[(.+)\]\s*$', line) + match = re.search(r"\s*\[(.+)\]\s*$", line) if match: sections.append(match.group(1)) diff --git a/src/docs/show-help-files/help-prte-hwloc-base.rst b/src/docs/show-help-files/help-prte-hwloc-base.rst index 8badbc13cb..ef811ba4fa 100644 --- a/src/docs/show-help-files/help-prte-hwloc-base.rst +++ b/src/docs/show-help-files/help-prte-hwloc-base.rst @@ -81,7 +81,6 @@ additional information may be of help: .. code:: Message: %s - Cache level: %d [missing-cpulist] diff --git a/src/docs/show-help-files/help-prun.rst b/src/docs/show-help-files/help-prun.rst index e3f57bfa2d..acb2e07837 100644 --- a/src/docs/show-help-files/help-prun.rst +++ b/src/docs/show-help-files/help-prun.rst @@ -598,7 +598,8 @@ This is a fatal error; %s will now abort. No processes were launched. %s was unable to launch the specified application as it encountered an error: Error: system limit exceeded on number of pipes that can be open -Node: %s + + Node: %s when attempting to start process rank %lu. @@ -612,7 +613,8 @@ by rearranging your processes to place fewer of them on that node. %s was unable to launch the specified application as it encountered an error: Error: system limit exceeded on number of files that can be open -Node: %s + + Node: %s when attempting to start process rank %lu. @@ -626,7 +628,8 @@ by rearranging your processes to place fewer of them on that node. %s was unable to launch the specified application as it encountered an error: Error: pipe function call failed when setting up I/O forwarding subsystem -Node: %s + + Node: %s while attempting to start process rank %lu. @@ -635,7 +638,8 @@ while attempting to start process rank %lu. %s was unable to launch the specified application as it encountered an error: Error: system limit exceeded on number of processes that can be started -Node: %s + + Node: %s when attempting to start process rank %lu. @@ -649,7 +653,8 @@ on that node. 
Error: reading tty attributes function call failed while setting up I/O forwarding system -Node: %s + + Node: %s while attempting to start process rank %lu. @@ -658,8 +663,8 @@ while attempting to start process rank %lu. %s was unable to launch the specified application as it could not find the specified working directory: -Working directory: %s -Node: %s +| Working directory: %s +| Node: %s while attempting to start process rank %lu. @@ -668,8 +673,8 @@ while attempting to start process rank %lu. %s was unable to launch the specified application as it lacks permissions to change to the specified working directory: -Working directory: %s -Node: %s +| Working directory: %s +| Node: %s while attempting to start process rank %lu. @@ -683,16 +688,16 @@ NOTE: A common cause for this error is misspelling a %s command line parameter option (remember that %s interprets the first unrecognized command line token as the executable). -Node: %s -Executable: %s +| Node: %s +| Executable: %s [prun:exe-not-accessible] %s was unable to launch the specified application as it lacked permissions to execute an executable: -Executable: %s -Node: %s +| Executable: %s +| Node: %s while attempting to start process rank %lu. @@ -701,7 +706,8 @@ while attempting to start process rank %lu. %s was unable to launch the specified application as it encountered an error: Error: reading from a pipe function call failed while spawning a local process -Node: %s + + Node: %s while attempting to start process rank %lu. @@ -710,9 +716,9 @@ while attempting to start process rank %lu. %s was unable to start the specified application as it encountered an error: -Error code: %d -Error name: %s -Node: %s +| Error code: %d +| Error name: %s +| Node: %s when attempting to start process rank %lu. @@ -726,13 +732,13 @@ error on node %s. More information may be available above. %s was unable to parse the filename where contact info for the prte-server was to be found. 
The option we were given was: ---prte-server %s + --prte-server %s This appears to be missing the required ':' following the keyword "file". Please remember that the correct format for this command line option is: ---prte-server file:path-to-file + --prte-server file:path-to-file where path-to-file can be either relative to the cwd or absolute. @@ -741,12 +747,12 @@ where path-to-file can be either relative to the cwd or absolute. %s was unable to parse the filename where contact info for the prte-server was to be found. The option we were given was: ---prte-server %s + --prte-server %s This appears to be missing a filename following the ':'. Please remember that the correct format for this command line option is: ---prte-server file:path-to-file + --prte-server file:path-to-file where path-to-file can be either relative to the cwd or absolute. @@ -755,11 +761,11 @@ where path-to-file can be either relative to the cwd or absolute. %s was unable to access the filename where contact info for the prte-server was to be found. The option we were given was: ---prte-server %s + --prte-server %s Please remember that the correct format for this command line option is: ---prte-server file:path-to-file + --prte-server file:path-to-file where path-to-file can be either relative to the cwd or absolute, and that you must have read access permissions to that file. @@ -788,8 +794,8 @@ uri in a file, and then giving %s that filename. Multiple processor affinity schemes were specified (can only specify one): -Slot list: %s -prte_paffinity_alone: true +| Slot list: %s +| prte_paffinity_alone: true Please specify only the one desired method. @@ -821,18 +827,18 @@ functionality (e.g., Linux Kernels newer than v2.6.18). Systems that do not support processor topology-aware functionality cannot use "bind to package" and other related functionality. 
- Local host: %s - Action attempted: %s %s - Application name: %s +| Local host: %s +| Action attempted: %s %s +| Application name: %s [prun:not-enough-resources] Not enough %s were found on the local host to meet the requested binding action: - Local host: %s - Action requested: %s - Application name: %s +| Local host: %s +| Action requested: %s +| Application name: %s Please revise the request and try again. @@ -841,8 +847,8 @@ Please revise the request and try again. A slot list was provided that exceeds the boundaries on available resources: - Local host: %s - Slot list: %s +| Local host: %s +| Slot list: %s Please check your boundaries and try again. @@ -850,16 +856,16 @@ Please check your boundaries and try again. A critical communication path was lost to: - My name: %s - Process name: %s - Node: %s +| My name: %s +| Process name: %s +| Node: %s [prun:proc-mem-exceeded] A process exceeded memory limits: - Process name: %s - Node: %s +| Process name: %s +| Node: %s [prun:proc-stalled] @@ -876,24 +882,24 @@ no further info is available. %s failed to receive scheduled heartbeat communications from a remote process: - Process name: %s - Node: %s +| Process name: %s +| Node: %s [prun:non-zero-exit] %s detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: - Process name: %s - Exit code: %d +| Process name: %s +| Exit code: %d [prun:negative-nprocs] %s has detected that one or more applications was given a negative number of processes to run: - Application: %s - Num procs: %d +| Application: %s +| Num procs: %d Please correct this value and try again. @@ -941,9 +947,9 @@ correct and retry. %s was unable to stop the executable at first instruction: - Error: %s - Nodename: %s - Rank: %lu +| Error: %s +| Nodename: %s +| Rank: %lu [use-pterm] @@ -970,9 +976,9 @@ to start the persistent DVM - it cannot be used with an application. 
%s was given an option that expected a string argument: - option: %s - argument: %s - expected: %s +| option: %s +| argument: %s +| expected: %s Please correct the option and try again. @@ -980,9 +986,9 @@ Please correct the option and try again. %s was unable to open the specified file provided as an option: - option: %s - argument: %s - file: %s +| option: %s +| argument: %s +| file: %s Please correct the option and try again. @@ -990,9 +996,9 @@ Please correct the option and try again. %s was unable to read the necessary info from the provided file: - option: %s - argument: %s - file: %s +| option: %s +| argument: %s +| file: %s Please correct the option or the file and try again. diff --git a/src/hwloc/hwloc-internal.h b/src/hwloc/hwloc-internal.h index 8337274a47..eb238d6f9c 100644 --- a/src/hwloc/hwloc-internal.h +++ b/src/hwloc/hwloc-internal.h @@ -7,7 +7,7 @@ * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * @@ -28,24 +28,7 @@ #include #include #include -#if HWLOC_API_VERSION >= 0x20000 -# include -#endif - -#if HWLOC_API_VERSION < 0x10b00 -#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE -#define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET -#endif -#if HWLOC_API_VERSION < 0x10a00 -static inline hwloc_obj_t hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) -{ - hwloc_obj_t obj = NULL; - while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL) - if (obj->os_index == os_index) - return obj; - return NULL; -} -#endif +#include #include "src/class/pmix_list.h" #include "src/class/pmix_value_array.h" @@ -114,6 +97,21 @@ typedef struct { size_t mbs_len; } prte_hwloc_base_memory_segment_t; + +/** + * Struct used to cache topology-level data used + * for repeated lookup - the struct is attached + * to the userdata of the root object of the + * topology + */ +typedef struct { + pmix_object_t super; + bool computed; + unsigned numa_cutoff; +} prte_hwloc_topo_data_t; +PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_hwloc_topo_data_t); + + /* define binding policies */ typedef uint16_t prte_binding_policy_t; #define PRTE_BINDING_POLICY PRTE_UINT16 @@ -161,20 +159,6 @@ PRTE_EXPORT extern hwloc_obj_type_t prte_hwloc_levels[]; PRTE_EXPORT extern char *prte_hwloc_default_cpu_list; PRTE_EXPORT extern bool prte_hwloc_default_use_hwthread_cpus; -#if HWLOC_API_VERSION < 0x20000 -# define HWLOC_OBJ_L3CACHE HWLOC_OBJ_CACHE -# define HWLOC_OBJ_L2CACHE HWLOC_OBJ_CACHE -# define HWLOC_OBJ_L1CACHE HWLOC_OBJ_CACHE -# if HWLOC_API_VERSION < 0x10a00 -# define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET -# endif -# define HAVE_DECL_HWLOC_OBJ_OSDEV_COPROC 0 -# define HAVE_HWLOC_TOPOLOGY_DUP 0 -#else -# define HAVE_DECL_HWLOC_OBJ_OSDEV_COPROC 1 -# define HAVE_HWLOC_TOPOLOGY_DUP 1 -#endif - /** * Debugging output stream */ @@ -210,20 +194,6 @@ PRTE_EXPORT extern bool prte_hwloc_synthetic_topo; hwloc_bitmap_free(bind); \ } while (0); -#if HWLOC_API_VERSION < 
0x20000 -# define PRTE_HWLOC_MAKE_OBJ_CACHE(level, obj, cache_level) \ - do { \ - obj = HWLOC_OBJ_CACHE; \ - cache_level = level; \ - } while (0) -#else -# define PRTE_HWLOC_MAKE_OBJ_CACHE(level, obj, cache_level) \ - do { \ - obj = HWLOC_OBJ_L##level##CACHE; \ - cache_level = 0; \ - } while (0) -#endif - PRTE_EXPORT prte_hwloc_locality_t prte_hwloc_base_get_relative_locality(hwloc_topology_t topo, char *cpuset1, char *cpuset2); @@ -271,30 +241,27 @@ PRTE_EXPORT extern prte_hwloc_base_mbfa_t prte_hwloc_base_mbfa; * hwloc_topology_load()). */ PRTE_EXPORT int prte_hwloc_base_get_topology(void); -PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_setup_summary(hwloc_topology_t topo); /** * Set the hwloc topology to that from the given topo file */ PRTE_EXPORT int prte_hwloc_base_set_topology(char *topofile); +PRTE_EXPORT void prte_hwloc_base_setup_summary(hwloc_topology_t topo); + PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, bool use_hwthread_cpus, char *cpulist); PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_filter_cpus(hwloc_topology_t topo); -/** - * Free the hwloc topology. - */ +PRTE_EXPORT unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj); + PRTE_EXPORT unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, - hwloc_obj_type_t target, - unsigned cache_level); + hwloc_obj_type_t target); PRTE_EXPORT hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, - unsigned cache_level, unsigned int instance); -PRTE_EXPORT unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj); /** * Get the number of pu's under a given hwloc object. 
@@ -354,7 +321,7 @@ PRTE_EXPORT char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset, PRTE_EXPORT void prte_hwloc_get_binding_info(hwloc_const_cpuset_t cpuset, bool use_hwthread_cpus, - hwloc_topology_t topo, int *pkgnum, + hwloc_topology_t topo, int *pkgnum, char *cores, int sz); /* get the hwloc object that corresponds to the given processor id and type */ diff --git a/src/hwloc/hwloc.c b/src/hwloc/hwloc.c index aad1ce57cd..fe5dd1df67 100644 --- a/src/hwloc/hwloc.c +++ b/src/hwloc/hwloc.c @@ -377,23 +377,12 @@ int prte_hwloc_base_set_default_binding(void *jd, void *opt) PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE); } else if (HWLOC_OBJ_NUMANODE== options->maptype) { PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NUMA); -#if HWLOC_API_VERSION < 0x20000 - } else if (HWLOC_OBJ_CACHE == options->maptype) { - if (1 == options->cmaplvl) { - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L1CACHE); - } else if (2 == options->cmaplvl) { - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L2CACHE); - } else if (3 == options->cmaplvl) { - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L3CACHE); - } -#else } else if (HWLOC_OBJ_L1CACHE == options->maptype) { PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L1CACHE); } else if (HWLOC_OBJ_L2CACHE == options->maptype) { PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L2CACHE); } else if (HWLOC_OBJ_L3CACHE == options->maptype) { PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_L3CACHE); -#endif } else if (HWLOC_OBJ_CORE == options->maptype) { PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_CORE); } else if (HWLOC_OBJ_PU == options->maptype) { @@ -664,3 +653,13 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) } return PRTE_SUCCESS; } + +static void topo_data_const(prte_hwloc_topo_data_t *ptr) +{ + ptr->computed = false; + ptr->numa_cutoff 
= UINT_MAX; +} +PMIX_CLASS_INSTANCE(prte_hwloc_topo_data_t, + pmix_object_t, + topo_data_const, NULL); + diff --git a/src/hwloc/hwloc_base_util.c b/src/hwloc/hwloc_base_util.c index 4a32a7fa1a..80cd5187f8 100644 --- a/src/hwloc/hwloc_base_util.c +++ b/src/hwloc/hwloc_base_util.c @@ -145,11 +145,7 @@ hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, /* only one cpu given - get that object */ cpu = strtoul(range[0], NULL, 10); if (NULL != (pu = prte_hwloc_base_get_pu(topo, use_hwthread_cpus, cpu))) { -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset); -#else hwloc_bitmap_and(pucpus, pu->cpuset, hwloc_topology_get_allowed_cpuset(topo)); -#endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); } @@ -160,11 +156,7 @@ hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, end = strtoul(range[1], NULL, 10); for (cpu = start; cpu <= end; cpu++) { if (NULL != (pu = prte_hwloc_base_get_pu(topo, use_hwthread_cpus, cpu))) { -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset); -#else hwloc_bitmap_and(pucpus, pu->cpuset, hwloc_topology_get_allowed_cpuset(topo)); -#endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); } @@ -184,33 +176,89 @@ hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, return avail; } -hwloc_cpuset_t prte_hwloc_base_setup_summary(hwloc_topology_t topo) +void prte_hwloc_base_setup_summary(hwloc_topology_t topo) { - hwloc_cpuset_t avail = NULL; + hwloc_obj_t root; + prte_hwloc_topo_data_t *sum; + unsigned width, w, m, N, last; + hwloc_bitmap_t *numas; + hwloc_obj_t obj; - avail = hwloc_bitmap_alloc(); + /* Historically, CPU packages contained a single cpu die + * and nothing else. NUMA was therefore determined by simply + * looking at the memory bus attached to the socket where + * the package resided - all cpus in the package were + * exclusively "under" that NUMA. 
Since each socket had a + * unique NUMA, you could easily map by them. + + * More recently, packages have started to contain multiple + * cpu dies as well as memory and sometimes even fabric die. + * In these cases, the memory bus of the cpu dies in the + * package generally share any included memory die. This + * complicates the memory situation, leaving NUMA domains + * no longer cleanly delineated by processor (i.e., the + * NUMA domains overlap each other). + * + * Fortunately, the OS index of non-CPU NUMA domains starts + * at 255 and counts downward (at least for GPUs) - while + * the index of CPU NUMA domains starts at 0 and counts + * upward. We can therefore separate the two by excluding + * NUMA domains with an OS index above the level where + * they first begin to intersect. + */ - /* get the root available cpuset */ -#if HWLOC_API_VERSION < 0x20000 - hwloc_obj_t root; root = hwloc_get_root_obj(topo); - - if (NULL == root->online_cpuset && NULL == root->allowed_cpuset) { - /* we are hosed */ - return NULL; + if (NULL == root->userdata) { + root->userdata = (void *) PMIX_NEW(prte_hwloc_topo_data_t); } - if (NULL == root->online_cpuset) { - hwloc_bitmap_copy(avail, root->allowed_cpuset); - } else if (NULL == root->allowed_cpuset) { - hwloc_bitmap_copy(avail, root->online_cpuset); - } else { - hwloc_bitmap_and(avail, root->online_cpuset, root->allowed_cpuset); + sum = (prte_hwloc_topo_data_t *) root->userdata; + + /* only need to do this once */ + if (sum->computed) { + return; } -#else - hwloc_bitmap_copy(avail, hwloc_topology_get_allowed_cpuset(topo)); -#endif + sum->computed = true; - return avail; + /* compute the CPU NUMA cutoff for this topology */ + width = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE); + if (0 == width) { + sum->numa_cutoff = 0; + return; + } + numas = (hwloc_bitmap_t*)malloc(width * sizeof(hwloc_bitmap_t)); + N = 0; + last = 0; + for (w=0; w < UINT_MAX && N < width; w++) { + /* get the object at this index */ + obj = 
hwloc_get_numanode_obj_by_os_index(topo, w); + if (NULL == obj) { + continue; + } + /* check for overlap with all preceding numas */ + for (m=0; m < N; m++) { + if (hwloc_bitmap_intersects(obj->cpuset, numas[m])) { + // if it intersects anyone, then we are done + sum->numa_cutoff = last+1; + break; + } + } + if (UINT_MAX != sum->numa_cutoff) { + break; + } else { + last = w; + /* cache this bitmap */ + numas[N] = hwloc_bitmap_alloc(); + hwloc_bitmap_copy(numas[N], obj->cpuset); + ++N; + } + } + if (UINT_MAX == sum->numa_cutoff) { + sum->numa_cutoff = last + 1; + } + for (m=0; m < N; m++) { + hwloc_bitmap_free(numas[m]); + } + free(numas); } /* determine the node-level available cpuset based on @@ -224,12 +272,15 @@ hwloc_cpuset_t prte_hwloc_base_filter_cpus(hwloc_topology_t topo) if (NULL == prte_hwloc_default_cpu_list) { PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base: no cpus specified - using root available cpuset")); - avail = prte_hwloc_base_setup_summary(topo); + avail = hwloc_bitmap_alloc(); + hwloc_bitmap_copy(avail, hwloc_topology_get_allowed_cpuset(topo)); + } else { PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base: filtering cpuset")); avail = prte_hwloc_base_generate_cpuset(topo, prte_hwloc_default_use_hwthread_cpus, prte_hwloc_default_cpu_list); } + return avail; } @@ -245,7 +296,7 @@ static void fill_cache_line_size(void) while (cache_level > 0 && !found) { i = 0; while (1) { - obj = prte_hwloc_base_get_obj_by_type(prte_hwloc_topology, cache_object, cache_level, i); + obj = hwloc_get_obj_by_type(prte_hwloc_topology, cache_object, i); if (NULL == obj) { --cache_level; cache_object = HWLOC_OBJ_L1CACHE; @@ -305,6 +356,8 @@ int prte_hwloc_base_get_topology(void) line size */ fill_cache_line_size(); + // create the summary + prte_hwloc_base_setup_summary(prte_hwloc_topology); return PRTE_SUCCESS; } @@ -473,150 +526,95 @@ unsigned int prte_hwloc_base_get_npus(hwloc_topology_t topo, bool use_hwthread_c unsigned int 
prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj) { - unsigned cache_level = 0; hwloc_obj_t ptr; unsigned int nobjs, i; PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:get_idx")); -#if HWLOC_API_VERSION < 0x20000 - /* determine the number of objects of this type */ - if (HWLOC_OBJ_CACHE == obj->type) { - cache_level = obj->attr->cache.depth; - } -#endif - - nobjs = prte_hwloc_base_get_nbobjs_by_type(topo, obj->type, cache_level); + nobjs = prte_hwloc_base_get_nbobjs_by_type(topo, obj->type); PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, - "hwloc:base:get_idx found %u objects of type %s:%u", nobjs, - hwloc_obj_type_string(obj->type), cache_level)); + "hwloc:base:get_idx found %u objects of type %s", nobjs, + hwloc_obj_type_string(obj->type))); /* find this object */ for (i = 0; i < nobjs; i++) { - ptr = prte_hwloc_base_get_obj_by_type(topo, obj->type, cache_level, i); + ptr = prte_hwloc_base_get_obj_by_type(topo, obj->type, i); if (ptr == obj) { return i; } } /* if we get here, it wasn't found */ pmix_show_help("help-prte-hwloc-base.txt", "obj-idx-failed", true, - hwloc_obj_type_string(obj->type), cache_level); + hwloc_obj_type_string(obj->type)); return UINT_MAX; } -#if HWLOC_API_VERSION < 0x20000 -/* hwloc treats cache objects as special - * cases. Instead of having a unique type for each cache level, - * there is a single cache object type, and the level is encoded - * in an attribute union. 
So looking for cache objects involves - * a multi-step test :-( - */ -static hwloc_obj_t df_search(hwloc_topology_t topo, hwloc_obj_t start, hwloc_obj_type_t target, - unsigned cache_level, unsigned int nobj, unsigned int *num_objs) +unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, + hwloc_obj_type_t target) { - int search_depth; - - search_depth = hwloc_get_type_depth(topo, target); - if (HWLOC_TYPE_DEPTH_MULTIPLE == search_depth) { - /* either v1.x Cache, or Groups */ - if (cache_level != HWLOC_OBJ_CACHE) { - return NULL; + unsigned w, rc; + hwloc_obj_t obj, root; + prte_hwloc_topo_data_t *sum; + + /* if the type is NUMA, then we need to only count the + * CPU NUMAs and ignore the GPU NUMAs as we only deal + * with CPUs at this time */ + if (HWLOC_OBJ_NUMANODE == target) { + + root = hwloc_get_root_obj(topo); + sum = (prte_hwloc_topo_data_t *) root->userdata; + if (NULL == sum) { + return 0; } - search_depth = hwloc_get_cache_type_depth(topo, cache_level, (hwloc_obj_cache_type_t) -1); - } - if (HWLOC_TYPE_DEPTH_UNKNOWN == search_depth) { - return NULL; - } - - if (num_objs) { - *num_objs = hwloc_get_nbobjs_by_depth(topo, search_depth); - } - return hwloc_get_obj_by_depth(topo, search_depth, nobj); -} -#endif - -unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, - unsigned cache_level) -{ - int rc; -#if HWLOC_API_VERSION >= 0x20000 - PRTE_HIDE_UNUSED_PARAMS(cache_level); -#endif - /* bozo check */ - if (NULL == topo) { - PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:get_nbobjs NULL topology")); - return 0; + rc = 0; + for (w=0; w < sum->numa_cutoff; w++) { + obj = hwloc_get_numanode_obj_by_os_index(topo, w); + if (NULL != obj) { + ++rc; + } + } + return rc; } - -#if HWLOC_API_VERSION >= 0x20000 - if (0 > (rc = hwloc_get_nbobjs_by_type(topo, target))) { + rc = hwloc_get_nbobjs_by_type(topo, target); + if (UINT_MAX == rc) { pmix_output(0, "UNKNOWN HWLOC ERROR"); return 0; } return rc; 
-#else - unsigned int num_objs; - hwloc_obj_t obj; - - /* we can just use the hwloc accessor to get it, - * unless it is a CACHE as these are treated as special cases - */ - if (HWLOC_OBJ_CACHE != target) { - /* we should not get an error back, but just in case... */ - if (0 > (rc = hwloc_get_nbobjs_by_type(topo, target))) { - pmix_output(0, "UNKNOWN HWLOC ERROR"); - return 0; - } - return rc; - } - - /* for everything else, we have to do some work */ - num_objs = 0; - obj = hwloc_get_root_obj(topo); - - df_search(topo, obj, target, cache_level, 0, &num_objs); - - PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, - "hwloc:base:get_nbojbs computed data %u of %s:%u", num_objs, - hwloc_obj_type_string(target), cache_level)); - - return num_objs; -#endif } -/* as above, only return the Nth instance of the specified object - * type from inside the topology - */ -hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, - unsigned cache_level, unsigned int instance) +hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, + hwloc_obj_type_t target, + unsigned int instance) { -#if HWLOC_API_VERSION >= 0x20000 - PRTE_HIDE_UNUSED_PARAMS(cache_level); -#endif + unsigned w, cnt; + hwloc_obj_t obj, root; + prte_hwloc_topo_data_t *sum; + + /* if we are looking for NUMA, then ignore all the + * GPU NUMAs */ + if (HWLOC_OBJ_NUMANODE == target) { + root = hwloc_get_root_obj(topo); + sum = (prte_hwloc_topo_data_t *) root->userdata; + if (NULL == sum) { + return NULL; + } - /* bozo check */ - if (NULL == topo) { + cnt = 0; + for (w=0; w < sum->numa_cutoff; w++) { + obj = hwloc_get_numanode_obj_by_os_index(topo, w); + if (NULL != obj) { + if (cnt == instance) { + return obj; + } + ++cnt; + } + } return NULL; } - -#if HWLOC_API_VERSION >= 0x20000 return hwloc_get_obj_by_type(topo, target, instance); -#else - hwloc_obj_t obj; - - /* we can just use the hwloc accessor to get it, unless it is a CACHE - * as these are treated as special cases - 
*/ - if (HWLOC_OBJ_CACHE != target) { - return hwloc_get_obj_by_type(topo, target, instance); - } - - /* for everything else, we have to do some work */ - obj = hwloc_get_root_obj(topo); - return df_search(topo, obj, target, cache_level, instance, NULL); -#endif } /* The current slot_list notation only goes to the core level - i.e., the location @@ -647,7 +645,7 @@ static int package_to_cpu_set(char *cpus, hwloc_topology_t topo, hwloc_bitmap_t switch (range_cnt) { case 1: /* no range was present, so just one package given */ package_id = atoi(range[0]); - obj = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, 0, package_id); + obj = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, package_id); /* get the available cpus for this package */ hwloc_bitmap_or(cpumask, cpumask, obj->cpuset); break; @@ -657,7 +655,7 @@ static int package_to_cpu_set(char *cpus, hwloc_topology_t topo, hwloc_bitmap_t upper_range = atoi(range[1]); /* cycle across the range of packages */ for (package_id = lower_range; package_id <= upper_range; package_id++) { - obj = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, 0, package_id); + obj = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, package_id); /* set the available cpus for this package bits in the bitmask */ hwloc_bitmap_or(cpumask, cpumask, obj->cpuset); } @@ -689,7 +687,7 @@ static int package_core_to_cpu_set(char *package_core_list, hwloc_topology_t top package_id = atoi(package_core[0]); /* get the object for this package id */ - package = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, 0, package_id); + package = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, package_id); if (NULL == package) { PMIX_ARGV_FREE_COMPAT(package_core); return PRTE_ERR_NOT_FOUND; @@ -699,7 +697,7 @@ static int package_core_to_cpu_set(char *package_core_list, hwloc_topology_t top * to find cores on all platforms. 
Adjust the type here if * required */ - if (NULL == hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0)) { + if (NULL == prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0)) { obj_type = HWLOC_OBJ_PU; hwthreadcpus = true; } @@ -729,7 +727,7 @@ static int package_core_to_cpu_set(char *package_core_list, hwloc_topology_t top /* get the indexed core from this package */ core_id = atoi(list[j]) + npus; /* get that object */ - core = prte_hwloc_base_get_obj_by_type(topo, obj_type, 0, core_id); + core = prte_hwloc_base_get_obj_by_type(topo, obj_type, core_id); if (NULL == core) { rc = PRTE_ERR_NOT_FOUND; break; @@ -749,7 +747,7 @@ static int package_core_to_cpu_set(char *package_core_list, hwloc_topology_t top /* get the indexed core from this package */ core_id = j + npus; /* get that object */ - core = prte_hwloc_base_get_obj_by_type(topo, obj_type, 0, core_id); + core = prte_hwloc_base_get_obj_by_type(topo, obj_type, core_id); if (NULL == core) { rc = PRTE_ERR_NOT_FOUND; break; @@ -922,17 +920,6 @@ static void prte_hwloc_base_get_relative_locality_by_depth(hwloc_topology_t topo case HWLOC_OBJ_NUMANODE: *locality |= PRTE_PROC_ON_NUMA; break; -#if HWLOC_API_VERSION < 0x20000 - case HWLOC_OBJ_CACHE: - if (3 == obj->attr->cache.depth) { - *locality |= PRTE_PROC_ON_L3CACHE; - } else if (2 == obj->attr->cache.depth) { - *locality |= PRTE_PROC_ON_L2CACHE; - } else { - *locality |= PRTE_PROC_ON_L1CACHE; - } - break; -#else case HWLOC_OBJ_L3CACHE: *locality |= PRTE_PROC_ON_L3CACHE; break; @@ -942,7 +929,6 @@ static void prte_hwloc_base_get_relative_locality_by_depth(hwloc_topology_t topo case HWLOC_OBJ_L1CACHE: *locality |= PRTE_PROC_ON_L1CACHE; break; -#endif case HWLOC_OBJ_CORE: *locality |= PRTE_PROC_ON_CORE; break; @@ -998,11 +984,7 @@ prte_hwloc_locality_t prte_hwloc_base_get_relative_locality(hwloc_topology_t top type = hwloc_get_depth_type(topo, d); /* if it isn't one of interest, then ignore it */ if (HWLOC_OBJ_NUMANODE != type && HWLOC_OBJ_PACKAGE != type && -#if 
HWLOC_API_VERSION < 0x20000 - HWLOC_OBJ_CACHE != type && -#else HWLOC_OBJ_L3CACHE != type && HWLOC_OBJ_L2CACHE != type && HWLOC_OBJ_L1CACHE != type && -#endif HWLOC_OBJ_CORE != type && HWLOC_OBJ_PU != type) { continue; } @@ -1017,10 +999,8 @@ prte_hwloc_locality_t prte_hwloc_base_get_relative_locality(hwloc_topology_t top } } -#if HWLOC_API_VERSION >= 0x20000 prte_hwloc_base_get_relative_locality_by_depth(topo, (unsigned) HWLOC_TYPE_DEPTH_NUMANODE, loc1, loc2, &locality, &shared); -#endif pmix_output_verbose(5, prte_hwloc_base_output, "locality: %s", prte_hwloc_base_print_locality(locality)); @@ -1036,11 +1016,9 @@ prte_hwloc_locality_t prte_hwloc_base_get_relative_locality(hwloc_topology_t top */ char *prte_hwloc_base_find_coprocessors(hwloc_topology_t topo) { -#if HAVE_DECL_HWLOC_OBJ_OSDEV_COPROC hwloc_obj_t osdev; unsigned i; char **cps = NULL; -#endif char *cpstring = NULL; int depth; @@ -1052,7 +1030,7 @@ char *prte_hwloc_base_find_coprocessors(hwloc_topology_t topo) (5, prte_hwloc_base_output, "hwloc:base:find_coprocessors: NONE FOUND IN TOPO")); return NULL; } -#if HAVE_DECL_HWLOC_OBJ_OSDEV_COPROC + /* check the device objects for coprocessors */ osdev = hwloc_get_obj_by_depth(topo, depth, 0); while (NULL != osdev) { @@ -1078,13 +1056,6 @@ char *prte_hwloc_base_find_coprocessors(hwloc_topology_t topo) PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:find_coprocessors: hosting coprocessors %s", (NULL == cpstring) ? 
"NONE" : cpstring)); -#else - PMIX_OUTPUT_VERBOSE((5, prte_hwloc_base_output, - "hwloc:base:find_coprocessors: the version of hwloc that PRTE was built " - "against (v%d.%d.%d) does not support detecting coprocessors", - (HWLOC_API_VERSION >> 16) && 0xFF, (HWLOC_API_VERSION >> 8) & 0xFF, - HWLOC_API_VERSION && 0xFF)); -#endif return cpstring; } @@ -1254,7 +1225,6 @@ static int bitmap_list_snprintf_exp(char *__hwloc_restrict buf, size_t buflen, { int ret = 0; char *tmp = buf; -#if HWLOC_API_VERSION >= 0x20000 int prev = -1; ssize_t size = buflen; int res; @@ -1303,12 +1273,6 @@ static int bitmap_list_snprintf_exp(char *__hwloc_restrict buf, size_t buflen, prev = end - 1; } } -#else - if (buflen > 0) { - tmp[0] = '\0'; - } - ret = PRTE_ERR_NOT_SUPPORTED; -#endif return ret; } @@ -1326,7 +1290,7 @@ void prte_hwloc_get_binding_info(hwloc_const_cpuset_t cpuset, /* if the cpuset is all zero, then something is wrong */ if (hwloc_bitmap_iszero(cpuset)) { - snprintf(cores, sz, "\n%*c\n", 20, ' '); + snprintf(cores, sz, "\n%*c\n", 20, ' '); } /* if the cpuset includes all available cpus, and @@ -1340,10 +1304,10 @@ void prte_hwloc_get_binding_info(hwloc_const_cpuset_t cpuset, hwloc_bitmap_free(avail); /* get the number of packages in the topology */ - npkgs = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE); avail = hwloc_bitmap_alloc(); - npus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); - ncores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE); + npus = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); + ncores = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE); if (npus == ncores && !use_hwthread_cpus) { /* the bits in this bitmap represent cores */ @@ -1355,7 +1319,7 @@ void prte_hwloc_get_binding_info(hwloc_const_cpuset_t cpuset, /* binding happens within a package and not across packages */ for (n = 0; n < npkgs; n++) { - pkg = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, n); + 
pkg = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, n); /* see if we have any here */ hwloc_bitmap_and(avail, cpuset, pkg->cpuset); @@ -1399,7 +1363,7 @@ char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset, /* if the cpuset is all zero, then something is wrong */ if (hwloc_bitmap_iszero(cpuset)) { - return strdup("NOT MAPPED"); + return strdup("EMPTY CPUSET"); } /* if the cpuset includes all available cpus, and @@ -1413,11 +1377,11 @@ char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset, hwloc_bitmap_free(avail); /* get the number of packages in the topology */ - npkgs = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE); avail = hwloc_bitmap_alloc(); - npus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); - ncores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE); + npus = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); + ncores = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE); if (npus == ncores && !use_hwthread_cpus) { /* the bits in this bitmap represent cores */ bits_as_cores = true; @@ -1427,7 +1391,7 @@ char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset, } for (n = 0; n < npkgs; n++) { - pkg = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, n); + pkg = prte_hwloc_base_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, n); /* see if we have any here */ hwloc_bitmap_and(avail, cpuset, pkg->cpuset); if (hwloc_bitmap_iszero(avail)) { @@ -1471,13 +1435,13 @@ char *prte_hwloc_base_get_topo_signature(hwloc_topology_t topo) unsigned i; hwloc_bitmap_t complete, allowed; - nnuma = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE, 0); - npackage = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE, 0); - nl3 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L3CACHE, 3); - nl2 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L2CACHE, 2); - nl1 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L1CACHE, 1); - ncore = 
prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE, 0); - nhwt = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PU, 0); + nnuma = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE); + npackage = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE); + nl3 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L3CACHE); + nl2 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L2CACHE); + nl1 = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_L1CACHE); + ncore = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE); + nhwt = prte_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_PU); /* get the root object so we can add the processor architecture */ obj = hwloc_get_root_obj(topo); @@ -1592,11 +1556,7 @@ char *prte_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap) type = hwloc_get_depth_type(topo, d); /* if it isn't one of interest, then ignore it */ if (HWLOC_OBJ_NUMANODE != type && HWLOC_OBJ_PACKAGE != type && -#if HWLOC_API_VERSION < 0x20000 - HWLOC_OBJ_CACHE != type && -#else HWLOC_OBJ_L1CACHE != type && HWLOC_OBJ_L2CACHE != type && HWLOC_OBJ_L3CACHE != type && -#endif HWLOC_OBJ_CORE != type && HWLOC_OBJ_PU != type) { continue; } @@ -1624,33 +1584,6 @@ char *prte_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap) } locality = t2; break; -#if HWLOC_API_VERSION < 0x20000 - case HWLOC_OBJ_CACHE: { - unsigned cachedepth = hwloc_get_obj_by_depth(topo, d, 0)->attr->cache.depth; - if (3 == cachedepth) { - pmix_asprintf(&t2, "%sL3%s:", (NULL == locality) ? "" : locality, tmp); - if (NULL != locality) { - free(locality); - } - locality = t2; - break; - } else if (2 == cachedepth) { - pmix_asprintf(&t2, "%sL2%s:", (NULL == locality) ? "" : locality, tmp); - if (NULL != locality) { - free(locality); - } - locality = t2; - break; - } else { - pmix_asprintf(&t2, "%sL1%s:", (NULL == locality) ? 
"" : locality, tmp); - if (NULL != locality) { - free(locality); - } - locality = t2; - break; - } - } break; -#else case HWLOC_OBJ_L3CACHE: pmix_asprintf(&t2, "%sL3%s:", (NULL == locality) ? "" : locality, tmp); if (NULL != locality) { @@ -1672,7 +1605,6 @@ char *prte_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap) } locality = t2; break; -#endif case HWLOC_OBJ_CORE: pmix_asprintf(&t2, "%sCR%s:", (NULL == locality) ? "" : locality, tmp); if (NULL != locality) { @@ -1696,7 +1628,6 @@ char *prte_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap) hwloc_bitmap_zero(result); } -#if HWLOC_API_VERSION >= 0x20000 if (prte_hwloc_base_get_locality_string_by_depth(topo, HWLOC_TYPE_DEPTH_NUMANODE, cpuset, result) == 0) { @@ -1713,7 +1644,6 @@ char *prte_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap) } hwloc_bitmap_zero(result); } -#endif hwloc_bitmap_free(result); hwloc_bitmap_free(cpuset); @@ -1730,9 +1660,7 @@ char *prte_hwloc_base_get_location(char *locality, hwloc_obj_type_t type, unsign char **loc; char *srch, *ans = NULL; size_t n; -#if HWLOC_API_VERSION >= 0x20000 PRTE_HIDE_UNUSED_PARAMS(index); -#endif if (NULL == locality) { return NULL; @@ -1744,17 +1672,6 @@ char *prte_hwloc_base_get_location(char *locality, hwloc_obj_type_t type, unsign case HWLOC_OBJ_PACKAGE: srch = "SK"; break; -#if HWLOC_API_VERSION < 0x20000 - case HWLOC_OBJ_CACHE: - if (3 == index) { - srch = "L3"; - } else if (2 == index) { - srch = "L2"; - } else { - srch = "L1"; - } - break; -#else case HWLOC_OBJ_L3CACHE: srch = "L3"; break; @@ -1764,7 +1681,6 @@ char *prte_hwloc_base_get_location(char *locality, hwloc_obj_type_t type, unsign case HWLOC_OBJ_L1CACHE: srch = "L1"; break; -#endif case HWLOC_OBJ_CORE: srch = "CR"; break; @@ -1854,33 +1770,21 @@ prte_hwloc_locality_t prte_hwloc_compute_relative_locality(char *loc1, char *loc int prte_hwloc_base_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlpath, int *buflen) { -#if 
HWLOC_API_VERSION < 0x00020000 - return hwloc_topology_export_xmlbuffer(topology, xmlpath, buflen); -#else return hwloc_topology_export_xmlbuffer(topology, xmlpath, buflen, 0); -#endif } int prte_hwloc_base_topology_set_flags(hwloc_topology_t topology, unsigned long flags, bool io) { if (io) { -#if HWLOC_API_VERSION < 0x00020000 - flags |= HWLOC_TOPOLOGY_FLAG_IO_DEVICES; -#else int ret = hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT); if (0 != ret) { return ret; } -#endif } // Blacklist the "gl" component due to potential conflicts. // See "https://github.com/open-mpi/ompi/issues/10025" for // an explanation -#if HWLOC_VERSION_MAJOR > 2 hwloc_topology_set_components(topology, HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST, "gl"); -#elif HWLOC_VERSION_MAJOR == 2 && HWLOC_VERSION_MINOR >= 1 - hwloc_topology_set_components(topology, HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST, "gl"); -#endif return hwloc_topology_set_flags(topology, flags); } diff --git a/src/include/prte_portable_platform.h b/src/include/prte_portable_platform.h index babd7b518c..52e8c0aaac 100644 --- a/src/include/prte_portable_platform.h +++ b/src/include/prte_portable_platform.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights * reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -15,13 +15,6 @@ #ifndef PRTE_PORTABLE_PLATFORM_H #define PRTE_PORTABLE_PLATFORM_H 1 -#ifndef _PORTABLE_PLATFORM_H -#define _PORTABLE_PLATFORM_H 0 -#endif -#ifndef PLATFORM_HEADER_VERSION -#define PLATFORM_HEADER_VERSION 0 -#endif - #include "src/include/prte_portable_platform_real.h" #endif diff --git a/src/include/prte_portable_platform_real.h b/src/include/prte_portable_platform_real.h index 3a9f0cccb0..6f21dbfa8d 100644 --- a/src/include/prte_portable_platform_real.h +++ b/src/include/prte_portable_platform_real.h @@ -28,10 +28,11 @@ */ /* Publish and enforce version number for the public interface to this header */ /* YOU ARE NOT PERMITTED TO CHANGE THIS SECTION WITHOUT DIRECT APPROVAL FROM DAN BONACHEA */ -#if _PORTABLE_PLATFORM_H != PLATFORM_HEADER_VERSION \ - || PLATFORM_HEADER_VERSION < 16 +#if !defined(_PORTABLE_PLATFORM_H) || !defined(PLATFORM_HEADER_VERSION) \ + || _PORTABLE_PLATFORM_H != PLATFORM_HEADER_VERSION \ + || PLATFORM_HEADER_VERSION < 22 #undef PLATFORM_HEADER_VERSION -#define PLATFORM_HEADER_VERSION 16 +#define PLATFORM_HEADER_VERSION 22 #undef _PORTABLE_PLATFORM_H #define _PORTABLE_PLATFORM_H PLATFORM_HEADER_VERSION /* End Header versioning handshake */ @@ -115,14 +116,17 @@ #undef PLATFORM_COMPILER_UNKNOWN #undef PLATFORM_OS_FAMILYNAME +#undef PLATFORM_OS_SUBFAMILYNAME #undef PLATFORM_OS_CATAMOUNT -#undef PLATFORM_OS_CNL #undef PLATFORM_OS_BGP #undef PLATFORM_OS_BGQ -#undef PLATFORM_OS_WSL #undef PLATFORM_OS_K42 #undef PLATFORM_OS_UCLINUX #undef PLATFORM_OS_LINUX +#undef PLATFORM_OS_CNL +#undef PLATFORM_OS_SUBFAMILY_CNL +#undef PLATFORM_OS_WSL +#undef PLATFORM_OS_SUBFAMILY_WSL #undef PLATFORM_OS_BLRTS #undef PLATFORM_OS_CYGWIN #undef PLATFORM_OS_MSWINDOWS @@ -167,6 +171,7 @@ #undef PLATFORM_ARCH_AARCH64 #undef PLATFORM_ARCH_TILE #undef PLATFORM_ARCH_S390 +#undef PLATFORM_ARCH_RISCV #undef PLATFORM_ARCH_UNKNOWN /* prevent known old/broken versions of this header from loading */ @@ -291,7 
+296,7 @@ PLATFORM_COMPILER_VERSION_INT(__PATHCC__,__PATHCC_MINOR__,__PATHCC_PATCHLEVEL__+0) #define PLATFORM_COMPILER_VERSION_STR __PATHSCALE__ -#elif defined(__NVCOMPILER) // Must occur prior to PGI and CLANG +#elif defined(__NVCOMPILER) /* Must occur prior to PGI and CLANG */ #define PLATFORM_COMPILER_NVHPC 1 #define PLATFORM_COMPILER_FAMILYNAME NVHPC #define PLATFORM_COMPILER_FAMILYID 20 @@ -590,7 +595,7 @@ #define PLATFORM_COMPILER_VERSION_STR __clang_version__ #endif -// NOTE: PLATFORM_COMPILER_FAMILYID "20" is allocted to NVHPC, appearing earlier +/* NOTE: PLATFORM_COMPILER_FAMILYID "20" is allocted to NVHPC, appearing earlier */ #else /* unknown compiler */ #define PLATFORM_COMPILER_UNKNOWN 1 @@ -750,17 +755,17 @@ PLATFORM_OS_: defined to a positive value if OS belongs to a given family, undef otherwise PLATFORM_OS_FAMILYNAME: - unquoted token which provides the compiler family name + unquoted token which provides the OS family name + + Some systems also define a subfamily: + PLATFORM_OS_SUBFAMILY_: positive value or undef + PLATFORM_OS_SUBFAMILYNAME: unquoted token for subfamily name or undef */ #if defined(__LIBCATAMOUNT__) || defined(__QK_USER__) #define PLATFORM_OS_CATAMOUNT 1 #define PLATFORM_OS_FAMILYNAME CATAMOUNT -#elif defined(__CRAYXT_COMPUTE_LINUX_TARGET) - #define PLATFORM_OS_CNL 1 - #define PLATFORM_OS_FAMILYNAME CNL - #elif defined(GASNETI_ARCH_BGP) || defined(__bgp__) #define PLATFORM_OS_BGP 1 #define PLATFORM_OS_FAMILYNAME BGP @@ -769,10 +774,6 @@ #define PLATFORM_OS_BGQ 1 #define PLATFORM_OS_FAMILYNAME BGQ -#elif defined(GASNETI_ARCH_WSL) - #define PLATFORM_OS_WSL 1 - #define PLATFORM_OS_FAMILYNAME WSL - #elif defined(__K42) #define PLATFORM_OS_K42 1 #define PLATFORM_OS_FAMILYNAME K42 @@ -784,6 +785,14 @@ #elif defined(__linux) || defined(__linux__) || defined(__gnu_linux__) #define PLATFORM_OS_LINUX 1 #define PLATFORM_OS_FAMILYNAME LINUX + #if defined(GASNETI_ARCH_WSL) + #define PLATFORM_OS_SUBFAMILY_WSL 1 + #define 
PLATFORM_OS_SUBFAMILYNAME WSL + #elif defined(__CRAYXT_COMPUTE_LINUX_TARGET) + /* NOTE: As of 2022-07 this is ONLY defined for the Cray cc/CC wrappers, and not the raw PrgEnv compilers */ + #define PLATFORM_OS_SUBFAMILY_CNL 1 + #define PLATFORM_OS_SUBFAMILYNAME CNL + #endif #elif defined(__blrts) || defined(__blrts__) || defined(__gnu_blrts__) #define PLATFORM_OS_BLRTS 1 @@ -1031,6 +1040,16 @@ #define _PLATFORM_ARCH_32 1 #endif +#elif defined(__riscv) + #define PLATFORM_ARCH_RISCV 1 + #define PLATFORM_ARCH_FAMILYNAME RISCV + #define _PLATFORM_ARCH_LITTLE_ENDIAN 1 + #if __riscv_xlen == 32 + #define _PLATFORM_ARCH_32 1 + #else /* (__riscv_xlen == 64) || (__riscv_xlen == 128) */ + #define _PLATFORM_ARCH_64 1 + #endif + #else /* unknown CPU */ #define PLATFORM_ARCH_UNKNOWN 1 #define PLATFORM_ARCH_FAMILYNAME UNKNOWN @@ -1053,9 +1072,9 @@ #elif defined(__LITTLE_ENDIAN__) || defined(WORDS_LITTLEENDIAN) || \ ( __BYTE_ORDER__ > 0 && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ) #define PLATFORM_ARCH_LITTLE_ENDIAN 1 -#elif defined(_PLATFORM_ARCH_BIG_ENDIAN) && _PLATFORM_ARCH_BIG_ENDIAN +#elif defined(_PLATFORM_ARCH_BIG_ENDIAN) #define PLATFORM_ARCH_BIG_ENDIAN 1 -#elif defined(_PLATFORM_ARCH_LITTLE_ENDIAN) && _PLATFORM_ARCH_LITTLE_ENDIAN +#elif defined(_PLATFORM_ARCH_LITTLE_ENDIAN) #define PLATFORM_ARCH_LITTLE_ENDIAN 1 #endif #undef _PLATFORM_ARCH_BIG_ENDIAN @@ -1086,9 +1105,9 @@ defined(__arch32__) || defined(__32BIT__) || \ __INTPTR_MAX__ == 2147483647 #define PLATFORM_ARCH_32 1 -#elif defined(_PLATFORM_ARCH_64) && _PLATFORM_ARCH_64 +#elif defined(_PLATFORM_ARCH_64) #define PLATFORM_ARCH_64 1 -#elif defined(_PLATFORM_ARCH_32) && _PLATFORM_ARCH_32 +#elif defined(_PLATFORM_ARCH_32) #define PLATFORM_ARCH_32 1 #endif #undef _PLATFORM_ARCH_64 @@ -1128,16 +1147,21 @@ int main(void) { PLATFORM_DISPX(COMPILER_VERSION); PLATFORM_DISP(COMPILER_VERSION_STR); PLATFORM_DISP(COMPILER_IDSTR); - #if PLATFORM_COMPILER_C_LANGLVL + #ifdef PLATFORM_COMPILER_C_LANGLVL 
PLATFORM_DISPI(COMPILER_C_LANGLVL); - #elif PLATFORM_COMPILER_CXX_LANGLVL + #elif defined(PLATFORM_COMPILER_CXX_LANGLVL) PLATFORM_DISPI(COMPILER_CXX_LANGLVL); #else printf("WARNING: Missing PLATFORM_COMPILER_C(XX)_LANGLVL!"); #endif PLATFORM_DISP(OS_FAMILYNAME); + #ifdef PLATFORM_OS_SUBFAMILYNAME + { const char * OS_SUBFAMILYNAME = PLATFORM_STRINGIFY(PLATFORM_OS_SUBFAMILYNAME); + PLATFORM_DISP(OS_SUBFAMILYNAME); + } + #endif PLATFORM_DISP(ARCH_FAMILYNAME); - #if PLATFORM_ARCH_32 + #ifdef PLATFORM_ARCH_32 PLATFORM_DISPI(ARCH_32); assert(sizeof(void *) == 4); #else @@ -1146,7 +1170,7 @@ int main(void) { #endif { int x = 0x00FF; unsigned char *p = (unsigned char *)&x; - #if PLATFORM_ARCH_BIG_ENDIAN + #ifdef PLATFORM_ARCH_BIG_ENDIAN PLATFORM_DISPI(ARCH_BIG_ENDIAN); assert(*p == 0); #else diff --git a/src/mca/ess/base/ess_base_std_prted.c b/src/mca/ess/base/ess_base_std_prted.c index 4e48cce7d6..221d3e6632 100644 --- a/src/mca/ess/base/ess_base_std_prted.c +++ b/src/mca/ess/base/ess_base_std_prted.c @@ -53,11 +53,9 @@ #include "src/mca/grpcomm/grpcomm.h" #include "src/mca/iof/base/base.h" #include "src/mca/odls/base/base.h" -#include "src/mca/oob/base/base.h" #include "src/mca/plm/base/base.h" #include "src/mca/prtereachable/base/base.h" #include "src/mca/rmaps/base/base.h" -#include "src/mca/rtc/base/base.h" #include "src/mca/schizo/base/base.h" #include "src/mca/state/base/base.h" #include "src/mca/state/state.h" @@ -99,9 +97,8 @@ int prte_ess_base_prted_setup(void) int fd; char log_file[PATH_MAX]; char *error = NULL; - char *uri = NULL; - char *tmp; - prte_job_t *jdata; + char *tmp = NULL; + prte_job_t *jdata = NULL; prte_proc_t *proc; prte_app_context_t *app; hwloc_obj_t obj; @@ -109,7 +106,6 @@ int prte_ess_base_prted_setup(void) prte_topology_t *t; prte_ess_base_signal_t *sig; int idx; - pmix_value_t val; plm_in_use = false; @@ -245,7 +241,7 @@ int prte_ess_base_prted_setup(void) /* obviously, we have "reported" */ jdata->num_reported = 1; - /* setup my session 
directory here as the OOB may need it */ + /* setup my session directory here */ PMIX_OUTPUT_VERBOSE( (2, prte_ess_base_framework.framework_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", @@ -316,61 +312,15 @@ int prte_ess_base_prted_setup(void) error = "prte_prtereachable_base_select"; goto error; } - if (PRTE_SUCCESS - != (ret = pmix_mca_base_framework_open(&prte_oob_base_framework, - PMIX_MCA_BASE_OPEN_DEFAULT))) { - PRTE_ERROR_LOG(ret); - error = "prte_oob_base_open"; - goto error; - } - if (PRTE_SUCCESS != (ret = prte_oob_base_select())) { + if (PRTE_SUCCESS != (ret = prte_rml_open())) { PRTE_ERROR_LOG(ret); - error = "prte_oob_base_select"; + error = "prte_rml_open"; goto error; } - prte_rml_open(); /* it is now safe to start the pmix server */ pmix_server_start(); - /* store our URI for later */ - prte_oob_base_get_addr(&uri); - PMIX_VALUE_LOAD(&val, uri, PMIX_STRING); - ret = PMIx_Store_internal(PRTE_PROC_MY_NAME, PMIX_PROC_URI, &val); - if (PMIX_SUCCESS != ret) { - PMIX_VALUE_DESTRUCT(&val); - error = "store MY URI"; - ret = PRTE_ERROR; - goto error; - } - PMIX_VALUE_DESTRUCT(&val); - free(uri); - - if (NULL != prte_process_info.my_hnp_uri) { - /* extract the HNP's name so we can update the routing table */ - ret = prte_rml_parse_uris(prte_process_info.my_hnp_uri, - PRTE_PROC_MY_HNP, - NULL); - if (PRTE_SUCCESS != ret) { - PRTE_ERROR_LOG(ret); - error = "prte_rml_parse_HNP"; - goto error; - } - /* Set the contact info in the RML - this won't actually establish - * the connection, but just tells the RML how to reach the HNP - * if/when we attempt to send to it - */ - PMIX_VALUE_LOAD(&val, prte_process_info.my_hnp_uri, PMIX_STRING); - ret = PMIx_Store_internal(PRTE_PROC_MY_HNP, PMIX_PROC_URI, &val); - if (PMIX_SUCCESS != ret) { - PMIX_VALUE_DESTRUCT(&val); - error = "store HNP URI"; - ret = PRTE_ERROR; - goto error; - } - PMIX_VALUE_DESTRUCT(&val); - } - /* select the errmgr */ if (PRTE_SUCCESS != (ret = prte_errmgr_base_select())) { 
PRTE_ERROR_LOG(ret); @@ -405,19 +355,6 @@ int prte_ess_base_prted_setup(void) error = "prte_odls_base_select"; goto error; } - /* Open/select the rtc */ - if (PRTE_SUCCESS - != (ret = pmix_mca_base_framework_open(&prte_rtc_base_framework, - PMIX_MCA_BASE_OPEN_DEFAULT))) { - PRTE_ERROR_LOG(ret); - error = "prte_rtc_base_open"; - goto error; - } - if (PRTE_SUCCESS != (ret = prte_rtc_base_select())) { - PRTE_ERROR_LOG(ret); - error = "prte_rtc_base_select"; - goto error; - } if (PRTE_SUCCESS != (ret = pmix_mca_base_framework_open(&prte_rmaps_base_framework, PMIX_MCA_BASE_OPEN_DEFAULT))) { @@ -543,11 +480,9 @@ int prte_ess_base_prted_finalize(void) (void) pmix_mca_base_framework_close(&prte_plm_base_framework); /* make sure our local procs are dead */ prte_odls.kill_local_procs(NULL); - (void) pmix_mca_base_framework_close(&prte_rtc_base_framework); (void) pmix_mca_base_framework_close(&prte_odls_base_framework); (void) pmix_mca_base_framework_close(&prte_errmgr_base_framework); prte_rml_close(); - (void) pmix_mca_base_framework_close(&prte_oob_base_framework); (void) pmix_mca_base_framework_close(&prte_prtereachable_base_framework); (void) pmix_mca_base_framework_close(&prte_state_base_framework); diff --git a/src/mca/ess/hnp/ess_hnp_module.c b/src/mca/ess/hnp/ess_hnp_module.c index ca0185349f..cfd3db4115 100644 --- a/src/mca/ess/hnp/ess_hnp_module.c +++ b/src/mca/ess/hnp/ess_hnp_module.c @@ -59,13 +59,11 @@ #include "src/mca/grpcomm/base/base.h" #include "src/mca/iof/base/base.h" #include "src/mca/odls/base/base.h" -#include "src/mca/oob/base/base.h" #include "src/mca/plm/base/base.h" #include "src/mca/plm/plm.h" #include "src/mca/prtereachable/base/base.h" #include "src/mca/ras/base/base.h" #include "src/mca/rmaps/base/base.h" -#include "src/mca/rtc/base/base.h" #include "src/mca/schizo/base/base.h" #include "src/mca/state/base/base.h" #include "src/mca/state/state.h" @@ -107,8 +105,6 @@ static int rte_init(int argc, char **argv) prte_app_context_t *app; int idx; 
prte_topology_t *t; - pmix_value_t pval; - pmix_status_t pret; PRTE_HIDE_UNUSED_PARAMS(argc); /* run the prolog */ @@ -234,7 +230,7 @@ static int rte_init(int argc, char **argv) jdata->num_reported = 1; jdata->num_daemons_reported = 1; - /* setup my session directory here as the OOB may need it */ + /* setup my session directory here */ PMIX_OUTPUT_VERBOSE((2, prte_debug_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), @@ -268,38 +264,11 @@ static int rte_init(int argc, char **argv) error = "prte_prtereachable_base_select"; goto error; } - /* - * OOB Layer - */ - if (PRTE_SUCCESS - != (ret = pmix_mca_base_framework_open(&prte_oob_base_framework, - PMIX_MCA_BASE_OPEN_DEFAULT))) { - error = "prte_oob_base_open"; - goto error; - } - if (PRTE_SUCCESS != (ret = prte_oob_base_select())) { - error = "prte_oob_base_select"; - goto error; - } - - // set our RML address - prte_oob_base_get_addr(&proc->rml_uri); - prte_process_info.my_hnp_uri = strdup(proc->rml_uri); - /* store it in the local PMIx repo for later retrieval */ - PMIX_VALUE_LOAD(&pval, proc->rml_uri, PMIX_STRING); - if (PMIX_SUCCESS != (pret = PMIx_Store_internal(PRTE_PROC_MY_NAME, PMIX_PROC_URI, &pval))) { - PMIX_ERROR_LOG(pret); - ret = PRTE_ERROR; - PMIX_VALUE_DESTRUCT(&pval); - error = "store uri"; + if (PRTE_SUCCESS != (ret = prte_rml_open())) { + PRTE_ERROR_LOG(ret); + error = "prte_rml_open"; goto error; } - PMIX_VALUE_DESTRUCT(&pval); - - /* - * Runtime Messaging Layer - */ - prte_rml_open(); /* it is now safe to start the pmix server */ pmix_server_start(); @@ -406,19 +375,6 @@ static int rte_init(int argc, char **argv) error = "prte_odls_base_select"; goto error; } - /* Open/select the rtc */ - ret = pmix_mca_base_framework_open(&prte_rtc_base_framework, - PMIX_MCA_BASE_OPEN_DEFAULT); - if (PRTE_SUCCESS != ret) { - PRTE_ERROR_LOG(ret); - error = "prte_rtc_base_open"; - goto error; - } - if (PRTE_SUCCESS != (ret = prte_rtc_base_select())) { - 
PRTE_ERROR_LOG(ret); - error = "prte_rtc_base_select"; - goto error; - } /* set the pmix_output hnp file location to be in the * proc-specific session directory. */ @@ -471,7 +427,7 @@ static int rte_init(int argc, char **argv) static int rte_finalize(void) { /* first stage shutdown of the errmgr, deregister the handler but keep - * the required facilities until the rml and oob are offline */ + * the required facilities until the rml is offline */ prte_errmgr.finalize(); /* close frameworks */ @@ -483,10 +439,8 @@ static int rte_finalize(void) /* make sure our local procs are dead */ prte_odls.kill_local_procs(NULL); } - (void) pmix_mca_base_framework_close(&prte_rtc_base_framework); (void) pmix_mca_base_framework_close(&prte_odls_base_framework); prte_rml_close(); - (void) pmix_mca_base_framework_close(&prte_oob_base_framework); (void) pmix_mca_base_framework_close(&prte_prtereachable_base_framework); (void) pmix_mca_base_framework_close(&prte_errmgr_base_framework); (void) pmix_mca_base_framework_close(&prte_state_base_framework); diff --git a/src/mca/odls/base/Makefile.am b/src/mca/odls/base/Makefile.am index c72670b355..db790c9ede 100644 --- a/src/mca/odls/base/Makefile.am +++ b/src/mca/odls/base/Makefile.am @@ -13,7 +13,7 @@ # All rights reserved # Copyright (c) 2019 Intel, Inc. All rights reserved. # Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -27,6 +27,7 @@ headers += \ libprtemca_odls_la_SOURCES += \ base/odls_base_frame.c \ base/odls_base_select.c \ - base/odls_base_default_fns.c + base/odls_base_default_fns.c \ + base/odls_base_bind.c dist_prtedata_DATA += base/help-prte-odls-base.txt diff --git a/src/mca/odls/base/base.h b/src/mca/odls/base/base.h index b713866cc9..af33ba957e 100644 --- a/src/mca/odls/base/base.h +++ b/src/mca/odls/base/base.h @@ -12,7 +12,7 @@ * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2017-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -157,6 +157,10 @@ PRTE_EXPORT void prte_odls_base_start_threads(prte_job_t *jdata); PRTE_EXPORT void prte_odls_base_harvest_threads(void); +/* Binding support */ +PRTE_EXPORT void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd); + + #define PRTE_ODLS_SET_ERROR(ns, s, j) \ do { \ int _idx; \ diff --git a/src/mca/rtc/hwloc/rtc_hwloc.c b/src/mca/odls/base/odls_base_bind.c similarity index 55% rename from src/mca/rtc/hwloc/rtc_hwloc.c rename to src/mca/odls/base/odls_base_bind.c index d1ba2ff8f9..441b77589d 100644 --- a/src/mca/rtc/hwloc/rtc_hwloc.c +++ b/src/mca/odls/base/odls_base_bind.c @@ -1,10 +1,25 @@ /* - * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2017-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2017 Inria. All rights reserved. - * Copyright (c) 2019 Research Organization for Information Science + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2011 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2017-2020 IBM Corporation. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -12,78 +27,142 @@ * $HEADER$ */ -#define PRTE_HWLOC_WANT_SHMEM 1 - #include "prte_config.h" #include "constants.h" #include "types.h" -#include +#ifdef HAVE_SYS_WAIT_H +# include +#endif #include -#include -#include -#include -#ifdef HAVE_UNISTD_H -# include -#endif /* HAVE_UNISTD_H */ -#include -#include #ifdef HAVE_SYS_STAT_H # include +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_SYS_PARAM_H +# include #endif -#if HAVE_FCNTL_H -# include -#endif +#include +#include +#include +#include -#include "src/class/pmix_list.h" +#include "prte_stdint.h" #include "src/hwloc/hwloc-internal.h" -#if HWLOC_API_VERSION >= 0x20000 -# include "hwloc/shmem.h" -#endif #include "src/pmix/pmix-internal.h" -#include "src/util/pmix_argv.h" -#include "src/util/pmix_fd.h" -#include "src/util/pmix_path.h" -#include "src/util/pmix_environ.h" +#include "src/util/pmix_printf.h" #include "src/mca/errmgr/errmgr.h" -#include "src/mca/odls/base/base.h" #include "src/mca/rmaps/rmaps_types.h" #include "src/runtime/prte_globals.h" -#include "src/util/error_strings.h" +#include "src/util/name_fns.h" #include "src/util/pmix_show_help.h" -#include "rtc_hwloc.h" -#include "src/mca/rtc/base/base.h" +#include "src/mca/odls/base/base.h" -static int init(void); -static void finalize(void); -static void assign(prte_job_t *jdata); -static void set(prte_odls_spawn_caddy_t *cd, int write_fd); -static void report_binding(prte_job_t *jobdat, int rank); +static void report_binding(prte_job_t *jobdat, int rank) +{ + char *tmp1; + hwloc_cpuset_t mycpus; + bool use_hwthread_cpus; -prte_rtc_base_module_t prte_rtc_hwloc_module = {.init = init, - .finalize = finalize, - .assign = assign, - .set = set}; + /* check for type of cpu being used */ + if (prte_get_attribute(&jobdat->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { + use_hwthread_cpus = true; + } else { + use_hwthread_cpus = false; + } + /* get the cpus we are bound to */ + mycpus = hwloc_bitmap_alloc(); + 
if (hwloc_get_cpubind(prte_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) { + pmix_output(0, "Rank %d is not bound", rank); + } else { + tmp1 = prte_hwloc_base_cset2str(mycpus, use_hwthread_cpus, prte_hwloc_topology); + pmix_output(0, "Rank %d bound to %s", rank, tmp1); + free(tmp1); + } + hwloc_bitmap_free(mycpus); +} -static int init(void) +static int write_help_msg(int fd, prte_odls_pipe_err_msg_t *msg, const char *file, + const char *topic, va_list ap) { - return PRTE_SUCCESS; + int ret; + char *str; + + if (NULL == file || NULL == topic) { + return PRTE_ERR_BAD_PARAM; + } + + str = pmix_show_help_vstring(file, topic, true, ap); + + msg->file_str_len = (int) strlen(file); + if (msg->file_str_len > PRTE_ODLS_MAX_FILE_LEN) { + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + return PRTE_ERR_BAD_PARAM; + } + msg->topic_str_len = (int) strlen(topic); + if (msg->topic_str_len > PRTE_ODLS_MAX_TOPIC_LEN) { + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + return PRTE_ERR_BAD_PARAM; + } + msg->msg_str_len = (int) strlen(str); + + /* Only keep writing if each write() succeeds */ + if (PRTE_SUCCESS != (ret = pmix_fd_write(fd, sizeof(*msg), msg))) { + goto out; + } + if (msg->file_str_len > 0 + && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->file_str_len, file))) { + goto out; + } + if (msg->topic_str_len > 0 + && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->topic_str_len, topic))) { + goto out; + } + if (msg->msg_str_len > 0 && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->msg_str_len, str))) { + goto out; + } + +out: + free(str); + return ret; } -static void finalize(void) +static int send_warn_show_help(int fd, const char *file, const char *topic, ...) 
{ - return; + int ret; + va_list ap; + prte_odls_pipe_err_msg_t msg; + + msg.fatal = false; + msg.exit_status = 0; /* ignored */ + + /* Send it */ + va_start(ap, topic); + ret = write_help_msg(fd, &msg, file, topic, ap); + va_end(ap); + + return ret; } -static void assign(prte_job_t *jdata) +static void send_error_show_help(int fd, int exit_status, const char *file, + const char *topic, ...) { - PRTE_HIDE_UNUSED_PARAMS(jdata); - return; + va_list ap; + prte_odls_pipe_err_msg_t msg; + + msg.fatal = true; + msg.exit_status = exit_status; + + /* Send it */ + va_start(ap, topic); + write_help_msg(fd, &msg, file, topic, ap); + va_end(ap); + + exit(exit_status); } -static void set(prte_odls_spawn_caddy_t *cd, int write_fd) +void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd) { prte_job_t *jobdat = cd->jdata; prte_proc_t *child = cd->child; @@ -93,13 +172,14 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) int rc = PRTE_ERROR; char *msg; - pmix_output_verbose(2, prte_rtc_base_framework.framework_output, "%s hwloc:set on child %s", + pmix_output_verbose(2, prte_odls_base_framework.framework_output, + "%s hwloc:set on child %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == child) ? "NULL" : PRTE_NAME_PRINT(&child->name)); if (NULL == jobdat || NULL == child) { /* nothing for us to do */ - pmix_output_verbose(2, prte_rtc_base_framework.framework_output, + pmix_output_verbose(2, prte_odls_base_framework.framework_output, "%s hwloc:set jobdat %s child %s - nothing to do", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == jobdat) ? 
"NULL" : PRTE_JOBID_PRINT(jobdat->nspace), @@ -113,16 +193,12 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) if (NULL != prte_daemon_cores) { root = hwloc_get_root_obj(prte_hwloc_topology); if (NULL == root->userdata) { - prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", - "incorrectly bound", prte_process_info.nodename, - context->app, __FILE__, __LINE__); + send_warn_show_help(write_fd, "help-prte-odls-default.txt", + "incorrectly bound", prte_process_info.nodename, + context->app, __FILE__, __LINE__); } /* bind this proc to all available processors */ -#if HWLOC_API_VERSION < 0x20000 - cpuset = root->allowed_cpuset; -#else cpuset = (hwloc_cpuset_t)hwloc_topology_get_allowed_cpuset(prte_hwloc_topology); -#endif rc = hwloc_set_cpubind(prte_hwloc_topology, cpuset, 0); /* if we got an error and this wasn't a default binding policy, then report it */ if (rc < 0 && PRTE_BINDING_POLICY_IS_SET(jobdat->map->binding)) { @@ -140,14 +216,14 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) if (PRTE_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). 
*/ - prte_rtc_base_send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", - "binding generic error", - prte_process_info.nodename, context->app, - msg, __FILE__, __LINE__); + send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", + "binding generic error", + prte_process_info.nodename, context->app, + msg, __FILE__, __LINE__); } else { - prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", - "not bound", prte_process_info.nodename, - context->app, msg, __FILE__, __LINE__); + send_warn_show_help(write_fd, "help-prte-odls-default.txt", + "not bound", prte_process_info.nodename, + context->app, msg, __FILE__, __LINE__); return; } } @@ -182,14 +258,14 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) * given (i.e., we are not binding due to a default policy), * send an error up the pipe (which exits -- it doesn't return). */ - prte_rtc_base_send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", - "binding generic error", - prte_process_info.nodename, context->app, msg, - __FILE__, __LINE__); + send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", + "binding generic error", + prte_process_info.nodename, context->app, msg, + __FILE__, __LINE__); } else { - prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", - "not bound", prte_process_info.nodename, - context->app, msg, __FILE__, __LINE__); + send_warn_show_help(write_fd, "help-prte-odls-default.txt", + "not bound", prte_process_info.nodename, + context->app, msg, __FILE__, __LINE__); hwloc_bitmap_free(cpuset); return; } @@ -211,14 +287,14 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) if (PRTE_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). 
*/ - prte_rtc_base_send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", - "binding generic error", - prte_process_info.nodename, context->app, msg, - __FILE__, __LINE__); + send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", + "binding generic error", + prte_process_info.nodename, context->app, msg, + __FILE__, __LINE__); } else { - prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", - "not bound", prte_process_info.nodename, - context->app, msg, __FILE__, __LINE__); + send_warn_show_help(write_fd, "help-prte-odls-default.txt", + "not bound", prte_process_info.nodename, + context->app, msg, __FILE__, __LINE__); if (NULL != tmp) { free(tmp); free(msg); @@ -251,40 +327,16 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ - prte_rtc_base_send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", - "memory binding error", - prte_process_info.nodename, context->app, msg, - __FILE__, __LINE__); + send_error_show_help(write_fd, 1, "help-prte-odls-default.txt", + "memory binding error", + prte_process_info.nodename, context->app, msg, + __FILE__, __LINE__); } else { - prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", - "memory not bound", prte_process_info.nodename, - context->app, msg, __FILE__, __LINE__); + send_warn_show_help(write_fd, "help-prte-odls-default.txt", + "memory not bound", prte_process_info.nodename, + context->app, msg, __FILE__, __LINE__); return; } } } } - -static void report_binding(prte_job_t *jobdat, int rank) -{ - char *tmp1; - hwloc_cpuset_t mycpus; - bool use_hwthread_cpus; - - /* check for type of cpu being used */ - if (prte_get_attribute(&jobdat->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; - } - /* get the cpus we are bound to */ - mycpus 
= hwloc_bitmap_alloc(); - if (hwloc_get_cpubind(prte_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) { - pmix_output(0, "Rank %d is not bound", rank); - } else { - tmp1 = prte_hwloc_base_cset2str(mycpus, use_hwthread_cpus, prte_hwloc_topology); - pmix_output(0, "Rank %d bound to %s", rank, tmp1); - free(tmp1); - } - hwloc_bitmap_free(mycpus); -} diff --git a/src/mca/odls/base/odls_base_default_fns.c b/src/mca/odls/base/odls_base_default_fns.c index 1ebd90eef8..7dc161a463 100644 --- a/src/mca/odls/base/odls_base_default_fns.c +++ b/src/mca/odls/base/odls_base_default_fns.c @@ -69,7 +69,6 @@ #include "src/mca/rmaps/rmaps_types.h" #include "src/rml/rml_contact.h" #include "src/rml/rml.h" -#include "src/mca/rtc/rtc.h" #include "src/mca/schizo/base/base.h" #include "src/mca/state/state.h" @@ -788,9 +787,6 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix lock.active = false; // we won't get a callback } - /* load any controls into the job */ - prte_rtc.assign(jdata); - /* spin up the spawn threads */ prte_odls_base_start_threads(jdata); diff --git a/src/mca/odls/default/odls_default_module.c b/src/mca/odls/default/odls_default_module.c index d9679747f6..33fb7b2e78 100644 --- a/src/mca/odls/default/odls_default_module.c +++ b/src/mca/odls/default/odls_default_module.c @@ -21,7 +21,7 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -126,7 +126,6 @@ #include "src/mca/ess/ess.h" #include "src/mca/iof/base/iof_base_setup.h" #include "src/mca/plm/plm.h" -#include "src/mca/rtc/rtc.h" #include "src/mca/state/state.h" #include "src/runtime/prte_globals.h" #include "src/runtime/prte_wait.h" @@ -333,7 +332,7 @@ static void do_child(prte_odls_spawn_caddy_t *cd, int write_fd) } /* now set any child-level controls such as binding */ - prte_rtc.set(cd, write_fd); + prte_odls_base_set(cd, write_fd); } else if (!PRTE_FLAG_TEST(cd->jdata, PRTE_JOB_FLAG_FORWARD_OUTPUT)) { /* tie stdin/out/err/internal to /dev/null */ diff --git a/src/mca/oob/Makefile.am b/src/mca/oob/Makefile.am deleted file mode 100644 index 1a97fbbe79..0000000000 --- a/src/mca/oob/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2019 Intel, Inc. All rights reserved. -# Copyright (c) 2022 Nanook Consulting. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# main library setup -noinst_LTLIBRARIES = libprtemca_oob.la -libprtemca_oob_la_SOURCES = - -# pkgdata setup -dist_prtedata_DATA = - -# local files -headers = oob.h -libprtemca_oob_la_SOURCES += $(headers) - -# Conditionally install the header files -prtedir = $(prteincludedir)/$(subdir) -nobase_prte_HEADERS = $(headers) - -include base/Makefile.am - -distclean-local: - rm -f base/static-components.h diff --git a/src/mca/oob/base/Makefile.am b/src/mca/oob/base/Makefile.am deleted file mode 100644 index b3cdb6a3d1..0000000000 --- a/src/mca/oob/base/Makefile.am +++ /dev/null @@ -1,32 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights -# reserved. -# Copyright (c) 2014-2019 Intel, Inc. All rights reserved. -# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_prtedata_DATA += base/help-oob-base.txt - -headers += \ - base/base.h - -libprtemca_oob_la_SOURCES += \ - base/oob_base_stubs.c \ - base/oob_base_frame.c \ - base/oob_base_select.c diff --git a/src/mca/oob/base/oob_base_frame.c b/src/mca/oob/base/oob_base_frame.c deleted file mode 100644 index 9102fa94d9..0000000000 --- a/src/mca/oob/base/oob_base_frame.c +++ /dev/null @@ -1,111 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2013-2017 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2017-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include "src/class/pmix_bitmap.h" -#include "src/mca/base/pmix_base.h" -#include "src/mca/mca.h" -#include "src/pmix/pmix-internal.h" -#include "src/runtime/prte_progress_threads.h" -#include "src/util/pmix_output.h" - -#include "src/mca/oob/base/base.h" -#include "src/rml/rml.h" - -/* - * The following file was created by configure. 
It contains extern - * statements and the definition of an array of pointers to each - * component's public pmix_mca_base_component_t struct. - */ - -#include "src/mca/oob/base/static-components.h" - -/* - * Global variables - */ -prte_oob_base_t prte_oob_base = {0}; - -static int prte_oob_base_close(void) -{ - prte_oob_base_component_t *component; - pmix_mca_base_component_list_item_t *cli; - - /* shutdown all active transports */ - while (NULL - != (cli = (pmix_mca_base_component_list_item_t *) pmix_list_remove_first( - &prte_oob_base.actives))) { - component = (prte_oob_base_component_t *) cli->cli_component; - if (NULL != component->shutdown) { - component->shutdown(); - } - PMIX_RELEASE(cli); - } - - /* destruct our internal lists */ - PMIX_DESTRUCT(&prte_oob_base.actives); - - /* release all peers from the list */ - PMIX_LIST_DESTRUCT(&prte_oob_base.peers); - - return pmix_mca_base_framework_components_close(&prte_oob_base_framework, NULL); -} - -/** - * Function for finding and opening either all MCA components, - * or the one that was specifically requested via a MCA parameter. 
- */ -static int prte_oob_base_open(pmix_mca_base_open_flag_t flags) -{ - /* setup globals */ - prte_oob_base.max_uri_length = -1; - PMIX_CONSTRUCT(&prte_oob_base.peers, pmix_list_t); - PMIX_CONSTRUCT(&prte_oob_base.actives, pmix_list_t); - - /* Open up all available components */ - return pmix_mca_base_framework_components_open(&prte_oob_base_framework, flags); -} - -PMIX_MCA_BASE_FRAMEWORK_DECLARE(prte, oob, "Out-of-Band Messaging Subsystem", NULL, - prte_oob_base_open, prte_oob_base_close, - prte_oob_base_static_components, - PMIX_MCA_BASE_FRAMEWORK_FLAG_DEFAULT); - -PMIX_CLASS_INSTANCE(prte_oob_send_t, pmix_object_t, NULL, NULL); - -static void pr_cons(prte_oob_base_peer_t *ptr) -{ - PMIX_LOAD_PROCID(&ptr->name, NULL, PMIX_RANK_INVALID); - ptr->component = NULL; - PMIX_CONSTRUCT(&ptr->addressable, pmix_bitmap_t); - pmix_bitmap_init(&ptr->addressable, 8); -} -static void pr_des(prte_oob_base_peer_t *ptr) -{ - PMIX_DESTRUCT(&ptr->addressable); -} -PMIX_CLASS_INSTANCE(prte_oob_base_peer_t, pmix_list_item_t, pr_cons, pr_des); diff --git a/src/mca/oob/base/oob_base_select.c b/src/mca/oob/base/oob_base_select.c deleted file mode 100644 index 290a9589f1..0000000000 --- a/src/mca/oob/base/oob_base_select.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. 
- * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include -#include - -#include "src/mca/base/pmix_base.h" -#include "src/mca/mca.h" -#include "src/util/pmix_output.h" - -#include "src/util/pmix_show_help.h" - -#include "src/mca/oob/base/base.h" -#include "src/mca/oob/oob.h" -#include "src/runtime/prte_globals.h" - -/** - * Function for selecting all runnable modules from those that are - * available. - * - * Call the init function on all available modules. - */ -int prte_oob_base_select(void) -{ - pmix_mca_base_component_list_item_t *cli, *cmp, *c2; - prte_oob_base_component_t *component, *c3; - bool added; - int i, rc; - - /* Query all available components and ask if their transport is available */ - PMIX_LIST_FOREACH(cli, &prte_oob_base_framework.framework_components, - pmix_mca_base_component_list_item_t) - { - component = (prte_oob_base_component_t *) cli->cli_component; - - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: checking available component %s", - component->oob_base.pmix_mca_component_name); - - /* If there's no query function, skip it */ - if (NULL == component->available) { - pmix_output_verbose( - 5, prte_oob_base_framework.framework_output, - "mca:oob:select: Skipping component [%s]. 
It does not implement a query function", - component->oob_base.pmix_mca_component_name); - continue; - } - - /* Query the component */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Querying component [%s]", - component->oob_base.pmix_mca_component_name); - - rc = component->available(); - - /* If the component is not available, then skip it as - * it has no available interfaces - */ - if (PRTE_SUCCESS != rc && PRTE_ERR_FORCE_SELECT != rc) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Skipping component [%s] - no available interfaces", - component->oob_base.pmix_mca_component_name); - continue; - } - - /* if it fails to startup, then skip it */ - if (PRTE_SUCCESS != component->startup()) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Skipping component [%s] - failed to startup", - component->oob_base.pmix_mca_component_name); - continue; - } - - if (PRTE_ERR_FORCE_SELECT == rc) { - /* this component shall be the *only* component allowed - * for use, so shutdown and remove any prior ones */ - while (NULL - != (cmp = (pmix_mca_base_component_list_item_t *) pmix_list_remove_first( - &prte_oob_base.actives))) { - c3 = (prte_oob_base_component_t *) cmp->cli_component; - if (NULL != c3->shutdown) { - c3->shutdown(); - } - PMIX_RELEASE(cmp); - } - c2 = PMIX_NEW(pmix_mca_base_component_list_item_t); - c2->cli_component = (pmix_mca_base_component_t *) component; - pmix_list_append(&prte_oob_base.actives, &c2->super); - break; - } - - /* record it, but maintain priority order */ - added = false; - PMIX_LIST_FOREACH(cmp, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - c3 = (prte_oob_base_component_t *) cmp->cli_component; - if (c3->priority > component->priority) { - continue; - } - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Inserting component"); - c2 = 
PMIX_NEW(pmix_mca_base_component_list_item_t); - c2->cli_component = (pmix_mca_base_component_t *) component; - pmix_list_insert_pos(&prte_oob_base.actives, &cmp->super, &c2->super); - added = true; - break; - } - if (!added) { - /* add to end */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Adding component to end"); - c2 = PMIX_NEW(pmix_mca_base_component_list_item_t); - c2->cli_component = (pmix_mca_base_component_t *) component; - pmix_list_append(&prte_oob_base.actives, &c2->super); - } - } - - if (0 == pmix_list_get_size(&prte_oob_base.actives)) { - /* no support available means we really cannot run */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Init failed to return any available transports"); - pmix_show_help("help-oob-base.txt", "no-interfaces-avail", true); - return PRTE_ERR_SILENT; - } - - /* provide them an index so we can track their usability in a bitmap */ - i = 0; - PMIX_LIST_FOREACH(cmp, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - c3 = (prte_oob_base_component_t *) cmp->cli_component; - c3->idx = i++; - } - - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "mca:oob:select: Found %d active transports", - (int) pmix_list_get_size(&prte_oob_base.actives)); - return PRTE_SUCCESS; -} diff --git a/src/mca/oob/base/oob_base_stubs.c b/src/mca/oob/base/oob_base_stubs.c deleted file mode 100644 index 16f380a730..0000000000 --- a/src/mca/oob/base/oob_base_stubs.c +++ /dev/null @@ -1,382 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include "src/pmix/pmix-internal.h" -#include "src/runtime/prte_globals.h" -#include "src/util/pmix_argv.h" -#include "src/util/pmix_output.h" -#include "src/util/pmix_printf.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/mca/oob/base/base.h" -#include "src/rml/rml.h" -#include "src/mca/state/state.h" -#include "src/threads/pmix_threads.h" - -static prte_oob_base_peer_t* process_uri(char *uri); - -void prte_oob_base_send_nb(int fd, short args, void *cbdata) -{ - prte_oob_send_t *cd = (prte_oob_send_t *) cbdata; - prte_rml_send_t *msg; - pmix_mca_base_component_list_item_t *cli; - prte_oob_base_peer_t *pr; - int rc; - bool msg_sent; - prte_oob_base_component_t *component; - bool reachable; - char *uri; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(cd); - - /* done with this. release it now */ - msg = cd->msg; - PMIX_RELEASE(cd); - - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:base:send to target %s - attempt %u", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&msg->dst), - msg->retries); - - /* don't try forever - if we have exceeded the number of retries, - * then report this message as undeliverable even if someone continues - * to think they could reach it */ - if (prte_rml_base.max_retries <= msg->retries) { - msg->status = PRTE_ERR_NO_PATH_TO_TARGET; - PRTE_RML_SEND_COMPLETE(msg); - return; - } - - /* check if we have this peer in our list */ - pr = prte_oob_base_get_peer(&msg->dst); - if (NULL == pr) { - /* if we are abnormally terminating, or terminating the DVM, then - * don't bother looking for it */ - if (prte_abnormal_term_ordered || prte_never_launched || prte_dvm_abort_ordered) { - return; - } - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:base:send unknown peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - 
PRTE_NAME_PRINT(&msg->dst)); - /* for direct launched procs, the URI might be in the database, - * so check there next - if it is, the peer object will be added - * to our hash table. However, we don't want to chase up to the - * server after it, so indicate it is optional - */ - PRTE_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PROC_URI, &msg->dst, (char **) &uri, PMIX_STRING); - if (PRTE_SUCCESS == rc) { - if (NULL != uri) { - pr = process_uri(uri); - if (NULL == pr) { - /* that is just plain wrong */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:base:send addressee unknown %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&msg->dst)); - PRTE_ERROR_LOG(PRTE_ERR_ADDRESSEE_UNKNOWN); - msg->status = PRTE_ERR_ADDRESSEE_UNKNOWN; - PRTE_RML_SEND_COMPLETE(msg); - return; - } - } else { - PRTE_ERROR_LOG(PRTE_ERR_ADDRESSEE_UNKNOWN); - msg->status = PRTE_ERR_ADDRESSEE_UNKNOWN; - PRTE_RML_SEND_COMPLETE(msg); - return; - } - } else { - /* even though we don't know about this peer yet, we still might - * be able to get to it via routing, so ask each component if - * it can reach it - */ - reachable = false; - pr = NULL; - PMIX_LIST_FOREACH(cli, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - component = (prte_oob_base_component_t *) cli->cli_component; - if (NULL != component->is_reachable) { - if (component->is_reachable(&msg->dst)) { - /* there is a way to reach this peer - record it - * so we don't waste this time again - */ - if (NULL == pr) { - pr = PMIX_NEW(prte_oob_base_peer_t); - PMIX_XFER_PROCID(&pr->name, &msg->dst); - pmix_list_append(&prte_oob_base.peers, &pr->super); - } - /* mark that this component can reach the peer */ - pmix_bitmap_set_bit(&pr->addressable, component->idx); - /* flag that at least one component can reach this peer */ - reachable = true; - } - } - } - /* if nobody could reach it, then that's an error */ - if (!reachable) { - /* if we are a daemon or HNP, then it could be that - * this is a 
local proc we just haven't heard from - * yet due to a race condition. Check that situation */ - if (PRTE_PROC_IS_DAEMON || PRTE_PROC_IS_MASTER) { - ++msg->retries; - if (msg->retries < prte_rml_base.max_retries) { - PRTE_OOB_SEND(msg); - return; - } - } - msg->status = PRTE_ERR_ADDRESSEE_UNKNOWN; - PRTE_RML_SEND_COMPLETE(msg); - return; - } - } - } - - /* if we already have a connection to this peer, use it */ - if (NULL != pr->component) { - /* post this msg for send by this transport - the component - * runs on our event base, so we can just call their function - */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:base:send known transport for peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&msg->dst)); - if (PRTE_SUCCESS == (rc = pr->component->send_nb(msg))) { - return; - } - } - - /* if we haven't identified a transport to this peer, - * loop across all available components in priority order until - * one replies that it has a module that can reach this peer. - * Let it try to make the connection - */ - msg_sent = false; - PMIX_LIST_FOREACH(cli, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - component = (prte_oob_base_component_t *) cli->cli_component; - /* is this peer reachable via this component? */ - if (!component->is_reachable(&msg->dst)) { - continue; - } - /* it is addressable, so attempt to send via that transport */ - if (PRTE_SUCCESS == (rc = component->send_nb(msg))) { - /* the msg status will be set upon send completion/failure */ - msg_sent = true; - /* point to this transport for any future messages */ - pr->component = component; - break; - } else if (PRTE_ERR_TAKE_NEXT_OPTION != rc) { - /* components return "next option" if they can't connect - * to this peer. anything else is a true error. 
- */ - PRTE_ERROR_LOG(rc); - msg->status = rc; - PRTE_RML_SEND_COMPLETE(msg); - return; - } - } - - /* if no component can reach this peer, that's an error - post - * it back to the RML for handling - */ - if (!msg_sent) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:base:send no path to target %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&msg->dst)); - msg->status = PRTE_ERR_NO_PATH_TO_TARGET; - PRTE_RML_SEND_COMPLETE(msg); - } -} - -/** - * Obtain a uri for initial connection purposes - * - * During initial wireup, we can only transfer contact info on the daemon - * command line. This limits what we can send to a string representation of - * the actual contact info, which gets sent in a uri-like form. Not every - * oob module can support this transaction, so this function will loop - * across all oob components/modules, letting each add to the uri string if - * it supports bootstrap operations. An error will be returned in the cbfunc - * if NO component can successfully provide a contact. - * - * Note: since there is a limit to what an OS will allow on a cmd line, we - * impose a limit on the length of the resulting uri via an MCA param. The - * default value of -1 implies unlimited - however, users with large numbers - * of interfaces on their nodes may wish to restrict the size. 
- */ -void prte_oob_base_get_addr(char **uri) -{ - char *turi, *final = NULL, *tmp; - size_t len = 0; - bool one_added = false; - pmix_mca_base_component_list_item_t *cli; - prte_oob_base_component_t *component; - pmix_status_t rc; - - /* start with our process name */ - rc = prte_util_convert_process_name_to_string(&final, PRTE_PROC_MY_NAME); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - *uri = NULL; - return; - } - len = strlen(final); - - /* loop across all available modules to get their input - * up to the max length - */ - PMIX_LIST_FOREACH(cli, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - component = (prte_oob_base_component_t *) cli->cli_component; - /* ask the component for its input, obtained when it - * opened its modules - */ - if (NULL == component->get_addr) { - /* doesn't support this ability */ - continue; - } - /* the components operate within our event base, so we - * can directly call their get_uri function to get the - * pointer to the uri - this is not a copy, so - * do NOT free it! 
- */ - turi = component->get_addr(); - if (NULL != turi) { - /* check overall length for limits */ - if (0 < prte_oob_base.max_uri_length - && prte_oob_base.max_uri_length < (int) (len + strlen(turi))) { - /* cannot accept the payload */ - continue; - } - /* add new value to final one */ - pmix_asprintf(&tmp, "%s;%s", final, turi); - free(turi); - free(final); - final = tmp; - len = strlen(final); - /* flag that at least one contributed */ - one_added = true; - } - } - - if (!one_added) { - /* nobody could contribute */ - if (NULL != final) { - free(final); - final = NULL; - } - } - - *uri = final; -} - -static prte_oob_base_peer_t* process_uri(char *uri) -{ - pmix_proc_t peer; - char *cptr; - pmix_mca_base_component_list_item_t *cli; - prte_oob_base_component_t *component; - char **uris = NULL; - prte_oob_base_peer_t *pr; - - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s:set_addr processing uri %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uri); - - /* find the first semi-colon in the string */ - cptr = strchr(uri, ';'); - if (NULL == cptr) { - /* got a problem - there must be at least two fields, - * the first containing the process name of our peer - * and all others containing the OOB contact info - */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return NULL; - } - *cptr = '\0'; - cptr++; - /* the first field is the process name, so convert it */ - prte_util_convert_string_to_process_name(&peer, uri); - - /* if the peer is us, no need to go further as we already - * know our own contact info - */ - if (PMIX_CHECK_PROCID(&peer, PRTE_PROC_MY_NAME)) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s:set_addr peer %s is me", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&peer)); - return NULL; - } - - /* split the rest of the uri into component parts */ - uris = PMIX_ARGV_SPLIT_COMPAT(cptr, ';'); - - /* get the peer object for this process */ - pr = prte_oob_base_get_peer(&peer); - if (NULL == pr) { - pr = 
PMIX_NEW(prte_oob_base_peer_t); - PMIX_XFER_PROCID(&pr->name, &peer); - pmix_list_append(&prte_oob_base.peers, &pr->super); - } - - /* loop across all available components and let them extract - * whatever piece(s) of the uri they find relevant - they - * are all operating on our event base, so we can just - * directly call their functions - */ - PMIX_LIST_FOREACH(cli, &prte_oob_base.actives, pmix_mca_base_component_list_item_t) - { - component = (prte_oob_base_component_t *) cli->cli_component; - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s:set_addr checking if peer %s is reachable via component %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer), - component->oob_base.pmix_mca_component_name); - if (NULL != component->set_addr) { - if (PRTE_SUCCESS == component->set_addr(&peer, uris)) { - /* this component found reachable addresses - * in the uris - */ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s: peer %s is reachable via component %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer), - component->oob_base.pmix_mca_component_name); - pmix_bitmap_set_bit(&pr->addressable, component->idx); - } else { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s: peer %s is NOT reachable via component %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer), - component->oob_base.pmix_mca_component_name); - } - } - } - PMIX_ARGV_FREE_COMPAT(uris); - return pr; -} - -prte_oob_base_peer_t *prte_oob_base_get_peer(const pmix_proc_t *pr) -{ - prte_oob_base_peer_t *peer; - - PMIX_LIST_FOREACH(peer, &prte_oob_base.peers, prte_oob_base_peer_t) - { - if (PMIX_CHECK_PROCID(pr, &peer->name)) { - return peer; - } - } - return NULL; -} diff --git a/src/mca/oob/base/owner.txt b/src/mca/oob/base/owner.txt deleted file mode 100644 index 2d23c9be65..0000000000 --- a/src/mca/oob/base/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is 
responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: project -status: maintenance diff --git a/src/mca/oob/oob.h b/src/mca/oob/oob.h deleted file mode 100644 index 6c6e3294f3..0000000000 --- a/src/mca/oob/oob.h +++ /dev/null @@ -1,83 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - * Contains the internal functions and typedefs for the use of the oob - */ - -#ifndef MCA_OOB_H_ -#define MCA_OOB_H_ - -#include "prte_config.h" -#include "types.h" - -#ifdef HAVE_UNISTD_H -# include -#endif -#ifdef HAVE_SYS_UIO_H -# include -#endif -#ifdef HAVE_NET_UIO_H -# include -#endif - -#include "src/class/pmix_list.h" -#include "src/class/pmix_pointer_array.h" -#include "src/mca/mca.h" -#include "src/pmix/pmix-internal.h" - -#include "src/rml/rml_types.h" - -BEGIN_C_DECLS - -typedef int (*mca_oob_base_component_avail_fn_t)(void); -typedef int (*mca_oob_base_component_startup_fn_t)(void); -typedef void (*mca_oob_base_component_shutdown_fn_t)(void); -typedef int (*mca_oob_base_component_send_fn_t)(prte_rml_send_t *msg); -typedef char *(*mca_oob_base_component_get_addr_fn_t)(void); -typedef int (*mca_oob_base_component_set_addr_fn_t)(pmix_proc_t *peer, char **uris); -typedef bool (*mca_oob_base_component_is_reachable_fn_t)(pmix_proc_t *peer); -typedef void (*mca_oob_ping_callback_fn_t)(int status, void *cbdata); - -typedef struct { - pmix_mca_base_component_t oob_base; - int idx; - int priority; - mca_oob_base_component_avail_fn_t available; - mca_oob_base_component_startup_fn_t startup; - mca_oob_base_component_shutdown_fn_t shutdown; - mca_oob_base_component_send_fn_t send_nb; - mca_oob_base_component_get_addr_fn_t get_addr; - mca_oob_base_component_set_addr_fn_t set_addr; - mca_oob_base_component_is_reachable_fn_t is_reachable; -} prte_oob_base_component_t; - -/** - * Macro for use in components that are of type oob - */ -#define PRTE_OOB_BASE_VERSION_2_0_0 PRTE_MCA_BASE_VERSION_3_0_0("oob", 2, 0, 0) - -END_C_DECLS - -#endif diff --git a/src/mca/oob/tcp/Makefile.am b/src/mca/oob/tcp/Makefile.am deleted file mode 100644 index 07e6c7bfbd..0000000000 --- a/src/mca/oob/tcp/Makefile.am +++ /dev/null @@ -1,63 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of 
Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2012-2013 Los Alamos National Security, LLC. -# All rights reserved -# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# Copyright (c) 2022 Nanook Consulting. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_prtedata_DATA = help-oob-tcp.txt - -sources = \ - oob_tcp_component.h \ - oob_tcp.h \ - oob_tcp_listener.h \ - oob_tcp_common.h \ - oob_tcp_component.c \ - oob_tcp_connection.h \ - oob_tcp_sendrecv.h \ - oob_tcp_hdr.h \ - oob_tcp_peer.h \ - oob_tcp.c \ - oob_tcp_listener.c \ - oob_tcp_common.c \ - oob_tcp_connection.c \ - oob_tcp_sendrecv.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_prte_oob_tcp_DSO -component_noinst = -component_install = prte_mca_oob_tcp.la -else -component_noinst = libprtemca_oob_tcp.la -component_install = -endif - -mcacomponentdir = $(prtelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -prte_mca_oob_tcp_la_SOURCES = $(sources) -prte_mca_oob_tcp_la_LDFLAGS = -module -avoid-version -prte_mca_oob_tcp_la_LIBADD = $(top_builddir)/src/libprrte.la - -noinst_LTLIBRARIES = $(component_noinst) -libprtemca_oob_tcp_la_SOURCES = $(sources) -libprtemca_oob_tcp_la_LDFLAGS = -module -avoid-version diff --git a/src/mca/oob/tcp/configure.m4 b/src/mca/oob/tcp/configure.m4 deleted file mode 100644 index 25ea055125..0000000000 --- a/src/mca/oob/tcp/configure.m4 +++ /dev/null @@ -1,39 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2019 Intel, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_oob_tcp_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_prte_oob_tcp_CONFIG],[ - AC_CONFIG_FILES([src/mca/oob/tcp/Makefile]) - - # check for sockaddr_in (a good sign we have TCP) - AC_CHECK_TYPES([struct sockaddr_in], - [oob_tcp_happy="yes"], - [oob_tcp_happy="no"], - [AC_INCLUDES_DEFAULT -#ifdef HAVE_NETINET_IN_H -#include -#endif]) - - AS_IF([test "$oob_tcp_happy" = "yes"], [$1], [$2]) -])dnl diff --git a/src/mca/oob/tcp/oob_tcp.c b/src/mca/oob/tcp/oob_tcp.c deleted file mode 100644 index a78ce60079..0000000000 --- a/src/mca/oob/tcp/oob_tcp.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2016-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "prte_config.h" -#include "types.h" - -#ifdef HAVE_UNISTD_H -# include -#endif -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#include -#ifdef HAVE_NETINET_IN_H -# include -#endif -#ifdef HAVE_ARPA_INET_H -# include -#endif -#ifdef HAVE_NETDB_H -# include -#endif -#include - -#include "src/include/prte_socket_errno.h" -#include "src/runtime/prte_progress_threads.h" -#include "src/util/pmix_argv.h" -#include "src/util/error.h" -#include "src/util/pmix_if.h" -#include "src/util/pmix_net.h" -#include "src/util/pmix_output.h" -#include "src/util/pmix_show_help.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/mca/ess/ess.h" -#include "src/runtime/prte_globals.h" -#include "src/threads/pmix_threads.h" -#include "src/util/name_fns.h" -#include "src/util/pmix_parse_options.h" -#include "src/util/pmix_show_help.h" - -#include "src/mca/oob/tcp/oob_tcp.h" -#include "src/mca/oob/tcp/oob_tcp_common.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" -#include "src/mca/oob/tcp/oob_tcp_connection.h" -#include "src/mca/oob/tcp/oob_tcp_peer.h" -#include "src/mca/oob/tcp/oob_tcp_sendrecv.h" - -static void accept_connection(const int accepted_fd, const struct sockaddr *addr); -static void ping(const pmix_proc_t *proc); -static void send_nb(prte_rml_send_t *msg); - -prte_oob_tcp_module_t prte_oob_tcp_module = {.accept_connection = accept_connection, - .ping = ping, - .send_nb = send_nb}; - -/* - * Local utility functions - */ -static void recv_handler(int sd, short flags, void *user); - -/* Called by prte_oob_tcp_accept() and connection_handler() on - * a socket that has been accepted. This call finishes processing the - * socket, including setting socket options and registering for the - * OOB-level connection handshake. Used in both the threaded and - * event listen modes. 
- */ -static void accept_connection(const int accepted_fd, const struct sockaddr *addr) -{ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s accept_connection: %s:%d\n", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - pmix_net_get_hostname(addr), pmix_net_get_port(addr)); - - /* setup socket options */ - prte_oob_tcp_set_socket_options(accepted_fd); - - /* use a one-time event to wait for receipt of peer's - * process ident message to complete this connection - */ - PRTE_ACTIVATE_TCP_ACCEPT_STATE(accepted_fd, addr, recv_handler); -} - -/* API functions */ -static void ping(const pmix_proc_t *proc) -{ - prte_oob_tcp_peer_t *peer; - - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] processing ping to peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - __FILE__, __LINE__, PRTE_NAME_PRINT(proc)); - - /* do we know this peer? */ - if (NULL == (peer = prte_oob_tcp_peer_lookup(proc))) { - /* push this back to the component so it can try - * another module within this transport. 
If no - * module can be found, the component can push back - * to the framework so another component can try - */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] hop %s unknown", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - __FILE__, __LINE__, PRTE_NAME_PRINT(proc)); - PRTE_ACTIVATE_TCP_MSG_ERROR(NULL, NULL, proc, prte_mca_oob_tcp_component_hop_unknown); - return; - } - - /* if we are already connected, there is nothing to do */ - if (MCA_OOB_TCP_CONNECTED == peer->state) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] already connected to peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__, - PRTE_NAME_PRINT(proc)); - return; - } - - /* if we are already connecting, there is nothing to do */ - if (MCA_OOB_TCP_CONNECTING == peer->state || MCA_OOB_TCP_CONNECT_ACK == peer->state) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] already connecting to peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__, - PRTE_NAME_PRINT(proc)); - return; - } - - /* attempt the connection */ - peer->state = MCA_OOB_TCP_CONNECTING; - PRTE_ACTIVATE_TCP_CONN_STATE(peer, prte_oob_tcp_peer_try_connect); -} - -static void send_nb(prte_rml_send_t *msg) -{ - prte_oob_tcp_peer_t *peer; - pmix_proc_t hop; - - /* do we have a route to this peer (could be direct)? */ - PMIX_LOAD_NSPACE(hop.nspace, PRTE_PROC_MY_NAME->nspace); - hop.rank = prte_rml_get_route(msg->dst.rank); - /* do we know this hop? */ - if (NULL == (peer = prte_oob_tcp_peer_lookup(&hop))) { - /* if this message is going to the HNP, send it direct */ - if (PRTE_PROC_MY_HNP->rank == msg->dst.rank) { - hop.rank = PRTE_PROC_MY_HNP->rank; - peer = prte_oob_tcp_peer_lookup(&hop); - if (NULL != peer) { - goto send; - } - } - /* push this back to the component so it can try - * another module within this transport. 
If no - * module can be found, the component can push back - * to the framework so another component can try - */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] processing send to peer %s:%d seq_num = %d hop %s unknown", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__, - PRTE_NAME_PRINT(&msg->dst), msg->tag, msg->seq_num, - PRTE_NAME_PRINT(&hop)); - PRTE_ACTIVATE_TCP_NO_ROUTE(msg, &hop, prte_mca_oob_tcp_component_no_route); - return; - } - -send: - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s:[%s:%d] processing send to peer %s:%d seq_num = %d via %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__, - PRTE_NAME_PRINT(&msg->dst), msg->tag, msg->seq_num, - PRTE_NAME_PRINT(&peer->name)); - - /* add the msg to the hop's send queue */ - if (MCA_OOB_TCP_CONNECTED == peer->state) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s tcp:send_nb: already connected to %s - queueing for send", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); - MCA_OOB_TCP_QUEUE_SEND(msg, peer); - return; - } - - /* add the message to the queue for sending after the - * connection is formed - */ - MCA_OOB_TCP_QUEUE_PENDING(msg, peer); - - if (MCA_OOB_TCP_CONNECTING != peer->state && MCA_OOB_TCP_CONNECT_ACK != peer->state) { - /* we have to initiate the connection - again, we do not - * want to block while the connection is created. - * So throw us into an event that will create - * the connection via a mini-state-machine :-) - */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s tcp:send_nb: initiating connection to %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); - peer->state = MCA_OOB_TCP_CONNECTING; - PRTE_ACTIVATE_TCP_CONN_STATE(peer, prte_oob_tcp_peer_try_connect); - } -} - -/* - * Event callback when there is data available on the registered - * socket to recv. 
This is called for the listen sockets to accept an - * incoming connection, on new sockets trying to complete the software - * connection process, and for probes. Data on an established - * connection is handled elsewhere. - */ -static void recv_handler(int sd, short flg, void *cbdata) -{ - prte_oob_tcp_conn_op_t *op = (prte_oob_tcp_conn_op_t *) cbdata; - int flags; - prte_oob_tcp_hdr_t hdr; - prte_oob_tcp_peer_t *peer; - PRTE_HIDE_UNUSED_PARAMS(flg); - - PMIX_ACQUIRE_OBJECT(op); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s:tcp:recv:handler called", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - - /* get the handshake */ - if (PRTE_SUCCESS != prte_oob_tcp_peer_recv_connect_ack(NULL, sd, &hdr)) { - goto cleanup; - } - - /* finish processing ident */ - if (MCA_OOB_TCP_IDENT == hdr.type) { - if (NULL == (peer = prte_oob_tcp_peer_lookup(&hdr.origin))) { - /* should never happen */ - prte_oob_tcp_peer_close(peer); - goto cleanup; - } - /* set socket up to be non-blocking */ - if ((flags = fcntl(sd, F_GETFL, 0)) < 0) { - pmix_output(0, "%s prte_oob_tcp_recv_connect: fcntl(F_GETFL) failed: %s (%d)", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), strerror(prte_socket_errno), - prte_socket_errno); - } else { - flags |= O_NONBLOCK; - if (fcntl(sd, F_SETFL, flags) < 0) { - pmix_output(0, "%s prte_oob_tcp_recv_connect: fcntl(F_SETFL) failed: %s (%d)", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), strerror(prte_socket_errno), - prte_socket_errno); - } - } - /* is the peer instance willing to accept this connection */ - peer->sd = sd; - if (prte_oob_tcp_peer_accept(peer) == false) { - if (OOB_TCP_DEBUG_CONNECT - <= pmix_output_get_verbosity(prte_oob_base_framework.framework_output)) { - pmix_output(0, - "%s-%s prte_oob_tcp_recv_connect: " - "rejected connection from %s connection state %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), - PRTE_NAME_PRINT(&(hdr.origin)), peer->state); - } - CLOSE_THE_SOCKET(sd); - } - } - -cleanup: 
- PMIX_RELEASE(op); -} diff --git a/src/mca/oob/tcp/oob_tcp_component.c b/src/mca/oob/tcp/oob_tcp_component.c deleted file mode 100644 index 23e59d6f32..0000000000 --- a/src/mca/oob/tcp/oob_tcp_component.c +++ /dev/null @@ -1,1101 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2017 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2017 IBM Corporation. All rights reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights - * reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * In windows, many of the socket functions return an EWOULDBLOCK - * instead of things like EAGAIN, EINPROGRESS, etc. 
It has been - * verified that this will not conflict with other error codes that - * are returned by these functions under UNIX/Linux environments - */ - -#include "prte_config.h" -#include "types.h" - -#ifdef HAVE_UNISTD_H -# include -#endif -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#include -#ifdef HAVE_NET_IF_H -# include -#endif -#ifdef HAVE_NETINET_IN_H -# include -#endif -#ifdef HAVE_ARPA_INET_H -# include -#endif -#ifdef HAVE_NETDB_H -# include -#endif -#include -#include -#include - -#ifndef MIN -# define MIN(a, b) ((a) < (b) ? (a) : (b)) -#endif - -#include "src/class/pmix_list.h" -#include "src/event/event-internal.h" -#include "src/include/prte_socket_errno.h" -#include "src/runtime/prte_progress_threads.h" -#include "src/util/pmix_argv.h" -#include "src/util/pmix_if.h" -#include "src/util/error.h" -#include "src/util/pmix_net.h" -#include "src/util/pmix_output.h" -#include "src/util/pmix_show_help.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/mca/ess/ess.h" -#include "src/rml/rml.h" -#include "src/mca/state/state.h" -#include "src/runtime/prte_globals.h" -#include "src/runtime/prte_wait.h" -#include "src/threads/pmix_threads.h" -#include "src/util/attr.h" -#include "src/util/name_fns.h" -#include "src/util/pmix_parse_options.h" -#include "src/util/pmix_show_help.h" - -#include "oob_tcp_peer.h" -#include "src/mca/oob/tcp/oob_tcp.h" -#include "src/mca/oob/tcp/oob_tcp_common.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" -#include "src/mca/oob/tcp/oob_tcp_connection.h" -#include "src/mca/oob/tcp/oob_tcp_listener.h" -#include "src/mca/oob/tcp/oob_tcp_peer.h" - -/* - * Local utility functions - */ - -static int tcp_component_register(void); -static int tcp_component_open(void); -static int tcp_component_close(void); - -static int component_available(void); -static int component_startup(void); -static void component_shutdown(void); -static int component_send(prte_rml_send_t *msg); -static char *component_get_addr(void); -static int 
component_set_addr(pmix_proc_t *peer, char **uris); -static bool component_is_reachable(pmix_proc_t *peer); - -/* - * Struct of function pointers and all that to let us be initialized - */ -prte_mca_oob_tcp_component_t prte_mca_oob_tcp_component = { - .super = { - .oob_base = { - PRTE_OOB_BASE_VERSION_2_0_0, - .pmix_mca_component_name = "tcp", - PMIX_MCA_BASE_MAKE_VERSION(component, - PRTE_MAJOR_VERSION, - PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), - .pmix_mca_open_component = tcp_component_open, - .pmix_mca_close_component = tcp_component_close, - .pmix_mca_register_component_params = tcp_component_register, - }, - .priority = 30, // default priority of this transport - .available = component_available, - .startup = component_startup, - .shutdown = component_shutdown, - .send_nb = component_send, - .get_addr = component_get_addr, - .set_addr = component_set_addr, - .is_reachable = component_is_reachable, - } -}; - -/* - * Initialize global variables used w/in this module. - */ -static int tcp_component_open(void) -{ - PMIX_CONSTRUCT(&prte_mca_oob_tcp_component.peers, pmix_list_t); - PMIX_CONSTRUCT(&prte_mca_oob_tcp_component.listeners, pmix_list_t); - if (PRTE_PROC_IS_MASTER) { - PMIX_CONSTRUCT(&prte_mca_oob_tcp_component.listen_thread, pmix_thread_t); - prte_mca_oob_tcp_component.listen_thread_active = false; - prte_mca_oob_tcp_component.listen_thread_tv.tv_sec = 3600; - prte_mca_oob_tcp_component.listen_thread_tv.tv_usec = 0; - } - prte_mca_oob_tcp_component.addr_count = 0; - prte_mca_oob_tcp_component.ipv4conns = NULL; - prte_mca_oob_tcp_component.ipv4ports = NULL; - prte_mca_oob_tcp_component.ipv6conns = NULL; - prte_mca_oob_tcp_component.ipv6ports = NULL; - prte_mca_oob_tcp_component.if_masks = NULL; - - PMIX_CONSTRUCT(&prte_mca_oob_tcp_component.local_ifs, pmix_list_t); - return PRTE_SUCCESS; -} - -/* - * Cleanup of global variables used by this module. 
- */ -static int tcp_component_close(void) -{ - PMIX_LIST_DESTRUCT(&prte_mca_oob_tcp_component.local_ifs); - PMIX_LIST_DESTRUCT(&prte_mca_oob_tcp_component.peers); - - if (NULL != prte_mca_oob_tcp_component.ipv4conns) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.ipv4conns); - } - if (NULL != prte_mca_oob_tcp_component.ipv4ports) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.ipv4ports); - } - -#if PRTE_ENABLE_IPV6 - if (NULL != prte_mca_oob_tcp_component.ipv6conns) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.ipv6conns); - } - if (NULL != prte_mca_oob_tcp_component.ipv6ports) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.ipv6ports); - } -#endif - if (NULL != prte_mca_oob_tcp_component.if_masks) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.if_masks); - } - return PRTE_SUCCESS; -} -static char *static_port_string; -#if PRTE_ENABLE_IPV6 -static char *static_port_string6; -#endif // PRTE_ENABLE_IPV6 - -static char *dyn_port_string; -#if PRTE_ENABLE_IPV6 -static char *dyn_port_string6; -#endif - -static int tcp_component_register(void) -{ - pmix_mca_base_component_t *component = &prte_mca_oob_tcp_component.super.oob_base; - - /* register oob module parameters */ - prte_mca_oob_tcp_component.peer_limit = -1; - (void) pmix_mca_base_component_var_register(component, "peer_limit", - "Maximum number of peer connections to simultaneously maintain (-1 = infinite)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.peer_limit); - - prte_mca_oob_tcp_component.max_retries = 2; - (void) pmix_mca_base_component_var_register(component, "peer_retries", - "Number of times to try shutting down a connection before giving up", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.max_retries); - - prte_mca_oob_tcp_component.tcp_sndbuf = 0; - (void) pmix_mca_base_component_var_register(component, "sndbuf", - "TCP socket send buffering size (in bytes, 0 => leave system default)", - PMIX_MCA_BASE_VAR_TYPE_INT, - 
&prte_mca_oob_tcp_component.tcp_sndbuf); - - prte_mca_oob_tcp_component.tcp_rcvbuf = 0; - (void) pmix_mca_base_component_var_register(component, "rcvbuf", - "TCP socket receive buffering size (in bytes, 0 => leave system default)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.tcp_rcvbuf); - - - static_port_string = NULL; - (void) pmix_mca_base_component_var_register(component, "static_ipv4_ports", - "Static ports for daemons and procs (IPv4)", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &static_port_string); - - /* if ports were provided, parse the provided range */ - if (NULL != static_port_string) { - pmix_util_parse_range_options(static_port_string, &prte_mca_oob_tcp_component.tcp_static_ports); - if (0 == strcmp(prte_mca_oob_tcp_component.tcp_static_ports[0], "-1")) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.tcp_static_ports); - prte_mca_oob_tcp_component.tcp_static_ports = NULL; - } - } else { - prte_mca_oob_tcp_component.tcp_static_ports = NULL; - } - -#if PRTE_ENABLE_IPV6 - static_port_string6 = NULL; - (void) pmix_mca_base_component_var_register(component, "static_ipv6_ports", - "Static ports for daemons and procs (IPv6)", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &static_port_string6); - - /* if ports were provided, parse the provided range */ - if (NULL != static_port_string6) { - pmix_util_parse_range_options(static_port_string6, - &prte_mca_oob_tcp_component.tcp6_static_ports); - if (0 == strcmp(prte_mca_oob_tcp_component.tcp6_static_ports[0], "-1")) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.tcp6_static_ports); - prte_mca_oob_tcp_component.tcp6_static_ports = NULL; - } - } else { - prte_mca_oob_tcp_component.tcp6_static_ports = NULL; - } -#endif // PRTE_ENABLE_IPV6 - - if (NULL != prte_mca_oob_tcp_component.tcp_static_ports - || NULL != prte_mca_oob_tcp_component.tcp6_static_ports) { - prte_static_ports = true; - } - - dyn_port_string = NULL; - (void) pmix_mca_base_component_var_register(component, "dynamic_ipv4_ports", - "Range of 
ports to be dynamically used by daemons and procs (IPv4)", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &dyn_port_string); - /* if ports were provided, parse the provided range */ - if (NULL != dyn_port_string) { - /* can't have both static and dynamic ports! */ - if (prte_static_ports) { - char *err = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.tcp_static_ports, ','); - pmix_show_help("help-oob-tcp.txt", "static-and-dynamic", true, err, dyn_port_string); - free(err); - return PRTE_ERROR; - } - pmix_util_parse_range_options(dyn_port_string, &prte_mca_oob_tcp_component.tcp_dyn_ports); - if (0 == strcmp(prte_mca_oob_tcp_component.tcp_dyn_ports[0], "-1")) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.tcp_dyn_ports); - prte_mca_oob_tcp_component.tcp_dyn_ports = NULL; - } - } else { - prte_mca_oob_tcp_component.tcp_dyn_ports = NULL; - } - -#if PRTE_ENABLE_IPV6 - dyn_port_string6 = NULL; - (void) pmix_mca_base_component_var_register(component, "dynamic_ipv6_ports", - "Range of ports to be dynamically used by daemons and procs (IPv6)", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &dyn_port_string6); - /* if ports were provided, parse the provided range */ - if (NULL != dyn_port_string6) { - /* can't have both static and dynamic ports! */ - if (prte_static_ports) { - char *err4 = NULL, *err6 = NULL; - if (NULL != prte_mca_oob_tcp_component.tcp_static_ports) { - err4 = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.tcp_static_ports, ','); - } - if (NULL != prte_mca_oob_tcp_component.tcp6_static_ports) { - err6 = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.tcp6_static_ports, ','); - } - pmix_show_help("help-oob-tcp.txt", "static-and-dynamic-ipv6", true, - (NULL == err4) ? "N/A" : err4, (NULL == err6) ? 
"N/A" : err6, - dyn_port_string6); - if (NULL != err4) { - free(err4); - } - if (NULL != err6) { - free(err6); - } - return PRTE_ERROR; - } - pmix_util_parse_range_options(dyn_port_string6, &prte_mca_oob_tcp_component.tcp6_dyn_ports); - if (0 == strcmp(prte_mca_oob_tcp_component.tcp6_dyn_ports[0], "-1")) { - PMIX_ARGV_FREE_COMPAT(prte_mca_oob_tcp_component.tcp6_dyn_ports); - prte_mca_oob_tcp_component.tcp6_dyn_ports = NULL; - } - } else { - prte_mca_oob_tcp_component.tcp6_dyn_ports = NULL; - } -#endif // PRTE_ENABLE_IPV6 - - prte_mca_oob_tcp_component.disable_ipv4_family = false; - (void) pmix_mca_base_component_var_register(component, "disable_ipv4_family", - "Disable the IPv4 interfaces", - PMIX_MCA_BASE_VAR_TYPE_BOOL, - &prte_mca_oob_tcp_component.disable_ipv4_family); - -#if PRTE_ENABLE_IPV6 - prte_mca_oob_tcp_component.disable_ipv6_family = false; - (void) pmix_mca_base_component_var_register(component, "disable_ipv6_family", - "Disable the IPv6 interfaces", - PMIX_MCA_BASE_VAR_TYPE_BOOL, - &prte_mca_oob_tcp_component.disable_ipv6_family); -#endif // PRTE_ENABLE_IPV6 - - // Wait for this amount of time before sending the first keepalive probe - prte_mca_oob_tcp_component.keepalive_time = 300; - (void) pmix_mca_base_component_var_register(component, "keepalive_time", - "Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables " - "keepalive functionality)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.keepalive_time); - - // Resend keepalive probe every INT seconds - prte_mca_oob_tcp_component.keepalive_intvl = 20; - (void) pmix_mca_base_component_var_register(component, "keepalive_intvl", - "Time between successive keepalive pings when peer has not responded, in seconds (ignored " - "if keepalive_time <= 0)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.keepalive_intvl); - - // After sending PR probes every INT seconds consider the connection dead - prte_mca_oob_tcp_component.keepalive_probes = 9; - 
(void) pmix_mca_base_component_var_register(component, "keepalive_probes", - "Number of keepalives that can be missed before " - "declaring error (ignored if keepalive_time <= 0)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.keepalive_probes); - - prte_mca_oob_tcp_component.retry_delay = 0; - (void) pmix_mca_base_component_var_register(component, "retry_delay", - "Time (in sec) to wait before trying to connect to peer again", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.retry_delay); - - prte_mca_oob_tcp_component.max_recon_attempts = 10; - (void) pmix_mca_base_component_var_register(component, "max_recon_attempts", - "Max number of times to attempt connection before giving up (-1 -> never give up)", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_mca_oob_tcp_component.max_recon_attempts); - - return PRTE_SUCCESS; -} - -static int component_available(void) -{ - pmix_pif_t *copied_interface, *selected_interface; - struct sockaddr_storage my_ss; - char name[PMIX_IF_NAMESIZE]; - /* Larger than necessary, used for copying mask */ - char string[50]; - int kindex; - int i; - bool keeploopback = false; - - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "oob:tcp: component_available called"); - - /* if we are the master, then check the interfaces for loopbacks - * and keep loopbacks only if no non-loopback interface exists */ - if (PRTE_PROC_IS_MASTER) { - keeploopback = true; - PMIX_LIST_FOREACH(selected_interface, &pmix_if_list, pmix_pif_t) - { - if (!(selected_interface->if_flags & IFF_LOOPBACK)) { - keeploopback = false; - break; - } - } - } - - /* look at all available interfaces */ - PMIX_LIST_FOREACH(selected_interface, &pmix_if_list, pmix_pif_t) - { - if ((selected_interface->if_flags & IFF_LOOPBACK) && - !keeploopback) { - continue; - } - - i = selected_interface->if_index; - kindex = selected_interface->if_kernel_index; - memcpy((struct sockaddr *) &my_ss, &selected_interface->if_addr, - MIN(sizeof(struct 
sockaddr_storage), sizeof(selected_interface->if_addr))); - - /* Refs ticket #3019 - * it would probably be worthwhile to print out a warning if PRRTE detects multiple - * IP interfaces that are "up" on the same subnet (because that's a Bad Idea). Note - * that we should only check for this after applying the relevant include/exclude - * list MCA params. If we detect redundant ports, we can also automatically ignore - * them so that applications won't hang. - */ - - /* add this address to our connections */ - if (AF_INET == my_ss.ss_family) { - pmix_output_verbose(10, prte_oob_base_framework.framework_output, - "%s oob:tcp:init adding %s to our list of %s connections", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - pmix_net_get_hostname((struct sockaddr *) &my_ss), - (AF_INET == my_ss.ss_family) ? "V4" : "V6"); - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_mca_oob_tcp_component.ipv4conns, - pmix_net_get_hostname((struct sockaddr *) &my_ss)); - } else if (AF_INET6 == my_ss.ss_family) { -#if PRTE_ENABLE_IPV6 - pmix_output_verbose(10, prte_oob_base_framework.framework_output, - "%s oob:tcp:init adding %s to our list of %s connections", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - pmix_net_get_hostname((struct sockaddr *) &my_ss), - (AF_INET == my_ss.ss_family) ? 
"V4" : "V6"); - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_mca_oob_tcp_component.ipv6conns, - pmix_net_get_hostname((struct sockaddr *) &my_ss)); -#endif // PRTE_ENABLE_IPV6 - } else { - pmix_output_verbose(10, prte_oob_base_framework.framework_output, - "%s oob:tcp:init ignoring %s from out list of connections", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - pmix_net_get_hostname((struct sockaddr *) &my_ss)); - continue; - } - copied_interface = PMIX_NEW(pmix_pif_t); - if (NULL == copied_interface) { - return PRTE_ERR_OUT_OF_RESOURCE; - } - pmix_string_copy(copied_interface->if_name, selected_interface->if_name, sizeof(name)); - copied_interface->if_index = i; - copied_interface->if_kernel_index = kindex; - copied_interface->af_family = my_ss.ss_family; - copied_interface->if_flags = selected_interface->if_flags; - copied_interface->if_speed = selected_interface->if_speed; - memcpy(&copied_interface->if_addr, &selected_interface->if_addr, - sizeof(struct sockaddr_storage)); - copied_interface->if_mask = selected_interface->if_mask; - /* If bandwidth is not found, set to arbitrary non zero value */ - copied_interface->if_bandwidth = selected_interface->if_bandwidth > 0 - ? 
selected_interface->if_bandwidth - : 1; - memcpy(&copied_interface->if_mac, &selected_interface->if_mac, - sizeof(copied_interface->if_mac)); - copied_interface->ifmtu = selected_interface->ifmtu; - /* Add the if_mask to the list */ - sprintf(string, "%d", selected_interface->if_mask); - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_mca_oob_tcp_component.if_masks, string); - pmix_list_append(&prte_mca_oob_tcp_component.local_ifs, &(copied_interface->super)); - } - - if (0 == PMIX_ARGV_COUNT_COMPAT(prte_mca_oob_tcp_component.ipv4conns) -#if PRTE_ENABLE_IPV6 - && 0 == PMIX_ARGV_COUNT_COMPAT(prte_mca_oob_tcp_component.ipv6conns) -#endif - ) { - return PRTE_ERR_NOT_AVAILABLE; - } - - return PRTE_SUCCESS; -} - -/* Start all modules */ -static int component_startup(void) -{ - int rc = PRTE_SUCCESS; - - pmix_output_verbose(2, prte_oob_base_framework.framework_output, "%s TCP STARTUP", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - - /* if we are a daemon/HNP, - * then it is possible that someone else may initiate a - * connection to us. In these cases, we need to start the - * listening thread/event. 
Otherwise, we will be the one - * initiating communication, and there is no need for - * a listener */ - if (PRTE_PROC_IS_MASTER || PRTE_PROC_IS_DAEMON) { - if (PRTE_SUCCESS != (rc = prte_oob_tcp_start_listening())) { - PRTE_ERROR_LOG(rc); - } - } - - return rc; -} - -static void component_shutdown(void) -{ - int i = 0, rc; - - pmix_output_verbose(2, prte_oob_base_framework.framework_output, "%s TCP SHUTDOWN", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - - if (PRTE_PROC_IS_MASTER && prte_mca_oob_tcp_component.listen_thread_active) { - prte_mca_oob_tcp_component.listen_thread_active = false; - /* tell the thread to exit */ - rc = write(prte_mca_oob_tcp_component.stop_thread[1], &i, sizeof(int)); - if (0 < rc) { - pmix_thread_join(&prte_mca_oob_tcp_component.listen_thread, NULL); - } - - close(prte_mca_oob_tcp_component.stop_thread[0]); - close(prte_mca_oob_tcp_component.stop_thread[1]); - - } else { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, "no hnp or not active"); - } - - /* cleanup listen event list */ - PMIX_LIST_DESTRUCT(&prte_mca_oob_tcp_component.listeners); - - pmix_output_verbose(2, prte_oob_base_framework.framework_output, "%s TCP SHUTDOWN done", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); -} - -static int component_send(prte_rml_send_t *msg) -{ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, - "%s oob:tcp:send_nb to peer %s:%d seq = %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&msg->dst), msg->tag, - msg->seq_num); - - /* The module will first see if it knows - * of a way to send the data to the target, and then - * attempt to send the data. 
It will call the cbfunc - * with the status upon completion - if it can't do it for - * some reason, it will pass the error to our fn below so - * it can do something about it - */ - prte_oob_tcp_module.send_nb(msg); - return PRTE_SUCCESS; -} - -static char *component_get_addr(void) -{ - char *cptr = NULL, *tmp, *tp, *tm; - - if (!prte_mca_oob_tcp_component.disable_ipv4_family && - NULL != prte_mca_oob_tcp_component.ipv4conns) { - tmp = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.ipv4conns, ','); - tp = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.ipv4ports, ','); - tm = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.if_masks, ','); - pmix_asprintf(&cptr, "tcp://%s:%s:%s", tmp, tp, tm); - free(tmp); - free(tp); - free(tm); - } -#if PRTE_ENABLE_IPV6 - if (!prte_mca_oob_tcp_component.disable_ipv6_family && NULL != prte_mca_oob_tcp_component.ipv6conns) { - char *tmp2; - - /* Fixes #2498 - * RFC 3986, section 3.2.2 - * The notation in that case is to encode the IPv6 IP number in square brackets: - * "http://[2001:db8:1f70::999:de8:7648:6e8]:100/" - * A host identified by an Internet Protocol literal address, version 6 [RFC3513] - * or later, is distinguished by enclosing the IP literal within square brackets. - * This is the only place where square bracket characters are allowed in the URI - * syntax. In anticipation of future, as-yet-undefined IP literal address formats, - * an implementation may use an optional version flag to indicate such a format - * explicitly rather than rely on heuristic determination. 
- */ - tmp = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.ipv6conns, ','); - tp = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.ipv6ports, ','); - tm = PMIX_ARGV_JOIN_COMPAT(prte_mca_oob_tcp_component.if_masks, ','); - if (NULL == cptr) { - /* no ipv4 stuff */ - pmix_asprintf(&cptr, "tcp6://[%s]:%s:%s", tmp, tp, tm); - } else { - pmix_asprintf(&tmp2, "%s;tcp6://[%s]:%s:%s", cptr, tmp, tp, tm); - free(cptr); - cptr = tmp2; - } - free(tmp); - free(tp); - free(tm); - } -#endif // PRTE_ENABLE_IPV6 - - /* return our uri */ - return cptr; -} - -/* the host in this case is always in "dot" notation, and - * thus we do not need to do a DNS lookup to convert it */ -static int parse_uri(const uint16_t af_family, const char *host, const char *port, - struct sockaddr_storage *inaddr) -{ - struct sockaddr_in *in; - - if (AF_INET == af_family) { - memset(inaddr, 0, sizeof(struct sockaddr_in)); - in = (struct sockaddr_in *) inaddr; - in->sin_family = AF_INET; - in->sin_addr.s_addr = inet_addr(host); - if (in->sin_addr.s_addr == INADDR_NONE) { - return PRTE_ERR_BAD_PARAM; - } - ((struct sockaddr_in *) inaddr)->sin_port = htons(atoi(port)); - } -#if PRTE_ENABLE_IPV6 - else if (AF_INET6 == af_family) { - struct sockaddr_in6 *in6; - memset(inaddr, 0, sizeof(struct sockaddr_in6)); - in6 = (struct sockaddr_in6 *) inaddr; - - if (0 == inet_pton(AF_INET6, host, (void *) &in6->sin6_addr)) { - pmix_output(0, "oob_tcp_parse_uri: Could not convert %s\n", host); - return PRTE_ERR_BAD_PARAM; - } - in6->sin6_family = AF_INET6; - in6->sin6_port = htons(atoi(port)); - } -#endif - else { - return PRTE_ERR_NOT_SUPPORTED; - } - return PRTE_SUCCESS; -} - -static int component_set_addr(pmix_proc_t *peer, char **uris) -{ - char **addrs, **masks, *hptr; - char *tcpuri = NULL, *host, *ports, *masks_string; - int i, j, rc; - uint16_t af_family = AF_UNSPEC; - uint64_t ui64; - bool found; - prte_oob_tcp_peer_t *pr; - prte_oob_tcp_addr_t *maddr; - - memcpy(&ui64, (char *) peer, sizeof(uint64_t)); - /* 
cycle across component parts and see if one belongs to us */ - found = false; - - for (i = 0; NULL != uris[i]; i++) { - tcpuri = strdup(uris[i]); - if (NULL == tcpuri) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s oob:tcp: out of memory", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - continue; - } - if (0 == strncmp(uris[i], "tcp:", 4)) { - af_family = AF_INET; - host = tcpuri + strlen("tcp://"); - } else if (0 == strncmp(uris[i], "tcp6:", 5)) { -#if PRTE_ENABLE_IPV6 - af_family = AF_INET6; - host = tcpuri + strlen("tcp6://"); -#else // PRTE_ENABLE_IPV6 - /* we don't support this connection type */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s oob:tcp: address %s not supported", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uris[i]); - free(tcpuri); - continue; -#endif // PRTE_ENABLE_IPV6 - } else { - /* not one of ours */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s oob:tcp: ignoring address %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uris[i]); - free(tcpuri); - continue; - } - - /* this one is ours - record the peer */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s oob:tcp: working peer %s address %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer), uris[i]); - - /* separate the mask from the network addrs */ - masks_string = strrchr(tcpuri, ':'); - if (NULL == masks_string) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - free(tcpuri); - continue; - } - *masks_string = '\0'; - masks_string++; - masks = PMIX_ARGV_SPLIT_COMPAT(masks_string, ','); - - /* separate the ports from the network addrs */ - ports = strrchr(tcpuri, ':'); - if (NULL == ports) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - free(tcpuri); - continue; - } - *ports = '\0'; - ports++; - - /* split the addrs */ - /* if this is a tcp6 connection, the first one will have a '[' - * at the beginning of it, and the last will have a ']' at the - * end - we need to remove those extra characters - */ - hptr = 
host; -#if PRTE_ENABLE_IPV6 - if (AF_INET6 == af_family) { - if ('[' == host[0]) { - hptr = &host[1]; - } - if (']' == host[strlen(host) - 1]) { - host[strlen(host) - 1] = '\0'; - } - } -#endif // PRTE_ENABLE_IPV6 - addrs = PMIX_ARGV_SPLIT_COMPAT(hptr, ','); - - /* cycle across the provided addrs */ - for (j = 0; NULL != addrs[j]; j++) { - if (NULL == masks[j]) { - /* Missing mask information */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, - "%s oob:tcp: uri missing mask information.", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - return PRTE_ERR_TAKE_NEXT_OPTION; - } - /* if they gave us "localhost", then just take the first conn on our list */ - if (0 == strcasecmp(addrs[j], "localhost")) { -#if PRTE_ENABLE_IPV6 - if (AF_INET6 == af_family) { - if (NULL == prte_mca_oob_tcp_component.ipv6conns - || NULL == prte_mca_oob_tcp_component.ipv6conns[0]) { - continue; - } - host = prte_mca_oob_tcp_component.ipv6conns[0]; - } else { -#endif // PRTE_ENABLE_IPV6 - if (NULL == prte_mca_oob_tcp_component.ipv4conns - || NULL == prte_mca_oob_tcp_component.ipv4conns[0]) { - continue; - } - host = prte_mca_oob_tcp_component.ipv4conns[0]; -#if PRTE_ENABLE_IPV6 - } -#endif - } else { - host = addrs[j]; - } - - if (NULL == (pr = prte_oob_tcp_peer_lookup(peer))) { - pr = PMIX_NEW(prte_oob_tcp_peer_t); - PMIX_XFER_PROCID(&pr->name, peer); - pmix_output_verbose(20, prte_oob_base_framework.framework_output, - "%s SET_PEER ADDING PEER %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer)); - pmix_list_append(&prte_mca_oob_tcp_component.peers, &pr->super); - } - - maddr = PMIX_NEW(prte_oob_tcp_addr_t); - ((struct sockaddr_storage *) &(maddr->addr))->ss_family = af_family; - if (PRTE_SUCCESS - != (rc = parse_uri(af_family, host, ports, - (struct sockaddr_storage *) &(maddr->addr)))) { - PRTE_ERROR_LOG(rc); - PMIX_RELEASE(maddr); - pmix_list_remove_item(&prte_mca_oob_tcp_component.peers, &pr->super); - PMIX_RELEASE(pr); - return PRTE_ERR_TAKE_NEXT_OPTION; - } - 
maddr->if_mask = atoi(masks[j]); - - pmix_output_verbose(20, prte_oob_base_framework.framework_output, - "%s set_peer: peer %s is listening on net %s port %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer), - (NULL == host) ? "NULL" : host, (NULL == ports) ? "NULL" : ports); - pmix_list_append(&pr->addrs, &maddr->super); - - found = true; - } - PMIX_ARGV_FREE_COMPAT(addrs); - free(tcpuri); - } - if (found) { - /* indicate that this peer is addressable by this component */ - return PRTE_SUCCESS; - } - - /* otherwise indicate that it is not addressable by us */ - return PRTE_ERR_TAKE_NEXT_OPTION; -} - -static bool component_is_reachable(pmix_proc_t *peer) -{ - PRTE_HIDE_UNUSED_PARAMS(peer); - - /* assume we can reach the hop - the module will tell us if it can't - * when we try to send the first time, and then we'll correct it */ - return true; -} - -void prte_mca_oob_tcp_component_set_module(int fd, short args, void *cbdata) -{ - prte_oob_tcp_peer_op_t *pop = (prte_oob_tcp_peer_op_t *) cbdata; - prte_oob_base_peer_t *bpr; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(pop); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:set_module called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&pop->peer)); - - /* make sure the OOB knows that we can reach this peer - we - * are in the same event base as the OOB base, so we can - * directly access its storage - */ - bpr = prte_oob_base_get_peer(&pop->peer); - if (NULL == bpr) { - bpr = PMIX_NEW(prte_oob_base_peer_t); - PMIX_XFER_PROCID(&bpr->name, &pop->peer); - pmix_list_append(&prte_oob_base.peers, &bpr->super); - } - pmix_bitmap_set_bit(&bpr->addressable, prte_mca_oob_tcp_component.super.idx); - bpr->component = &prte_mca_oob_tcp_component.super; - - PMIX_RELEASE(pop); -} - -void prte_mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) -{ - prte_oob_tcp_peer_op_t *pop = (prte_oob_tcp_peer_op_t *) cbdata; 
- prte_oob_base_peer_t *bpr; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(pop); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:lost connection called for peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); - - /* Mark that we no longer support this peer */ - bpr = prte_oob_base_get_peer(&pop->peer); - if (NULL != bpr) { - pmix_bitmap_clear_bit(&bpr->addressable, prte_mca_oob_tcp_component.super.idx); - pmix_list_remove_item(&prte_oob_base.peers, &bpr->super); - PMIX_RELEASE(bpr); - } - - if (!prte_finalizing) { - /* activate the proc state */ - if (PRTE_SUCCESS != prte_rml_route_lost(pop->peer.rank)) { - PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_LIFELINE_LOST); - } else { - PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_COMM_FAILED); - } - } - PMIX_RELEASE(pop); -} - -void prte_mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) -{ - prte_oob_tcp_msg_error_t *mop = (prte_oob_tcp_msg_error_t *) cbdata; - prte_oob_base_peer_t *bpr; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(mop); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:no route called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&mop->hop)); - - /* mark that we cannot reach this hop */ - bpr = prte_oob_base_get_peer(&mop->hop); - if (NULL == bpr) { - bpr = PMIX_NEW(prte_oob_base_peer_t); - PMIX_XFER_PROCID(&bpr->name, &mop->hop); - } - pmix_bitmap_clear_bit(&bpr->addressable, prte_mca_oob_tcp_component.super.idx); - - /* report the error back to the OOB and let it try other components - * or declare a problem - */ - mop->rmsg->retries++; - /* activate the OOB send state */ - PRTE_OOB_SEND(mop->rmsg); - - PMIX_RELEASE(mop); -} - -void prte_mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) -{ - prte_oob_tcp_msg_error_t *mop = (prte_oob_tcp_msg_error_t *) cbdata; - prte_rml_send_t 
*snd; - prte_oob_base_peer_t *bpr; - pmix_status_t rc; - pmix_byte_object_t bo; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(mop); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:unknown hop called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_NAME_PRINT(&mop->hop)); - - if (prte_finalizing || prte_abnormal_term_ordered) { - /* just ignore the problem */ - PMIX_RELEASE(mop); - return; - } - - /* mark that this component cannot reach this hop */ - bpr = prte_oob_base_get_peer(&mop->hop); - if (NULL == bpr) { - /* the overall OOB has no knowledge of this hop. Only - * way this could happen is if the peer contacted us - * via this component, and it wasn't entered into the - * OOB framework hash table. We have no way of knowing - * what to do next, so just output an error message and - * abort */ - pmix_output(0, - "%s ERROR: message to %s requires routing and the OOB has no knowledge of the " - "reqd hop %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&mop->snd->hdr.dst), - PRTE_NAME_PRINT(&mop->hop)); - PRTE_ACTIVATE_PROC_STATE(&mop->hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); - PMIX_RELEASE(mop); - return; - } - pmix_bitmap_clear_bit(&bpr->addressable, prte_mca_oob_tcp_component.super.idx); - - /* mark that this component cannot reach this destination either */ - bpr = prte_oob_base_get_peer(&mop->snd->hdr.dst); - if (NULL == bpr) { - pmix_output( - 0, - "%s ERROR: message to %s requires routing and the OOB has no knowledge of this process", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&mop->snd->hdr.dst)); - PRTE_ACTIVATE_PROC_STATE(&mop->hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); - PMIX_RELEASE(mop); - return; - } - pmix_bitmap_clear_bit(&bpr->addressable, prte_mca_oob_tcp_component.super.idx); - - /* post the message to the OOB so it can see - * if another component can transfer it - */ - MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr); - snd = PMIX_NEW(prte_rml_send_t); - 
snd->retries = mop->rmsg->retries + 1; - PMIX_XFER_PROCID(&snd->dst, &mop->snd->hdr.dst); - PMIX_XFER_PROCID(&snd->origin, &mop->snd->hdr.origin); - snd->tag = mop->snd->hdr.tag; - snd->seq_num = mop->snd->hdr.seq_num; - bo.bytes = mop->snd->data; - bo.size = mop->snd->hdr.nbytes; - PMIX_DATA_BUFFER_CREATE(snd->dbuf); - rc = PMIx_Data_load(snd->dbuf, &bo); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - } - snd->cbfunc = NULL; - snd->cbdata = NULL; - /* activate the OOB send state */ - PRTE_OOB_SEND(snd); - /* protect the data */ - mop->snd->data = NULL; - - PMIX_RELEASE(mop); -} - -void prte_mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) -{ - prte_oob_tcp_peer_op_t *pop = (prte_oob_tcp_peer_op_t *) cbdata; - PRTE_HIDE_UNUSED_PARAMS(fd, args); - - PMIX_ACQUIRE_OBJECT(pop); - - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:failed_to_connect called for peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); - - /* if we are terminating, then don't attempt to reconnect */ - if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { - PMIX_RELEASE(pop); - return; - } - - /* activate the proc state */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s tcp:failed_to_connect unable to reach peer %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); - - PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_FAILED_TO_CONNECT); - PMIX_RELEASE(pop); -} - - -/* OOB TCP Class instances */ - -static void peer_cons(prte_oob_tcp_peer_t *peer) -{ - peer->auth_method = NULL; - peer->sd = -1; - PMIX_CONSTRUCT(&peer->addrs, pmix_list_t); - peer->active_addr = NULL; - peer->state = MCA_OOB_TCP_UNCONNECTED; - peer->num_retries = 0; - PMIX_CONSTRUCT(&peer->send_queue, pmix_list_t); - peer->send_msg = NULL; - peer->recv_msg = NULL; - peer->send_ev_active = false; - peer->recv_ev_active = false; - 
peer->timer_ev_active = false; -} -static void peer_des(prte_oob_tcp_peer_t *peer) -{ - if (NULL != peer->auth_method) { - free(peer->auth_method); - } - if (peer->send_ev_active) { - prte_event_del(&peer->send_event); - } - if (peer->recv_ev_active) { - prte_event_del(&peer->recv_event); - } - if (peer->timer_ev_active) { - prte_event_del(&peer->timer_event); - } - if (0 <= peer->sd) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, "%s CLOSING SOCKET %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), peer->sd); - CLOSE_THE_SOCKET(peer->sd); - } - PMIX_LIST_DESTRUCT(&peer->addrs); - PMIX_LIST_DESTRUCT(&peer->send_queue); -} -PMIX_CLASS_INSTANCE(prte_oob_tcp_peer_t, pmix_list_item_t, peer_cons, peer_des); - -static void padd_cons(prte_oob_tcp_addr_t *ptr) -{ - memset(&ptr->addr, 0, sizeof(ptr->addr)); - ptr->retries = 0; - ptr->state = MCA_OOB_TCP_UNCONNECTED; -} -PMIX_CLASS_INSTANCE(prte_oob_tcp_addr_t, pmix_list_item_t, padd_cons, NULL); - -static void pop_cons(prte_oob_tcp_peer_op_t *pop) -{ - pop->net = NULL; - pop->port = NULL; -} -static void pop_des(prte_oob_tcp_peer_op_t *pop) -{ - if (NULL != pop->net) { - free(pop->net); - } - if (NULL != pop->port) { - free(pop->port); - } -} -PMIX_CLASS_INSTANCE(prte_oob_tcp_peer_op_t, pmix_object_t, pop_cons, pop_des); - -PMIX_CLASS_INSTANCE(prte_oob_tcp_msg_op_t, pmix_object_t, NULL, NULL); - -PMIX_CLASS_INSTANCE(prte_oob_tcp_conn_op_t, pmix_object_t, NULL, NULL); - -static void nicaddr_cons(prte_oob_tcp_nicaddr_t *ptr) -{ - ptr->af_family = PF_UNSPEC; - memset(&ptr->addr, 0, sizeof(ptr->addr)); -} -PMIX_CLASS_INSTANCE(prte_oob_tcp_nicaddr_t, pmix_list_item_t, nicaddr_cons, NULL); diff --git a/src/mca/oob/tcp/oob_tcp_component.h b/src/mca/oob/tcp/oob_tcp_component.h deleted file mode 100644 index 9131e100fb..0000000000 --- a/src/mca/oob/tcp/oob_tcp_component.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology 
- * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights - * reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _MCA_OOB_TCP_COMPONENT_H_ -#define _MCA_OOB_TCP_COMPONENT_H_ - -#include "prte_config.h" - -#ifdef HAVE_SYS_TIME_H -# include -#endif - -#include "src/include/prte_stdatomic.h" -#include "src/class/pmix_bitmap.h" -#include "src/class/pmix_list.h" -#include "src/class/pmix_pointer_array.h" -#include "src/event/event-internal.h" - -#include "oob_tcp.h" -#include "src/mca/oob/oob.h" - -/** - * OOB TCP Component - */ -typedef struct { - prte_oob_base_component_t super; /**< base OOB component */ - uint32_t addr_count; /**< total number of addresses */ - int num_links; /**< number of logical links per physical device */ - int max_retries; /**< max number of retries before declaring peer gone */ - pmix_list_t events; /**< events for monitoring connections */ - int peer_limit; /**< max size of tcp peer cache */ - pmix_list_t peers; // connection addresses for peers - - /* Port specifications */ - int tcp_sndbuf; /**< socket send buffer size */ - 
int tcp_rcvbuf; /**< socket recv buffer size */ - - /* IPv4 support */ - bool disable_ipv4_family; /**< disable this AF */ - char **tcp_static_ports; /**< Static ports - IPV4 */ - char **tcp_dyn_ports; /**< Dynamic ports - IPV4 */ - char **ipv4conns; - char **ipv4ports; - - /* IPv6 support */ - bool disable_ipv6_family; /**< disable this AF */ - char **tcp6_static_ports; /**< Static ports - IPV6 */ - char **tcp6_dyn_ports; /**< Dynamic ports - IPV6 */ - char **ipv6conns; - char **ipv6ports; - - /* connection support */ - pmix_list_t local_ifs; /**< prte list of local pmix_pif_t interfaces */ - char **if_masks; - char *my_uri; /**< uri for connecting to the TCP module */ - int num_hnp_ports; /**< number of ports the HNP should listen on */ - pmix_list_t listeners; /**< List of sockets being monitored by event or thread */ - pmix_thread_t listen_thread; /**< handle to the listening thread */ - prte_atomic_bool_t listen_thread_active; - struct timeval listen_thread_tv; /**< Timeout when using listen thread */ - int stop_thread[2]; /**< pipe used to exit the listen thread */ - int keepalive_probes; /**< number of keepalives that can be missed before declaring error */ - int keepalive_time; /**< idle time in seconds before starting to send keepalives */ - int keepalive_intvl; /**< time between keepalives, in seconds */ - int retry_delay; /**< time to wait before retrying connection */ - int max_recon_attempts; /**< maximum number of times to attempt connect before giving up (-1 for - never) */ -} prte_mca_oob_tcp_component_t; - -PRTE_MODULE_EXPORT extern prte_mca_oob_tcp_component_t prte_mca_oob_tcp_component; - -PRTE_MODULE_EXPORT void prte_mca_oob_tcp_component_set_module(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void 
prte_mca_oob_tcp_component_no_route(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata); - -#endif /* _MCA_OOB_TCP_COMPONENT_H_ */ diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index d8f31848e7..c0e787fbe6 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -57,7 +57,6 @@ #include "src/mca/rmaps/rmaps.h" #include "src/rml/rml_contact.h" #include "src/rml/rml.h" -#include "src/mca/rtc/rtc.h" #include "src/mca/state/base/base.h" #include "src/mca/state/state.h" #include "src/runtime/prte_globals.h" @@ -85,28 +84,28 @@ void prte_plm_base_set_slots(prte_node_t *node) if (0 == strncmp(prte_set_slots, "cores", strlen(prte_set_slots))) { if (NULL != node->topology && NULL != node->topology->topo) { node->slots = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_CORE, 0); + HWLOC_OBJ_CORE); } } else if (0 == strncmp(prte_set_slots, "sockets", strlen(prte_set_slots))) { if (NULL != node->topology && NULL != node->topology->topo) { node->slots = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_SOCKET, 0); + HWLOC_OBJ_SOCKET); if (0 == node->slots) { /* some systems don't report sockets - in this case, * use numanodes */ node->slots = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_NUMANODE, 0); + HWLOC_OBJ_NUMANODE); } } } else if (0 == strncmp(prte_set_slots, "numas", strlen(prte_set_slots))) { if (NULL != node->topology && NULL != node->topology->topo) { node->slots = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_NUMANODE, 0); + HWLOC_OBJ_NUMANODE); } } else if (0 == strncmp(prte_set_slots, "hwthreads", strlen(prte_set_slots))) { if (NULL != node->topology && NULL != node->topology->topo) { node->slots = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_PU, 0); + HWLOC_OBJ_PU); } } 
else { /* must be a number */ @@ -1224,6 +1223,7 @@ void prte_plm_base_daemon_topology(int status, pmix_proc_t *sender, } /* Apply any CPU filters (not preserved by the XML) */ daemon->node->available = prte_hwloc_base_filter_cpus(topo); + prte_hwloc_base_setup_summary(topo); /* process any cached daemons that match this signature */ PMIX_LIST_FOREACH_SAFE(dptr, dnxt, &prte_plm_globals.daemon_cache, prte_proc_t) { @@ -1291,7 +1291,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu char *ptr; int idx; pmix_status_t ret; - prte_proc_t *daemon = NULL, *dptr; + prte_proc_t *daemon = NULL, *dptr, *d1; prte_job_t *jdata; pmix_proc_t dname; pmix_data_buffer_t *relay; @@ -1626,6 +1626,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu hwloc_bitmap_free(daemon->node->available); } daemon->node->available = prte_hwloc_base_filter_cpus(t->topo); + prte_hwloc_base_setup_summary(t->topo); free(sig); break; } @@ -1634,14 +1635,21 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu if (1 == dname.rank) { /* process any cached daemons */ PMIX_CONSTRUCT(&cachelist, pmix_list_t); + d1 = (prte_proc_t *) pmix_pointer_array_get_item(jdatorted->procs, 1); + if (NULL == d1) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + prted_failed_launch = true; + goto CLEANUP; + } while (NULL != (dptr = (prte_proc_t*)pmix_list_remove_first(&prte_plm_globals.daemon_cache))) { PMIX_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:prted_daemon_cback processing cached daemon %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&dptr->name))); if (0 == strcmp(dptr->node->topology->sig, sig)) { - dptr->node->topology = t; + dptr->node->topology = d1->node->topology; dptr->node->available = prte_hwloc_base_filter_cpus(topo); + prte_hwloc_base_setup_summary(topo); jdatorted->num_reported++; } else { /* see if this topology has already been requested */ @@ -1704,6 +1712,7 @@ void 
prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu hwloc_bitmap_free(daemon->node->available); } daemon->node->available = prte_hwloc_base_filter_cpus(t->topo); + prte_hwloc_base_setup_summary(t->topo); } } if (!prte_plm_globals.daemon1_has_reported) { diff --git a/src/mca/plm/slurm/help-plm-slurm.txt b/src/mca/plm/slurm/help-plm-slurm.txt index 5f554484ff..2f71f2bb44 100644 --- a/src/mca/plm/slurm/help-plm-slurm.txt +++ b/src/mca/plm/slurm/help-plm-slurm.txt @@ -12,7 +12,7 @@ # All rights reserved. # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. # Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -51,6 +51,7 @@ are running. Please consult with your system administrator about obtaining such support. +# [no-srun] The SLURM process starter for OpenMPI was unable to locate a usable "srun" command in its path. Please check your path @@ -80,3 +81,58 @@ process starter via the following MCA parameter: This will result in use of the ssh process starter. This will have no impact on your application, but will result in any accounting being done solely at the allocation level instead of per-job. +# +[custom-args-in-env] +The Slurm process starter for PRTE detected the presence of an MCA +parameter in the environment that assigns custom command line arguments +to the `srun` command used to start PRTE's daemons on remote nodes: + + Parameter value: %s + +This warning is provided to alert you (the user) to a perhaps +unintentional setting of command line arguments, or the unseen +overriding of your intended arguments by Slurm.
+ +Background: Starting with Slurm version 23.11, a command line argument +(`--external-launcher`) was added to `srun` to indicate that the +command was being initiated from within a third-party launcher (e.g., +`prte` or `prterun`). This allows Slurm to essentially freely modify +the `srun` command line while retaining a backward compatibility +capability when explicitly told to use it. Notably, the Slurm +environment does this by automatically setting the +PRTE_MCA_plm_slurm_args environment variable to pass in its own +command line arguments. This has the side effect of overriding most +user- or system-level settings. Note that arguments passed on the +PRTE command line will override any Slurm setting of the +PRTE_MCA_plm_slurm_args environment variable, but with potentially +undesirable side effects if newer versions of `srun` misinterpret or +fail to understand the user-specified arguments. + +If the setting of the MCA parameter was intentional, or if the +parameter looks acceptable to you, then please set the following +MCA parameter to disable this warning: + + Environment: PRTE_MCA_plm_slurm_disable_warning=true + Cmd line: --prtemca plm_slurm_disable_warning 1 + Default MCA param file: plm_slurm_disable_warning = true + +If you did not intentionally set the identified command line +arguments and do not wish them to be used, then set the +following MCA param to have them ignored: + + Environment: PRTE_MCA_plm_slurm_ignore_args=true + Cmd line: --prtemca plm_slurm_ignore_args 1 + Default MCA param file: plm_slurm_ignore_args = true + +Note that if you wish to provide custom `srun` command line +arguments and are finding them being overridden by Slurm, you +can ensure that your values are used by setting them with the +following param: + + Environment: PRTE_MCA_plm_slurm_force_args=foo + Cmd line: --prtemca plm_slurm_force_args foo + Default MCA param file: plm_slurm_force_args = foo + +Note that you may need to add the `--external-launcher` option +to your 
provided args to ensure that `srun` properly functions +if you are using a relatively recent release of Slurm. diff --git a/src/mca/plm/slurm/plm_slurm.h b/src/mca/plm/slurm/plm_slurm.h index 425d0acd89..b357d4b7ef 100644 --- a/src/mca/plm/slurm/plm_slurm.h +++ b/src/mca/plm/slurm/plm_slurm.h @@ -13,7 +13,7 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2022-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +34,7 @@ BEGIN_C_DECLS struct prte_mca_plm_slurm_component_t { prte_plm_base_component_t super; char *custom_args; - bool slurm_warning_msg; + bool early; }; typedef struct prte_mca_plm_slurm_component_t prte_mca_plm_slurm_component_t; diff --git a/src/mca/plm/slurm/plm_slurm_component.c b/src/mca/plm/slurm/plm_slurm_component.c index 7f6e20bb49..81cf74e59e 100644 --- a/src/mca/plm/slurm/plm_slurm_component.c +++ b/src/mca/plm/slurm/plm_slurm_component.c @@ -16,7 +16,7 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,6 +38,7 @@ #include "src/util/name_fns.h" #include "src/util/pmix_environ.h" #include "src/util/pmix_show_help.h" +#include "src/util/pmix_string_copy.h" #include "plm_slurm.h" #include "src/mca/plm/base/plm_private.h" @@ -88,15 +89,11 @@ static int plm_slurm_register(void) { pmix_mca_base_component_t *comp = &prte_mca_plm_slurm_component.super; - prte_mca_plm_slurm_component.custom_args = NULL; - (void) pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &prte_mca_plm_slurm_component.custom_args); - prte_mca_plm_slurm_component.slurm_warning_msg = true; - (void) pmix_mca_base_component_var_register(comp, "warning", "Turn off warning message", - PMIX_MCA_BASE_VAR_TYPE_BOOL, - &prte_mca_plm_slurm_component.slurm_warning_msg); + prte_mca_plm_slurm_component.custom_args = NULL; + pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &prte_mca_plm_slurm_component.custom_args); return PRTE_SUCCESS; } @@ -108,8 +105,11 @@ static int plm_slurm_open(void) static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, int *priority) { - /* Are we running under a SLURM job? */ + FILE *fp; + char version[1024], *ptr; + int major, minor; + /* Are we running under a SLURM job? 
*/ if (NULL != getenv("SLURM_JOBID")) { *priority = 75; @@ -117,6 +117,34 @@ static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, i "%s plm:slurm: available for selection", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + // check the version + fp = popen("srun --version", "r"); + if (NULL == fp) { + // cannot run srun, so we cannot support this job + *module = NULL; + return PRTE_ERROR; + } + if (NULL == fgets(version, sizeof(version), fp)) { + pclose(fp); + *module = NULL; + return PRTE_ERROR; + } + pclose(fp); + // parse on the dots + major = strtol(&version[6], &ptr, 10); + ++ptr; + minor = strtol(ptr, NULL, 10); + + if (23 > major) { + prte_mca_plm_slurm_component.early = true; + } else if (23 < major) { + prte_mca_plm_slurm_component.early = false; + } else if (11 > minor) { + prte_mca_plm_slurm_component.early = true; + } else { + prte_mca_plm_slurm_component.early = false; + } + *module = (pmix_mca_base_module_t *) &prte_plm_slurm_module; return PRTE_SUCCESS; } diff --git a/src/mca/plm/slurm/plm_slurm_module.c b/src/mca/plm/slurm/plm_slurm_module.c index 2edae0312a..8c9a04ec27 100644 --- a/src/mca/plm/slurm/plm_slurm_module.c +++ b/src/mca/plm/slurm/plm_slurm_module.c @@ -15,7 +15,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -253,6 +253,11 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the srun command */ pmix_argv_append(&argc, &argv, "srun"); + // add the external launcher flag if necessary + if (!prte_mca_plm_slurm_component.early) { + pmix_argv_append(&argc, &argv, "--external-launcher"); + } + /* start one orted on each node */ pmix_argv_append(&argc, &argv, "--ntasks-per-node=1"); diff --git a/src/mca/plm/ssh/plm_ssh_module.c b/src/mca/plm/ssh/plm_ssh_module.c index 3608ae64a7..b14e38a21d 100644 --- a/src/mca/plm/ssh/plm_ssh_module.c +++ b/src/mca/plm/ssh/plm_ssh_module.c @@ -17,7 +17,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -88,7 +88,6 @@ #include "src/mca/ess/base/base.h" #include "src/mca/ess/ess.h" #include "src/mca/grpcomm/base/base.h" -#include "src/mca/oob/base/base.h" #include "src/mca/rmaps/rmaps.h" #include "src/rml/rml_contact.h" #include "src/rml/rml.h" @@ -652,11 +651,9 @@ static int setup_launch(int *argcptr, char ***argvptr, char *nodename, int *node * uri of their parent (me) */ if (!prte_mca_plm_ssh_component.no_tree_spawn) { pmix_argv_append(&argc, &argv, "--tree-spawn"); - prte_oob_base_get_addr(¶m); pmix_argv_append(&argc, &argv, "--prtemca"); pmix_argv_append(&argc, &argv, "prte_parent_uri"); - pmix_argv_append(&argc, &argv, param); - free(param); + pmix_argv_append(&argc, &argv, prte_process_info.my_uri); } /* protect the params */ diff --git a/src/mca/ras/base/ras_base_allocate.c b/src/mca/ras/base/ras_base_allocate.c index fa750d6935..2f83caa3d7 100644 --- a/src/mca/ras/base/ras_base_allocate.c +++ b/src/mca/ras/base/ras_base_allocate.c @@ -206,8 +206,8 @@ 
static void display_cpus(prte_topology_t *t, return; } - npus = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PU); - ncores = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_CORE); + npus = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PU); + ncores = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_CORE); if (npus == ncores && !use_hwthread_cpus) { /* the bits in this bitmap represent cores */ bits_as_cores = true; @@ -224,10 +224,10 @@ static void display_cpus(prte_topology_t *t, pmix_output(prte_clean_output, "\n====================== AVAILABLE PROCESSORS [node: %s] ======================\n\n", node); } - npkgs = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PACKAGE); allowed = (hwloc_cpuset_t)hwloc_topology_get_allowed_cpuset(t->topo); for (pkg = 0; pkg < npkgs; pkg++) { - obj = hwloc_get_obj_by_type(t->topo, HWLOC_OBJ_PACKAGE, pkg); + obj = prte_hwloc_base_get_obj_by_type(t->topo, HWLOC_OBJ_PACKAGE, pkg); hwloc_bitmap_and(avail, obj->cpuset, allowed); if (hwloc_bitmap_iszero(avail)) { if (parsable) { diff --git a/src/mca/rmaps/base/rmaps_base_binding.c b/src/mca/rmaps/base/rmaps_base_binding.c index 551100ea60..0a3ac5efdc 100644 --- a/src/mca/rmaps/base/rmaps_base_binding.c +++ b/src/mca/rmaps/base/rmaps_base_binding.c @@ -16,7 +16,7 @@ * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018 Inria. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -61,6 +61,7 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, hwloc_obj_type_t type; hwloc_obj_t target; hwloc_cpuset_t tgtcpus, tmpcpus; + int nobjs, n; pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps: bind %s with policy %s", @@ -75,25 +76,29 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, if (NULL == options->target) { return PRTE_ERROR; } -#if HWLOC_API_VERSION < 0x20000 - tgtcpus = target->allowed_cpuset; -#else tgtcpus = target->cpuset; -#endif hwloc_bitmap_and(prte_rmaps_base.baseset, options->target, tgtcpus); - trg_obj = NULL; - /* find the first object of that type in the target that has at least one available CPU */ - tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo, - prte_rmaps_base.baseset, - options->hwb, NULL); - while (NULL != tmp_obj) { -#if HWLOC_API_VERSION < 0x20000 - tmpcpus = tmp_obj->allowed_cpuset; -#else + nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, options->hwb); + + // check for target object existence + if (0 == nobjs) { + // if this is not a default binding policy, then error out + if (PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { + pmix_show_help("help-prte-rmaps-base.txt", "rmaps:binding-target-not-found", + true, prte_hwloc_base_print_binding(jdata->map->binding), node->name); + return PRTE_ERR_SILENT; + } + // fallback to not binding + return PRTE_SUCCESS; + } + + for (n=0; n < nobjs; n++) { + tmp_obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, options->hwb, n); tmpcpus = tmp_obj->cpuset; -#endif hwloc_bitmap_and(prte_rmaps_base.available, node->available, tmpcpus); + hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, prte_rmaps_base.baseset); + if (options->use_hwthreads) { ncpus = hwloc_bitmap_weight(prte_rmaps_base.available); } else { @@ -112,9 +117,6 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, trg_obj = tmp_obj; 
break; } - tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo, - prte_rmaps_base.baseset, - options->hwb, tmp_obj); } if (NULL == trg_obj) { /* there aren't any appropriate targets under this object */ @@ -126,11 +128,10 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, } } -#if HWLOC_API_VERSION < 0x20000 - tgtcpus = trg_obj->allowed_cpuset; -#else tgtcpus = trg_obj->cpuset; -#endif + if (NULL == tgtcpus) { + return PRTE_ERROR; + } hwloc_bitmap_list_asprintf(&proc->cpuset, tgtcpus); // bind to the entire target object if (4 < pmix_output_get_verbosity(prte_rmaps_base_framework.framework_output)) { char *tmp1; @@ -156,19 +157,11 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, tmp_obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, prte_rmaps_base.available, type, 0); -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_andnot(node->available, node->available, tmp_obj->allowed_cpuset); - if (hwloc_bitmap_iszero(node->available) && options->overload) { - /* reset the availability */ - hwloc_bitmap_copy(node->available, node->jobcache); - } -#else hwloc_bitmap_andnot(node->available, node->available, tmp_obj->cpuset); if (hwloc_bitmap_iszero(node->available) && options->overload) { /* reset the availability */ hwloc_bitmap_copy(node->available, node->jobcache); } -#endif return PRTE_SUCCESS; } @@ -217,21 +210,13 @@ static int bind_to_cpuset(prte_job_t *jdata, * cpu in the list. 
Since we are assigning * procs as they are mapped, this ensures they * will be assigned in order */ -#if HWLOC_API_VERSION < 0x20000 - tset = root->allowed_cpuset; -#else tset = root->cpuset; -#endif obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, tset, type, idx); if (NULL == obj) { PMIX_ARGV_FREE_COMPAT(cpus); return PRTE_ERR_OUT_OF_RESOURCE; } -#if HWLOC_API_VERSION < 0x20000 - tset = obj->allowed_cpuset; -#else tset = obj->cpuset; -#endif } else { /* bind the proc to all assigned cpus */ tset = options->target; @@ -239,15 +224,11 @@ static int bind_to_cpuset(prte_job_t *jdata, /* sanity check - are all the target cpus in a single * package, or do they span packages? */ - npkgs = hwloc_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE); included = false; for (n=0; n < npkgs; n++) { - pkg = hwloc_get_obj_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, n); -#if HWLOC_API_VERSION < 0x20000 - rc = hwloc_bitmap_isincluded(tset, pkg->allowed_cpuset); -#else + pkg = prte_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, n); rc = hwloc_bitmap_isincluded(tset, pkg->cpuset); -#endif if (1 == rc) { included = true; break; @@ -278,19 +259,10 @@ static int bind_to_cpuset(prte_job_t *jdata, * the cpuset is assigned to a proc. When all the cpus in the * set have been removed, we know that the set will be overloaded * if any more procs are assigned to it. 
*/ -#if HWLOC_API_VERSION < 0x20000 - tset = root->allowed_cpuset; -#else tset = root->cpuset; -#endif obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, tset, type, idx); - if (NULL == obj) { - } else { -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_andnot(node->available, node->available, obj->allowed_cpuset); -#else + if (NULL != obj) { hwloc_bitmap_andnot(node->available, node->available, obj->cpuset); -#endif } return PRTE_SUCCESS; } @@ -320,11 +292,7 @@ static int bind_multiple(prte_job_t *jdata, prte_proc_t *proc, } else { target = obj; } -#if HWLOC_API_VERSION < 0x20000 - tgtcpus = target->allowed_cpuset; -#else tgtcpus = target->cpuset; -#endif hwloc_bitmap_and(prte_rmaps_base.baseset, options->target, tgtcpus); if (options->use_hwthreads) { type = HWLOC_OBJ_PU; @@ -336,14 +304,10 @@ static int bind_multiple(prte_job_t *jdata, prte_proc_t *proc, * packages, so we need to ensure we set the * available processors to cover whichever package * has enough CPUs to fill the request */ - npkgs = hwloc_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE); for (n=0; n < npkgs; n++) { - pkg = hwloc_get_obj_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, n); -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.baseset, pkg->allowed_cpuset); -#else + pkg = prte_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, n); hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.baseset, pkg->cpuset); -#endif hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, node->available); ncpus = hwloc_get_nbobjs_inside_cpuset_by_type(node->topology->topo, prte_rmaps_base.available, type); if (ncpus >= options->cpus_per_rank) { @@ -370,15 +334,9 @@ static int bind_multiple(prte_job_t *jdata, prte_proc_t *proc, for (n=0; n < options->cpus_per_rank; n++) { tmp_obj = 
hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, prte_rmaps_base.available, type, n); if (NULL != tmp_obj) { -#if HWLOC_API_VERSION < 0x20000 - hwloc_bitmap_or(result, result, tmp_obj->allowed_cpuset); - hwloc_bitmap_andnot(node->available, node->available, tmp_obj->allowed_cpuset); - hwloc_bitmap_andnot(options->target, options->target, tmp_obj->allowed_cpuset); -#else hwloc_bitmap_or(result, result, tmp_obj->cpuset); hwloc_bitmap_andnot(node->available, node->available, tmp_obj->cpuset); hwloc_bitmap_andnot(options->target, options->target, tmp_obj->cpuset); -#endif } } hwloc_bitmap_list_asprintf(&proc->cpuset, result); diff --git a/src/mca/rmaps/base/rmaps_base_frame.c b/src/mca/rmaps/base/rmaps_base_frame.c index 041daebf1a..4f8a6acd98 100644 --- a/src/mca/rmaps/base/rmaps_base_frame.c +++ b/src/mca/rmaps/base/rmaps_base_frame.c @@ -81,7 +81,7 @@ static int prte_rmaps_base_register(pmix_mca_base_register_flag_t flags) (void) pmix_mca_base_var_register("prte", "rmaps", "default", "mapping_policy", "Default mapping Policy [slot | hwthread | core | l1cache | " "l2cache | l3cache | numa | package | node | seq | dist | ppr | " - "rankfile | likwid | pe-list=a,b (comma-delimited ranges of cpus to use for this job)]," + "rankfile | pe-list=a,b (comma-delimited ranges of cpus to use for this job)]," " with supported colon-delimited modifiers: PE=y (for multiple cpus/proc), " "SPAN, OVERSUBSCRIBE, NOOVERSUBSCRIBE, NOLOCAL, HWTCPUS, CORECPUS, " "DEVICE=dev (for dist policy), INHERIT, NOINHERIT, ORDERED, FILE=%s (path to file containing sequential " @@ -408,11 +408,11 @@ int prte_rmaps_base_set_default_mapping(prte_job_t *jdata, } } else { /* if package is available, map by that */ - if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) { + if (NULL != prte_hwloc_base_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) { pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps mapping not set by user - using 
bypackage"); PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYPACKAGE); - } else if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) { + } else if (NULL != prte_hwloc_base_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) { /* if NUMA is available, map by that */ pmix_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps mapping not set by user - using bynuma"); diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index 70980d0354..93dd97291a 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -388,7 +388,8 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) options.maptype = HWLOC_OBJ_CORE; options.mapdepth = PRTE_BIND_TO_CORE; } else if (0 == strncasecmp(ck[1], "package", len) || - 0 == strncasecmp(ck[1], "skt", len)) { + 0 == strncasecmp(ck[1], "skt", len) || + 0 == strncasecmp(ck[1], "socket", len)) { options.maptype = HWLOC_OBJ_PACKAGE; options.mapdepth = PRTE_BIND_TO_PACKAGE; } else if (0 == strncasecmp(ck[1], "numa", len) || @@ -396,13 +397,13 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) options.maptype = HWLOC_OBJ_NUMANODE; options.mapdepth = PRTE_BIND_TO_NUMA; } else if (0 == strncasecmp(ck[1], "l1cache", len)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(1, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L1CACHE; options.mapdepth = PRTE_BIND_TO_L1CACHE; } else if (0 == strncasecmp(ck[1], "l2cache", len)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(2, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L2CACHE; options.mapdepth = PRTE_BIND_TO_L2CACHE; } else if (0 == strncasecmp(ck[1], "l3cache", len)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(3, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L3CACHE; options.mapdepth = PRTE_BIND_TO_L3CACHE; } else { /* unknown spec */ @@ -449,42 +450,33 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) /* 
add in #packages for each node */ PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_PACKAGE, 0); + HWLOC_OBJ_PACKAGE); } } else if (HWLOC_OBJ_NUMANODE== options.maptype) { /* add in #numa for each node */ PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_NUMANODE, 0); + HWLOC_OBJ_NUMANODE); } -#if HWLOC_API_VERSION < 0x20000 - } else if (HWLOC_OBJ_CACHE == options.maptype) { - /* add in #cache for each node */ - PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { - app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options.maptype, options.cmaplvl); - } -#else } else if (HWLOC_OBJ_L1CACHE == options.maptype || HWLOC_OBJ_L2CACHE == options.maptype || HWLOC_OBJ_L1CACHE == options.maptype) { /* add in #cache for each node */ PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options.maptype, options.cmaplvl); + options.maptype); } -#endif } else if (HWLOC_OBJ_CORE == options.maptype) { /* add in #cores for each node */ PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_CORE, 0); + HWLOC_OBJ_CORE); } } else if (HWLOC_OBJ_PU == options.maptype) { /* add in #hwt for each node */ PMIX_LIST_FOREACH (node, &nodes, prte_node_t) { app->num_procs += options.pprn * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_PU, 0); + HWLOC_OBJ_PU); } } } else { @@ -571,15 +563,15 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) break; case PRTE_MAPPING_BYL3CACHE: options.mapdepth = PRTE_BIND_TO_L3CACHE; - PRTE_HWLOC_MAKE_OBJ_CACHE(3, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L3CACHE; break; case 
PRTE_MAPPING_BYL2CACHE: options.mapdepth = PRTE_BIND_TO_L2CACHE; - PRTE_HWLOC_MAKE_OBJ_CACHE(2, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L2CACHE; break; case PRTE_MAPPING_BYL1CACHE: options.mapdepth = PRTE_BIND_TO_L1CACHE; - PRTE_HWLOC_MAKE_OBJ_CACHE(1, options.maptype, options.cmaplvl); + options.maptype = HWLOC_OBJ_L1CACHE; break; case PRTE_MAPPING_BYCORE: if (1 < options.cpus_per_rank && @@ -731,13 +723,13 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) options.hwb = HWLOC_OBJ_NUMANODE; break; case PRTE_BIND_TO_L3CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(3, options.hwb, options.clvl); + options.hwb = HWLOC_OBJ_L3CACHE; break; case PRTE_BIND_TO_L2CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(2, options.hwb, options.clvl); + options.hwb = HWLOC_OBJ_L2CACHE; break; case PRTE_BIND_TO_L1CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(1, options.hwb, options.clvl); + options.hwb = HWLOC_OBJ_L1CACHE; break; case PRTE_BIND_TO_CORE: options.hwb = HWLOC_OBJ_CORE; diff --git a/src/mca/rmaps/base/rmaps_base_ranking.c b/src/mca/rmaps/base/rmaps_base_ranking.c index c53861a0e6..8f4c630c78 100644 --- a/src/mca/rmaps/base/rmaps_base_ranking.c +++ b/src/mca/rmaps/base/rmaps_base_ranking.c @@ -14,7 +14,7 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -197,10 +197,10 @@ int prte_rmaps_base_compute_vpids(prte_job_t *jdata, } lrank = 0; nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options->maptype, options->cmaplvl); + options->maptype); for (k=0; k < nobjs; k++) { obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, - options->maptype, options->cmaplvl, k); + options->maptype, k); for (m=0; m < node->procs->size; m++) { proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); if (NULL == proc) { @@ -248,13 +248,13 @@ int prte_rmaps_base_compute_vpids(prte_job_t *jdata, continue; } nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options->maptype, options->cmaplvl); + options->maptype); lrank = pass * nobjs; /* make a pass across all objects on this node */ for (k=0; k < nobjs && rank < jdata->num_procs; k++) { /* get this object */ obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, - options->maptype, options->cmaplvl, k); + options->maptype, k); /* find an unranked proc on this object */ for (m=0; m < node->procs->size && rank < jdata->num_procs; m++) { proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); diff --git a/src/mca/rmaps/base/rmaps_base_support_fns.c b/src/mca/rmaps/base/rmaps_base_support_fns.c index 84389b21a9..0c090413d9 100644 --- a/src/mca/rmaps/base/rmaps_base_support_fns.c +++ b/src/mca/rmaps/base/rmaps_base_support_fns.c @@ -631,27 +631,14 @@ int prte_rmaps_base_get_ncpus(prte_node_t *node, { int ncpus; -#if HWLOC_API_VERSION < 0x20000 - hwloc_obj_t root; - root = hwloc_get_root_obj(node->topology->topo); if (NULL == options->job_cpuset) { - hwloc_bitmap_copy(prte_rmaps_base.available, root->allowed_cpuset); + hwloc_bitmap_copy(prte_rmaps_base.available, node->available); } else { - hwloc_bitmap_and(prte_rmaps_base.available, root->allowed_cpuset, options->job_cpuset); - } - if (NULL != obj) { - hwloc_bitmap_and(prte_rmaps_base.available, 
prte_rmaps_base.available, obj->allowed_cpuset); - } -#else - if (NULL == options->job_cpuset) { - hwloc_bitmap_copy(prte_rmaps_base.available, hwloc_topology_get_allowed_cpuset(node->topology->topo)); - } else { - hwloc_bitmap_and(prte_rmaps_base.available, hwloc_topology_get_allowed_cpuset(node->topology->topo), options->job_cpuset); + hwloc_bitmap_and(prte_rmaps_base.available, node->available, options->job_cpuset); } if (NULL != obj) { hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, obj->cpuset); } -#endif if (options->use_hwthreads) { ncpus = hwloc_bitmap_weight(prte_rmaps_base.available); } else { @@ -664,6 +651,7 @@ int prte_rmaps_base_get_ncpus(prte_node_t *node, */ ncpus = hwloc_get_nbobjs_inside_cpuset_by_type(node->topology->topo, prte_rmaps_base.available, HWLOC_OBJ_CORE); } + return ncpus; } diff --git a/src/mca/rmaps/ppr/rmaps_ppr.c b/src/mca/rmaps/ppr/rmaps_ppr.c index 56d1dd59ab..8f51636f0a 100644 --- a/src/mca/rmaps/ppr/rmaps_ppr.c +++ b/src/mca/rmaps/ppr/rmaps_ppr.c @@ -110,23 +110,12 @@ static int ppr_mapper(prte_job_t *jdata, mapping = PRTE_MAPPING_BYPACKAGE; } else if (HWLOC_OBJ_NUMANODE== options->maptype) { mapping = PRTE_MAPPING_BYNUMA; -#if HWLOC_API_VERSION < 0x20000 - } else if (HWLOC_OBJ_CACHE == options->maptype) { - if (1 == options->cmaplvl) { - mapping = PRTE_MAPPING_BYL1CACHE; - } else if (2 == options->cmaplvl) { - mapping = PRTE_MAPPING_BYL2CACHE; - } else if (3 == options->cmaplvl) { - mapping = PRTE_MAPPING_BYL3CACHE; - } -#else } else if (HWLOC_OBJ_L1CACHE == options->maptype) { mapping = PRTE_MAPPING_BYL1CACHE; } else if (HWLOC_OBJ_L2CACHE == options->maptype) { mapping = PRTE_MAPPING_BYL2CACHE; } else if (HWLOC_OBJ_L3CACHE == options->maptype) { mapping = PRTE_MAPPING_BYL3CACHE; -#endif } else if (HWLOC_OBJ_CORE == options->maptype) { mapping = PRTE_MAPPING_BYCORE; } else if (HWLOC_OBJ_PU == options->maptype) { @@ -248,7 +237,7 @@ static int ppr_mapper(prte_job_t *jdata, } else { /* get the number of 
resources on this node */ nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options->maptype, options->cmaplvl); + options->maptype); if (0 == nobjs) { continue; } @@ -268,7 +257,7 @@ static int ppr_mapper(prte_job_t *jdata, /* map the specified number of procs to each such resource on this node */ for (i = 0; i < nobjs && nprocs_mapped < app->num_procs; i++) { obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, - options->maptype, options->cmaplvl, i); + options->maptype, i); if (!prte_rmaps_base_check_avail(jdata, app, node, &node_list, obj, options)) { continue; } diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index ece80d7062..a29ff95a11 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -432,11 +432,7 @@ static int prte_rmaps_rf_map(prte_job_t *jdata, } /* Mark these slots as taken on this node */ -#if HWLOC_API_VERSION < 0x20000 hwloc_bitmap_andnot(node->available, node->available, proc_bitmap); -#else - hwloc_bitmap_andnot(node->available, node->available, proc_bitmap); -#endif /* cleanup */ free(cpu_bitmap); diff --git a/src/mca/rmaps/rmaps_types.h b/src/mca/rmaps/rmaps_types.h index 8182ef5a3f..28c5d4ce6c 100644 --- a/src/mca/rmaps/rmaps_types.h +++ b/src/mca/rmaps/rmaps_types.h @@ -97,7 +97,6 @@ typedef struct { unsigned ncpus; int nprocs; hwloc_obj_type_t maptype; - unsigned cmaplvl; /* #procs/resource as per PPR */ int pprn; @@ -112,7 +111,7 @@ typedef struct { prte_binding_policy_t bind; bool dobind; hwloc_obj_type_t hwb; - unsigned clvl; + uint16_t limit; /* usage tracking */ hwloc_cpuset_t target; diff --git a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c index ee992be256..239e81da86 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -628,7 +628,7 @@ int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t 
*app, /* get the number of objects of this type on this node */ nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - options->maptype, options->cmaplvl); + options->maptype); if (0 == nobjs) { /* this node doesn't have any objects of this type, so * we might as well drop it from consideration */ @@ -648,7 +648,7 @@ int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, "mca:rmaps:rr: assigning proc to object %d", j); /* get the hwloc object */ obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, - options->maptype, options->cmaplvl, j); + options->maptype, j); if (NULL == obj) { /* out of objects on this node */ break; diff --git a/src/mca/rtc/Makefile.am b/src/mca/rtc/Makefile.am deleted file mode 100644 index 9e27dfbdd7..0000000000 --- a/src/mca/rtc/Makefile.am +++ /dev/null @@ -1,27 +0,0 @@ -# -# Copyright (c) 2014-2019 Intel, Inc. All rights reserved. -# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# main library setup -noinst_LTLIBRARIES = libprtemca_rtc.la -libprtemca_rtc_la_SOURCES = - -# local files -headers = rtc.h -libprtemca_rtc_la_SOURCES += $(headers) - -# Conditionally install the header files -prtedir = $(prteincludedir)/$(subdir) -nobase_prte_HEADERS = $(headers) - -include base/Makefile.am - -distclean-local: - rm -f base/static-components.h diff --git a/src/mca/rtc/base/Makefile.am b/src/mca/rtc/base/Makefile.am deleted file mode 100644 index 3f60f7c6c9..0000000000 --- a/src/mca/rtc/base/Makefile.am +++ /dev/null @@ -1,20 +0,0 @@ -# -# Copyright (c) 2014-2019 Intel, Inc. All rights reserved. -# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -headers += \ - base/base.h - -libprtemca_rtc_la_SOURCES += \ - base/rtc_base_frame.c \ - base/rtc_base_select.c \ - base/rtc_base_stubs.c - -dist_prtedata_DATA = base/help-prte-rtc-base.txt diff --git a/src/mca/rtc/base/base.h b/src/mca/rtc/base/base.h deleted file mode 100644 index c646c1d69a..0000000000 --- a/src/mca/rtc/base/base.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * rtc framework base functionality. - */ - -#ifndef PRTE_MCA_RTC_BASE_H -#define PRTE_MCA_RTC_BASE_H - -/* - * includes - */ -#include "prte_config.h" -#include "types.h" - -#include "src/class/pmix_list.h" -#include "src/mca/mca.h" -#include "src/mca/odls/base/base.h" -#include "src/util/pmix_printf.h" - -#include "src/mca/rtc/rtc.h" - -BEGIN_C_DECLS - -/* - * MCA Framework - */ -PRTE_EXPORT extern pmix_mca_base_framework_t prte_rtc_base_framework; -/* select a component */ -PRTE_EXPORT int prte_rtc_base_select(void); - -/* - * Global functions for MCA overall collective open and close - */ - -/** - * Struct to hold data global to the rtc framework - */ -typedef struct { - /* list of selected modules */ - pmix_list_t actives; -} prte_rtc_base_t; - -/** - * Global instance of rtc-wide framework data - */ -PRTE_EXPORT extern prte_rtc_base_t prte_rtc_base; - -/** - * Select an rtc component / module - */ -typedef struct { - pmix_list_item_t super; - int pri; - prte_rtc_base_module_t *module; - pmix_mca_base_component_t *component; -} prte_rtc_base_selected_module_t; -PMIX_CLASS_DECLARATION(prte_rtc_base_selected_module_t); - -PRTE_EXPORT void prte_rtc_base_assign(prte_job_t *jdata); -PRTE_EXPORT void prte_rtc_base_set(prte_odls_spawn_caddy_t *cd, 
int error_fd); -PRTE_EXPORT void prte_rtc_base_get_avail_vals(pmix_list_t *vals); - -/* Called from the child to send a warning show_help message up the - pipe to the waiting parent. */ -PRTE_EXPORT int prte_rtc_base_send_warn_show_help(int fd, const char *file, const char *topic, ...); - -/* Called from the child to send an error message up the pipe to the - waiting parent. */ -PRTE_EXPORT void prte_rtc_base_send_error_show_help(int fd, int exit_status, const char *file, - const char *topic, ...); - -END_C_DECLS - -#endif diff --git a/src/mca/rtc/base/help-prte-rtc-base.txt b/src/mca/rtc/base/help-prte-rtc-base.txt deleted file mode 100644 index 8c58b7b20c..0000000000 --- a/src/mca/rtc/base/help-prte-rtc-base.txt +++ /dev/null @@ -1,287 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for PRTE's prun. -# -[prte-rtc-base:not-all-mapped-alloc] -Some of the requested hosts are not included in the current allocation for the -application: - %s -The requested hosts were: - %s - -Verify that you have mapped the allocated resources properly using the ---host or --hostfile specification. 
-[prte-rtc-base:no-mapped-node] -There are no allocated resources for the application: - %s -that match the requested mapping: - %s: %s - -Verify that you have mapped the allocated resources properly for the -indicated specification. -[prte-rtc-base:nolocal-no-available-resources] -There are no available nodes allocated to this job. This could be because -no nodes were found or all the available nodes were already used. - -Note that since the -nolocal option was given no processes can be -launched on the local node. -[prte-rtc-base:no-available-resources] -No nodes are available for this job, either due to a failure to -allocate nodes to the job, or allocated nodes being marked -as unavailable (e.g., down, rebooting, or a process attempting -to be relocated to another node when none are available). -[prte-rtc-base:all-available-resources-used] -All nodes which are allocated for this job are already filled. -# -[out-of-vpids] -The system has exhausted its available ranks - the application is attempting -to spawn too many daemons and will be aborted. - -This may be resolved by increasing the number of available ranks by -re-configuring with the --enable-jumbo-apps option, and then -re-building the application. -# -[rtc:too-many-procs] -Your job has requested a conflicting number of processes for the -application: - -App: %s -number of procs: %d - -This is more processes than we can launch under the following -additional directives and conditions: - -%s: %d -%s: %d - -Please revise the conflict and try again. -# -[too-many-cpus-per-rank] -Your job has requested more cpus per process(rank) than there -are cpus in a package: - - Cpus/rank: %d - #cpus/package: %d - -Please correct one or both of these values and try again. -# -[failed-map] -Your job failed to map. Either no mapper was available, or none -of the available mappers was able to perform the requested -mapping operation. 
This can happen if you request a map type -(e.g., loadbalance) and the corresponding mapper was not built. -# -[unrecognized-policy] -The specified %s policy is not recognized: - - Policy: %s - -Please check for a typo or ensure that the option is a supported -one. -# -[redefining-policy] -Conflicting directives for %s policy are causing the policy -to be redefined: - - New policy: %s - Prior policy: %s - -Please check that only one policy is defined. -# -[rtc:binding-target-not-found] -A request was made to bind to %s, but an appropriate target could not -be found on node %s. -# -[rtc:binding-overload] -A request was made to bind to that would result in binding more -processes than cpus on a resource: - - Bind to: %s - Node: %s - #processes: %d - #cpus: %d - -You can override this protection by adding the "overload-allowed" -option to your binding directive. -# -[rtc:no-topology] -A mapping directive was given that requires knowledge of -a remote node's topology. However, no topology info is -available for the following node: - - Node: %s - -The job cannot be executed under this condition. Please either -remove the directive or investigate the lack of topology info. -# -[rtc:no-available-cpus] -While computing bindings, we found no available cpus on -the following node: - - Node: %s - -Please check your allocation. -# -[rtc:cpubind-not-supported] -A request was made to bind a process, but at least one node does NOT -support binding processes to cpus. - - Node: %s -# -[rtc:membind-not-supported] -WARNING: a request was made to bind a process. While the system -supports binding the process itself, at least one node does NOT -support binding memory to the process location. - - Node: %s - -This is a warning only; your job will continue, though performance may -be degraded. -# -[rtc:membind-not-supported-fatal] -A request was made to bind a process. 
While the system -supports binding the process itself, at least one node does NOT -support binding memory to the process location. - - Node: %s - -The provided memory binding policy requires that we abort the -job at this time. -# -[rtc:no-bindable-objects] -No bindable objects of the specified type were available -on at least one node: - - Node: %s - Target: %s -# -[rtc:unknown-binding-level] -Unknown binding level: - - Target: %s - Cache level: %u -# -[prte-rtc-base:missing-daemon] -While attempting to build a map of this job, a node -was detected to be missing a daemon: - - Node: %s - -This usually indicates a mismatch between what the -allocation provided for the node name versus what was -actually found on the node. -# -[prte-rtc-base:no-objects] -No objects of the specified type were found on at least one node: - - Type: %s - Node: %s - -The map cannot be done as specified. -# -[topo-file] -A topology file was given for the compute nodes, but -we were unable to correctly process it. Common errors -include incorrectly specifying the path to the file, -or the file being generated in a way that is incompatible -with the version of hwloc being used by OMPI. - - File: %s - -Please correct the problem and try again. -# -[deprecated] -The following command line options and corresponding MCA parameter have -been deprecated and replaced as follows: - - Command line options: - Deprecated: %s - Replacement: %s - - Equivalent MCA parameter: - Deprecated: %s - Replacement: %s - -The deprecated forms *will* disappear in a future version of PRTE. -Please update to the new syntax. -# -[mismatch-binding] -A request for multiple cpus-per-proc was given, but a conflicting binding -policy was specified: - - #cpus-per-proc: %d - type of cpus: %s - binding policy given: %s - -The correct binding policy for the given type of cpu is: - - correct binding policy: %s - -This is the binding policy we would apply by default for this -situation, so no binding need be specified. 
Please correct the -situation and try again. -# -[mapping-too-low] -A request for multiple cpus-per-proc was given, but a directive -was also given to map to an object level that has less cpus than -requested ones: - - #cpus-per-proc: %d - number of cpus: %d - map-by: %s - -Please specify a mapping level that has more cpus, or else let us -define a default mapping that will allow multiple cpus-per-proc. -# -[unrecognized-modifier] -The mapping request contains an unrecognized modifier: - - Request: %s - -Please check your request and try again. -# -[invalid-pattern] -The mapping request contains a pattern that doesn't match -the required syntax of #:object - - Pattern: %s - -Please check your request and try again. -# -[prte-rtc-base:oversubscribed] -The requested number of processes exceeds the allocated -number of slots: - - #slots: %d - #processes: %d - -This creates an oversubscribed condition that may adversely -impact performance when combined with the requested binding -operation. We will continue, but will not bind the processes. -This warning can be omitted by adding the "overload-allowed" -qualifier to the binding policy. -# -[cannot-launch] -Although we were able to map your job, we are unable to launch -it at this time due to required resources being busy. Please -try again later. diff --git a/src/mca/rtc/base/owner.txt b/src/mca/rtc/base/owner.txt deleted file mode 100644 index 4ad6f408ca..0000000000 --- a/src/mca/rtc/base/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: INTEL -status: maintenance diff --git a/src/mca/rtc/base/rtc_base_frame.c b/src/mca/rtc/base/rtc_base_frame.c deleted file mode 100644 index 07c3c0fd7f..0000000000 --- a/src/mca/rtc/base/rtc_base_frame.c +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. 
- * Copyright (c) 2015-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include - -#include "src/class/pmix_list.h" -#include "src/mca/base/pmix_base.h" -#include "src/mca/mca.h" - -#include "src/mca/rtc/base/base.h" -/* - * The following file was created by configure. It contains extern - * statements and the definition of an array of pointers to each - * component's public pmix_mca_base_component_t struct. - */ - -#include "src/mca/rtc/base/static-components.h" - -/* - * Global variables - */ -prte_rtc_API_module_t prte_rtc = { - .assign = prte_rtc_base_assign, - .set = prte_rtc_base_set, - .get_available_values = prte_rtc_base_get_avail_vals -}; -prte_rtc_base_t prte_rtc_base = { - .actives = PMIX_LIST_STATIC_INIT -}; - -static int prte_rtc_base_close(void) -{ - pmix_list_item_t *item; - - /* cleanup globals */ - while (NULL != (item = pmix_list_remove_first(&prte_rtc_base.actives))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&prte_rtc_base.actives); - - return pmix_mca_base_framework_components_close(&prte_rtc_base_framework, NULL); -} - -/** - * Function for finding and opening either all MCA components, or the one - * that was specifically requested via a MCA parameter. 
- */ -static int prte_rtc_base_open(pmix_mca_base_open_flag_t flags) -{ - /* init the globals */ - PMIX_CONSTRUCT(&prte_rtc_base.actives, pmix_list_t); - - /* Open up all available components */ - return pmix_mca_base_framework_components_open(&prte_rtc_base_framework, flags); -} - -PMIX_MCA_BASE_FRAMEWORK_DECLARE(prte, rtc, "PRTE Mapping Subsystem", NULL, prte_rtc_base_open, - prte_rtc_base_close, prte_rtc_base_static_components, - PMIX_MCA_BASE_FRAMEWORK_FLAG_DEFAULT); - -static void mdes(prte_rtc_base_selected_module_t *active) -{ - if (NULL != active->module->finalize) { - active->module->finalize(); - } -} -PMIX_CLASS_INSTANCE(prte_rtc_base_selected_module_t, pmix_list_item_t, NULL, mdes); - -static void rcon(prte_rtc_resource_t *p) -{ - p->component = NULL; - p->category = NULL; - PMIX_CONSTRUCT(&p->control, prte_value_t); -} -static void rdes(prte_rtc_resource_t *p) -{ - if (NULL != p->component) { - free(p->component); - } - if (NULL != p->category) { - free(p->category); - } - PMIX_DESTRUCT(&p->control); -} -PMIX_CLASS_INSTANCE(prte_rtc_resource_t, pmix_list_item_t, rcon, rdes); diff --git a/src/mca/rtc/base/rtc_base_select.c b/src/mca/rtc/base/rtc_base_select.c deleted file mode 100644 index b0721b96c2..0000000000 --- a/src/mca/rtc/base/rtc_base_select.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include - -#include "src/mca/base/pmix_base.h" -#include "src/mca/mca.h" - -#include "src/mca/rtc/base/base.h" - -static bool selected = false; - -/* - * Function for selecting one component from all those that are - * available. 
- */ -int prte_rtc_base_select(void) -{ - pmix_mca_base_component_list_item_t *cli = NULL; - pmix_mca_base_component_t *component = NULL; - pmix_mca_base_module_t *module = NULL; - prte_rtc_base_module_t *nmodule; - prte_rtc_base_selected_module_t *newmodule, *mod; - int rc, priority; - bool inserted; - - if (selected) { - /* ensure we don't do this twice */ - return PRTE_SUCCESS; - } - selected = true; - - /* Query all available components and ask if they have a module */ - PMIX_LIST_FOREACH(cli, &prte_rtc_base_framework.framework_components, - pmix_mca_base_component_list_item_t) - { - component = (pmix_mca_base_component_t *) cli->cli_component; - - pmix_output_verbose(5, prte_rtc_base_framework.framework_output, - "mca:rtc:select: checking available component %s", - component->pmix_mca_component_name); - - /* If there's no query function, skip it */ - if (NULL == component->pmix_mca_query_component) { - pmix_output_verbose( - 5, prte_rtc_base_framework.framework_output, - "mca:rtc:select: Skipping component [%s]. It does not implement a query function", - component->pmix_mca_component_name); - continue; - } - - /* Query the component */ - pmix_output_verbose(5, prte_rtc_base_framework.framework_output, - "mca:rtc:select: Querying component [%s]", - component->pmix_mca_component_name); - rc = component->pmix_mca_query_component(&module, &priority); - - /* If no module was returned, then skip component */ - if (PRTE_SUCCESS != rc || NULL == module) { - pmix_output_verbose( - 5, prte_rtc_base_framework.framework_output, - "mca:rtc:select: Skipping component [%s]. Query failed to return a module", - component->pmix_mca_component_name); - continue; - } - nmodule = (prte_rtc_base_module_t *) module; - - /* give the module a chance to init */ - if (NULL != nmodule->init) { - if (PRTE_SUCCESS != (rc = nmodule->init())) { - pmix_output_verbose(5, prte_rtc_base_framework.framework_output, - "mca:rtc:select: Skipping component [%s]. 
Failed to init", - component->pmix_mca_component_name); - continue; - } - } - - /* add to the list of selected modules */ - newmodule = PMIX_NEW(prte_rtc_base_selected_module_t); - newmodule->pri = priority; - newmodule->module = nmodule; - newmodule->component = component; - - /* maintain priority order */ - inserted = false; - PMIX_LIST_FOREACH(mod, &prte_rtc_base.actives, prte_rtc_base_selected_module_t) - { - if (priority > mod->pri) { - pmix_list_insert_pos(&prte_rtc_base.actives, (pmix_list_item_t *) mod, - &newmodule->super); - inserted = true; - break; - } - } - if (!inserted) { - /* must be lowest priority - add to end */ - pmix_list_append(&prte_rtc_base.actives, &newmodule->super); - } - } - - if (4 < pmix_output_get_verbosity(prte_rtc_base_framework.framework_output)) { - pmix_output(0, "%s: Final RTC priorities", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - /* show the prioritized list */ - PMIX_LIST_FOREACH(mod, &prte_rtc_base.actives, prte_rtc_base_selected_module_t) - { - pmix_output(0, "\tModule: %s Priority: %d", mod->component->pmix_mca_component_name, - mod->pri); - } - } - - return PRTE_SUCCESS; -} diff --git a/src/mca/rtc/base/rtc_base_stubs.c b/src/mca/rtc/base/rtc_base_stubs.c deleted file mode 100644 index 35a95a17c9..0000000000 --- a/src/mca/rtc/base/rtc_base_stubs.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" - -#include "src/util/pmix_fd.h" -#include "src/util/pmix_show_help.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/mca/odls/odls_types.h" - -#include "src/mca/rtc/base/base.h" - -void prte_rtc_base_assign(prte_job_t *jdata) -{ - prte_rtc_base_selected_module_t *active; - - PMIX_LIST_FOREACH(active, &prte_rtc_base.actives, prte_rtc_base_selected_module_t) - { - if (NULL != active->module->assign) { - /* give this module a chance to operate on it */ - active->module->assign(jdata); - } - } -} - -void prte_rtc_base_set(prte_odls_spawn_caddy_t *cd, int error_fd) -{ - prte_rtc_base_selected_module_t *active; - - PMIX_LIST_FOREACH(active, &prte_rtc_base.actives, prte_rtc_base_selected_module_t) - { - if (NULL != active->module->set) { - /* give this module a chance to operate on it */ - active->module->set(cd, error_fd); - } - } -} - -void prte_rtc_base_get_avail_vals(pmix_list_t *vals) -{ - prte_rtc_base_selected_module_t *active; - - PMIX_LIST_FOREACH(active, &prte_rtc_base.actives, prte_rtc_base_selected_module_t) - { - if (NULL != active->module->get_available_values) { - /* give this module a chance to operate on it */ - active->module->get_available_values(vals); - } - } -} - -static int write_help_msg(int fd, prte_odls_pipe_err_msg_t *msg, const char *file, - const char *topic, va_list ap) -{ - int ret; - char *str; - - if (NULL == file || NULL == topic) { - return PRTE_ERR_BAD_PARAM; - } - - str = pmix_show_help_vstring(file, topic, true, ap); - - msg->file_str_len = (int) strlen(file); - if (msg->file_str_len > PRTE_ODLS_MAX_FILE_LEN) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - msg->topic_str_len = (int) strlen(topic); - if (msg->topic_str_len > PRTE_ODLS_MAX_TOPIC_LEN) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - msg->msg_str_len = (int) strlen(str); - - /* Only keep writing if each 
write() succeeds */ - if (PRTE_SUCCESS != (ret = pmix_fd_write(fd, sizeof(*msg), msg))) { - goto out; - } - if (msg->file_str_len > 0 - && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->file_str_len, file))) { - goto out; - } - if (msg->topic_str_len > 0 - && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->topic_str_len, topic))) { - goto out; - } - if (msg->msg_str_len > 0 && PRTE_SUCCESS != (ret = pmix_fd_write(fd, msg->msg_str_len, str))) { - goto out; - } - -out: - free(str); - return ret; -} - -int prte_rtc_base_send_warn_show_help(int fd, const char *file, const char *topic, ...) -{ - int ret; - va_list ap; - prte_odls_pipe_err_msg_t msg; - - msg.fatal = false; - msg.exit_status = 0; /* ignored */ - - /* Send it */ - va_start(ap, topic); - ret = write_help_msg(fd, &msg, file, topic, ap); - va_end(ap); - - return ret; -} - -void prte_rtc_base_send_error_show_help(int fd, int exit_status, const char *file, - const char *topic, ...) -{ - va_list ap; - prte_odls_pipe_err_msg_t msg; - - msg.fatal = true; - msg.exit_status = exit_status; - - /* Send it */ - va_start(ap, topic); - write_help_msg(fd, &msg, file, topic, ap); - va_end(ap); - - exit(exit_status); -} diff --git a/src/mca/rtc/hwloc/Makefile.am b/src/mca/rtc/hwloc/Makefile.am deleted file mode 100644 index 73fb56b8de..0000000000 --- a/src/mca/rtc/hwloc/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. -# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2022 Nanook Consulting. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_prtedata_DATA = help-prte-rtc-hwloc.txt - -sources = \ - rtc_hwloc.c \ - rtc_hwloc.h \ - rtc_hwloc_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_prte_rtc_hwloc_DSO -component_noinst = -component_install = prte_mca_rtc_hwloc.la -else -component_noinst = libprtemca_rtc_hwloc.la -component_install = -endif - -mcacomponentdir = $(prtelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -prte_mca_rtc_hwloc_la_SOURCES = $(sources) -prte_mca_rtc_hwloc_la_LDFLAGS = -module -avoid-version -prte_mca_rtc_hwloc_la_LIBADD = $(top_builddir)/src/libprrte.la - -noinst_LTLIBRARIES = $(component_noinst) -libprtemca_rtc_hwloc_la_SOURCES =$(sources) -libprtemca_rtc_hwloc_la_LDFLAGS = -module -avoid-version diff --git a/src/mca/rtc/hwloc/help-prte-rtc-hwloc.txt b/src/mca/rtc/hwloc/help-prte-rtc-hwloc.txt deleted file mode 100644 index cdd48f0d42..0000000000 --- a/src/mca/rtc/hwloc/help-prte-rtc-hwloc.txt +++ /dev/null @@ -1,30 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved -# Copyright (c) 2010-2012 Los Alamos National Security, LLC. -# All rights reserved. -# -# Copyright (c) 2017-2019 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for PRTE's hwloc support. -# -[sys call fail] -A system call failed during shared memory initialization that should -not have. - - Local host: %s - System call: %s %s - Error: %s (errno %d) -# -[target full] -It appears as if there is not enough space for %s (the shared-memory backing -file for hwloc topology). - - Local host: %s - Space Requested: %lu B - Space Available: %llu B diff --git a/src/mca/rtc/hwloc/owner.txt b/src/mca/rtc/hwloc/owner.txt deleted file mode 100644 index 4ad6f408ca..0000000000 --- a/src/mca/rtc/hwloc/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. 
active, maintenance, unmaintained -# -owner: INTEL -status: maintenance diff --git a/src/mca/rtc/hwloc/rtc_hwloc.h b/src/mca/rtc/hwloc/rtc_hwloc.h deleted file mode 100644 index 5995dee3d1..0000000000 --- a/src/mca/rtc/hwloc/rtc_hwloc.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2017 Inria. All rights reserved. - * Copyright (c) 2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef PRTE_RTC_HWLOC_H -#define PRTE_RTC_HWLOC_H - -#include "prte_config.h" - -#include "src/mca/rtc/rtc.h" - -BEGIN_C_DECLS - -typedef enum { - VM_HOLE_NONE = -1, - VM_HOLE_BEGIN = 0, /* use hole at the very beginning */ - VM_HOLE_AFTER_HEAP = 1, /* use hole right after heap */ - VM_HOLE_BEFORE_STACK = 2, /* use hole right before stack */ - VM_HOLE_BIGGEST = 3, /* use biggest hole */ - VM_HOLE_IN_LIBS = 4, /* use biggest hole between heap and stack */ - VM_HOLE_CUSTOM = 5, /* use given address if available */ -} prte_rtc_hwloc_vm_hole_kind_t; - -typedef enum { - VM_MAP_FILE = 0, - VM_MAP_ANONYMOUS = 1, - VM_MAP_HEAP = 2, - VM_MAP_STACK = 3, - VM_MAP_OTHER = 4 /* vsyscall/vdso/vvar shouldn't occur since we stop after stack */ -} prte_rtc_hwloc_vm_map_kind_t; - -typedef struct { - prte_rtc_base_component_t super; - prte_rtc_hwloc_vm_hole_kind_t kind; -} prte_mca_rtc_hwloc_component_t; - -PRTE_MODULE_EXPORT extern prte_mca_rtc_hwloc_component_t prte_mca_rtc_hwloc_component; - -extern prte_rtc_base_module_t prte_rtc_hwloc_module; - -END_C_DECLS - -#endif /* PRTE_RTC_HWLOC_H */ diff --git a/src/mca/rtc/hwloc/rtc_hwloc_component.c b/src/mca/rtc/hwloc/rtc_hwloc_component.c deleted file mode 100644 index 7af01ba5be..0000000000 --- 
a/src/mca/rtc/hwloc/rtc_hwloc_component.c +++ /dev/null @@ -1,98 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2017-2019 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2017 Inria. All rights reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include "src/mca/base/pmix_base.h" -#include "src/mca/base/pmix_mca_base_var.h" - -#include "rtc_hwloc.h" - -/* - * Local functions - */ - -static int rtc_hwloc_query(pmix_mca_base_module_t **module, int *priority); -static int rtc_hwloc_register(void); - -static int my_priority; - -prte_mca_rtc_hwloc_component_t prte_mca_rtc_hwloc_component = { - .super = { - PRTE_RTC_BASE_VERSION_1_0_0, - - .pmix_mca_component_name = "hwloc", - PMIX_MCA_BASE_MAKE_VERSION(component, - PRTE_MAJOR_VERSION, - PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), - .pmix_mca_query_component = rtc_hwloc_query, - .pmix_mca_register_component_params = rtc_hwloc_register, - }, - .kind = VM_HOLE_BIGGEST -}; - -static char *biggest = "biggest"; -static char *vmhole; - -static int rtc_hwloc_register(void) -{ - pmix_mca_base_component_t *c = &prte_mca_rtc_hwloc_component.super; - - /* set as the default */ - my_priority = 70; - (void) pmix_mca_base_component_var_register(c, "priority", - "Priority of the HWLOC rtc component", - PMIX_MCA_BASE_VAR_TYPE_INT, - &my_priority); - - prte_mca_rtc_hwloc_component.kind = VM_HOLE_BIGGEST; - vmhole = biggest; - (void) pmix_mca_base_component_var_register(c, "vmhole", - "Kind of VM hole to identify - none, begin, biggest, libs, heap, stack 
(default=biggest)", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &vmhole); - if (0 == strcasecmp(vmhole, "none")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_NONE; - } else if (0 == strcasecmp(vmhole, "begin")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_BEGIN; - } else if (0 == strcasecmp(vmhole, "biggest")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_BIGGEST; - } else if (0 == strcasecmp(vmhole, "libs")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_IN_LIBS; - } else if (0 == strcasecmp(vmhole, "heap")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_AFTER_HEAP; - } else if (0 == strcasecmp(vmhole, "stack")) { - prte_mca_rtc_hwloc_component.kind = VM_HOLE_BEFORE_STACK; - } else { - pmix_output(0, "INVALID VM HOLE TYPE"); - return PRTE_ERROR; - } - - return PRTE_SUCCESS; -} - -static int rtc_hwloc_query(pmix_mca_base_module_t **module, int *priority) -{ - /* Only run on the HNP */ - - *priority = my_priority; - *module = (pmix_mca_base_module_t *) &prte_rtc_hwloc_module; - - return PRTE_SUCCESS; -} diff --git a/src/mca/rtc/rtc.h b/src/mca/rtc/rtc.h deleted file mode 100644 index 0ea03dab72..0000000000 --- a/src/mca/rtc/rtc.h +++ /dev/null @@ -1,108 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - * The PRTE Run-Time Control Framework (RTC) - * - */ - -#ifndef PRTE_MCA_RTC_H -#define PRTE_MCA_RTC_H - -#include "prte_config.h" -#include "types.h" - -#include "src/class/pmix_list.h" -#include "src/mca/mca.h" -#include "src/mca/odls/base/base.h" -#include "src/pmix/pmix-internal.h" -#include "src/runtime/prte_globals.h" - -BEGIN_C_DECLS - -typedef struct { - pmix_list_item_t super; - char *component; - char *category; - prte_value_t control; -} prte_rtc_resource_t; -PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_rtc_resource_t); - -/* Assign run-time controls for a given job. This provides each component with - * an opportunity to insert attributes into the prte_job_t and/or its - * associated proc structures that will be passed to backend daemons for - * controlling the job. For example, if the user specified a frequency - * setting for the job, then the freq component will have an opportunity - * to add an attribute to the job so the freq component on the remote daemons - * can "catch" it and perform the desired action - */ -typedef void (*prte_rtc_base_module_assign_fn_t)(prte_job_t *jdata); - -/* Set run-time controls for a given job and/or process. This can include - * controls for power, binding, memory, and any other resource on the node. - * Each active plugin will be given a chance to operate on the request, setting - * whatever controls that lie within its purview. - * - * Each module is responsible for reporting errors via the state machine. Thus, - * no error code is returned. However, warnings and error messages for the user - * can be output via the provided error_fd */ -typedef void (*prte_rtc_base_module_set_fn_t)(prte_odls_spawn_caddy_t *cd, - int error_fd); - -/* Return a list of valid controls values for this component. - * Each module is responsible for adding its control values - * to a list of prte_value_t objects. 
- */ -typedef void (*prte_rtc_base_module_get_avail_vals_fn_t)(pmix_list_t *vals); - -/* provide a way for the module to init during selection */ -typedef int (*prte_rtc_base_module_init_fn_t)(void); - -/* provide a chance for the module to finalize */ -typedef void (*prte_rtc_base_module_fini_fn_t)(void); - -/* - * rtc module version 1.0.0 - */ -typedef struct { - prte_rtc_base_module_init_fn_t init; - prte_rtc_base_module_fini_fn_t finalize; - prte_rtc_base_module_assign_fn_t assign; - prte_rtc_base_module_set_fn_t set; - prte_rtc_base_module_get_avail_vals_fn_t get_available_values; -} prte_rtc_base_module_t; - -/* provide a public API version */ -typedef struct { - prte_rtc_base_module_assign_fn_t assign; - prte_rtc_base_module_set_fn_t set; - prte_rtc_base_module_get_avail_vals_fn_t get_available_values; -} prte_rtc_API_module_t; - -/** - * rtc component version 1.0.0 - */ -typedef pmix_mca_base_component_t prte_rtc_base_component_t; - -/* declare the struct containing the public API */ -PRTE_EXPORT extern prte_rtc_API_module_t prte_rtc; - -/* - * Macro for use in components that are of type rtc - */ -#define PRTE_RTC_BASE_VERSION_1_0_0 PRTE_MCA_BASE_VERSION_3_0_0("rtc", 1, 0, 0) - -END_C_DECLS - -#endif diff --git a/src/mca/schizo/base/base.h b/src/mca/schizo/base/base.h index 57b2d85f0a..983eb862a1 100644 --- a/src/mca/schizo/base/base.h +++ b/src/mca/schizo/base/base.h @@ -2,7 +2,7 @@ * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -95,8 +95,6 @@ PRTE_EXPORT bool prte_schizo_base_check_directives(char *directive, PRTE_EXPORT bool prte_schizo_base_check_qualifiers(char *directive, char **valid, char *qual); -PRTE_EXPORT bool prte_schizo_base_check_prte_param(char *param); -PRTE_EXPORT bool prte_schizo_base_check_pmix_param(char *param); PRTE_EXPORT void prte_schizo_base_expose(char *param, char *prefix); PRTE_EXPORT int prte_schizo_base_add_directive(pmix_cli_result_t *results, const char *deprecated, const char *target, diff --git a/src/mca/schizo/base/schizo_base_frame.c b/src/mca/schizo/base/schizo_base_frame.c index 964afa6265..ad99aa0af7 100644 --- a/src/mca/schizo/base/schizo_base_frame.c +++ b/src/mca/schizo/base/schizo_base_frame.c @@ -3,7 +3,7 @@ * Copyright (c) 2015-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -140,6 +140,8 @@ bool prte_schizo_base_check_directives(char *directive, PRTE_CLI_L3CACHE, PRTE_CLI_NUMA, PRTE_CLI_PACKAGE, + "socket", // dealt with elsewhere + "skt", // dealt with elsewhere PRTE_CLI_NODE, NULL }; diff --git a/src/mca/schizo/base/schizo_base_stubs.c b/src/mca/schizo/base/schizo_base_stubs.c index 6a3bc7225a..149762c1c8 100644 --- a/src/mca/schizo/base/schizo_base_stubs.c +++ b/src/mca/schizo/base/schizo_base_stubs.c @@ -21,6 +21,7 @@ #include "src/include/pmix_frameworks.h" #include "src/include/prte_frameworks.h" #include "src/mca/errmgr/errmgr.h" +#include "src/mca/pmdl/base/base.h" #include "src/mca/schizo/base/base.h" #include "src/runtime/prte_globals.h" #include "src/util/pmix_argv.h" @@ -242,26 +243,6 @@ char *prte_schizo_base_strip_quotes(char *p) return pout; } -bool prte_schizo_base_check_prte_param(char *param) -{ - char *p; - size_t n; - int len; - - p = strchr(param, '_'); - len = (int)(p - param); - - if (0 == strncmp(param, "prte", len)) { - return true; - } - for (n=0; NULL != prte_framework_names[n]; n++) { - if (0 == strncmp(param, prte_framework_names[n], len)) { - return true; - } - } - return false; -} - int prte_schizo_base_parse_prte(int argc, int start, char **argv, char ***target) { int i; @@ -312,7 +293,7 @@ int prte_schizo_base_parse_prte(int argc, int start, char **argv, char ***target /* this is a generic MCA designation, so see if the parameter it * refers to belongs to one of our frameworks */ - use = prte_schizo_base_check_prte_param(p1); + use = pmix_pmdl_base_check_prte_param(p1); if (use) { /* replace the generic directive with a PRRTE specific * one so we know this has been processed */ @@ -361,51 +342,6 @@ int prte_schizo_base_parse_prte(int argc, int start, char **argv, char ***target return PRTE_SUCCESS; } -static char **pmix_frameworks_tocheck = pmix_framework_names; -static bool pmix_frameworks_setup = false; - -static void setup_pmix_frameworks(void) 
-{ - if (pmix_frameworks_setup) { - return; - } - pmix_frameworks_setup = true; - - char *env = getenv("PMIX_MCA_PREFIXES"); - if (NULL == env) { - return; - } - - // If we found the env variable, it will be a comma-delimited list - // of values. Split it into an argv-style array. - char **tmp = PMIX_ARGV_SPLIT_COMPAT(env, ','); - if (NULL != tmp) { - pmix_frameworks_tocheck = tmp; - } -} - -bool prte_schizo_base_check_pmix_param(char *param) -{ - char *p; - size_t n; - int len; - - setup_pmix_frameworks(); - - p = strchr(param, '_'); - len = (int)(p - param); - - if (0 == strncmp(param, "pmix", len)) { - return true; - } - for (n=0; NULL != pmix_frameworks_tocheck[n]; n++) { - if (0 == strncmp(param, pmix_frameworks_tocheck[n], len)) { - return true; - } - } - return false; -} - int prte_schizo_base_parse_pmix(int argc, int start, char **argv, char ***target) { int i; @@ -487,7 +423,7 @@ int prte_schizo_base_parse_pmix(int argc, int start, char **argv, char ***target /* this is a generic MCA designation, so see if the parameter it * refers to belongs to one of our frameworks */ - use = prte_schizo_base_check_pmix_param(p1); + use = pmix_pmdl_base_check_pmix_param(p1); if (use) { /* replace the generic directive with a PMIx specific * one so we know this has been processed */ diff --git a/src/mca/schizo/ompi/schizo-ompi-cli.rstxt b/src/mca/schizo/ompi/schizo-ompi-cli.rstxt index 78eb2fdcf3..d8cd380291 100644 --- a/src/mca/schizo/ompi/schizo-ompi-cli.rstxt +++ b/src/mca/schizo/ompi/schizo-ompi-cli.rstxt @@ -4,6 +4,8 @@ Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. Copyright (c) 2022 IBM Corporation. All rights reserved. Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + Copyright (c) 2024 Triad National Security, LLC. All rights reserved. 
+ $COPYRIGHT$ Additional copyrights may follow @@ -159,6 +161,10 @@ MPI Options * ``--initial-errhandler``: Specify the initial error handler that is attached to predefined communicators during the first MPI call. +* ``--memory-alloc-kinds``: Value is a comma separated list of + memory allocation kinds. +* ``--disable-gpu-support``: Specify to disable any accelerator support + built in to the Open MPI installation at run time. * ``--display-comm``: Display table of communication methods between MPI_COMM_WORLD ranks during MPI_Init * ``--display-comm-finalize``: Display table of communication methods diff --git a/src/mca/schizo/ompi/schizo_ompi.c b/src/mca/schizo/ompi/schizo_ompi.c index f98c1e4224..d13e9f2aaa 100644 --- a/src/mca/schizo/ompi/schizo_ompi.c +++ b/src/mca/schizo/ompi/schizo_ompi.c @@ -19,7 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2018-2022 IBM Corporation. All rights reserved. * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. - * Copyright (c) 2022 Triad National Security, LLC. All rights + * Copyright (c) 2022-2025 Triad National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -57,6 +57,7 @@ #include "src/mca/base/pmix_mca_base_vari.h" #include "src/mca/errmgr/errmgr.h" #include "src/mca/ess/base/base.h" +#include "src/mca/pmdl/base/base.h" #include "src/mca/rmaps/base/base.h" #include "src/mca/state/base/base.h" #include "src/runtime/prte_globals.h" @@ -193,9 +194,12 @@ static struct option ompioptions[] = { PMIX_OPTION_DEFINE(PRTE_CLI_DISABLE_RECOVERY, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_CONTINUOUS, PMIX_ARG_NONE), PMIX_OPTION_DEFINE("with-ft", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("disable-gpu-support", PMIX_ARG_NONE), - /* mpiexec mandated form launch key parameters */ + /* mpiexec mandated form launch key parameters - MPI 4.0 */ PMIX_OPTION_DEFINE("initial-errhandler", PMIX_ARG_REQD), + /* mpiexec mandated form launch key parameters - MPI 4.1*/ + PMIX_OPTION_DEFINE("memory-alloc-kinds", PMIX_ARG_REQD), /* Display Commumication Protocol : MPI_Init */ PMIX_OPTION_DEFINE("display-comm", PMIX_ARG_NONE), @@ -926,6 +930,31 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, free(p1); free(opt->values[0]); opt->values[0] = tmp; + } else if (0 == strncasecmp(opt->values[0], "ppr", strlen("ppr"))) { + // see if they specified "socket" as the resource + p1 = strdup(opt->values[0]); + p2 = strrchr(p1, ':'); + ++p2; + if (0 == strncasecmp(p2, "socket", strlen("socket")) || + 0 == strncasecmp(p2, "skt", strlen("skt"))) { + *p2 = '\0'; + pmix_asprintf(&p2, "%spackage", p1); + if (warn) { + pmix_asprintf(&tmp, "%s %s", option, opt->values[0]); + pmix_asprintf(&tmp2, "%s %s", option, p2); + /* can't just call show_help as we want every instance to be reported */ + output = pmix_show_help_string("help-schizo-base.txt", + "deprecated-converted", true, + tmp, tmp2); + fprintf(stderr, "%s\n", output); + free(output); + free(tmp); + free(tmp2); + } + free(opt->values[0]); + opt->values[0] = p2; + } + free(p1); } } /* --rank-by socket -> --rank-by package */ @@ -1575,6 +1604,24 @@ static int parse_env(char **srcenv, 
char ***dstenv, } } + if (NULL != (opt = pmix_cmd_line_get_param(results, "memory-alloc-kinds"))) { + rc = check_cache(&cache, &cachevals, "mpi_memory_alloc_kinds", opt->values[0]); + if (PRTE_SUCCESS != rc) { + PMIX_ARGV_FREE_COMPAT(cache); + PMIX_ARGV_FREE_COMPAT(cachevals); + return rc; + } + } + + if (NULL != (opt = pmix_cmd_line_get_param(results, "disable-gpu-support"))) { + rc = check_cache(&cache, &cachevals, "disable_gpu_support", "true"); + if (PRTE_SUCCESS != rc) { + PMIX_ARGV_FREE_COMPAT(cache); + PMIX_ARGV_FREE_COMPAT(cachevals); + return rc; + } + } + if (pmix_cmd_line_is_taken(results, "display-comm") && pmix_cmd_line_is_taken(results, "display-comm-finalize")) { PMIX_SETENV_COMPAT("OMPI_MCA_ompi_display_comm", "mpi_init,mpi_finalize", true, dstenv); @@ -1810,6 +1857,11 @@ static int parse_env(char **srcenv, char ***dstenv, return PRTE_SUCCESS; } +// NOTE: This code is fundamentally the same (module PMIX <-> OPAL) +// as the translate_params() routine in the OMPI repo's +// opal/mca/pmix/base/pmix_base_fns.c file. If there are +// changes here, there are likely to be changes there. + static bool check_prte_overlap(char *var, char *value) { char *tmp; @@ -1871,7 +1923,6 @@ static bool check_prte_overlap(char *var, char *value) return false; } - static bool check_pmix_overlap(char *var, char *value) { char *tmp; @@ -1909,10 +1960,6 @@ static bool check_pmix_overlap(char *var, char *value) return false; } -// NOTE: This code is fundamentally the same (module PMIX <-> OPAL) -// as the translate_params() routine in the OMPI repo's -// opal/mca/pmix/base/pmix_base_fns.c file. If there are -// changes here, there are likely to be changes there. 
static int translate_params(void) { char *evar, *tmp, *e2; @@ -1953,7 +2000,7 @@ static int translate_params(void) if (check_prte_overlap(&e2[len], evar)) { // check for pmix overlap check_pmix_overlap(&e2[len], evar); - } else if (prte_schizo_base_check_prte_param(&e2[len])) { + } else if (pmix_pmdl_base_check_prte_param(&e2[len])) { pmix_asprintf(&tmp, "PRTE_MCA_%s", &e2[len]); // set it, but don't overwrite if they already // have a value in our environment @@ -1961,7 +2008,7 @@ static int translate_params(void) free(tmp); // check for pmix overlap check_pmix_overlap(&e2[len], evar); - } else if (prte_schizo_base_check_pmix_param(&e2[len])) { + } else if (pmix_pmdl_base_check_pmix_param(&e2[len])) { pmix_asprintf(&tmp, "PMIX_MCA_%s", &e2[len]); // set it, but don't overwrite if they already // have a value in our environment @@ -1986,7 +2033,7 @@ static int translate_params(void) // see if this param relates to PRRTE if (check_prte_overlap(fv->mbvfv_var, fv->mbvfv_value)) { check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); - } else if (prte_schizo_base_check_prte_param(fv->mbvfv_var)) { + } else if (pmix_pmdl_base_check_prte_param(fv->mbvfv_var)) { pmix_asprintf(&tmp, "PRTE_MCA_%s", fv->mbvfv_var); // set it, but don't overwrite if they already // have a value in our environment @@ -1996,7 +2043,7 @@ static int translate_params(void) // REACHABLE frameworks, then we also need to set // the equivalent PMIx value check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); - } else if (prte_schizo_base_check_pmix_param(fv->mbvfv_var)) { + } else if (pmix_pmdl_base_check_pmix_param(fv->mbvfv_var)) { pmix_asprintf(&tmp, "PMIX_MCA_%s", fv->mbvfv_var); // set it, but don't overwrite if they already // have a value in our environment @@ -2015,10 +2062,12 @@ static int translate_params(void) pmix_mca_base_parse_paramfile(file, ¶ms); free(file); PMIX_LIST_FOREACH (fv, ¶ms, pmix_mca_base_var_file_value_t) { - // see if this param relates to PRRTE - if 
(check_prte_overlap(fv->mbvfv_var, fv->mbvfv_value)) { - check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); - } else if (prte_schizo_base_check_prte_param(fv->mbvfv_var)) { + // see if this param overlaps with PRRTE + check_prte_overlap(fv->mbvfv_var, fv->mbvfv_value); + // see if it overlaps with PMIx + check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); + // see if it relates to PRRTE + if (pmix_pmdl_base_check_prte_param(fv->mbvfv_var)) { pmix_asprintf(&tmp, "PRTE_MCA_%s", fv->mbvfv_var); // set it, but don't overwrite if they already // have a value in our environment @@ -2029,6 +2078,14 @@ static int translate_params(void) // the equivalent PMIx value check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); } + // see if it relates to PMIx + if (pmix_pmdl_base_check_pmix_param(fv->mbvfv_var)) { + pmix_asprintf(&tmp, "PMIX_MCA_%s", fv->mbvfv_var); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, fv->mbvfv_value, false); + free(tmp); + } } PMIX_LIST_DESTRUCT(¶ms); } @@ -2059,7 +2116,7 @@ static int detect_proxy(char *personalities) /* if we were told the proxy, then use it */ if (NULL != (evar = getenv("PRTE_MCA_schizo_proxy"))) { if (0 == strcmp(evar, "ompi")) { - return translate_params(); + return 100; } else { return 0; } diff --git a/src/mca/schizo/prte/schizo_prte.c b/src/mca/schizo/prte/schizo_prte.c index db18d7013e..bc37e4cfc5 100644 --- a/src/mca/schizo/prte/schizo_prte.c +++ b/src/mca/schizo/prte/schizo_prte.c @@ -196,6 +196,7 @@ static struct option prterunoptions[] = { PMIX_OPTION_SHORT_DEFINE(PRTE_CLI_PRELOAD_BIN, PMIX_ARG_NONE, 's'), PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_AGG_HELP, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_FWD_ENVIRON, PMIX_ARG_OPTIONAL), + PMIX_OPTION_DEFINE(PRTE_CLI_MEM_ALLOC_KIND, PMIX_ARG_REQD), // output options PMIX_OPTION_DEFINE(PRTE_CLI_OUTPUT, PMIX_ARG_REQD), @@ -310,6 +311,7 @@ static struct option prunoptions[] = { PMIX_OPTION_SHORT_DEFINE(PRTE_CLI_PRELOAD_BIN, PMIX_ARG_NONE, 
's'), PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_AGG_HELP, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_FWD_ENVIRON, PMIX_ARG_OPTIONAL), + PMIX_OPTION_DEFINE(PRTE_CLI_MEM_ALLOC_KIND, PMIX_ARG_REQD), // output options PMIX_OPTION_DEFINE(PRTE_CLI_OUTPUT, PMIX_ARG_REQD), @@ -819,6 +821,31 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, free(p1); free(opt->values[0]); opt->values[0] = tmp; + } else if (0 == strncasecmp(opt->values[0], "ppr", strlen("ppr"))) { + // see if they specified "socket" as the resource + p1 = strdup(opt->values[0]); + p2 = strrchr(p1, ':'); + ++p2; + if (0 == strncasecmp(p2, "socket", strlen("socket")) || + 0 == strncasecmp(p2, "skt", strlen("skt"))) { + *p2 = '\0'; + pmix_asprintf(&p2, "%spackage", p1); + if (warn) { + pmix_asprintf(&tmp, "%s %s", option, opt->values[0]); + pmix_asprintf(&tmp2, "%s %s", option, p2); + /* can't just call show_help as we want every instance to be reported */ + output = pmix_show_help_string("help-schizo-base.txt", + "deprecated-converted", true, + tmp, tmp2); + fprintf(stderr, "%s\n", output); + free(output); + free(tmp); + free(tmp2); + } + free(opt->values[0]); + opt->values[0] = p2; + } + free(p1); } } /* --rank-by */ diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 1591b9f7c7..cd4d0f8811 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -793,11 +793,7 @@ static void check_complete(int fd, short args, void *cbdata) pmix_output(0, "COULD NOT GET BOUND CPU FOR RESOURCE RELEASE"); continue; } -#if HWLOC_API_VERSION < 0x20000 - tgt = obj->allowed_cpuset; -#else tgt = obj->cpuset; -#endif } hwloc_bitmap_or(node->available, node->available, tgt); } diff --git a/src/prted/pmix/pmix_server_queries.c b/src/prted/pmix/pmix_server_queries.c index d3df3743c8..ce91333f6b 100644 --- a/src/prted/pmix/pmix_server_queries.c +++ b/src/prted/pmix/pmix_server_queries.c @@ -90,7 +90,8 @@ static void _query(int sd, short args, void *cbdata) pmix_proc_info_t 
*procinfo; pmix_data_array_t dry; prte_proc_t *proct; - pmix_proc_t *proc; + pmix_proc_t *proc, pproc; + pmix_info_t info; size_t sz; PRTE_HIDE_UNUSED_PARAMS(sd, args); @@ -375,20 +376,12 @@ static void _query(int sd, short args, void *cbdata) char *xmlbuffer = NULL; int len; kv = PMIX_NEW(prte_info_item_t); -#if HWLOC_API_VERSION < 0x20000 - /* get this from the v1.x API */ - if (0 != hwloc_topology_export_xmlbuffer(prte_hwloc_topology, &xmlbuffer, &len)) { - PMIX_RELEASE(kv); - continue; - } -#else /* get it from the v2 API */ if (0 != hwloc_topology_export_xmlbuffer(prte_hwloc_topology, &xmlbuffer, &len, HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)) { PMIX_RELEASE(kv); continue; } -#endif PMIX_INFO_LIST_ADD(rc, results, PMIX_HWLOC_XML_V1, xmlbuffer, PMIX_STRING); free(xmlbuffer); if (PMIX_SUCCESS != rc) { @@ -398,8 +391,6 @@ static void _query(int sd, short args, void *cbdata) } } else if (0 == strcmp(q->keys[n], PMIX_HWLOC_XML_V2)) { - /* we cannot provide it if we are using v1.x */ -#if HWLOC_API_VERSION >= 0x20000 if (NULL != prte_hwloc_topology) { char *xmlbuffer = NULL; int len; @@ -415,7 +406,6 @@ static void _query(int sd, short args, void *cbdata) goto done; } } -#endif } else if (0 == strcmp(q->keys[n], PMIX_PROC_URI)) { /* they want our URI */ @@ -755,18 +745,10 @@ static void _query(int sd, short args, void *cbdata) continue; } /* convert the topology to XML representation */ -#if HWLOC_API_VERSION < 0x20000 - /* get this from the v1.x API */ - if (0 != hwloc_topology_export_xmlbuffer(topo->topo, &str, &len)) { - continue; - } - PMIX_INFO_LIST_ADD(rc, nodelist, PMIX_HWLOC_XML_V1, str, PMIX_STRING); -#else if (0 != hwloc_topology_export_xmlbuffer(topo->topo, &str, &len, 0)) { continue; } PMIX_INFO_LIST_ADD(rc, nodelist, PMIX_HWLOC_XML_V2, str, PMIX_STRING); -#endif free(str); } /* convert list to array */ @@ -823,6 +805,27 @@ static void _query(int sd, short args, void *cbdata) } #endif +#ifdef PMIX_MEM_ALLOC_KIND + } else if (0 == strcmp(q->keys[n], 
PMIX_MEM_ALLOC_KIND)) { + pmix_value_t *value; + jdata = prte_get_job_data_object(jobid); + if (NULL == jdata) { + ret = PMIX_ERR_NOT_FOUND; + goto done; + } + PMIX_LOAD_PROCID(&pproc, jobid, PMIX_RANK_WILDCARD); + PMIX_INFO_LOAD(&info, PMIX_IMMEDIATE, NULL, PMIX_BOOL); + ret = PMIx_Get(&pproc, PMIX_MEM_ALLOC_KIND, &info, 1, &value); + if (PMIX_SUCCESS != ret) { + goto done; + } + PMIX_INFO_LIST_ADD(rc, results, PMIX_MEM_ALLOC_KIND, value->data.string, PMIX_STRING); + PMIX_VALUE_RELEASE(value); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto done; + } +#endif } else { fprintf(stderr, "Query for unrecognized attribute: %s\n", q->keys[n]); } diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index f8aa13cca1..58b8b296e1 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -302,11 +302,7 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) /* total available physical memory */ machine = hwloc_get_next_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_MACHINE, NULL); if (NULL != machine) { -#if HWLOC_API_VERSION < 0x20000 - PMIX_INFO_LIST_ADD(ret, info, PMIX_AVAIL_PHYS_MEMORY, &machine->memory.total_memory, PMIX_UINT64); -#else PMIX_INFO_LIST_ADD(ret, info, PMIX_AVAIL_PHYS_MEMORY, &machine->total_memory, PMIX_UINT64); -#endif } /* pass the mapping policy used for this job */ diff --git a/src/prted/prted_comm.c b/src/prted/prted_comm.c index c78fa51d88..054740afc8 100644 --- a/src/prted/prted_comm.c +++ b/src/prted/prted_comm.c @@ -65,7 +65,6 @@ #include "src/mca/grpcomm/base/base.h" #include "src/mca/iof/base/base.h" #include "src/mca/odls/base/base.h" -#include "src/mca/oob/base/base.h" #include "src/mca/plm/base/base.h" #include "src/mca/plm/plm.h" #include "src/mca/rmaps/rmaps_types.h" diff --git a/src/prted/prun_common.c b/src/prted/prun_common.c index 695aec2f21..3d2897aa25 100644 --- a/src/prted/prun_common.c +++ b/src/prted/prun_common.c @@ -661,6 +661,13 
@@ int prun_common(pmix_cli_result_t *results, PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_LOG_AGG, &flag, PMIX_BOOL); } +#ifdef PMIX_MEM_ALLOC_KIND + opt = pmix_cmd_line_get_param(results, PRTE_CLI_MEM_ALLOC_KIND); + if (NULL != opt) { + PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MEM_ALLOC_KIND, opt->values[0], PMIX_STRING); + } +#endif + /* give the schizo components a chance to add to the job info */ schizo->job_info(results, jinfo); @@ -779,7 +786,7 @@ int prun_common(pmix_cli_result_t *results, } else if (0 == strcmp(opt->values[0], "none")) { pname.rank = PMIX_RANK_INVALID; } else { - pname.rank = 0; + pname.rank = strtoul(opt->values[0], NULL, 10); } } else { pname.rank = 0; diff --git a/src/rml/Makefile.am b/src/rml/Makefile.am index 5cc9f3ca0e..c864b2c2e4 100644 --- a/src/rml/Makefile.am +++ b/src/rml/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved # Copyright (c) 2019 Intel, Inc. All rights reserved. -# Copyright (c) 2022 Nanook Consulting. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -32,3 +32,5 @@ libprrte_la_SOURCES += \ rml/rml_base_contact.c \ rml/rml_base_msg_handlers.c \ rml/routed_radix.c + +include rml/oob/Makefile.am diff --git a/src/rml/oob/Makefile.am b/src/rml/oob/Makefile.am new file mode 100644 index 0000000000..25704f9b3a --- /dev/null +++ b/src/rml/oob/Makefile.am @@ -0,0 +1,46 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved +# Copyright (c) 2012-2013 Los Alamos National Security, LLC. +# All rights reserved +# Copyright (c) 2014-2020 Intel, Inc. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022-2024 Nanook Consulting All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_prtedata_DATA += \ + rml/oob/help-oob-base.txt \ + rml/oob/help-oob-tcp.txt + +headers += \ + rml/oob/oob.h \ + rml/oob/oob_tcp.h \ + rml/oob/oob_tcp_listener.h \ + rml/oob/oob_tcp_common.h \ + rml/oob/oob_tcp_connection.h \ + rml/oob/oob_tcp_sendrecv.h \ + rml/oob/oob_tcp_hdr.h \ + rml/oob/oob_tcp_peer.h + +libprrte_la_SOURCES += \ + rml/oob/oob_tcp_component.c \ + rml/oob/oob_tcp.c \ + rml/oob/oob_tcp_listener.c \ + rml/oob/oob_tcp_common.c \ + rml/oob/oob_tcp_connection.c \ + rml/oob/oob_tcp_sendrecv.c \ + rml/oob/oob_base_stubs.c diff --git a/src/mca/oob/base/help-oob-base.txt b/src/rml/oob/help-oob-base.txt similarity index 93% rename from src/mca/oob/base/help-oob-base.txt rename to src/rml/oob/help-oob-base.txt index 41ae1761d7..009fcf0e0e 100644 --- a/src/mca/oob/base/help-oob-base.txt +++ b/src/rml/oob/help-oob-base.txt @@ -11,6 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2014-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow diff --git a/src/mca/oob/tcp/help-oob-tcp.txt b/src/rml/oob/help-oob-tcp.txt similarity index 98% rename from src/mca/oob/tcp/help-oob-tcp.txt rename to src/rml/oob/help-oob-tcp.txt index edbce3ef98..950599d810 100644 --- a/src/mca/oob/tcp/help-oob-tcp.txt +++ b/src/rml/oob/help-oob-tcp.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. # Copyright (c) 2015-2020 Cisco Systems, Inc. 
All rights reserved +# Copyright (c) 2024 Nanook Consulting All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow diff --git a/src/mca/oob/base/base.h b/src/rml/oob/oob.h similarity index 65% rename from src/mca/oob/base/base.h rename to src/rml/oob/oob.h index c3f1f04142..b901997b4b 100644 --- a/src/mca/oob/base/base.h +++ b/src/rml/oob/oob.h @@ -15,7 +15,7 @@ * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,12 +46,11 @@ #include "src/class/pmix_hash_table.h" #include "src/class/pmix_list.h" #include "src/event/event-internal.h" +#include "src/include/prte_stdatomic.h" #include "src/util/pmix_printf.h" - -#include "src/mca/mca.h" #include "src/threads/pmix_threads.h" -#include "src/mca/oob/oob.h" +#include "src/rml/rml_types.h" BEGIN_C_DECLS @@ -59,26 +58,55 @@ BEGIN_C_DECLS * Convenience Typedef */ typedef struct { - char *include; - char *exclude; - pmix_list_t components; - pmix_list_t actives; + int output; + uint32_t addr_count; /**< total number of addresses */ + int num_links; /**< number of logical links per physical device */ + int max_retries; /**< max number of retries before declaring peer gone */ int max_uri_length; - pmix_list_t peers; + pmix_list_t events; /**< events for monitoring connections */ + int peer_limit; /**< max size of tcp peer cache */ + pmix_list_t peers; // connection addresses for peers + + /* Port specifications */ + int tcp_sndbuf; /**< socket send buffer size */ + int tcp_rcvbuf; /**< socket recv buffer size */ + + /* IPv4 support */ + bool disable_ipv4_family; /**< disable this AF */ + char **tcp_static_ports; /**< Static ports - IPV4 */ + char **tcp_dyn_ports; /**< Dynamic ports - IPV4 */ + char 
**ipv4conns; + char **ipv4ports; + + /* IPv6 support */ + bool disable_ipv6_family; /**< disable this AF */ + char **tcp6_static_ports; /**< Static ports - IPV6 */ + char **tcp6_dyn_ports; /**< Dynamic ports - IPV6 */ + char **ipv6conns; + char **ipv6ports; + + /* connection support */ + pmix_list_t local_ifs; /**< prte list of local pmix_pif_t interfaces */ + char **if_masks; + int num_hnp_ports; /**< number of ports the HNP should listen on */ + pmix_list_t listeners; /**< List of sockets being monitored by event or thread */ + pmix_thread_t listen_thread; /**< handle to the listening thread */ + prte_atomic_bool_t listen_thread_active; + struct timeval listen_thread_tv; /**< Timeout when using listen thread */ + int stop_thread[2]; /**< pipe used to exit the listen thread */ + int keepalive_probes; /**< number of keepalives that can be missed before declaring error */ + int keepalive_time; /**< idle time in seconds before starting to send keepalives */ + int keepalive_intvl; /**< time between keepalives, in seconds */ + int retry_delay; /**< time to wait before retrying connection */ + int max_recon_attempts; /**< maximum number of times to attempt connect before giving up (-1 for + never) */ } prte_oob_base_t; PRTE_EXPORT extern prte_oob_base_t prte_oob_base; -typedef struct { - pmix_list_item_t super; - pmix_proc_t name; - prte_oob_base_component_t *component; - pmix_bitmap_t addressable; -} prte_oob_base_peer_t; -PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_oob_base_peer_t); - /* MCA framework */ -PRTE_EXPORT extern pmix_mca_base_framework_t prte_oob_base_framework; -PRTE_EXPORT int prte_oob_base_select(void); +PRTE_EXPORT int prte_oob_open(void); +PRTE_EXPORT void prte_oob_close(void); +PRTE_EXPORT int prte_oob_register(void); /* Access the OOB internal functions via set of event-based macros * for inserting messages and other commands into the @@ -112,15 +140,13 @@ PRTE_EXPORT void prte_oob_base_send_nb(int fd, short args, void *cbdata); #define PRTE_OOB_SEND(m) 
\ do { \ prte_oob_send_t *prte_oob_send_cd; \ - pmix_output_verbose(1, prte_oob_base_framework.framework_output, "%s OOB_SEND: %s:%d", \ + pmix_output_verbose(1, prte_oob_base.output, "%s OOB_SEND: %s:%d", \ PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__); \ prte_oob_send_cd = PMIX_NEW(prte_oob_send_t); \ prte_oob_send_cd->msg = (m); \ PRTE_PMIX_THREADSHIFT(prte_oob_send_cd, prte_event_base, prte_oob_base_send_nb); \ } while (0) -PRTE_EXPORT prte_oob_base_peer_t *prte_oob_base_get_peer(const pmix_proc_t *pr); - /* During initial wireup, we can only transfer contact info on the daemon * command line. This limits what we can send to a string representation of * the actual contact info, which gets sent in a uri-like form. Not every diff --git a/src/rml/oob/oob_base_stubs.c b/src/rml/oob/oob_base_stubs.c new file mode 100644 index 0000000000..31b2edf409 --- /dev/null +++ b/src/rml/oob/oob_base_stubs.c @@ -0,0 +1,494 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "prte_config.h" +#include "constants.h" + +#include "src/pmix/pmix-internal.h" +#include "src/runtime/prte_globals.h" +#include "src/util/pmix_argv.h" +#include "src/util/pmix_output.h" +#include "src/util/pmix_printf.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/rml/rml.h" +#include "src/mca/state/state.h" +#include "src/threads/pmix_threads.h" + +#include "src/rml/oob/oob.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_peer.h" + +static prte_oob_tcp_peer_t* process_uri(char *uri); + +void prte_oob_base_send_nb(int fd, short args, void *cbdata) +{ + prte_oob_send_t *cd = (prte_oob_send_t *) cbdata; + prte_rml_send_t *msg; + prte_oob_tcp_peer_t *peer; + pmix_proc_t hop; + int rc; + char *uri = NULL; + PRTE_HIDE_UNUSED_PARAMS(fd, args); + + PMIX_ACQUIRE_OBJECT(cd); + + /* done with this. release it now */ + msg = cd->msg; + PMIX_RELEASE(cd); + + pmix_output_verbose(5, prte_oob_base.output, + "%s oob:base:send to target %s - attempt %u", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&msg->dst), + msg->retries); + + /* don't try forever - if we have exceeded the number of retries, + * then report this message as undeliverable even if someone continues + * to think they could reach it */ + if (prte_rml_base.max_retries <= msg->retries) { + msg->status = PRTE_ERR_NO_PATH_TO_TARGET; + PRTE_RML_SEND_COMPLETE(msg); + return; + } + + /* do we have a route to this peer (could be direct)? */ + PMIX_LOAD_NSPACE(hop.nspace, PRTE_PROC_MY_NAME->nspace); + hop.rank = prte_rml_get_route(msg->dst.rank); + /* do we know this hop? 
*/ + if (NULL == (peer = prte_oob_tcp_peer_lookup(&hop))) { + /* if this message is going to the HNP, send it direct */ + if (PRTE_PROC_MY_HNP->rank == msg->dst.rank) { + hop.rank = PRTE_PROC_MY_HNP->rank; + peer = prte_oob_tcp_peer_lookup(&hop); + if (NULL != peer) { + goto send; + } + } + // see if we know the contact info for it + PRTE_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PROC_URI, &hop, (char **) &uri, PMIX_STRING); + if (PRTE_SUCCESS == rc && NULL != uri) { + peer = process_uri(uri); + if (NULL == peer) { + /* that is just plain wrong */ + pmix_output_verbose(5, prte_oob_base.output, + "%s oob:base:send addressee unknown %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&msg->dst)); + + if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { + /* just ignore the problem */ + PMIX_RELEASE(msg); + return; + } + PRTE_ACTIVATE_PROC_STATE(&hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); + PMIX_RELEASE(msg); + return; + } + } else { + // unable to send it + if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { + /* just ignore the problem */ + PMIX_RELEASE(msg); + return; + } + PRTE_ACTIVATE_PROC_STATE(&hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); + PMIX_RELEASE(msg); + return; + } + } + +send: + pmix_output_verbose(2, prte_oob_base.output, + "%s:[%s:%d] processing send to peer %s:%d seq_num = %d via %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__, + PRTE_NAME_PRINT(&msg->dst), msg->tag, msg->seq_num, + PRTE_NAME_PRINT(&peer->name)); + + /* add the msg to the hop's send queue */ + if (MCA_OOB_TCP_CONNECTED == peer->state) { + pmix_output_verbose(2, prte_oob_base.output, + "%s tcp:send_nb: already connected to %s - queueing for send", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); + MCA_OOB_TCP_QUEUE_SEND(msg, peer); + return; + } + + /* add the message to the queue for sending after the + * connection is formed + */ + MCA_OOB_TCP_QUEUE_PENDING(msg, peer); + + if 
(MCA_OOB_TCP_CONNECTING != peer->state && MCA_OOB_TCP_CONNECT_ACK != peer->state) { + /* we have to initiate the connection - again, we do not + * want to block while the connection is created. + * So throw us into an event that will create + * the connection via a mini-state-machine :-) + */ + pmix_output_verbose(2, prte_oob_base.output, + "%s tcp:send_nb: initiating connection to %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); + peer->state = MCA_OOB_TCP_CONNECTING; + PRTE_ACTIVATE_TCP_CONN_STATE(peer, prte_oob_tcp_peer_try_connect); + } +} + +/** + * Obtain a uri for initial connection purposes + * + * During initial wireup, we can only transfer contact info on the daemon + * command line. This limits what we can send to a string representation of + * the actual contact info, which gets sent in a uri-like form. Not every + * oob module can support this transaction, so this function will loop + * across all oob components/modules, letting each add to the uri string if + * it supports bootstrap operations. An error will be returned in the cbfunc + * if NO component can successfully provide a contact. + * + * Note: since there is a limit to what an OS will allow on a cmd line, we + * impose a limit on the length of the resulting uri via an MCA param. The + * default value of -1 implies unlimited - however, users with large numbers + * of interfaces on their nodes may wish to restrict the size. 
+ */ +void prte_oob_base_get_addr(char **uri) +{ + char *final = NULL, *tmp; + char *cptr = NULL, *tp, *tm; + size_t len = 0; + pmix_status_t rc; + + /* start with our process name */ + rc = prte_util_convert_process_name_to_string(&final, PRTE_PROC_MY_NAME); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + *uri = NULL; + return; + } + len = strlen(final); + + if (!prte_oob_base.disable_ipv4_family && + NULL != prte_oob_base.ipv4conns) { + tmp = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.ipv4conns, ','); + tp = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.ipv4ports, ','); + tm = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.if_masks, ','); + pmix_asprintf(&cptr, "tcp://%s:%s:%s", tmp, tp, tm); + free(tmp); + free(tp); + free(tm); + } +#if PRTE_ENABLE_IPV6 + if (!prte_oob_base.disable_ipv6_family && + NULL != prte_oob_base.ipv6conns) { + char *tmp2; + + /* Fixes #2498 + * RFC 3986, section 3.2.2 + * The notation in that case is to encode the IPv6 IP number in square brackets: + * "http://[2001:db8:1f70::999:de8:7648:6e8]:100/" + * A host identified by an Internet Protocol literal address, version 6 [RFC3513] + * or later, is distinguished by enclosing the IP literal within square brackets. + * This is the only place where square bracket characters are allowed in the URI + * syntax. In anticipation of future, as-yet-undefined IP literal address formats, + * an implementation may use an optional version flag to indicate such a format + * explicitly rather than rely on heuristic determination. 
+ */ + tmp = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.ipv6conns, ','); + tp = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.ipv6ports, ','); + tm = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.if_masks, ','); + if (NULL == cptr) { + /* no ipv4 stuff */ + pmix_asprintf(&cptr, "tcp6://[%s]:%s:%s", tmp, tp, tm); + } else { + pmix_asprintf(&tmp2, "%s;tcp6://[%s]:%s:%s", cptr, tmp, tp, tm); + free(cptr); + cptr = tmp2; + } + free(tmp); + free(tp); + free(tm); + } +#endif // PRTE_ENABLE_IPV6 + + /* check overall length for limits */ + if (0 < prte_oob_base.max_uri_length + && prte_oob_base.max_uri_length < (int) (len + strlen(cptr))) { + /* cannot accept the payload */ + free(final); + free(cptr); + *uri = NULL; + return; + } + /* add new value to final one */ + pmix_asprintf(&tmp, "%s;%s", final, cptr); + free(cptr); + free(final); + final = tmp; + + *uri = final; +} + +/* the host in this case is always in "dot" notation, and + * thus we do not need to do a DNS lookup to convert it */ +static int parse_uri(const uint16_t af_family, const char *host, const char *port, + struct sockaddr_storage *inaddr) +{ + struct sockaddr_in *in; + + if (AF_INET == af_family) { + memset(inaddr, 0, sizeof(struct sockaddr_in)); + in = (struct sockaddr_in *) inaddr; + in->sin_family = AF_INET; + in->sin_addr.s_addr = inet_addr(host); + if (in->sin_addr.s_addr == INADDR_NONE) { + return PRTE_ERR_BAD_PARAM; + } + ((struct sockaddr_in *) inaddr)->sin_port = htons(atoi(port)); + } +#if PRTE_ENABLE_IPV6 + else if (AF_INET6 == af_family) { + struct sockaddr_in6 *in6; + memset(inaddr, 0, sizeof(struct sockaddr_in6)); + in6 = (struct sockaddr_in6 *) inaddr; + + if (0 == inet_pton(AF_INET6, host, (void *) &in6->sin6_addr)) { + pmix_output(0, "oob_tcp_parse_uri: Could not convert %s\n", host); + return PRTE_ERR_BAD_PARAM; + } + in6->sin6_family = AF_INET6; + in6->sin6_port = htons(atoi(port)); + } +#endif + else { + return PRTE_ERR_NOT_SUPPORTED; + } + return PRTE_SUCCESS; +} + +static void set_addr(pmix_proc_t *peer, 
char **uris) +{ + char **addrs, **masks, *hptr; + char *tcpuri = NULL, *host, *ports, *masks_string; + int i, j, rc; + uint16_t af_family = AF_UNSPEC; + uint64_t ui64; + prte_oob_tcp_peer_t *pr; + prte_oob_tcp_addr_t *maddr; + + memcpy(&ui64, (char *) peer, sizeof(uint64_t)); + + for (i = 0; NULL != uris[i]; i++) { + tcpuri = strdup(uris[i]); + if (NULL == tcpuri) { + pmix_output_verbose(2, prte_oob_base.output, + "%s oob:tcp: out of memory", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); + continue; + } + if (0 == strncmp(uris[i], "tcp:", 4)) { + af_family = AF_INET; + host = tcpuri + strlen("tcp://"); + } else if (0 == strncmp(uris[i], "tcp6:", 5)) { +#if PRTE_ENABLE_IPV6 + af_family = AF_INET6; + host = tcpuri + strlen("tcp6://"); +#else // PRTE_ENABLE_IPV6 + /* we don't support this connection type */ + pmix_output_verbose(2, prte_oob_base.output, + "%s oob:tcp: address %s not supported", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uris[i]); + free(tcpuri); + continue; +#endif // PRTE_ENABLE_IPV6 + } else { + /* not one of ours */ + pmix_output_verbose(2, prte_oob_base.output, + "%s oob:tcp: ignoring address %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uris[i]); + free(tcpuri); + continue; + } + + /* this one is ours - record the peer */ + pmix_output_verbose(2, prte_oob_base.output, + "%s oob:tcp: working peer %s address %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer), uris[i]); + + /* separate the mask from the network addrs */ + masks_string = strrchr(tcpuri, ':'); + if (NULL == masks_string) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + free(tcpuri); + continue; + } + *masks_string = '\0'; + masks_string++; + masks = PMIX_ARGV_SPLIT_COMPAT(masks_string, ','); + + /* separate the ports from the network addrs */ + ports = strrchr(tcpuri, ':'); + if (NULL == ports) { + PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); + free(tcpuri); + continue; + } + *ports = '\0'; + ports++; + + /* split the addrs */ + /* if this is a tcp6 connection, the first one will have a '[' + * at the 
beginning of it, and the last will have a ']' at the + * end - we need to remove those extra characters + */ + hptr = host; +#if PRTE_ENABLE_IPV6 + if (AF_INET6 == af_family) { + if ('[' == host[0]) { + hptr = &host[1]; + } + if (']' == host[strlen(host) - 1]) { + host[strlen(host) - 1] = '\0'; + } + } +#endif // PRTE_ENABLE_IPV6 + addrs = PMIX_ARGV_SPLIT_COMPAT(hptr, ','); + + /* cycle across the provided addrs */ + for (j = 0; NULL != addrs[j]; j++) { + if (NULL == masks[j]) { + /* Missing mask information */ + pmix_output_verbose(2, prte_oob_base.output, + "%s oob:tcp: uri missing mask information.", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); + return; + } + /* if they gave us "localhost", then just take the first conn on our list */ + if (0 == strcasecmp(addrs[j], "localhost")) { +#if PRTE_ENABLE_IPV6 + if (AF_INET6 == af_family) { + if (NULL == prte_oob_base.ipv6conns + || NULL == prte_oob_base.ipv6conns[0]) { + continue; + } + host = prte_oob_base.ipv6conns[0]; + } else { +#endif // PRTE_ENABLE_IPV6 + if (NULL == prte_oob_base.ipv4conns + || NULL == prte_oob_base.ipv4conns[0]) { + continue; + } + host = prte_oob_base.ipv4conns[0]; +#if PRTE_ENABLE_IPV6 + } +#endif + } else { + host = addrs[j]; + } + + if (NULL == (pr = prte_oob_tcp_peer_lookup(peer))) { + pr = PMIX_NEW(prte_oob_tcp_peer_t); + PMIX_XFER_PROCID(&pr->name, peer); + pmix_output_verbose(20, prte_oob_base.output, + "%s SET_PEER ADDING PEER %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer)); + pmix_list_append(&prte_oob_base.peers, &pr->super); + } + + maddr = PMIX_NEW(prte_oob_tcp_addr_t); + ((struct sockaddr_storage *) &(maddr->addr))->ss_family = af_family; + if (PRTE_SUCCESS + != (rc = parse_uri(af_family, host, ports, + (struct sockaddr_storage *) &(maddr->addr)))) { + PRTE_ERROR_LOG(rc); + PMIX_RELEASE(maddr); + pmix_list_remove_item(&prte_oob_base.peers, &pr->super); + PMIX_RELEASE(pr); + return; + } + maddr->if_mask = atoi(masks[j]); + + pmix_output_verbose(20, 
prte_oob_base.output, + "%s set_peer: peer %s is listening on net %s port %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(peer), + (NULL == host) ? "NULL" : host, (NULL == ports) ? "NULL" : ports); + pmix_list_append(&pr->addrs, &maddr->super); + } + PMIX_ARGV_FREE_COMPAT(addrs); + free(tcpuri); + } +} + +static prte_oob_tcp_peer_t *get_peer(const pmix_proc_t *pr); + +static prte_oob_tcp_peer_t* process_uri(char *uri) +{ + pmix_proc_t peer; + char *cptr; + char **uris = NULL; + prte_oob_tcp_peer_t *pr; + + pmix_output_verbose(5, prte_oob_base.output, + "%s:set_addr processing uri %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), uri); + + /* find the first semi-colon in the string */ + cptr = strchr(uri, ';'); + if (NULL == cptr) { + /* got a problem - there must be at least two fields, + * the first containing the process name of our peer + * and all others containing the OOB contact info + */ + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + return NULL; + } + *cptr = '\0'; + cptr++; + /* the first field is the process name, so convert it */ + prte_util_convert_string_to_process_name(&peer, uri); + + /* if the peer is us, no need to go further as we already + * know our own contact info + */ + if (PMIX_CHECK_PROCID(&peer, PRTE_PROC_MY_NAME)) { + pmix_output_verbose(5, prte_oob_base.output, + "%s:set_addr peer %s is me", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&peer)); + return NULL; + } + + /* split the rest of the uri into component parts */ + uris = PMIX_ARGV_SPLIT_COMPAT(cptr, ';'); + + /* get the peer object for this process */ + pr = get_peer(&peer); + if (NULL == pr) { + pr = PMIX_NEW(prte_oob_tcp_peer_t); + PMIX_XFER_PROCID(&pr->name, &peer); + pmix_list_append(&prte_oob_base.peers, &pr->super); + } + + set_addr(&pr->name, uris); + PMIX_ARGV_FREE_COMPAT(uris); + return pr; +} + +static prte_oob_tcp_peer_t *get_peer(const pmix_proc_t *pr) +{ + prte_oob_tcp_peer_t *peer; + + PMIX_LIST_FOREACH(peer, &prte_oob_base.peers, prte_oob_tcp_peer_t) + { + 
if (PMIX_CHECK_PROCID(pr, &peer->name)) { + return peer; + } + } + return NULL; +} diff --git a/src/rml/oob/oob_tcp.c b/src/rml/oob/oob_tcp.c new file mode 100644 index 0000000000..0c01b47852 --- /dev/null +++ b/src/rml/oob/oob_tcp.c @@ -0,0 +1,813 @@ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "prte_config.h" +#include "types.h" + +#ifdef HAVE_UNISTD_H +# include +#endif +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#include +#ifdef HAVE_NET_IF_H +# include +#endif +#ifdef HAVE_NETINET_IN_H +# include +#endif +#ifdef HAVE_ARPA_INET_H +# include +#endif +#ifdef HAVE_NETDB_H +# include +#endif +#include + +#include "src/include/prte_socket_errno.h" +#include "src/runtime/prte_progress_threads.h" +#include "src/util/pmix_argv.h" +#include "src/util/error.h" +#include "src/util/pmix_if.h" +#include "src/util/pmix_net.h" +#include "src/util/pmix_output.h" +#include "src/util/pmix_show_help.h" + +#include "src/mca/errmgr/errmgr.h" +#include "src/mca/ess/ess.h" +#include "src/runtime/prte_globals.h" +#include "src/threads/pmix_threads.h" +#include "src/util/name_fns.h" +#include "src/util/pmix_parse_options.h" +#include "src/util/pmix_show_help.h" + +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_listener.h" +#include "src/rml/oob/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp_sendrecv.h" + +prte_oob_base_t prte_oob_base = { + .output = -1, + .addr_count = 0, + .num_links = 0, + .max_retries = 0, + .max_uri_length = -1, + .events = PMIX_LIST_STATIC_INIT, + .peer_limit = 0, + .peers = PMIX_LIST_STATIC_INIT, + + .tcp_sndbuf = 0, + .tcp_rcvbuf = 0, + + .disable_ipv4_family = false, + .tcp_static_ports = NULL, + .tcp_dyn_ports = NULL, + .ipv4conns = NULL, + .ipv4ports = NULL, + + .disable_ipv6_family = true, + .tcp6_static_ports = NULL, + .tcp6_dyn_ports = NULL, + .ipv6conns = NULL, + .ipv6ports = NULL, + + .local_ifs = PMIX_LIST_STATIC_INIT, + .if_masks = NULL, + .num_hnp_ports = 1, + .listeners = PMIX_LIST_STATIC_INIT, + .listen_thread_active = false, + .listen_thread_tv = {3600, 0}, + .stop_thread = {-1, -1}, + .keepalive_probes = 0, + .keepalive_time = 0, + 
.keepalive_intvl = 0, + .retry_delay = 0, + .max_recon_attempts = 0 +}; + +static char **split_and_resolve(char **orig_str, char *name); + +int prte_oob_open(void) +{ + pmix_pif_t *copied_interface, *selected_interface; + struct sockaddr_storage my_ss; + /* Larger than necessary, used for copying mask */ + char string[50], **interfaces = NULL; + int kindex; + int i, rc; + bool keeploopback = false; + bool including = false; + + pmix_output_verbose(5, prte_oob_base.output, + "oob:tcp: component_available called"); + + PMIX_CONSTRUCT(&prte_oob_base.listeners, pmix_list_t); + if (PRTE_PROC_IS_MASTER) { + PMIX_CONSTRUCT(&prte_oob_base.listen_thread, pmix_thread_t); + prte_oob_base.listen_thread_active = false; + prte_oob_base.listen_thread_tv.tv_sec = 3600; + prte_oob_base.listen_thread_tv.tv_usec = 0; + } + prte_oob_base.addr_count = 0; + prte_oob_base.ipv4conns = NULL; + prte_oob_base.ipv4ports = NULL; + prte_oob_base.ipv6conns = NULL; + prte_oob_base.ipv6ports = NULL; + prte_oob_base.if_masks = NULL; + + PMIX_CONSTRUCT(&prte_oob_base.local_ifs, pmix_list_t); + PMIX_CONSTRUCT(&prte_oob_base.peers, pmix_list_t); + + /* if interface include was given, construct a list + * of those interfaces which match the specifications - remember, + * the includes could be given as named interfaces, IP addrs, or + * subnet+mask + */ + if (NULL != prte_if_include) { + interfaces = split_and_resolve(&prte_if_include, + "include"); + including = true; + } else if (NULL != prte_if_exclude) { + interfaces = split_and_resolve(&prte_if_exclude, + "exclude"); + } + + /* if we are the master, then check the interfaces for loopbacks + * and keep loopbacks only if no non-loopback interface exists */ + if (PRTE_PROC_IS_MASTER) { + keeploopback = true; + PMIX_LIST_FOREACH(selected_interface, &pmix_if_list, pmix_pif_t) + { + if (!(selected_interface->if_flags & IFF_LOOPBACK)) { + keeploopback = false; + break; + } + } + } + + /* look at all available interfaces */ + 
PMIX_LIST_FOREACH(selected_interface, &pmix_if_list, pmix_pif_t) + { + if ((selected_interface->if_flags & IFF_LOOPBACK) && + !keeploopback) { + continue; + } + + + i = selected_interface->if_index; + kindex = selected_interface->if_kernel_index; + memcpy((struct sockaddr *) &my_ss, &selected_interface->if_addr, + MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr))); + + /* ignore non-ip4/6 interfaces */ + if (AF_INET != my_ss.ss_family +#if PRTE_ENABLE_IPV6 + && AF_INET6 != my_ss.ss_family +#endif + ) { + continue; + } + + /* ignore any virtual interfaces */ + if (0 == strncmp(selected_interface->if_name, "vir", 3)) { + continue; + } + + /* handle include/exclude directives */ + if (NULL != interfaces) { + /* check for match */ + rc = pmix_ifmatches(kindex, interfaces); + /* if one of the network specifications isn't parseable, then + * error out as we can't do what was requested + */ + if (PRTE_ERR_NETWORK_NOT_PARSEABLE == rc) { + pmix_show_help("help-oob-tcp.txt", "not-parseable", true); + PMIX_ARGV_FREE_COMPAT(interfaces); + return PRTE_ERR_BAD_PARAM; + } + /* if we are including, then ignore this if not present */ + if (including) { + if (PMIX_SUCCESS != rc) { + pmix_output_verbose(20, prte_oob_base.output, + "%s oob:tcp:init rejecting interface %s (not in include list)", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), selected_interface->if_name); + continue; + } + } else { + /* we are excluding, so ignore if present */ + if (PMIX_SUCCESS == rc) { + pmix_output_verbose(20, prte_oob_base.output, + "%s oob:tcp:init rejecting interface %s (in exclude list)", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), selected_interface->if_name); + continue; + } + } + } + + /* Refs ticket #3019 + * it would probably be worthwhile to print out a warning if PRRTE detects multiple + * IP interfaces that are "up" on the same subnet (because that's a Bad Idea). Note + * that we should only check for this after applying the relevant include/exclude + * list MCA params. 
If we detect redundant ports, we can also automatically ignore + * them so that applications won't hang. + */ + + /* add this address to our connections */ + if (AF_INET == my_ss.ss_family) { + pmix_output_verbose(10, prte_oob_base.output, + "%s oob:tcp:init adding %s to our list of %s connections", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + pmix_net_get_hostname((struct sockaddr *) &my_ss), + (AF_INET == my_ss.ss_family) ? "V4" : "V6"); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.ipv4conns, + pmix_net_get_hostname((struct sockaddr *) &my_ss)); + } else if (AF_INET6 == my_ss.ss_family) { +#if PRTE_ENABLE_IPV6 + pmix_output_verbose(10, prte_oob_base.output, + "%s oob:tcp:init adding %s to our list of %s connections", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + pmix_net_get_hostname((struct sockaddr *) &my_ss), + (AF_INET == my_ss.ss_family) ? "V4" : "V6"); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.ipv6conns, + pmix_net_get_hostname((struct sockaddr *) &my_ss)); +#endif // PRTE_ENABLE_IPV6 + } else { + pmix_output_verbose(10, prte_oob_base.output, + "%s oob:tcp:init ignoring %s from out list of connections", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + pmix_net_get_hostname((struct sockaddr *) &my_ss)); + continue; + } + copied_interface = PMIX_NEW(pmix_pif_t); + if (NULL == copied_interface) { + return PRTE_ERR_OUT_OF_RESOURCE; + } + pmix_string_copy(copied_interface->if_name, selected_interface->if_name, PMIX_IF_NAMESIZE); + copied_interface->if_index = i; + copied_interface->if_kernel_index = kindex; + copied_interface->af_family = my_ss.ss_family; + copied_interface->if_flags = selected_interface->if_flags; + copied_interface->if_speed = selected_interface->if_speed; + memcpy(&copied_interface->if_addr, &selected_interface->if_addr, + sizeof(struct sockaddr_storage)); + copied_interface->if_mask = selected_interface->if_mask; + /* If bandwidth is not found, set to arbitrary non zero value */ + copied_interface->if_bandwidth = selected_interface->if_bandwidth > 0 + 
? selected_interface->if_bandwidth + : 1; + memcpy(&copied_interface->if_mac, &selected_interface->if_mac, + sizeof(copied_interface->if_mac)); + copied_interface->ifmtu = selected_interface->ifmtu; + /* Add the if_mask to the list */ + sprintf(string, "%d", selected_interface->if_mask); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.if_masks, string); + pmix_list_append(&prte_oob_base.local_ifs, &(copied_interface->super)); + } + + if (0 == PMIX_ARGV_COUNT_COMPAT(prte_oob_base.ipv4conns) +#if PRTE_ENABLE_IPV6 + && 0 == PMIX_ARGV_COUNT_COMPAT(prte_oob_base.ipv6conns) +#endif + ) { + return PRTE_ERR_NOT_AVAILABLE; + } + + // start the listeners + if (PRTE_SUCCESS != (rc = prte_oob_tcp_start_listening())) { + PRTE_ERROR_LOG(rc); + } + return rc; +} + +void prte_oob_close(void) +{ + int i = 0, rc; + + if (PRTE_PROC_IS_MASTER && prte_oob_base.listen_thread_active) { + prte_oob_base.listen_thread_active = false; + /* tell the thread to exit */ + rc = write(prte_oob_base.stop_thread[1], &i, sizeof(int)); + if (0 < rc) { + pmix_thread_join(&prte_oob_base.listen_thread, NULL); + } + + close(prte_oob_base.stop_thread[0]); + close(prte_oob_base.stop_thread[1]); + + } + + PMIX_LIST_DESTRUCT(&prte_oob_base.local_ifs); + PMIX_LIST_DESTRUCT(&prte_oob_base.peers); + + if (NULL != prte_oob_base.ipv4conns) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.ipv4conns); + } + if (NULL != prte_oob_base.ipv4ports) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.ipv4ports); + } + +#if PRTE_ENABLE_IPV6 + if (NULL != prte_oob_base.ipv6conns) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.ipv6conns); + } + if (NULL != prte_oob_base.ipv6ports) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.ipv6ports); + } +#endif + if (NULL != prte_oob_base.if_masks) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.if_masks); + } + + if (0 <= prte_oob_base.output) { + pmix_output_close(prte_oob_base.output); + } +} + +static char *static_port_string; +#if PRTE_ENABLE_IPV6 +static char *static_port_string6; +#endif // PRTE_ENABLE_IPV6 + +static 
char *dyn_port_string; +#if PRTE_ENABLE_IPV6 +static char *dyn_port_string6; +#endif + +int prte_oob_register(void) +{ + prte_oob_base.peer_limit = -1; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "peer_limit", + "Maximum number of peer connections to simultaneously maintain (-1 = infinite)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.peer_limit); + + prte_oob_base.max_retries = 2; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "peer_retries", + "Number of times to try shutting down a connection before giving up", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.max_retries); + + prte_oob_base.tcp_sndbuf = 0; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "sndbuf", + "TCP socket send buffering size (in bytes, 0 => leave system default)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.tcp_sndbuf); + + prte_oob_base.tcp_rcvbuf = 0; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "rcvbuf", + "TCP socket receive buffering size (in bytes, 0 => leave system default)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.tcp_rcvbuf); + + + static_port_string = NULL; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "static_ipv4_ports", + "Static ports for daemons and procs (IPv4)", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &static_port_string); + + /* if ports were provided, parse the provided range */ + if (NULL != static_port_string) { + pmix_util_parse_range_options(static_port_string, &prte_oob_base.tcp_static_ports); + if (0 == strcmp(prte_oob_base.tcp_static_ports[0], "-1")) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.tcp_static_ports); + prte_oob_base.tcp_static_ports = NULL; + } + } else { + prte_oob_base.tcp_static_ports = NULL; + } + +#if PRTE_ENABLE_IPV6 + static_port_string6 = NULL; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "static_ipv6_ports", + "Static ports for daemons and procs (IPv6)", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &static_port_string6); + + /* if ports were provided, parse the provided range */ + 
if (NULL != static_port_string6) { + pmix_util_parse_range_options(static_port_string6, + &prte_oob_base.tcp6_static_ports); + if (0 == strcmp(prte_oob_base.tcp6_static_ports[0], "-1")) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.tcp6_static_ports); + prte_oob_base.tcp6_static_ports = NULL; + } + } else { + prte_oob_base.tcp6_static_ports = NULL; + } +#endif // PRTE_ENABLE_IPV6 + + if (NULL != prte_oob_base.tcp_static_ports + || NULL != prte_oob_base.tcp6_static_ports) { + prte_static_ports = true; + } + + dyn_port_string = NULL; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "dynamic_ipv4_ports", + "Range of ports to be dynamically used by daemons and procs (IPv4)", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &dyn_port_string); + /* if ports were provided, parse the provided range */ + if (NULL != dyn_port_string) { + /* can't have both static and dynamic ports! */ + if (prte_static_ports) { + char *err = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.tcp_static_ports, ','); + pmix_show_help("help-oob-tcp.txt", "static-and-dynamic", true, err, dyn_port_string); + free(err); + return PRTE_ERROR; + } + pmix_util_parse_range_options(dyn_port_string, &prte_oob_base.tcp_dyn_ports); + if (0 == strcmp(prte_oob_base.tcp_dyn_ports[0], "-1")) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.tcp_dyn_ports); + prte_oob_base.tcp_dyn_ports = NULL; + } + } else { + prte_oob_base.tcp_dyn_ports = NULL; + } + +#if PRTE_ENABLE_IPV6 + dyn_port_string6 = NULL; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "dynamic_ipv6_ports", + "Range of ports to be dynamically used by daemons and procs (IPv6)", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &dyn_port_string6); + /* if ports were provided, parse the provided range */ + if (NULL != dyn_port_string6) { + /* can't have both static and dynamic ports! 
*/ + if (prte_static_ports) { + char *err4 = NULL, *err6 = NULL; + if (NULL != prte_oob_base.tcp_static_ports) { + err4 = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.tcp_static_ports, ','); + } + if (NULL != prte_oob_base.tcp6_static_ports) { + err6 = PMIX_ARGV_JOIN_COMPAT(prte_oob_base.tcp6_static_ports, ','); + } + pmix_show_help("help-oob-tcp.txt", "static-and-dynamic-ipv6", true, + (NULL == err4) ? "N/A" : err4, (NULL == err6) ? "N/A" : err6, + dyn_port_string6); + if (NULL != err4) { + free(err4); + } + if (NULL != err6) { + free(err6); + } + return PRTE_ERROR; + } + pmix_util_parse_range_options(dyn_port_string6, &prte_oob_base.tcp6_dyn_ports); + if (0 == strcmp(prte_oob_base.tcp6_dyn_ports[0], "-1")) { + PMIX_ARGV_FREE_COMPAT(prte_oob_base.tcp6_dyn_ports); + prte_oob_base.tcp6_dyn_ports = NULL; + } + } else { + prte_oob_base.tcp6_dyn_ports = NULL; + } +#endif // PRTE_ENABLE_IPV6 + + prte_oob_base.disable_ipv4_family = false; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "disable_ipv4_family", + "Disable the IPv4 interfaces", + PMIX_MCA_BASE_VAR_TYPE_BOOL, + &prte_oob_base.disable_ipv4_family); + +#if PRTE_ENABLE_IPV6 + prte_oob_base.disable_ipv6_family = false; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "disable_ipv6_family", + "Disable the IPv6 interfaces", + PMIX_MCA_BASE_VAR_TYPE_BOOL, + &prte_oob_base.disable_ipv6_family); +#endif // PRTE_ENABLE_IPV6 + + // Wait for this amount of time before sending the first keepalive probe + prte_oob_base.keepalive_time = 300; + (void)pmix_mca_base_var_register("prte", "prte", NULL, "keepalive_time", + "Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables " + "keepalive functionality)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.keepalive_time); + + // Resend keepalive probe every INT seconds + prte_oob_base.keepalive_intvl = 20; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "keepalive_intvl", + "Time between successive keepalive pings when peer has 
not responded, in seconds (ignored " + "if keepalive_time <= 0)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.keepalive_intvl); + + // After sending PR probes every INT seconds consider the connection dead + prte_oob_base.keepalive_probes = 9; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "keepalive_probes", + "Number of keepalives that can be missed before " + "declaring error (ignored if keepalive_time <= 0)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.keepalive_probes); + + prte_oob_base.retry_delay = 0; + (void) pmix_mca_base_var_register("prte","prte", NULL, "retry_delay", + "Time (in sec) to wait before trying to connect to peer again", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.retry_delay); + + prte_oob_base.max_recon_attempts = 10; + (void) pmix_mca_base_var_register("prte", "prte", NULL, "max_recon_attempts", + "Max number of times to attempt connection before giving up (-1 -> never give up)", + PMIX_MCA_BASE_VAR_TYPE_INT, + &prte_oob_base.max_recon_attempts); + return PRTE_SUCCESS; +} + +/* + * Local utility functions + */ +static void recv_handler(int sd, short flags, void *user); + +/* Called by prte_oob_tcp_accept() and connection_handler() on + * a socket that has been accepted. This call finishes processing the + * socket, including setting socket options and registering for the + * OOB-level connection handshake. Used in both the threaded and + * event listen modes. 
+ */
+void prte_oob_accept_connection(const int accepted_fd, const struct sockaddr *addr)
+{
+    /* trace where the freshly accepted socket came from */
+    pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output,
+                        "%s accept_connection: %s:%d\n", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                        pmix_net_get_hostname(addr), pmix_net_get_port(addr));
+
+    /* apply our socket options to the new descriptor */
+    prte_oob_tcp_set_socket_options(accepted_fd);
+
+    /* arm a one-time event: recv_handler waits for the peer's
+     * process ident message, which completes this connection
+     */
+    PRTE_ACTIVATE_TCP_ACCEPT_STATE(accepted_fd, addr, recv_handler);
+}
+
+/* API functions */
+
+/* Ensure a connection to "proc" is either established or underway.
+ * An unknown peer is reported through the hop_unknown error handler;
+ * a peer that is connected or in the middle of connecting is left
+ * alone; any other peer is marked CONNECTING and a connection
+ * attempt is activated.
+ */
+void prte_oob_ping(const pmix_proc_t *proc)
+{
+    prte_oob_tcp_peer_t *target;
+
+    pmix_output_verbose(2, prte_oob_base.output,
+                        "%s:[%s:%d] processing ping to peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                        __FILE__, __LINE__, PRTE_NAME_PRINT(proc));
+
+    /* do we know this peer? */
+    target = prte_oob_tcp_peer_lookup(proc);
+    if (NULL == target) {
+        /* no record of this peer - hand the problem to the
+         * hop_unknown error handler and stop here
+         */
+        pmix_output_verbose(2, prte_oob_base.output,
+                            "%s:[%s:%d] hop %s unknown", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                            __FILE__, __LINE__, PRTE_NAME_PRINT(proc));
+        PRTE_ACTIVATE_TCP_MSG_ERROR(NULL, NULL, proc, prte_mca_oob_tcp_component_hop_unknown);
+        return;
+    }
+
+    switch (target->state) {
+    case MCA_OOB_TCP_CONNECTED:
+        /* already connected - nothing to do */
+        pmix_output_verbose(2, prte_oob_base.output,
+                            "%s:[%s:%d] already connected to peer %s",
+                            PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__,
+                            PRTE_NAME_PRINT(proc));
+        return;
+
+    case MCA_OOB_TCP_CONNECTING:
+    case MCA_OOB_TCP_CONNECT_ACK:
+        /* a connection attempt is already in flight - nothing to do */
+        pmix_output_verbose(2, prte_oob_base.output,
+                            "%s:[%s:%d] already connecting to peer %s",
+                            PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), __FILE__, __LINE__,
+                            PRTE_NAME_PRINT(proc));
+        return;
+
+    default:
+        break;
+    }
+
+    /* attempt the connection */
+    target->state = MCA_OOB_TCP_CONNECTING;
+    PRTE_ACTIVATE_TCP_CONN_STATE(target, prte_oob_tcp_peer_try_connect);
+}
+
+/*
+ * Event callback when there is data available on the registered
+ * socket to recv. This is called for the listen sockets to accept an
+ * incoming connection, on new sockets trying to complete the software
+ * connection process, and for probes. Data on an established
+ * connection is handled elsewhere.
+ */
+static void recv_handler(int sd, short flg, void *cbdata)
+{
+    prte_oob_tcp_conn_op_t *op = (prte_oob_tcp_conn_op_t *) cbdata;
+    int flags;
+    prte_oob_tcp_hdr_t hdr;
+    prte_oob_tcp_peer_t *peer;
+    PRTE_HIDE_UNUSED_PARAMS(flg);
+
+    PMIX_ACQUIRE_OBJECT(op);
+
+    pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output,
+                        "%s:tcp:recv:handler called", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
+
+    /* get the handshake - the sender is not identified yet, so no
+     * peer object is passed in
+     */
+    if (PRTE_SUCCESS != prte_oob_tcp_peer_recv_connect_ack(NULL, sd, &hdr)) {
+        goto cleanup;
+    }
+
+    /* finish processing ident */
+    if (MCA_OOB_TCP_IDENT == hdr.type) {
+        if (NULL == (peer = prte_oob_tcp_peer_lookup(&hdr.origin))) {
+            /* should never happen - but if it does, peer is NULL, so
+             * there is no peer object to close. Drop the accepted
+             * socket itself instead of handing NULL to
+             * prte_oob_tcp_peer_close() and leaving sd open
+             */
+            CLOSE_THE_SOCKET(sd);
+            goto cleanup;
+        }
+        /* set socket up to be non-blocking */
+        if ((flags = fcntl(sd, F_GETFL, 0)) < 0) {
+            pmix_output(0, "%s prte_oob_tcp_recv_connect: fcntl(F_GETFL) failed: %s (%d)",
+                        PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), strerror(prte_socket_errno),
+                        prte_socket_errno);
+        } else {
+            flags |= O_NONBLOCK;
+            if (fcntl(sd, F_SETFL, flags) < 0) {
+                pmix_output(0, "%s prte_oob_tcp_recv_connect: fcntl(F_SETFL) failed: %s (%d)",
+                            PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), strerror(prte_socket_errno),
+                            prte_socket_errno);
+            }
+        }
+        /* is the peer instance willing to accept this connection */
+        peer->sd = sd;
+        if (prte_oob_tcp_peer_accept(peer) == false) {
+            if (OOB_TCP_DEBUG_CONNECT
+                <= pmix_output_get_verbosity(prte_oob_base.output)) {
+                pmix_output(0,
+                            "%s-%s prte_oob_tcp_recv_connect: "
+                            "rejected connection from %s connection state %d",
+                            PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)),
+                            PRTE_NAME_PRINT(&(hdr.origin)), peer->state);
+            }
+            /* the peer refused the connection - drop the socket */
+            CLOSE_THE_SOCKET(sd);
+        }
+    }
+
+cleanup:
+    /* the connection op was only needed to carry this callback */
+    PMIX_RELEASE(op);
+}
+
+/*
+ * Go through a list of argv; if there are any subnet specifications
+ * (a.b.c.d/e), resolve them to an interface name (Currently only
+ * supporting IPv4). If unresolvable, warn and remove.
+ */
+static char **split_and_resolve(char **orig_str, char *name)
+{
+    pmix_pif_t *selected_interface;
+    int i, n, ret, match_count, interface_count;
+    char **argv, **interfaces, *str, *tmp;
+    char if_name[IF_NAMESIZE];
+    struct sockaddr_storage argv_inaddr, if_inaddr;
+    uint32_t argv_prefix;
+
+    /* Sanity check */
+    if (NULL == orig_str || NULL == *orig_str) {
+        return NULL;
+    }
+
+    argv = PMIX_ARGV_SPLIT_COMPAT(*orig_str, ',');
+    if (NULL == argv) {
+        return NULL;
+    }
+    interface_count = 0;
+    interfaces = NULL;
+    for (i = 0; NULL != argv[i]; ++i) {
+        /* cast: passing a negative plain char to isalpha() is undefined */
+        if (isalpha((unsigned char) argv[i][0])) {
+            /* This is an interface name. If not already in the interfaces array, add it */
+            for (n = 0; n < interface_count; n++) {
+                if (0 == strcmp(argv[i], interfaces[n])) {
+                    break;
+                }
+            }
+            if (n == interface_count) {
+                pmix_output_verbose(20,
+                                    prte_oob_base.output,
+                                    "oob:tcp: Using interface: %s ", argv[i]);
+                PMIX_ARGV_APPEND_NOSIZE_COMPAT(&interfaces, argv[i]);
+                ++interface_count;
+            }
+            /* the interfaces array holds its own copy of the name (the
+             * same append is used below on the stack buffer if_name),
+             * and only the argv array itself is freed at the end - so
+             * release this entry here, as every subnet path does
+             */
+            free(argv[i]);
+            continue;
+        }
+
+        /* Found a subnet notation. Convert it to an IP
+           address/netmask. Get the prefix first. */
+        argv_prefix = 0;
+        /* keep an unmodified copy for error messages - argv[i] is
+         * cut at the '/' below
+         */
+        tmp = strdup(argv[i]);
+        str = strchr(argv[i], '/');
+        if (NULL == str) {
+            pmix_show_help("help-oob-tcp.txt", "invalid if_inexclude",
+                           true, name, prte_process_info.nodename,
+                           tmp, "Invalid specification (missing \"/\")");
+            free(argv[i]);
+            free(tmp);
+            continue;
+        }
+        *str = '\0';
+        argv_prefix = atoi(str + 1);
+
+        /* Now convert the IPv4 address */
+        ((struct sockaddr*) &argv_inaddr)->sa_family = AF_INET;
+        ret = inet_pton(AF_INET, argv[i],
+                        &((struct sockaddr_in*) &argv_inaddr)->sin_addr);
+        free(argv[i]);
+
+        if (1 != ret) {
+            pmix_show_help("help-oob-tcp.txt", "invalid if_inexclude",
+                           true, name, prte_process_info.nodename, tmp,
+                           "Invalid specification (inet_pton() failed)");
+            free(tmp);
+            continue;
+        }
+        pmix_output_verbose(20, prte_oob_base.output,
+                            "%s oob:tcp: Searching for %s address+prefix: %s / %u",
+                            PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
+                            name,
+                            pmix_net_get_hostname((struct sockaddr*) &argv_inaddr),
+                            argv_prefix);
+
+        /* Go through all interfaces and see if we can find a match */
+        match_count = 0;
+        PMIX_LIST_FOREACH(selected_interface, &pmix_if_list, pmix_pif_t) {
+            pmix_ifindextoaddr(selected_interface->if_kernel_index,
+                               (struct sockaddr*) &if_inaddr,
+                               sizeof(if_inaddr));
+            if (pmix_net_samenetwork((struct sockaddr_storage*) &argv_inaddr,
+                                     (struct sockaddr_storage*) &if_inaddr,
+                                     argv_prefix)) {
+                /* We found a match. If it's not already in the interfaces array,
+                   add it. If it's already in the array, treat it as a match */
+                match_count = match_count + 1;
+                pmix_ifindextoname(selected_interface->if_kernel_index, if_name, sizeof(if_name));
+                for (n = 0; n < interface_count; n++) {
+                    if (0 == strcmp(if_name, interfaces[n])) {
+                        break;
+                    }
+                }
+                if (n == interface_count) {
+                    pmix_output_verbose(20,
+                                        prte_oob_base.output,
+                                        "oob:tcp: Found match: %s (%s)",
+                                        pmix_net_get_hostname((struct sockaddr*) &if_inaddr),
+                                        if_name);
+                    PMIX_ARGV_APPEND_NOSIZE_COMPAT(&interfaces, if_name);
+                    ++interface_count;
+                }
+            }
+        }
+        /* If we didn't find a match, keep trying */
+        if (0 == match_count) {
+            pmix_show_help("help-oob-tcp.txt", "invalid if_inexclude",
+                           true, name, prte_process_info.nodename, tmp,
+                           "Did not find interface matching this subnet");
+            free(tmp);
+            continue;
+        }
+
+        free(tmp);
+    }
+
+    /* Mark the end of the interface name array with NULL */
+    if (NULL != interfaces) {
+        interfaces[interface_count] = NULL;
+    }
+    /* every entry was released inside the loop; drop the array itself */
+    free(argv);
+    /* hand the caller back the resolved, de-duplicated list as a string */
+    free(*orig_str);
+    *orig_str = PMIX_ARGV_JOIN_COMPAT(interfaces, ',');
+    return interfaces;
+}
+
+PMIX_CLASS_INSTANCE(prte_oob_send_t,
+                    pmix_object_t,
+                    NULL, NULL);
diff --git a/src/mca/oob/tcp/oob_tcp.h b/src/rml/oob/oob_tcp.h
similarity index 64%
rename from src/mca/oob/tcp/oob_tcp.h
rename to src/rml/oob/oob_tcp.h
index 41bfaba28f..e23586dbd1 100644
--- a/src/mca/oob/tcp/oob_tcp.h
+++ b/src/rml/oob/oob_tcp.h
@@ -15,7 +15,7 @@
  * Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
  * Copyright (c) 2019 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
- * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
+ * Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -33,8 +33,7 @@ #include "src/event/event-internal.h" #include "src/mca/base/pmix_base.h" -#include "src/mca/oob/base/base.h" -#include "src/mca/oob/oob.h" +#include "src/rml/oob/oob.h" BEGIN_C_DECLS @@ -42,10 +41,6 @@ BEGIN_C_DECLS #define OOB_TCP_DEBUG_FAIL 2 #define OOB_TCP_DEBUG_CONNECT 7 -/* forward declare a couple of structures */ -struct prte_oob_tcp_module_t; -struct prte_oob_tcp_msg_error_t; - /* define a struct for tracking NIC addresses */ typedef struct { pmix_list_item_t super; @@ -54,19 +49,6 @@ typedef struct { } prte_oob_tcp_nicaddr_t; PMIX_CLASS_DECLARATION(prte_oob_tcp_nicaddr_t); -/* Module definition */ -typedef void (*prte_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd, - const struct sockaddr *addr); -typedef void (*prte_oob_tcp_module_ping_fn_t)(const pmix_proc_t *proc); -typedef void (*prte_oob_tcp_module_send_nb_fn_t)(prte_rml_send_t *msg); - -typedef struct { - prte_oob_tcp_module_accept_connection_fn_t accept_connection; - prte_oob_tcp_module_ping_fn_t ping; - prte_oob_tcp_module_send_nb_fn_t send_nb; -} prte_oob_tcp_module_t; -PRTE_MODULE_EXPORT extern prte_oob_tcp_module_t prte_oob_tcp_module; - /** * the state of the connection */ @@ -82,10 +64,15 @@ typedef enum { } prte_oob_tcp_state_t; /* module-level shared functions */ -PRTE_MODULE_EXPORT void prte_oob_tcp_send_handler(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_oob_tcp_recv_handler(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_oob_tcp_queue_msg(int sd, short args, void *cbdata); - +PRTE_EXPORT void prte_oob_tcp_send_handler(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_oob_tcp_recv_handler(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_oob_tcp_queue_msg(int sd, short args, void *cbdata); +PRTE_EXPORT void prte_oob_accept_connection(const int accepted_fd, const struct sockaddr *addr); +PRTE_EXPORT void prte_mca_oob_tcp_component_lost_connection(int fd, 
short args, void *cbdata); +PRTE_EXPORT void prte_mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_mca_oob_tcp_component_no_route(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_oob_ping(const pmix_proc_t *proc); END_C_DECLS #endif /* MCA_OOB_TCP_H_ */ diff --git a/src/mca/oob/tcp/oob_tcp_common.c b/src/rml/oob/oob_tcp_common.c similarity index 77% rename from src/mca/oob/tcp/oob_tcp_common.c rename to src/rml/oob/oob_tcp_common.c index 9671ee254f..928dff0056 100644 --- a/src/mca/oob/tcp/oob_tcp_common.c +++ b/src/rml/oob/oob_tcp_common.c @@ -16,7 +16,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -63,10 +63,9 @@ #include "src/util/pmix_net.h" #include "src/util/pmix_output.h" -#include "oob_tcp_common.h" -#include "oob_tcp_peer.h" -#include "src/mca/oob/tcp/oob_tcp.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" /** * Set socket buffering @@ -87,27 +86,27 @@ static void set_keepalive(int sd) /* Set the option active */ option = 1; if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, optlen) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); return; } # if defined(TCP_KEEPALIVE) /* set the idle time */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE, &prte_mca_oob_tcp_component.keepalive_time, - sizeof(prte_mca_oob_tcp_component.keepalive_time)) + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE, &prte_oob_base.keepalive_time, + sizeof(prte_oob_base.keepalive_time)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); return; } # elif defined(TCP_KEEPIDLE) /* set the idle time */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, &prte_mca_oob_tcp_component.keepalive_time, - sizeof(prte_mca_oob_tcp_component.keepalive_time)) + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, &prte_oob_base.keepalive_time, + sizeof(prte_oob_base.keepalive_time)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); return; @@ -115,10 +114,10 @@ static void 
set_keepalive(int sd) # endif // TCP_KEEPIDLE # if defined(TCP_KEEPINTVL) /* set the keepalive interval */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, &prte_mca_oob_tcp_component.keepalive_intvl, - sizeof(prte_mca_oob_tcp_component.keepalive_intvl)) + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, &prte_oob_base.keepalive_intvl, + sizeof(prte_oob_base.keepalive_intvl)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); return; @@ -126,10 +125,10 @@ static void set_keepalive(int sd) # endif // TCP_KEEPINTVL # if defined(TCP_KEEPCNT) /* set the miss rate */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, &prte_mca_oob_tcp_component.keepalive_probes, - sizeof(prte_mca_oob_tcp_component.keepalive_probes)) + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, &prte_oob_base.keepalive_probes, + sizeof(prte_oob_base.keepalive_probes)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); } @@ -144,33 +143,33 @@ void prte_oob_tcp_set_socket_options(int sd) optval = 1; if (setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *) &optval, sizeof(optval)) < 0) { prte_backtrace_print(stderr, NULL, 1); - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(TCP_NODELAY) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); } #endif #if defined(SO_SNDBUF) - if (prte_mca_oob_tcp_component.tcp_sndbuf > 0 - && setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *) &prte_mca_oob_tcp_component.tcp_sndbuf, + if (prte_oob_base.tcp_sndbuf > 0 + && setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *) &prte_oob_base.tcp_sndbuf, 
sizeof(int)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(SO_SNDBUF) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); } #endif #if defined(SO_RCVBUF) - if (prte_mca_oob_tcp_component.tcp_rcvbuf > 0 - && setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *) &prte_mca_oob_tcp_component.tcp_rcvbuf, + if (prte_oob_base.tcp_rcvbuf > 0 + && setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *) &prte_oob_base.tcp_rcvbuf, sizeof(int)) < 0) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "[%s:%d] setsockopt(SO_RCVBUF) failed: %s (%d)", __FILE__, __LINE__, strerror(prte_socket_errno), prte_socket_errno); } #endif - if (0 < prte_mca_oob_tcp_component.keepalive_time) { + if (0 < prte_oob_base.keepalive_time) { set_keepalive(sd); } } @@ -179,7 +178,7 @@ prte_oob_tcp_peer_t *prte_oob_tcp_peer_lookup(const pmix_proc_t *name) { prte_oob_tcp_peer_t *peer; - PMIX_LIST_FOREACH(peer, &prte_mca_oob_tcp_component.peers, prte_oob_tcp_peer_t) + PMIX_LIST_FOREACH(peer, &prte_oob_base.peers, prte_oob_tcp_peer_t) { if (PMIX_CHECK_PROCID(name, &peer->name)) { return peer; diff --git a/src/mca/oob/tcp/oob_tcp_common.h b/src/rml/oob/oob_tcp_common.h similarity index 77% rename from src/mca/oob/tcp/oob_tcp_common.h rename to src/rml/oob/oob_tcp_common.h index 4e2bfe5043..26e1408208 100644 --- a/src/mca/oob/tcp/oob_tcp_common.h +++ b/src/rml/oob/oob_tcp_common.h @@ -15,7 +15,7 @@ * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,10 +28,10 @@ #include "prte_config.h" -#include "oob_tcp.h" -#include "oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_peer.h" -PRTE_MODULE_EXPORT void prte_oob_tcp_set_socket_options(int sd); -PRTE_MODULE_EXPORT char *prte_oob_tcp_state_print(prte_oob_tcp_state_t state); -PRTE_MODULE_EXPORT prte_oob_tcp_peer_t *prte_oob_tcp_peer_lookup(const pmix_proc_t *name); +PRTE_EXPORT void prte_oob_tcp_set_socket_options(int sd); +PRTE_EXPORT char *prte_oob_tcp_state_print(prte_oob_tcp_state_t state); +PRTE_EXPORT prte_oob_tcp_peer_t *prte_oob_tcp_peer_lookup(const pmix_proc_t *name); #endif /* _MCA_OOB_TCP_COMMON_H_ */ diff --git a/src/rml/oob/oob_tcp_component.c b/src/rml/oob/oob_tcp_component.c new file mode 100644 index 0000000000..734b3eb1c3 --- /dev/null +++ b/src/rml/oob/oob_tcp_component.c @@ -0,0 +1,266 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2017 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. + * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015-2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. 
All rights reserved. + * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights + * reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * In windows, many of the socket functions return an EWOULDBLOCK + * instead of things like EAGAIN, EINPROGRESS, etc. It has been + * verified that this will not conflict with other error codes that + * are returned by these functions under UNIX/Linux environments + */ + +#include "prte_config.h" +#include "types.h" + +#ifdef HAVE_UNISTD_H +# include +#endif +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#include +#ifdef HAVE_NET_IF_H +# include +#endif +#ifdef HAVE_NETINET_IN_H +# include +#endif +#ifdef HAVE_ARPA_INET_H +# include +#endif +#ifdef HAVE_NETDB_H +# include +#endif +#include +#include + +#ifndef MIN +# define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#include "src/class/pmix_list.h" +#include "src/event/event-internal.h" +#include "src/include/prte_socket_errno.h" +#include "src/runtime/prte_progress_threads.h" +#include "src/util/pmix_argv.h" +#include "src/util/pmix_if.h" +#include "src/util/error.h" +#include "src/util/pmix_net.h" +#include "src/util/pmix_output.h" +#include "src/util/pmix_show_help.h" + +#include "src/mca/errmgr/errmgr.h" +#include "src/mca/ess/ess.h" +#include "src/rml/rml.h" +#include "src/mca/state/state.h" +#include "src/runtime/prte_globals.h" +#include "src/runtime/prte_wait.h" +#include "src/threads/pmix_threads.h" +#include "src/util/attr.h" +#include "src/util/name_fns.h" +#include "src/util/pmix_parse_options.h" +#include "src/util/pmix_show_help.h" + +#include "src/rml/oob/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_listener.h" +#include "src/rml/oob/oob_tcp_peer.h" + +void prte_mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) 
+{ + prte_oob_tcp_peer_op_t *pop = (prte_oob_tcp_peer_op_t *) cbdata; + PRTE_HIDE_UNUSED_PARAMS(fd, args); + + PMIX_ACQUIRE_OBJECT(pop); + + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, + "%s tcp:lost connection called for peer %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); + + if (!prte_finalizing) { + /* activate the proc state */ + if (PRTE_SUCCESS != prte_rml_route_lost(pop->peer.rank)) { + PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_LIFELINE_LOST); + } else { + PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_COMM_FAILED); + } + } + PMIX_RELEASE(pop); +} + +void prte_mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) +{ + prte_oob_tcp_msg_error_t *mop = (prte_oob_tcp_msg_error_t *) cbdata; + PRTE_HIDE_UNUSED_PARAMS(fd, args); + + PMIX_ACQUIRE_OBJECT(mop); + + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, + "%s tcp:no route called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&mop->hop)); + + if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { + /* just ignore the problem */ + PMIX_RELEASE(mop); + return; + } + + /* report the error */ + PRTE_ACTIVATE_PROC_STATE(&mop->hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); + + PMIX_RELEASE(mop); +} + +void prte_mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) +{ + prte_oob_tcp_msg_error_t *mop = (prte_oob_tcp_msg_error_t *) cbdata; + PRTE_HIDE_UNUSED_PARAMS(fd, args); + + PMIX_ACQUIRE_OBJECT(mop); + + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, + "%s tcp:unknown hop called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + PRTE_NAME_PRINT(&mop->hop)); + + if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { + /* just ignore the problem */ + PMIX_RELEASE(mop); + return; + } + + /* post the error */ + PRTE_ACTIVATE_PROC_STATE(&mop->hop, PRTE_PROC_STATE_UNABLE_TO_SEND_MSG); + + PMIX_RELEASE(mop); +} + +void 
prte_mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) +{ + prte_oob_tcp_peer_op_t *pop = (prte_oob_tcp_peer_op_t *) cbdata; + PRTE_HIDE_UNUSED_PARAMS(fd, args); + + PMIX_ACQUIRE_OBJECT(pop); + + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, + "%s tcp:failed_to_connect called for peer %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); + + /* if we are terminating, then don't attempt to reconnect */ + if (prte_prteds_term_ordered || prte_finalizing || prte_abnormal_term_ordered) { + PMIX_RELEASE(pop); + return; + } + + /* activate the proc state */ + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, + "%s tcp:failed_to_connect unable to reach peer %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pop->peer)); + + PRTE_ACTIVATE_PROC_STATE(&pop->peer, PRTE_PROC_STATE_FAILED_TO_CONNECT); + PMIX_RELEASE(pop); +} + + +/* OOB TCP Class instances */ + +static void peer_cons(prte_oob_tcp_peer_t *peer) +{ + peer->auth_method = NULL; + peer->sd = -1; + PMIX_CONSTRUCT(&peer->addrs, pmix_list_t); + peer->active_addr = NULL; + peer->state = MCA_OOB_TCP_UNCONNECTED; + peer->num_retries = 0; + PMIX_CONSTRUCT(&peer->send_queue, pmix_list_t); + peer->send_msg = NULL; + peer->recv_msg = NULL; + peer->send_ev_active = false; + peer->recv_ev_active = false; + peer->timer_ev_active = false; +} +static void peer_des(prte_oob_tcp_peer_t *peer) +{ + if (NULL != peer->auth_method) { + free(peer->auth_method); + } + if (peer->send_ev_active) { + prte_event_del(&peer->send_event); + } + if (peer->recv_ev_active) { + prte_event_del(&peer->recv_event); + } + if (peer->timer_ev_active) { + prte_event_del(&peer->timer_event); + } + if (0 <= peer->sd) { + pmix_output_verbose(2, prte_oob_base.output, + "%s CLOSING SOCKET %d", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), peer->sd); + CLOSE_THE_SOCKET(peer->sd); + } + PMIX_LIST_DESTRUCT(&peer->addrs); + PMIX_LIST_DESTRUCT(&peer->send_queue); +} 
+PMIX_CLASS_INSTANCE(prte_oob_tcp_peer_t, pmix_list_item_t, peer_cons, peer_des); + +static void padd_cons(prte_oob_tcp_addr_t *ptr) +{ + memset(&ptr->addr, 0, sizeof(ptr->addr)); + ptr->retries = 0; + ptr->state = MCA_OOB_TCP_UNCONNECTED; +} +PMIX_CLASS_INSTANCE(prte_oob_tcp_addr_t, pmix_list_item_t, padd_cons, NULL); + +static void pop_cons(prte_oob_tcp_peer_op_t *pop) +{ + pop->net = NULL; + pop->port = NULL; +} +static void pop_des(prte_oob_tcp_peer_op_t *pop) +{ + if (NULL != pop->net) { + free(pop->net); + } + if (NULL != pop->port) { + free(pop->port); + } +} +PMIX_CLASS_INSTANCE(prte_oob_tcp_peer_op_t, pmix_object_t, pop_cons, pop_des); + +PMIX_CLASS_INSTANCE(prte_oob_tcp_msg_op_t, pmix_object_t, NULL, NULL); + +PMIX_CLASS_INSTANCE(prte_oob_tcp_conn_op_t, pmix_object_t, NULL, NULL); + +static void nicaddr_cons(prte_oob_tcp_nicaddr_t *ptr) +{ + ptr->af_family = PF_UNSPEC; + memset(&ptr->addr, 0, sizeof(ptr->addr)); +} +PMIX_CLASS_INSTANCE(prte_oob_tcp_nicaddr_t, pmix_list_item_t, nicaddr_cons, NULL); diff --git a/src/mca/oob/tcp/oob_tcp_connection.c b/src/rml/oob/oob_tcp_connection.c similarity index 92% rename from src/mca/oob/tcp/oob_tcp_connection.c rename to src/rml/oob/oob_tcp_connection.c index d77bf2de2e..4218f26431 100644 --- a/src/mca/oob/tcp/oob_tcp_connection.c +++ b/src/rml/oob/oob_tcp_connection.c @@ -19,7 +19,7 @@ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights * reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -76,14 +76,10 @@ #include "src/util/name_fns.h" #include "src/util/pmix_show_help.h" -#include "oob_tcp.h" -#include "oob_tcp_common.h" -#include "oob_tcp_connection.h" -#include "oob_tcp_peer.h" -#include "src/mca/oob/tcp/oob_tcp_common.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" -#include "src/mca/oob/tcp/oob_tcp_connection.h" -#include "src/mca/oob/tcp/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_peer.h" static void tcp_peer_event_init(prte_oob_tcp_peer_t *peer); static int tcp_peer_send_connect_ack(prte_oob_tcp_peer_t *peer); @@ -100,7 +96,7 @@ static int tcp_peer_create_socket(prte_oob_tcp_peer_t *peer, sa_family_t family) return PRTE_SUCCESS; } - PMIX_OUTPUT_VERBOSE((1, prte_oob_base_framework.framework_output, + PMIX_OUTPUT_VERBOSE((1, prte_oob_base.output, "%s oob:tcp:peer creating socket to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)))); peer->sd = socket(family, SOCK_STREAM, 0); @@ -149,7 +145,7 @@ static int tcp_peer_create_socket(prte_oob_tcp_peer_t *peer, sa_family_t family) */ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) { - pmix_list_t *local_list = &prte_mca_oob_tcp_component.local_ifs, *remote_list; + pmix_list_t *local_list = &prte_oob_base.local_ifs, *remote_list; int rc, i, j, local_if_count, remote_if_count, best, best_i = 0, best_j = 0; prte_oob_tcp_conn_op_t *op = (prte_oob_tcp_conn_op_t *) cbdata; prte_reachable_t *results = NULL; @@ -198,12 +194,12 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) results = prte_reachable.reachable(local_list, remote_list); /* Find match, bind socket. 
If connect attempt failed, move to next */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: " "attempting to connect to proc %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name))); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: " "attempting to connect to proc %s on socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), @@ -246,27 +242,27 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) peer->active_addr = (prte_oob_tcp_addr_t *) ptr; addr = peer->active_addr; /* Grab the local address we are using to bind the socket with */ - ptr = prte_mca_oob_tcp_component.local_ifs.pmix_list_sentinel.pmix_list_next; + ptr = prte_oob_base.local_ifs.pmix_list_sentinel.pmix_list_next; for (i = 0; i < best_i; i++) { ptr = ptr->pmix_list_next; } intf = (pmix_pif_t *) ptr; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: " "attempting to connect to proc %s on %s:%d - %d retries", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), pmix_net_get_hostname((struct sockaddr *) &addr->addr), pmix_net_get_port((struct sockaddr *) &addr->addr), addr->retries); if (MCA_OOB_TCP_FAILED == addr->state) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: %s:%d is down", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), pmix_net_get_hostname((struct sockaddr *) &addr->addr), pmix_net_get_port((struct sockaddr *) &addr->addr)); continue; } - if (prte_mca_oob_tcp_component.max_retries < 
addr->retries) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + if (prte_oob_base.max_retries < addr->retries) { + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: %s:%d retries exceeded", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), pmix_net_get_hostname((struct sockaddr *) &addr->addr), @@ -333,7 +329,7 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) /* non-blocking so wait for completion */ if (prte_socket_errno == EINPROGRESS || prte_socket_errno == EWOULDBLOCK) { pmix_output_verbose( - OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s waiting for connect completion to %s - activating send event", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); /* just ensure the send_event is active */ @@ -352,9 +348,9 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) * way by trying twice before giving up */ if (ECONNABORTED == prte_socket_errno) { - if (addr->retries < prte_mca_oob_tcp_component.max_retries) { + if (addr->retries < prte_oob_base.max_retries) { pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s connection aborted by OS to %s - retrying", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); @@ -381,9 +377,9 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) /* it could be that the intended recipient just hasn't * started yet. 
if requested, wait awhile and try again * unless/until we hit the maximum number of retries */ - if (0 < prte_mca_oob_tcp_component.retry_delay) { - if (prte_mca_oob_tcp_component.max_recon_attempts < 0 - || peer->num_retries < prte_mca_oob_tcp_component.max_recon_attempts) { + if (0 < prte_oob_base.retry_delay) { + if (prte_oob_base.max_recon_attempts < 0 + || peer->num_retries < prte_oob_base.max_recon_attempts) { struct timeval tv; /* close the current socket */ CLOSE_THE_SOCKET(peer->sd); @@ -394,7 +390,7 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) addr->retries = 0; } /* give it awhile and try again */ - tv.tv_sec = prte_mca_oob_tcp_component.retry_delay; + tv.tv_sec = prte_oob_base.retry_delay; tv.tv_usec = 0; ++peer->num_retries; PRTE_RETRY_TCP_CONN_STATE(peer, prte_oob_tcp_peer_try_connect, &tv); @@ -439,7 +435,7 @@ void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) goto cleanup; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_tcp_peer_try_connect: " "Connection to proc %s succeeded", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); @@ -502,7 +498,7 @@ static int tcp_peer_send_connect_ack(prte_oob_tcp_peer_t *peer) uint16_t ack_flag = htons(1); size_t sdsize, offset = 0; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s SEND CONNECT ACK", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); /* load the header */ @@ -557,7 +553,7 @@ static int tcp_peer_send_connect_nack(int sd, pmix_proc_t *name) int rc = PRTE_SUCCESS; size_t sdsize, offset = 0; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s SEND CONNECT NACK", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); /* load the header */ @@ -630,7 
+626,7 @@ void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer) int so_error = 0; prte_socklen_t so_length = sizeof(so_error); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:complete_connect called for peer %s on socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), peer->sd); @@ -645,12 +641,12 @@ void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer) } if (so_error == EINPROGRESS) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:send:handler still in progress", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); return; } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s-%s tcp_peer_complete_connect: connection failed: %s (%d)", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), strerror(so_error), so_error); @@ -660,7 +656,7 @@ void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer) /* No need to worry about the return code here - we return regardless at this point, and if an error did occur a message has already been printed for the user */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s-%s tcp_peer_complete_connect: " "connection failed with error %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), @@ -669,14 +665,14 @@ void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer) return; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp_peer_complete_connect: " 
"sending ack to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name))); if (tcp_peer_send_connect_ack(peer) == PRTE_SUCCESS) { peer->state = MCA_OOB_TCP_CONNECT_ACK; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp_peer_complete_connect: " "setting read event on connection to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name))); @@ -706,7 +702,7 @@ static int tcp_peer_send_blocking(int sd, void *data, size_t size) PMIX_ACQUIRE_OBJECT(ptr); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s send blocking of %" PRIsize_t " bytes to socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), size, sd); @@ -725,7 +721,7 @@ static int tcp_peer_send_blocking(int sd, void *data, size_t size) cnt += retval; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s blocking send complete to socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), sd); @@ -741,7 +737,7 @@ static bool retry(prte_oob_tcp_peer_t *peer, int sd, bool fatal) { int cmpval; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s SIMUL CONNECTION WITH %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); cmpval = prte_util_compare_name_fields(PRTE_NS_CMP_ALL, &peer->name, PRTE_PROC_MY_NAME); @@ -800,7 +796,7 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob uint16_t ack_flag; bool is_new = (NULL == pr); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s RECV CONNECT ACK FROM %s ON SOCKET %d", 
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == pr) ? "UNKNOWN" : PRTE_NAME_PRINT(&pr->name), sd); @@ -823,14 +819,14 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob } } else { /* unable to complete the recv */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s unable to complete recv of connect-ack from %s ON SOCKET %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == peer) ? "UNKNOWN" : PRTE_NAME_PRINT(&peer->name), sd); return PRTE_ERR_UNREACH; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s connect-ack recvd from %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == peer) ? "UNKNOWN" : PRTE_NAME_PRINT(&peer->name)); @@ -867,13 +863,13 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob if (NULL == peer) { peer = prte_oob_tcp_peer_lookup(&hdr.origin); if (NULL == peer) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_oob_tcp_recv_connect: connection from new peer", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); peer = PMIX_NEW(prte_oob_tcp_peer_t); PMIX_XFER_PROCID(&peer->name, &hdr.origin); peer->state = MCA_OOB_TCP_ACCEPTING; - pmix_list_append(&prte_mca_oob_tcp_component.peers, &peer->super); + pmix_list_append(&prte_oob_base.peers, &peer->super); } } else { /* compare the peers name to the expected value */ @@ -889,7 +885,7 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob } } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s connect-ack header from %s is okay", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); @@ -901,7 
+897,7 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob } if (!tcp_peer_recv_blocking(peer, sd, msg, hdr.nbytes)) { /* unable to complete the recv but should never happen */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s unable to complete recv of connect-ack from %s ON SOCKET %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), peer->sd); @@ -986,7 +982,7 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob } free(msg); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s connect-ack version from %s matches ours", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); @@ -997,15 +993,10 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob return PRTE_SUCCESS; } - /* set the peer into the component and OOB-level peer tables to indicate - * that we know this peer and we will be handling him - */ - PRTE_ACTIVATE_TCP_CMP_OP(peer, prte_mca_oob_tcp_component_set_module); - /* connected */ tcp_peer_connected(peer); if (OOB_TCP_DEBUG_CONNECT - <= pmix_output_get_verbosity(prte_oob_base_framework.framework_output)) { + <= pmix_output_get_verbosity(prte_oob_base.output)) { prte_oob_tcp_peer_dump(peer, "connected"); } return PRTE_SUCCESS; @@ -1017,7 +1008,7 @@ int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *pr, int sd, prte_oob */ static void tcp_peer_connected(prte_oob_tcp_peer_t *peer) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s-%s tcp_peer_connected on socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), peer->sd); @@ -1048,7 +1039,7 @@ static void 
tcp_peer_connected(prte_oob_tcp_peer_t *peer) */ void prte_oob_tcp_peer_close(prte_oob_tcp_peer_t *peer) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp_peer_close for %s sd %d state %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), peer->sd, prte_oob_tcp_state_print(peer->state)); @@ -1115,7 +1106,7 @@ static bool tcp_peer_recv_blocking(prte_oob_tcp_peer_t *peer, int sd, void *data unsigned char *ptr = (unsigned char *) data; size_t cnt = 0; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s waiting for connect ack from %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == peer) ? "UNKNOWN" : PRTE_NAME_PRINT(&(peer->name))); @@ -1124,7 +1115,7 @@ static bool tcp_peer_recv_blocking(prte_oob_tcp_peer_t *peer, int sd, void *data /* remote closed connection */ if (retval == 0) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s-%s tcp_peer_recv_blocking: " "peer closed connection: peer state %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), @@ -1162,7 +1153,7 @@ static bool tcp_peer_recv_blocking(prte_oob_tcp_peer_t *peer, int sd, void *data recv_connect_ack, who will try to establish the connection again */ pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s connect ack received error %s from %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), strerror(prte_socket_errno), @@ -1184,7 +1175,7 @@ static bool tcp_peer_recv_blocking(prte_oob_tcp_peer_t *peer, int sd, void *data cnt += retval; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s connect ack received from %s", 
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == peer) ? "UNKNOWN" : PRTE_NAME_PRINT(&(peer->name))); return true; @@ -1263,7 +1254,7 @@ void prte_oob_tcp_peer_dump(prte_oob_tcp_peer_t *peer, const char *msg) bool prte_oob_tcp_peer_accept(prte_oob_tcp_peer_t *peer) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp:peer_accept called for peer %s in state %s on socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), prte_oob_tcp_state_print(peer->state), peer->sd); @@ -1282,11 +1273,6 @@ bool prte_oob_tcp_peer_accept(prte_oob_tcp_peer_t *peer) return false; } - /* set the peer into the component and OOB-level peer tables to indicate - * that we know this peer and we will be handling him - */ - PRTE_ACTIVATE_TCP_CMP_OP(peer, prte_mca_oob_tcp_component_set_module); - tcp_peer_connected(peer); if (!peer->recv_ev_active) { peer->recv_ev_active = true; @@ -1294,13 +1280,13 @@ bool prte_oob_tcp_peer_accept(prte_oob_tcp_peer_t *peer) prte_event_add(&peer->recv_event, 0); } if (OOB_TCP_DEBUG_CONNECT - <= pmix_output_get_verbosity(prte_oob_base_framework.framework_output)) { + <= pmix_output_get_verbosity(prte_oob_base.output)) { prte_oob_tcp_peer_dump(peer, "accepted"); } return true; } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp:peer_accept ignored for peer %s in state %s on socket %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), prte_oob_tcp_state_print(peer->state), peer->sd); diff --git a/src/mca/oob/tcp/oob_tcp_connection.h b/src/rml/oob/oob_tcp_connection.h similarity index 83% rename from src/mca/oob/tcp/oob_tcp_connection.h rename to src/rml/oob/oob_tcp_connection.h index db2272959c..731cfec6d1 100644 --- a/src/mca/oob/tcp/oob_tcp_connection.h +++ b/src/rml/oob/oob_tcp_connection.h @@ -15,7 +15,7 @@ 
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,8 +35,8 @@ # include #endif -#include "oob_tcp.h" -#include "oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_peer.h" #include "src/threads/pmix_threads.h" /* State machine for connection operations */ @@ -56,7 +56,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_conn_op_t); #define PRTE_ACTIVATE_TCP_CONN_STATE(p, cbfunc) \ do { \ prte_oob_tcp_conn_op_t *cop; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] connect to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT((&(p)->name))); \ cop = PMIX_NEW(prte_oob_tcp_conn_op_t); \ @@ -76,7 +76,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_conn_op_t); #define PRTE_RETRY_TCP_CONN_STATE(p, cbfunc, tv) \ do { \ prte_oob_tcp_conn_op_t *cop; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] retry connect to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT((&(p)->name))); \ cop = PMIX_NEW(prte_oob_tcp_conn_op_t); \ @@ -86,12 +86,12 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_conn_op_t); prte_event_evtimer_add(&cop->ev, (tv)); \ } while (0); -PRTE_MODULE_EXPORT void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata); -PRTE_MODULE_EXPORT void prte_oob_tcp_peer_dump(prte_oob_tcp_peer_t *peer, const char *msg); -PRTE_MODULE_EXPORT bool prte_oob_tcp_peer_accept(prte_oob_tcp_peer_t *peer); -PRTE_MODULE_EXPORT void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer); -PRTE_MODULE_EXPORT int 
prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *peer, int sd, +PRTE_EXPORT void prte_oob_tcp_peer_try_connect(int fd, short args, void *cbdata); +PRTE_EXPORT void prte_oob_tcp_peer_dump(prte_oob_tcp_peer_t *peer, const char *msg); +PRTE_EXPORT bool prte_oob_tcp_peer_accept(prte_oob_tcp_peer_t *peer); +PRTE_EXPORT void prte_oob_tcp_peer_complete_connect(prte_oob_tcp_peer_t *peer); +PRTE_EXPORT int prte_oob_tcp_peer_recv_connect_ack(prte_oob_tcp_peer_t *peer, int sd, prte_oob_tcp_hdr_t *dhdr); -PRTE_MODULE_EXPORT void prte_oob_tcp_peer_close(prte_oob_tcp_peer_t *peer); +PRTE_EXPORT void prte_oob_tcp_peer_close(prte_oob_tcp_peer_t *peer); #endif /* _MCA_OOB_TCP_CONNECTION_H_ */ diff --git a/src/mca/oob/tcp/oob_tcp_hdr.h b/src/rml/oob/oob_tcp_hdr.h similarity index 97% rename from src/mca/oob/tcp/oob_tcp_hdr.h rename to src/rml/oob/oob_tcp_hdr.h index e014ccdd37..05977156be 100644 --- a/src/mca/oob/tcp/oob_tcp_hdr.h +++ b/src/rml/oob/oob_tcp_hdr.h @@ -16,7 +16,7 @@ * Copyright (c) 2017-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow diff --git a/src/mca/oob/tcp/oob_tcp_listener.c b/src/rml/oob/oob_tcp_listener.c similarity index 81% rename from src/mca/oob/tcp/oob_tcp_listener.c rename to src/rml/oob/oob_tcp_listener.c index 90cf611878..0c11764bae 100644 --- a/src/mca/oob/tcp/oob_tcp_listener.c +++ b/src/rml/oob/oob_tcp_listener.c @@ -68,12 +68,11 @@ #include "src/util/pmix_parse_options.h" #include "src/util/pmix_show_help.h" -#include "src/mca/oob/tcp/oob_tcp.h" -#include "src/mca/oob/tcp/oob_tcp_common.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" -#include "src/mca/oob/tcp/oob_tcp_connection.h" -#include "src/mca/oob/tcp/oob_tcp_listener.h" -#include "src/mca/oob/tcp/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_listener.h" +#include "src/rml/oob/oob_tcp_peer.h" static void connection_event_handler(int incoming_sd, short flags, void *cbdata); static void *listen_thread(pmix_object_t *obj); @@ -101,9 +100,9 @@ int prte_oob_tcp_start_listening(void) prte_oob_tcp_listener_t *listener; /* if we don't have any TCP interfaces, we shouldn't be here */ - if (NULL == prte_mca_oob_tcp_component.ipv4conns + if (NULL == prte_oob_base.ipv4conns #if PRTE_ENABLE_IPV6 - && NULL == prte_mca_oob_tcp_component.ipv6conns + && NULL == prte_oob_base.ipv6conns #endif ) { PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); @@ -128,25 +127,25 @@ int prte_oob_tcp_start_listening(void) * harvest connection requests as rapidly as possible */ if (PRTE_PROC_IS_MASTER) { - if (0 > pipe(prte_mca_oob_tcp_component.stop_thread)) { + if (0 > pipe(prte_oob_base.stop_thread)) { PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); return PRTE_ERR_OUT_OF_RESOURCE; } /* Make sure the pipe FDs are set to close-on-exec so that they don't leak into children */ - if (pmix_fd_set_cloexec(prte_mca_oob_tcp_component.stop_thread[0]) != PRTE_SUCCESS - || 
pmix_fd_set_cloexec(prte_mca_oob_tcp_component.stop_thread[1]) != PRTE_SUCCESS) { - close(prte_mca_oob_tcp_component.stop_thread[0]); - close(prte_mca_oob_tcp_component.stop_thread[1]); + if (pmix_fd_set_cloexec(prte_oob_base.stop_thread[0]) != PRTE_SUCCESS + || pmix_fd_set_cloexec(prte_oob_base.stop_thread[1]) != PRTE_SUCCESS) { + close(prte_oob_base.stop_thread[0]); + close(prte_oob_base.stop_thread[1]); PRTE_ERROR_LOG(PRTE_ERR_IN_ERRNO); return PRTE_ERR_IN_ERRNO; } - prte_mca_oob_tcp_component.listen_thread_active = true; - prte_mca_oob_tcp_component.listen_thread.t_run = listen_thread; - prte_mca_oob_tcp_component.listen_thread.t_arg = NULL; - if (PRTE_SUCCESS != (rc = pmix_thread_start(&prte_mca_oob_tcp_component.listen_thread))) { + prte_oob_base.listen_thread_active = true; + prte_oob_base.listen_thread.t_run = listen_thread; + prte_oob_base.listen_thread.t_arg = NULL; + if (PRTE_SUCCESS != (rc = pmix_thread_start(&prte_oob_base.listen_thread))) { PRTE_ERROR_LOG(rc); pmix_output(0, "%s Unable to start listen thread", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); } @@ -155,7 +154,7 @@ int prte_oob_tcp_start_listening(void) /* otherwise, setup to listen via the event lib */ - PMIX_LIST_FOREACH(listener, &prte_mca_oob_tcp_component.listeners, prte_oob_tcp_listener_t) + PMIX_LIST_FOREACH(listener, &prte_oob_base.listeners, prte_oob_tcp_listener_t) { listener->ev_active = true; prte_event_set(prte_event_base, &listener->event, listener->sd, @@ -191,16 +190,16 @@ static int create_listen(void) * port in the range. 
Otherwise, tcp_port_min will be 0, which * means "pick any port" */ - if (NULL != prte_mca_oob_tcp_component.tcp_static_ports) { + if (NULL != prte_oob_base.tcp_static_ports) { /* if static ports were provided, take the * first entry in the list */ - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_mca_oob_tcp_component.tcp_static_ports[0]); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_oob_base.tcp_static_ports[0]); /* flag that we are using static ports */ prte_static_ports = true; - } else if (NULL != prte_mca_oob_tcp_component.tcp_dyn_ports) { + } else if (NULL != prte_oob_base.tcp_dyn_ports) { /* take the entire range */ - ports = PMIX_ARGV_COPY_COMPAT(prte_mca_oob_tcp_component.tcp_dyn_ports); + ports = PMIX_ARGV_COPY_COMPAT(prte_oob_base.tcp_dyn_ports); prte_static_ports = false; } else { /* flag the system to dynamically take any available port */ @@ -225,7 +224,7 @@ static int create_listen(void) * sockets to support more flexible wireup protocols */ for (i = 0; i < PMIX_ARGV_COUNT_COMPAT(ports); i++) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "%s attempting to bind to IPv4 port %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), ports[i]); /* get the port number */ @@ -239,7 +238,7 @@ static int create_listen(void) sd = socket(AF_INET, SOCK_STREAM, 0); if (sd < 0) { if (EAFNOSUPPORT != prte_socket_errno) { - pmix_output(0, "prte_mca_oob_tcp_component_init: socket() failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen: socket() failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); } PMIX_ARGV_FREE_COMPAT(ports); @@ -254,7 +253,7 @@ static int create_listen(void) } if (setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, (const char *) &flags, sizeof(flags)) < 0) { pmix_output(0, - "prte_oob_tcp_create_listen: unable to set the " + "prte_oob_create_listen: unable to set the " "SO_REUSEADDR option (%s:%d)\n", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); @@ -266,7 +265,7 
@@ static int create_listen(void) this FD */ if (pmix_fd_set_cloexec(sd) != PRTE_SUCCESS) { pmix_output(0, - "prte_oob_tcp_create_listen: unable to set the " + "prte_oob_create_listen: unable to set the " "listening socket to CLOEXEC (%s:%d)\n", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); @@ -287,7 +286,7 @@ static int create_listen(void) } /* resolve assigned port */ if (getsockname(sd, (struct sockaddr *) &inaddr, &addrlen) < 0) { - pmix_output(0, "prte_oob_tcp_create_listen: getsockname(): %s (%d)", + pmix_output(0, "prte_oob_create_listen: getsockname(): %s (%d)", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); PMIX_ARGV_FREE_COMPAT(ports); @@ -296,7 +295,7 @@ static int create_listen(void) /* setup listen backlog to maximum allowed by kernel */ if (listen(sd, SOMAXCONN) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: listen(): %s (%d)", + pmix_output(0, "prte_oob_create_listen: listen(): %s (%d)", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); PMIX_ARGV_FREE_COMPAT(ports); @@ -305,7 +304,7 @@ static int create_listen(void) /* set socket up to be non-blocking, otherwise accept could block */ if ((flags = fcntl(sd, F_GETFL, 0)) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: fcntl(F_GETFL) failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen init: fcntl(F_GETFL) failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); PMIX_ARGV_FREE_COMPAT(ports); @@ -313,7 +312,7 @@ static int create_listen(void) } flags |= O_NONBLOCK; if (fcntl(sd, F_SETFL, flags) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: fcntl(F_SETFL) failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen init: fcntl(F_SETFL) failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); PMIX_ARGV_FREE_COMPAT(ports); @@ -328,13 +327,13 @@ static int create_listen(void) /* save the first one */ prte_process_info.my_port = conn->port; 
} - pmix_list_append(&prte_mca_oob_tcp_component.listeners, &conn->item); + pmix_list_append(&prte_oob_base.listeners, &conn->item); /* and to our ports */ pmix_asprintf(&tconn, "%d", ntohs(((struct sockaddr_in *) &inaddr)->sin_port)); - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_mca_oob_tcp_component.ipv4ports, tconn); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.ipv4ports, tconn); free(tconn); if (OOB_TCP_DEBUG_CONNECT - <= pmix_output_get_verbosity(prte_oob_base_framework.framework_output)) { + <= pmix_output_get_verbosity(prte_oob_base.output)) { port = ntohs(((struct sockaddr_in *) &inaddr)->sin_port); pmix_output(0, "%s assigned IPv4 port %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), port); } @@ -347,7 +346,7 @@ static int create_listen(void) /* done with this, so release it */ PMIX_ARGV_FREE_COMPAT(ports); - if (0 == pmix_list_get_size(&prte_mca_oob_tcp_component.listeners)) { + if (0 == pmix_list_get_size(&prte_oob_base.listeners)) { /* cleanup */ if (0 <= sd) { CLOSE_THE_SOCKET(sd); @@ -384,16 +383,16 @@ static int create_listen6(void) * means "pick any port" */ if (PRTE_PROC_IS_DAEMON) { - if (NULL != prte_mca_oob_tcp_component.tcp6_static_ports) { + if (NULL != prte_oob_base.tcp6_static_ports) { /* if static ports were provided, take the * first entry in the list */ - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_mca_oob_tcp_component.tcp6_static_ports[0]); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_oob_base.tcp6_static_ports[0]); /* flag that we are using static ports */ prte_static_ports = true; - } else if (NULL != prte_mca_oob_tcp_component.tcp6_dyn_ports) { + } else if (NULL != prte_oob_base.tcp6_dyn_ports) { /* take the entire range */ - ports = PMIX_ARGV_COPY_COMPAT(prte_mca_oob_tcp_component.tcp6_dyn_ports); + ports = PMIX_ARGV_COPY_COMPAT(prte_oob_base.tcp6_dyn_ports); prte_static_ports = false; } else { /* flag the system to dynamically take any available port */ @@ -401,16 +400,16 @@ static int create_listen6(void) prte_static_ports = false; } } else 
{ - if (NULL != prte_mca_oob_tcp_component.tcp6_static_ports) { + if (NULL != prte_oob_base.tcp6_static_ports) { /* if static ports were provided, take the * first entry in the list */ - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_mca_oob_tcp_component.tcp6_static_ports[0]); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ports, prte_oob_base.tcp6_static_ports[0]); /* flag that we are using static ports */ prte_static_ports = true; - } else if (NULL != prte_mca_oob_tcp_component.tcp6_dyn_ports) { + } else if (NULL != prte_oob_base.tcp6_dyn_ports) { /* take the entire range */ - ports = PMIX_ARGV_COPY_COMPAT(prte_mca_oob_tcp_component.tcp6_dyn_ports); + ports = PMIX_ARGV_COPY_COMPAT(prte_oob_base.tcp6_dyn_ports); prte_static_ports = false; } else { /* flag the system to dynamically take any available port */ @@ -436,7 +435,7 @@ static int create_listen6(void) * sockets to support more flexible wireup protocols */ for (i = 0; i < PMIX_ARGV_COUNT_COMPAT(ports); i++) { - pmix_output_verbose(5, prte_oob_base_framework.framework_output, + pmix_output_verbose(5, prte_oob_base.output, "%s attempting to bind to IPv6 port %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), ports[i]); /* get the port number */ @@ -450,7 +449,7 @@ static int create_listen6(void) sd = socket(AF_INET6, SOCK_STREAM, 0); if (sd < 0) { if (EAFNOSUPPORT != prte_socket_errno) { - pmix_output(0, "prte_mca_oob_tcp_component_init: socket() failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen6: socket() failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); } return PRTE_ERR_IN_ERRNO; @@ -459,7 +458,7 @@ static int create_listen6(void) this FD */ if (pmix_fd_set_cloexec(sd) != PRTE_SUCCESS) { pmix_output(0, - "prte_oob_tcp_create_listen6: unable to set the " + "prte_oob_create_listen6: unable to set the " "listening socket to CLOEXEC (%s:%d)\n", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); @@ -496,7 +495,7 @@ static int create_listen6(void) } /* resolve assigned port */ if 
(getsockname(sd, (struct sockaddr *) &inaddr, &addrlen) < 0) { - pmix_output(0, "prte_oob_tcp_create_listen: getsockname(): %s (%d)", + pmix_output(0, "prte_oob_create_listen6: getsockname(): %s (%d)", strerror(prte_socket_errno), prte_socket_errno); CLOSE_THE_SOCKET(sd); return PRTE_ERROR; @@ -504,20 +503,20 @@ static int create_listen6(void) /* setup listen backlog to maximum allowed by kernel */ if (listen(sd, SOMAXCONN) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: listen(): %s (%d)", + pmix_output(0, "prte_oob_create_listen6: listen(): %s (%d)", strerror(prte_socket_errno), prte_socket_errno); return PRTE_ERROR; } /* set socket up to be non-blocking, otherwise accept could block */ if ((flags = fcntl(sd, F_GETFL, 0)) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: fcntl(F_GETFL) failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen6: fcntl(F_GETFL) failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); return PRTE_ERROR; } flags |= O_NONBLOCK; if (fcntl(sd, F_SETFL, flags) < 0) { - pmix_output(0, "prte_mca_oob_tcp_component_init: fcntl(F_SETFL) failed: %s (%d)", + pmix_output(0, "prte_oob_create_listen6: fcntl(F_SETFL) failed: %s (%d)", strerror(prte_socket_errno), prte_socket_errno); return PRTE_ERROR; } @@ -527,13 +526,13 @@ static int create_listen6(void) conn->tcp6 = true; conn->sd = sd; conn->port = ntohs(((struct sockaddr_in6 *) &inaddr)->sin6_port); - pmix_list_append(&prte_mca_oob_tcp_component.listeners, &conn->item); + pmix_list_append(&prte_oob_base.listeners, &conn->item); /* and to our ports */ pmix_asprintf(&tconn, "%d", ntohs(((struct sockaddr_in6 *) &inaddr)->sin6_port)); - PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_mca_oob_tcp_component.ipv6ports, tconn); + PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.ipv6ports, tconn); free(tconn); if (OOB_TCP_DEBUG_CONNECT - <= pmix_output_get_verbosity(prte_oob_base_framework.framework_output)) { + <= pmix_output_get_verbosity(prte_oob_base.output)) { pmix_output(0, "%s 
assigned IPv6 port %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (int) ntohs(((struct sockaddr_in6 *) &inaddr)->sin6_port)); } @@ -543,7 +542,7 @@ static int create_listen6(void) break; } } - if (0 == pmix_list_get_size(&prte_mca_oob_tcp_component.listeners)) { + if (0 == pmix_list_get_size(&prte_oob_base.listeners)) { /* cleanup */ CLOSE_THE_SOCKET(sd); PMIX_ARGV_FREE_COMPAT(ports); @@ -579,28 +578,27 @@ static void *listen_thread(pmix_object_t *obj) * to the event method for handling any further connections * so as to minimize overhead */ - while (prte_mca_oob_tcp_component.listen_thread_active) { + while (prte_oob_base.listen_thread_active) { FD_ZERO(&readfds); max = -1; - PMIX_LIST_FOREACH(listener, &prte_mca_oob_tcp_component.listeners, prte_oob_tcp_listener_t) + PMIX_LIST_FOREACH(listener, &prte_oob_base.listeners, prte_oob_tcp_listener_t) { FD_SET(listener->sd, &readfds); max = (listener->sd > max) ? listener->sd : max; } /* add the stop_thread fd */ - FD_SET(prte_mca_oob_tcp_component.stop_thread[0], &readfds); - max = (prte_mca_oob_tcp_component.stop_thread[0] > max) ? prte_mca_oob_tcp_component.stop_thread[0] - : max; + FD_SET(prte_oob_base.stop_thread[0], &readfds); + max = (prte_oob_base.stop_thread[0] > max) ? prte_oob_base.stop_thread[0] : max; /* set timeout interval */ - timeout.tv_sec = prte_mca_oob_tcp_component.listen_thread_tv.tv_sec; - timeout.tv_usec = prte_mca_oob_tcp_component.listen_thread_tv.tv_usec; + timeout.tv_sec = prte_oob_base.listen_thread_tv.tv_sec; + timeout.tv_usec = prte_oob_base.listen_thread_tv.tv_usec; /* Block in a select to avoid hammering the cpu. If a connection * comes in, we'll get woken up right away. 
*/ rc = select(max + 1, &readfds, NULL, NULL, &timeout); - if (!prte_mca_oob_tcp_component.listen_thread_active) { + if (!prte_oob_base.listen_thread_active) { /* we've been asked to terminate */ return NULL; } @@ -617,7 +615,7 @@ static void *listen_thread(pmix_object_t *obj) */ do { accepted_connections = 0; - PMIX_LIST_FOREACH(listener, &prte_mca_oob_tcp_component.listeners, prte_oob_tcp_listener_t) + PMIX_LIST_FOREACH(listener, &prte_oob_base.listeners, prte_oob_tcp_listener_t) { sd = listener->sd; @@ -677,7 +675,7 @@ static void *listen_thread(pmix_object_t *obj) } } - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s prte_oob_tcp_listen_thread: incoming connection: " "(%d, %d) %s:%d\n", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), pending_connection->fd, @@ -716,28 +714,6 @@ static void *listen_thread(pmix_object_t *obj) } done: -#if 0 - /* once we complete the initial launch, the "flood" of connections - * will end - only connection requests from local procs, connect/accept - * operations across mpirun instances, or the occasional tool will need - * to be serviced. As these are relatively small events, we can easily - * handle them in the context of the event library and no longer require - * a separate connection harvesting thread. 
So switch over to the event - * lib handler now - */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, - "%s prte_oob_tcp_listen_thread: switching to event lib", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - /* setup to listen via event library */ - PMIX_LIST_FOREACH(listener, &prte_mca_oob_tcp_component.listeners, prte_oob_tcp_listener_t) { - prte_event_set(prte_event_base, listener->event, - listener->sd, - PRTE_EV_READ|PRTE_EV_PERSIST, - connection_event_handler, - 0); - prte_event_add(listener->event, 0); - } -#endif return NULL; } @@ -753,7 +729,7 @@ static void connection_handler(int sd, short flags, void *cbdata) PMIX_ACQUIRE_OBJECT(new_connection); - pmix_output_verbose(4, prte_oob_base_framework.framework_output, + pmix_output_verbose(4, prte_oob_base.output, "%s connection_handler: working connection " "(%d, %d) %s:%d\n", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), new_connection->fd, prte_socket_errno, @@ -761,8 +737,8 @@ static void connection_handler(int sd, short flags, void *cbdata) pmix_net_get_port((struct sockaddr *) &new_connection->addr)); /* process the connection */ - prte_oob_tcp_module.accept_connection(new_connection->fd, - (struct sockaddr *) &(new_connection->addr)); + prte_oob_accept_connection(new_connection->fd, (struct sockaddr *) &(new_connection->addr)); + /* cleanup */ PMIX_RELEASE(new_connection); } @@ -778,7 +754,7 @@ static void connection_event_handler(int incoming_sd, short flags, void *cbdata) PRTE_HIDE_UNUSED_PARAMS(flags, cbdata); sd = accept(incoming_sd, (struct sockaddr *) &addr, &addrlen); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s connection_event_handler: working connection " "(%d, %d) %s:%d\n", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), sd, prte_socket_errno, @@ -815,7 +791,7 @@ static void connection_event_handler(int incoming_sd, short flags, void *cbdata) } /* process the connection */ 
- prte_oob_tcp_module.accept_connection(sd, &addr); + prte_oob_accept_connection(sd, &addr); } static void tcp_ev_cons(prte_oob_tcp_listener_t *event) diff --git a/src/mca/oob/tcp/oob_tcp_listener.h b/src/rml/oob/oob_tcp_listener.h similarity index 93% rename from src/mca/oob/tcp/oob_tcp_listener.h rename to src/rml/oob/oob_tcp_listener.h index a109b15493..bfb4b984ae 100644 --- a/src/mca/oob/tcp/oob_tcp_listener.h +++ b/src/rml/oob/oob_tcp_listener.h @@ -15,7 +15,7 @@ * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -60,6 +60,6 @@ typedef struct { } prte_oob_tcp_pending_connection_t; PMIX_CLASS_DECLARATION(prte_oob_tcp_pending_connection_t); -PRTE_MODULE_EXPORT int prte_oob_tcp_start_listening(void); +PRTE_EXPORT int prte_oob_tcp_start_listening(void); #endif /* _MCA_OOB_TCP_LISTENER_H_ */ diff --git a/src/mca/oob/tcp/oob_tcp_peer.h b/src/rml/oob/oob_tcp_peer.h similarity index 96% rename from src/mca/oob/tcp/oob_tcp_peer.h rename to src/rml/oob/oob_tcp_peer.h index a5753538be..04cac5c3cd 100644 --- a/src/mca/oob/tcp/oob_tcp_peer.h +++ b/src/rml/oob/oob_tcp_peer.h @@ -17,7 +17,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights * reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,8 +32,8 @@ #include "src/event/event-internal.h" -#include "oob_tcp.h" -#include "oob_tcp_sendrecv.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_sendrecv.h" #include "src/threads/pmix_threads.h" typedef struct { diff --git a/src/mca/oob/tcp/oob_tcp_sendrecv.c b/src/rml/oob/oob_tcp_sendrecv.c similarity index 94% rename from src/mca/oob/tcp/oob_tcp_sendrecv.c rename to src/rml/oob/oob_tcp_sendrecv.c index 8b2b627fc0..6d473cb1b6 100644 --- a/src/mca/oob/tcp/oob_tcp_sendrecv.c +++ b/src/rml/oob/oob_tcp_sendrecv.c @@ -16,7 +16,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -71,11 +71,10 @@ #include "src/threads/pmix_threads.h" #include "src/util/name_fns.h" -#include "oob_tcp.h" -#include "src/mca/oob/tcp/oob_tcp_common.h" -#include "src/mca/oob/tcp/oob_tcp_component.h" -#include "src/mca/oob/tcp/oob_tcp_connection.h" -#include "src/mca/oob/tcp/oob_tcp_peer.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_common.h" +#include "src/rml/oob/oob_tcp_connection.h" +#include "src/rml/oob/oob_tcp_peer.h" #define OOB_SEND_MAX_RETRIES 3 @@ -205,14 +204,14 @@ void prte_oob_tcp_send_handler(int sd, short flags, void *cbdata) PMIX_ACQUIRE_OBJECT(peer); msg = peer->send_msg; - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp:send_handler called to send to peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_TCP_CONNECTING: case MCA_OOB_TCP_CLOSED: - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, 
prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp:send_handler %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), prte_oob_tcp_state_print(peer->state)); prte_oob_tcp_peer_complete_connect(peer); @@ -225,17 +224,17 @@ void prte_oob_tcp_send_handler(int sd, short flags, void *cbdata) } break; case MCA_OOB_TCP_CONNECTED: - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s tcp:send_handler SENDING TO %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == peer->send_msg) ? "NULL" : PRTE_NAME_PRINT(&peer->name)); if (NULL != msg) { - pmix_output_verbose(2, prte_oob_base_framework.framework_output, + pmix_output_verbose(2, prte_oob_base.output, "oob:tcp:send_handler SENDING MSG"); if (PRTE_SUCCESS == (rc = send_msg(peer, msg))) { /* this msg is complete */ if (NULL != msg->data || NULL == msg->msg) { /* the relay is complete - release the data */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, + pmix_output_verbose(2, prte_oob_base.output, "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), @@ -244,7 +243,7 @@ void prte_oob_tcp_send_handler(int sd, short flags, void *cbdata) peer->send_msg = NULL; } else { /* we are done - notify the RML */ - pmix_output_verbose(2, prte_oob_base_framework.framework_output, + pmix_output_verbose(2, prte_oob_base.output, "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), @@ -327,7 +326,7 @@ static int read_bytes(prte_oob_tcp_peer_t *peer) * the error back to the RML and let the caller know * to abort this message */ - pmix_output_verbose(OOB_TCP_DEBUG_FAIL, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_FAIL, prte_oob_base.output, "%s-%s prte_oob_tcp_msg_recv: readv failed: %s (%d)", 
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name)), strerror(prte_socket_errno), prte_socket_errno); @@ -340,7 +339,7 @@ static int read_bytes(prte_oob_tcp_peer_t *peer) /* the remote peer closed the connection - report that condition * and let the caller know */ - pmix_output_verbose(OOB_TCP_DEBUG_FAIL, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_FAIL, prte_oob_base.output, "%s-%s prte_oob_tcp_msg_recv: peer closed connection", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&(peer->name))); /* stop all events */ @@ -390,14 +389,14 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) PMIX_ACQUIRE_OBJECT(peer); - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler called for peer %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_TCP_CONNECT_ACK: if (PRTE_SUCCESS == (rc = prte_oob_tcp_peer_recv_connect_ack(peer, peer->sd, NULL))) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler starting send/recv events", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); /* we connected! 
Start the send/recv events */ @@ -425,7 +424,7 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) /* we get an unreachable error returned if a connection * completes but is rejected - otherwise, we don't want * to terminate as we might be retrying the connection */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s UNABLE TO COMPLETE CONNECT ACK WITH %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name)); prte_event_del(&peer->recv_event); @@ -434,11 +433,11 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) } break; case MCA_OOB_TCP_CONNECTED: - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler CONNECTED", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); /* allocate a new message and setup for recv */ if (NULL == peer->recv_msg) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler allocate new recv msg", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); peer->recv_msg = PMIX_NEW(prte_oob_tcp_recv_t); @@ -454,7 +453,7 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) } /* if the header hasn't been completely read, read it */ if (!peer->recv_msg->hdr_recvd) { - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler read hdr", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); if (PRTE_SUCCESS == (rc = read_bytes(peer))) { /* completed reading the header */ @@ -464,14 +463,14 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) /* if this is a zero-byte message, then we are done */ if (0 == peer->recv_msg->hdr.nbytes) { 
pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag); peer->recv_msg->data = NULL; // make sure } else { pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s:tcp:recv:handler allocate data region of size %lu", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (unsigned long) peer->recv_msg->hdr.nbytes); @@ -487,7 +486,7 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) return; } else { /* close the connection */ - pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s:tcp:recv:handler error reading bytes - closing connection", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); prte_oob_tcp_peer_close(peer); @@ -503,7 +502,7 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) if (PRTE_SUCCESS == (rc = read_bytes(peer))) { /* we recvd all of the message */ pmix_output_verbose( - OOB_TCP_DEBUG_CONNECT, prte_oob_base_framework.framework_output, + OOB_TCP_DEBUG_CONNECT, prte_oob_base.output, "%s RECVD COMPLETE MESSAGE FROM %s (ORIGIN %s) OF %d BYTES FOR DEST %s TAG %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->name), PRTE_NAME_PRINT(&peer->recv_msg->hdr.origin), (int) peer->recv_msg->hdr.nbytes, @@ -513,7 +512,7 @@ void prte_oob_tcp_recv_handler(int sd, short flags, void *cbdata) if (PMIX_CHECK_PROCID(&peer->recv_msg->hdr.dst, PRTE_PROC_MY_NAME)) { /* yes - post it to the RML for delivery */ pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s DELIVERING TO RML tag = %d seq_num = %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), peer->recv_msg->hdr.tag, peer->recv_msg->hdr.seq_num); @@ -525,7 +524,7 @@ void prte_oob_tcp_recv_handler(int 
sd, short flags, void *cbdata) /* promote this to the OOB as some other transport might * be the next best hop */ pmix_output_verbose(OOB_TCP_DEBUG_CONNECT, - prte_oob_base_framework.framework_output, + prte_oob_base.output, "%s TCP PROMOTING ROUTED MESSAGE FOR %s TO OOB", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&peer->recv_msg->hdr.dst)); diff --git a/src/mca/oob/tcp/oob_tcp_sendrecv.h b/src/rml/oob/oob_tcp_sendrecv.h similarity index 95% rename from src/mca/oob/tcp/oob_tcp_sendrecv.h rename to src/rml/oob/oob_tcp_sendrecv.h index 973f0d0136..c654bc8abe 100644 --- a/src/mca/oob/tcp/oob_tcp_sendrecv.h +++ b/src/rml/oob/oob_tcp_sendrecv.h @@ -15,7 +15,7 @@ * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,8 +31,8 @@ #include "src/class/pmix_list.h" #include "src/util/pmix_string_copy.h" -#include "oob_tcp.h" -#include "oob_tcp_hdr.h" +#include "src/rml/oob/oob_tcp.h" +#include "src/rml/oob/oob_tcp_hdr.h" #include "src/rml/rml.h" #include "src/threads/pmix_threads.h" @@ -98,7 +98,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_recv_t); #define MCA_OOB_TCP_QUEUE_SEND(m, p) \ do { \ prte_oob_tcp_send_t *_s; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] queue send to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT(&((m)->dst))); \ _s = PMIX_NEW(prte_oob_tcp_send_t); \ @@ -130,7 +130,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_recv_t); #define MCA_OOB_TCP_QUEUE_PENDING(m, p) \ do { \ prte_oob_tcp_send_t *_s; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] queue 
pending to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT(&((m)->dst))); \ _s = PMIX_NEW(prte_oob_tcp_send_t); \ @@ -162,7 +162,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_recv_t); #define MCA_OOB_TCP_QUEUE_RELAY(m, p) \ do { \ prte_oob_tcp_send_t *_s; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] queue relay to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT(&((p)->name))); \ _s = PMIX_NEW(prte_oob_tcp_send_t); \ @@ -196,7 +196,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_msg_op_t); #define PRTE_ACTIVATE_TCP_POST_SEND(ms, cbfunc) \ do { \ prte_oob_tcp_msg_op_t *mop; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] post send to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT(&((ms)->dst))); \ mop = PMIX_NEW(prte_oob_tcp_msg_op_t); \ @@ -218,7 +218,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_msg_error_t); prte_oob_tcp_msg_error_t *mop; \ prte_oob_tcp_send_t *snd; \ prte_oob_tcp_recv_t *proxy; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] post msg error to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT((h))); \ mop = PMIX_NEW(prte_oob_tcp_msg_error_t); \ @@ -249,7 +249,7 @@ PMIX_CLASS_DECLARATION(prte_oob_tcp_msg_error_t); #define PRTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \ do { \ prte_oob_tcp_msg_error_t *mop; \ - pmix_output_verbose(5, prte_oob_base_framework.framework_output, \ + pmix_output_verbose(5, prte_oob_base.output, \ "%s:[%s:%d] post no route to %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), \ __FILE__, __LINE__, PRTE_NAME_PRINT((h))); \ mop = PMIX_NEW(prte_oob_tcp_msg_error_t); \ diff --git a/src/mca/oob/tcp/owner.txt b/src/rml/oob/owner.txt similarity index 100% rename from src/mca/oob/tcp/owner.txt
rename to src/rml/oob/owner.txt diff --git a/src/rml/rml.c b/src/rml/rml.c index f8be31be90..be965841b0 100644 --- a/src/rml/rml.c +++ b/src/rml/rml.c @@ -24,12 +24,14 @@ #include "src/mca/mca.h" #include "src/util/pmix_output.h" -#include "src/mca/errmgr/errmgr.h" -#include "src/rml/rml.h" #include "src/mca/state/state.h" #include "src/runtime/prte_wait.h" #include "src/threads/pmix_threads.h" #include "src/util/name_fns.h" +#include "src/mca/errmgr/errmgr.h" +#include "src/rml/rml.h" +#include "src/rml/rml_contact.h" +#include "src/rml/oob/oob.h" prte_rml_base_t prte_rml_base = { .rml_output = -1, @@ -82,10 +84,22 @@ void prte_rml_register(void) pmix_mca_base_var_register_synonym(ret, "prte", "routed", "radix", NULL, PMIX_MCA_BASE_VAR_SYN_FLAG_DEPRECATED); + prte_oob_register(); + + verbosity = 0; + pmix_mca_base_var_register("prte", "oob", "base", "verbose", + "Debug verbosity of the out-of-band subsystem", + PMIX_MCA_BASE_VAR_TYPE_INT, + &verbosity); + if (0 < verbosity) { + prte_oob_base.output = pmix_output_open(NULL); + pmix_output_set_verbosity(prte_oob_base.output, verbosity); + } } void prte_rml_close(void) { + prte_oob_close(); PMIX_LIST_DESTRUCT(&prte_rml_base.posted_recvs); PMIX_LIST_DESTRUCT(&prte_rml_base.unmatched_msgs); PMIX_LIST_DESTRUCT(&prte_rml_base.children); @@ -94,8 +108,12 @@ void prte_rml_close(void) } } -void prte_rml_open(void) +int prte_rml_open(void) { + char *uri = NULL; + pmix_value_t val; + int ret; + /* construct object for holding the active plugin modules */ PMIX_CONSTRUCT(&prte_rml_base.posted_recvs, pmix_list_t); PMIX_CONSTRUCT(&prte_rml_base.unmatched_msgs, pmix_list_t); @@ -106,6 +124,54 @@ void prte_rml_open(void) prte_rml_compute_routing_tree(); prte_rml_base.lifeline = PRTE_PROC_MY_PARENT->rank; + + prte_oob_open(); + + /* store our URI for later */ + prte_oob_base_get_addr(&uri); + PMIX_VALUE_LOAD(&val, uri, PMIX_STRING); + ret = PMIx_Store_internal(PRTE_PROC_MY_NAME, PMIX_PROC_URI, &val); + if (PMIX_SUCCESS != ret) { + 
PRTE_ERROR_LOG(PRTE_ERROR); + PMIX_VALUE_DESTRUCT(&val); + return PRTE_ERROR; + } + PMIX_VALUE_DESTRUCT(&val); + // add it to our local info + prte_process_info.my_uri = strdup(uri); + + if (PRTE_PROC_IS_MASTER) { + prte_process_info.my_hnp_uri = uri; + } else { + free(uri); + if (NULL == prte_process_info.my_hnp_uri) { + // this is an error + PRTE_ERROR_LOG(PRTE_ERROR); + return PRTE_ERROR; + } + /* extract the HNP's name so we can update the routing table */ + ret = prte_rml_parse_uris(prte_process_info.my_hnp_uri, + PRTE_PROC_MY_HNP, + NULL); + if (PRTE_SUCCESS != ret) { + PRTE_ERROR_LOG(ret); + return ret; + } + /* Set the contact info in the RML - this won't actually establish + * the connection, but just tells the RML how to reach the HNP + * if/when we attempt to send to it + */ + PMIX_VALUE_LOAD(&val, prte_process_info.my_hnp_uri, PMIX_STRING); + ret = PMIx_Store_internal(PRTE_PROC_MY_HNP, PMIX_PROC_URI, &val); + if (PMIX_SUCCESS != ret) { + PRTE_ERROR_LOG(ret); + PMIX_VALUE_DESTRUCT(&val); + return ret; + } + PMIX_VALUE_DESTRUCT(&val); + } + + return PRTE_SUCCESS; } void prte_rml_send_callback(int status, pmix_proc_t *peer, diff --git a/src/rml/rml.h b/src/rml/rml.h index dff3bc801e..7cb46aaed1 100644 --- a/src/rml/rml.h +++ b/src/rml/rml.h @@ -17,7 +17,7 @@ * and Technology (RIST). All rights reserved. * * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -145,6 +145,7 @@ PRTE_EXPORT void prte_rml_recv_cancel(pmix_proc_t *peer, prte_rml_tag_t tag); typedef struct { int rml_output; int routed_output; + int oob_output; int max_retries; pmix_list_t posted_recvs; pmix_list_t unmatched_msgs; @@ -158,7 +159,7 @@ PRTE_EXPORT extern prte_rml_base_t prte_rml_base; PRTE_EXPORT void prte_rml_register(void); PRTE_EXPORT void prte_rml_close(void); -PRTE_EXPORT void prte_rml_open(void); +PRTE_EXPORT int prte_rml_open(void); /* common implementations */ PRTE_EXPORT void prte_rml_base_post_recv(int sd, short args, void *cbdata); PRTE_EXPORT void prte_rml_base_process_msg(int fd, short flags, void *cbdata); diff --git a/src/rml/rml_send.c b/src/rml/rml_send.c index a429650476..8a81f143a4 100644 --- a/src/rml/rml_send.c +++ b/src/rml/rml_send.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -31,11 +31,11 @@ #include "src/util/pmix_name_fns.h" #include "src/mca/errmgr/errmgr.h" -#include "src/mca/oob/base/base.h" #include "src/runtime/prte_globals.h" #include "src/threads/pmix_threads.h" #include "src/rml/rml.h" +#include "src/rml/oob/oob.h" int prte_rml_send_buffer_nb(pmix_rank_t rank, pmix_data_buffer_t *buffer, diff --git a/src/runtime/data_type_support/prte_dt_print_fns.c b/src/runtime/data_type_support/prte_dt_print_fns.c index 3783e29ae4..087bc51e7c 100644 --- a/src/runtime/data_type_support/prte_dt_print_fns.c +++ b/src/runtime/data_type_support/prte_dt_print_fns.c @@ -58,8 +58,8 @@ static void display_cpus(prte_topology_t *t, char *tmp1, *tmp2; - npus = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PU); - ncores = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_CORE); + npus = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PU); + ncores = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_CORE); if (npus == ncores && !use_hwthread_cpus) { /* the bits in this bitmap represent cores */ bits_as_cores = true; @@ -70,10 +70,10 @@ static void display_cpus(prte_topology_t *t, } avail = hwloc_bitmap_alloc(); pmix_asprintf(&tmp1, " \n"); - npkgs = hwloc_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PACKAGE); + npkgs = prte_hwloc_base_get_nbobjs_by_type(t->topo, HWLOC_OBJ_PACKAGE); allowed = (hwloc_cpuset_t)hwloc_topology_get_allowed_cpuset(t->topo); for (pkg = 0; pkg < npkgs; pkg++) { - obj = hwloc_get_obj_by_type(t->topo, HWLOC_OBJ_PACKAGE, pkg); + obj = prte_hwloc_base_get_obj_by_type(t->topo, HWLOC_OBJ_PACKAGE, pkg); hwloc_bitmap_and(avail, obj->cpuset, allowed); if (hwloc_bitmap_iszero(avail)) { pmix_asprintf(&tmp2, "%s \n", tmp1, pkg, "NONE"); @@ -352,7 +352,7 @@ void prte_proc_print(char **output, prte_job_t *jdata, prte_proc_t *src) mycpus = hwloc_bitmap_alloc(); hwloc_bitmap_list_sscanf(mycpus, src->cpuset); - npus = hwloc_get_nbobjs_by_type(src->node->topology->topo, HWLOC_OBJ_PU); + npus = 
prte_hwloc_base_get_nbobjs_by_type(src->node->topology->topo, HWLOC_OBJ_PU);
         /* assuming each "core" xml element will take 20 characters. There could be at most npus such elements */
         int sz = sizeof(char) * npus * 20;
         cores = (char*)malloc(sz);
diff --git a/src/runtime/prte_globals.c b/src/runtime/prte_globals.c
index 85e0c937f9..9e42b6f8d4 100644
--- a/src/runtime/prte_globals.c
+++ b/src/runtime/prte_globals.c
@@ -911,6 +911,15 @@ static void tcon(prte_topology_t *t)
 }
 static void tdes(prte_topology_t *t)
 {
+    hwloc_obj_t root;
+
+    /* only touch the root object when a topology is actually present */
+    if (NULL != t->topo) {
+        root = hwloc_get_root_obj(t->topo);
+        if (NULL != root && NULL != root->userdata) {
+            PMIX_RELEASE(root->userdata);
+        }
+    }
     if (NULL != t->topo) {
         hwloc_topology_destroy(t->topo);
     }
diff --git a/src/runtime/prte_init.c b/src/runtime/prte_init.c
index 994b0cb0df..8b5e1b0599 100644
--- a/src/runtime/prte_init.c
+++ b/src/runtime/prte_init.c
@@ -38,6 +38,9 @@
 #ifdef HAVE_SYS_STAT_H
 # include <sys/stat.h>
 #endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif

 #include "src/util/error.h"
 #include "src/util/error_strings.h"
@@ -57,6 +60,7 @@
 #include "src/hwloc/hwloc-internal.h"
 #include "src/prted/pmix/pmix_server.h"
 #include "src/threads/pmix_threads.h"
+#include "src/include/prte_frameworks.h"

 #include "src/mca/base/pmix_base.h"
 #include "src/mca/base/pmix_mca_base_var.h"
@@ -68,14 +72,12 @@
 #include "src/mca/grpcomm/base/base.h"
 #include "src/mca/iof/base/base.h"
 #include "src/mca/odls/base/base.h"
-#include "src/mca/oob/base/base.h"
 #include "src/mca/plm/base/base.h"
 #include "src/mca/pmdl/base/base.h"
 #include "src/mca/prtebacktrace/base/base.h"
 #include "src/mca/prteinstalldirs/base/base.h"
 #include "src/mca/ras/base/base.h"
 #include "src/mca/rmaps/base/base.h"
-#include "src/mca/rtc/base/base.h"
 #include "src/mca/schizo/base/base.h"
 #include "src/mca/state/base/base.h"

@@ -127,19 +129,88 @@ static bool check_exist(char *path)
     return false;
 }

+static void print_error(unsigned major,
+                        unsigned minor,
+                        unsigned release)
+{
+    fprintf(stderr, "************************************************\n");
+    fprintf(stderr, "We have detected that the runtime version\n");
+    fprintf(stderr, "of the PMIx library we were given is binary\n");
+    fprintf(stderr, "incompatible with the version we were built against:\n\n");
+    fprintf(stderr, " Runtime: 0x%x%02x%02x\n", major, minor, release);
+    fprintf(stderr, " Build: 0x%0x\n\n", PMIX_NUMERIC_VERSION);
+    fprintf(stderr, "Please update your LD_LIBRARY_PATH to point\n");
+    fprintf(stderr, "us to the same PMIx version used to build PRRTE.\n");
+    fprintf(stderr, "************************************************\n");
+}
+
 int prte_init_minimum(void)
 {
-    int ret;
+    int ret, n;
     char *path = NULL;
+    char *evar, **prefixes;
+    const char *rvers;
+    char token[100];
+    unsigned int major = 0, minor = 0, release = 0;

     if (min_initialized) {
         return PRTE_SUCCESS;
     }
     min_initialized = true;

+    /* check to see if the version of PMIx we were given in the
+     * library path matches the version we were built against.
+     * Because we are using PMIx internals, we cannot support
+     * cross version operations from inside of PRRTE.
+     * The leading word is width-limited so it cannot overrun token[]. */
+    rvers = PMIx_Get_version();
+    ret = sscanf(rvers, "%99s %u.%u.%u", token, &major, &minor, &release);
+
+    /* check the version triplet - we know that version
+     * 5 and above are not runtime compatible with version
+     * 4 and below. Since PRRTE has a minimum PMIx requirement
+     * in the v4.x series, we only need to check v4 vs 5
+     * and above */
+    if ((PMIX_VERSION_MAJOR > 4 && 4 == major) ||
+        (PMIX_VERSION_MAJOR == 4 && 5 <= major)) {
+        print_error(major, minor, release);
+        return PRTE_ERR_SILENT;
+    }
+
+    /* Protect against the envar version of the Slurm
+     * custom args MCA param. This is an unfortunate
+     * hack that hopefully will eventually go away.
+ * See both of the following for detailed + * explanations and discussion: + * + * https://github.com/openpmix/prrte/issues/1974 + * https://github.com/open-mpi/ompi/issues/12471 + * + * Orgs/users wanting to add custom args to the + * internal "srun" command used to spawn the + * PRRTE daemons must do so via the default MCA + * param files (system or user), or via the + * prterun (or its proxy) cmd line + */ + unsetenv("PRTE_MCA_plm_slurm_args"); + unsetenv("OMPI_MCA_plm_slurm_args"); + /* carry across the toolname */ pmix_tool_basename = prte_tool_basename; + // publish MCA prefixes + prefixes = NULL; + for (n=0; NULL != prte_framework_names[n]; n++) { + if (0 == strcmp("common", prte_framework_names[n])) { + continue; + } + PMIx_Argv_append_nosize(&prefixes, prte_framework_names[n]); + } + evar = PMIx_Argv_join(prefixes, ','); + pmix_setenv("PRTE_MCA_PREFIXES", evar, true, &environ); + free(evar); + PMIx_Argv_free(prefixes); + /* initialize install dirs code */ ret = pmix_mca_base_framework_open(&prte_prteinstalldirs_base_framework, PMIX_MCA_BASE_OPEN_DEFAULT); @@ -484,7 +555,13 @@ void prte_preload_default_mca_params(void) * user already has the param in our environment as their * environment settings override all defaults */ PMIX_LIST_FOREACH(fv, &pfinal, pmix_mca_base_var_file_value_t) { - if (pmix_pmdl_base_check_prte_param(fv->mbvfv_var)) { + if (pmix_pmdl_base_check_pmix_param(fv->mbvfv_var)) { + pmix_asprintf(&tmp, "PMIX_MCA_%s", fv->mbvfv_var); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, fv->mbvfv_value, false); + free(tmp); + } else { pmix_asprintf(&tmp, "PRTE_MCA_%s", fv->mbvfv_var); // set it, but don't overwrite if they already // have a value in our environment @@ -494,12 +571,6 @@ void prte_preload_default_mca_params(void) // or mca frameworks, then we also need to set // the equivalent PMIx value check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); - } else if 
(pmix_pmdl_base_check_pmix_param(fv->mbvfv_var)) { - pmix_asprintf(&tmp, "PMIX_MCA_%s", fv->mbvfv_var); - // set it, but don't overwrite if they already - // have a value in our environment - setenv(tmp, fv->mbvfv_value, false); - free(tmp); } } diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index edb16be9bf..cc34f22815 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -141,33 +141,6 @@ static void opcbfunc(pmix_status_t status, void *cbdata) PRTE_PMIX_WAKEUP_THREAD(lock); } -static void setupcbfunc(pmix_status_t status, pmix_info_t info[], size_t ninfo, - void *provided_cbdata, pmix_op_cbfunc_t cbfunc, void *cbdata) -{ - mylock_t *mylock = (mylock_t *) provided_cbdata; - size_t n; - - if (NULL != info) { - mylock->ninfo = ninfo; - PMIX_INFO_CREATE(mylock->info, mylock->ninfo); - /* cycle across the provided info */ - for (n = 0; n < ninfo; n++) { - PMIX_INFO_XFER(&mylock->info[n], &info[n]); - } - } else { - mylock->info = NULL; - mylock->ninfo = 0; - } - mylock->status = status; - - /* release the caller */ - if (NULL != cbfunc) { - cbfunc(PMIX_SUCCESS, cbdata); - } - - PRTE_PMIX_WAKEUP_THREAD(&mylock->lock); -} - static void spcbfunc(pmix_status_t status, char nspace[], void *cbdata) { prte_pmix_lock_t *lock = (prte_pmix_lock_t *) cbdata; @@ -336,6 +309,10 @@ int main(int argc, char *argv[]) } } + /* do a minimal setup of key infrastructure, including + * parsing the install-level and user-level PRRTE param + * files + */ rc = prte_init_minimum(); if (PRTE_SUCCESS != rc) { return rc; @@ -461,7 +438,7 @@ int main(int argc, char *argv[]) /* parse the input argv to get values, including everyone's MCA params */ PMIX_CONSTRUCT(&results, pmix_cli_result_t); // check for special case of executable immediately following tool - if (proxyrun && '-' != pargv[1][0]) { + if (proxyrun && pargc > 1 && '-' != pargv[1][0]) { results.tail = PMIx_Argv_copy(&pargv[1]); } else { rc = schizo->parse_cli(pargv, &results, PMIX_CLI_WARN); @@ -1147,49 +1124,6 
@@ int main(int argc, char *argv[]) /* give the schizo components a chance to add to the job info */ schizo->job_info(&results, jinfo); - /* pickup any relevant envars */ - ninfo = 4; - PMIX_INFO_CREATE(iptr, ninfo); - flag = true; - PMIX_INFO_LOAD(&iptr[0], PMIX_SETUP_APP_ENVARS, &flag, PMIX_BOOL); - ui32 = geteuid(); - PMIX_INFO_LOAD(&iptr[1], PMIX_USERID, &ui32, PMIX_UINT32); - ui32 = getegid(); - PMIX_INFO_LOAD(&iptr[2], PMIX_GRPID, &ui32, PMIX_UINT32); - PMIX_INFO_LOAD(&iptr[3], PMIX_PERSONALITY, personality, PMIX_STRING); - - PRTE_PMIX_CONSTRUCT_LOCK(&mylock.lock); - ret = PMIx_server_setup_application(prte_process_info.myproc.nspace, iptr, ninfo, setupcbfunc, - &mylock); - if (PMIX_SUCCESS != ret) { - pmix_output(0, "Error setting up application: %s", PMIx_Error_string(ret)); - PRTE_PMIX_DESTRUCT_LOCK(&mylock.lock); - PRTE_UPDATE_EXIT_STATUS(ret); - goto DONE; - } - PRTE_PMIX_WAIT_THREAD(&mylock.lock); - PMIX_INFO_FREE(iptr, ninfo); - if (PMIX_SUCCESS != mylock.status) { - pmix_output(0, "Error setting up application: %s", PMIx_Error_string(mylock.status)); - PRTE_UPDATE_EXIT_STATUS(mylock.status); - PRTE_PMIX_DESTRUCT_LOCK(&mylock.lock); - goto DONE; - } - PRTE_PMIX_DESTRUCT_LOCK(&mylock.lock); - /* transfer any returned ENVARS to the job_info */ - if (NULL != mylock.info) { - for (n = 0; n < mylock.ninfo; n++) { - if (PMIX_CHECK_KEY(&mylock.info[n], PMIX_SET_ENVAR) || - PMIX_CHECK_KEY(&mylock.info[n], PMIX_ADD_ENVAR) || - PMIX_CHECK_KEY(&mylock.info[n], PMIX_UNSET_ENVAR) || - PMIX_CHECK_KEY(&mylock.info[n], PMIX_PREPEND_ENVAR) || - PMIX_CHECK_KEY(&mylock.info[n], PMIX_APPEND_ENVAR)) { - PMIX_INFO_LIST_XFER(ret, jinfo, &mylock.info[n]); - } - } - PMIX_INFO_FREE(mylock.info, mylock.ninfo); - } - /* convert the job info into an array */ PMIX_INFO_LIST_CONVERT(ret, jinfo, &darray); if (PMIX_ERR_EMPTY == ret) { @@ -1272,7 +1206,7 @@ int main(int argc, char *argv[]) } else if (0 == strcmp(opt->values[0], "none")) { pname.rank = PMIX_RANK_INVALID; } else { - 
pname.rank = 0; + pname.rank = strtoul(opt->values[0], NULL, 10); } } else { pname.rank = 0; diff --git a/src/util/proc_info.c b/src/util/proc_info.c index 0923b962eb..8e153fb388 100644 --- a/src/util/proc_info.c +++ b/src/util/proc_info.c @@ -64,6 +64,7 @@ PRTE_EXPORT prte_process_info_t prte_process_info = { .aliases = NULL, .pid = 0, .proc_type = PRTE_PROC_TYPE_NONE, + .my_uri = NULL, .my_port = 0, .tmpdir_base = NULL, .top_session_dir = NULL, diff --git a/src/util/proc_info.h b/src/util/proc_info.h index 85e53ba5af..fe970e5ce7 100644 --- a/src/util/proc_info.h +++ b/src/util/proc_info.h @@ -75,6 +75,7 @@ typedef struct prte_process_info_t { char **aliases; /**< aliases for this node */ pid_t pid; /**< Local process ID for this process */ prte_proc_type_t proc_type; /**< Type of process */ + char *my_uri; /**< My contact info */ uint16_t my_port; /**< TCP port for out-of-band comm */ /* The session directory has the form * ///, where the prefix diff --git a/src/util/prte_cmd_line.h b/src/util/prte_cmd_line.h index 3f657f6ad2..15f5727898 100644 --- a/src/util/prte_cmd_line.h +++ b/src/util/prte_cmd_line.h @@ -15,7 +15,7 @@ * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2017-2022 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -108,6 +108,7 @@ BEGIN_C_DECLS #define PRTE_CLI_SET_CWD_SESSION "set-cwd-to-session-dir" // none #define PRTE_CLI_ENABLE_RECOVERY "enable-recovery" // none #define PRTE_CLI_DISABLE_RECOVERY "disable-recovery" // none +#define PRTE_CLI_MEM_ALLOC_KIND "memory-alloc-kinds" // required // Placement options #define PRTE_CLI_MAPBY "map-by" // required