From ae35d7552ebbeb1de7ddcba49a8e4944c5f5e4a4 Mon Sep 17 00:00:00 2001 From: xbw <78337767+xbw22109@users.noreply.github.com> Date: Mon, 19 May 2025 02:58:45 +0000 Subject: [PATCH] Fix singleton tmp files cleanup Only in singleton mode, directory cleaning needs to be done by the program itself. There are some problems with these parts of the code that cause the directory to not be cleaned. This commit fixes *some of* these issues. 1. btl/sm will not unlink its segments file. We never noticed this in non-singleton mode because pmix cleaned it up for us. After this, we can clean up the segment file created by sm in /dev/shm.(when singletons normally terminated) 2. Modified the singleton session directory structure and enabled recursive deletion. After this, we can cleanup the session dir. (when singletons normally terminated) 3. Fix a bug - local peer number of a singleton should be 0, not 1. After this, the btl/sm and btl/smcuda components will return NULL during their init process and will be automatically closed. btl/sm segment file in /dev/shm will never be created in singleton mode now. Signed-off-by: xbw <78337767+xbw22109@users.noreply.github.com> --- ompi/runtime/ompi_rte.c | 63 +++++++++++++++++++++--------- opal/mca/btl/sm/btl_sm_component.c | 11 ++++++ opal/mca/btl/sm/btl_sm_module.c | 3 -- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 4e7719c73af..5cc3663d1e1 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -68,6 +68,7 @@ opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX}; * infrastructure that manages its structure (e.g., OpenPMIx). If we setup this * session directory structure, then we shall cleanup after ourselves. */ +static bool destroy_top_session_dir = false; static bool destroy_job_session_dir = false; static bool destroy_proc_session_dir = false; @@ -888,8 +889,13 @@ int ompi_rte_init(int *pargc, char ***pargv) if (0 == opal_process_info.num_local_peers) { if (NULL != peers) { opal_process_info.num_local_peers = opal_argv_count(peers) - 1; + } else if (opal_process_info.is_singleton) { + /* if we are a singleton, then we have no local peers */ + opal_process_info.num_local_peers = 0; } else { - opal_process_info.num_local_peers = 1; + ret = OPAL_ERR_BAD_PARAM; + error = "local peers"; + goto error; } } /* if my local rank if too high, then that's an error */ @@ -983,25 +989,28 @@ int ompi_rte_finalize(void) { /* cleanup the session directory we created */ + if (NULL != opal_process_info.proc_session_dir && destroy_proc_session_dir) { + opal_os_dirpath_destroy(opal_process_info.proc_session_dir, + true, check_file); + free(opal_process_info.proc_session_dir); + opal_process_info.proc_session_dir = NULL; + destroy_proc_session_dir = false; + } + if (NULL != opal_process_info.job_session_dir && destroy_job_session_dir) { opal_os_dirpath_destroy(opal_process_info.job_session_dir, - false, check_file); + true, check_file); free(opal_process_info.job_session_dir); opal_process_info.job_session_dir = NULL; destroy_job_session_dir = false; } - - if (NULL != opal_process_info.top_session_dir) { + + if (NULL != opal_process_info.top_session_dir && destroy_top_session_dir) { + opal_os_dirpath_destroy(opal_process_info.top_session_dir, + true, check_file); free(opal_process_info.top_session_dir); opal_process_info.top_session_dir = NULL; - } - - if (NULL != opal_process_info.proc_session_dir && destroy_proc_session_dir) { - opal_os_dirpath_destroy(opal_process_info.proc_session_dir, - false, check_file); - free(opal_process_info.proc_session_dir); - opal_process_info.proc_session_dir = NULL; - destroy_proc_session_dir = false; + destroy_top_session_dir = false; } if (NULL != opal_process_info.app_sizes) { @@ -1165,27 +1174,45 @@ void ompi_rte_wait_for_debugger(void) static int _setup_top_session_dir(char **sdir) { + /* + * Use a session directory structure similar to prrte (create only one + * directory for the top session) so that it can be cleaned up correctly + * when terminated. + */ char *tmpdir; + int rc; + uid_t uid = geteuid(); + pid_t pid = getpid(); if( NULL == (tmpdir = getenv("TMPDIR")) ) if( NULL == (tmpdir = getenv("TEMP")) ) if( NULL == (tmpdir = getenv("TMP")) ) tmpdir = "/tmp"; - *sdir = strdup(tmpdir); + if (0 > opal_asprintf(sdir, "%s/%s.%s.%lu.%lu", + tmpdir, "ompi", + opal_process_info.nodename, + (unsigned long)pid, (unsigned long) uid)) { + opal_process_info.top_session_dir = NULL; + return OPAL_ERR_OUT_OF_RESOURCE; + } + rc = opal_os_dirpath_create(opal_process_info.top_session_dir, 0755); + if (OPAL_SUCCESS != rc) { + // could not create top session dir + free(opal_process_info.top_session_dir); + opal_process_info.top_session_dir = NULL; + return rc; + } + destroy_top_session_dir = true; return OPAL_SUCCESS; } static int _setup_job_session_dir(char **sdir) { int rc; - /* get the effective uid */ - uid_t uid = geteuid(); - if (0 > opal_asprintf(sdir, "%s/ompi.%s.%lu/jf.0/%u", + if (0 > opal_asprintf(sdir, "%s/%u", opal_process_info.top_session_dir, - opal_process_info.nodename, - (unsigned long)uid, opal_process_info.my_name.jobid)) { opal_process_info.job_session_dir = NULL; return OPAL_ERR_OUT_OF_RESOURCE; diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c index 21e1bcf2a6c..82f04df9466 100644 --- a/opal/mca/btl/sm/btl_sm_component.c +++ b/opal/mca/btl/sm/btl_sm_component.c @@ -298,6 +298,11 @@ static int mca_btl_sm_deregister_mem_knem(struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } +static void mca_btl_sm_component_finalize(void *data /*data unused*/) { + opal_shmem_unlink(&mca_btl_sm_component.seg_ds); + opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds); +} + /* * SM component initialization */ @@ -419,6 +424,12 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab /* set flag indicating btl not inited */ mca_btl_sm.btl_inited = false; + /* + * Use a method similar to `mca_btl_smcuda_component_init` to register segment finalize + * to opal and release it before shmem is closed. + */ + opal_finalize_register_cleanup(mca_btl_sm_component_finalize); + return btls; failed: opal_shmem_unlink(&component->seg_ds); diff --git a/opal/mca/btl/sm/btl_sm_module.c b/opal/mca/btl/sm/btl_sm_module.c index 3cb69c8d3da..27859bb4a2c 100644 --- a/opal/mca/btl/sm/btl_sm_module.c +++ b/opal/mca/btl/sm/btl_sm_module.c @@ -347,9 +347,6 @@ static int sm_finalize(struct mca_btl_base_module_t *btl) free(component->fbox_in_endpoints); component->fbox_in_endpoints = NULL; - opal_shmem_unlink(&mca_btl_sm_component.seg_ds); - opal_shmem_segment_detach(&mca_btl_sm_component.seg_ds); - return OPAL_SUCCESS; }