Description
I'm getting memory leaks when calling mpi_file_write_all, using Open MPI 4.1.5 built from source downloaded from here. Eventually the process runs out of memory and is killed by the operating system. Valgrind also reports memory leaks from mpi_init and mpi_finalize, but I suspect it's the ones in mpi_file_write_all that are causing the major problems.
I built Open MPI with Intel 2020.2, like so:
$ export FC=ifort CC=icc
$ ./configure --prefix=/home/andrew/install/Debug/openmpi-4.1.5 --with-psm2 --disable-psm2-version-check --with-slurm --with-pmi=/mnt/beegfs/software/slurm/19.05.2/../prod --enable-mem-debug --enable-debug
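(To confirm that the debug options actually took effect in the install, the output of ompi_info can be grepped; the exact wording of these lines may vary between versions:)
$ /home/andrew/install/Debug/openmpi-4.1.5/bin/ompi_info | grep -i debug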
System
- Operating system/version: CentOS 7
- Computer hardware: Intel Xeon E7
- Network type:
Details of the problem
The following code, test-leak.f90, reproduces the problem:
program test_leak
  use mpi
  implicit none

  integer, parameter :: nx = 10, ng = 1, ncopies = 4
  integer :: ierr, comm, nproc, rank, fh
  integer :: sizes(1), subsizes(1), starts(1), subarray, distribution
  integer(kind=mpi_offset_kind) :: loc
  double precision :: array(1-ng:nx+ng)
  character(len=13) :: filename = 'test-leak.dat'
  logical :: exists

  comm = mpi_comm_world
  call mpi_init(ierr)
  call mpi_comm_size(comm, nproc, ierr)
  call mpi_comm_rank(comm, rank, ierr)

  array = 0.0d0

  ! Delete any stale output file from a previous run (rank 0 only, to
  ! avoid racing deletes), then open collectively.
  if (rank == 0) then
    inquire(file=filename, exist=exists)
    if (exists) call mpi_file_delete(filename, mpi_info_null, ierr)
  end if
  call mpi_barrier(comm, ierr)
  call mpi_file_open(comm, filename, mpi_mode_create+mpi_mode_wronly, &
                     mpi_info_null, fh, ierr)

  ! Memory layout of the local buffer: nx interior points with ng ghost
  ! cells on each side; only the interior is written.
  sizes = [nx + 2*ng]
  subsizes = [nx]
  starts = [ng]
  call mpi_type_create_subarray(1, sizes, subsizes, starts, &
       mpi_order_fortran, mpi_double_precision, subarray, ierr)
  call mpi_type_commit(subarray, ierr)

  ! File layout: each rank writes its nx points into a global array of
  ! nproc*nx points.
  sizes = [nproc*nx]
  starts = [nx*rank]
  call mpi_type_create_subarray(1, sizes, subsizes, starts, &
       mpi_order_fortran, mpi_double_precision, distribution, ierr)
  call mpi_type_commit(distribution, ierr)

  ! Each iteration resets the view at a new offset and performs one
  ! collective write; the leak grows with the number of iterations.
  ! With nproc=4 this loop executes 241 times, matching the 241 blocks
  ! in the valgrind report below.
  do loc = 0, (ncopies-1)*nproc*nx*8, ncopies
    call mpi_file_set_view(fh, loc, mpi_byte, distribution, 'native', &
                           mpi_info_null, ierr)
    call mpi_file_write_all(fh, array, 1, subarray, mpi_status_ignore, ierr)
  end do

  call mpi_type_free(distribution, ierr)
  call mpi_type_free(subarray, ierr)
  call mpi_file_close(fh, ierr)
  call mpi_finalize(ierr)
end program test_leak
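The growth is also visible without valgrind; here is a minimal Linux-only sketch (a hypothetical helper, not part of the reproducer above) that prints VmRSS from /proc/self/status so it can be called after each collective write:
! Hypothetical helper (Linux only): report the resident set size so the
! growth can be watched per iteration without valgrind.
subroutine print_rss(rank)
  implicit none
  integer, intent(in) :: rank
  character(len=256) :: line
  integer :: u, ios
  open(newunit=u, file='/proc/self/status', action='read', iostat=ios)
  if (ios /= 0) return
  do
    read(u, '(a)', iostat=ios) line
    if (ios /= 0) exit
    if (line(1:6) == 'VmRSS:') then
      print '(a,i0,2a)', 'rank ', rank, ': ', trim(line)
      exit
    end if
  end do
  close(u)
end subroutine print_rss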
I'm using the following CMakeLists.txt to build the executable:
cmake_minimum_required(VERSION 3.24)
set(CMAKE_BUILD_TYPE Debug CACHE STRING "" FORCE)
set(CMAKE_Fortran_COMPILER ifort CACHE STRING "" FORCE)
set(CMAKE_C_COMPILER icc CACHE STRING "" FORCE)
set(CMAKE_CXX_COMPILER icpc CACHE STRING "" FORCE)
project(testompi Fortran)
find_package(MPI REQUIRED)
add_executable(test_ompi test-leak.f90)
target_link_libraries(test_ompi PRIVATE MPI::MPI_Fortran)
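(To rule out the CMake setup, the program can also be built directly with the wrapper compiler, assuming the Open MPI 4.1.5 install's bin directory is first on PATH:)
$ mpifort -g -O0 -o test_ompi test-leak.f90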
Compile and run like so:
$ cmake -B build
$ cmake --build build
$ mpirun --oversubscribe --mca btl ^openib,ofi -n 4 valgrind --num-callers=100 --leak-check=full --show-leak-kinds=all --track-origins=yes --verbose --suppressions=/home/andrew/install/Debug/openmpi-4.1.5/share/openmpi/openmpi-valgrind.supp --log-file=valgrind-out-%p.txt ./build/test_ompi
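As a cross-check that the program itself releases all of its MPI handles, Open MPI's built-in handle-leak report can be enabled (assuming the mpi_show_handle_leaks parameter is available in this 4.x build; a clean run should report nothing at MPI_Finalize):
$ mpirun --oversubscribe --mca btl ^openib,ofi --mca mpi_show_handle_leaks 1 -n 4 ./build/test_ompi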
Here's some sample output from valgrind:
==27384== 960 bytes in 240 blocks are definitely lost in loss record 53 of 69
==27384== at 0x4C29E63: malloc (vg_replace_malloc.c:309)
==27384== by 0x6C61EA9: opal_malloc (malloc.c:101)
==27384== by 0x1194353C: ???
==27384== by 0x180B6883: ???
==27384== by 0x180AED65: ???
==27384== by 0x11952030: ???
==27384== by 0x10A130E8: ???
==27384== by 0x565855A: PMPI_File_write_all (pfile_write_all.c:74)
==27384== by 0x52F57D6: mpi_file_write_all (pfile_write_all_f.c:82)
==27384== by 0x409B85: MAIN__ (test-leak.f90:36)
==27384== by 0x409411: main (in /home/andrew/open-mpi-bugs/memory-leaks/build/test_ompi)
...
==27384== 23,136 bytes in 241 blocks are indirectly lost in loss record 64 of 69
==27384== at 0x4C29E63: malloc (vg_replace_malloc.c:309)
==27384== by 0x6C61EA9: opal_malloc (malloc.c:101)
==27384== by 0x6BF9EC4: opal_datatype_optimize_short (opal_datatype_optimize.c:51)
==27384== by 0x6BF9EC4: opal_datatype_commit (opal_datatype_optimize.c:310)
==27384== by 0x5741866: ompi_datatype_commit (ompi_datatype.h:171)
==27384== by 0x5741866: ompi_coll_base_allgatherv_intra_basic_default (coll_base_allgatherv.c:639)
==27384== by 0x14D1536A: ???
==27384== by 0x14D0BE08: ???
==27384== by 0x180AFA4D: ???
==27384== by 0x11952030: ???
==27384== by 0x10A130E8: ???
==27384== by 0x565855A: PMPI_File_write_all (pfile_write_all.c:74)
==27384== by 0x52F57D6: mpi_file_write_all (pfile_write_all_f.c:82)
==27384== by 0x409B85: MAIN__ (test-leak.f90:36)
==27384== by 0x409411: main (in /home/andrew/open-mpi-bugs/memory-leaks/build/test_ompi)
...
==27384== LEAK SUMMARY:
==27384== definitely lost: 202,280 bytes in 764 blocks
==27384== indirectly lost: 183,408 bytes in 968 blocks
==27384== possibly lost: 1,920 bytes in 2 blocks
==27384== still reachable: 30,864 bytes in 107 blocks
==27384== suppressed: 0 bytes in 0 blocks
==27384==
==27384== ERROR SUMMARY: 59 errors from 59 contexts (suppressed: 0 from 0)