Skip to content

Read after write() but before wait_all() causes seg faults #185

Open
@wangvsa

Description

@wangvsa

Bug Report

I'm trying to optimize the CFITSIO PDC driver for Montage. Some Montage components perform a large number of small writes followed by a single read on the same file.
All I/O operations are file-per-process. So we don't need to perform a wait() for each write() call. We can do a wait_all() at the flush/close time to reduce overhead.

The issue is that a read issued before wait_all() can sometimes cause a seg fault.
write->write->write->...->read->wait_all()

To Reproduce

I attached a simple program to help with debugging. Running it on a single node with a small number of processes triggers the seg fault.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>

#include "mpi.h"
#include "pdc.h"


/**
 * write -> read -> write -> wait_all()
 */

/* Rank of this process and number of processes in MPI_COMM_WORLD; set in main(). */
int mpi_rank;
int mpi_size;
/* Duplicate of MPI_COMM_WORLD; created in main(). */
MPI_Comm mpi_comm;

/**
 * Reproducer body: issue reads while earlier writes are still outstanding.
 *
 * Sequence of PDC calls:
 *   1. start `iterations` write transfers of chunk_size bytes each, at
 *      consecutive remote offsets, without waiting on any of them;
 *   2. start, wait on, and close one read transfer of the first chunk;
 *   3. start one more write transfer placed after the previous writes;
 *   4. wait on all write transfers with a single
 *      PDCregion_transfer_wait_all() call, then close each of them.
 *
 * Failures are reported on stdout and the sequence continues, so that the
 * call pattern stays the same regardless of intermediate errors.
 *
 * @param obj_id      PDC object that is written to and read from
 * @param iterations  number of write transfers started before the read
 */
void write_read_wait_all(pdcid_t obj_id, int iterations) {
    pdcid_t region_local, region_remote;
    perr_t  ret;

    int      ndim          = 1;
    uint64_t offset_local  = 0;
    uint64_t offset_remote = 0;
    uint64_t chunk_size    = 2880;

    /* One slot per write transfer: `iterations` in the loop plus the final one. */
    size_t   n_writes = (size_t)iterations + 1;

    char    *data_out = malloc(chunk_size);
    char    *data_in  = malloc(chunk_size);
    pdcid_t *tids     = malloc(n_writes * sizeof *tids);
    if (data_out == NULL || data_in == NULL || tids == NULL) {
        printf("Failed to allocate buffers\n");
        free(data_in);
        free(data_out);
        free(tids);
        return;
    }
    memset(data_out, 'a', chunk_size);

    /*
     * NOTE(review): the region handles created below are never released in
     * this function -- confirm whether they should be closed once the
     * corresponding transfer has been closed.
     */
    for (int i = 0; i < iterations; i++) {
        /*
         * NOTE(review): offset_local advances every iteration although
         * data_out holds only one chunk -- confirm how PDC interprets the
         * local region offset relative to the buffer pointer.
         */
        region_local  = PDCregion_create(ndim, &offset_local, &chunk_size);
        region_remote = PDCregion_create(ndim, &offset_remote, &chunk_size);
        offset_local  += chunk_size;
        offset_remote += chunk_size;

        tids[i] = PDCregion_transfer_create(data_out, PDC_WRITE, obj_id, region_local, region_remote);
        if (tids[i] == 0)
            printf("transfer request creation failed\n");

        ret = PDCregion_transfer_start(tids[i]);
        if (ret != SUCCEED)
            printf("Failed to start transfer\n");
    }

    /* Read the first chunk back while the writes above are still in flight. */
    printf("rank %d read before wait_all()\n", mpi_rank);
    fflush(stdout);
    offset_local  = 0;
    offset_remote = 0;
    region_local  = PDCregion_create(ndim, &offset_local, &chunk_size);
    region_remote = PDCregion_create(ndim, &offset_remote, &chunk_size);

    int     read_ok  = 1;
    pdcid_t read_tid = PDCregion_transfer_create(data_in, PDC_READ, obj_id, region_local, region_remote);
    if (read_tid == 0) {
        printf("transfer request creation failed\n");
        read_ok = 0;
    }
    ret = PDCregion_transfer_start(read_tid);
    if (ret != SUCCEED) {
        printf("Failed to start transfer\n");
        read_ok = 0;
    }
    ret = PDCregion_transfer_wait(read_tid);
    if (ret != SUCCEED) {
        printf("Failed to transfer wait\n");
        read_ok = 0;
    }
    ret = PDCregion_transfer_close(read_tid);
    if (ret != SUCCEED)
        printf("region transfer close failed\n");

    /* data_in is only meaningful if every step of the read succeeded. */
    if (read_ok)
        printf("rank %d read success: %c!\n", mpi_rank, data_in[0]);
    else
        printf("rank %d read failed\n", mpi_rank);
    fflush(stdout);

    /* Write one more time, placed right after the chunks written in the loop. */
    offset_local  = 0;
    offset_remote = chunk_size * iterations;
    region_local  = PDCregion_create(ndim, &offset_local, &chunk_size);
    region_remote = PDCregion_create(ndim, &offset_remote, &chunk_size);
    tids[iterations] = PDCregion_transfer_create(data_out, PDC_WRITE, obj_id, region_local, region_remote);
    if (tids[iterations] == 0)
        printf("transfer request creation failed\n");
    ret = PDCregion_transfer_start(tids[iterations]);
    if (ret != SUCCEED)
        printf("Failed to start transfer\n");
    printf("rank %d final write transfer started.\n", mpi_rank);
    fflush(stdout);

    /* Single wait covering every write transfer started above. */
    ret = PDCregion_transfer_wait_all(tids, (iterations + 1));
    if (ret != SUCCEED) {
        printf("Failed to transfer wait\n");
    }

    /* Close each write transfer by its own id. */
    for (int i = 0; i < iterations + 1; i++) {
        ret = PDCregion_transfer_close(tids[i]);
        if (ret != SUCCEED) {
            printf("region transfer close failed\n");
        }
    }

    free(data_in);
    free(data_out);
    free(tids);
}

/**
 * Entry point of the reproducer.
 *
 * Initializes MPI and PDC, creates a container and one object per rank
 * (named "obj-var-<rank>"), runs write_read_wait_all() on that object with
 * 1000 iterations, then closes the handles and finalizes MPI. Failures are
 * reported on stdout and execution continues.
 *
 * @return 0
 */
int main(int argc, char** argv) {

    pdcid_t pdc_id, cont_prop, cont_id;
    pdcid_t obj_prop, obj_id;

    uint64_t  dims[1] = {PDC_SIZE_UNLIMITED};

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm);

    // create a pdc
    pdc_id = PDCinit("pdc");

    // create a container property
    cont_prop = PDCprop_create(PDC_CONT_CREATE, pdc_id);
    if (cont_prop <= 0) {
        printf("Fail to create container property @ line  %d!\n", __LINE__);
    }
    // create a container
    cont_id = PDCcont_create_col("c1", cont_prop);
    if (cont_id <= 0) {
        printf("Fail to create container @ line  %d!\n", __LINE__);
    }

    // create an object property
    obj_prop = PDCprop_create(PDC_OBJ_CREATE, pdc_id);
    if (obj_prop <= 0) {
        printf("Fail to create object property @ line  %d!\n", __LINE__);
    }
    PDCprop_set_obj_dims(obj_prop, 1, dims);
    PDCprop_set_obj_type(obj_prop, PDC_CHAR);
    PDCprop_set_obj_time_step(obj_prop, 0);
    PDCprop_set_obj_user_id(obj_prop, getuid());
    PDCprop_set_obj_app_name(obj_prop, "producer");

    // one object per rank, so all I/O is file-per-process
    char obj_name[100] = {0};
    snprintf(obj_name, sizeof obj_name, "obj-var-%d", mpi_rank);
    PDCprop_set_obj_tags(obj_prop, obj_name);
    obj_id = PDCobj_create(cont_id, obj_name, obj_prop);
    if (obj_id <= 0) {
        printf("Fail to create object @ line  %d!\n", __LINE__);
    }

    write_read_wait_all(obj_id, 1000);

    if (PDCobj_close(obj_id) < 0) {
        printf("fail to close obj_id\n");
    }

    // the object property was created above and must be released as well
    if (PDCprop_close(obj_prop) < 0) {
        printf("Fail to close property @ line %d\n", __LINE__);
    }

    if (PDCprop_close(cont_prop) < 0) {
        printf("Fail to close property @ line %d\n", __LINE__);
    }

    // NOTE(review): cont_id and the duplicated communicator mpi_comm are not
    // released before shutdown -- confirm whether they should be closed/freed here.

    if (PDCclose(pdc_id) < 0) {
        printf("fail to close PDC\n");
    }

    MPI_Finalize();
    return 0;
}

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions