/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "rdma_impl.h"
#include "pmi.h"

/* global RDMA structure for the local process */
MPIDI_CH3I_RDMA_Process_t MPIDI_CH3I_RDMA_Process;

static int MPIDI_CH3I_PG_Compare_ids(void * id1, void * id2);
static int MPIDI_CH3I_PG_Destroy(MPIDI_PG_t * pg, void * id);

/*
 * generate_shm_string - write the name of the shared memory segment that
 * this process will create into str.
 *
 * The form of the string depends on the shared memory subsystem selected at
 * compile time:
 *   USE_WINDOWS_SHM - a freshly created GUID in its textual form
 *                     (36 characters plus the terminating NUL)
 *   USE_POSIX_SHM   - "/mpich_shm_<pid>"
 *   USE_SYSV_SHM    - "<pid>" in decimal
 *
 * str is written with sprintf() and no length is passed in, so the caller
 * must supply a buffer that is large enough for the selected format.
 */
static void generate_shm_string(char *str)
{
#ifdef USE_WINDOWS_SHM
    UUID guid;
    UuidCreate(&guid);
    sprintf(str, "%08lX-%04X-%04x-%02X%02X-%02X%02X%02X%02X%02X%02X",
	guid.Data1, guid.Data2, guid.Data3,
	guid.Data4[0], guid.Data4[1], guid.Data4[2], guid.Data4[3],
	guid.Data4[4], guid.Data4[5], guid.Data4[6], guid.Data4[7]);
    MPIU_DBG_PRINTF(("GUID = %s\n", str));
#elif defined (USE_POSIX_SHM)
    /* pid_t is not guaranteed to be int; convert so the argument matches %d */
    sprintf(str, "/mpich_shm_%d", (int) getpid());
#elif defined (USE_SYSV_SHM)
    /* pid_t is not guaranteed to be int; convert so the argument matches %d */
    sprintf(str, "%d", (int) getpid());
#else
#error No shared memory subsystem defined
#endif
}

/*
 * Process group information cached by MPIDI_CH3I_RDMA_init_process_group()
 * once it has completed successfully, and read back by MPIDI_CH3I_RDMA_init().
 */
/* rank of this process as reported by PMI_Get_rank() */
static int cached_pg_rank;
/* number of processes as reported by PMI_Get_size() */
static int cached_pg_size;
/* process group object created by MPIDI_PG_Create() */
static MPIDI_PG_t * cached_pg;

/*
 * MPIDI_CH3I_RDMA_init_process_group - initialize PMI and create the process
 * group object for the local process.
 *
 * The routine
 *   - initializes PMI and queries the rank and size of the process group,
 *   - fetches the process group id from PMI,
 *   - initializes the process group tracking subsystem with this file's
 *     compare and destroy callbacks,
 *   - creates the process group object and stores the name of the PMI
 *     key-value space (KVS) in it,
 *   - sets the default RDMA wait spin/yield counts, and
 *   - caches the group, rank and size in file-scope variables for use by
 *     MPIDI_CH3I_RDMA_init().
 *
 * Parameters:
 *   has_parent  - passed through to PMI_Init()
 *   pg_pptr     - (out) the new process group; only written on success
 *   pg_rank_ptr - (out) rank of this process; only written on success
 *
 * Returns MPI_SUCCESS on success and an MPI error code otherwise.  On failure
 * everything allocated by this routine is released again.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RDMA_init_process_group
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RDMA_init_process_group(int * has_parent, MPIDI_PG_t ** pg_pptr, int * pg_rank_ptr)
{
    int pmi_errno = PMI_SUCCESS;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_PG_t * pg = NULL;
    int rc;
    int pg_rank, pg_size;
    char * pg_id = NULL;
    int pg_id_sz;
    int kvs_name_sz;

    /*
     * Extract process group related information from PMI
     */
    rc = PMI_Init(has_parent);
    if (rc != 0)
    {
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_init", "**pmi_init %d", rc);
	return mpi_errno;
    }
    rc = PMI_Get_rank(&pg_rank);
    if (rc != 0)
    {
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_get_rank", "**pmi_get_rank %d", rc);
	return mpi_errno;
    }
    rc = PMI_Get_size(&pg_size);
    if (rc != 0)
    {
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_get_size", "**pmi_get_size %d", rc);
	return mpi_errno;
    }

    /*
     * Get the process group id
     */
    pmi_errno = PMI_Get_id_length_max(&pg_id_sz);
    if (pmi_errno != PMI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
					 "**pmi_get_id_length_max", "**pmi_get_id_length_max %d", pmi_errno);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }

    pg_id = MPIU_Malloc(pg_id_sz + 1);
    if (pg_id == NULL)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", NULL);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }
    
    pmi_errno = PMI_Get_id(pg_id, pg_id_sz);
    if (pmi_errno != PMI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_get_id",
					 "**pmi_get_id %d", pmi_errno);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }

    /*
     * Initialize the process group tracking subsystem
     */
    mpi_errno = MPIDI_PG_Init(MPIDI_CH3I_PG_Compare_ids, MPIDI_CH3I_PG_Destroy);
    if (mpi_errno != MPI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	/* chain the error returned by MPIDI_PG_Init rather than dropping it */
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
					 "**ch3|pg_init", NULL);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }
    
    /*
     * Create a new structure to track the process group
     */
    mpi_errno = MPIDI_PG_Create(pg_size, pg_id, &pg);
    if (mpi_errno != MPI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	/* chain the error returned by MPIDI_PG_Create rather than dropping it */
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
					 "**ch3|pg_create", NULL);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }
    /* Set before any further failure can occur: MPIDI_CH3I_PG_Destroy
     * inspects this field to decide whether there is a name to free. */
    pg->ch.kvs_name = NULL;

    /*
     * Get the name of the key-value space (KVS)
     */
    pmi_errno = PMI_KVS_Get_name_length_max(&kvs_name_sz);
    if (pmi_errno != PMI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
					 "**pmi_kvs_get_name_length_max", "**pmi_kvs_get_name_length_max %d", pmi_errno);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }
    
    pg->ch.kvs_name = MPIU_Malloc(kvs_name_sz + 1);
    if (pg->ch.kvs_name == NULL)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", NULL);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }
    
    pmi_errno = PMI_KVS_Get_my_name(pg->ch.kvs_name, kvs_name_sz);
    if (pmi_errno != PMI_SUCCESS)
    {
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
					 "**pmi_kvs_get_my_name", "**pmi_kvs_get_my_name %d", pmi_errno);
	goto fn_fail;
	/* --END ERROR HANDLING-- */
    }

    /*MPIU_Timer_init(pg_rank, pg_size);*/

    pg->ch.nRDMAWaitSpinCount = MPIDI_CH3I_SPIN_COUNT_DEFAULT;
    pg->ch.nRDMAWaitYieldCount = MPIDI_CH3I_YIELD_COUNT_DEFAULT;

    *pg_pptr = pg;
    *pg_rank_ptr = pg_rank;

    /* remember the group so that MPIDI_CH3I_RDMA_init() can pick it up */
    cached_pg_rank = pg_rank;
    cached_pg_size = pg_size;
    cached_pg = pg;

fn_exit:
    /* MPI_SUCCESS on the normal path, the recorded error code after fn_fail */
    return mpi_errno;

fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    if (pg != NULL)
    {
	/* The id was handed to the process group by MPIDI_PG_Create; it is
	 * presumably released through the MPIDI_CH3I_PG_Destroy callback, so
	 * it must not be freed a second time here. */
	MPIDI_PG_Destroy(pg);
    }
    else if (pg_id != NULL)
    {
	/* no process group took ownership of the id buffer */
	MPIU_Free(pg_id);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

/* init must allocate RDMA memory and initialize the queues and other structures inside it */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RDMA_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RDMA_init()
{
    int error;

    int pg_rank, pg_size;
    MPIDI_PG_t * pg;
    MPIDI_VC_t * vc;

    char * key;
    char * val;
    int key_max_sz;
    int val_max_sz;

    char shmemkey[MPIDI_MAX_SHM_NAME_LENGTH];
    int i, j, k;
    int shm_block;

    pg = cached_pg;
    pg_rank = cached_pg_rank;
    pg_size = cached_pg_size;

    MPIDI_CH3I_RDMA_Process.nShmEagerLimit = MPIDI_SHM_EAGER_LIMIT;
#ifdef HAVE_SHARED_PROCESS_READ
    MPIDI_CH3I_RDMA_Process.nShmRndvLimit = MPIDI_SHM_RNDV_LIMIT;
#endif
    MPIDI_CH3I_RDMA_Process.addr = NULL;
#ifdef USE_POSIX_SHM
    MPIDI_CH3I_RDMA_Process.key[0] = '\0';
    MPIDI_CH3I_RDMA_Process.id = -1;
#elif defined (USE_SYSV_SHM)
    MPIDI_CH3I_RDMA_Process.key = -1;
    MPIDI_CH3I_RDMA_Process.id = -1;
#elif defined (USE_WINDOWS_SHM)
    MPIDI_CH3I_RDMA_Process.key[0] = '\0';
    MPIDI_CH3I_RDMA_Process.id = NULL;
#else
#error No shared memory subsystem defined
#endif
    MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = MPIDI_CH3I_SPIN_COUNT_DEFAULT;
    MPIDI_CH3I_RDMA_Process.nShmWaitYieldCount = MPIDI_CH3I_YIELD_COUNT_DEFAULT;

    /* initialize the shared memory */
    shm_block = sizeof(MPIDI_CH3I_SHM_Queue_t) * pg_size; 

    if (pg_size > 1)
    {
	/* Allocate space for pmi keys and values */
	error = PMI_KVS_Get_key_length_max(&key_max_sz);
	if (error != PMI_SUCCESS)
	{
	}
	key_max_sz++;
	key = MPIU_Malloc(key_max_sz);
	if (key == NULL)
	{
	    error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "pmi key");
	    return error;
	}
	error = PMI_KVS_Get_value_length_max(&val_max_sz);
	if (error != PMI_SUCCESS)
	{
	}
	val_max_sz++;
	val = MPIU_Malloc(val_max_sz);
	if (val == NULL)
	{
	    error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "pmi value");
	    return error;
	}

	if (pg_rank == 0)
	{
	    generate_shm_string(shmemkey);
	    MPIU_Strncpy(key, "SHMEMKEY", key_max_sz);
	    MPIU_Strncpy(val, shmemkey, val_max_sz);
	    error = PMI_KVS_Put(pg->ch.kvs_name, key, val);
	    if (error != 0)
	    {
		error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_put", "**pmi_kvs_put %d", error);
		return error;
	    }
	    error = PMI_KVS_Commit(pg->ch.kvs_name);
	    if (error != 0)
	    {
		error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_commit", "**pmi_kvs_commit %d", error);
		return error;
	    }
	    error = PMI_Barrier();
	    if (error != 0)
	    {
		error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
		return error;
	    }
	}
	else
	{
	    MPIU_Strncpy(key, "SHMEMKEY", key_max_sz);
	    error = PMI_Barrier();
	    if (error != 0)
	    {
		error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
		return error;
	    }
	    error = PMI_KVS_Get(pg->ch.kvs_name, key, val, val_max_sz);
	    if (error != 0)
	    {
		error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_get", "**pmi_kvs_get %d", error);
		return error;
	    }
	    MPIU_Strncpy(shmemkey, val, val_max_sz);
	}

	MPIU_Free(val);
	MPIU_Free(key);

	MPIU_DBG_PRINTF(("KEY = %s\n", shmemkey));
#ifdef USE_POSIX_SHM
	MPIU_Strncpy(MPIDI_CH3I_RDMA_Process.key, shmemkey, MPIDI_MAX_SHM_NAME_LENGTH);
#elif defined (USE_SYSV_SHM)
	MPIDI_CH3I_RDMA_Process.key = atoi(shmemkey);
#elif defined (USE_WINDOWS_SHM)
	MPIU_Strncpy(MPIDI_CH3I_RDMA_Process.key, shmemkey, MPIDI_MAX_SHM_NAME_LENGTH);
#else
#error No shared memory subsystem defined
#endif

	error = MPIDI_CH3I_SHM_Get_mem( pg, pg_size * shm_block, pg_rank, pg_size, TRUE, &MPIDI_CH3I_RDMA_Process.addr );
    }
    else
    {
	error = MPIDI_CH3I_SHM_Get_mem( pg, shm_block, 0, 1, FALSE, &MPIDI_CH3I_RDMA_Process.addr );
    }
    if (error != MPI_SUCCESS)
    {
	error = MPIR_Err_create_code(error, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "shared memory block");
	return error;
    }

    /* initialize each shared memory queue */
    for (i=0; i<MPIDI_PG_Get_size(pg); i++)
    {
	/* FIXME: Use vc instead of vcr to set all the vcs to active.
	 * This is necessary to prevent the close protocol from hanging
	 * until the shm shutdown code can be debugged.
	 */
	/*MPIDI_PG_Get_vcr(pg, i, &vc);*/
	MPIDI_PG_Get_vc(pg, i, &vc);
	if (i == pg_rank)
	{
	    vc->shm.shm = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * i));
	    for (j=0; j<pg_size; j++)
	    {
		vc->shm.shm[j].head_index = 0;
		vc->shm.shm[j].tail_index = 0;
		for (k=0; k<MPIDI_CH3I_NUM_PACKETS; k++)
		{
		    vc->shm.shm[j].packet[k].offset = 0;
		    vc->shm.shm[j].packet[k].avail = MPIDI_CH3I_PKT_EMPTY;
		}
	    }
	}
	else
	{
	    /*vc->shm.shm += pg_rank;*/
	    vc->shm.shm = NULL;
	    vc->shm.write_shmq = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * i)) + pg_rank;
	    vc->shm.read_shmq = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * pg_rank)) + i;
	    /* post a read of the first packet header */
	    /*vc->shm.shm_reading_pkt = TRUE;*/
	    /*MPIDI_CH3I_post_read( &vc_table[i] , &vc->ch.pkt, sizeof(vc->ch.pkt));*/
	    vc->ch.req->dev.iov[0].MPID_IOV_BUF = (void *)&vc->ch.req->ch.pkt;
	    vc->ch.req->dev.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
	    vc->ch.req->dev.iov_count = 1;
	    vc->ch.req->ch.iov_offset = 0;
	    vc->ch.req->dev.ca = MPIDI_CH3I_CA_HANDLE_PKT;
	    vc->ch.recv_active = vc->ch.req;
	    error = MPIDI_CH3I_post_read( vc , &vc->ch.req->ch.pkt, sizeof(vc->ch.req->ch.pkt));
	    if (error != MPI_SUCCESS)
	    {
		error = MPIR_Err_create_code(error, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**rdma_init", 0);
		return error;
	    }
	}
    }

#ifdef HAVE_WINDOWS_H
    {
	/* if you know the number of processors, calculate the spin count relative to that number */
        SYSTEM_INFO info;
        GetSystemInfo(&info);
        if (info.dwNumberOfProcessors == 1)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = 1;
        else if (info.dwNumberOfProcessors < (DWORD) pg_size)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = ( MPIDI_CH3I_SPIN_COUNT_DEFAULT * info.dwNumberOfProcessors ) / pg_size;
    }
#else
    /* figure out how many processors are available and set the spin count accordingly */
#ifdef HAVE_SYSCONF
    {
	int num_cpus;
	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (num_cpus == 1)
	    MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = 1;
	else if (num_cpus > 0 && num_cpus < pg_size)
	    MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = ( MPIDI_CH3I_SPIN_COUNT_DEFAULT * num_cpus ) / pg_size;
    }
#endif
#endif

    error = PMI_Barrier(); /* barrier to make sure queues are initialized before continuing */
    if (error != 0)
    {
	error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
	return error;
    }
#ifdef USE_POSIX_SHM
    shm_unlink(MPIDI_CH3I_RDMA_Process.key);
#elif defined (USE_SYSV_SHM)
    shmctl(MPIDI_CH3I_RDMA_Process.id, IPC_RMID, NULL);
#endif

    return MPI_SUCCESS;
}

/* finalize releases the RDMA memory and any other cleanup */
/*
 * MPIDI_CH3I_RDMA_finalize - release the shared memory obtained by
 * MPIDI_CH3I_RDMA_init().
 *
 * The second argument to MPIDI_CH3I_SHM_Release_mem() is TRUE when the
 * process group has more than one member, mirroring the flag that was passed
 * to MPIDI_CH3I_SHM_Get_mem() during initialization.
 *
 * Returns MPI_SUCCESS on success; otherwise an error code built on top of
 * the code returned by MPIDI_CH3I_SHM_Release_mem().
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RDMA_finalize
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RDMA_finalize()
{
    int mpi_errno;

    mpi_errno = MPIDI_CH3I_SHM_Release_mem(MPIDI_Process.my_pg, (MPIDI_PG_Get_size(MPIDI_Process.my_pg) > 1) ? TRUE : FALSE);
    if (mpi_errno != MPI_SUCCESS)
    {
	/* keep the annotated code; previously the result was discarded */
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**rdma_finalize", 0);
    }
    return mpi_errno;
}

/*
 * MPIDI_CH3I_PG_Compare_ids - process group id comparison callback.
 *
 * Both ids are treated as NUL-terminated strings.  Returns TRUE when the two
 * strings are identical and FALSE otherwise.
 */
static int MPIDI_CH3I_PG_Compare_ids(void * id1, void * id2)
{
    const char * lhs = (const char *) id1;
    const char * rhs = (const char *) id2;

    if (strcmp(lhs, rhs) != 0)
    {
	return FALSE;
    }

    return TRUE;
}

/*
 * MPIDI_CH3I_PG_Destroy - process group destroy callback.
 *
 * Frees the channel's copy of the KVS name stored in the process group and
 * the process group id, skipping whichever of the two is NULL.  The process
 * group object itself is not freed here.  Always returns MPI_SUCCESS.
 */
static int MPIDI_CH3I_PG_Destroy(MPIDI_PG_t * pg, void * id)
{
    /* name of the PMI key-value space, if one was ever stored */
    if (NULL != pg->ch.kvs_name)
    {
	MPIU_Free(pg->ch.kvs_name);
    }

    /* process group id buffer */
    if (NULL != id)
    {
	MPIU_Free(id);
    }

    return MPI_SUCCESS;
}
