/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "ch3i_progress.h"

/* Activity flags for the shared-memory and socket paths.  None of them is
 * read or written anywhere in this file; presumably they are consulted by
 * other channel code -- TODO confirm. */
int MPIDI_CH3I_shm_read_active = 0;
int MPIDI_CH3I_shm_write_active = 0;
int MPIDI_CH3I_sock_read_active = 0;
int MPIDI_CH3I_sock_write_active = 0;
int MPIDI_CH3I_active_flag = 0;
/* Bootstrap-queue message buffer: msgq_thread_function() receives into it and
 * MPIDI_CH3I_Progress() reads it when event[2] is found signalled. */
static MPIDI_CH3I_Shmem_queue_info info;

/* The per-waiter delay queue is explicitly disabled here, so every
 * "#if defined(USE_CH3I_PROGRESS_DELAY_QUEUE)" section below is compiled
 * out and the single shared condition variable is used instead. */
#undef USE_CH3I_PROGRESS_DELAY_QUEUE


/* Completion counter.  MPIDI_CH3I_Progress() snapshots it on entry and, when
 * blocking, keeps looping while the value is unchanged. */
volatile unsigned int MPIDI_CH3I_progress_completion_count = 0;
#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
    /* Not referenced elsewhere in this file. */
    volatile int MPIDI_CH3I_progress_blocked = FALSE;
    volatile int MPIDI_CH3I_progress_wakeup_signalled = FALSE;

#   if (USE_THREAD_IMPL == MPICH_THREAD_IMPL_GLOBAL_MUTEX)
#       if defined(USE_CH3I_PROGRESS_DELAY_QUEUE)
            /* One entry per thread parked in MPIDI_CH3I_Progress_delay():
             * count is the completion count the thread observed, flag is set
             * by MPIDI_CH3I_Progress_continue() before it signals cond. */
            struct MPIDI_CH3I_Progress_delay_queue_elem
	    {
		unsigned int count;
		volatile int flag;
		MPID_Thread_cond_t cond;
		struct MPIDI_CH3I_Progress_delay_queue_elem * next;
	    };

            /* Singly linked list of waiters; both ends start out NULL. */
            static struct MPIDI_CH3I_Progress_delay_queue_elem * MPIDI_CH3I_Progress_delay_queue_head = NULL;
            static struct MPIDI_CH3I_Progress_delay_queue_elem * MPIDI_CH3I_Progress_delay_queue_tail = NULL;
#       else
            /* Shared condition variable: waited on in
             * MPIDI_CH3I_Progress_delay() and broadcast in
             * MPIDI_CH3I_Progress_continue(). */
            MPID_Thread_cond_t MPIDI_CH3I_progress_completion_cond;
#       endif
#   endif
#endif


/* Forward declarations of the multi-threaded wait/notify helpers defined
 * below. */
#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)
    static int MPIDI_CH3I_Progress_delay(unsigned int completion_count);
    static int MPIDI_CH3I_Progress_continue(unsigned int completion_count);
#endif

#if (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE)

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_delay
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_delay - park the calling thread until the progress
 * completion counter has moved past the caller's snapshot.
 *
 * completion_count - value of MPIDI_CH3I_progress_completion_count sampled by
 *                    the caller before it decided to wait
 *
 * Returns MPI_SUCCESS, or an error code when the per-waiter condition variable
 * cannot be created (delay-queue variant only).
 *
 * NOTE(review): the condition waits are paired with MPIR_Process.global_mutex,
 * so the caller presumably holds that mutex on entry -- confirm with callers.
 */
static int MPIDI_CH3I_Progress_delay(unsigned int completion_count)
{
    int mpi_errno = MPI_SUCCESS;
    
#   if (USE_THREAD_IMPL == MPICH_THREAD_IMPL_GLOBAL_MONITOR)
    {
#	error This is so not right.  But what is the correct technique?
	
	if (MPIU_Monitor_closet_get_occupancy_count(MPIR_Process.global_closet) > 0)
	{
	    MPIU_Monitor_continue(MPIR_Process.global_monitor, MPIR_Process.global_closet);
	    MPIU_Monitor_enter(MPIR_Process.global_monitor);
	    if (completion_count != MPIDI_CH3I_progress_completion_count)
	    {
		goto impl_exit;
	    }
	}
		    
	MPIU_Monitor_delay(MPIR_Process.global_monitor, MPIR_Process.global_closet);

      impl_exit:
	{
	}
    }
#   elif (USE_THREAD_IMPL == MPICH_THREAD_IMPL_GLOBAL_MUTEX)
    {
#	if defined(USE_CH3I_PROGRESS_DELAY_QUEUE)
	{
	    int rc;
	    struct MPIDI_CH3I_Progress_delay_queue_elem dq_elem;
	
	    dq_elem.count = completion_count;
	    dq_elem.flag = FALSE;
	    dq_elem.next = NULL;

	    /* Create the condition variable before publishing the element, so
	       that a failure never leaves a pointer to this stack object on
	       the queue. */
	    rc = MPID_Thread_cond_create(&dq_elem.cond, NULL);
	    if (rc != 0)
	    { 
		mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", NULL);
		goto impl_exit;
	    }

	    /* Append to the waiter list.  The head is the only reliable
	       emptiness indicator: the tail starts out NULL and is not reset
	       by MPIDI_CH3I_Progress_continue() when it drains the list, so it
	       must not be dereferenced while the head is NULL. */
	    if (MPIDI_CH3I_Progress_delay_queue_head == NULL)
	    {
		MPIDI_CH3I_Progress_delay_queue_head = &dq_elem;
	    }
	    else
	    {
		MPIDI_CH3I_Progress_delay_queue_tail->next = &dq_elem;
	    }
	    MPIDI_CH3I_Progress_delay_queue_tail = &dq_elem;
    
	    /* Sleep until MPIDI_CH3I_Progress_continue() raises our flag. */
	    do
	    {
		MPID_Thread_cond_wait(&dq_elem.cond, &MPIR_Process.global_mutex);
	    }
	    while(dq_elem.flag == FALSE);
	    
	    MPID_Thread_cond_destroy(&dq_elem.cond, NULL);
	    
	  impl_exit:
	    {
	    }
	}
#	else
	{ 
	    /* Single shared condition variable: re-check the counter after
	       every wakeup and keep waiting while it is unchanged. */
	    while (completion_count == MPIDI_CH3I_progress_completion_count)
	    {
		MPID_Thread_cond_wait(&MPIDI_CH3I_progress_completion_cond, &MPIR_Process.global_mutex);
	    }
	}
#       endif
    }
#   endif
    
    return mpi_errno;
}
/* end MPIDI_CH3I_Progress_delay() */


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_continue
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_continue - notify threads parked in
 * MPIDI_CH3I_Progress_delay().
 *
 * completion_count - in the delay-queue variant, waiters are released from
 *                    the head of the list up to (not including) the first one
 *                    whose recorded count equals this value; the other
 *                    variants do not read it
 *
 * Always returns MPI_SUCCESS.
 */
static int MPIDI_CH3I_Progress_continue(unsigned int completion_count)
{
    int mpi_errno = MPI_SUCCESS;

#   if (USE_THREAD_IMPL == MPICH_THREAD_IMPL_GLOBAL_MONITOR)
    {
#	error This is so not right.  But what is the correct technique?
	if (MPIU_Monitor_closet_get_occupancy(MPIR_Process.global_closet) > 0)
	{
	    MPIU_Monitor_continue(MPIR_Process.global_monitor, MPIR_Process.global_closet);
	}
	else
	{ 
	    MPIU_Monitor_exit(MPIR_Process.global_monitor);
	}
    }
#   elif (USE_THREAD_IMPL == MPICH_THREAD_IMPL_GLOBAL_MUTEX)
    {
#	if defined(USE_CH3I_PROGRESS_DELAY_QUEUE)
	{
	    struct MPIDI_CH3I_Progress_delay_queue_elem * waiter;

	    /* Walk the list from the head, flagging and signalling each
	       waiter, and stop at the first entry recorded with the current
	       count (or at the end of the list). */
	    for (waiter = MPIDI_CH3I_Progress_delay_queue_head;
		 waiter != NULL && waiter->count != completion_count;
		 waiter = waiter->next)
	    {
		waiter->flag = TRUE;
		MPID_Thread_cond_signal(&waiter->cond);
	    }

	    /* Whatever was not released becomes the new head. */
	    MPIDI_CH3I_Progress_delay_queue_head = waiter;
	}
#	else
	{
	    /* All waiters share one condition variable; wake them all. */
	    MPID_Thread_cond_broadcast(&MPIDI_CH3I_progress_completion_cond);
	}
#	endif
    }
#   endif
    
    return mpi_errno;
}
/* end MPIDI_CH3I_Progress_continue() */

#endif /* (MPICH_THREAD_LEVEL == MPI_THREAD_MULTIPLE) */

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress - make one or more passes over the shared-memory
 * queues, the socket event and the bootstrap message-queue event.
 *
 * is_blocking - when non-zero, keep iterating (sleeping on the first three
 *               events between passes) until
 *               MPIDI_CH3I_progress_completion_count differs from its value on
 *               entry; when zero, make a single pass
 * state       - not used by this implementation
 *
 * Events as used in this file:
 *   event[0]  shared-memory event, reset at the top of every pass
 *   event[1]  set by the sock thread when MPIDU_Sock_wait() has returned
 *   event[2]  set by the msgq thread when a bootstrap message has arrived
 *   event[3]  set here to let the sock thread wait for the next operation
 *   event[4]  set here to let the msgq thread receive the next message
 *
 * Returns MPI_SUCCESS or an error code built with MPIR_Err_create_code().
 * Every exit, including all error paths, goes through fn_exit so that the
 * MPIDI_FUNC_ENTER/MPIDI_FUNC_EXIT pair stays balanced.
 */
int MPIDI_CH3I_Progress(int is_blocking, MPID_Progress_state *state)
{
    int mpi_errno = MPI_SUCCESS;
    int rc;
    int bShmProgressMade;
#ifdef MPICH_DBG_OUTPUT
    int count;
#endif
    unsigned completions = MPIDI_CH3I_progress_completion_count;
    int num_bytes;
    MPIDI_VC_t *vc_ptr;
    int sock_progress, msgq_progress;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS);
    MPIDI_STATE_DECL(MPID_STATE_MPIDU_YIELD);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS);

#ifdef MPICH_DBG_OUTPUT
    if (is_blocking)
    {
	MPIDI_DBG_PRINTF((50, FCNAME, "entering, blocking=%s", is_blocking ? "true" : "false"));
    }
#endif
    do
    {
	/* make progress on the shared memory queues */

	/* reset the shared memory event */
	mpi_errno = MPIU_Event_reset(MPIDI_CH3I_Process.event[0]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_reset", 0);
	    goto fn_exit;
	}
	bShmProgressMade = FALSE;
	if (MPIDI_CH3I_Process.shm_reading_list)
	{
	    rc = MPIDI_CH3I_SHM_read_progress(MPIDI_CH3I_Process.shm_reading_list, 0, &vc_ptr, &num_bytes);
	    if (rc == MPI_SUCCESS)
	    {
		MPIDI_DBG_PRINTF((50, FCNAME, "MPIDI_CH3I_SHM_read_progress reported %d bytes read", num_bytes));
		mpi_errno = handle_shm_read(vc_ptr, num_bytes);
		if (mpi_errno != MPI_SUCCESS)
		{
		    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
		    goto fn_exit;
		}
		bShmProgressMade = TRUE;
	    }
	    else if (rc != SHM_WAIT_TIMEOUT)
	    {
		/* SHM_WAIT_TIMEOUT is not treated as an error; anything else
		   is fatal */
		mpi_errno = MPIR_Err_create_code(rc, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**shm_read_progress", 0);
		goto fn_exit;
	    }
	}

	/* push pending data on every VC that has an active send */
	if (MPIDI_CH3I_Process.shm_writing_list)
	{
	    vc_ptr = MPIDI_CH3I_Process.shm_writing_list;
	    while (vc_ptr)
	    {
		if (vc_ptr->ch.send_active != NULL)
		{
		    rc = MPIDI_CH3I_SHM_write_progress(vc_ptr);
		    if (rc == MPI_SUCCESS)
		    {
			bShmProgressMade = TRUE;
		    }
		    else if (rc != SHM_WAIT_TIMEOUT)
		    {
			mpi_errno = MPIR_Err_create_code(rc, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
			goto fn_exit;
		    }
		}
		vc_ptr = vc_ptr->ch.shm_next_writer;
	    }
	}

	/* make progress on the sockets */

	mpi_errno = MPIU_Event_test(MPIDI_CH3I_Process.event[1], &sock_progress);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
	    goto fn_exit;
	}
	if (sock_progress)
	{
	    /* reset my event */
	    mpi_errno = MPIU_Event_reset(MPIDI_CH3I_Process.event[1]);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
		goto fn_exit;
	    }
	    /* handle the operation */
	    mpi_errno = MPIDI_CH3I_Progress_handle_sock_event(&MPIDI_CH3I_Process.sock_event);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_handle_sock_op", 0);
		goto fn_exit;
	    }
	    /* signal the sock thread to wait for the next operation */
	    mpi_errno = MPIU_Event_set(MPIDI_CH3I_Process.event[3]);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
		goto fn_exit;
	    }
	}
	else
	{
	    /* nothing happened on either transport: give up the processor */
	    if (!bShmProgressMade)
	    {
		MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_YIELD);
		MPIDU_Yield();
		MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_YIELD);
	    }
	}

	mpi_errno = MPIU_Event_test(MPIDI_CH3I_Process.event[2], &msgq_progress);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
	    goto fn_exit;
	}
	if (msgq_progress)
	{
	    MPIDI_PG_t *pg;
	    /* new shmem queue connection */
	    rc = MPIU_Event_reset(MPIDI_CH3I_Process.event[2]);
	    if (rc != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(rc, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_reset", 0);
		goto fn_exit;
	    }
	    /* NOTE(review): the result of the process group lookup is not
	       checked before pg is used -- confirm that a miss cannot happen
	       here */
	    MPIDI_PG_Find(info.pg_id, &pg);
	    MPIDI_PG_Get_vc(pg, info.pg_rank, &vc_ptr);
	    rc = MPIDI_CH3I_SHM_Attach_to_mem(&info.info, &vc_ptr->ch.shm_read_queue_info);
	    if (rc != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(rc, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**attach_to_mem", "**attach_to_mem %d", vc_ptr->ch.shm_read_queue_info.error);
		goto fn_exit;
	    }
	    MPIU_DBG_PRINTF(("attached to queue from process %d\n", info.pg_rank));
	    /* we are read connected but not write connected, so the VC state
	       is deliberately left alone */
	    vc_ptr->ch.bShm = TRUE;
	    vc_ptr->ch.read_shmq = vc_ptr->ch.shm_read_queue_info.addr;
	    MPIU_DBG_PRINTF(("read_shmq = %p\n", vc_ptr->ch.read_shmq));
	    vc_ptr->ch.shm_reading_pkt = TRUE;
	    /* add this VC to the global list to be shm_waited on */
	    vc_ptr->ch.shm_next_reader = MPIDI_CH3I_Process.shm_reading_list;
	    MPIDI_CH3I_Process.shm_reading_list = vc_ptr;
	    /* let the msgq thread go back to receiving */
	    rc = MPIU_Event_set(MPIDI_CH3I_Process.event[4]);
	    if (rc != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(rc, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_set", 0);
		goto fn_exit;
	    }
	    /* set the shared memory event */
	    mpi_errno = MPIU_Event_set(MPIDI_CH3I_Process.event[0]);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_set", 0);
		goto fn_exit;
	    }
	}

	if (completions == MPIDI_CH3I_progress_completion_count && is_blocking)
	{
	    MPIU_DBG_PRINTF(("blocking progress call waiting for either a sock or shm event.\n"));
	    mpi_errno = MPIU_Event_wait_multiple(MPIDI_CH3I_Process.event, 3, 0);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress", 0);
		goto fn_exit;
	    }
	}
    }
    while (completions == MPIDI_CH3I_progress_completion_count && is_blocking);

fn_exit:
#ifdef MPICH_DBG_OUTPUT
    count = MPIDI_CH3I_progress_completion_count - completions;
    if (is_blocking)
    {
	MPIDI_DBG_PRINTF((50, FCNAME, "exiting, count=%d", count));
    }
    else
    {
	if (count > 0)
	{
	    MPIDI_DBG_PRINTF((50, FCNAME, "exiting (non-blocking), count=%d", count));
	}
    }
#endif
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_wakeup
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_wakeup - issue MPIDU_Sock_wakeup() on sock_set, the set
 * that sock_thread_function() blocks on in MPIDU_Sock_wait().  The result of
 * the wakeup call is not examined.
 */
void MPIDI_CH3I_Progress_wakeup(void)
{
    MPIDU_Sock_wakeup(sock_set);
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress_poke
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3_Progress_poke - thin wrapper that runs MPIDI_CH3_Progress_test()
 * between the function enter/exit tracing macros and hands its result back
 * to the caller unchanged.
 */
int MPIDI_CH3_Progress_poke()
{
    int poke_result;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);

    poke_result = MPIDI_CH3_Progress_test();

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);

    return poke_result;
}

#if !defined(MPIDI_CH3_Progress_start)
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress_start
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3_Progress_start - function form of the progress-start hook, only
 * compiled when MPIDI_CH3_Progress_start is not already defined as a macro.
 * It performs no work beyond the enter/exit tracing macros.
 */
void MPIDI_CH3_Progress_start()
{
    /* MT - This function is empty for the single-threaded implementation */
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_START);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_START);
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_START);
}
#endif

#if !defined(MPIDI_CH3_Progress_end)
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress_end
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3_Progress_end - function form of the progress-end hook, only
 * compiled when MPIDI_CH3_Progress_end is not already defined as a macro.
 * It performs no work beyond the enter/exit tracing macros.
 */
void MPIDI_CH3_Progress_end()
{
    /* MT: This function is empty for the single-threaded implementation */
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_END);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_END);
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_END);
}
#endif

static MPID_Thread_id_t sock_thread;
#undef FUNCNAME
#define FUNCNAME sock_thread_function
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
void sock_thread_function(void *p)
{
    int mpi_errno;

    for (;;)
    {
	MPIU_DBG_PRINTF(("sock thread waiting for a sock event.\n"));
	mpi_errno = MPIU_Event_reset(MPIDI_CH3I_Process.event[3]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_reset", 0);
	    MPID_Thread_exit();
	}
	mpi_errno = MPIDU_Sock_wait(sock_set, MPIDU_SOCK_INFINITE_TIME, &MPIDI_CH3I_Process.sock_event);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_sock_wait", 0);
	    MPID_Thread_exit();
	}
	MPIU_DBG_PRINTF(("sock thread, sock_wait returned, signalling main thread.\n"));
	mpi_errno = MPIU_Event_set(MPIDI_CH3I_Process.event[1]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_set", 0);
	    MPID_Thread_exit();
	}
	MPIU_DBG_PRINTF(("sock thread waiting for the sock event to be reset.\n"));
	mpi_errno = MPIU_Event_wait(MPIDI_CH3I_Process.event[3]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_wait", 0);
	    MPID_Thread_exit();
	}
    }
}

static MPID_Thread_id_t msgq_thread;
#undef FUNCNAME
#define FUNCNAME msgq_thread_function
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
void msgq_thread_function(void *p)
{
    int num_bytes;
    int mpi_errno;

    for (;;)
    {
	MPIU_DBG_PRINTF(("msgq thread reseting msgq event.\n"));
	mpi_errno = MPIU_Event_reset(MPIDI_CH3I_Process.event[4]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIU_DBG_PRINTF(("msgq thread unable to reset the event, exiting\n"));
	    MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**sock_wait", 0);
	    MPID_Thread_exit();
	}
	MPIU_DBG_PRINTF(("msgq thread waiting for a msgq event.\n"));
	/*MPIDI_Process.my_pg is not initialized until after MPIDI_CH3I_Progress_init has been called so stash a copy in MPIDI_CH3I_my_pg */
	mpi_errno = MPIDI_CH3I_BootstrapQ_recv_msg(/*MPIDI_Process.my_pg*/MPIDI_CH3I_my_pg->ch.bootstrapQ, &info, sizeof(info), &num_bytes, TRUE);
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIU_DBG_PRINTF(("msgq thread unable to recv_msg, exiting\n"));
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**boot_recv", 0);
	    MPID_Thread_exit();
	}
	if (num_bytes != 0 && num_bytes != sizeof(info))
	{
	    MPIU_DBG_PRINTF(("msgq recv_msg returned %d bytes instead of %d\n", num_bytes, sizeof(info)));
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**bootqmsg", "**bootqmsg %d", num_bytes);
	    MPID_Thread_exit();
	}
	if (num_bytes)
	{
	    MPIU_DBG_PRINTF(("msgq thread, msgq_recv returned, signalling main thread.\n"));
	    mpi_errno = MPIU_Event_set(MPIDI_CH3I_Process.event[2]);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		MPIU_DBG_PRINTF(("msgq thread unable to set the event, exiting\n"));
		MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**sock_wait", 0);
		MPID_Thread_exit();
	    }
	    MPIU_DBG_PRINTF(("msgq thread waiting for the msgq event to be reset.\n"));
	    mpi_errno = MPIU_Event_wait(MPIDI_CH3I_Process.event[4]);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		MPIU_DBG_PRINTF(("msgq thread unable to wait for the event, exiting\n"));
		MPIDI_CH3I_Process.sock_event.error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**sock_wait", 0);
		MPID_Thread_exit();
	    }
	}
    }
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_init - set up everything the progress engine needs:
 * the five events, the sock library and sock set, the listener connection,
 * and the sock and msgq helper threads.
 *
 * Returns MPI_SUCCESS or an error code built with MPIR_Err_create_code().
 * Every exit goes through fn_exit so that the exit trace and
 * MPIDI_FUNC_EXIT are always executed.
 *
 * NOTE(review): nothing created before a failure is released on the error
 * paths -- confirm whether callers treat a failure here as fatal.
 */
int MPIDI_CH3I_Progress_init()
{
    MPIDU_Sock_t sock;
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPID_Thread_func_t tfunc;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PROGRESS_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PROGRESS_INIT);

    MPIDI_DBG_PRINTF((60, FCNAME, "entering"));

    /* create the events */
    mpi_errno = MPIU_Event_create(&MPIDI_CH3I_Process.event[0], MPIDI_CH3I_Process.shm_event_name, MPIU_EVENT_NAME_LEN_MAX);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_create", 0);
	goto fn_exit;
    }
    mpi_errno = MPIU_Event_create(&MPIDI_CH3I_Process.event[1], MPIDI_CH3I_Process.sock_event_a_name, MPIU_EVENT_NAME_LEN_MAX);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_create", 0);
	goto fn_exit;
    }
    mpi_errno = MPIU_Event_create(&MPIDI_CH3I_Process.event[2], MPIDI_CH3I_Process.msgq_event_a_name, MPIU_EVENT_NAME_LEN_MAX);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_create", 0);
	goto fn_exit;
    }
    mpi_errno = MPIU_Event_create(&MPIDI_CH3I_Process.event[3], MPIDI_CH3I_Process.sock_event_b_name, MPIU_EVENT_NAME_LEN_MAX);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_create", 0);
	goto fn_exit;
    }
    mpi_errno = MPIU_Event_create(&MPIDI_CH3I_Process.event[4], MPIDI_CH3I_Process.msgq_event_b_name, MPIU_EVENT_NAME_LEN_MAX);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_create", 0);
	goto fn_exit;
    }

    /* start with all five events (indices 0 through 4) in the reset state */
    for (i = 0; i < 5; i++)
    {
	mpi_errno = MPIU_Event_reset(MPIDI_CH3I_Process.event[i]);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**event_reset", 0);
	    goto fn_exit;
	}
    }

    /* initialize the sock library */
    mpi_errno = MPIDU_Sock_init();
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_init", 0);
	goto fn_exit;
    }

    /* create sock set */
    mpi_errno = MPIDU_Sock_create_set(&sock_set);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_init", 0);
	goto fn_exit;
    }

    /* establish non-blocking listener */
    mpi_errno = connection_alloc(&MPIDI_CH3I_listener_conn);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", NULL);
	goto fn_exit;
    }
    MPIDI_CH3I_listener_conn->sock = NULL;
    MPIDI_CH3I_listener_conn->vc = NULL;
    MPIDI_CH3I_listener_conn->state = CONN_STATE_LISTENING;
    MPIDI_CH3I_listener_conn->send_active = NULL;
    MPIDI_CH3I_listener_conn->recv_active = NULL;

    mpi_errno = MPIDU_Sock_listen(sock_set, MPIDI_CH3I_listener_conn, &MPIDI_CH3I_listener_port, &sock);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_init", 0);
	goto fn_exit;
    }

    MPIDI_CH3I_listener_conn->sock = sock;

    /* start the helper thread that blocks in MPIDU_Sock_wait() */
    tfunc = sock_thread_function;
    MPID_Thread_create(tfunc, MPIDI_CH3I_listener_conn, &sock_thread, &mpi_errno);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_init", 0);
	goto fn_exit;
    }
    /* start the helper thread that receives bootstrap queue messages */
    tfunc = msgq_thread_function;
    MPID_Thread_create(tfunc, NULL, &msgq_thread, &mpi_errno);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**progress_init", 0);
	goto fn_exit;
    }

fn_exit:
    MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PROGRESS_INIT);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_finalize
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_finalize - shut down the listener and tear down the
 * sock set and sock library.
 *
 * Posts a close on the listener socket, then drives the progress engine until
 * MPIDI_CH3I_listener_conn becomes NULL, and finally destroys sock_set and
 * finalizes the sock library.
 *
 * Returns MPI_SUCCESS or an error code.  If posting the close or a progress
 * wait fails, the function reports the error and skips the remaining
 * teardown.  The results of MPIDU_Sock_destroy_set() and
 * MPIDU_Sock_finalize() are not examined.
 */
int MPIDI_CH3I_Progress_finalize()
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Progress_state progress_state;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PROGRESS_FINALIZE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PROGRESS_FINALIZE);
    MPIDI_DBG_PRINTF((60, FCNAME, "entering"));

#if 0
    MPIR_Nest_incr();
    {
	NMPI_Barrier(MPI_COMM_WORLD); /* FIXME: this barrier may not be necessary */
	shutting_down = TRUE;
	NMPI_Barrier(MPI_COMM_WORLD);
    }
    MPIR_Nest_decr();
#endif

    /* Shut down the listener */
    mpi_errno = MPIDU_Sock_post_close(MPIDI_CH3I_listener_conn->sock);
    if (mpi_errno != MPI_SUCCESS)
    {
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", NULL);
	goto fn_exit;
    }
    
    /* Drive progress until the listener connection has been released.  A
       failing wait ends the loop: otherwise the error would be overwritten
       on the next pass and the loop might never terminate. */
    MPID_Progress_start(&progress_state);
    while(MPIDI_CH3I_listener_conn != NULL)
    {
	mpi_errno = MPID_Progress_wait(&progress_state);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", NULL);
	    break;
	}
    }
    MPID_Progress_end(&progress_state);
    if (mpi_errno != MPI_SUCCESS)
    {
	goto fn_exit;
    }

    /* FIXME: Cleanly shutdown other socks and MPIU_Free connection structures. (close protocol?) */

    MPIDU_Sock_destroy_set(sock_set);
    MPIDU_Sock_finalize();

fn_exit:
    MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PROGRESS_FINALIZE);
    return mpi_errno;
}
