/*  _______________________________________________________________________

    DAKOTA: Design Analysis Kit for Optimization and Terascale Applications
    Copyright (c) 2001, Sandia National Laboratories.
    This software is distributed under the GNU General Public License.
    For more information, see the README file in the top Dakota directory.
    _______________________________________________________________________ */

//- Class:        ForkApplicInterface
//- Description:  Class implementation
//- Owner:        Mike Eldred

#include <sys/wait.h>
#include <unistd.h>
#include "DakotaResponse.H"
#include "ParamResponsePair.H"
#include "ForkApplicInterface.H"


/** Constructor: builds the ApplicationInterface base from problem_db and
    num_fns, builds the forkSimulator member from problem_db, and then
    forwards the suppressOutput setting to forkSimulator. */
ForkApplicInterface::ForkApplicInterface(const ProblemDescDB& problem_db,
                                         const size_t& num_fns)
  : ApplicationInterface(problem_db, num_fns),
    forkSimulator(problem_db)
{
  // pass the output suppression setting along to the analysis code object
  // (original author's note: this turns off slave output)
  forkSimulator.quiet_flag(suppressOutput);
}


/** Destructor: performs no work at this level. */
ForkApplicInterface::~ForkApplicInterface()
{
  // Nothing to release here: per the original author's note, referenceCount
  // is handled by the virtual destructor at the DakotaInterface level.
}


// -------------------------------------------------------
// Begin derived functions for evaluation level schedulers
// -------------------------------------------------------
/** Blocking function evaluation: define the file names for fn_eval_id,
    write the parameters file (evalComm rank 0 only), run the simulation
    through a blocking fork_application(), and read the results file into
    response (evalComm rank 0 only).  A DakotaString exception from the
    results read is echoed to Cerr and followed by abort_handler(-1); an
    int exception is passed on to the caller unchanged. */
void ForkApplicInterface::derived_map(const DakotaVariables& vars,
                                      const DakotaIntArray& asv,
                                      DakotaResponse& response, int fn_eval_id)
{
  // Original author's note: this function may be executed by a
  // multiprocessor evalComm.

  // file names are defined on every processor of evalComm ...
  forkSimulator.define_filenames(fn_eval_id);
  // ... but only rank 0 writes the parameters file
  if (evalCommRank == 0)
    forkSimulator.write_parameters_file(vars, asv, fn_eval_id);

  // run the simulator application and block until it completes
  fork_application(BLOCK);

  // only rank 0 reads the results back
  if (evalCommRank != 0)
    return;

  try {
    forkSimulator.read_results_file(response, fn_eval_id);
  }
  catch (DakotaString& err_msg) {
    // A DakotaString exception signals an incomplete file/data set.  With
    // a blocking fork there is no race condition that could explain an
    // incomplete file, so report the message and abort.
    Cerr << err_msg << endl;
    abort_handler(-1);
  }
  catch (int) {
    // see the explanation in SysCallApplicInterface::derived_map();
    // the failure code is rethrown to the outer catch
    throw;
  }
}


/** Nonblocking function evaluation: define the file names and write the
    parameters file for the evaluation held in pair, launch the simulation
    with a nonblocking fork_application(), and record the returned process
    id together with the evaluation id. */
void ForkApplicInterface::derived_map_asynch(const ParamResponsePair& pair)
{
  // Original author's note: this function may not be executed by a
  // multiprocessor evalComm.

  int eval_id = pair.eval_id();

  forkSimulator.define_filenames(eval_id);
  forkSimulator.write_parameters_file(pair.prp_parameters(),
                                      pair.active_set_vector(), eval_id);

  // launch the simulator application without waiting for it
  pid_t child_pid = fork_application(FALL_THROUGH);

  // Append to the two bookkeeping lists in lock step: entry k of
  // processIdList belongs to entry k of evalIdList.  derived_synch_kernel
  // uses this pairing to go from a completed pid to its evaluation id and
  // from there to the matching entry of prp_list.  The original author
  // notes that processIdList order can no longer be assumed to match
  // beforeSynchPRPList order because of hybrid parallelism
  // (ApplicationInterface::serve_asynch()).
  processIdList.insert(child_pid);
  evalIdList.insert(eval_id);
}


/** Blocking synchronization: wait for one child process to finish, process
    it, and then also process every other child that has already finished.
    Compiled to an empty body when TFLOPS_COMPUTE or CPLANT_COMPUTE is
    defined. */
void ForkApplicInterface::
derived_synch(DakotaList<ParamResponsePair>& prp_list)
{
  // Fairness rationale (original author): handling every job that has
  // returned, instead of a single one, guarantees that a completed job is
  // always processed.  Accepting only one completion per call could keep
  // accepting the same one (very inexpensive fn. evals.) and starve some
  // servers.

  int exit_status;

#if (!defined(TFLOPS_COMPUTE) && !defined(CPLANT_COMPUTE))
  // Block until any (1st level) child process finishes.  No usleep is
  // needed here since wait() already provides a system optimized test.
  pid_t finished = wait(&exit_status);

  // The body always runs once for the pid obtained from wait(); after that,
  // nonblocking waitpid() calls pick up any further completed children.
  do {
    forkSimulator.check_status(exit_status); // check the exit status
    derived_synch_kernel(prp_list, finished);
    finished = waitpid((pid_t)-1, &exit_status, WNOHANG);
  } while (finished > 0);
#endif // TFLOPS_COMPUTE/CPLANT_COMPUTE
}


/** Nonblocking synchronization: process every child process that has
    already finished, without waiting for any that are still running.  When
    completionList is empty afterwards, sleep for one millisecond (unless
    OSF or TFLOPS is defined). */
void ForkApplicInterface::
derived_synch_nowait(DakotaList<ParamResponsePair>& prp_list)
{
  int exit_status;
  pid_t finished;

#if (!defined(TFLOPS_COMPUTE) && !defined(CPLANT_COMPUTE))
  // poll with WNOHANG until no further completed child is reported
  for (;;) {
    finished = waitpid((pid_t)-1, &exit_status, WNOHANG);
    if (finished <= 0)
      break;
    forkSimulator.check_status(exit_status); // check the exit status
    derived_synch_kernel(prp_list, finished);
  }
#endif // TFLOPS_COMPUTE/CPLANT_COMPUTE

#if (!defined(OSF) && !defined(TFLOPS))
  // if no jobs are finishing, back off briefly to reduce the processor load
  // caused by DAKOTA's repeated testing
  if (completionList.entries() == 0)
    usleep(1000); // 1000 microseconds = 1 millisec
#endif
}


/** Convenience function holding the code shared by derived_synch() and
    derived_synch_nowait().  Given the pid of a completed child process, it
    maps pid -> position in processIdList -> fn_eval_id (same position in
    evalIdList) -> position in prp_list, reads the results file into that
    entry's response, records fn_eval_id in completionList, and removes the
    job from processIdList/evalIdList.

    Aborts via abort_handler(-1) if pid is not in processIdList, if the
    evaluation id is not found in prp_list, or if reading the results file
    raises a DakotaString exception.  An int exception from the read is
    routed to manage_failure(). */
void ForkApplicInterface::
derived_synch_kernel(DakotaList<ParamResponsePair>& prp_list, const pid_t pid)
{
  // Map pid to index to fn_eval_id to prp_index.  Note that processIdList &
  // evalIdList have identical ordering.
  size_t index = processIdList.index(pid);
  if (index == _NPOS) {
    // should not happen so long as wait ignores any 2nd level child processes
    Cerr << "Error: pid returned from wait does not match any 1st level child "
	 << "process." << endl;
    abort_handler(-1);
  }
  int fn_eval_id = evalIdList[index];

  // Validate the prp_list lookup as well: an unmatched evaluation id must
  // not be used as a subscript into prp_list.
  size_t prp_index = prp_list.index(eval_id_compare, &fn_eval_id);
  if (prp_index == _NPOS) {
    Cerr << "Error: function evaluation " << fn_eval_id << " for completed "
	 << "process not found in list of active evaluations." << endl;
    abort_handler(-1);
  }

  // now populate the corresponding response by reading the results file
  DakotaResponse response = prp_list[prp_index].prp_response();
  try { forkSimulator.read_results_file(response, fn_eval_id); }

  catch(DakotaString& err_msg) { // For forks, there is no potential for a
    // file write race condition since the process has completed -> an
    // exception involving an incomplete file/data set is a true error.
    Cerr << err_msg << endl;
    abort_handler(-1);
  }

  catch(int fail_code) { // If an int exception ("fail" detected in results
    // file) is caught, call manage_failure which will either (1) repair the
    // failure and populate response, or (2) abort the run.  NOTE: this
    // destroys load balancing but trying to load balance failure recovery
    // would be more difficult than it is worth.
    manage_failure(prp_list[prp_index].prp_parameters(),
                   response.active_set_vector(), response, fn_eval_id);
  }

  // store the populated response back into the list entry
  prp_list[prp_index].prp_response(response);

  // bookkeep the completed jobs for use in ApplicationInterface::synch_local
  // and ApplicationInterface::serve_asynch
  completionList.insert(fn_eval_id);

  // remove the processed job from the bookkeeping lists
  processIdList.removeAt(index);
  evalIdList.removeAt(index);
}


/** Run the single analysis driver identified by analysis_id (1-based index
    into analysisDrivers) with a blocking fork.  Called by
    ForkApplicInterface::synchronous_local_analyses(); the original author's
    notes also name ApplicationInterface::serve_analyses_synch as a user of
    this derived function.  Always returns 0. */
int ForkApplicInterface::
derived_synchronous_local_analysis(const int& analysis_id)
{
  // argument 0: the driver program for this analysis
  forkSimulator.argument_list(0, analysisDrivers[analysis_id-1]);

  // argument 2: the results file name
  if (numAnalysisDrivers <= 1) {
    // should not happen (master-slave with a single analysis)
    forkSimulator.argument_list(2,forkSimulator.modified_results_filename());
  }
  else {
    // multiple drivers: append program counter to results file
    forkSimulator.tag_argument_list(2, analysis_id);
  }

#ifdef MPI_DEBUG
  Cout << "Blocking fork to analysis " << analysis_id << endl; // flush buffer
#endif // MPI_DEBUG

  forkSimulator.fork_program(BLOCK);

  // the return value is used for failure codes in the DirectFn case
  return 0;
}


/** Manage the input filter, 1 or more analysis programs, and the
    output filter in blocking or nonblocking mode as governed by
    block_flag.  In the case of a single analysis and no filters, a
    single fork is performed, while in other cases, an initial fork is
    reforked multiple times.  Called from derived_map() with
    block_flag == BLOCK and from derived_map_asynch() with block_flag
    == FALL_THROUGH.  Uses ForkAnalysisCode::fork_program() to spawn
    individual program components within the function evaluation. */
pid_t ForkApplicInterface::fork_application(const short block_flag)
{
  // Note that a commandUsage specification is not supported with the fork 
  // interface since execvp requires an argList which cannot support a general
  // string representation.  It would be feasible to parse the commandUsage 
  // string and convert it to an argList, but this would only work when 
  // commandUsage begins with an executable file and all following tokens are
  // command line arguments to that executable (i.e., anything involving "()",
  // ";", or other Unix shell syntax would not work). 

  const DakotaString& ifilter_name = forkSimulator.input_filter_name();
  const DakotaString& ofilter_name = forkSimulator.output_filter_name();
  const DakotaString& mod_params_filename
    = forkSimulator.modified_parameters_filename();
  const DakotaString& mod_res_filename
    = forkSimulator.modified_results_filename();
  pid_t pid = 0;
  size_t i;

#if (!defined(TFLOPS_COMPUTE) && !defined(CPLANT_COMPUTE))
  if (evalCommRank == 0 && !suppressOutput) {
    if (block_flag) {
      if (evalDedMasterFlag)
        Cout << "blocking fork self-schedule: ";
      else if (numAnalysisServers > 1)
        Cout << "blocking fork static schedule: ";
      else
        Cout << "blocking fork: ";
    }
    else
      Cout << "nonblocking fork: ";
    if (!ifilter_name.isNull())
      Cout << ifilter_name << ' ' << mod_params_filename << ' '
           << mod_res_filename << "; ";
    for (i=0; i<numAnalysisDrivers; i++) {
      Cout << analysisDrivers[i] << ' ' << mod_params_filename << ' '
           << mod_res_filename;
      if (numAnalysisDrivers > 1)
        Cout << '.' << i+1;
      if (i != numAnalysisDrivers-1)
        Cout << "; ";
    }
    if (!ofilter_name.isNull())
      Cout << "; " << ofilter_name << ' ' << mod_params_filename << ' ' 
           << mod_res_filename;
    Cout << '\n';
  }
  // Cout must be flushed prior to the fork to clear the stdout buffer.
  // Otherwise, the intermediate process receives a copy of the contents of
  // this buffer and outputs the contents on the next buffer flush.
  Cout << flush;

  // input file name is consistent across all cases
  forkSimulator.argument_list(1, mod_params_filename);

  if (ifilter_name.isNull() && ofilter_name.isNull() && numAnalysisDrivers==1) {
    // fork the one-piece interface directly (no intermediate process required)
    forkSimulator.argument_list(0, analysisDrivers[0]);
    forkSimulator.argument_list(2, mod_res_filename);
    pid = forkSimulator.fork_program(block_flag);
  }
  else if (evalCommSize > 1) {
    // run a blocking schedule of single-proc. analyses over analysis servers.
    // The schedules are executed by the parent processes.  Forks are not used
    // at this level since the message passing provides the needed asynchrony
    // at the evaluation level (unlike the final case below where 2 levels of
    // forks must be used to provide asynchrony at the eval level).

    if (!block_flag) {
      Cerr << "Error: multiprocessor evalComm does not support nonblocking "
	   << "ForkApplicInterface::fork_application." << endl;
      abort_handler(-1);
    }

    if (!ifilter_name.isNull() && evalCommRank == 0) {
      forkSimulator.argument_list(0, ifilter_name);
      forkSimulator.argument_list(2, mod_res_filename);
      forkSimulator.fork_program(BLOCK);
    }

    // Schedule analyses using either master-slave/dynamic or peer/static
    if (evalDedMasterFlag) {
      // master-slave dynamic scheduling requires a central point of control 
      // and therefore needs separate schedule & serve functions.
      if (evalCommRank == 0)
        self_schedule_analyses();
      else {
	// in message passing mode, the user must explicitly specify analysis
	// concurrency to get hybrid parallelism
        if (asynchLocalAnalysisConcurrency > 1)
          serve_analyses_asynch();
        else
          serve_analyses_synch();
      }
    }
    else {
      // static scheduling does not require special schedule/serve functions
      // since it can support message passing & hybrid mode directly using
      // synchronous_local & asynchronous_local with staggered starts.  However,
      // it does require MPI_Barrier's since there's no scheduler to enforce
      // synchronization.
#ifdef USE_MPI
      // avoid peers 2-n initiating analyses prior to completion of 
      // write_parameters_file() by peer 1
      MPI_Barrier(evalComm);
#endif
      if (asynchLocalAnalysisConcurrency > 1) // hybrid requires explicit spec
        asynchronous_local_analyses(analysisServerId, numAnalysisDrivers,
                                    numAnalysisServers); // hybrid mode
      else
        synchronous_local_analyses(analysisServerId, numAnalysisDrivers,
                                   numAnalysisServers);  // msg passing mode
#ifdef USE_MPI
      // avoid peer 1 reading all the results files before peers 2-n have
      // completed writing them
      MPI_Barrier(evalComm);
#endif
    }

    if (!ofilter_name.isNull() && evalCommRank == 0) {
      forkSimulator.argument_list(0, ofilter_name);
      forkSimulator.argument_list(2, mod_res_filename);
      forkSimulator.fork_program(BLOCK);
    }
  }
  else { // schedule all analyses local to this processor

    // If the evaluation is nonblocking, then an intermediate process must be
    // forked to manage the 3-piece interface, multiple analysis drivers, or
    // both.  The intermediate process provides asynchrony at the evaluation
    // level, even though the iFilter execution, analysisDrivers scheduling, 
    // and oFilter execution are blocking.

    // In the 3-piece case, it would be desirable to utilize the same format as
    // is used in the SysCall case, i.e., grouping i_filter, simulator, and
    // o_filter with ()'s and ;'s, but this is not supported by the exec family
    // of functions (see exec man pages).

    // vfork should only be used when followed immediately by an exec since 
    // vfork borrows the parent process and only returns control to the parent
    // when one of the functions from the exec() or exit() family is 
    // encountered.  Therefore, since we want this intermediate process to be 
    // able to execute concurrently with the parent dakota and other asynch
    // processes, fork should be used here since there is no matching exec().
    if (!block_flag)
      pid = fork();

    if (pid == 0) { // if nonblocking, then this is the intermediate (1st level
      // child) process.  If blocking, then no fork has yet been performed, and
      // this is the parent (default pid == 0).

      // run the input filter by reforking the child process (2nd level child).
      // This refork is always blocking.  The ifilter is used just once per
      // evaluation since it is responsible for non-replicated pre-processing.
      // Any replicated pre-processing must be part of the analysis drivers
      // (see DirectFnApplicInterface::derived_map for additional info).
      if (!ifilter_name.isNull()) {
        forkSimulator.argument_list(0, ifilter_name);
        forkSimulator.argument_list(2, mod_res_filename);
        forkSimulator.fork_program(BLOCK);
      }

      // run the simulator programs by reforking the child process again
      // (additional 2nd level children).  These reforks run a blocking schedule
      // (i.e., while jobs within the schedule may be nonblocking, the schedule
      // itself does not complete until all analyses are completed).  Need for a
      // nonblocking schedule is not currently anticipated, since the 1st level
      // fork provides the nonblocking evaluations needed for nonblocking
      // synchronization by certain iterators.
      if (asynchLocalAnalysisFlag) // asynch w/ concurrency limit>1 or unlimited
        asynchronous_local_analyses(1, numAnalysisDrivers, 1);
      else
        synchronous_local_analyses(1, numAnalysisDrivers, 1);

      // run the output filter by reforking the child process again (another 2nd
      // level child).  This refork is always blocking.  The ofilter is used
      // just once per evaluation since it is responsible for non-replicated
      // post-processing.  Any replicated post-processing must be part of the
      // analysis drivers (see DirectFnApplicInterface::derived_map for
      // additional info).
      if (!ofilter_name.isNull()) {
        forkSimulator.argument_list(0, ofilter_name);
        forkSimulator.argument_list(2, mod_res_filename);
        forkSimulator.fork_program(BLOCK);
      }

      // If nonblocking, then this is the 1st level child process.  Quit this
      // process now.
      if (!block_flag)
        _exit(1);
    }
  }
#else
  Cerr << "Error: Fork application interfaces not supported under this OS." 
       << endl;
  abort_handler(-1);
#endif // TFLOPS_COMPUTE/CPLANT_COMPUTE

  return(pid);
}


// ---------------------------------------------------
// Begin analysis level schedulers (eventually derived
// fns for ApplicationInterface analysis schedulers)
// ---------------------------------------------------

/** Schedule analyses asynchronously on the local processor using a
    self-scheduling approach (start to end in step increments).
    Concurrency is limited by asynchLocalAnalysisConcurrency.  Modeled
    after ApplicationInterface::asynchronous_local_evaluations().
    NOTE: This function should be elevated to ApplicationInterface if
    and when another derived interface class supports asynchronous
    local analyses. */
void ForkApplicInterface::
asynchronous_local_analyses(const int& start, const int& end, const int& step)
{
  DakotaList<pid_t> pid_list;         // list of process id's for asynch jobs
  DakotaIntList     analysis_id_list; // list of analysis id's for asynch jobs
  size_t i, num_sends;
  if (numAnalysisDrivers <= 1) {
    Cerr << "Error: fork_application_asynchronous_local should only be called "
	 << "for multiple analysis_drivers." << endl;
    abort_handler(-1);
  }
  int analysis_id, num_jobs = 1 + (int)((end-start)/step);

  if (asynchLocalAnalysisConcurrency)  // concurrency limited by user
    num_sends = (asynchLocalAnalysisConcurrency < num_jobs) ? 
      asynchLocalAnalysisConcurrency : num_jobs;
  else // default: no limit, launch all jobs in first pass
    num_sends = num_jobs; // don't need to limit num_sends to 1 in the message
    // passing case since this fn is only called by the message passing
    // schedulers if there is asynchLocalAnalysisConcurrency

#ifdef MPI_DEBUG
  Cout << "First pass: initiating " << num_sends << " asynchronous analyses\n";
#endif // MPI_DEBUG
  for (i=0; i<num_sends; i++) {
    analysis_id = start + i*step;
#ifdef MPI_DEBUG
    Cout << "Initiating analysis " << analysis_id << endl; // flush buffer
#endif // MPI_DEBUG
    forkSimulator.argument_list(0, analysisDrivers[analysis_id-1]);
    forkSimulator.tag_argument_list(2, analysis_id);
    pid_t pid = forkSimulator.fork_program(FALL_THROUGH);
    pid_list.insert(pid);
    analysis_id_list.insert(analysis_id);
  }

#ifdef MPI_DEBUG
  if (num_sends < num_jobs)
    Cout << "Second pass: self-scheduling " << num_jobs-num_sends 
         << " remaining analyses\n";
#endif // MPI_DEBUG
  size_t send_cntr = num_sends, recv_cntr = 0;
  while (recv_cntr < num_jobs) {
#ifdef MPI_DEBUG
    Cout << "Waiting on completed analyses" << endl;
#endif // MPI_DEBUG

    // Enforce scheduling fairness with a Waitsome design
    int status, completed = 0;
    pid_t pid = wait(&status); // wait for any analysis to finish
    do { // perform this loop at least once for the pid from wait.
      forkSimulator.check_status(status); // check the exit status
      completed++;
      size_t index = pid_list.index(pid);
#ifdef MPI_DEBUG
      Cout << "Analysis " << analysis_id_list[index] <<" has completed" << endl;
#endif // MPI_DEBUG
      pid_list.removeAt(index);
      analysis_id_list.removeAt(index);
    } while( (pid=waitpid((pid_t)-1, &status, WNOHANG)) > 0 ); // any additional

    recv_cntr += completed;
    for (i=0; i<completed; i++) {
      if (send_cntr < num_jobs) {
        analysis_id = start + send_cntr*step;
#ifdef MPI_DEBUG
        Cout << "Initiating analysis " << analysis_id << endl; // flush buffer
#endif // MPI_DEBUG
        forkSimulator.argument_list(0, analysisDrivers[analysis_id-1]);
        forkSimulator.tag_argument_list(2, analysis_id);
        pid_t pid = forkSimulator.fork_program(FALL_THROUGH);
        pid_list.insert(pid);
        analysis_id_list.insert(analysis_id);
        send_cntr++;
      }
    }
  }
}


/** Run analyses one after another on the local processor, visiting the
    analysis ids start, start+step, ... for as long as the id does not
    exceed end.  Each analysis is executed through
    derived_synchronous_local_analysis().  Modeled after
    ApplicationInterface::synchronous_local_evaluations(). */
void ForkApplicInterface::
synchronous_local_analyses(const int& start, const int& end, const int& step)
{
  int current_id = start;
  while (current_id <= end) {
    derived_synchronous_local_analysis(current_id);
    current_id += step;
  }
}


/** This code runs multiple asynch analyses on each server.  It is
    modeled after ApplicationInterface::serve_evaluations_asynch().
    The analysis id of each incoming job is taken from the MPI tag of
    the received message; a tag of zero ends the acceptance of new
    jobs, and the function returns once all launched jobs have been
    reaped.  Each completion is reported with a blocking send of a
    return code tagged with the analysis id.  The body is compiled
    only when USE_MPI is defined.

    Aborts via abort_handler(-1) if there is only a single analysis
    driver or if a reaped pid does not belong to a job launched here.

    NOTE: This fn should be elevated to ApplicationInterface if and
    when another derived interface class supports hybrid analysis
    parallelism. */
void ForkApplicInterface::serve_analyses_asynch()
{
#ifdef USE_MPI
  if (numAnalysisDrivers <= 1) {
    Cerr << "Error: fork_application_serve_asynch should only be called for "
	 << "multiple analysis_drivers." << endl;
    abort_handler(-1);
  }
  pid_t pid;
  int analysis_id;
  DakotaList<pid_t>    pid_list; // list of process id's for asynch jobs
  DakotaIntList analysis_id_list; // corresponding list of analysis id's
  MPI_Status  status; // holds MPI_SOURCE, MPI_TAG, & MPI_ERROR
  MPI_Request recv_request = MPI_REQUEST_NULL;

  // ----------------------------------------------------------
  // Step 1: block on first message before entering while loops
  // ----------------------------------------------------------
  parallelLib.recv_ea(analysis_id, 0, MPI_ANY_TAG, status);

  do { // main loop

    // -----------------------------------------------------------------
    // Step 2: check for additional incoming messages & execute all jobs
    // -----------------------------------------------------------------
    int mpi_test_flag = 1;
    // check on asynchLocalAnalysisConcurrency limit below only required for
    // static scheduling (self scheduler handles this from the master side).
    // Leave it in for completeness even though static analysis scheduler
    // doesn't use serve fns.
    while (mpi_test_flag && analysis_id &&
           pid_list.entries() < asynchLocalAnalysisConcurrency) {
      // test for completion, but only when a nonblocking receive is actually
      // posted.  The handle must be compared with MPI_REQUEST_NULL rather
      // than tested for truth: the null handle is not guaranteed to be zero,
      // and MPI_Test on a null request reports completion with an empty
      // status, which would replace the tag obtained from the blocking
      // receive in step 1.
      if (recv_request != MPI_REQUEST_NULL)
        MPI_Test(&recv_request, &mpi_test_flag, &status);

      // if test indicates a completion: unpack, execute, & repost
      if (mpi_test_flag) {
        analysis_id = status.MPI_TAG;

        if (analysis_id) {
	  // execute
          forkSimulator.argument_list(0, analysisDrivers[analysis_id-1]);
          forkSimulator.tag_argument_list(2, analysis_id);
          pid = forkSimulator.fork_program(FALL_THROUGH);
          pid_list.insert(pid);
          analysis_id_list.insert(analysis_id);
	  // repost
          parallelLib.irecv_ea(analysis_id, 0, MPI_ANY_TAG, recv_request);
	}
      }
    }

    // -----------------------------------------------------------------
    // Step 3: check for any completed jobs and return results to master
    // -----------------------------------------------------------------
    if (pid_list.entries()) {
      int wait_status, rtn_code = 0;
      while( (pid=waitpid((pid_t)-1, &wait_status, WNOHANG)) > 0 ) {
        forkSimulator.check_status(wait_status); // check the exit status
        size_t index = pid_list.index(pid);
        if (index == _NPOS) {
          // the reaped process is not one of the analyses launched here; do
          // not use the invalid index on the bookkeeping lists
          Cerr << "Error: pid returned from waitpid does not match any "
	       << "analysis process in serve_analyses_asynch." << endl;
          abort_handler(-1);
        }
        analysis_id = analysis_id_list[index];
#ifdef MPI_DEBUG
        Cout << "Analysis " << analysis_id <<" has completed" << endl; // flush
#endif // MPI_DEBUG
	// In this case, use a blocking send to avoid having to manage waits on
	// multiple send buffers (which would be a pain since the number of
	// send_buffers would vary with num_completed).
        parallelLib.send_ea(rtn_code, 0, analysis_id);
        pid_list.removeAt(index);
        analysis_id_list.removeAt(index);
      }
    }

  } while (analysis_id || pid_list.entries());

#endif // USE_MPI
}
