/*!\file:  IssmMpiDenseMat.h
 * \brief implementation of the parallel dense ISSM matrix. Internally, the parallel dense matrix is 
 * split by rows across the cpus. The block of rows held by each cpu is fully 
 * dense, and is stored in a linear buffer of type doubletype. 
 * This object needs to answer the API defined by the virtual functions in IssmAbsMat, 
 * and to provide the constructors required by IssmMat (see IssmMat.h)
 */ 

#ifndef _ISSM_MPI_DENSE_MAT_H_
#define _ISSM_MPI_DENSE_MAT_H_

/*Headers:*/
/*{{{*/
#ifdef HAVE_CONFIG_H
	#include <config.h>
#else
#error "Cannot compile with HAVE_CONFIG_H symbol! run configure first!"
#endif

#include "../../shared/Exceptions/exceptions.h"
#include "../../shared/MemOps/xMemCpy.h"
#include "../../shared/Alloc/alloc.h"
#include "../../include/macros.h"
#include "../../Container/DataSet.h"
#include "../../classes/IssmComm.h"
#include "../../classes/objects/Bucket.h"
#include "../../toolkits/toolkits.h"
#include <math.h>

/*}}}*/

/*We need to template this class, in case we want to create matrices that hold
  either an IssmDouble* buffer or an IssmPDouble* buffer. 
  Such matrices are useful both without and with the matlab or python
  interfaces (which do not care for IssmDouble types, and only rely on
  IssmPDouble types)*/
template <class doubletype> class IssmAbsMat;

template <class doubletype> 
class IssmMpiDenseMat:public IssmAbsMat<doubletype>{

	public:

		int M,N;  //global size: M rows by N columns
		int m;    //local number of rows, as returned by DetermineLocalSize for M on the ISSM communicator
		doubletype* matrix;  /*local rows, stored row after row in a linear buffer of m*N values (entry (i,j) of the local block is matrix[i*N+j]); NULL when m*N is 0. Here, doubletype is either IssmDouble or IssmPDouble*/
		DataSet*    buckets;  /*here, we store the buckets of values collected by SetValues, that Assemble will later distribute to the cpus owning the rows and plug into the local matrices.*/

		/*IssmMpiDenseMat constructors, destructors*/
		/*FUNCTION IssmMpiDenseMat(){{{*/
		IssmMpiDenseMat(){
			this->M=0;
			this->N=0;
			this->m=0;
			this->matrix=NULL;
			this->buckets=new DataSet();
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat(int M,int N){{{*/
		IssmMpiDenseMat(int Min,int Nin){
			this->Init(Min,Nin);
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat(int M,int N, doubletype sparsity){{{*/
		IssmMpiDenseMat(int pM,int pN, doubletype sparsity){
			/*no sparsity involved here, we are fully dense, so just use the previous constructor: */
			this->Init(pM,pN);
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat(int m,int n,int M,int N,int* d_nnz,int* o_nnz){{{*/
		IssmMpiDenseMat(int m,int n,int pM,int pN,int* d_nnz,int* o_nnz){
			/*not needed, we are fully dense!: */
			this->Init(pM,pN);
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat(doubletype* serial_mat,int M,int N,doubletype sparsity){{{*/
		IssmMpiDenseMat(doubletype* serial_mat,int Min,int Nin,doubletype sparsity){

			/*serial_mat is assumed to hold the rows that are local to this cpu, and to have 
			 *the correct size: m rows by N columns, where m is the local number of rows 
			 *given by DetermineLocalSize. sparsity is not used (dense storage).*/
			this->buckets=new DataSet();

			/*Global sizes, and number of rows held by this cpu: */
			this->M=Min;
			this->N=Nin;
			this->m=DetermineLocalSize(this->M,IssmComm::GetComm());

			/*Copy the local rows into a buffer of our own; no buffer at all when there is nothing to hold: */
			const int localsize=this->m*this->N;
			if(localsize==0){
				this->matrix=NULL;
			}
			else{
				this->matrix=xNewZeroInit<doubletype>(localsize);
				xMemCpy<doubletype>(this->matrix,serial_mat,localsize);
			}
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat(int M,int N, int connectivity, int numberofdofspernode){{{*/
		IssmMpiDenseMat(int pM,int pN, int connectivity,int numberofdofspernode){
			/*not needed, we are fully dense!: */
			this->Init(pM,pN);
		}
		/*}}}*/
		/*FUNCTION ~IssmMpiDenseMat(){{{*/
		~IssmMpiDenseMat(){
			/*Release the bucket container and the local buffer: */
			delete this->buckets;
			xDelete<doubletype>(this->matrix);

			/*Reset the sizes: */
			this->M=0;
			this->N=0;
			this->m=0;
		}
		/*}}}*/
		/*FUNCTION IssmMpiDenseMat::Init(int Min,int Nin){{{*/
		void Init(int Min,int Nin){

			/*Container for the values collected by SetValues, before they get assembled: */
			this->buckets=new DataSet();

			/*Global sizes, and number of rows held by this cpu: */
			this->M=Min;
			this->N=Nin;
			this->m=DetermineLocalSize(this->M,IssmComm::GetComm());

			/*Local buffer of m*N values. It is allocated only when it is not empty, 
			 *the pointer stays NULL otherwise: */
			const int localsize=this->m*this->N;
			if(localsize!=0){
				this->matrix=xNewZeroInit<doubletype>(localsize);
			}
			else{
				this->matrix=NULL;
			}
		}
		/*}}}*/

		/*IssmMpiDenseMat specific routines */
		/*FUNCTION Echo{{{*/
		void Echo(void){

			/*Print the local rows of every cpu to stdout. Cpus print one after the other, 
			 *in rank order; the barrier at the end of each turn keeps the others waiting.
			 *NOTE(review): values are printed with %g, which assumes that doubletype can be 
			 *passed to printf as a double - confirm for non plain double types.*/
			const int my_rank=IssmComm::GetRank();
			const int num_procs=IssmComm::GetSize();

			for(int i=0;i<num_procs;i++){
				if(my_rank==i){
					printf("cpu %i #rows: %i\n",i,this->m);
					for(int j=0;j<this->m;j++){
						printf("row %i ",j);
						for(int k=0;k<this->N;k++){
							printf("%g ",this->matrix[j*this->N+k]);
						}
						/*terminate the row, otherwise all the rows run together on a single line: */
						printf("\n");
					}
					/*push our output out before the next cpu gets its turn: */
					fflush(stdout);
				}
				MPI_Barrier(IssmComm::GetComm());
			}

		}
		/*}}}*/
		/*FUNCTION Assemble{{{*/
		void Assemble(){


			int           i;
			int           j;
			int           k;
			int           my_rank;
			int           num_procs;
			int          *RowRank             = NULL;

			DataSet     **bucketspercpu       = NULL;
			int          *bucketspercpu_sizes = NULL;
			MPI_Request  *requests            = NULL;
			MPI_Status   *statuses            = NULL;
			MPI_Status    status;
			int           num_requests        = 0;
			DataSet      *mybuckets           = NULL;
			int           lower_row;
			int           upper_row;
			int           count               = 0;

			int           size;



			/*some communicator info: */
			num_procs=IssmComm::GetSize();
			my_rank=IssmComm::GetRank();
			MPI_Comm comm=IssmComm::GetComm();

			/*First, make a vector of size M, which for each row between 0 and M-1, tells which cpu this row belongs to: */
			RowRank=DetermineRowRankFromLocalSize(M,m,comm);

			/*Now, sort out our dataset of buckets according to cpu ownership of rows: */
			bucketspercpu=xNew<DataSet*>(num_procs);
			bucketspercpu_sizes=xNew<int>(num_procs);

			for(i=0;i<num_procs;i++){
				DataSet* bucketsofcpu_i=new DataSet();
				for (j=0;j<buckets->Size();j++){
					Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
					bucket->SpawnBucketsPerCpu(bucketsofcpu_i,i,RowRank);
				}
				bucketspercpu[i]=bucketsofcpu_i;
				bucketspercpu_sizes[i]=bucketsofcpu_i->Size();
			}

			/*Recap, each cpu has num_procs datasets of buckets. For a certain cpu j, for a given dataset i, the buckets this 
			 * dataset owns correspond to rows that are owned by cpu i, not j!:*/

			/*First, figure out how many requests are going to be sent by MPI_Isend. Do this a little bit better? */
			for(i=0;i<num_procs;i++){
				if(i!=my_rank){
					num_requests+=bucketspercpu[i]->Size()*MATRIXBUCKETSIZEOFREQUESTS; //this is to take into account all the MPI_ISend calls in each bucket.
					num_requests++; //this is to take into account on MPI_ISend in BucketsSend.
				}
			}

			/*Initialize array to track requests and statuses: */
			requests=new MPI_Request[num_requests];
			statuses=new MPI_Status[num_requests];

			/*Now, go through all our bucketspercpu datasets, and send them to the corresponding cpus. Do not send our own buckets though!: */
			count=0; //count requests
			for(i=0;i<num_procs;i++){
				if(my_rank==i){
					for(j=0;j<num_procs;j++){
						if(j!=i){//only send the buckets that this cpu does not own.
						
							/*Go through the buckets belonging to cpu j, and send them accordingly. */
							DataSet* buckets=bucketspercpu[j];
							MPI_Isend(bucketspercpu_sizes+j,1,MPI_INT,j,1,comm,requests+count); count++; //we use bucketspercpu_sizes because we need a permanent buffer for an asynchronous send
							for(k=0;k<buckets->Size();k++){
								Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(k);
								bucket->Isend(j,requests,&count,comm);
							}
						}
					}
				}
				else{
							
					/*Receive buckets from cpu i, and add them to my own my_rank bucket list: */
					/*First, are we receiving anything from sender_rank? :*/
					MPI_Recv(&size,1, MPI_INT,i,1, comm, &status);

					/*If so, started receiving extra buckets and plug them into out buckets: */
					if(size){
						for(j=0;j<size;j++){
							Bucket<doubletype>* bucket=new Bucket<doubletype>();
							bucket->Recv(i,comm);
							bucketspercpu[my_rank]->AddObject(bucket);
						}
					}
				}
			}
			/*Wait for all requests to complete: */
			MPI_Waitall(num_requests,requests,statuses);

			/*Every cpu now has a dataset of buckets  in bucketspercpu[my_rank], which holds all the values 
			 *local to this cpu that should be added to the global matrix. Just do that: */
			GetOwnershipBoundariesFromRange(&lower_row,&upper_row,m,comm);
			mybuckets=bucketspercpu[my_rank];

			for(i=0;i<mybuckets->Size();i++){
				Bucket<doubletype>* bucket=(Bucket<doubletype>*)mybuckets->GetObjectByOffset(i);
				bucket->SetLocalMatrixValues(this->matrix,lower_row,N);
			}

			/*Free ressources:{{{*/
			xDelete<int>(RowRank);
			for(i=0;i<num_procs;i++){
				DataSet* buckets=bucketspercpu[i];
				delete buckets;
			}
			xDelete<DataSet*>(bucketspercpu);
			xDelete<int>(bucketspercpu_sizes);
			xDelete<MPI_Request>(requests);
			/*}}}*/
		}
		/*}}}*/
		/*FUNCTION Norm{{{*/
		doubletype Norm(NormMode mode){

			/*Norm of the global matrix, returned on every cpu. Only NORM_INF (largest sum of 
			 *absolute values along a row) is supported, any other mode raises an error. 
			 *This routine must be called by all the cpus of the communicator.
			 *NOTE(review): the reduction is carried out with MPI_DOUBLE, which assumes that 
			 *doubletype has the memory layout of a double - confirm for other types.*/
			doubletype norm,local_norm;
			doubletype absolute;
			int i,j;

			switch(mode){
				case NORM_INF:
					/*Largest absolute row sum among the rows held by this cpu. The local buffer only 
					 *holds m rows, so we loop on m and not on the global number of rows M: */
					local_norm=0;
					for(i=0;i<this->m;i++){
						absolute=0;
						for(j=0;j<this->N;j++){
							absolute+=fabs(this->matrix[N*i+j]);
						}
						local_norm=max(local_norm,absolute);
					}
					/*Largest value across all cpus, made available on all cpus: */
					MPI_Allreduce(&local_norm,&norm,1,MPI_DOUBLE,MPI_MAX,IssmComm::GetComm());
					return norm;
				default:
					_error_("unknown norm !");
					break;
			}
		}
		/*}}}*/
		/*FUNCTION GetSize{{{*/
		void GetSize(int* pM,int* pN){
			*pM=M;
			*pN=N;
		}
		/*}}}*/
		/*FUNCTION GetLocalSize{{{*/
		void GetLocalSize(int* pM,int* pN){
			*pM=m;
			*pN=N;
		}
		/*}}}*/
		/*FUNCTION MatMult{{{*/
		void MatMult(IssmAbsVec<doubletype>* Xin,IssmAbsVec<doubletype>* AXin){

			/*This product is only implemented for 'mpi' vectors: */
			if(IssmVecTypeFromToolkitOptions()!=MpiEnum)_error_("MatMult operation only possible with 'mpi' vectors");

			/*The check above is what allows us to cast both vectors: */
			IssmMpiVec<doubletype>* X=(IssmMpiVec<doubletype>*)Xin;
			IssmMpiVec<doubletype>* AX=(IssmMpiVec<doubletype>*)AXin;

			/*Get a serial version of the input vector (we are responsible for freeing it): */
			doubletype* xserial=X->ToMPISerial();

			/*Multiply each one of our local rows by the serial vector. The result is accumulated 
			 *on top of the values that AX->vector already holds, it does not overwrite them: */
			for(int row=0;row<this->m;row++){
				for(int col=0;col<this->N;col++){
					AX->vector[row]+=this->matrix[row*N+col]*xserial[col];
				}
			}

			/*Free resources: */
			xDelete<doubletype>(xserial);
		}
		/*}}}*/
		/*FUNCTION Duplicate{{{*/
		IssmMpiDenseMat<doubletype>* Duplicate(void){
			/*New matrix with the same global size, holding a copy of our local rows (the sparsity 
			 *argument is not used by the constructor, 0 is passed). The buckets collected 
			 *so far are not copied. The caller owns the returned matrix: */
			return new IssmMpiDenseMat<doubletype>(this->matrix,this->M,this->N,0);
		}
		/*}}}*/
		/*FUNCTION ToSerial{{{*/
		doubletype* ToSerial(){
			/*Serialization of the parallel matrix is not implemented, 
			 *calling this routine always raises an error: */
			_error_("not supported yet!");
		}
		/*}}}*/
		/*FUNCTION SetValues{{{*/
		void SetValues(int min,int* idxm,int nin,int* idxn,doubletype* values,InsMode mode){

			/*we need to store all the values we collect here in order to Assemble later. 
			 * Indeed, the values we are collecting here most of the time will not belong 
			 * to us, but to another part of the matrix on another cpu: */
			_assert_(buckets);

			buckets->AddObject(new Bucket<doubletype>(min,idxm,nin,idxn,values,mode));

		}
		/*}}}*/
		/*FUNCTION Convert{{{*/
		void Convert(MatrixType type){
			/*Conversion to another matrix type is not implemented, 
			 *calling this routine always raises an error, whatever the type: */
			_error_("not supported yet!");
		}
		/*}}}*/		
};
							

#endif //#ifndef _ISSM_MPI_DENSE_MAT_H_
