/*!\file:  IssmMpiVec.h
 * \brief implementation of the parallel dense ISSM vector. Internally, the parallel dense vector is 
 * split by rows across the cpus. The subset of rows held by each cpu is fully 
 * dense, and is stored in a linear buffer of type doubletype. 
 * This object needs to implement the API defined by the virtual functions in IssmAbsVec, 
 * and the constructors required by IssmVec (see IssmVec.h)
 */ 

#ifndef _ISSM_MPI_VEC_H_
#define _ISSM_MPI_VEC_H_

/*Headers:*/
/*{{{*/
#ifdef HAVE_CONFIG_H
	#include <config.h>
#else
#error "Cannot compile with HAVE_CONFIG_H symbol! run configure first!"
#endif

#include "../../shared/Exceptions/exceptions.h"
#include "../../shared/MemOps/xMemCpy.h"
#include "../../shared/Alloc/alloc.h"
#include "../../include/macros.h"
#include "../../io/io.h"
#ifdef _HAVE_MPI_
#include "../mpi/mpiincludes.h"
#endif
#include <math.h>

/*}}}*/

/*This class is templated so that vectors can hold either IssmDouble or IssmPDouble values. 
  Both variants are useful: code running with or without the matlab or python interfaces does not care about 
  IssmDouble types, and only relies on IssmPDouble types.*/
template <class doubletype> class IssmAbsVec;

template <class doubletype> 
class IssmMpiVec:public IssmAbsVec<doubletype>{

	public:

		int M; //global size of the vector (number of rows over the whole communicator, see Init)
		int m; //local number of rows, i.e. length of the local buffer held by this cpu
		doubletype* vector;  /*local dense buffer of m values (left NULL by Init when m is 0). Here, doubletype is either IssmDouble or IssmPDouble*/
		DataSet*    buckets;  /*here, we store buckets of values (queued by SetValue and SetValues) that we will Assemble into a global vector.*/

		/*IssmMpiVec constructors, destructors*/
		/*FUNCTION IssmMpiVec(){{{*/
		IssmMpiVec(){

			this->M=0;
			this->m=0;
			this->vector=NULL;
			this->buckets=new DataSet();
		}
		/*}}}*/
		/*FUNCTION IssmMpiVec(int M){{{*/
		/*Build a zero initialized vector from its global size Min. The local size 
		 * is worked out by Init.*/
		IssmMpiVec(int Min){

			const bool fromlocalsize=false; //Min is a global size here

			this->Init(Min,fromlocalsize);
		}
		/*}}}*/
		/*FUNCTION IssmMpiVec(int M,bool fromlocalsize){{{*/
		/*Build a zero initialized vector from a size Min, which Init interprets as the 
		 * local size when fromlocalsize is true, and as the global size otherwise.*/
		IssmMpiVec(int Min, bool fromlocalsize){

			/*All the work (sizes, buckets, allocation) is delegated to Init: */
			this->Init(Min,fromlocalsize);

		}
		/*}}}*/
		/*FUNCTION IssmMpiVec(doubletype* serial_vec,int M){{{*/
		/*Build a vector of global size Min, and fill the local part of the vector with 
		 * the first m values found in buffer (m being the local size computed by Init). 
		 * buffer is only read, it is not owned by the new vector.*/
		IssmMpiVec(doubletype* buffer,int Min){

			/*Init sets M and m, creates the buckets, and already allocates a zeroed 
			 * local buffer of size m when m is not 0: */
			this->Init(Min,false);

			/*Copy values into the buffer allocated by Init. Do not allocate a second buffer 
			 * here, the one coming from Init would be leaked. We test the local size and not 
			 * the global size, as this cpu might not own any row even if M is not 0: */
			if(this->m){
				xMemCpy<doubletype>(this->vector,buffer,this->m);
			}
		}
		/*}}}*/
		/*FUNCTION IssmMpiVec::Init(int Min,bool fromlocalsize){{{*/
		/*Common initialization for the constructors: creates the bucket container, 
		 * sets the global (M) and local (m) sizes, and allocates a zero initialized 
		 * local buffer. 
		 *   Min:           size provided by the user 
		 *   fromlocalsize: true if Min is the local size, false if it is the global size*/
		void Init(int Min,bool fromlocalsize){

			/*Container for the values collected before Assemble: */
			this->buckets=new DataSet();

			/*One of the two sizes is given, deduce the other one using the communicator: */
			if(!fromlocalsize){
				this->M=Min;
				this->m=DetermineLocalSize(this->M,IssmComm::GetComm());
			}
			else{
				this->m=Min;
				this->M=DetermineGlobalSize(this->m,IssmComm::GetComm());
			}

			/*Local storage: zeroed buffer if we own rows, NULL otherwise: */
			if(this->m){
				this->vector=xNewZeroInit<doubletype>(this->m);
			}
			else{
				this->vector=NULL;
			}
		}
		/*}}}*/
		/*FUNCTION ~IssmMpiVec(){{{*/
		/*Destructor: releases the local buffer and the bucket container, and resets the sizes.*/
		~IssmMpiVec(){
			xDelete<doubletype>(this->vector);
			this->M=0;
			this->m=0; //the local size is m: this class has no member called n
			delete this->buckets;
		}
		/*}}}*/

		/*IssmMpiVec specific routines*/
		/*FUNCTION Echo{{{*/
		/*Print the local rows of every cpu on standard output, one cpu after the other 
		 * (in rank order). Row indices printed are local to each cpu.*/
		void Echo(void){

			int i,j;

			/*Do a synchronized dump across all the rows: every cpu goes through the same loop, 
			 * only prints when its turn comes, and waits on a barrier at the end of each turn: */
			for(i=0;i<IssmComm::GetSize();i++){
				if (IssmComm::GetRank()==i){
					printf("cpu %i #rows: %i\n",i,this->m);
					for (j=0;j<this->m;j++){
						/*one row per line (otherwise all rows and the next cpu header end up on the same line): */
						printf("row %i %g\n",j,this->vector[j]);
					}
				}
				MPI_Barrier(IssmComm::GetComm());
			}
		}
		/*}}}*/
		/*FUNCTION Assemble{{{*/
		void Assemble(){


			int           i,j;

			int         *RowRank            = NULL;
			int           num_procs;

			int        *row_indices_forcpu = NULL;
			int        *col_indices_forcpu = NULL;
			int        *modes_forcpu       = NULL;
			doubletype *values_forcpu      = NULL;
			int         *numvalues_forcpu   = NULL;
			DataSet     **bucketsforcpu       = NULL;

			int        **row_indices_fromcpu = NULL;
			int        **col_indices_fromcpu = NULL;
			int        **modes_fromcpu       = NULL;
			doubletype **values_fromcpu      = NULL;
			int         *numvalues_fromcpu   = NULL;

			int           lower_row;
			int           upper_row;
			int*          sendcnts            = NULL;
			int*          displs              = NULL;
			int           count               = 0;

			/*some communicator info: */
			num_procs=IssmComm::GetSize();
			MPI_Comm comm=IssmComm::GetComm();


			/*First, make a vector of size M, which for each row between 0 and M-1, tells which cpu this row belongs to: */
			RowRank=DetermineRowRankFromLocalSize(M,m,comm);


			/*Now, sort out our dataset of buckets according to cpu ownership of rows: {{{*/
			bucketsforcpu=xNew<DataSet*>(num_procs);

			for(i=0;i<num_procs;i++){
				DataSet* bucketsofcpu_i=new DataSet();
				for (j=0;j<buckets->Size();j++){
					Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
					bucket->SpawnBucketsPerCpu(bucketsofcpu_i,i,RowRank);
				}
				bucketsforcpu[i]=bucketsofcpu_i;
			}
			/*}}}*/

			/*Recap, each cpu has num_procs datasets of buckets. For a certain cpu j, for a given dataset i, the buckets this  {{{
			 * dataset owns correspond to rows that are owned by cpu i, not j!. Out of all the buckets we own, make row,col,value,insert_mode 
			 * vectors that will be shipped around the cluster: */
			this->BucketsBuildScatterBuffers(&numvalues_forcpu,&row_indices_forcpu,&values_forcpu,&modes_forcpu,bucketsforcpu,num_procs);
			/*}}}*/

			/*Now, we need to allocate on each cpu arrays to receive data from all the other cpus. To know what we need to allocate, we need  {{{
			 *some scatter calls: */
			numvalues_fromcpu   = xNew<int>(num_procs);
			for(i=0;i<num_procs;i++){
				MPI_Scatter(numvalues_forcpu,1,MPI_INT,numvalues_fromcpu+i,1,MPI_INT,i,comm);
			}
			
			row_indices_fromcpu=xNew<int*>(num_procs);
			values_fromcpu=xNew<doubletype*>(num_procs);
			modes_fromcpu=xNew<int*>(num_procs);
			for(i=0;i<num_procs;i++){
				int size=numvalues_fromcpu[i];
				if(size){
					row_indices_fromcpu[i]=xNew<int>(size);
					values_fromcpu[i]=xNew<doubletype>(size);
					modes_fromcpu[i]=xNew<int>(size);
				}
				else{
					row_indices_fromcpu[i]=NULL;
					values_fromcpu[i]=NULL;
					modes_fromcpu[i]=NULL;
				}
			}
			/*}}}*/

			/*Scatter values around: {{{*/
			/*Now, to scatter values across the cluster, we need sendcnts and displs. Our sendbufs have been built by BucketsBuildScatterBuffers, with a stride given 
			 * by numvalues_forcpu. Get this ready to go before starting the scatter itslef. For reference, here is the MPI_Scatterv prototype: 
			 * int MPI_Scatterv( void *sendbuf, int *sendcnts, int *displs, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) :*/
			sendcnts=xNew<int>(num_procs);
			displs=xNew<int>(num_procs);
			count=0;
			for(i=0;i<num_procs;i++){
				sendcnts[i]=numvalues_forcpu[i];
				displs[i]=count;
				count+=numvalues_forcpu[i];
			}

			for(i=0;i<num_procs;i++){
				MPI_Scatterv( row_indices_forcpu, sendcnts, displs, MPI_INT, row_indices_fromcpu[i], numvalues_fromcpu[i], MPI_INT, i, comm);
				MPI_Scatterv( values_forcpu, sendcnts, displs, MPI_DOUBLE, values_fromcpu[i], numvalues_fromcpu[i], MPI_DOUBLE, i, comm);
				MPI_Scatterv( modes_forcpu, sendcnts, displs, MPI_INT, modes_fromcpu[i], numvalues_fromcpu[i], MPI_INT, i, comm);
			}
			/*}}}*/

			/*Plug values into global vector: {{{*/
			GetOwnershipBoundariesFromRange(&lower_row,&upper_row,m,comm);
			for(i=0;i<num_procs;i++){
				int  numvalues=numvalues_fromcpu[i];
				int* rows=row_indices_fromcpu[i];
				doubletype* values=values_fromcpu[i];
				int* mods=modes_fromcpu[i];

				for(j=0;j<numvalues;j++){
					if(mods[j]==ADD_VAL) *(vector+(rows[j]-lower_row))+=values[j];
					else *(vector+(rows[j]-lower_row))=values[j];
				}
			}
			/*}}}*/

			/*Free ressources:{{{*/
			xDelete<int>(RowRank);
			xDelete<int>(row_indices_forcpu);
			xDelete<int>(modes_forcpu);
			xDelete<doubletype>(values_forcpu);
			xDelete<int>(numvalues_forcpu);
			
			for(i=0;i<num_procs;i++){
				DataSet* buckets=bucketsforcpu[i];
				delete buckets;
			}
			xDelete<DataSet*>(bucketsforcpu);

			for(i=0;i<num_procs;i++){
				int* rows=row_indices_fromcpu[i];
				int* modes=modes_fromcpu[i];
				doubletype* values=values_fromcpu[i];

				xDelete<int>(rows);
				xDelete<int>(modes);
				xDelete<doubletype>(values);
			}
			xDelete<int*>(row_indices_fromcpu);
			xDelete<int*>(modes_fromcpu);
			xDelete<doubletype*>(values_fromcpu);
			xDelete<int>(numvalues_fromcpu);
			
			xDelete<int>(sendcnts);
			xDelete<int>(displs);
			/*}}}*/


		}
		/*}}}*/
		/*FUNCTION Assemble2{{{*/
		void Assemble2(){

			int           i;
			int           j;
			int           k;
			int           my_rank;
			int           num_procs;
			int          *RowRank             = NULL;

			DataSet     **bucketspercpu       = NULL;
			int          *bucketspercpu_sizes = NULL;
			MPI_Request  *requests            = NULL;
			MPI_Status   *statuses            = NULL;
			MPI_Status    status;
			int           num_requests        = 0;
			DataSet      *mybuckets           = NULL;
			int           lower_row;
			int           upper_row;
			int           count               = 0;

			int           size;

			/*some communicator info: */
			num_procs=IssmComm::GetSize();
			my_rank=IssmComm::GetRank();
			MPI_Comm comm=IssmComm::GetComm();

			/*First, make a vector of size M, which for each row between 0 and M-1, tells which cpu this row belongs to: */
			RowRank=DetermineRowRankFromLocalSize(M,m,comm);

			/*Now, sort out our dataset of buckets according to cpu ownership of rows: */
			bucketspercpu=xNew<DataSet*>(num_procs);
			bucketspercpu_sizes=xNew<int>(num_procs);

			for(i=0;i<num_procs;i++){
				DataSet* bucketsofcpu_i=new DataSet();
				for (j=0;j<buckets->Size();j++){
					Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
					bucket->SpawnBucketsPerCpu(bucketsofcpu_i,i,RowRank);
				}
				bucketspercpu[i]=bucketsofcpu_i;
				bucketspercpu_sizes[i]=bucketsofcpu_i->Size();
			}

			/*Recap, each cpu has num_procs datasets of buckets. For a certain cpu j, for a given dataset i, the buckets this 
			 * dataset owns correspond to rows that are owned by cpu i, not j!:*/

			/*First, figure out how many requests are going to be sent by MPI_Isend. Do this a little bit better? */
			for(i=0;i<num_procs;i++){
				if(i!=my_rank){
					num_requests+=bucketspercpu[i]->Size()*VECTORBUCKETSIZEOFREQUESTS; //this is to take into account all the MPI_ISend calls in each bucket.
					num_requests++; //this is to take into account on MPI_ISend in BucketsSend.
				}
			}

			/*Initialize array to track requests and statuses: */
			requests=new MPI_Request[num_requests];
			statuses=new MPI_Status[num_requests];

			/*Now, go through all our bucketspercpu datasets, and send them to the corresponding cpus. Do not send our own buckets though!: */
			count=0; //count requests
			for(i=0;i<num_procs;i++){
				if(my_rank==i){
					for(j=0;j<num_procs;j++){
						if(j!=i){//only send the buckets that this cpu does not own.

							/*Go through the buckets belonging to cpu j, and send them accordingly. */
							DataSet* buckets=bucketspercpu[j];
							MPI_Isend(bucketspercpu_sizes+j,1,MPI_INT,j,1,comm,requests+count); count++; //we use bucketspercpu_sizes because we need a permanent buffer for an asynchronous send
							for(k=0;k<buckets->Size();k++){
								Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(k);
								bucket->Isend(j,requests,&count,comm);
							}
						}
					}
				}
				else{

					/*Receive buckets from cpu i, and add them to my own my_rank bucket list: */
					/*First, are we receiving anything from sender_rank? :*/
					MPI_Recv(&size,1, MPI_INT,i,1, comm, &status);

					/*If so, started receiving extra buckets and plug them into out buckets: */
					if(size){
						for(j=0;j<size;j++){
							Bucket<doubletype>* bucket=new Bucket<doubletype>();
							bucket->Recv(i,comm);
							bucketspercpu[my_rank]->AddObject(bucket);
						}
					}
				}
			}
			/*Wait for all requests to complete: */
			MPI_Waitall(num_requests,requests,statuses);

			/*Every cpu now has a dataset of buckets  in bucketspercpu[my_rank], which holds all the values 
			 *local to this cpu that should be added to the global matrix. Just do that: */
			GetOwnershipBoundariesFromRange(&lower_row,&upper_row,m,comm);
			mybuckets=bucketspercpu[my_rank];

			for(i=0;i<mybuckets->Size();i++){
				Bucket<doubletype>* bucket=(Bucket<doubletype>*)mybuckets->GetObjectByOffset(i);
				bucket->SetLocalVectorValues(this->vector,lower_row);
			}

			/*Free ressources:{{{*/
			xDelete<int>(RowRank);
			for(i=0;i<num_procs;i++){
				DataSet* buckets=bucketspercpu[i];
				delete buckets;
			}
			xDelete<DataSet*>(bucketspercpu);
			xDelete<int>(bucketspercpu_sizes);
			xDelete<MPI_Request>(requests);
			/*}}}*/
		}
		/*}}}*/
		/*FUNCTION SetValues{{{*/
		void SetValues(int ssize, int* list, doubletype* values, InsMode mode){

			/*we need to store all the values we collect here in order to Assemble later. 
			 * Indeed, the values we are collecting here most of the time will not belong 
			 * to us, but to another part of the vector on another cpu: */
			_assert_(buckets);

			buckets->AddObject(new Bucket<doubletype>(ssize, list, values, mode));

		}
		/*}}}*/
		/*FUNCTION SetValue{{{*/
		void SetValue(int dof, doubletype value, InsMode mode){

			/*we need to store the value we collect here in order to Assemble later. 
			 * Indeed, the value we are collecting here most of the time will not belong 
			 * to us, but to another part of the vector on another cpu: */
			_assert_(buckets);

			buckets->AddObject(new Bucket<doubletype>(1,&dof,&value, mode));
		}
		/*}}}*/
		/*FUNCTION GetValue{{{*/
		/*Not supported for this type of vector: always raises an error, pvalue is never written.*/
		void GetValue(doubletype* pvalue,int dof){

			_error_("Get value on a MpiVec vector not implemented yet!");

		}
		/*}}}*/
		/*FUNCTION GetSize{{{*/
		/*Return the global size M of the vector in *pM.*/
		void GetSize(int* pM){
			int globalsize=this->M;
			*pM=globalsize;
		}
		/*}}}*/
		/*FUNCTION GetLocalSize{{{*/
		/*Return the local size m (number of rows held by this cpu) in *pM.*/
		void GetLocalSize(int* pM){
			int localsize=this->m;
			*pM=localsize;
		}
		/*}}}*/
		/*FUNCTION Duplicate{{{*/
		/*Return a new vector (owned by the caller) with the same global size, the same local size 
		 * and the same local values as this one. Buckets that have not been assembled yet are not 
		 * duplicated.*/
		IssmMpiVec<doubletype>* Duplicate(void){

			/*Start from an empty vector (which comes with its own empty buckets), and copy our layout 
			 * as is. We do not recompute the local size from M: it would not necessarily match this->m 
			 * (for a vector created from its local size), and we would then read this->vector out of bounds: */
			IssmMpiVec<doubletype>* duplicate=new IssmMpiVec<doubletype>();

			duplicate->M=this->M;
			duplicate->m=this->m;

			/*Copy local values, if we have some: */
			if(this->m){
				duplicate->vector=xNew<doubletype>(this->m);
				xMemCpy<doubletype>(duplicate->vector,this->vector,this->m);
			}

			return duplicate;
		}
		/*}}}*/
		/*FUNCTION Set{{{*/
		/*Set all the local rows of the vector to value.*/
		void Set(doubletype value){

			doubletype* local=this->vector;

			for(int row=0;row<this->m;row++){
				local[row]=value;
			}
		}
		/*}}}*/
		/*FUNCTION AXPY{{{*/
		/*y=a*x+y, where y is this vector and x is Xin. Only the local rows are touched, 
		 * Xin is expected to be an IssmMpiVec holding at least m local rows.*/
		void AXPY(IssmAbsVec<doubletype>* Xin, doubletype a){

			/*Assume X is of the correct type, and downcast: */
			IssmMpiVec<doubletype>* X=static_cast<IssmMpiVec<doubletype>*>(Xin);

			doubletype* y=this->vector;

			for(int row=0;row<this->m;row++){
				y[row]=a*X->vector[row]+y[row];
			}
		}
		/*}}}*/
		/*FUNCTION AYPX{{{*/
		/*y=x+a*y, where y is this vector and x is Xin. Only the local rows are touched, 
		 * Xin is expected to be an IssmMpiVec holding at least m local rows.*/
		void AYPX(IssmAbsVec<doubletype>* Xin, doubletype a){

			/*Assume X is of the correct type, and downcast: */
			IssmMpiVec<doubletype>* X=static_cast<IssmMpiVec<doubletype>*>(Xin);

			doubletype* y=this->vector;

			for(int row=0;row<this->m;row++){
				y[row]=X->vector[row]+a*y[row];
			}
		}
		/*}}}*/
		/*FUNCTION ToMPISerial{{{*/
		/*Gather the whole vector on every cpu. Returns a newly allocated buffer of size M, which 
		 * the caller must release. Collective operation (MPI_Allgather and MPI_Allgatherv), to 
		 * be called by all the cpus of the communicator.*/
		doubletype* ToMPISerial(void){

			/*communicator info: */
			MPI_Comm comm      = IssmComm::GetComm();
			int      num_procs = IssmComm::GetSize();

			/*first and last rows of our local range (only the first one is used here): */
			int lower_row,upper_row;

			/*Allocate output, and the arrays needed by MPI_Allgatherv: */
			doubletype *buffer     = xNew<doubletype>(M);
			int        *recvcounts = xNew<int>(num_procs);
			int        *displs     = xNew<int>(num_procs);

			/*recvcounts: local size of every cpu*/
			MPI_Allgather(&this->m,1,MPI_INT,recvcounts,1,MPI_INT,comm);

			/*displs: first row of every cpu, which is where its values land in buffer*/
			GetOwnershipBoundariesFromRange(&lower_row,&upper_row,this->m,comm);
			MPI_Allgather(&lower_row,1,MPI_INT,displs,1,MPI_INT,comm);

			/*Now every cpu contributes its m local values: */
			MPI_Allgatherv(this->vector, this->m, MPI_DOUBLE, buffer, recvcounts, displs, MPI_DOUBLE,comm);

			/*Clean up and return: */
			xDelete<int>(recvcounts);
			xDelete<int>(displs);
			return buffer;
		}
		/*}}}*/
		/*FUNCTION Copy{{{*/
		/*Copy the global size and the m local values of this vector into toin. toin is expected 
		 * to be an IssmMpiVec whose local buffer can hold at least m values: its local size and 
		 * its buffer are not modified or reallocated here.*/
		void Copy(IssmAbsVec<doubletype>* toin){

			/*Assume toin is of the correct type, and downcast: */
			IssmMpiVec<doubletype>* to=static_cast<IssmMpiVec<doubletype>*>(toin);

			to->M=this->M;

			for(int row=0;row<this->m;row++){
				to->vector[row]=this->vector[row];
			}
		}
		/*}}}*/
		/*FUNCTION Norm{{{*/
		/*Return the norm of the whole vector: 
		 *   NORM_INF: largest absolute value of all the rows 
		 *   NORM_TWO: square root of the sum of the squares of all the rows 
		 * Any other mode raises an error. Collective operation: local results are reduced on cpu 0 
		 * and then broadcast (same approach as in Dot), so that every cpu returns the same value.*/
		doubletype Norm(NormMode mode){

			doubletype local_norm=0;
			doubletype norm=0;
			int i;

			switch(mode){
				case NORM_INF:
					for(i=0;i<this->m;i++){
						/*the infinity norm relies on absolute values, otherwise negative entries are ignored. 
						 *Go through an intermediary of type doubletype, so that max is called with two arguments of the same type: */
						doubletype absvalue=fabs(this->vector[i]);
						local_norm=max(local_norm,absvalue);
					}
					MPI_Reduce(&local_norm, &norm, 1, MPI_DOUBLE, MPI_MAX, 0, IssmComm::GetComm());
					/*MPI_Reduce only fills norm on cpu 0, share it with everybody: */
					MPI_Bcast(&norm,1,MPI_DOUBLE,0,IssmComm::GetComm());
					return norm;
					break;
				case NORM_TWO:
					for(i=0;i<this->m;i++)local_norm+=pow(this->vector[i],2);
					MPI_Reduce(&local_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, IssmComm::GetComm());
					/*MPI_Reduce only fills norm on cpu 0, share it with everybody: */
					MPI_Bcast(&norm,1,MPI_DOUBLE,0,IssmComm::GetComm());
					return sqrt(norm);
					break;
				default:
					_error_("unknown norm !");
					break;
			}
		}
		/*}}}*/
		/*FUNCTION Scale{{{*/
		/*Multiply all the local rows of the vector by scale_factor.*/
		void Scale(doubletype scale_factor){

			/*The local buffer only holds m values (local size). Looping up to the global size M 
			 * would write past the end of the buffer as soon as the vector is split across cpus: */
			for(int i=0;i<this->m;i++)this->vector[i]=scale_factor*this->vector[i];

		}
		/*}}}*/
		/*FUNCTION Dot{{{*/
		/*Return the dot product of this vector with inputin, which is expected to be an IssmMpiVec 
		 * holding at least m local rows. With MPI, the local dot products are summed on cpu 0 and the 
		 * result is broadcast, so that every cpu returns the same value.*/
		doubletype Dot(IssmAbsVec<doubletype>* inputin){

			int i;
			doubletype local_dot=0;
			doubletype dot=0;

			/*Assume inputin is of the correct type, and downcast: */
			IssmMpiVec* input=NULL;

			input=(IssmMpiVec<doubletype>*)inputin;

			/*dot product of the rows we own: */
			for(i=0;i<this->m;i++)local_dot+=this->vector[i]*input->vector[i];

			#ifdef _HAVE_MPI_
			/*MPI_SUM all the dots across the cluster: */
			MPI_Reduce(&local_dot, &dot, 1, MPI_DOUBLE, MPI_SUM, 0, IssmComm::GetComm());
			MPI_Bcast(&dot,1,MPI_DOUBLE,0,IssmComm::GetComm());
			#else
			/*No reduction available: the local dot product is the result (instead of returning 0): */
			dot=local_dot;
			#endif

			return dot;
		}
		/*}}}*/
		/*FUNCTION PointwiseDivide{{{*/
		/*w=x/y row by row, where w is this vector. Only the local rows are touched, xin and yin 
		 * are expected to be IssmMpiVec vectors holding at least m local rows. No check is made 
		 * on the values of y.*/
		void PointwiseDivide(IssmAbsVec<doubletype>* xin,IssmAbsVec<doubletype>* yin){

			/*Assume xin and yin are of the correct type, and downcast: */
			IssmMpiVec<doubletype>* x=static_cast<IssmMpiVec<doubletype>*>(xin);
			IssmMpiVec<doubletype>* y=static_cast<IssmMpiVec<doubletype>*>(yin);

			doubletype* w=this->vector;

			for(int row=0;row<this->m;row++){
				w[row]=x->vector[row]/y->vector[row];
			}
		}
		/*}}}*/
		/*FUNCTION BucketsBuildScatterBuffers{{{*/
		void BucketsBuildScatterBuffers(int** pnumvalues_forcpu,int** prow_indices_forcpu,doubletype** pvalues_forcpu,int** pmodes_forcpu,DataSet** bucketsforcpu,int num_procs){


			/*intermediary: */
			int         i,j;
			int         count                   = 0;
			int         total_size              = 0;
			int        *temp_row_indices_forcpu = NULL;
			doubletype *temp_values_forcpu      = NULL;
			int        *temp_modes_forcpu       = NULL;

			/*output: */
			int        *numvalues_forcpu        = NULL;
			int        *row_indices_forcpu      = NULL;
			doubletype *values_forcpu           = NULL;
			int        *modes_forcpu            = NULL;

			/*figure out size of buffers per cpu: */

			numvalues_forcpu=xNew<int>(num_procs);
			for(i=0;i<num_procs;i++){
				DataSet    *buckets            = bucketsforcpu[i];
				
				count=0;
				for(j=0;j<buckets->Size();j++){
					Bucket<doubletype>* bucket =(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
					count+=bucket->MarshallSize();
				}

				numvalues_forcpu[i]=count;
			}

			/*now, figure out size of  total buffers (for all cpus!): */
			count=0;
			for(i=0;i<num_procs;i++){
				count+=numvalues_forcpu[i];
			}
			total_size=count;

			/*Allocate buffers: */
			row_indices_forcpu = xNew<int>(total_size);
			values_forcpu = xNew<doubletype>(total_size);
			modes_forcpu = xNew<int>(total_size);

			/*we are going to march through the buffers, and marshall data onto them, so in order to not
			 *lose track of where these buffers are located in memory, we are going to work using copies 
			 of them: */
			temp_row_indices_forcpu=row_indices_forcpu;
			temp_values_forcpu=values_forcpu;
			temp_modes_forcpu=modes_forcpu;

			/*Fill buffers: */
			for(i=0;i<num_procs;i++){
				DataSet    *buckets            = bucketsforcpu[i];
				for(j=0;j<buckets->Size();j++){
					Bucket<doubletype>* bucket =(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
					bucket->Marshall(&temp_row_indices_forcpu,&temp_values_forcpu,&temp_modes_forcpu); //pass in the address of the buffers, so as to have the Marshall routine increment them.
				}
			}

			/*sanity check: */
			if (temp_row_indices_forcpu!=row_indices_forcpu+total_size)_error_("problem with marshalling of buckets");
			if (temp_values_forcpu!=values_forcpu+total_size)_error_("problem with marshalling of buckets");
			if (temp_modes_forcpu!=modes_forcpu+total_size)_error_("problem with marshalling of buckets");

			/*output buffers: */
			*pnumvalues_forcpu   = numvalues_forcpu;
			*prow_indices_forcpu = row_indices_forcpu;
			*pvalues_forcpu      = values_forcpu;
			*pmodes_forcpu       = modes_forcpu;
		}
		/*}}}*/		
};
#endif //#ifndef _ISSM_MPI_VEC_H_	
