Index: /issm/trunk-jpl/src/c/classes/objects/Bucket.h
===================================================================
--- /issm/trunk-jpl/src/c/classes/objects/Bucket.h	(revision 14832)
+++ /issm/trunk-jpl/src/c/classes/objects/Bucket.h	(revision 14833)
@@ -175,4 +175,55 @@
 		};
 		/*}}}*/
+		void Marshall(int** prow_indices_forcpu,int** pcol_indices_forcpu,doubletype** pvalues_forcpu,int** pmodes_forcpu){ /*{{{*/
+
+			/*Append this bucket's entries to four caller-owned buffers.
+			 *   prow_indices_forcpu, pcol_indices_forcpu: cursors into the row and column index buffers
+			 *   pvalues_forcpu:                           cursor into the value buffer
+			 *   pmodes_forcpu:                            cursor into the insert mode buffer
+			 *Entries are written row by row starting at each cursor; every cursor is then
+			 *moved m*n entries forward, so that the next Bucket writes right after this one.
+			 *Every buffer must have room for m*n more entries.*/
+
+			/*intermediary: */
+			int         row,col;
+			/*position of the entry being written, equal to row*n+col on every pass: */
+			int         offset = 0;
+
+			/*write cursors, as handed in by the caller: */
+			int        *rows_out   = *prow_indices_forcpu;
+			int        *cols_out   = *pcol_indices_forcpu;
+			doubletype *values_out = *pvalues_forcpu;
+			int        *modes_out  = *pmodes_forcpu;
+
+			/*one (row index, column index, value, mode) quadruplet per entry; the same
+			 *mode is recorded for every entry of the bucket: */
+			for(row=0;row<m;row++){
+				for(col=0;col<n;col++){
+					rows_out[offset]   = idxm[row];
+					cols_out[offset]   = idxn[col];
+					values_out[offset] = values[offset];
+					modes_out[offset]  = mode;
+					offset++;
+				}
+			}
+
+			/*hand the cursors back to the caller, each one moved past the m*n
+			 *entries of this bucket: */
+			*prow_indices_forcpu = rows_out+m*n;
+			*pcol_indices_forcpu = cols_out+m*n;
+			*pvalues_forcpu      = values_out+m*n;
+			*pmodes_forcpu       = modes_out+m*n;
+		};
+		/*}}}*/
+		int MarshallSize(void){ /*{{{*/
+			/*Number of entries this bucket reports for marshalling:
+			 *m*n for a MATRIX_BUCKET, m for any other type.
+			 *NOTE(review): Marshall advances its cursors by m*n whatever the type -
+			 *confirm non-matrix buckets are never marshalled through it.*/
+			if(type==MATRIX_BUCKET) return m*n; /*'==': compare the type, do not overwrite it*/
+
+			return m;
+		};
+		/*}}}*/
 #ifdef _HAVE_MPI_
 			void Isend(int receiver_rank,MPI_Request* requests,int* pcount,MPI_Comm comm){ /*{{{*/
Index: /issm/trunk-jpl/src/c/toolkits/issm/IssmMpiDenseMat.h
===================================================================
--- /issm/trunk-jpl/src/c/toolkits/issm/IssmMpiDenseMat.h	(revision 14832)
+++ /issm/trunk-jpl/src/c/toolkits/issm/IssmMpiDenseMat.h	(revision 14833)
@@ -149,6 +149,6 @@
 		}
 		/*}}}*/
-		/*FUNCTION Assemble{{{*/
-		void Assemble(){
+		/*FUNCTION Assemble2{{{*/
+		void Assemble2(){
 
 			int           i;
@@ -265,4 +265,142 @@
 			xDelete<MPI_Request>(requests);
 			/*}}}*/
+		}
+		/*}}}*/
+		/*FUNCTION Assemble{{{*/
+		void Assemble(){
+			/*Route every buffered bucket entry to the cpu that owns its row, then apply the entries received
+			 *here to the local block. Uses collective MPI calls: all cpus of the communicator must enter it.*/
+			int           i,j;
+
+			int         *RowRank            = NULL;
+			int           num_procs;
+
+			int        *row_indices_forcpu = NULL;
+			int        *col_indices_forcpu = NULL;
+			int        *modes_forcpu       = NULL;
+			doubletype *values_forcpu      = NULL;
+			int         *numvalues_forcpu   = NULL;
+			DataSet     **bucketsforcpu       = NULL;
+
+			int        **row_indices_fromcpu = NULL;
+			int        **col_indices_fromcpu = NULL;
+			int        **modes_fromcpu       = NULL;
+			doubletype **values_fromcpu      = NULL;
+			int         *numvalues_fromcpu   = NULL;
+
+			int           lower_row;
+			int           upper_row;
+			int*          sendcnts            = NULL;
+			int*          displs              = NULL;
+			int           count               = 0;
+
+			/*some communicator info: */
+			num_procs=IssmComm::GetSize();
+			MPI_Comm comm=IssmComm::GetComm();
+
+			/*First, make a vector of size M, which for each row between 0 and M-1, tells which cpu this row belongs to: */
+			RowRank=DetermineRowRankFromLocalSize(M,m,comm);
+
+			/*Now, sort out our dataset of buckets according to cpu ownership of rows: */
+			bucketsforcpu=xNew<DataSet*>(num_procs);
+
+			for(i=0;i<num_procs;i++){
+				DataSet* bucketsofcpu_i=new DataSet();
+				for (j=0;j<buckets->Size();j++){
+					Bucket<doubletype>* bucket=(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
+					bucket->SpawnBucketsPerCpu(bucketsofcpu_i,i,RowRank);
+				}
+				bucketsforcpu[i]=bucketsofcpu_i;
+			}
+
+			/*Recap, each cpu has num_procs datasets of buckets. For a certain cpu j, for a given dataset i, the buckets this
+			 * dataset owns correspond to rows that are owned by cpu i, not j!. Out of all the buckets we own, make row,col,value,insert_mode
+			 * vectors that will be shipped around the cluster: */
+			this->BucketsBuildScatterBuffers(&numvalues_forcpu,&row_indices_forcpu,&col_indices_forcpu,&values_forcpu,&modes_forcpu,bucketsforcpu,num_procs);
+
+			/*Now, we need to allocate on each cpu arrays to receive data from all the other cpus. To know what we need to allocate, we need
+			 *some scatter calls: cpu i is root in turn and hands each cpu the single count it holds for that cpu (hence a send count of 1): */
+			numvalues_fromcpu   = xNew<int>(num_procs);
+			for(i=0;i<num_procs;i++){
+				MPI_Scatter(numvalues_forcpu,1,MPI_INT,numvalues_fromcpu+i,1,MPI_INT,i,comm);
+			}
+
+			/*One receive buffer per sending cpu; the arrays of buffer pointers have to exist before they are indexed: */
+			row_indices_fromcpu = xNew<int*>(num_procs);
+			col_indices_fromcpu = xNew<int*>(num_procs);
+			values_fromcpu      = xNew<doubletype*>(num_procs);
+			modes_fromcpu       = xNew<int*>(num_procs);
+			for(i=0;i<num_procs;i++){
+				row_indices_fromcpu[i]=xNew<int>(numvalues_fromcpu[i]);
+				col_indices_fromcpu[i]=xNew<int>(numvalues_fromcpu[i]);
+				values_fromcpu[i]=xNew<doubletype>(numvalues_fromcpu[i]);
+				modes_fromcpu[i]=xNew<int>(numvalues_fromcpu[i]);
+			}
+
+			/*Now, to scatter values across the cluster, we need sendcnts and displs. Our sendbufs have been built by BucketsBuildScatterBuffers, with a stride given
+			 * by numvalues_forcpu. Get this ready to go before starting the scatter itself. For reference, here is the MPI_Scatterv prototype:
+			 * int MPI_Scatterv( void *sendbuf, int *sendcnts, int *displs, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) :*/
+			sendcnts=xNew<int>(num_procs);
+			displs=xNew<int>(num_procs);
+			count=0;
+			for(i=0;i<num_procs;i++){
+				sendcnts[i]=numvalues_forcpu[i];
+				displs[i]=count;
+				count+=numvalues_forcpu[i];
+			}
+
+			/*Start the scattering: cpu i is root in turn, and every cpu receives from it the slice that was built for that cpu.
+			 *NOTE(review): values travel as MPI_DOUBLE, which presumes doubletype is a plain double - confirm for other instantiations: */
+			for(i=0;i<num_procs;i++){
+				MPI_Scatterv( row_indices_forcpu, sendcnts, displs, MPI_INT, row_indices_fromcpu[i], numvalues_fromcpu[i], MPI_INT, i, comm);
+				MPI_Scatterv( col_indices_forcpu, sendcnts, displs, MPI_INT, col_indices_fromcpu[i], numvalues_fromcpu[i], MPI_INT, i, comm);
+				MPI_Scatterv( values_forcpu, sendcnts, displs, MPI_DOUBLE, values_fromcpu[i], numvalues_fromcpu[i], MPI_DOUBLE, i, comm);
+				MPI_Scatterv( modes_forcpu, sendcnts, displs, MPI_INT, modes_fromcpu[i], numvalues_fromcpu[i], MPI_INT, i, comm);
+			}
+
+			/*Plug values into global matrix: received row indices are offset by lower_row (the lower ownership boundary returned
+			 *below) to address the local storage, N values per row. ADD_VAL entries accumulate, any other mode overwrites: */
+			GetOwnershipBoundariesFromRange(&lower_row,&upper_row,m,comm);
+			for(i=0;i<num_procs;i++){
+				int  numvalues=numvalues_fromcpu[i];
+				int* rows=row_indices_fromcpu[i];
+				int* cols=col_indices_fromcpu[i];
+				doubletype* values=values_fromcpu[i];
+				int* mods=modes_fromcpu[i];
+
+				for(j=0;j<numvalues;j++){
+					if(mods[j]==ADD_VAL) *(matrix+N*(rows[j]-lower_row)+cols[j])+=values[j];
+					else *(matrix+N*(rows[j]-lower_row)+cols[j])=values[j];
+				}
+			}
+
+			/*Free resources:{{{*/
+			xDelete<int>(RowRank);
+			xDelete<int>(row_indices_forcpu);
+			xDelete<int>(col_indices_forcpu);
+			xDelete<int>(modes_forcpu);
+			xDelete<doubletype>(values_forcpu);
+			xDelete<int>(numvalues_forcpu);
+
+			/*per-cpu bucket datasets created above, then the array that held them: */
+			for(i=0;i<num_procs;i++) delete bucketsforcpu[i];
+			xDelete<DataSet*>(bucketsforcpu);
+
+			/*receive buffers, one set per sending cpu, then the arrays of pointers to them: */
+			for(i=0;i<num_procs;i++){
+				xDelete<int>(row_indices_fromcpu[i]);
+				xDelete<int>(col_indices_fromcpu[i]);
+				xDelete<int>(modes_fromcpu[i]);
+				xDelete<doubletype>(values_fromcpu[i]);
+			}
+			xDelete<int*>(row_indices_fromcpu);
+			xDelete<int*>(col_indices_fromcpu);
+			xDelete<int*>(modes_fromcpu);
+			xDelete<doubletype*>(values_fromcpu);
+			xDelete<int>(numvalues_fromcpu);
+
+			xDelete<int>(sendcnts);
+			xDelete<int>(displs);
+			/*}}}*/
 		}
 		/*}}}*/
@@ -364,4 +502,75 @@
 		}
 		/*}}}*/		
+		/*FUNCTION BucketsBuildScatterBuffers{{{*/
+		void BucketsBuildScatterBuffers(int** pnumvalues_forcpu,int** prow_indices_forcpu,int** pcol_indices_forcpu,doubletype** pvalues_forcpu,int** pmodes_forcpu,DataSet** bucketsforcpu,int num_procs){
+			/*Flatten the buckets of each cpu into contiguous send buffers: (*pnumvalues_forcpu)[i] receives the entry count for cpu i,
+			 *the four other outputs hold the entries for cpu 0, then cpu 1, etc. All five outputs are allocated here with xNew.*/
+			/*intermediary: */
+			int         i,j;
+			int         count                   = 0;
+			int         total_size              = 0;
+			int        *temp_row_indices_forcpu = NULL;
+			int        *temp_col_indices_forcpu = NULL;
+			doubletype *temp_values_forcpu      = NULL;
+			int        *temp_modes_forcpu       = NULL;
+
+			/*output: */
+			int        *numvalues_forcpu        = NULL;
+			int        *row_indices_forcpu      = NULL;
+			int        *col_indices_forcpu      = NULL;
+			doubletype *values_forcpu           = NULL;
+			int        *modes_forcpu            = NULL;
+
+			/*allocate the counts (one per cpu), then figure out size of buffers per cpu: */
+			numvalues_forcpu = xNew<int>(num_procs);
+			for(i=0;i<num_procs;i++){
+				DataSet    *buckets            = bucketsforcpu[i];
+				count=0;
+				for(j=0;j<buckets->Size();j++){
+					Bucket<doubletype>* bucket =(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
+					count+=bucket->MarshallSize();
+				}
+
+				numvalues_forcpu[i]=count;
+			}
+
+			/*now, figure out size of  total buffers (for all cpus!): */
+			count=0;
+			for(i=0;i<num_procs;i++){
+				count+=numvalues_forcpu[i];
+			}
+			total_size=count;
+
+			/*Allocate buffers: */
+			row_indices_forcpu = xNew<int>(total_size);
+			col_indices_forcpu = xNew<int>(total_size);
+			values_forcpu = xNew<doubletype>(total_size);
+			modes_forcpu = xNew<int>(total_size);
+
+			/*we are going to march through the buffers, and marshall data onto them, so in order to not
+			 *lose track of where these buffers are located in memory, we are going to work using copies
+			 of them: */
+			temp_row_indices_forcpu=row_indices_forcpu;
+			temp_col_indices_forcpu=col_indices_forcpu;
+			temp_values_forcpu=values_forcpu;
+			temp_modes_forcpu=modes_forcpu;
+
+			/*Fill buffers: */
+			for(i=0;i<num_procs;i++){
+				DataSet    *buckets            = bucketsforcpu[i];
+				for(j=0;j<buckets->Size();j++){
+					Bucket<doubletype>* bucket =(Bucket<doubletype>*)buckets->GetObjectByOffset(j);
+					bucket->Marshall(&temp_row_indices_forcpu,&temp_col_indices_forcpu,&temp_values_forcpu,&temp_modes_forcpu); //pass in the address of the buffers, so as to have the Marshall routine increment them.
+				}
+			}
+
+			/*output buffers (the counts go out through pnumvalues_forcpu, not the row indices): */
+			*pnumvalues_forcpu   = numvalues_forcpu;
+			*prow_indices_forcpu = row_indices_forcpu;
+			*pcol_indices_forcpu = col_indices_forcpu;
+			*pvalues_forcpu      = values_forcpu;
+			*pmodes_forcpu       = modes_forcpu;
+		}
+		/*}}}*/		
 };
 
