#include <iostream>
#include <cstdlib>
#include <cmath>
#include "mpi.h"

using namespace std;

void PrintSurfaceMatrix(double** M, int LD, int D);

int main(int argc, char* argv[]){
  MPI_Init(&argc, &argv);

  int NCPUS, MY_PE;
  MPI_Comm_size(MPI_COMM_WORLD, &NCPUS);
  MPI_Comm_rank(MPI_COMM_WORLD, &MY_PE);

  // Read input parameters.
  if(argc < 5){
    if(MY_PE == 0)
      cerr << "\nToo few input parameters: need Dimension, Iterations, RowPeek, ColPeek\n";
    MPI_Finalize();
    exit(2);
  }

  int Dimension  = atoi(argv[1]);
  int Iterations = atoi(argv[2]);
  int RowPeek    = atoi(argv[3]);
  int ColPeek    = atoi(argv[4]);

  if((RowPeek > Dimension) || (ColPeek > Dimension)){
    if(MY_PE == 0){
      cerr << "Cannot peek at a matrix element outside of the surface.\n";
      cerr << "Arguments 3 and 4 must not exceed " << Dimension << "\n";
    }
    MPI_Finalize();
    exit(3);
  }

  // Initialize the matrix.
  // Each processor now gets a square block of the matrix, not a row.
  double SQ = sqrt((double)NCPUS);
  int SQint = (int)floor(SQ);
  if((floor(SQ) - SQ) != 0){
    if(MY_PE == 0)
      cerr << "\nThe number of processors needs to be a perfect square (e.g. 16)\n";
    MPI_Finalize();
    exit(4);
  }

  // For this assignment, we assume the matrix size maps evenly onto the CPUs.
  if((Dimension % SQint) != 0){
    if(MY_PE == 0)
      cerr << "\nThe dimension must be divisible by " << SQint << "\n";
    MPI_Finalize();
    exit(5);
  }

  int BlockSize = Dimension / SQint;

  // Allocate the local block plus a one-cell ghost border on every side.
  // (This allocation was eaten by the extraction; it is reconstructed from
  // how the arrays are indexed below: rows 0 and BlockSize+1 and columns 0
  // and BlockSize+1 are ghosts, interior starts cold at zero.)
  double** SurfaceMatrix   = (double**)malloc(sizeof(double*) * (BlockSize + 2));
  double** SurfaceMatrix_t = (double**)malloc(sizeof(double*) * (BlockSize + 2));
  for(int i = 0; i < BlockSize + 2; i++){
    SurfaceMatrix[i]   = (double*)calloc(BlockSize + 2, sizeof(double));
    SurfaceMatrix_t[i] = (double*)calloc(BlockSize + 2, sizeof(double));
  }

  // Fixed boundary values along the bottom edge of the global surface.
  // The original values[] initialization was also lost; a constant
  // 100-degree bottom edge is assumed here as a placeholder.
  double* values = (double*)malloc(sizeof(double) * Dimension);
  for(int i = 0; i < Dimension; i++)
    values[i] = 100.0;

  // The bottom row of processors owns the heated bottom edge.
  if(MY_PE >= (SQint * (SQint - 1))){
    int localDim = (MY_PE - (SQint * (SQint - 1))) * BlockSize;
    for(int i = 1; i <= BlockSize; i++){
      SurfaceMatrix[BlockSize + 1][i]   = values[localDim + i - 1];
      SurfaceMatrix_t[BlockSize + 1][i] = values[localDim + i - 1];
    }
  }

  //PrintSurfaceMatrix(SurfaceMatrix, BlockSize, Dimension);
  //MPI_Finalize();
  //exit(1);

  // Iterate
  double TimeStart = MPI_Wtime();
  MPI_Barrier(MPI_COMM_WORLD);

  // We need buffers to pass the edge columns, which are not contiguous
  // in memory (the ghost rows are, so they need no buffers).
  double* sRbuffer = (double*)malloc(sizeof(double) * BlockSize);
  double* sLbuffer = (double*)malloc(sizeof(double) * BlockSize);
  double* rRbuffer = (double*)malloc(sizeof(double) * BlockSize);
  double* rLbuffer = (double*)malloc(sizeof(double) * BlockSize);

  MPI_Request reqs1, reqs2, reqs3, reqs4;
  MPI_Request reqr1, reqr2, reqr3, reqr4;
  MPI_Status  status;

  for(int iCount = 1; iCount <= Iterations; iCount++){

    // Buffer up the right and left edge columns.
    if(MY_PE % SQint != (SQint - 1))
      for(int i = 0; i < BlockSize; i++)
        sRbuffer[i] = SurfaceMatrix[i + 1][BlockSize];
    if(MY_PE % SQint != 0)
      for(int i = 0; i < BlockSize; i++)
        sLbuffer[i] = SurfaceMatrix[i + 1][1];

    // Post the receives first. (The first three posts were spliced out of
    // the original listing; they are reconstructed from the matching waits
    // and unpacking loops below.)
    // Receive from the left.
    if(MY_PE % SQint != 0)
      MPI_Irecv(rRbuffer, BlockSize, MPI_DOUBLE, MY_PE - 1, 0, MPI_COMM_WORLD, &reqr1);
    // Receive from the right.
    if(MY_PE % SQint != (SQint - 1))
      MPI_Irecv(rLbuffer, BlockSize, MPI_DOUBLE, MY_PE + 1, 0, MPI_COMM_WORLD, &reqr2);
    // Receive from below.
    if(MY_PE < (SQint * (SQint - 1)))
      MPI_Irecv(&SurfaceMatrix[BlockSize + 1][1], BlockSize, MPI_DOUBLE, MY_PE + SQint, 0, MPI_COMM_WORLD, &reqr3);
    // Receive from above.
    if(MY_PE >= SQint)
      MPI_Irecv(&SurfaceMatrix[0][1], BlockSize, MPI_DOUBLE, MY_PE - SQint, 0, MPI_COMM_WORLD, &reqr4);

    // Send up.
    if(MY_PE >= SQint)
      MPI_Isend(&SurfaceMatrix[1][1], BlockSize, MPI_DOUBLE, MY_PE - SQint, 0, MPI_COMM_WORLD, &reqs3);
    // Send right.
    if(MY_PE % SQint != (SQint - 1))
      MPI_Isend(sRbuffer, BlockSize, MPI_DOUBLE, MY_PE + 1, 0, MPI_COMM_WORLD, &reqs1);
    // Send left.
    if(MY_PE % SQint != 0)
      MPI_Isend(sLbuffer, BlockSize, MPI_DOUBLE, MY_PE - 1, 0, MPI_COMM_WORLD, &reqs2);
    // Send down.
    if(MY_PE < (SQint * (SQint - 1)))
      MPI_Isend(&SurfaceMatrix[BlockSize][1], BlockSize, MPI_DOUBLE, MY_PE + SQint, 0, MPI_COMM_WORLD, &reqs4);

    // Wait on the right and left exchanges so we can unpack them.
    if(MY_PE % SQint != (SQint - 1)){ // not on the right edge
      MPI_Wait(&reqs1, &status);
      MPI_Wait(&reqr2, &status);
    }
    if(MY_PE % SQint != 0){ // not on the left edge
      MPI_Wait(&reqs2, &status);
      MPI_Wait(&reqr1, &status);
    }

    // The rights and lefts are here; unpack them into the ghost columns.
    if(MY_PE % SQint != 0)
      for(int i = 0; i < BlockSize; i++)
        SurfaceMatrix[i + 1][0] = rRbuffer[i];
    if(MY_PE % SQint != (SQint - 1))
      for(int i = 0; i < BlockSize; i++)
        SurfaceMatrix[i + 1][BlockSize + 1] = rLbuffer[i];

    // Wait on the up and down exchanges; these land directly in the
    // contiguous ghost rows, so no unpacking is needed.
    if(MY_PE < (SQint * (SQint - 1))){
      MPI_Wait(&reqs4, &status);
      MPI_Wait(&reqr3, &status);
    }
    if(MY_PE >= SQint){
      MPI_Wait(&reqs3, &status);
      MPI_Wait(&reqr4, &status);
    }

    // This will be a row-dominant program.
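    // Jacobi update: the new value of every interior point is the average
    // of its four old neighbours. Reads come only from SurfaceMatrix (the
    // old surface, including the ghost cells just received), and writes go
    // only to SurfaceMatrix_t, so no point ever sees a half-updated
    // neighbour; that is what makes the pointer swap below safe.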
    for(int i = 1; i <= BlockSize; i++)
      for(int j = 1; j <= BlockSize; j++)
        SurfaceMatrix_t[i][j] = 0.25 * (SurfaceMatrix[i - 1][j] +
                                        SurfaceMatrix[i][j + 1] +
                                        SurfaceMatrix[i + 1][j] +
                                        SurfaceMatrix[i][j - 1]);

    // Swap the pointers: the new surface becomes the old one.
    double** tmp = SurfaceMatrix;
    SurfaceMatrix = SurfaceMatrix_t;
    SurfaceMatrix_t = tmp;
  }

  // The edge buffers are no longer needed.
  free(sRbuffer);
  free(sLbuffer);
  free(rRbuffer);
  free(rLbuffer);

  double TimeEnd = MPI_Wtime();
  if(MY_PE == 0)
    cout << "\n Time Iterations = " << TimeEnd - TimeStart << endl;

  // Report the requested element. (The original listing is truncated at the
  // timing output; this peek reconstructs the evident intent of the
  // RowPeek/ColPeek arguments: find the rank owning the global point and
  // print its local value.)
  int ownerRow = (RowPeek - 1) / BlockSize;
  int ownerCol = (ColPeek - 1) / BlockSize;
  int owner    = ownerRow * SQint + ownerCol;
  if(MY_PE == owner)
    cout << " Value at (" << RowPeek << "," << ColPeek << ") = "
         << SurfaceMatrix[(RowPeek - 1) % BlockSize + 1][(ColPeek - 1) % BlockSize + 1]
         << endl;

  MPI_Finalize();
  return 0;
}
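// The definition of PrintSurfaceMatrix was lost in the truncation. A minimal
// sketch consistent with its declaration and the commented-out call site
// PrintSurfaceMatrix(SurfaceMatrix, BlockSize, Dimension) might look like
// this: each rank dumps its local (BlockSize+2)x(BlockSize+2) block, ghost
// border included, purely as a debugging aid. The layout is an assumption.
void PrintSurfaceMatrix(double** M, int LD, int D){
  for(int i = 0; i <= LD + 1; i++){
    for(int j = 0; j <= LD + 1; j++)
      cout << M[i][j] << " ";
    cout << "\n";
  }
  cout << "(local block of a " << D << "x" << D << " surface)" << endl;
}

// To build and run (assuming an MPI toolchain and the hypothetical file
// name laplace.cpp; the process count must be a perfect square that divides
// the dimension evenly):
//   mpicxx -O2 laplace.cpp -o laplace
//   mpirun -np 16 ./laplace 1024 1000 512 512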