/*****************************************************************************/
/* Laplace Equation Numerically Solved by Jacobi Iteration Method on PSC T3E */ 
/* Temperature T is initially 225 everywhere except at boundaries.           */
/* Revision for Correcting Errors and Stopping for Convergence.              */
/* Laplace Code in row panel block for C with source in cpgm.c and data      */
/*                                                                           */
/*      T=300.0+100*x/LX                      0<x<LX                         */
/*         _________                LY  ___________________   Y,NR+1-I       */
/*         |       |                   |*      VPE0        |   |             */
/*         |       |              3LY/4|___________________|   |             */
/* T=100.0 |T=225.0| T=100.0    0<y<LY |       VPE1        |   |             */
/*+200*y/LY|       | +300*y/LY     LY/2|___________________|   |             */
/*         |       |                   |       VPE2        |   |             */
/*         |       |               LY/4|___________________|   |             */
/*         |       |                   |       VPE3        |   |             */
/*         |_______|                0.0|___________________|   |_______X,J   */
/*          T=100.0                   0.0                  LX                */
/* Use Central Differencing Method with 4 PEs.                               */
/* Rectangular Domain decomposed into 4 vertical panels for Colwise Fortran. */
/* Each process gets its own copy of the code to execute,                    */
/* with parallel synchronization handled by MPI in mpprun environment.       */
/* Each process only has subgrid for VPE=0,1,2,...,npes-1=3 here,            */
/* where VPE=VirtualOrUserProcessingElement.                                 */
/* * Note: Geometric Origin is LowerLeft, But Array T[0][0] is UpperLeft     */
/* Note: Array T[i][j] is Only Local to Each PE and its Panel!               */
/* Each Processor works sub grid and then sends its Boundaries to neighbours */
/*97: FH/MCS572F97  interactive run suggestions:                             */
/* compiles and links malleable cpgm: cc -O2 -Xm -l mpi -o cpgm cpgm.c       */
/* execute: mpprun -n4 cpgm < data                                           */
/* Data input for number of iterations niter: data                           */
/* Try ps Process Status Command While Running & Should See 4 Jobs Executing!*/
/* Try jobs Jobs Status Command to Check for Running, Exit or Done.          */
/*****************************************************************************/
#define NPES     4                       /* Number of Processing Elements    */
#define NYI    200                       /* Number of Y-Intervals            */
#define NXI    200                       /* Number of X-Intervals            */
#define NYIL  NYI/NPES                   /* Number of Local Y-Intervals      */
#define NC       NXI-1                   /* Number of Interior Cols          */
#define NR       NYI-1                   /* Number of Interior Rows          */
#define NR4      NYIL                    /* Number of Local Interior Rows    */
#define MXITER    5000                   /* Max num of Iterations            */
#define DOWN     100                     /* MsgTag for messages down         */
#define UP       101                     /* MsgTag for messages up           */
#define ROOT     0                       /* Root PE                          */
#define MAX(x,y) ( ((x) > (y)) ? x : y ) /* Define Max Function              */
#define LY  1.e+0                        /* Y-Length                         */
#define LX  1.e+0                        /* X-Length                         */
#define HY  LY/NYI                       /* Y-StepSize                       */
#define HX  LX/NXI                       /* X-StepSize                       */
#define stoptol  0.5E-2                  /* stopping tolerance: Convergence  */
#define NC2      NXI/2                   /* Central Root Sample Print Column */
#define NR0      NYIL/25                 /* First  Root Sample Print Row     */
#define NRE      NYIL-NR0+1              /* End    Root Sample Print Row     */
#define NRL2     NYIL/2                  /* Middle Root Sample Print Row     */
#define DNR      NRL2-NR0                /* Step   Root Sample Print Row     */
#define NCO0     0                       /* First All Sample Print Column    */
#define NCOE     NXI                     /* End   All Sample Print Column:unu*/
#define DNCO     NXI/20                  /* Step  All Sample Print Column    */
#define NRO0     0                       /* First All Sample Print Row       */
#define NROE     NYIL                    /* End   All Sample Print Row:unused*/
#define DNRO     NYIL/5                  /* Step  All Sample Print Row       */
/*****************************************************************************/
#include <stdio.h>
#include <math.h>
#include <mpi.h>   /* MPI Step: Include MPI Fortran Header Defns. and Decls. */


void initialize( double t[NR4+2][NC+2] );
void set_bcs   ( double t[NR4+2][NC+2], int mype, int npes );

int main( int argc, char **argv ){      /* Main Procedure                    */
  
  int        npes;                      /* Actual Number of PEs       */
  int        mype;                      /* My PE number               */
  int        stat;                      /* Error Status/Return Code   */
  int        niter;                     /* Number of Iterations       */
  int        nstop;                     /* Interation Stop Parameter  */
  MPI_Status status;        /* MPI Step: Declare status structure     */

  double     t[NR4+2][NC+2], told[NR4+2][NC+2];  /* Temp. Arrays: Old & New  */
  double     dt;                      /* Delta t                             */
  double     dtg;                     /* Delta t global                      */
  double     starttime,stoptime,timerres,totalwalltime;/* Wall Time Variable */
  int        i, j, k, l;
  int        iter;                    /* Current Number of Iterations        */
  int        NRL;                     /* Local Interior Rows: PE Dependent   */
/* MPI Step: Initialize MPI, giving each PE copy of code;                    */
/* MPI required subroutine using programming model:SPMD=SglPgmMultiData.     */
  MPI_Init(&argc, &argv);
/* MPI Step: Start MPI Wall Timer and get Tick Resolution:                   */
      starttime = MPI_Wtime();
      timerres = MPI_Wtick();
/* MPI Step: Get T3E assigned npes=#PEs or Communication_size:               */
  MPI_Comm_size(MPI_COMM_WORLD, &npes );
/*        where MPI_COMM_WORLD=MPI_Communication_World=UserSetOfPEs.         */
/* MPI Step: Get local (this PE) mype=PE#, also called Communication_rank:   */
  MPI_Comm_rank(MPI_COMM_WORLD, &mype );
  if ( npes != NPES ){                /* Currently FoolProofCheck Hardcoded  */
    MPI_Finalize();
    if( mype == 0 )
      fprintf(stderr, "The example is only for %d PEs\n", NPES);
    exit(1);
  }
/* Laplace Step: Set Local Initial Iteration Conditions for Laplace Equation:*/
  initialize(t);                  
/* Laplace Step: Set Local Boundary Conditions for Laplace Equation:         */
  set_bcs(t, mype, npes);         /* Set the Boundary values   */
/* Initialize Iteration Stopping Parameter                                   */
      nstop = 0;
      NRL = NR4;
/* Laplace Step: Root Reads NumberIterations from "data" Input File:         */
  if( mype == ROOT ){
     scanf("%d", &niter); /* */
    if( niter>MXITER ) niter = MXITER;
/* Correct Root Local Interior Point Count so Sum= NC= #TotalInteriorPoints  */
     NRL=NR4-1;
  }
/* MPI Step: All VPES Get Collective Communication of niter from Root:       */
  MPI_Bcast(&niter, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
/*      where count is 1 and datatype is MPI_INT.                        */
/* Laplace Step: Save Old Iteration:                                         */
  for( i=0; i<=NRL+1; i++ )       /* Copy the values into told */
    for( j=0; j<=NC+1; j++ )
      told[i][j] = t[i][j];
/*-------------------------------------------------------------------------*
 |  Do Computation on Local Sub-grid for Niter 5-Star Jacobi iterations.   |
 |    Corrected for Arbitrary (LY,LX)Lengths with (HY,HX)Steps.            |
 *-------------------------------------------------------------------------*/
  for( iter=1; iter<=niter; iter++ ) {
    for( i=1; i<=NRL; i++ )
      for( j=1; j<=NC; j++ )
        t[i][j] = 0.5 * ((told[i+1][j] + told[i-1][j])*HX*HX
                + (told[i][j+1] + told[i][j-1])*HY*HY)/(HX*HX+HY*HY);
/* Get dt=LocalMaxAbsChange for this PE and Save Old Temperatures:         */
    dt = 0.;
    for( i=1; i<=NRL; i++ )       
      for( j=1; j<=NC; j++ ){
        dt = MAX( fabs(t[i][j]-told[i][j]), dt);
        told[i][j] = t[i][j];
      }
/* MPI Step: mype Sends Down Boundary Data to Down Neighbor mype+1,         */
/*   using Point_to_Point Send Communication for mype=0,...,npes-2:         */
    if( mype < npes-1 )    /*   Sending Down; Only npes-1 do this           */
/*                                                                          */
      MPI_Send( &t[NRL][1],  NC, MPI_DOUBLE, mype+1, DOWN, MPI_COMM_WORLD);
/*                                                                          */
/* starting at pointer &t[NRL][1] for NC cols using datatype MPI_DOUBLE.    */
/*                                                                          */
/* MPI Step: mype Sends Up Boundary Data to Up Neighbor mype-1,             */
/*     using Point_to_Point Send Communication for mype=1,...,npes-1:       */
    if( mype != 0 )        /*   Sending Msg Up  ; Only mype+1 do this       */
/*                                                                          */
      MPI_Send( &t[1][1],    NC, MPI_DOUBLE, mype-1, UP,   MPI_COMM_WORLD);
/*                                                                          */
/* starting at pointer &t[1][1] for NC cols using datatype MPI_DOUBLE.      */
/*                                                                          */
/* MPI Step: mype Receives its UP Boundary Data from Up Neighbor mype-1     */
/* (here using wildcard name MPI_ANY_SOURCE; but note MsgTag=Down),         */
/* using Point_to_Point Receive Communication for mype=1,...,npes-1:        */
    if( mype != 0 )        /*   Receive  Msg DOWN mype-1 above              */
/*                                                                          */
      MPI_Recv( &t[0][1], NC, MPI_DOUBLE, MPI_ANY_SOURCE, DOWN,
                             MPI_COMM_WORLD, &status);
/*                                                                          */
/* starting at pointer &t[0][1] for NC cols using datatype MPI_DOUBLE,      */
/* with Additional Vector Structure status Argument.                        */
/*                                                                          */
/* MPI Step: mype Receives Down Boundary Data from Down Neighbor mype+1     */
/* (here using wildcard name MPI_ANY_SOURCE; but note MsgTag=UP),           */
/* using Point_to_Point Receive Communication for mype=1,...,npes-1:        */
    if( mype != npes-1 )        /*   Receive Msg Up from mype+1 below       */
/*                                                                          */
      MPI_Recv( &t[NRL+1][1],NC, MPI_DOUBLE, MPI_ANY_SOURCE, UP  ,
                                MPI_COMM_WORLD, &status);
/*                                                                          */
/* starting at address &t[NRL+1][1] for NC rows using datatype MPI_DOUBLE,  */
/* with Additional Vector Structure status Argument.                        */
/*                                                                          */
/* MPI Step: mype Sends to RootPE LocalMaxAbsChange=dt using Collective     */
/* (Global) Reduction Maximum Function MPI_MAX with Result Eventually       */
/* Collected in GlobalMaxAbsChange=dtg of count 1 and datatype MPI_DoUBLE:  */
/*                                                                          */
    MPI_Reduce(&dt, &dtg, 1, MPI_DOUBLE, MPI_MAX, ROOT, MPI_COMM_WORLD);
/*                                                                          */
/* Root Print Selected Values at Every 100th Iteration:                     */
/* Important that Writes are in Serial Sections on DMP T3E:                 */
/* Root Also Check Stopping Criteria Tolerance                              */
    if( mype == 0 ) {
       if( (iter%100) == 0 ) {
          printf("\nMYPE = %4d; ITER = %5d; GlobalMaxAbsChange:dtg = %10.3e\n", 
             mype, iter, dtg); 
/* VPE=0 Writes Out a Sample Central Values:                                */
 printf("MYPE = %4d; ITER = %5d; t[%3d:%3d:%3d][%3d] = %9.4f %9.4f %9.4f\n",
             mype, iter, NR0, NRL2, NRE, DNR,
	     t[NR0][NC2], t[NRL2][NC2], t[NRE][NC2]); 
       }

/* If Iteration Stopping, set Stopping Flag:                                */
       if(dtg < stoptol) nstop = 1;
    }
/* Global Call Broadcast of Possible Stopping Value By All.                 */
/*                                                                          */
      MPI_Bcast(&nstop, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
/*                                                                          */
/* Call Generic T3E barrier to Make Slaves Wait for Root to Write:          */
/*                                                                          */
      _barrier();
/*                                                                          */
/* MPI Step:  Barrier is Final Call for all PEs to Stop and Wait for Rest   */
/* in MPI_COMM_WORLD to Finish Before Continuing on to Next Iteration:      */
/*                                                                          */
    MPI_Barrier( MPI_COMM_WORLD );
/*                                                                          */
/*  If Convergence for Small dtg, Then Break Out of For Loop!               */
    if(nstop == 1) break;
/*                                                                          */
  }  /* End of Big Iteration Loop */
/* Stopping Criterion Stop:                                                 */
      if( mype == 0 )  {
/* MPI Step: Stop MPI Wall Timer:                                           */
/*                                                                          */
       stoptime = MPI_Wtime();
/*                                                                          */
       totalwalltime = stoptime - starttime;
       printf("\nStopping for Small GlobalMaxAbsChange:dtg = %10.3e;",dtg);
 printf("\nMYPE = %4d; TotalWallTime = %12.4e secs; WallTick = %12.4e secs;\n", 
        mype, totalwalltime, timerres);                   
       }
/* Print Final Sample Output Temperatures for Each mype's Panel:            */
  printf("\nSample Output: MYPE =%4d; ITER =%5d\n J/I ",mype, iter);
  for( k=0; k<=5; k++ ){
     i=NRO0+ k*DNRO;
     printf("%7d",i);}
     printf("%7d",NRL+1);
  for( l=0; l<=20; l++ ){
     j=NCO0+ l*DNCO;
     printf("\n%4d ",j);
     for( k=0; k<=5; k++ ){
     i=NRO0+ k*DNRO;
     printf("%7.2f",t[i][j]);}
     printf("%7.2f",t[NRL+1][j]);}
/* MPI Step: MPI Required Finalization Step:                                */
/*                                                                          */
  MPI_Finalize();
/*                                                                          */
}    /* End of Main Procedure  */

/********************************************************************
 *                                                                  *
 * Initialize all the values to 225.0 as a starting iteration       *
 *                                                                  *
 ********************************************************************/

void initialize( double t[NR4+2][NC+2] ){

  int        i, j, iter;
  
  for( i=0; i<=NR4+1; i++ )        /* Initialize Iterations */
    for ( j=0; j<=NC+1; j++ )
      t[i][j] = 225.0;
}

/********************************************************************
 *                                                                  *
 * Set the values at the boundary.  Values at the boundary do not   *
 * Change through out the execution of the program                  *
 *                                                                  *
 ********************************************************************/

void set_bcs( double t[NR4+2][NC+2], int mype, int npes ){

  int i, j;
  int NRL, iy;

/* Reset Parameters for LocalInteriorRows to Preserve #GlobalInteriorPoints  */
  NRL = NR4;
  iy = 1;

  if( mype == 0 ) {                  
    NRL = NR4-1;
    iy =0;
    for( j=1; j<NC+1; j++ )
      t[0][j] = 300.0 + 100.0*j*HX;   /* Top boundary */
      }

  if( mype == npes-1 ) 
    for( j=1; j<NC+1; j++ )
      t[NRL+1][j] = 100.0;              /* Bottom boundary */

  for( i=0; i<=NR4+1; i++ ) {      
    t[i][0]    = 300.0 - 50.0*mype - 200.0*(i-iy)*HY; /* Set Left Boundary  */
/*                                                                          */
    t[i][NC+1] = 400.0 - 75.0*mype - 300.0*(i-iy)*HY; /* Set Right Boundary */
/* Caution: mype=0 Panel 1-Step Shorter Than Other Panels with (NR+1)/4-1   */
/*    Interior Points, But Other Panels Have (NR+1)/4 Interior Points,      */
/*    Making Total Interior Points (NR) As Required for (NR+1) Subintervals.*/
  }
}

