        /* Integer aliases used by the handle typedefs that follow in this file:
         * UINT_P backs MPI_Group and MPI_Request, INT_P backs MPI_Aint.
         * NOTE(review): the _P suffix suggests "pointer-sized"; confirm that
         * long really is pointer-sized on the target platform. */
        typedef unsigned long UINT_P;
        typedef long INT_P;
/*
 * Stream descriptor type used by the fopen/fscanf/fprintf/fclose calls below.
 * NOTE(review): this looks like a C runtime's FILE layout pasted in by the
 * preprocessor; the field meanings are inferred from their names only and
 * nothing in this file accesses the fields directly.
 */
typedef struct _iobuf
{
        char* _ptr;
        int _cnt;
        char* _base;
        int _flag;
        int _file;
        int _charbuf;
        int _bufsiz;
        char* _tmpfname;
} FILE;
/*
 * MPI handle types.  In this translation unit they are plain integer types,
 * and the code below passes literal handle values directly (communicator 1
 * and datatype 11 in every MPI call).
 */
typedef int MPI_Comm;

typedef int MPI_Datatype;

typedef UINT_P MPI_Group;

typedef int MPI_Op;

/* Request handle filled in by MPI_Ibsend/MPI_Irecv and passed to MPI_Wait. */
typedef UINT_P MPI_Request;

typedef int MPI_Errhandler;

typedef INT_P MPI_Aint;

/*
 * Status record written by MPI_Recv and MPI_Wait.  The code in this file only
 * passes it by address and never reads its fields.
 * NOTE(review): MPI_SOURCE, MPI_TAG and MPI_ERROR are the field names the MPI
 * standard requires; `count` is implementation-specific - confirm against the
 * MPI headers this file was generated from.
 */
typedef struct
{
        int count;
        int MPI_SOURCE;
        int MPI_TAG;
        int MPI_ERROR;
} MPI_Status;






/*
 * Callback signatures carried over from the MPI headers.  None of them is
 * referenced by the code visible in this file.
 * NOTE(review): the roles below follow the MPI standard's naming - confirm
 * against the MPI headers in use.
 */

/* Presumably the user-defined reduction operation signature. */
typedef void (MPI_User_function)(
                void *invec,
                void *inoutvec,
                int *len,
                MPI_Datatype *datatype
                );


/* Presumably the attribute copy callback signature. */
typedef int (MPI_Copy_function)(
                MPI_Comm oldcomm,
                int keyval,
                void *extra_state,
                void *attribute_val_in,
                void *attribute_val_out,
                int *flag
                );


/* Presumably the attribute delete callback signature. */
typedef int (MPI_Delete_function)(
                MPI_Comm comm,
                int keyval,
                void *attribute_val,
                void *extra_state
                );


/* Presumably the error handler callback signature (variadic). */
typedef void (MPI_Handler_function)(
                MPI_Comm *comm,
                int *errorcode,
                ...
                );
/* Declared here but neither defined nor called in the visible source. */
void recordApplicationCheckpoint();

/* Checkpoint bookkeeping counters; initialised here and not touched by any
 * code visible in this file.
 * NOTE(review): presumably updated by the checkpointing layer - confirm. */
double timeSpendRecordingMPIChkpt=0.0;
double timeBeforeChkptCommitted=0.0;
int totalCheckpointsTaken=0;



/* Debug printers, defined at the end of the file. */
void printIntArray(int* array, char* arrayname, int size);
void printDoubleArray(double* array, char* arrayname, int size);

/* Values of `topo`: main selects BUTTERFLY unless halving np reaches an odd
 * value greater than 1, in which case it selects RINGPASS. */
const int BUTTERFLY = 0;
const int RINGPASS = 1;
/* Process count and this process's rank, filled in by main through
 * MPI_Comm_size and MPI_Comm_rank. */
int np;
int my_rank;
/* Communication pattern used by allGather and allReduce. */
int topo;
/* Command-line switches, reset and set by parseInput. */
int opt_verbose, opt_notime, opt_noiter, opt_nonorm;
int opt_printx;
/* Shared status object passed to every MPI_Wait and MPI_Recv in this file. */
MPI_Status status;
/*
 * allGather - assemble every rank's block into gatherbuf on all ranks.
 *
 *   localbuf    this rank's block (local_size doubles)
 *   gatherbuf   output, np * local_size doubles; the block of rank r is
 *               stored at gatherbuf + r * local_size
 *   local_size  number of doubles contributed by each rank
 *
 * The global `topo` selects the pattern: a ring pass (np - 1 stages, one
 * block forwarded per stage) or a butterfly exchange with
 * partner = my_rank ^ bitmask.
 *
 * Sends are buffered (MPI_Ibsend), so every message must fit in the buffer
 * attached in main.  The literals 11 and 1 are the datatype and communicator
 * handles used by every MPI call in this file (presumably the double
 * datatype and the world communicator - confirm against the MPI headers).
 */
void allGather(
                double* localbuf,
                double* gatherbuf,
                int local_size
        ){
        int stage, from, to;
        double *psend, *precv;
        int bitmask, partner, blocksize;

        MPI_Request sendRequest;
        MPI_Request recvRequest;

        /* Every pattern starts with the local block in its final position. */
        memcpy(gatherbuf + my_rank * local_size, localbuf, local_size * sizeof(double));

        if(topo == RINGPASS){
                from = (my_rank - 1 + np) % np;
                to = (my_rank + 1) % np;
                for(stage = 0; stage < np - 1; stage++){
                        /* Forward the block obtained in the previous stage (our
                         * own block in stage 0) and receive the one the left
                         * neighbour is forwarding. */
                        psend = gatherbuf + ((my_rank - stage + np) % np) * local_size;
                        precv = gatherbuf + ((from - stage + np) % np) * local_size;

                        MPI_Ibsend(psend, local_size, 11, to, 0, 1, &sendRequest);
                        MPI_Irecv(precv, local_size, 11, from, 0, 1, &recvRequest);

                        MPI_Wait(&sendRequest, &status);
                        MPI_Wait(&recvRequest, &status);
                }
        } else {
                bitmask = 1;
                blocksize = local_size;
                while(bitmask < np){
                        partner = my_rank ^ bitmask;
                        /* Start of the aligned group of `bitmask` blocks held by
                         * this rank and by its partner respectively. */
                        psend = gatherbuf + my_rank / bitmask * blocksize;
                        precv = gatherbuf + partner / bitmask * blocksize;

                        /* The whole group collected so far (blocksize doubles)
                         * has to be exchanged; sending only local_size doubles
                         * would leave holes in gatherbuf from the second stage
                         * onwards. */
                        MPI_Ibsend(psend, blocksize, 11, partner, 0, 1, &sendRequest);
                        MPI_Irecv(precv, blocksize, 11, partner, 0, 1, &recvRequest);

                        MPI_Wait(&sendRequest, &status);
                        MPI_Wait(&recvRequest, &status);

                        bitmask <<= 1;
                        blocksize <<= 1;
                }
        }
}






void allReduce(
                double* local,
                double* sum
        ){
        double temp, temprcv;
        int stage, from, to;
        int bitmask, partner;
   MPI_Request sendRequest;
   MPI_Request recvRequest;

        if(topo == RINGPASS){



                temp = *local;
                *sum = *local;
                from = (my_rank - 1 + np) % np;
                to = (my_rank + 1) % np;
                for(stage = 0; stage < np - 1; stage++){






         MPI_Ibsend(&temp, 1, 11, to, 0, 1, &sendRequest);
         MPI_Irecv(&temprcv, 1, 11, from, 0, 1, &recvRequest);

         MPI_Wait(&sendRequest, &status);
         MPI_Wait(&recvRequest, &status);

                        temp = temprcv;
                        *sum += temp;
                }
        } else {



                *sum = *local;
                bitmask = 1;
                while(bitmask < np) {
                        partner = my_rank ^ bitmask;






         MPI_Ibsend(sum, 1, 11, partner, 0, 1, &sendRequest);
         MPI_Irecv(&temp, 1, 11, partner, 0, 1, &recvRequest);

         MPI_Wait(&sendRequest, &status);
         MPI_Wait(&recvRequest, &status);

                        *sum += temp;
                        bitmask <<= 1;
                }
        }
}






/*
 * gather - collect each rank's block on rank 0.
 *
 *   localbuf    this rank's block (local_size doubles)
 *   globalbuf   written on rank 0 only: the block of rank r is stored at
 *               globalbuf + r * local_size
 *   local_size  number of doubles per rank
 *
 * Ranks other than 0 send their block with a blocking MPI_Send and do not
 * touch globalbuf.
 */
void gather(
                double* localbuf,
                double* globalbuf,
                int local_size
        ){
        int src;

        if(my_rank != 0)
        {
                MPI_Send(localbuf, local_size, 11, 0, 0, 1);
                return;
        }

        /* Rank 0: own block first, then one receive per remaining rank. */
        memcpy(globalbuf, localbuf, local_size * sizeof(double));
        src = 1;
        while(src < np)
        {
                MPI_Recv(globalbuf + src * local_size, local_size, 11,
                         src, 0, 1, &status);
                src++;
        }
}




/*
 * printMat - print a row-major nrow x ncol matrix to stdout, one matrix row
 * per output line, flushing stdout after every element and every line.
 */
void printMat(
                double* mat,
                int nrow,
                int ncol
        ){
        double *row;
        int r, c;

        for(r = 0, row = mat; r < nrow; r++, row += ncol)
        {
                c = 0;
                while(c < ncol)
                {
                        printf("    %10.6f", row[c]);
                        fflush(stdout);
                        c++;
                }
                printf("\n");
                fflush(stdout);
        }
        fflush(stdout);
}
/* Read nrow * ncol whitespace-separated values from fp into the row-major
 * buffer mat.  Returns 0 on success, -1 if a value could not be parsed. */
static int loadMat_readBlock(FILE *fp, double *mat, int nrow, int ncol)
{
        int i, j;
        float value;
        double *prow = mat;

        for(i = 0; i < nrow; i++)
        {
                for(j = 0; j < ncol; j++){
                        /* Values are parsed as float, as before, so loaded
                         * data keeps its previous precision. */
                        if(fscanf(fp, "%f", &value) != 1)
                                return -1;
                        prow[j] = value;
                }
                fscanf(fp, "\n");
                prow += ncol;
        }
        return 0;
}

/* Report a fatal load error on rank 0 and abort the whole MPI job, because
 * the other ranks would otherwise wait forever for their block. */
static void loadMat_fail(FILE *fp, const char *what, const char *filename)
{
        printf("\n%s: %s\n", what, filename);
        fflush(stdout);
        if(fp != NULL)
                fclose(fp);
        MPI_Abort(1, 1);
}

/*
 * loadMatAndDistribute - read a text matrix on rank 0 and hand each rank its
 * block of rows.
 *
 *   mat          receives this rank's nrow x ncol block (row-major)
 *   nrow, ncol   dimensions of the per-rank block
 *   matFilename  file opened on rank 0 only
 *
 * Rank 0 reads np consecutive blocks from the file.  The block for each rank
 * r > 0 is sent in messages of at most CUTSIZE doubles.  Afterwards rank 0
 * rewinds the file and re-reads the first block as its own.  If the file
 * cannot be opened or parsed, the job is aborted with MPI_Abort.
 */
void loadMatAndDistribute(
                double* mat,
                int nrow,
                int ncol,
                char* matFilename
        )
{
        int i;
        int rcvr;
        int buffersize;
        const int CUTSIZE = 1024 * 256;
        FILE * fp = NULL;

        buffersize = nrow * ncol;

        if(my_rank == 0){
                fp = fopen(matFilename, "r");
                if(fp == NULL){
                        loadMat_fail(NULL, "Cannot open matrix file", matFilename);
                        return;
                }
                if(opt_verbose == 1){
                        printf("\tLoad and distribute: %s ", matFilename);
                        fflush(stdout);
                }
        }
        for(rcvr = 0; rcvr < np; rcvr++){
                if(my_rank == 0){
                        /* The block for rcvr == 0 is read only to advance the
                         * file position; it is reloaded after the loop. */
                        if(loadMat_readBlock(fp, mat, nrow, ncol) != 0){
                                loadMat_fail(fp, "Malformed or truncated matrix file", matFilename);
                                return;
                        }
                        if(rcvr != 0){
                                for(i = 0; i < buffersize / CUTSIZE; i++)
                                        MPI_Send(
                                                mat + i * CUTSIZE, CUTSIZE,
                                                11, rcvr, 0, 1
                                        );
                                /* i now indexes the partial tail chunk. */
                                if(buffersize % CUTSIZE != 0)
                                        MPI_Send(
                                                mat + i * CUTSIZE, buffersize % CUTSIZE,
                                                11, rcvr, 0, 1
                                        );
                                if(opt_verbose == 1){ printf("."); fflush(stdout); }
                        }
                } else if(my_rank == rcvr){
                        for(i = 0; i < buffersize / CUTSIZE; i++)
                                MPI_Recv(
                                        mat + i * CUTSIZE, CUTSIZE,
                                        11, 0, 0, 1, &status
                                );
                        if(buffersize % CUTSIZE != 0)
                                MPI_Recv(
                                        mat + i * CUTSIZE, buffersize % CUTSIZE,
                                        11, 0, 0, 1, &status
                                );
                }
        }
        if(my_rank == 0) {
                rewind(fp);
                if(loadMat_readBlock(fp, mat, nrow, ncol) != 0){
                        loadMat_fail(fp, "Malformed or truncated matrix file", matFilename);
                        return;
                }
                fclose(fp);
                if(opt_verbose == 1){
                        printf(" Done\n");
                        fflush(stdout);
                }
        }
}





/*
 * writeMat - write a row-major nrow x ncol matrix to a text file, one matrix
 * row per line, each value formatted as "   %10.6f".
 *
 * If the file cannot be opened or closed cleanly, a message is printed to
 * stdout and the function returns; the "matrix written" confirmation is only
 * printed (when opt_verbose is 1) after a successful write.
 */
void writeMat(
                double* mat,
                int nrow,
                int ncol,
                char* matFilename
        ){
        FILE* fp;
        int i,j;
        double* prow;

        fp = fopen(matFilename, "w");
        if(fp == NULL){
                printf("\nCannot open %s for writing; matrix not saved.\n", matFilename);
                fflush(stdout);
                return;
        }

        prow = mat;
        for(i = 0; i < nrow; i++){
                for(j = 0; j < ncol; j++)
                        fprintf(fp, "   %10.6f", prow[j]);
                fprintf(fp, "\n");
                prow += ncol;
        }

        /* fclose flushes buffered output, so a failure here means data loss. */
        if(fclose(fp) != 0){
                printf("\nError while closing %s; file may be incomplete.\n", matFilename);
                fflush(stdout);
                return;
        }
        if(opt_verbose == 1){
                printf("\n[%6d *%6d ] matrix written to file: %s\n", nrow, ncol, matFilename);
                fflush(stdout);
        }
}




/*
 * vectorCopy - copy n doubles from src to dest.  The copy is done with
 * memcpy, so the two ranges must not overlap.
 */
void vectorCopy(
                double* dest,
                double* src,
                int n
        ){
        memcpy(dest, src, sizeof(double) * n);
}




/*
 * serialDot - dot product of two local vectors of length n, accumulated from
 * index 0 upwards.  Returns 0.0 when n <= 0.
 */
double serialDot(
                double* v1,
                double* v2,
                int n
        ){
        double acc = 0.0;
        double *a = v1;
        double *b = v2;
        int k;

        for(k = 0; k < n; k++, a++, b++)
                acc += *a * *b;
        return acc;
}




/*
 * parDot - dot product of two distributed vectors.
 *
 * Each rank computes the dot product of its local_n-element slices with
 * serialDot; the partial results are then combined with allReduce, so the
 * value is returned on every rank.
 */
double parDot(
                double* local_v1,
                double* local_v2,
                int local_n
        ){
        double partial;
        double total = 0.0;

        partial = serialDot(local_v1, local_v2, local_n);
        allReduce(&partial, &total);
        return total;
}




/*
 * serialMulMatVec - local matrix-vector product.
 *
 *   mat     row-major nrow x ncol matrix
 *   vec     vector of ncol entries
 *   result  receives nrow entries; result[i] is the dot product of row i
 *           of mat with vec
 */
void serialMulMatVec(
                double* mat,
                double* vec,
                int nrow,
                int ncol,
                double* result
        ){
        double* row;
        int r;

        for(r = 0, row = mat; r < nrow; r++, row += ncol)
                result[r] = serialDot(row, vec, ncol);
}





/*
 * parMulMatVec - distributed matrix-vector product.
 *
 *   local_mat     this rank's local_n x n slice of the matrix (row-major)
 *   local_vec     this rank's local_n entries of the vector
 *   local_n       rows / vector entries owned by this rank
 *   n             full vector length
 *   local_result  receives this rank's local_n entries of the product
 *
 * The full vector is first assembled in a temporary n-element buffer with
 * allGather, then multiplied by the local rows.
 */
void parMulMatVec(
                double* local_mat,
                double* local_vec,
                int local_n,
                int n,
                double* local_result
        ){
        double* fullvec;

        fullvec = (double*) malloc(n * sizeof(double));

        allGather(local_vec, fullvec, local_n);
        serialMulMatVec(local_mat, fullvec, local_n, n, local_result);

        free(fullvec);
}




/*
 * daxpy - in-place update y := y + alpha * x over local_n entries.
 */
void daxpy(
                double alpha,
                double* local_x,
                double* local_y,
                int local_n
        ){
        double *src = local_x;
        double *dst = local_y;
        int k;

        for(k = 0; k < local_n; k++, src++, dst++)
                *dst += alpha * *src;
}




/*
 * daxpyVariant - in-place update y := alpha * y + x over local_n entries
 * (the scale factor applies to y, not to x as in daxpy).
 */
void daxpyVariant(
                double alpha,
                double* local_x,
                double* local_y,
                int local_n
        ){
        double *src = local_x;
        double *dst = local_y;
        int k;

        for(k = 0; k < local_n; k++, src++, dst++)
                *dst = *dst * alpha + *src;
}




/*
 * cgSolver - distributed conjugate-gradient iteration for A x = b, started
 * from x = 0.
 *
 *   n         full system size
 *   n_bar     rows / vector entries owned by this rank
 *   tol       stop once the squared residual r.r is no longer above tol
 *   max_iter  upper bound on the number of iterations
 *   local_A   this rank's n_bar x n slice of A (row-major)
 *   local_b   this rank's n_bar entries of b
 *   global_x  receives the gathered solution (filled on rank 0 only, see
 *             gather)
 *   piter     receives the number of iterations performed
 *   perror    receives the final squared residual r.r
 *   pnorm     receives sqrt((b - A x).(b - A x)); written only when
 *             opt_nonorm == 0
 *
 * Every rank must call this function, since it uses collective exchanges.
 */
void cgSolver(
                int n,
                int n_bar,
                double tol,
                int max_iter,
                double* local_A,
                double* local_b,
                double* global_x,
                int* piter,
                double* perror,
                double* pnorm
        ){
        int i, k;
        double alpha, beta;
        double dotrr, dotrr_prev;
        double vote, votes;
        double *x, *r, *p, *s;

        x = (double*) malloc(n_bar * sizeof(double));
        r = (double*) malloc(n_bar * sizeof(double));
        p = (double*) malloc(n_bar * sizeof(double));
        s = (double*) malloc(n_bar * sizeof(double));

        /* x = 0, hence the initial residual is r = b. */
        k = 0;
        for(i = 0; i < n_bar; i++)
                x[i] = 0;
        vectorCopy(r, local_b, n_bar);

        /* The squared residual drives alpha, beta and the stopping test, so it
         * has to be computed before the loop and after every update of r. */
        dotrr = parDot(r, r, n_bar);
        dotrr_prev = dotrr;

        while( k < max_iter ){
                /* The ring reduction adds contributions in a rank-dependent
                 * order, so dotrr may differ in the last bits between ranks.
                 * Sum 0/1 votes (exact in any order) so that all ranks take
                 * the same stop decision and stay in step. */
                vote = (dotrr > tol) ? 1.0 : 0.0;
                votes = 0.0;
                allReduce(&vote, &votes);
                if(votes == 0.0)
                        break;

                k++;
                if(k == 1)
                        vectorCopy(p, r, n_bar);
                else {
                        beta = dotrr / dotrr_prev;
                        /* p = beta * p + r */
                        daxpyVariant(beta, r, p, n_bar);
                }

                /* s = A p */
                parMulMatVec(local_A, p, n_bar, n, s);
                alpha = dotrr / parDot(p, s, n_bar);
                daxpy(alpha, p, x, n_bar);      /* x += alpha * p */
                daxpy( - alpha, s, r, n_bar);   /* r -= alpha * s */

                dotrr_prev = dotrr;
                dotrr = parDot(r, r, n_bar);
        }

        gather(x, global_x, n_bar);

        *piter = k; *perror = dotrr;

        if(opt_nonorm == 0){
                /* p = A x, then p = -p + b = b - A x; report its 2-norm. */
                parMulMatVec(local_A, x, n_bar, n, p);
                daxpyVariant(-1.0, local_b, p, n_bar);
                *pnorm = sqrt(parDot(p, p, n_bar));
        }

        free(x); free(r); free(p); free(s);
}




/*
 * parseInput - parse the command line.
 *
 *   argc, argv  arguments as passed to main
 *   pn          receives the value following -n
 *   ptol        receives the value following -tol
 *   pmax        receives the value following -max
 *   file_a      receives the string following -fa
 *   file_b      receives the string following -fb
 *   file_x      receives the string following -fx
 *
 * Also resets and sets the global switches opt_verbose, opt_notime,
 * opt_noiter, opt_nonorm and opt_printx.
 *
 * Returns 0 on success.  Returns 1 (after rank 0 has printed a message) when
 * a required option is missing, has no value, has an unparsable tolerance,
 * or when n is not divisible by np.
 *
 * NOTE(review): the file names are copied with strcpy and this function does
 * not know the size of the destination buffers; callers must supply buffers
 * large enough for the longest argument.
 */
int parseInput(
                int argc,
                char** argv,
                int* pn,
                double* ptol,
                int* pmax,
                char* file_a,
                char* file_b,
                char* file_x
        ){
        float tempf;
        int i;
        int has_value;
        int found[6];

        char* paraname[6];
        char msg[100];

        for(i = 0; i < 6; i++)
                found[i] = 0;

        paraname[0] = " -n";
        paraname[1] = " -tol";
        paraname[2] = " -max";
        paraname[3] = " -fa";
        paraname[4] = " -fb";
        paraname[5] = " -fx";

        strcpy(msg, "");

        opt_verbose = 0; opt_notime = 0; opt_noiter = 0; opt_nonorm = 0;
        opt_printx =0;
        for(i = 1; i < argc; i++){
                /* An option that takes a value is only accepted when a value
                 * actually follows it; argv[argc] is a null pointer and must
                 * not be handed to atoi, sscanf or strcpy. */
                has_value = (i + 1 < argc);

                if(strcmp(argv[i], "-n") == 0 && has_value) {
                        *pn = atoi(argv[i + 1]);
                        found[0] = 1;
                }
                if(strcmp(argv[i], "-tol") == 0 && has_value) {
                        /* Only accept the tolerance if it really parsed. */
                        if(sscanf(argv[i + 1], "%f", &tempf) == 1) {
                                *ptol = tempf;
                                found[1] = 1;
                        }
                }
                if(strcmp(argv[i], "-max") == 0 && has_value) {
                        *pmax = atoi(argv[i + 1]);
                        found[2] = 1;
                }
                if(strcmp(argv[i], "-fa") == 0 && has_value) {
                        strcpy(file_a, argv[i + 1]);
                        found[3] = 1;
                }
                if(strcmp(argv[i], "-fb") == 0 && has_value) {
                        strcpy(file_b, argv[i + 1]);
                        found[4] = 1;
                }
                if(strcmp(argv[i], "-fx") == 0 && has_value) {
                        strcpy(file_x, argv[i + 1]);
                        found[5] = 1;
                }
                if(strcmp(argv[i], "-verbose") == 0) {
                        opt_verbose = 1;
                }
                if(strcmp(argv[i], "-notime") == 0) {
                        opt_notime = 1;
                }
                if(strcmp(argv[i], "-noiter") == 0) {
                        opt_noiter = 1;
                }
                if(strcmp(argv[i], "-nonorm") == 0) {
                        opt_nonorm = 1;
                }
                if(strcmp(argv[i], "-printx") == 0) {
                        opt_printx = 1;
                }
        }

        /* Collect the names of all required options that were not supplied;
         * all six together need 25 characters, well within msg. */
        for(i = 0; i < 6; i++)
                if(found[i] == 0)
                        strcat(msg, paraname[i]);

        if(strcmp(msg, "") != 0){
                if(my_rank == 0) {
                        printf("\nInvalid arguments...\n\n** Please specify %s\n\n", msg);
                        printf("Syntax: prog1 -n   n\n");
                        printf("              -tol tolerance\n");
                        printf("              -max max_iter\n");
                        printf("              -fa  file_a.mat\n");
                        printf("              -fb  file_b.mat\n");
                        printf("              -fx  file_x.mat (output file)\n");
                        printf("              [-verbose] verbose output\n");
                        printf("              [-printx]  print x (solution) to screen\n");
                        printf("                         (no matter this option is on or off,\n");
                        printf("                          x is always stored in file_x.mat)\n");
                        printf("              [-notime]  do not display elapsed time\n");
                        printf("              [-noiter]  do not display elapsed iteration\n");
                        printf("              [-nonorm]  do not display norm(r)\n");
                        printf("\ne.g:\n    prog1 -n 128 -tol 1e-12 -max 1000 -fa a.mat -fb b.mat -fx x.mat -verbose\n\n");
                        printf("file_A.mat and file_b.mat should be stored in MATLAB -ascii compatible format.\n\n");
                        printf("e.g: MATLAB>> save b.mat b -ascii\n\nSee README file for detail.\n\n");
                        fflush(stdout);
                }
                return 1;
        }

        /* Each rank owns n / np rows, so n must divide evenly. */
        if(*pn % np != 0) {
                if(my_rank == 0){
                        printf("n is not divisible by np. Program aborted.\n"); fflush(stdout);
                }
                return 1;
        }
        return 0;
}


/* Backing storage (4194304 bytes) that main hands to MPI_Buffer_attach; the
 * buffered sends (MPI_Ibsend) in allGather and allReduce draw on it. */
char MPIBuffer[4194304];




/*
 * Program entry: initialise MPI, parse the command line, load and distribute
 * A and b, run the conjugate-gradient solver and report / store the result.
 *
 * Returns 1 when the arguments are invalid, 0 otherwise.
 */
int main(int argc, char** argv) {
        int n, temp, n_bar;
        double *local_A, *local_b, *global_x;
        double timestart, timeend;
        int max_iter, iter;
        double tol, error, norm;
        char file_a[100], file_b[100], file_x[100];

        MPI_Init(&argc, &argv);
        /* Take the start time only once MPI is initialised. */
        timestart=MPI_Wtime();

        MPI_Comm_size(1, &np);
        MPI_Comm_rank(1, &my_rank);

        /* Buffer for the MPI_Ibsend calls in allGather / allReduce. */
        MPI_Buffer_attach(MPIBuffer, 4194304);

        if(parseInput(argc, argv, &n, &tol, &max_iter, file_a, file_b, file_x) != 0){
                MPI_Finalize();
                return 1;
        }

        n_bar = n / np;

        /* Put sizeof first so the products are evaluated in the unsigned size
         * type rather than overflowing in int for large n. */
        local_A = (double*) malloc(sizeof(double) * n_bar * n);
        local_b = (double*) malloc(sizeof(double) * n_bar);
        global_x = (double*) malloc(sizeof(double) * n);

        /* Halve np while it is even: reaching an odd value above 1 means np
         * is not a power of two, so the butterfly pairing cannot be used. */
        temp = np;
        topo = BUTTERFLY;
        while(temp > 1) {
            if(temp & 1){
                topo = RINGPASS;
                break;
            }
            temp >>= 1;
        }
        if(my_rank == 0 && opt_verbose == 1) {
                printf("\nnp = %d, we choose: ", np);
                if(topo == BUTTERFLY)
                        printf("BUTTERFLY\n");
                else
                        printf("RINGPASS\n");
        }

        if(my_rank == 0 && opt_verbose == 1) {
                        printf("\nn = %d, np = %d, tol = %g, max_iter = %d\n\nLoading matrix files\n", n, np, tol, max_iter);
                        fflush(stdout);
        }
        loadMatAndDistribute(local_A, n_bar, n, file_a);
        loadMatAndDistribute(local_b, n_bar, 1, file_b);

        if(my_rank == 0) {
                if(opt_verbose == 1) printf("End loading.\n\nTimer started\n\tCG started\n");
        }

        cgSolver(n, n_bar, tol, max_iter, local_A, local_b, global_x, &iter, &error, &norm);

        timeend=MPI_Wtime();

        /* Only rank 0 holds the gathered solution in global_x, so only rank 0
         * reports results and writes the output file. */
        if(my_rank == 0) {
                if(opt_notime == 0) printf("Elapsed time = %g\n", timeend - timestart);

                if(opt_noiter == 0) printf("Elapsed iter = %d\n", iter);
                if(opt_nonorm == 0){
                        printf("Error = %g\n", error);
                        printf("Norm(r) = sqrt(error) = %g\n", sqrt(error));
                        printf("Norm( b - Ax ) =        %g\n", norm);
                }
                if(opt_printx == 1){
                        printMat(global_x, n, 1);
                }
                fflush(stdout);

                writeMat(global_x, n, 1, file_x);
                printf("\n"); fflush(stdout);
        }

        free(local_A); free(local_b); free(global_x);

        /* MPI_Finalize must be called exactly once per process. */
        MPI_Finalize();
        return 0;
}


/*
 * printIntArray - print `size` ints as "name[] = <a, b, c>" followed by a
 * newline, then flush stdout.
 */
void printIntArray(int* array, char* arrayname, int size)
{
   int idx = 0;

   printf("%s[] = <", arrayname);
   while(idx < size)
   {
      printf("%d", array[idx]);
      idx++;
      /* separator after every element except the last */
      if(idx != size)
         printf(", ");
   }

   printf(">\n");
   fflush(stdout);
}


/*
 * printDoubleArray - print `size` doubles as "name[] = <a, b, c>" (each value
 * formatted with %6.3f) followed by a newline, then flush stdout.
 */
void printDoubleArray(double* array, char* arrayname, int size)
{
   int pos;

   printf("%s[] = <", arrayname);
   for(pos = 0; pos < size; pos++)
   {
      /* separator before every element except the first */
      if(pos > 0)
         printf(", ");
      printf("%6.3f", array[pos]);
   }

   printf(">\n");
   fflush(stdout);
}
