/* Getting good timings can be pretty annoying.  The (old) version of
 * membench at
 *
 *   www.cs.berkeley.edu/~richie/bebop/notes/matmul/files/membench/
 *
 * uses timing routines which are available on the Sun, but not on Linux
 * boxen.  This version provides two timing options:
 *
 * 1) Use the clock() function.  This function is part of the standard
 *    C library, and is pretty universally available.  Unfortunately,
 *    the granularity available from clock() is generally not
 *    that great.  You can get around that somewhat by running the
 *    timing trial many times in a loop (a good idea regardless),
 *    but it's still sort of annoying.
 *
 *    Note that the clock routine is supposed to return only the
 *    processor time used by the program.  In this case, that's what we
 *    want (I think), but often you'd like to use wall clock time for
 *    programming more complicated things.  The functions "time"
 *    and "difftime" are useful for that, though once again you must
 *    be careful about the resolution.
 *
 * 2) Use the POSIX realtime clock, which I think accesses the cycle counters
 *    on chip.  This is available under Linux, but it is not universally
 *    available; in particular, it is not available on my OS X laptop.
 *
 * By default, we try to use the realtime clock.  You will need to compile
 * with the flag -DUSE_CLOCK to use the basic clock() function.
 */

#include <stdio.h>
#include <time.h>
#include <limits.h>
#include <sys/times.h>
#include <sys/types.h>

#include "timing.h"

/* Benchmark parameters.  Sizes are counted in array elements, not bytes.
 *
 * SAMPLE    - multiplied by the stride to give the number of sweeps over
 *             the array performed in one timed step.
 * CACHE_MIN - smallest working-set size that is measured.
 * CACHE_MAX - largest working-set size that is measured; also the number
 *             of elements in x.
 */
#define SAMPLE    10
#define CACHE_MIN (1024)
#define CACHE_MAX (16*1024*1024)

/* Array swept by the timed loop.  The elements are unsigned on purpose:
 * the benchmark keeps incrementing the same elements for as long as the
 * timer says so, and an unsigned counter wraps around with well-defined
 * behaviour where a signed one would overflow (undefined behaviour).
 */
unsigned int x[CACHE_MAX];

/* Memory-hierarchy micro-benchmark driver.
 *
 * For every working-set size csize (CACHE_MIN up to CACHE_MAX elements,
 * doubling each time) and every stride (1 up to csize/2 elements, doubling
 * each time) this:
 *
 *   1. repeatedly sweeps x at that stride, incrementing every visited
 *      element, until timespec_diff() reports at least 1.0 (the result is
 *      treated as seconds below);
 *   2. runs the same loop nest for the same number of steps without
 *      touching x, as an estimate of the pure loop overhead;
 *   3. prints the difference between the two timings, averaged over the
 *      number of array accesses, in nanoseconds.
 *
 * Sizes and strides are printed in bytes.  Always returns 0.
 */
int main(void)
{
    int i, index, stride, limit;
    /* Unsigned and zero-initialised: it is only a sink for the overhead
     * loop, but reading it uninitialised or overflowing a signed
     * accumulator would both be undefined behaviour. */
    unsigned int temp = 0;
    long steps, tsteps, csize;
    double sec, sec0, sec1, ns_per_step, reads_per_step;
    timing_t start, finish;

    for (csize = CACHE_MIN; csize <= CACHE_MAX; csize *= 2) {
        for (stride = 1; stride <= csize/2; stride *= 2) {
            sec0 = 0;
            sec1 = 0;
            /* Exclusive upper bound for index: the last element touched
             * is x[csize - stride]. */
            limit = csize - stride + 1;

            /* Time the loop with strided access + loop overhead */
            steps = 0;
            get_time(&start);
            do {
                for (i = SAMPLE*stride; i != 0; i--)
                    for (index = 0; index < limit; index += stride)
                        x[index]++;
                get_time(&finish);
                steps++;
                sec0 = timespec_diff(start, finish);
            } while (sec0 < 1.0);

            /* Try to time just the overheads: same loop nest and same
             * number of steps, but no access to x.
             * NOTE(review): an optimizing build may remove this loop
             * because temp is never consumed -- confirm the build flags. */
            tsteps = 0;
            get_time(&start);
            do {
                for (i = SAMPLE*stride; i != 0; i--)
                    for (index = 0; index < limit; index += stride)
                        temp += index;
                get_time(&finish);
                tsteps++;
                sec1 = timespec_diff(start, finish);
            } while (tsteps < steps);

            /* Report on the average time per read/write */
            sec            = sec0 - sec1;
            ns_per_step    = (sec*1.0e9)/steps;
            /* Accesses per step: SAMPLE*stride sweeps, each visiting
             * (limit-1)/stride + 1 elements. */
            reads_per_step = SAMPLE*stride*((limit-1.0)/stride+1.0);
            /* The byte counts are size_t values; cast them so that they
             * match the conversion specifier exactly. */
            printf("Size: %7lu Stride: %7lu read+write: %14.2f ns\n",
                   (unsigned long)(csize * sizeof x[0]),
                   (unsigned long)(stride * sizeof x[0]),
                   ns_per_step/reads_per_step);
            fflush(stdout);
        }
        printf("\n");
    }
    return 0;
}
