#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

/* Upper bound on the number of threads (cores) the sweep will use. */
const int max_num_threads = 4;

/* Number of times each benchmark configuration is repeated. */
const int iters_per_round = 128;

/* Loop iterations per benchmark; tune so that one benchmark runs for about one second. */
const int NUM_RUNS = 100 * 1000 * 1000;

/*
 * Read CLOCK_MONOTONIC_RAW and return its value in seconds as a double.
 *
 * The absolute value has no particular meaning; callers subtract two
 * readings to obtain an elapsed time.
 *
 * If the clock cannot be read, an error is reported on stderr and the
 * process exits with EXIT_FAILURE, because every measurement derived from
 * an unread timespec would be garbage.
 */
double get_time(void) {

    struct timespec tp;
    if (clock_gettime(CLOCK_MONOTONIC_RAW, &tp) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }

    /* whole seconds plus the nanosecond remainder scaled to seconds */
    return (double)tp.tv_sec + (double)tp.tv_nsec / 1e9;

}

/*
 * Run one sharing experiment and return the accumulated per-thread time.
 *
 * Every thread of the team repeatedly XORs a random byte into the eight
 * bytes starting at data[tid * sep], where tid is its OpenMP thread number.
 * The NUM_RUNS loop iterations are divided among the team by the
 * work-sharing loop. When slots of different threads overlap or lie close
 * together (small sep), the threads deliberately write to the same or
 * neighbouring bytes without synchronisation; that contention is what the
 * benchmark measures.
 *
 * data:        buffer of at least (num_threads - 1) * sep + 8 bytes
 * sep:         distance in bytes between the slots of consecutive threads;
 *              expected to be >= 0
 * num_threads: team size requested via omp_set_num_threads(); expected
 *              to be >= 1
 *
 * Returns the SUM over all threads of the seconds each thread spent between
 * the start barrier and the end of the work-sharing loop (this is what the
 * `+` reduction produces), not the wall-clock time of the region.
 */
double benchmark(uint8_t* data, int sep, int num_threads) {

    uint8_t a = rand() % 256;
    omp_set_num_threads(num_threads);

    /*
     * Clear exactly the bytes the timed loop can touch: the last thread's
     * slot starts at (num_threads - 1) * sep and is eight bytes long.
     * `parallel for` splits the range so each byte is written by one thread.
     */
    const size_t touched = (size_t)(num_threads - 1) * (size_t)sep + 8;
    #pragma omp parallel for shared(data)
    for (size_t i = 0; i < touched; i++) {
        data[i] = 0;
    }

    double elapsed = 0.0;
    #pragma omp parallel shared(data, a, sep) reduction(+:elapsed)
    {
        /*
         * volatile forces a real load and store for every XOR, so the
         * compiler cannot keep the bytes in registers or fold the loop away.
         */
        volatile uint8_t* slot = data + (size_t)omp_get_thread_num() * (size_t)sep;

        /* line all threads up so their timers start together */
        #pragma omp barrier
        elapsed = -get_time();

        #pragma omp for
        for (int i = 0; i < NUM_RUNS; i++) {
            slot[0] ^= a;
            slot[1] ^= a;
            slot[2] ^= a;
            slot[3] ^= a;
            slot[4] ^= a;
            slot[5] ^= a;
            slot[6] ^= a;
            slot[7] ^= a;
        }
        /* the loop above ends with an implicit barrier (no nowait clause) */
        elapsed += get_time();
    }

    return elapsed;

}

/*
 * Sweep the slot separation (0, 4, ..., 124 bytes) and the thread count
 * (1..max_num_threads), repeating every configuration iters_per_round
 * times, and print one "threads separation time" line per measurement.
 *
 * Returns EXIT_FAILURE if the work buffer cannot be allocated,
 * EXIT_SUCCESS otherwise. Command-line arguments are ignored.
 */
int main(int argc, char** argv) {

    (void)argc;
    (void)argv;

    /* one shared buffer, reused by every benchmark call */
    uint8_t* data = valloc((size_t)max_num_threads * 1024);
    if (data == NULL) {
        perror("valloc");
        return EXIT_FAILURE;
    }

    srand((unsigned)time(NULL));
    printf("%12s %12s %12.4s\n", "threads", "separation", "time");

    for (int sep = 0; sep < 128; sep += 4) {
        for (int threads = 1; threads <= max_num_threads; threads++) {
            for (int iters = 0; iters < iters_per_round; iters++) {
                double elapsed = benchmark(data, sep, threads);
                printf("%12i %12i %12.4f\n", threads, sep, elapsed);
                /* flush per line so results survive an interrupted run */
                fflush(stdout);
            }
        }
    }

    free(data);
    return EXIT_SUCCESS;

}