/*
 * pipeflex.c - flexible benchmark for measuring pipe's bandwidth
 *
 * Copyright (C) 2000 IBM
 *
 * Written by Rajan Ravindran (rajancr@us.ibm.com) 29 Oct 2001
 * Based on reflex.c written by Shailabh Nagar (nagar@us.ibm.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <wait.h>
#include <math.h>
#include <getopt.h>
#include <fcntl.h>

/* Bytes of stack carved out of child_stack for each cloned child. */
#define STACK_SIZE	(8192)
/* Flags passed to __clone() when launching each child. */
#define CLONE_FLAGS	(CLONE_VM | CLONE_SIGHAND | CLONE_FS)
/* Target half-width of the confidence interval, in percent of the average. */
#define DEF_PERCENT	(1)
/* Number of leading trials excluded from the statistics. */
#define NUM_WARMUP	(1)
/* Trials (warm-up included) that must run before convergence is tested. */
#define MIN_TRIALS	(NUM_WARMUP+5)
/* Size of the results[] table, i.e. the largest number of trials. */
#define MAX_TRIALS	(25)
/* Upper bound on -c; also the size of the per-child arrays. */
#define MAX_CHILDREN    (1024)


/*
 * Macros related to active set formation.
 *
 * They expand in terms of regsetsize, num_active and num_children, which
 * must be visible (and already set) at the point of use.  setid is a
 * per-thread value that must be computed before any macro taking it is
 * called.  Every macro argument is parenthesized so that expressions such
 * as "setid + 1" expand with the intended precedence.
 */

#define isextra(myid)   ((myid) >= (regsetsize * num_active))
#define idextra(setid)  ((regsetsize * num_active) + (setid))
#define needextra(setid) ((setid) < num_children - (regsetsize*num_active))

#define regsetstart(setid)  (regsetsize * (setid))
#define regsetend(setid)    ((regsetsize * ((setid)+1)) - 1)


/*
 * system calls
 *
 * __clone() is declared by hand here and used by main() to start each
 * child on its own slice of child_stack.
 * NOTE(review): presumably the C library's clone entry point -- confirm
 * it is still exported by the libc this is linked against.
 */

int __clone (int (*fn) (void *arg), void *thread_stack, int flags, void *arg);

/*
 * prototypes for this file.
 *
 * Note that worker and child_stack below are variable definitions, not
 * prototypes: worker is the function pointer handed to __clone() (main()
 * sets it to bouncer) and child_stack is the single allocation that holds
 * every child's stack.
 */

void run_test_time(void); 
int bouncer(void *arg);
int (*worker) (void *arg);
double local_exec(void) ;
double probrange(unsigned long top);
void calibration(void) ;
float variance(int n, float sum, float sum2);
int confidence(int iterations);
double uniform(double mean);
void usage(void);
char *child_stack ;

/*
 * Timestamps of one measurement interval: tv1 is taken before the sleep,
 * tv2 after it, and tvr receives tv2 - tv1.  tz1/tz2 are only passed to
 * gettimeofday().
 */
struct timezone tz1;
struct timeval  tv1;
struct timezone tz2;
struct timeval  tv2;
struct timeval  tvr;

int num_children = 200;	        /* -c: number of child processes to create (forced even) */
int num_active ; 		/* num_children/2, set in main() */
int num_seconds = 20 ;          /* -t: seconds slept per measurement trial */
int foutput = 0;                /* -o: 0 summary line, 1 per-trial table, 2 "seconds , MB/s" */
int verbose = 0;                 /* -v: print progress and short read/write diagnostics */
int read_compute_time = 300;      /* -x: mean microseconds of computation a reader does after each read */
int write_compute_time = 300;      /* -y: mean microseconds of computation a writer does before each write */

int start_sem;			/* children block on this until the parent posts it */
int stop_test = 0;		/* set to 1 by run_test_time() to end the children's main loop */
int valid_test = 1;		/* set to 0 on confidence()'s failed-to-converge path;
				 * suppresses the summary line for -o 0 and -o 2 */
int read_size=1;                    /* bytes requested per read(); -r gives the value in Kbytes */
int write_size=1;                   /* bytes passed per write(); -w gives the value in Kbytes */
double rounds_per_microsecond = 0.0 ; /* obtained through calibration */
int local_exec_count = 0;       /* unused */
int mode = 0;			/* 0 - R&W Blocking, 1 - R&W Nonblocking
				 * 2 - R Blocking & W Nonblocking
				 * 3 - W Blocking & R Nonblocking
				 */


/* Per-child byte counters: each reader adds what read() returned; summed by run_test_time(). */
unsigned long long nbytes[MAX_CHILDREN];
/* Per-child flag, set to 1 once that child has left its main loop; polled by its pipe partner. */
int hash[MAX_CHILDREN];

/* Pipe k carries data from writer child 2k+1 to reader child 2k; only the
 * first num_children/2 entries are opened. */
int childpipe[MAX_CHILDREN][2];

/* Multipliers used by confidence(), indexed by (non-warm-up trials - 2).
 * NOTE(review): the values look like two-sided 95% Student-t critical
 * values for 1..24 degrees of freedom -- confirm against a t-table. */
float tau[MAX_TRIALS-1] = { 12.706, 4.303, 3.182, 2.776, 2.571, 2.447, 2.365, 2.306, 2.262, 2.228, 2.201, 2.179, 2.160, 2.145, 2.131, 2.120, 2.110, 2.101, 2.093, 2.086, 2.080, 2.074, 2.069, 2.064 };

/* Per-child I/O buffer, allocated in main() with max(read_size, write_size) bytes. */
char *child_bufs[MAX_CHILDREN];

/* One entry per measurement trial. */
struct _results
{
	float data;	/* bytes transferred per microsecond during this trial */
	float ave;	/* mean of data over the non-warm-up trials so far */
	float var;	/* sample variance over the same trials */
	float conf;	/* confidence-interval half-width computed by confidence() */
} results[MAX_TRIALS] = {{0.0}};

/* Per-child counter array; main() allocates and zeroes it, no other use is
 * visible in this file.
 * NOTE(review): pad presumably spaces the counters apart in memory -- confirm. */
struct _total
{
	unsigned long long count;
	char pad[24];
} *total;

int main(int argc, char *argv[])
{
	int c;
	int clone_err;
	int i,rc;
	int exit_rc = 0;
	int wait_status;
	struct sched_param param;
	struct rlimit myrlimit;
	int TOKENSZ;


	while ((c = getopt(argc, argv, "c:t:x:y:r:w:o:m:v")) != -1) {
	  switch (c) {
	  case 'c': num_children = atoi(optarg); break;
	  case 't': num_seconds = atoi(optarg); break;
	  case 'x': read_compute_time = atoi(optarg); break;
	  case 'y': write_compute_time = atoi(optarg); break;
	  case 'r': read_size = atoi(optarg) * 1024; break;
	  case 'w': write_size = atoi(optarg) * 1024; break;
	  case 'o': foutput = atoi(optarg); break;
	  case 'm': mode = atoi(optarg); break;
	  case 'v': verbose = 1; break;
	  default:  usage();
	    goto exit_main2;
	  }
	}

	if (num_children % 2 != 0)
		num_children -= 1;

	num_active = num_children/2; 

	if (verbose) {
	        printf("c:%d, t:%d, x:%d, y:%d, r:%d, w:%d, o:%d, m:%d, v:%d\n",
                        num_children, num_seconds, read_compute_time,
                        write_compute_time, read_size, write_size,
                        foutput, mode, verbose);
	}

	if ((num_seconds <= 0) ||
	    (num_active <= 0) || 
	    (num_children <= 0) || (num_children > MAX_CHILDREN) ||
	    (read_compute_time < 0) || (write_compute_time < 0) ||
	    (read_size <= 0) || (write_size <= 0)
	   )
	    
	{
	    	usage();
		goto exit_main2;
	}
	

	/* Increase limits on number of open files */
	/* normally 1024 (cur & max), set to MAX_CHILDREN */

	myrlimit.rlim_cur = myrlimit.rlim_max = MAX_CHILDREN*2 ; 

	if (setrlimit(RLIMIT_NOFILE,&myrlimit) != 0) {
		exit_rc = errno ;
		perror("setrlimit() ");
		goto exit_main2;
	}
	
	/* allocate childrens stacks*/
	
	child_stack = malloc(num_children*STACK_SIZE);
	if (child_stack == NULL) {
		exit_rc = errno;
		perror ("malloc of 'child_stack' failed ");
		goto exit_main2;
	}
	
	/* open num_children pipes */
	for (i=0; i< num_children/2; i++) {
		if (pipe(childpipe[i]) < 0) {
			exit_rc = errno;
			perror ("pipe() ");
			goto exit_main3;
		}
	}

	/* set the pipe access value blocking/Nonblocking*/
	if (mode) {
		for (i=0; i< num_children/2; i++) {
			int rval, wval;
			rval = fcntl(childpipe[i][0], F_GETFL);
			wval = fcntl(childpipe[i][1], F_GETFL);
			switch (mode) {
			default:
				break;		

			case 1: 
				rval |= O_NONBLOCK;
				wval |= O_NONBLOCK;
				break;

			case 2: 
				wval |= O_NONBLOCK;
				break;

			case 3: 
				rval |= O_NONBLOCK;
				break;
			}	
			rval = fcntl(childpipe[i][0], F_SETFL, rval);
			if (rval == -1) perror("fcntl(F_SETFL,read) ");
			wval = fcntl(childpipe[i][1], F_SETFL, wval);
			if (wval == -1) perror("fcntl(F_SETFL,write) ");
		}
	}

        /* calibrate internal loops */
	calibration();

	/* start_sem is used to start all children at same time */	

	start_sem = semget (IPC_PRIVATE, 1, IPC_CREAT | IPC_EXCL | 0660);
	if (start_sem == -1) {
		exit_rc = errno;
		perror("semget(start_sem) IPC_CREATE ");
		goto exit_main4;
	}
	
	/* allocate/initialize statistic variables */
	
	total = malloc(num_children*sizeof(struct _total));
	if (total == NULL) {
		exit_rc = errno;
		perror ("malloc of 'total' failed ");
		goto exit_main3;
	}
	for (i = 0 ; i < num_children ; i++)
		total[i].count = 0;

	TOKENSZ = ((read_size > write_size)? read_size:write_size);
	for (i = 0 ; i < num_children ; i++) {
		child_bufs[i] = (char *)calloc(1,TOKENSZ);
	  	if (!child_bufs[i]) {
	    		perror("allocation of child_bufs failed: main");
	    		exit_rc = errno;
	    		goto exit_main3;
		}
	}

	/* Launch threads */
  	worker = bouncer;
	for (i=0; i< num_children; i++) {
		clone_err = __clone(worker, 
				    &child_stack[(i+1)*STACK_SIZE],
				    CLONE_FLAGS,
				    (void*)i);
		if (clone_err == -1) {
			exit_rc = errno;
			perror ("clone() ");
			goto exit_main5;
		}
		if (verbose)
			printf("\t\tLaunched child %d\n",i);

	}

	/* Increase priority of parent thread */

	param.sched_priority = 90;
	rc = sched_setscheduler(getpid(), SCHED_FIFO, &param);
	if (rc == -1) {
		exit_rc = errno;
		perror ("sched_setscheduler() ");
		goto exit_main5;
	}

	run_test_time();
	
exit_main5: 
	/* wait until all children complete */	
	for (i = 0 ; i < num_children ; i++) {
		rc = waitpid (-1, &wait_status, __WCLONE);
		if (verbose)
			printf("Child %d returned\n",i);
		if (rc == -1) {
			exit_rc = errno;
			perror ("waitpid() ");
		}
	}

exit_main4:
 
	rc = semctl(start_sem, 0, IPC_RMID, 0);

exit_main3:

	/* explicitly close all pipes */
	for (i=0; i< num_children/2; i++) {
		close(childpipe[i][0]);
		close(childpipe[i][1]);
	}
	free(child_stack);

	for (i=0; i< num_children; i++) 
		free(child_bufs[i]);
	
exit_main2:
	return (exit_rc) ;
}


/*
 * bouncer - body executed by every cloned child.
 *
 * arg carries the child's index (0 .. num_children-1) cast to a pointer.
 * Children work in pairs over one pipe: the even child 2k reads from
 * childpipe[k][0], the odd child 2k+1 writes to childpipe[k][1].
 *
 * The child first blocks on start_sem until the parent releases it, then
 * loops until stop_test is set.  A reader reads read_size bytes, adds the
 * bytes actually received to nbytes[myid] and then burns a random amount
 * of CPU around read_compute_time microseconds.  A writer burns CPU around
 * write_compute_time microseconds and then writes write_size bytes.
 *
 * After the loop the child sets hash[myid] and keeps doing I/O on the pipe
 * until its partner has set its own flag, so that a partner blocked in
 * read()/write() is woken up and can observe stop_test.
 *
 * Returns 0, or the errno of a failed semop() on start_sem.
 */
int bouncer(void *arg)
{
	int i=0, rc, exit_rc = 0;
	int myid = (int)(long) arg;
	int is_reader = ((myid % 2) == 0);
	int pipe_idx = myid / 2;		/* same pipe for 2k and 2k+1 */
	int partner = is_reader ? (myid + 1) : (myid - 1);
	struct sembuf mysembuf;
	int msgsize ;
	char *pbuf = child_bufs[myid];
	int comp_read_rounds ;
	int comp_write_rounds ;
	int rcnt = 0;				/* reads issued by this child */
	int wcnt = 0;				/* writes issued by this child */

	/* wait to be released */
	mysembuf.sem_num = 0;
	mysembuf.sem_op  = -1;
	mysembuf.sem_flg = 0;

	rc = semop(start_sem, &mysembuf, 1);
	if (rc == -1) {
		exit_rc = errno;
		perror ("child semop(start_sem) failed");
		return(exit_rc);
	}

	/* Actions to be done by each thread */

	while (!stop_test) {
		if (is_reader) {
			msgsize = read(childpipe[pipe_idx][0],pbuf,read_size) ;
			rcnt++;
			if (verbose && (msgsize != read_size)) {
				printf("read error %d %d %d\n",rcnt,msgsize,read_size);
			}
			/*
			 * Count only bytes really received: read() returns -1
			 * on error (e.g. EAGAIN on a non-blocking pipe) and
			 * adding that to the unsigned counter would corrupt it.
			 */
			if (msgsize > 0)
				nbytes[myid] += msgsize;
			if (!stop_test) {
				comp_read_rounds = (int) (uniform(read_compute_time) * rounds_per_microsecond) ;
				for (i=0;i<comp_read_rounds;i++)
					local_exec();
			}
		} else {
			if (!stop_test) {
				comp_write_rounds = (int) (uniform(write_compute_time) * rounds_per_microsecond) ;
				for (i=0;i<comp_write_rounds;i++)
					local_exec();
			}
			wcnt++;
			msgsize = write(childpipe[pipe_idx][1],pbuf,write_size);
			if (verbose && (msgsize != write_size)) {
				printf("write error %d %d %d\n",wcnt,msgsize,write_size);
			}
		}
	}

	/* announce that this child has left the main loop */
	hash[myid] = 1;
  	sleep(3);

	/*
	 * Make sure the partner returns as well: keep draining (reader) or
	 * feeding (writer) the pipe until the partner has set its flag.
	 */
	while (hash[partner] == 0) {
		if (is_reader)
			read(childpipe[pipe_idx][0],pbuf,read_size) ;
		else
			write(childpipe[pipe_idx][1],pbuf,write_size);
		sleep (2);
	}

	return(exit_rc);
}

/*
 * usage - print the command-line synopsis and option descriptions to
 * stdout.  The synopsis lists every option accepted by main()'s getopt
 * string, including -m.
 */
void usage(void)
{
	fputs("Usage: pipeflex [-c num_child] [-t num_sec] [-x read_compute_time] [-y write_compute_time] [-r pipe_buf] [-w pipe_buf] [-o N] [-m N] [-v]\n\n"
	      "version 1.0.0  rajancr@us.ibm.com \n"
	      "-c N, number of child processes to create (should be EVEN)\n"
	      "-t N, number of seconds to run test before checking throughput\n"
	      "-x N, number of microseconds of random computation to perform \n"
	      "      before read from pipe [0.5*N .. 1.5*N]\n"
	      "-y N, number of microseconds of random computation to perform \n"
	      "      before write into pipe [0.5*N .. 1.5*N]\n"
	      "-r  , token size to read from pipe [Kbytes]\n"
	      "-w  , token size to write into pipe [Kbytes]\n"
	      "-o N, how to report results\n"
	      "      0 default reporting of results\n"
	      "      1 tabular output\n"
	      "      2 for -t, prints 'seconds , transfer/sec'\n"
	      "-m N, access mode\n"
	      "      0 Read & Write blocking\n"
	      "      1 Read & Write Nonblocking\n"
	      "      2 Read blocking, Write Nonblocking\n"
	      "      3 Write blocking, Read Nonblocking\n"
	      "-v  , verbose mode\n",
	      stdout);
}

void run_test_time(void)
{
	int i,rc;
	int exit_rc = 0;
	int iterations = 0;
	unsigned long long x;
	unsigned long long t, prev_t;
	struct sembuf mysembuf1;

	mysembuf1.sem_num = 0;
	mysembuf1.sem_op  = num_children;
	mysembuf1.sem_flg = 0;
	
	/* post the sema4 to allow children to start */

	rc = semop (start_sem, &mysembuf1, 1);
	if (rc == -1) {
		stop_test = 1;
		exit_rc = errno;
		perror ("parent semop(start_sem) failed ");
		goto exit_test;
	}

restart:

        prev_t = 0;
        t = 0;
 
        /* get the start time */
 
        rc = gettimeofday (&tv1, &tz1);
        if (rc) {
                stop_test = 1;
                exit_rc = errno;
                perror ("gettimeofday failed on tv1 ");
                goto exit_test;
        }
 
        /* wait until timeout */
 
        for (i = 0 ; i < num_children ; i++) prev_t += nbytes[i];
        sleep (num_seconds);
        for (i = 0 ; i < num_children ; i++) t += nbytes[i];
 
        /* get end time */
 
        rc = gettimeofday (&tv2, &tz2);
        if (rc) {
                stop_test = 1;                
                exit_rc = rc;
                perror ("gettimeofday failed on tv2 ");
                goto exit_test;
        }
 
        /* compute microseconds per transfer */
 
        timersub(&tv2, &tv1, &tvr); /* tvr now contains result of tv2-tv1 */
 
        x = (unsigned long long)tvr.tv_sec * 1000000;
        x+= (unsigned long long)tvr.tv_usec;
        results[iterations].data = (float)(t - prev_t);
        results[iterations].data /= (float)x;
 
        iterations++;
        if (confidence(iterations)) {
                stop_test = 1;
        } else {
                goto restart;
        }                                
	if (verbose) printf (" Test Completed.\n");

	switch (foutput) {
		
	case 1:
		printf ("     MB/sec        ave    variance  confidence\n");
		printf ("     ------        ---    --------  ----------\n");
		for (i = 0 ; i < iterations ; i++) {
			printf ("%11.3f %11.3f %11.3f %11.3f ",
				results[i].data,
				results[i].ave,
				results[i].var,
				results[i].conf);
			if (i < NUM_WARMUP)
				printf ("warmup");
			printf ("\n");
		}
		printf ("%u children transferred %11.2f [MB/sec]\n", num_children, (results[iterations-1].ave * 1000000)/1024.0/1024.0);
		break;
	case 2:
		if (!valid_test) break;
		printf ("%u , %0.3f\n", num_seconds, (results[iterations-1].ave * 1000000)/1024.0/1024.0);
		break;
		
	default:
		if (!valid_test) break;
		printf ("%u children transferred %11.2f [MB/sec]\n", num_children, (results[iterations-1].ave * 1000000)/1024.0/1024.0);
		break;
	}
	
exit_test:	
	return;	
}

/*
 * local_exec - one unit of simulated computation: a single one-byte
 * memcpy between two locals.  calibration() measures how many of these
 * run per microsecond and the children call it in loops to burn CPU.
 *
 * Returns the copied value converted to double, which is always 0.0; the
 * callers in this file ignore it.
 */
double local_exec(void)
{
	unsigned int a = 0, b = 0;

	memcpy(&a, &b, 1);
	return (double)a;
}

/*
 * calibration - measure how many local_exec() calls fit into one
 * microsecond and store the answer in rounds_per_microsecond.
 *
 * Batches of 100000 calls are executed until clock() has advanced by at
 * least 5*CLOCKS_PER_SEC; the rate is then total calls divided by the
 * elapsed clock() time expressed in microseconds.
 */
void calibration(void)
{
	unsigned long batch = 100000;
	unsigned long started, now, deadline;
	int batches = 0;
	int k;

	started = clock() ;
	deadline = started + 5*CLOCKS_PER_SEC;

	for (;;) {
		for (k = 0; k < batch; k++)
			local_exec();

		now = clock();
		batches++;

		if (!(now < deadline))
			break;
	}

	/* total number of local_exec() calls performed */
	batch *= batches;

	rounds_per_microsecond =
		((double)batch*CLOCKS_PER_SEC) / ((double)(now-started) * 1000000.0);
}

/******************** Statistical functions **********************************/

/*
 * probrange - draw one value with random() and scale it by top/RAND_MAX.
 */
double probrange(unsigned long top)
{
	const double draw = random();

	return (top * draw) / ((double)RAND_MAX);
}

/*
 * variance - sample variance from running sums.
 *
 * n    number of samples
 * sum  sum of the samples
 * sum2 sum of the squared samples
 *
 * Returns (n*sum2 - sum^2) / (n*(n-1)), evaluated in float.
 */
float variance(int n, float sum, float sum2)
{
	const float count = (float)n;
	const float numerator = (count * sum2) - (sum * sum);
	const float denominator = count * (float)(n - 1);

	return numerator / denominator;
}

/*
 * confidence - update the running statistics after a trial and decide
 * whether the test may stop.
 *
 * iter is the number of trials completed so far, warm-up trials included.
 * The first NUM_WARMUP trials are excluded from all statistics.  For the
 * most recent trial this fills in results[iter-1].ave, and from the second
 * measured trial on also .var and .conf (tau[] multiplier times the
 * standard error of the mean).
 *
 * Returns 1 when the test should stop:
 *   - at least MIN_TRIALS trials have run and conf is within DEF_PERCENT
 *     percent of the average, or
 *   - MAX_TRIALS trials have run without reaching that level; valid_test
 *     is then cleared and a message is printed.
 * Returns 0 when another trial is needed.
 */
int confidence(int iter)
{
	float	sum_trials = 0.0;
	float	sum_trials2 = 0.0;
	float	percent = (float)DEF_PERCENT / 100.0;
	int	i,x,y;

	if (iter <= NUM_WARMUP)
		return(0);

	/* never index past results[] or tau[]; tell the caller to stop */
	if (iter > MAX_TRIALS) {
		valid_test = 0;
		return(1);
	}

	x = iter - NUM_WARMUP;	/* number of measured (non-warm-up) trials */
	y = iter - 1;		/* index of the most recent trial */

	/* compute average */

	for (i = NUM_WARMUP ; i < iter ; i++) {
		sum_trials  += results[i].data;
		sum_trials2 += results[i].data * results[i].data;
	}
	results[y].ave = sum_trials / (float)(x);

	/* variance needs at least two measured trials */
	if (iter < NUM_WARMUP + 2)
		return(0);

	/* compute the variance; rounding can push it to NaN or below zero */

	results[y].var = variance(x,sum_trials,sum_trials2);
	if (isnan(results[y].var))
		results[y].var = 0.0;
	if (results[y].var < 0.0)
		results[y].var = 0.0;

	/* 95% confident that ave is within percent% of "true" average ? */

	results[y].conf = tau[x-2] * sqrt(results[y].var / (float)x);
	if (isnan(results[y].conf))
		results[y].conf = 0.0;

	if (iter < MIN_TRIALS)
		return(0);

	if (results[y].conf <= results[y].ave * percent)
		return(1);

	/* not converged: give up once the trial limit is reached */
	if (iter >= MAX_TRIALS) {
		valid_test = 0;
		printf("\n*****> failed to reach confidence1 level <*****\n");
		return(1);
	}

	return(0);
}

/*
 * uniform - scale mean by a factor built from one random() draw:
 * (random()/RAND_MAX + 0.5), i.e. a value in [0.5*mean, 1.5*mean] when
 * random() stays within 0..RAND_MAX.
 */
double uniform(double mean)
{
	const double draw = random();
	const double factor = (draw / (double)RAND_MAX) + 0.5;

	return factor * mean ;
}