Mflops is seen for melmak,
For the communication characteristics, it is seen that as the message size goes above 1500 bytes (for COCOA), the message start-up time (i.e., latency) increases (from 0.182 ms to 0.275 ms, i.e., a 51% increase), but so does the communication bandwidth (from 53.02 Mb/s to 87.15 Mb/s, i.e., a 64% increase). This is explained by the MTU (maximum transmission unit, i.e., the raw packet size) of the Ethernet card being set to 1500 bytes. It is also impressive to note that a communication bandwidth as high as 87% of the peak theoretical bandwidth (100 Mb/s) can be obtained on COCOA. Finally, the net communication bandwidth tends to saturate as the number of communicating pairs increases. This can be attributed to the finite backplane bandwidth of the switch and to the possibility of other processors (owned by other users) communicating at the same time.
/****************************************************************************/
/* "prog1.c" */
/****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
/****************************************************************************/
#define STOPCOUNT 19
/****************************************************************************/
/*
 * prog1: floating-point rate as a function of working-set size.
 *
 * Starting from 200 elements per array, the active set is grown by a factor
 * of 1.6 per measurement until it would no longer fit in the allocated
 * arrays or until STOPCOUNT-1 measurements have been taken.  For each size a
 * 4-way unrolled kernel (28 flops per unrolled iteration) is repeated
 * nsamples times and timed with MPI_Wtime().
 *
 * Output: one line per measurement on stdout, and "size_in_KB mflops" pairs
 * in the file "prog1.out".
 *
 * Returns 0 on success, EXIT_FAILURE if the output file cannot be opened or
 * closed cleanly, or if the work arrays cannot be allocated.
 */
int main(int argc, char *argv[])
{
    const int unroll = 4;
    const int maxsize = 1000000;        // 2 arrays of this size = 16MBytes
    const int nsamples = 1000;          // large relative to the timer tick, but not too large
    const double factor = 1.6;          // growth factor of the active set
    const double flops_per_loop = 28.0; // 4 statements x (4 mul + 3 add)
    int count = 1;
    int isize = 200;
    int i, k, ip1, ip2, ip3;
    int status = EXIT_FAILURE;
    double *x = NULL, *y = NULL, a, b, c, d;
    double drand48(void);
    double nflops, mflops, starttime, elapsed;
    FILE *out;

    out = fopen("prog1.out", "w");
    if (out == NULL) {
        perror("prog1.out");
        return EXIT_FAILURE;            // MPI has not been initialized yet
    }

    MPI_Init(&argc, &argv);

    x = malloc((size_t)maxsize * sizeof *x);
    y = malloc((size_t)maxsize * sizeof *y);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "prog1: cannot allocate work arrays\n");
        goto cleanup;
    }

    printf("Maxsize = %g KB\n", maxsize * sizeof(double) / 1024.0);

    // main loop, keep increasing active set size until as large as allocated memory
    while (isize < (maxsize - unroll)) {
        // The kernel loop below runs ceil(isize/unroll) times, so count the
        // flops of whole unrolled iterations (done as double to avoid overflow).
        int niter = (isize + unroll - 1) / unroll;
        nflops = flops_per_loop * (double)nsamples * (double)niter;

        // initialize and get active set in cache; the extra 'unroll' elements
        // cover the overshoot of the last unrolled iteration
        for (i = 0; i < isize + unroll; i++) {
            x[i] = drand48();
            y[i] = drand48();
        }
        a = drand48();
        b = drand48();
        c = drand48();
        d = drand48();

        // timing loop
        starttime = MPI_Wtime();
        for (k = 0; k < nsamples; k++) {
            // actual calculation, unrolling to minimize indexing overhead;
            // each row uses a cyclic rotation of the coefficients (a, b, c, d)
            for (i = 0; i < isize; i += unroll) {
                ip1 = i + 1;
                ip2 = i + 2;
                ip3 = i + 3;
                x[i]   = a*y[i] + b*y[ip1] + c*y[ip2] + d*y[ip3];
                x[ip1] = b*y[i] + c*y[ip1] + d*y[ip2] + a*y[ip3];
                x[ip2] = c*y[i] + d*y[ip1] + a*y[ip2] + b*y[ip3];
                x[ip3] = d*y[i] + a*y[ip1] + b*y[ip2] + c*y[ip3];
            }
        }
        elapsed = MPI_Wtime() - starttime;

        // calculate and print; guard against a timer too coarse to resolve the run
        mflops = (elapsed > 0.0) ? nflops * 1.0e-6 / elapsed : 0.0;
        printf("size(%d) = %g KB; mflops(%d) = %e;\n", count,
               2.0*isize*8.0/1024.0, count, mflops);
        fprintf(out, "%g %g\n", 2*isize*8.0/1024.0, mflops);
        fflush(out);

        count++;
        if (count == STOPCOUNT) {
            break;
        }
        isize = (int)(factor * (double)isize);
    }
    status = 0;

cleanup:
    free(y);
    free(x);
    if (fclose(out) != 0) {
        perror("prog1.out");
        status = EXIT_FAILURE;
    }
    MPI_Finalize();
    return status;
}
/****************************************************************************/
/****************************************************************************/
/* "prog2.c" */
/****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include "mpi.h"
/****************************************************************************/
// Program 2 outline
#define START_LEN 0
#define FINISH_LEN 101000
#define NUM_INCR 101
#define NUM_SAMPLES 1000
#define SLEEP_VAL 1
#define MASTER (myid == 0)
/****************************************************************************/
/*
 * prog2: ping-pong message timing between neighbouring pairs of ranks.
 *
 * Ranks are paired as (0,1), (2,3), ...; the even rank of each pair sends
 * first and the odd rank echoes the message back.  The outer loop increases
 * the number of simultaneously communicating ranks p by 2 each pass; the
 * inner loop sweeps the message size from START_LEN up to (but excluding)
 * FINISH_LEN in NUM_INCR steps.  Each size is exchanged NUM_SAMPLES times
 * and the elapsed time is averaged over the p participating ranks.
 *
 * Rank 0 prints each result and writes
 *   "size one_way_time_in_microsec p pass_index size_index"
 * lines to "prog2.out", with a blank line between passes.
 *
 * Returns 0 on success, EXIT_FAILURE on every rank if any rank fails to set
 * up its message buffer or rank 0 cannot open the output file.
 */
int main(int argc, char **argv)
{
    char *buffer = NULL;
    int count1, count2;
    int dest, order;
    int np, myid, p, i, j, start_len, finish_len, inc;
    int status = EXIT_FAILURE;
    double time1, time2, work, t;
    double setup_failed, any_failed;
    FILE *out = NULL;
    MPI_Status mpistat;

    start_len = START_LEN;
    finish_len = FINISH_LEN;
    inc = (finish_len - start_len) / NUM_INCR;
    if (inc < 1) {
        inc = 1;    // a zero step would make the size sweep loop forever
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    printf("myid = %d, np = %d\n", myid, np);
    fflush(stdout);

    if (MASTER) {
        out = fopen("prog2.out", "w");
        if (out == NULL) {
            perror("prog2.out");
        }
    }

    // Even ranks talk to the next rank and send first; odd ranks talk to the
    // previous rank and receive first.  With an odd np the last rank has no
    // partner, but it never communicates because p only takes even values.
    if ((myid % 2) == 0) {
        dest = myid + 1;
        order = 0;
    } else {
        dest = myid - 1;
        order = 1;
    }

    // character array for sending back and forth (zero-filled so that no
    // uninitialized bytes are ever transmitted)
    buffer = calloc((size_t)finish_len, sizeof *buffer);
    if (buffer == NULL) {
        fprintf(stderr, "prog2: rank %d cannot allocate message buffer\n", myid);
    }

    // Agree collectively on whether setup succeeded everywhere, so that no
    // rank is left waiting in a barrier for a rank that has bailed out.
    // The sum is simply the number of ranks that failed.
    setup_failed = (buffer == NULL || (MASTER && out == NULL)) ? 1.0 : 0.0;
    MPI_Allreduce(&setup_failed, &any_failed, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (any_failed > 0.0) {
        goto cleanup;
    }

    count1 = 1;
    // loop over pairs of processors...
    // increase the number of communicating ranks by 2 each time through...
    for (p = 2; p <= np; p += 2) {
        count2 = 1;
        for (i = start_len; i < finish_len; i += inc) {
            // sync everyone up...
            MPI_Barrier(MPI_COMM_WORLD);
            time1 = MPI_Wtime();
            if (myid < p) {
                for (j = 0; j < NUM_SAMPLES; j++) {
                    if (order == 0) {
                        MPI_Send(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD);
                        MPI_Recv(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD, &mpistat);
                    } else {
                        MPI_Recv(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD, &mpistat);
                        MPI_Send(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD);
                    }
                }
            }
            time2 = MPI_Wtime();

            // wait a small amount of time to avoid interrupting
            // other processes
            sleep(SLEEP_VAL);

            // get the avg of the times across the participating procs;
            // idle ranks contribute exactly zero so the division by p is fair
            work = (myid < p) ? (time2 - time1) : 0.0;
            MPI_Allreduce(&work, &t, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
            t /= p;

            if (MASTER) {
                // each sample is a round trip, i.e. two one-way messages
                int microsec = (int)floor(t / (2 * NUM_SAMPLES) * 1.0e+6);
                printf("size = %d bytes, t = %d microsec, %d processors\n",
                       i, microsec, p);
                fprintf(out, "%d %d %d %d %d\n", i, microsec, p,
                        count1, count2);
                fflush(stdout);
                fflush(out);
            }
            count2++;
        }
        count1++;
        if (MASTER) {
            // blank line separates the data sets of successive passes
            fprintf(out, "\n");
            fflush(out);
        }
    }
    status = 0;

cleanup:
    free(buffer);
    if (out != NULL && fclose(out) != 0) {
        perror("prog2.out");
        status = EXIT_FAILURE;
    }
    MPI_Finalize();
    return status;
}
/****************************************************************************/