Next: About this document ... Up: CSE 557 Spring 2000 Previous: Results

Conclusions

The performance characteristics of a network of workstations have been successfully measured in this work with the help of the two programs discussed above. The peak processing speed for melmak.cse.psu.edu was measured to be 170 Mflops, for the COCOA server (P-II Xeon 450) to be 255 Mflops, and for each of the COCOA client nodes (P-II 400) to be 225 Mflops. The L1 and L2 cache sizes (data cache only) for melmak were found to be 32 KB and 1024 KB respectively, for the COCOA server to be 16 KB and 1024 KB respectively, and for the COCOA client nodes to be 16 KB and 512 KB respectively. It can be seen that the processing speed drops drastically once the problem size grows larger than the L1 and L2 caches (a drop of $175 \rightarrow 130 \rightarrow 84$ Mflops is seen for melmak, $255 \rightarrow 207 \rightarrow 65$ Mflops for the COCOA server, and $225 \rightarrow 166 \rightarrow 75$ Mflops for the COCOA client nodes, corresponding to the data residing in L1 cache, L2 cache, and main memory, respectively). To achieve the peak processing speed, care should be taken to make the problem fit entirely in the L1 data cache, and the best optimization flags for the available compiler should be used.

For the communication characteristics, it is seen that as the message size goes above 1500 bytes (for COCOA), the message start-up time (i.e., latency) increases (from 0.182 ms to 0.275 ms, i.e., a 51% increase), but so does the communication bandwidth (from 53.02 Mb/s to 87.15 Mb/s, i.e., a 64% increase). This is explained by the MTU on the Ethernet card (or raw packet size) being set to 1500 bytes. Also, it is impressive to note that a communication bandwidth as high as 87% of the peak theoretical bandwidth (of 100 Mb/s) can be obtained on COCOA. It is also seen that the net communication bandwidth tends to saturate as the number of communicating pairs increases. This can be attributed to the finite backplane bandwidth of the switch and the possibility of other processors (owned by other users) communicating at the same time.

/****************************************************************************/
/*    "prog1.c"                                                             */
/****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
/****************************************************************************/
// Value of the measurement counter (which starts at 1) at which the main
// loop stops, so at most STOPCOUNT-1 working-set sizes are timed.
#define STOPCOUNT 19
/****************************************************************************/
/*
 * prog1: estimate floating-point throughput as a function of working-set size.
 *
 * Two arrays of doubles are allocated once. For an active length that starts
 * at 200 elements and is multiplied by 1.6 after every measurement, a 4x4
 * coefficient kernel is applied to the active part of the arrays nsamples
 * times and the whole run is timed with MPI_Wtime(). Every measurement prints
 * the combined size of the two active arrays (KB) and the resulting Mflops to
 * stdout and appends the pair "<KB> <Mflops>" to prog1.out.
 *
 * The sweep ends when the active length reaches the allocated size or when
 * the measurement counter reaches STOPCOUNT, whichever happens first.
 *
 * Returns 0 on success, EXIT_FAILURE if prog1.out cannot be opened or the
 * arrays cannot be allocated.
 */
int main(int argc, char *argv[])
{
    const int    maxsize = 1000000;      // 2 arrays of this size = 16MBytes
    const int    unroll = 4;
    const int    nsamples = 1000;        // large relative to tick, but not too large
    const double factor = 1.6;
    const double flops_per_loop = 28.0;  // 4 results x (4 multiplies + 3 adds)
    int          count = 1;
    int          isize = 200;
    int          i, k, ip1, ip2, ip3, iters;
    double       *x, *y, a, b, c, d;
    double       drand48(void);
    double       nflops, mflops, starttime, elapsed;
    FILE         *out;

    out = fopen("prog1.out", "w");
    if (out == NULL) {
        perror("prog1.out");
        return EXIT_FAILURE;
    }

    MPI_Init(&argc, &argv);

    x = malloc(maxsize * sizeof *x);
    y = malloc(maxsize * sizeof *y);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "prog1: cannot allocate two arrays of %d doubles\n",
                maxsize);
        free(x);
        free(y);
        fclose(out);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    printf("Maxsize = %g KB\n", maxsize * sizeof(double) / 1024.0);

    // main loop, keep increasing active set size until as large as allocated memory
    while (isize < (maxsize - unroll)) {
        // The kernel loop below advances in steps of unroll, so it executes
        // ceil(isize/unroll) times per sample; count the flops actually done.
        iters = (isize + unroll - 1) / unroll;
        nflops = flops_per_loop * nsamples * (double) iters;

        // initialize and get active set in cache; the extra unroll elements
        // cover the last kernel step when isize is not a multiple of unroll
        for (i = 0; i < isize + unroll; i++) {
            x[i] = drand48();
            y[i] = drand48();
        }
        a = drand48();
        b = drand48();
        c = drand48();
        d = drand48();

        // timing loop
        starttime = MPI_Wtime();
        for (k = 0; k < nsamples; k++) {
            // actual calculation, unrolling to minimize indexing overhead;
            // the four rows are cyclic shifts of (a, b, c, d)
            for (i = 0; i < isize; i += unroll) {
                ip1 = i + 1;
                ip2 = i + 2;
                ip3 = i + 3;
                x[i]   = a*y[i] + b*y[ip1] + c*y[ip2] + d*y[ip3];
                x[ip1] = b*y[i] + c*y[ip1] + d*y[ip2] + a*y[ip3];
                x[ip2] = c*y[i] + d*y[ip1] + a*y[ip2] + b*y[ip3];
                x[ip3] = d*y[i] + a*y[ip1] + b*y[ip2] + c*y[ip3];
            }
        }
        elapsed = MPI_Wtime() - starttime;

        // calculate and print
        mflops = nflops * 1.0e-6 / elapsed;
        printf("size(%d) = %g KB; mflops(%d) = %e;\n", count,
               2.0*isize*8.0/1024.0, count, mflops);
        fprintf(out, "%g %g\n", 2*isize*8.0/1024.0, mflops);
        fflush(out);
        count++;

        if (count == STOPCOUNT)
            break;

        isize = (int) (factor * ((double) isize));
    }

    free(x);
    free(y);
    fclose(out);
    MPI_Finalize();
    return 0;
}
/****************************************************************************/

/****************************************************************************/
/*    "prog2.c"                                                             */
/****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include "mpi.h"
/****************************************************************************/
// Program 2 outline
// Smallest message size, in bytes, used in the message-size sweep.
#define START_LEN 0
// Exclusive upper bound on the message size in bytes; also the number of
// bytes allocated for the exchange buffer.
#define FINISH_LEN 101000
// Number of steps the size range is divided into; the message size grows by
// (FINISH_LEN - START_LEN) / NUM_INCR bytes per step.
#define NUM_INCR 101
// Number of send/receive exchanges timed for each message size.
#define NUM_SAMPLES 1000
// Seconds passed to sleep() after each timing run.
#define SLEEP_VAL 1
// True on the process whose rank is 0; expands to an expression that needs a
// variable named myid in scope.
#define MASTER (myid == 0)
/****************************************************************************/
/*
 * prog2: ping-pong timing between pairs of MPI ranks.
 *
 * Ranks are paired as (0,1), (2,3), ...: the even rank of a pair sends first
 * and then receives, the odd rank receives first and then sends. For p = 2,
 * 4, ... up to np, the ranks below p exchange messages of every size from
 * START_LEN up to (but excluding) FINISH_LEN, NUM_SAMPLES exchanges per size.
 * The elapsed time is averaged over the p participating ranks and divided by
 * 2*NUM_SAMPLES, i.e. it is reported per one-way message, in whole
 * microseconds. Rank 0 prints each result and writes
 * "<bytes> <microsec> <p> <pair-step> <size-step>" to prog2.out, with a blank
 * line after each value of p.
 *
 * Because p advances in steps of two, the highest rank of an odd-sized run
 * never takes part in an exchange.
 *
 * Returns 0 on success, EXIT_FAILURE (on every rank) if any rank fails to
 * allocate its buffer or rank 0 cannot open prog2.out.
 */
int main(int argc, char **argv)
{
    char    *buffer;
    int     count1, count2;
    int     dest, order;
    int     np, myid, p, i, j, start_len, finish_len, inc;
    double  time1, time2, work, t;
    double  setup_failed = 0.0, any_failed = 0.0;
    FILE    *out = NULL;
    MPI_Status mpistat;

    start_len = START_LEN;
    finish_len = FINISH_LEN;
    inc = (finish_len - start_len) / NUM_INCR;
    if (inc < 1)
        inc = 1;    // a zero step would never terminate the size sweep

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    printf("myid = %d, np = %d\n", myid, np);
    fflush(stdout);

    if (MASTER) {
        out = fopen("prog2.out", "w");
        if (out == NULL) {
            perror("prog2.out");
            setup_failed = 1.0;
        }
    }

    // even ranks talk to the next rank and send first,
    // odd ranks talk to the previous rank and receive first
    if ((myid % 2) == 0) {
        dest = myid + 1;
        order = 0;
    } else {
        dest = myid - 1;
        order = 1;
    }

    // character array for sending back and forth...
    // zero-filled so that the bytes put on the wire are well defined
    buffer = calloc((size_t) finish_len, sizeof *buffer);
    if (buffer == NULL) {
        fprintf(stderr, "prog2: rank %d cannot allocate %d bytes\n",
                myid, finish_len);
        setup_failed = 1.0;
    }

    // Every rank must take the same decision here: a rank that left on its
    // own would leave the others waiting in the barrier below.
    MPI_Allreduce(&setup_failed, &any_failed, 1, MPI_DOUBLE, MPI_SUM,
                  MPI_COMM_WORLD);
    if (any_failed > 0.0) {
        free(buffer);
        if (out != NULL)
            fclose(out);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    count1 = 1;
    // loop over pairs of processors...
    // increase the number of participating ranks by 2 each time through...
    for (p = 2; p <= np; p += 2) {
        count2 = 1;
        i = start_len;
        while (i < finish_len) {
            // sync everyone up...
            MPI_Barrier(MPI_COMM_WORLD);

            time1 = MPI_Wtime();
            if (myid < p) {
                for (j = 0; j < NUM_SAMPLES; j++) {
                    if (order == 0) {
                        MPI_Send(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD);
                        MPI_Recv(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD,
                                 &mpistat);
                    } else {
                        MPI_Recv(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD,
                                 &mpistat);
                        MPI_Send(buffer, i, MPI_BYTE, dest, 11, MPI_COMM_WORLD);
                    }
                }
            }
            time2 = MPI_Wtime();

            // wait a small amount of time to avoid interrupting
            // other processes
            sleep((unsigned int) SLEEP_VAL);

            // get the avg of the times across the participating procs;
            // idle ranks contribute nothing because the sum is divided by p
            work = (myid < p) ? (time2 - time1) : 0.0;
            MPI_Allreduce(&work, &t, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
            t /= p;
            if (MASTER) {
                // each sample is one send plus one receive, hence the factor 2
                int microsec = (int) floor(t / (2 * NUM_SAMPLES) * 1.0e+6);
                printf("size = %d bytes, t = %d microsec, %d processors\n",
                       i, microsec, p);
                fprintf(out, "%d %d %d %d %d\n", i, microsec, p,
                        count1, count2);
                fflush(stdout);
                fflush(out);
            }
            count2++;
            // some rule for incrementing message size with inc
            i += inc;
        }
        count1++;
        if (MASTER) {
            fprintf(out, "\n");
            fflush(out);
        }
    }

    free(buffer);

    if (MASTER)
        fclose(out);

    MPI_Finalize();

    return 0;
}
/****************************************************************************/

Next: About this document ... Up: CSE 557 Spring 2000 Previous: Results

Anirudh Modi
2000-02-21