* Program: Stream
* Programmer: John D. McCalpin
* Revision: 4.0, July 24, 1995
*
* This version of the STREAM program which uses different starting
* points for  each array.  Charles Grassl, Cray Research, Inc., cmg@cray.com
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran.  These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
* INSTRUCTIONS:
*	1) Stream requires a cpu timing function called second().
*	   A sample is shown below.  This is unfortunately rather
*	   system dependent.  It helps to know the granularity of the
*	   timing.  The code below assumes that the granularity is
*	   1/100 seconds.
*	2) Stream requires a good bit of memory to run.
*	   Adjust the Parameter 'N' in the second line of the main
*	   program to give a 'timing calibration' of at least 20 clicks.
*	   This will provide rate estimates that should be good to
*	   about 5% precision.
*	3) Compile the code with full optimization.  Many compilers
*	   generate unreasonably bad code before the optimizer tightens
*	   things up.  If the results are unreasonable good, on the
*	   other hand, the optimizer might be too smart for me!
*	4) Mail the results to mccalpin@cs.virginia.edu
*	   Be sure to include:
*		a) computer hardware model number and software revision
*		b) the compiler flags
*		c) all of the output from the test case.
*
* Thanks!
*
	program Stream
	parameter (N = 4 001 *1024, NTIMES = 2)
        integer ipad,aoff,boff,coff
        parameter (ipad = 20248 )
	real a(N+ipad),b(N+ipad),c(N+ipad),times(4,NTIMES)
        common /abccom/a,b,c
	real rmstime(4),mintime(4),maxtime(4)
	character*11 label(4)
	real second
	integer realsize,nbpw,bytes(4)
	external second,realsize
	data rmstime/4*0.0/,mintime/4*1.0e+36/,maxtime/4*0.0/
	data label/'Assignment:','Scaling:   ','Summing:   ',
     $              'SAXPYing:  '/
	data bytes/2,2,3,3/
	data aoff,boff,coff/0,0,0/
*        etime()=0.001*timef()

*	--- SETUP --- determine precision and check timing ---

*        write(6,9020) ncpu(),N/1024,ipad
        write(6,9020) 1,N/1024,ipad

        write(6,9030) (label(j),j=1,4)

	nbpw = realsize()

	t = second()
	do 10 j=1,N
	    a(j) = 1.0
	    b(j) = 2.0
	    c(j) = 0.0
   10	continue
	t = second()-t

c	print *,'Timing calibration ; time = ',t*100,' hundredths',
c     $		' of a second'
c	print *,'Increase the size of the arrays if this is <30 ',
c     $		' and your clock precision is =<1/100 second'
c	print *,'---------------------------------------------------'

        do coff=8,32,8
        do boff=8,32,8
        do aoff=8,32,8
*	--- MAIN LOOP --- repeat test cases NTIMES times ---
	do 1000 k=1,NTIMES

	    t = second()
	    do 20 j=1,N
	        c(coff+j) = a(aoff+j)
   20	    continue
	    t = second()-t
	    times(1,k) = t

	    t = second()
	    do 30 j=1,N
	        c(coff+j) = 3.0e0*a(aoff+j)
   30	    continue
	    t = second()-t
	    times(2,k) = t

	    t = second()
	    do 40 j=1,N
	        c(coff+j) = a(aoff+j)+b(boff+j)
   40	    continue
	    t = second()-t
	    times(3,k) = t

	    t = second()
	    do 50 j=1,N
	        c(coff+j) = a(aoff+j)+3.0e0*b(boff+j)
   50	    continue
	    t = second()-t
	    times(4,k) = t
            call dummysub(a,b,c,n)
 1000	continue

*	--- SUMMARY ---
	do 300 k=1,NTIMES
	    do 200 j=1,4
		rmstime(j) = rmstime(j) + times(j,k)**2
		mintime(j) = min( mintime(j), times(j,k) )
		maxtime(j) = max( maxtime(j), times(j,k) )
  200	    continue
  300	continue

      write(6,9040) aoff,boff,coff,
     .              (N*bytes(j)*nbpw/mintime(j)/1.0e6,j=1,4)

c	write (*,9000) 	
c	do 320 j=1,4
c	    rmstime(j) = sqrt(rmstime(j)/float(NTIMES))
c	    write (*,9010) label(j),N*bytes(j)*nbpw/mintime(j)/1.0e6,
c     $                     rmstime(j),mintime(j),maxtime(j)
c  320	continue
        end do
        end do
        end do

 9000	format (' Function',5x,
     $          'Rate (MB/s)     RMS time     Min time     Max time')
 9010	format (1x,a11,4(2x,f11.4))

 9020   format(/' STREAM benchmark'
     .         /' ----------------'
     .         /' Number of CPUs: ',i8
     .         /' Array size:     ',i8,' Kwords'
     .         /' Array padding:  ',i8,' Words'/)

 9030   format (/1x,' Aoff Boff Coff',2x,4a11
     .          /1x,61('-'))
 9040   format (1x,3i5,2x,4f11.3)

        stop
	end

*-------------------------------------
* Sample timing routine
*	This code works on Sun and Silicon Graphics machines.
	real function second(t0)
	real*4 dummy(2),etime
	second = etime(dummy)
	end
* Sample timing routine
*	This code works on IBM RS/6000 machines
cray	real function second(t0)
cray	second = mclock()*0.01
cray	end


*-------------------------------------
* INTEGER FUNCTION realsize()
*
* A semi-portable way to determine the precision of default REAL
* in Fortran.
* Here used to guess how many bytes of storage a real number occupies.
*
	integer function realsize()
	double precision ref(30)
	real test
	double precision pi
        logical silent
        data silent/.true./

C	Test #1 - compare double precision pi to acos(-1.0e0)

	pi = 3.14159 26535 89793 23846 26433 83279 50288 d0
	picalc = acos(-1.0e0)
	diff = abs(picalc-pi)
	if (diff.eq.0.0) then
           if (.not. silent) then
 	    print *,'Test #1 Failed = picalc=piexact'
	    print *,'Apparently Single=Double Precision'
	    print *,'Proceeding to Test #2'
	    print *,' '
           end if
	    goto 200
	else
	    ndigits = -log10(abs(diff))+0.5
	    goto 1000
	endif

C	Test #2 - compare single(1.0d0+delta) to 1.0e0

  200	do 10 j=1,30
	    ref(j) = 1.0d0+10.0d0**(-j)
   10	continue

	do 20 j=1,30
	    test = ref(j)
	    ndigits = j
	    call dummy(test,result)
	    if (test.eq.1.0e0) then
		goto 1000
	    endif
   20	continue
        if (.not. silent) then
	print *,'Test #2 failed - Precision appears to exceed 30 digits'
	print *,'Proceeding to Test #3'
        end if
	goto 300

C	Test #3 - abs(sqrt(1.0d0)-sqrt(1.0e0))

  300	diff = abs(sqrt(1.0d0)-sqrt(1.0e0))
	if (diff.eq.0.0) then
           if (.not. silent) then
	    print *,'Test Failed - sqrt(1.0e0)=sqrt(1.0d0)'
	    print *,'Apparently Single=Double Precision'
	    print *,'Giving up'
           end if
	    goto 400
	else
	    ndigits = -log10(abs(diff))+0.5
	    goto 1000
	endif


 1000   continue
        if (.not. silent) then
        write (*,'(a)') '--------------------------------------'
	write (*,'(1x,a,i2,a)') 'Single precision appears to have ',
     $		ndigits,' digits of accuracy'
	write (*,'(1x,a,i1,a)') 'Assuming ',realsize,
     $                       ' bytes per default REAL word'
	write (*,'(a)') '--------------------------------------'
        end if
	if (ndigits.le.8) then
	    realsize = 4
	else
	    realsize = 8
	endif
	return

  400	print *,'Hmmmm.  I am unable to determine the size of a REAL'
  	print *,'Please enter the number of Bytes per REAL number : '
  	read (*,*) realsize
	if (realsize.ne.4.and.realsize.ne.8) then
	    print *,'Your answer ',realsize,' does not make sense!'
	    print *,'Try again!'
  	    print *,'Please enter the number of Bytes per ',
     $              'REAL number : '
  	    read (*,*) realsize
	endif
	print *,'You have manually entered a size of ',realsize,
     $          ' bytes per REAL number'
	write (*,'(a)') '--------------------------------------'
	end

	subroutine dummy(q,r)
	r = cos(q)
	return
	end
        subroutine dummysub(a,b,c,n)
	return
	end

