diff --git a/test/benchmark/c-ray/README.md b/test/benchmark/c-ray/README.md index a92e402..2ab6598 100644 --- a/test/benchmark/c-ray/README.md +++ b/test/benchmark/c-ray/README.md @@ -42,7 +42,7 @@ cat scene | wasmer run c-ray.wasm -- -s 1024x768 > foo.ppm cat scene | wasmer run --backend singlepass c-ray.wasm -- -s 1024x768 > foo.ppm cat scene | wasmer run --backend llvm c-ray.wasm -- -s 1024x768 > foo.ppm -# Wasmer-JS (V8) https://www.npmjs.com/package/@wasmer/cli +# Wasmer-JS (V8) cat scene | wasmer-js run c-ray.wasm -s 1024x768 > foo.ppm cat scene | node --wasm_interpret_all $(which wasmer-js) run c-ray.wasm -s 1024x768 > foo.ppm diff --git a/test/benchmark/c-ray/scene.jpg b/test/benchmark/c-ray/scene.jpg new file mode 100644 index 0000000..1b93960 Binary files /dev/null and b/test/benchmark/c-ray/scene.jpg differ diff --git a/test/benchmark/c-ray/sphfract.jpg b/test/benchmark/c-ray/sphfract.jpg new file mode 100644 index 0000000..859f327 Binary files /dev/null and b/test/benchmark/c-ray/sphfract.jpg differ diff --git a/test/benchmark/coremark/README.md b/test/benchmark/coremark/README.md index 2a75378..3a6096f 100644 --- a/test/benchmark/coremark/README.md +++ b/test/benchmark/coremark/README.md @@ -1,5 +1,7 @@ # CoreMark 1.0 +https://github.com/eembc/coremark + ### Results ```log @@ -50,6 +52,9 @@ $ENGINES_PATH/wac/wax coremark-wasi.wasm # wasm-micro-runtime $ENGINES_PATH/wasm-micro-runtime/core/iwasm/products/linux/build/iwasm coremark-wasi.wasm +# wasmtime +wasmtime --optimize coremark-wasi.wasm + # Wasmer wasmer run coremark-wasi.wasm @@ -58,7 +63,6 @@ wapm upload coremark-wasi # Wasmer-JS (V8) -# https://www.npmjs.com/package/@wasmer/cli wasmer-js run coremark-wasi.wasm diff --git a/test/benchmark/mandelbrot/README.md b/test/benchmark/mandelbrot/README.md index e36e7b6..236f088 100644 --- a/test/benchmark/mandelbrot/README.md +++ b/test/benchmark/mandelbrot/README.md @@ -1,5 +1,7 @@ # mandelbrot +Based on https://github.com/josch/mandelbrot + ### Results 
```log diff --git a/test/benchmark/mandelbrot/image.png b/test/benchmark/mandelbrot/image.png new file mode 100644 index 0000000..4fb0c4c Binary files /dev/null and b/test/benchmark/mandelbrot/image.png differ diff --git a/test/benchmark/mandelbrot/mandel_dd.c b/test/benchmark/mandelbrot/mandel_dd.c index 10872c7..c12f87d 100644 --- a/test/benchmark/mandelbrot/mandel_dd.c +++ b/test/benchmark/mandelbrot/mandel_dd.c @@ -38,6 +38,7 @@ int main(int argc, char **argv) { double bailout = 128; // with a smaller value there are lines on magn=1 double logLogBailout = log(log(bailout)); int foundperiods = 0; + long maxiter = 50000; /*// maxiter = width * sqrt(magn); temp1 = dd_sqrt(magn); unsigned long maxiter = width * dd_get_ui(temp1);*/ @@ -83,8 +84,7 @@ int main(int argc, char **argv) { int whenupdate = 10; hx = 0; hy = 0; - //for (i = 1; i <= maxiter; i++) { - for (i = 1; i <= 50000; i++) { + for (i = 1; i <= maxiter; i++) { //xx = zx * zx; xx = dd_sqr(zx); //yy = zy * zy; diff --git a/test/benchmark/smallpt/image.jpg b/test/benchmark/smallpt/image.jpg new file mode 100644 index 0000000..2461c7d Binary files /dev/null and b/test/benchmark/smallpt/image.jpg differ diff --git a/test/benchmark/smallpt/smallpt-ex.cpp b/test/benchmark/smallpt/smallpt-ex.cpp new file mode 100644 index 0000000..6051fdf --- /dev/null +++ b/test/benchmark/smallpt/smallpt-ex.cpp @@ -0,0 +1,128 @@ +#include // smallpt, a Path Tracer by Kevin Beason, 2009 +#include // Make : g++ -O3 -fopenmp explicit.cpp -o explicit +#include // Remove "-fopenmp" for g++ version < 4.2 +#include +double get_time() { + struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0; +} +struct Vec { // Usage: time ./explicit 16 && xv image.ppm + double x, y, z; // position, also color (r,g,b) + Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; } + Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); } + Vec operator-(const Vec &b) const { return 
Vec(x-b.x,y-b.y,z-b.z); } + Vec operator*(double b) const { return Vec(x*b,y*b,z*b); } + Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); } + Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); } + double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross: + Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);} +}; +struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} }; +enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance() +struct Sphere { + double rad; // radius + Vec p, e, c; // position, emission, color + Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive) + Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_): + rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {} + double intersect(const Ray &r) const { // returns distance, 0 if nohit + Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0 + double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad; + if (det<0) return 0; else det=sqrt(det); + return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0); + } +}; +Sphere spheres[] = {//Scene: radius, position, emission, color, material + Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left + Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght + Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back + Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt + Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm + Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top + Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.6, SPEC),//Mirr + Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(.75,1.,.95), REFR),//Glas + Sphere(4.0, Vec(50,81.6-16.5,81.6),Vec(4,4,4)*12, Vec(), DIFF),//Lite +}; +int numSpheres = sizeof(spheres)/sizeof(Sphere); +inline double clamp(double x){ return x<0 ? 0 : x>1 ? 
1 : x; } +inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); } +inline bool intersect(const Ray &r, double &t, int &id){ + double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20; + for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&df.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl + if (++depth>5||!p) if (erand48(Xi).1?Vec(0,1):Vec(1))%w).norm(), v=w%u; + Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm(); + + // Loop over any lights + Vec e; + for (int i=0; i.1?Vec(0,1):Vec(1))%sw).norm(), sv=sw%su; + double cos_a_max = sqrt(1-s.rad*s.rad/(x-s.p).dot(x-s.p)); + double eps1 = erand48(Xi), eps2 = erand48(Xi); + double cos_a = 1-eps1+eps1*cos_a_max; + double sin_a = sqrt(1-cos_a*cos_a); + double phi = 2*M_PI*eps2; + Vec l = su*cos(phi)*sin_a + sv*sin(phi)*sin_a + sw*cos_a; + l.norm(); + if (intersect(Ray(x,l), t, id) && id==i){ // shadow ray + double omega = 2*M_PI*(1-cos_a_max); + e = e + f.mult(s.e*l.dot(nl)*omega)*M_1_PI; // 1/pi for brdf + } + } + + return obj.e*E+e+f.mult(radiance(Ray(x,d),depth,Xi,0)); + } else if (obj.refl == SPEC) // Ideal SPECULAR reflection + return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi)); + Ray reflRay(x, r.d-n*2*n.dot(r.d)); // Ideal dielectric REFRACTION + bool into = n.dot(nl)>0; // Ray from outside going in? + double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t; + if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) // Total internal reflection + return obj.e + f.mult(radiance(reflRay,depth,Xi)); + Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm(); + double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n)); + double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P); + return obj.e + f.mult(depth>2 ? (erand48(Xi)

// smallpt, a Path Tracer by Kevin Beason, 2008 +#include // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt +#include // Remove "-fopenmp" for g++ version < 4.2 +#include +double get_time() { + struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0; +} +struct Vec { // Usage: time ./smallpt 5000 && xv image.ppm + double x, y, z; // position, also color (r,g,b) + Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; } + Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); } + Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); } + Vec operator*(double b) const { return Vec(x*b,y*b,z*b); } + Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); } + Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); } + double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross: + Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);} +}; +struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} }; +enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance() +struct Sphere { + double rad; // radius + Vec p, e, c; // position, emission, color + Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive) + Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_): + rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {} + double intersect(const Ray &r) const { // returns distance, 0 if nohit + Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0 + double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad; + if (det<0) return 0; else det=sqrt(det); + return (t=b-det)>eps ? t : ((t=b+det)>eps ? 
t : 0); + } +}; +Sphere spheres[] = {//Scene: radius, position, emission, color, material + Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left + Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght + Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back + Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt + Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm + Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top + Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.6, SPEC),//Mirr + Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(.75,1.,.95), REFR),//Glas + Sphere(4.0, Vec(50,81.6-16.5,81.6),Vec(4,4,4)*12, Vec(), DIFF),//Lite +}; +inline double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; } +inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); } +inline bool intersect(const Ray &r, double &t, int &id){ + double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20; + for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&df.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl + if (++depth>5) if (erand48(Xi).1?Vec(0,1):Vec(1))%w).norm(), v=w%u; + Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm(); + return obj.e + f.mult(radiance(Ray(x,d),depth,Xi)); + } else if (obj.refl == SPEC) // Ideal SPECULAR reflection + return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi)); + Ray reflRay(x, r.d-n*2*n.dot(r.d)); // Ideal dielectric REFRACTION + bool into = n.dot(nl)>0; // Ray from outside going in? + double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t; + if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) // Total internal reflection + return obj.e + f.mult(radiance(reflRay,depth,Xi)); + Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm(); + double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n)); + double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P); + return obj.e + f.mult(depth>2 ? (erand48(Xi)

+# include +# include +# include +# include +# include + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. + * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means that each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. 
+ * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... 
+ * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! 
+ * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], + b[STREAM_ARRAY_SIZE+OFFSET], + c[STREAM_ARRAY_SIZE+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); 
+#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? 
(a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were 
found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j