forked from Mirrors/wasm3
parent
4b4d131955
commit
ad1fdbc8f4
After Width: | Height: | Size: 109 KiB |
After Width: | Height: | Size: 188 KiB |
After Width: | Height: | Size: 474 KiB |
After Width: | Height: | Size: 248 KiB |
@ -0,0 +1,128 @@
|
|||||||
|
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2009
|
||||||
|
#include <stdlib.h> // Make : g++ -O3 -fopenmp explicit.cpp -o explicit
|
||||||
|
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
|
||||||
|
#include <time.h>
|
||||||
|
// Returns a timestamp in milliseconds, intended for measuring elapsed time:
// only the difference between two calls is meaningful.
// The monotonic clock is preferred because the wall clock (CLOCK_REALTIME)
// can be stepped while the program runs, which would corrupt the measured
// interval. If the monotonic clock is unavailable the wall clock is used,
// and 0.0 is returned if no clock can be read at all.
double get_time() {
  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0 &&
      clock_gettime(CLOCK_REALTIME, &ts) != 0) {
    return 0.0;
  }
  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
|
||||||
|
// Usage: time ./explicit 16 > image.ppm && xv image.ppm
// (the image is written to stdout as a plain-text PPM)
//
// Three-component double vector, used both for positions/directions and
// for colors (r,g,b).
struct Vec {
  double x, y, z;                  // position, also color (r,g,b)
  Vec(double x_=0, double y_=0, double z_=0) : x(x_), y(y_), z(z_) {}
  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
  // Component-wise product (used to tint radiance by a surface color).
  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
  // Normalizes this vector IN PLACE and returns a reference to it.
  // The length is not checked, so a zero vector yields non-finite values.
  Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; }
  // Cross product. Const-correct so it also works on const vectors and
  // temporaries on either side.
  Vec operator%(const Vec &b) const {
    return Vec(y*b.z-z*b.y, z*b.x-x*b.z, x*b.y-y*b.x);
  }
};
|
||||||
|
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
|
||||||
|
// Material types, used in radiance() to select the scattering model.
enum Refl_t {
  DIFF,   // diffuse
  SPEC,   // specular
  REFR    // refractive
};
|
||||||
|
struct Sphere {
|
||||||
|
double rad; // radius
|
||||||
|
Vec p, e, c; // position, emission, color
|
||||||
|
Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)
|
||||||
|
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
|
||||||
|
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
|
||||||
|
double intersect(const Ray &r) const { // returns distance, 0 if nohit
|
||||||
|
Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
|
||||||
|
double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
|
||||||
|
if (det<0) return 0; else det=sqrt(det);
|
||||||
|
return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Sphere spheres[] = {//Scene: radius, position, emission, color, material
|
||||||
|
Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
|
||||||
|
Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
|
||||||
|
Sphere(1e5, Vec(50,40.8, 1e5), Vec(),Vec(.75,.75,.75),DIFF),//Back
|
||||||
|
Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(), DIFF),//Frnt
|
||||||
|
Sphere(1e5, Vec(50, 1e5, 81.6), Vec(),Vec(.75,.75,.75),DIFF),//Botm
|
||||||
|
Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
|
||||||
|
Sphere(16.5,Vec(27,16.5,47), Vec(),Vec(1,1,1)*.6, SPEC),//Mirr
|
||||||
|
Sphere(16.5,Vec(73,16.5,78), Vec(),Vec(.75,1.,.95), REFR),//Glas
|
||||||
|
Sphere(4.0, Vec(50,81.6-16.5,81.6),Vec(4,4,4)*12, Vec(), DIFF),//Lite
|
||||||
|
};
|
||||||
|
int numSpheres = sizeof(spheres)/sizeof(Sphere);
|
||||||
|
// Limits x to the range [0,1]; values already inside are returned unchanged.
inline double clamp(double x){
  if (x<0) return 0;
  if (x>1) return 1;
  return x;
}
|
||||||
|
// Converts a color component to an integer in 0..255: the value is clamped
// to [0,1], raised to the power 1/2.2, scaled by 255 and rounded.
inline int toInt(double x){
  const double corrected = pow(clamp(x),1/2.2);
  return int(corrected*255+.5);
}
|
||||||
|
inline bool intersect(const Ray &r, double &t, int &id){
|
||||||
|
double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
|
||||||
|
for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
|
||||||
|
return t<inf;
|
||||||
|
}
|
||||||
|
// Estimates the radiance arriving along ray r (recursive path tracing with
// explicit sampling of the light sources on diffuse surfaces).
//   r     - ray to trace
//   depth - current recursion depth; Russian roulette starts after 5 bounces
//   Xi    - erand48() state; advanced by every random draw, so the order of
//           the draws determines the result
//   E     - multiplier for the hit object's own emission on the diffuse and
//           roulette-termination paths. The diffuse branch recurses with
//           E=0 because the light loop has already added the direct light
//           for that bounce.
// Returns black (Vec()) when the ray hits nothing.
Vec radiance(const Ray &r, int depth, unsigned short *Xi,int E=1){
  double t;                               // distance to intersection
  int id=0;                               // id of intersected object
  if (!intersect(r, t, id)) return Vec(); // if miss, return black
  const Sphere &obj = spheres[id];        // the hit object
  Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
  double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl

  // Russian roulette: survive with probability p (re-weighting f),
  // otherwise stop and return only the emission term.
  if (++depth>5||!p) {
    if (erand48(Xi)<p) f=f*(1/p);
    else return obj.e*E;
  }

  if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
    double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
    Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
    Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();

    // Loop over any lights
    Vec e;
    for (int i=0; i<numSpheres; i++){
      const Sphere &s = spheres[i];
      if (s.e.x<=0 && s.e.y<=0 && s.e.z<=0) continue; // skip non-lights

      // Sample a direction inside the cone subtended by the light sphere.
      Vec sw=s.p-x, su=((fabs(sw.x)>.1?Vec(0,1):Vec(1))%sw).norm(), sv=sw%su;
      double cos_a_max = sqrt(1-s.rad*s.rad/(x-s.p).dot(x-s.p));
      double eps1 = erand48(Xi), eps2 = erand48(Xi);
      double cos_a = 1-eps1+eps1*cos_a_max;
      double sin_a = sqrt(1-cos_a*cos_a);
      double phi = 2*M_PI*eps2;
      Vec l = su*cos(phi)*sin_a + sv*sin(phi)*sin_a + sw*cos_a;
      l.norm();

      // Shadow ray: uses its own hit variables so the primary hit's
      // t/id are not overwritten.
      double shadowT;
      int shadowId = 0;
      if (intersect(Ray(x,l), shadowT, shadowId) && shadowId==i){
        double omega = 2*M_PI*(1-cos_a_max);
        e = e + f.mult(s.e*l.dot(nl)*omega)*M_1_PI;  // 1/pi for brdf
      }
    }

    return obj.e*E+e+f.mult(radiance(Ray(x,d),depth,Xi,0));
  } else if (obj.refl == SPEC) {          // Ideal SPECULAR reflection
    return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
  }

  Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
  bool into = n.dot(nl)>0;                // Ray from outside going in?
  double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
  if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
    return obj.e + f.mult(radiance(reflRay,depth,Xi));
  Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
  double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
  double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);

  if (depth>2) {                          // Russian roulette
    if (erand48(Xi)<P)
      return obj.e + f.mult(radiance(reflRay,depth,Xi)*RP);
    return obj.e + f.mult(radiance(Ray(x,tdir),depth,Xi)*TP);
  }

  // Both branches are traced. The two calls share the RNG state Xi, so they
  // are sequenced explicitly (reflection first, then transmission); as
  // operands of a single '+' their evaluation order would be unspecified
  // and the result could differ between compilers.
  Vec reflected = radiance(reflRay,depth,Xi)*Re;
  Vec transmitted = radiance(Ray(x,tdir),depth,Xi)*Tr;
  return obj.e + f.mult(reflected+transmitted);
}
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 2; // # samples
|
||||||
|
double tbeg = get_time();
|
||||||
|
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
|
||||||
|
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
|
||||||
|
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
|
||||||
|
for (int y=0; y<h; y++){ // Loop over image rows
|
||||||
|
fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
|
||||||
|
for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++) // Loop cols
|
||||||
|
for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) // 2x2 subpixel rows
|
||||||
|
for (int sx=0; sx<2; sx++, r=Vec()){ // 2x2 subpixel cols
|
||||||
|
for (int s=0; s<samps; s++){
|
||||||
|
double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
|
||||||
|
double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
|
||||||
|
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
|
||||||
|
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
|
||||||
|
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
|
||||||
|
} // Camera rays are pushed ^^^^^ forward to start in interior
|
||||||
|
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double tend = get_time();
|
||||||
|
fprintf(stderr, "\nElapsed: %5.1f ms\n", tend - tbeg);
|
||||||
|
fprintf(stdout, "P3\n%d %d\n%d\n", w, h, 255);
|
||||||
|
for (int i=0; i<w*h; i++)
|
||||||
|
fprintf(stdout,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
|
||||||
|
}
|
@ -0,0 +1,106 @@
|
|||||||
|
#include <math.h> // smallpt, a Path Tracer by Kevin Beason, 2008
|
||||||
|
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
|
||||||
|
#include <stdio.h> // Remove "-fopenmp" for g++ version < 4.2
|
||||||
|
#include <time.h>
|
||||||
|
// Returns a timestamp in milliseconds, intended for measuring elapsed time:
// only the difference between two calls is meaningful.
// The monotonic clock is preferred because the wall clock (CLOCK_REALTIME)
// can be stepped while the program runs, which would corrupt the measured
// interval. If the monotonic clock is unavailable the wall clock is used,
// and 0.0 is returned if no clock can be read at all.
double get_time() {
  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0 &&
      clock_gettime(CLOCK_REALTIME, &ts) != 0) {
    return 0.0;
  }
  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}
|
||||||
|
// Usage: time ./smallpt 5000 > image.ppm && xv image.ppm
// (the image is written to stdout as a plain-text PPM)
//
// Three-component double vector, used both for positions/directions and
// for colors (r,g,b).
struct Vec {
  double x, y, z;                  // position, also color (r,g,b)
  Vec(double x_=0, double y_=0, double z_=0) : x(x_), y(y_), z(z_) {}
  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
  // Component-wise product (used to tint radiance by a surface color).
  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
  // Normalizes this vector IN PLACE and returns a reference to it.
  // The length is not checked, so a zero vector yields non-finite values.
  Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; }
  // Cross product. Const-correct so it also works on const vectors and
  // temporaries on either side.
  Vec operator%(const Vec &b) const {
    return Vec(y*b.z-z*b.y, z*b.x-x*b.z, x*b.y-y*b.x);
  }
};
|
||||||
|
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
|
||||||
|
// Material types, used in radiance() to select the scattering model.
enum Refl_t {
  DIFF,   // diffuse
  SPEC,   // specular
  REFR    // refractive
};
|
||||||
|
struct Sphere {
|
||||||
|
double rad; // radius
|
||||||
|
Vec p, e, c; // position, emission, color
|
||||||
|
Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)
|
||||||
|
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
|
||||||
|
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
|
||||||
|
double intersect(const Ray &r) const { // returns distance, 0 if nohit
|
||||||
|
Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
|
||||||
|
double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
|
||||||
|
if (det<0) return 0; else det=sqrt(det);
|
||||||
|
return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Scene description. Every entry is: radius, position, emission, color, material.
Sphere spheres[] = {
  // Left
  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(), Vec(.75,.25,.25), DIFF),
  // Right
  Sphere(1e5, Vec(-1e5+99,40.8,81.6), Vec(), Vec(.25,.25,.75), DIFF),
  // Back
  Sphere(1e5, Vec(50,40.8, 1e5), Vec(), Vec(.75,.75,.75), DIFF),
  // Front
  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(), Vec(), DIFF),
  // Bottom
  Sphere(1e5, Vec(50, 1e5, 81.6), Vec(), Vec(.75,.75,.75), DIFF),
  // Top
  Sphere(1e5, Vec(50,-1e5+81.6,81.6), Vec(), Vec(.75,.75,.75), DIFF),
  // Mirror
  Sphere(16.5, Vec(27,16.5,47), Vec(), Vec(1,1,1)*.6, SPEC),
  // Glass
  Sphere(16.5, Vec(73,16.5,78), Vec(), Vec(.75,1.,.95), REFR),
  // Light
  Sphere(4.0, Vec(50,81.6-16.5,81.6), Vec(4,4,4)*12, Vec(), DIFF),
};
|
||||||
|
// Limits x to the range [0,1]; values already inside are returned unchanged.
inline double clamp(double x){
  if (x<0) return 0;
  if (x>1) return 1;
  return x;
}
|
||||||
|
// Converts a color component to an integer in 0..255: the value is clamped
// to [0,1], raised to the power 1/2.2, scaled by 255 and rounded.
inline int toInt(double x){
  const double corrected = pow(clamp(x),1/2.2);
  return int(corrected*255+.5);
}
|
||||||
|
inline bool intersect(const Ray &r, double &t, int &id){
|
||||||
|
double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
|
||||||
|
for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
|
||||||
|
return t<inf;
|
||||||
|
}
|
||||||
|
// Estimates the radiance arriving along ray r (recursive path tracing).
//   r     - ray to trace
//   depth - current recursion depth; Russian roulette starts after 5 bounces
//   Xi    - erand48() state; advanced by every random draw, so the order of
//           the draws determines the result
// Returns black (Vec()) when the ray hits nothing.
Vec radiance(const Ray &r, int depth, unsigned short *Xi){
  double t;                               // distance to intersection
  int id=0;                               // id of intersected object
  if (!intersect(r, t, id)) return Vec(); // if miss, return black
  const Sphere &obj = spheres[id];        // the hit object
  Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
  double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl

  // Russian roulette: survive with probability p (re-weighting f),
  // otherwise stop and return only the object's emission.
  if (++depth>5) {
    if (erand48(Xi)<p) f=f*(1/p);
    else return obj.e;
  }

  if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
    double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
    Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
    Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
    return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
  } else if (obj.refl == SPEC) {          // Ideal SPECULAR reflection
    return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
  }

  Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
  bool into = n.dot(nl)>0;                // Ray from outside going in?
  double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
  if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
    return obj.e + f.mult(radiance(reflRay,depth,Xi));
  Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
  double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
  double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);

  if (depth>2) {                          // Russian roulette
    if (erand48(Xi)<P)
      return obj.e + f.mult(radiance(reflRay,depth,Xi)*RP);
    return obj.e + f.mult(radiance(Ray(x,tdir),depth,Xi)*TP);
  }

  // Both branches are traced. The two calls share the RNG state Xi, so they
  // are sequenced explicitly (reflection first, then transmission); as
  // operands of a single '+' their evaluation order would be unspecified
  // and the result could differ between compilers.
  Vec reflected = radiance(reflRay,depth,Xi)*Re;
  Vec transmitted = radiance(Ray(x,tdir),depth,Xi)*Tr;
  return obj.e + f.mult(reflected+transmitted);
}
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 2; // # samples
|
||||||
|
double tbeg = get_time();
|
||||||
|
Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
|
||||||
|
Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
|
||||||
|
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
|
||||||
|
for (int y=0; y<h; y++){ // Loop over image rows
|
||||||
|
fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
|
||||||
|
for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++) // Loop cols
|
||||||
|
for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) // 2x2 subpixel rows
|
||||||
|
for (int sx=0; sx<2; sx++, r=Vec()){ // 2x2 subpixel cols
|
||||||
|
for (int s=0; s<samps; s++){
|
||||||
|
double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
|
||||||
|
double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
|
||||||
|
Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
|
||||||
|
cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
|
||||||
|
r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
|
||||||
|
} // Camera rays are pushed ^^^^^ forward to start in interior
|
||||||
|
c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double tend = get_time();
|
||||||
|
fprintf(stderr, "\nElapsed: %5.1f ms\n", tend - tbeg);
|
||||||
|
fprintf(stdout, "P3\n%d %d\n%d\n", w, h, 255);
|
||||||
|
for (int i=0; i<w*h; i++)
|
||||||
|
fprintf(stdout,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
|
||||||
|
}
|
@ -0,0 +1,585 @@
|
|||||||
|
/*-----------------------------------------------------------------------*/
|
||||||
|
/* Program: STREAM */
|
||||||
|
/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
|
||||||
|
/* Original code developed by John D. McCalpin */
|
||||||
|
/* Programmers: John D. McCalpin */
|
||||||
|
/* Joe R. Zagar */
|
||||||
|
/* */
|
||||||
|
/* This program measures memory transfer rates in MB/s for simple */
|
||||||
|
/* computational kernels coded in C. */
|
||||||
|
/*-----------------------------------------------------------------------*/
|
||||||
|
/* Copyright 1991-2013: John D. McCalpin */
|
||||||
|
/*-----------------------------------------------------------------------*/
|
||||||
|
/* License: */
|
||||||
|
/* 1. You are free to use this program and/or to redistribute */
|
||||||
|
/* this program. */
|
||||||
|
/* 2. You are free to modify this program for your own use, */
|
||||||
|
/* including commercial use, subject to the publication */
|
||||||
|
/* restrictions in item 3. */
|
||||||
|
/* 3. You are free to publish results obtained from running this */
|
||||||
|
/* program, or from works that you derive from this program, */
|
||||||
|
/* with the following limitations: */
|
||||||
|
/* 3a. In order to be referred to as "STREAM benchmark results", */
|
||||||
|
/* published results must be in conformance to the STREAM */
|
||||||
|
/* Run Rules, (briefly reviewed below) published at */
|
||||||
|
/* http://www.cs.virginia.edu/stream/ref.html */
|
||||||
|
/* and incorporated herein by reference. */
|
||||||
|
/* As the copyright holder, John McCalpin retains the */
|
||||||
|
/* right to determine conformity with the Run Rules. */
|
||||||
|
/* 3b. Results based on modified source code or on runs not in */
|
||||||
|
/* accordance with the STREAM Run Rules must be clearly */
|
||||||
|
/* labelled whenever they are published. Examples of */
|
||||||
|
/* proper labelling include: */
|
||||||
|
/* "tuned STREAM benchmark results" */
|
||||||
|
/* "based on a variant of the STREAM benchmark code" */
|
||||||
|
/* Other comparable, clear, and reasonable labelling is */
|
||||||
|
/* acceptable. */
|
||||||
|
/* 3c. Submission of results to the STREAM benchmark web site */
|
||||||
|
/* is encouraged, but not required. */
|
||||||
|
/* 4. Use of this program or creation of derived works based on this */
|
||||||
|
/* program constitutes acceptance of these licensing restrictions. */
|
||||||
|
/* 5. Absolutely no warranty is expressed or implied. */
|
||||||
|
/*-----------------------------------------------------------------------*/
|
||||||
|
# include <stdio.h>
|
||||||
|
# include <unistd.h>
|
||||||
|
# include <math.h>
|
||||||
|
# include <float.h>
|
||||||
|
# include <limits.h>
|
||||||
|
# include <sys/time.h>
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------
|
||||||
|
* INSTRUCTIONS:
|
||||||
|
*
|
||||||
|
* 1) STREAM requires different amounts of memory to run on different
|
||||||
|
* systems, depending on both the system cache size(s) and the
|
||||||
|
* granularity of the system timer.
|
||||||
|
* You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
|
||||||
|
* to meet *both* of the following criteria:
|
||||||
|
* (a) Each array must be at least 4 times the size of the
|
||||||
|
* available cache memory. I don't worry about the difference
|
||||||
|
* between 10^6 and 2^20, so in practice the minimum array size
|
||||||
|
* is about 3.8 times the cache size.
|
||||||
|
* Example 1: One Xeon E3 with 8 MB L3 cache
|
||||||
|
* STREAM_ARRAY_SIZE should be >= 4 million, giving
|
||||||
|
* an array size of 30.5 MB and a total memory requirement
|
||||||
|
* of 91.5 MB.
|
||||||
|
* Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
|
||||||
|
* STREAM_ARRAY_SIZE should be >= 20 million, giving
|
||||||
|
* an array size of 153 MB and a total memory requirement
|
||||||
|
* of 458 MB.
|
||||||
|
* (b) The size should be large enough so that the 'timing calibration'
|
||||||
|
* output by the program is at least 20 clock-ticks.
|
||||||
|
* Example: most versions of Windows have a 10 millisecond timer
|
||||||
|
* granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds.
|
||||||
|
* If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
|
||||||
|
* This means that each array must be at least 1 GB, or 128M elements.
|
||||||
|
*
|
||||||
|
* Version 5.10 increases the default array size from 2 million
|
||||||
|
* elements to 10 million elements in response to the increasing
|
||||||
|
* size of L3 caches. The new default size is large enough for caches
|
||||||
|
* up to 20 MB.
|
||||||
|
* Version 5.10 changes the loop index variables from "register int"
|
||||||
|
* to "ssize_t", which allows array indices >2^32 (4 billion)
|
||||||
|
* on properly configured 64-bit systems. Additional compiler options
|
||||||
|
* (such as "-mcmodel=medium") may be required for large memory runs.
|
||||||
|
*
|
||||||
|
* Array size can be set at compile time without modifying the source
|
||||||
|
* code for the (many) compilers that support preprocessor definitions
|
||||||
|
* on the compile line. E.g.,
|
||||||
|
* gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
|
||||||
|
* will override the default size of 10M with a new size of 100M elements
|
||||||
|
* per array.
|
||||||
|
*/
|
||||||
|
#ifndef STREAM_ARRAY_SIZE
|
||||||
|
# define STREAM_ARRAY_SIZE 10000000
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result
|
||||||
|
* for any iteration after the first, therefore the minimum value
|
||||||
|
* for NTIMES is 2.
|
||||||
|
* There are no rules on maximum allowable values for NTIMES, but
|
||||||
|
* values larger than the default are unlikely to noticeably
|
||||||
|
* increase the reported performance.
|
||||||
|
* NTIMES can also be set on the compile line without changing the source
|
||||||
|
* code using, for example, "-DNTIMES=7".
|
||||||
|
*/
|
||||||
|
/* A user-supplied NTIMES of 1 or less is invalid (the first iteration is
 * always discarded), so it is dropped and the default is used instead.
 * The #undef is required: redefining a macro with a different value
 * without it is diagnosed by the compiler.
 */
#if defined(NTIMES) && NTIMES<=1
#   undef NTIMES
#endif
#ifndef NTIMES
#   define NTIMES 10
#endif
|
||||||
|
|
||||||
|
/* Users are allowed to modify the "OFFSET" variable, which *may* change the
|
||||||
|
* relative alignment of the arrays (though compilers may change the
|
||||||
|
* effective offset by making the arrays non-contiguous on some systems).
|
||||||
|
* Use of non-zero values for OFFSET can be especially helpful if the
|
||||||
|
* STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
|
||||||
|
* OFFSET can also be set on the compile line without changing the source
|
||||||
|
* code using, for example, "-DOFFSET=56".
|
||||||
|
*/
|
||||||
|
#ifndef OFFSET
|
||||||
|
# define OFFSET 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 3) Compile the code with optimization. Many compilers generate
|
||||||
|
* unreasonably bad code before the optimizer tightens things up.
|
||||||
|
* If the results are unreasonably good, on the other hand, the
|
||||||
|
* optimizer might be too smart for me!
|
||||||
|
*
|
||||||
|
* For a simple single-core version, try compiling with:
|
||||||
|
* cc -O stream.c -o stream
|
||||||
|
* This is known to work on many, many systems....
|
||||||
|
*
|
||||||
|
* To use multiple cores, you need to tell the compiler to obey the OpenMP
|
||||||
|
* directives in the code. This varies by compiler, but a common example is
|
||||||
|
* gcc -O -fopenmp stream.c -o stream_omp
|
||||||
|
* The environment variable OMP_NUM_THREADS allows runtime control of the
|
||||||
|
* number of threads/cores used when the resulting "stream_omp" program
|
||||||
|
* is executed.
|
||||||
|
*
|
||||||
|
* To run with single-precision variables and arithmetic, simply add
|
||||||
|
* -DSTREAM_TYPE=float
|
||||||
|
* to the compile line.
|
||||||
|
* Note that this changes the minimum array sizes required --- see (1) above.
|
||||||
|
*
|
||||||
|
* The preprocessor directive "TUNED" does not do much -- it simply causes the
|
||||||
|
* code to call separate functions to execute each kernel. Trivial versions
|
||||||
|
* of these functions are provided, but they are *not* tuned -- they just
|
||||||
|
* provide predefined interfaces to be replaced with tuned code.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* 4) Optional: Mail the results to mccalpin@cs.virginia.edu
|
||||||
|
* Be sure to include info that will help me understand:
|
||||||
|
* a) the computer hardware configuration (e.g., processor model, memory type)
|
||||||
|
* b) the compiler name/version and compilation flags
|
||||||
|
* c) any run-time information (such as OMP_NUM_THREADS)
|
||||||
|
* d) all of the output from the test case.
|
||||||
|
*
|
||||||
|
* Thanks!
|
||||||
|
*
|
||||||
|
*-----------------------------------------------------------------------*/
|
||||||
|
|
||||||
|
# define HLINE "-------------------------------------------------------------\n"
|
||||||
|
|
||||||
|
# ifndef MIN
|
||||||
|
# define MIN(x,y) ((x)<(y)?(x):(y))
|
||||||
|
# endif
|
||||||
|
# ifndef MAX
|
||||||
|
# define MAX(x,y) ((x)>(y)?(x):(y))
|
||||||
|
# endif
|
||||||
|
|
||||||
|
#ifndef STREAM_TYPE
|
||||||
|
#define STREAM_TYPE double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The three benchmark arrays, each STREAM_ARRAY_SIZE+OFFSET elements long. */
static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET];
static STREAM_TYPE b[STREAM_ARRAY_SIZE+OFFSET];
static STREAM_TYPE c[STREAM_ARRAY_SIZE+OFFSET];
|
||||||
|
|
||||||
|
/* Per-kernel timing statistics (index 0..3, same order as label[]).
 * mintime starts at a very large value so the first sample replaces it. */
static double avgtime[4] = {0};
static double maxtime[4] = {0};
static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
|
||||||
|
|
||||||
|
/* Row labels for the results table, one per kernel.
 * The strings are literals and must never be written through, hence const. */
static const char *label[4] = {"Copy: ", "Scale: ",
    "Add: ", "Triad: "};
|
||||||
|
|
||||||
|
static double bytes[4] = {
|
||||||
|
2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||||
|
2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||||
|
3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||||
|
3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Forward declarations. Functions that take no arguments are declared with
 * (void) so they are real prototypes and calls are argument-checked; an
 * empty parameter list "()" leaves the parameters unspecified in C. */
extern double mysecond(void);
extern void checkSTREAMresults(void);
#ifdef TUNED
extern void tuned_STREAM_Copy(void);
extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
extern void tuned_STREAM_Add(void);
extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
#endif
#ifdef _OPENMP
extern int omp_get_num_threads(void);
#endif
|
||||||
|
int
|
||||||
|
main()
|
||||||
|
{
|
||||||
|
int quantum, checktick();
|
||||||
|
int BytesPerWord;
|
||||||
|
int k;
|
||||||
|
ssize_t j;
|
||||||
|
STREAM_TYPE scalar;
|
||||||
|
double t, times[4][NTIMES];
|
||||||
|
|
||||||
|
/* --- SETUP --- determine precision and check timing --- */
|
||||||
|
|
||||||
|
printf(HLINE);
|
||||||
|
printf("STREAM version $Revision: 5.10 $\n");
|
||||||
|
printf(HLINE);
|
||||||
|
BytesPerWord = sizeof(STREAM_TYPE);
|
||||||
|
printf("This system uses %d bytes per array element.\n",
|
||||||
|
BytesPerWord);
|
||||||
|
|
||||||
|
printf(HLINE);
|
||||||
|
#ifdef N
|
||||||
|
printf("***** WARNING: ******\n");
|
||||||
|
printf(" It appears that you set the preprocessor variable N when compiling this code.\n");
|
||||||
|
printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
|
||||||
|
printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
|
||||||
|
printf("***** WARNING: ******\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
|
||||||
|
printf("Memory per array = %.1f MiB (= %.1f GiB).\n",
|
||||||
|
BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
|
||||||
|
BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
|
||||||
|
printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
|
||||||
|
(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
|
||||||
|
(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
|
||||||
|
printf("Each kernel will be executed %d times.\n", NTIMES);
|
||||||
|
printf(" The *best* time for each kernel (excluding the first iteration)\n");
|
||||||
|
printf(" will be used to compute the reported bandwidth.\n");
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
|
printf(HLINE);
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
#pragma omp master
|
||||||
|
{
|
||||||
|
k = omp_get_num_threads();
|
||||||
|
printf ("Number of Threads requested = %i\n",k);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
|
k = 0;
|
||||||
|
#pragma omp parallel
|
||||||
|
#pragma omp atomic
|
||||||
|
k++;
|
||||||
|
printf ("Number of Threads counted = %i\n",k);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Get initial value for system clock. */
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||||
|
a[j] = 1.0;
|
||||||
|
b[j] = 2.0;
|
||||||
|
c[j] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf(HLINE);
|
||||||
|
|
||||||
|
if ( (quantum = checktick()) >= 1)
|
||||||
|
printf("Your clock granularity/precision appears to be "
|
||||||
|
"%d microseconds.\n", quantum);
|
||||||
|
else {
|
||||||
|
printf("Your clock granularity appears to be "
|
||||||
|
"less than one microsecond.\n");
|
||||||
|
quantum = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
t = mysecond();
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j = 0; j < STREAM_ARRAY_SIZE; j++)
|
||||||
|
a[j] = 2.0E0 * a[j];
|
||||||
|
t = 1.0E6 * (mysecond() - t);
|
||||||
|
|
||||||
|
printf("Each test below will take on the order"
|
||||||
|
" of %d microseconds.\n", (int) t );
|
||||||
|
printf(" (= %d clock ticks)\n", (int) (t/quantum) );
|
||||||
|
printf("Increase the size of the arrays if this shows that\n");
|
||||||
|
printf("you are not getting at least 20 clock ticks per test.\n");
|
||||||
|
|
||||||
|
printf(HLINE);
|
||||||
|
|
||||||
|
printf("WARNING -- The above is only a rough guideline.\n");
|
||||||
|
printf("For best results, please be sure you know the\n");
|
||||||
|
printf("precision of your system timer.\n");
|
||||||
|
printf(HLINE);
|
||||||
|
|
||||||
|
/* --- MAIN LOOP --- repeat test cases NTIMES times --- */
|
||||||
|
|
||||||
|
scalar = 3.0;
|
||||||
|
for (k=0; k<NTIMES; k++)
|
||||||
|
{
|
||||||
|
times[0][k] = mysecond();
|
||||||
|
#ifdef TUNED
|
||||||
|
tuned_STREAM_Copy();
|
||||||
|
#else
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
c[j] = a[j];
|
||||||
|
#endif
|
||||||
|
times[0][k] = mysecond() - times[0][k];
|
||||||
|
|
||||||
|
times[1][k] = mysecond();
|
||||||
|
#ifdef TUNED
|
||||||
|
tuned_STREAM_Scale(scalar);
|
||||||
|
#else
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
b[j] = scalar*c[j];
|
||||||
|
#endif
|
||||||
|
times[1][k] = mysecond() - times[1][k];
|
||||||
|
|
||||||
|
times[2][k] = mysecond();
|
||||||
|
#ifdef TUNED
|
||||||
|
tuned_STREAM_Add();
|
||||||
|
#else
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
c[j] = a[j]+b[j];
|
||||||
|
#endif
|
||||||
|
times[2][k] = mysecond() - times[2][k];
|
||||||
|
|
||||||
|
times[3][k] = mysecond();
|
||||||
|
#ifdef TUNED
|
||||||
|
tuned_STREAM_Triad(scalar);
|
||||||
|
#else
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
a[j] = b[j]+scalar*c[j];
|
||||||
|
#endif
|
||||||
|
times[3][k] = mysecond() - times[3][k];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- SUMMARY --- */
|
||||||
|
|
||||||
|
for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
|
||||||
|
{
|
||||||
|
for (j=0; j<4; j++)
|
||||||
|
{
|
||||||
|
avgtime[j] = avgtime[j] + times[j][k];
|
||||||
|
mintime[j] = MIN(mintime[j], times[j][k]);
|
||||||
|
maxtime[j] = MAX(maxtime[j], times[j][k]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Function Best Rate MB/s Avg time Min time Max time\n");
|
||||||
|
for (j=0; j<4; j++) {
|
||||||
|
avgtime[j] = avgtime[j]/(double)(NTIMES-1);
|
||||||
|
|
||||||
|
printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j],
|
||||||
|
1.0E-06 * bytes[j]/mintime[j],
|
||||||
|
avgtime[j],
|
||||||
|
mintime[j],
|
||||||
|
maxtime[j]);
|
||||||
|
}
|
||||||
|
printf(HLINE);
|
||||||
|
|
||||||
|
/* --- Check Results --- */
|
||||||
|
checkSTREAMresults();
|
||||||
|
printf(HLINE);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
# define M 20
|
||||||
|
|
||||||
|
int
|
||||||
|
checktick()
|
||||||
|
{
|
||||||
|
int i, minDelta, Delta;
|
||||||
|
double t1, t2, timesfound[M];
|
||||||
|
|
||||||
|
/* Collect a sequence of M unique time values from the system. */
|
||||||
|
|
||||||
|
for (i = 0; i < M; i++) {
|
||||||
|
t1 = mysecond();
|
||||||
|
while( ((t2=mysecond()) - t1) < 1.0E-6 )
|
||||||
|
;
|
||||||
|
timesfound[i] = t1 = t2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine the minimum difference between these M values.
|
||||||
|
* This result will be our estimate (in microseconds) for the
|
||||||
|
* clock granularity.
|
||||||
|
*/
|
||||||
|
|
||||||
|
minDelta = 1000000;
|
||||||
|
for (i = 1; i < M; i++) {
|
||||||
|
Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
|
||||||
|
minDelta = MIN(minDelta, MAX(Delta,0));
|
||||||
|
}
|
||||||
|
|
||||||
|
return(minDelta);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * A gettimeofday routine to give access to the wall
 * clock timer on most UNIX-like systems.
 *
 * Returns the current wall-clock time in seconds (with the microsecond
 * part as the fraction), or 0.0 if gettimeofday() reports failure.
 */

#include <stddef.h>
#include <sys/time.h>

double mysecond()
{
    struct timeval tp = {0};

    /* The timezone argument is obsolete; POSIX leaves a non-NULL value
       unspecified, so always pass NULL. */
    if (gettimeofday(&tp, NULL) != 0)
        return 0.0;

    return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
|
||||||
|
|
||||||
|
#ifndef abs
|
||||||
|
/* Absolute value usable with any arithmetic type; the argument is
   evaluated twice, so it must not have side effects. */
#define abs(a) ((a) >= 0 ? (a) : -(a))
|
||||||
|
#endif
|
||||||
|
void checkSTREAMresults ()
|
||||||
|
{
|
||||||
|
STREAM_TYPE aj,bj,cj,scalar;
|
||||||
|
STREAM_TYPE aSumErr,bSumErr,cSumErr;
|
||||||
|
STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
|
||||||
|
double epsilon;
|
||||||
|
ssize_t j;
|
||||||
|
int k,ierr,err;
|
||||||
|
|
||||||
|
/* reproduce initialization */
|
||||||
|
aj = 1.0;
|
||||||
|
bj = 2.0;
|
||||||
|
cj = 0.0;
|
||||||
|
/* a[] is modified during timing check */
|
||||||
|
aj = 2.0E0 * aj;
|
||||||
|
/* now execute timing loop */
|
||||||
|
scalar = 3.0;
|
||||||
|
for (k=0; k<NTIMES; k++)
|
||||||
|
{
|
||||||
|
cj = aj;
|
||||||
|
bj = scalar*cj;
|
||||||
|
cj = aj+bj;
|
||||||
|
aj = bj+scalar*cj;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* accumulate deltas between observed and expected results */
|
||||||
|
aSumErr = 0.0;
|
||||||
|
bSumErr = 0.0;
|
||||||
|
cSumErr = 0.0;
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||||
|
aSumErr += abs(a[j] - aj);
|
||||||
|
bSumErr += abs(b[j] - bj);
|
||||||
|
cSumErr += abs(c[j] - cj);
|
||||||
|
// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN
|
||||||
|
}
|
||||||
|
aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||||
|
bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||||
|
cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||||
|
|
||||||
|
if (sizeof(STREAM_TYPE) == 4) {
|
||||||
|
epsilon = 1.e-6;
|
||||||
|
}
|
||||||
|
else if (sizeof(STREAM_TYPE) == 8) {
|
||||||
|
epsilon = 1.e-13;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
|
||||||
|
epsilon = 1.e-6;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = 0;
|
||||||
|
if (abs(aAvgErr/aj) > epsilon) {
|
||||||
|
err++;
|
||||||
|
printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||||
|
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
|
||||||
|
ierr = 0;
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||||
|
if (abs(a[j]/aj-1.0) > epsilon) {
|
||||||
|
ierr++;
|
||||||
|
#ifdef VERBOSE
|
||||||
|
if (ierr < 10) {
|
||||||
|
printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||||
|
j,aj,a[j],abs((aj-a[j])/aAvgErr));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(" For array a[], %d errors were found.\n",ierr);
|
||||||
|
}
|
||||||
|
if (abs(bAvgErr/bj) > epsilon) {
|
||||||
|
err++;
|
||||||
|
printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||||
|
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
|
||||||
|
printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
|
||||||
|
ierr = 0;
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||||
|
if (abs(b[j]/bj-1.0) > epsilon) {
|
||||||
|
ierr++;
|
||||||
|
#ifdef VERBOSE
|
||||||
|
if (ierr < 10) {
|
||||||
|
printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||||
|
j,bj,b[j],abs((bj-b[j])/bAvgErr));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(" For array b[], %d errors were found.\n",ierr);
|
||||||
|
}
|
||||||
|
if (abs(cAvgErr/cj) > epsilon) {
|
||||||
|
err++;
|
||||||
|
printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||||
|
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
|
||||||
|
printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
|
||||||
|
ierr = 0;
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||||
|
if (abs(c[j]/cj-1.0) > epsilon) {
|
||||||
|
ierr++;
|
||||||
|
#ifdef VERBOSE
|
||||||
|
if (ierr < 10) {
|
||||||
|
printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||||
|
j,cj,c[j],abs((cj-c[j])/cAvgErr));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(" For array c[], %d errors were found.\n",ierr);
|
||||||
|
}
|
||||||
|
if (err == 0) {
|
||||||
|
printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
|
||||||
|
}
|
||||||
|
#ifdef VERBOSE
|
||||||
|
printf ("Results Validation Verbose Results: \n");
|
||||||
|
printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
|
||||||
|
printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
|
||||||
|
printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef TUNED
|
||||||
|
/* stubs for "tuned" versions of the kernels */
|
||||||
|
void tuned_STREAM_Copy()
|
||||||
|
{
|
||||||
|
ssize_t j;
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
c[j] = a[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tuned Scale kernel stub: stores scalar times each element of c[] into b[]. */
void tuned_STREAM_Scale(STREAM_TYPE scalar)
{
    ssize_t idx;
#pragma omp parallel for
    for (idx=0; idx<STREAM_ARRAY_SIZE; idx++) {
        b[idx] = scalar*c[idx];
    }
}
|
||||||
|
|
||||||
|
void tuned_STREAM_Add()
|
||||||
|
{
|
||||||
|
ssize_t j;
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||||
|
c[j] = a[j]+b[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tuned Triad kernel stub: stores b[] plus scalar times c[] into a[], element-wise. */
void tuned_STREAM_Triad(STREAM_TYPE scalar)
{
    ssize_t idx;
#pragma omp parallel for
    for (idx=0; idx<STREAM_ARRAY_SIZE; idx++) {
        a[idx] = b[idx]+scalar*c[idx];
    }
}
|
||||||
|
/* end of stubs for the "tuned" versions of the kernels */
|
||||||
|
#endif
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue