diff --git a/test/benchmark/c-ray/README.md b/test/benchmark/c-ray/README.md
index a92e402..2ab6598 100644
--- a/test/benchmark/c-ray/README.md
+++ b/test/benchmark/c-ray/README.md
@@ -42,7 +42,7 @@ cat scene | wasmer run c-ray.wasm -- -s 1024x768 > foo.ppm
 cat scene | wasmer run --backend singlepass c-ray.wasm -- -s 1024x768 > foo.ppm
 cat scene | wasmer run --backend llvm       c-ray.wasm -- -s 1024x768 > foo.ppm
 
-# Wasmer-JS (V8) https://www.npmjs.com/package/@wasmer/cli
+# Wasmer-JS (V8)
 cat scene | wasmer-js run c-ray.wasm -s 1024x768 > foo.ppm
 
 cat scene | node --wasm_interpret_all $(which wasmer-js) run c-ray.wasm -s 1024x768 > foo.ppm
diff --git a/test/benchmark/c-ray/scene.jpg b/test/benchmark/c-ray/scene.jpg
new file mode 100644
index 0000000..1b93960
Binary files /dev/null and b/test/benchmark/c-ray/scene.jpg differ
diff --git a/test/benchmark/c-ray/sphfract.jpg b/test/benchmark/c-ray/sphfract.jpg
new file mode 100644
index 0000000..859f327
Binary files /dev/null and b/test/benchmark/c-ray/sphfract.jpg differ
diff --git a/test/benchmark/coremark/README.md b/test/benchmark/coremark/README.md
index 2a75378..3a6096f 100644
--- a/test/benchmark/coremark/README.md
+++ b/test/benchmark/coremark/README.md
@@ -1,5 +1,7 @@
 # CoreMark 1.0
 
+Upstream: https://github.com/eembc/coremark
+
 ### Results
 
 ```log
@@ -50,6 +52,9 @@ $ENGINES_PATH/wac/wax coremark-wasi.wasm
 # wasm-micro-runtime
 $ENGINES_PATH/wasm-micro-runtime/core/iwasm/products/linux/build/iwasm coremark-wasi.wasm
 
+# wasmtime
+wasmtime --optimize coremark-wasi.wasm
+
 # Wasmer
 wasmer run coremark-wasi.wasm
 
@@ -58,7 +63,6 @@ wapm upload
 coremark-wasi
 
 # Wasmer-JS (V8)
-# https://www.npmjs.com/package/@wasmer/cli
 wasmer-js run coremark-wasi.wasm
 
 
diff --git a/test/benchmark/mandelbrot/README.md b/test/benchmark/mandelbrot/README.md
index e36e7b6..236f088 100644
--- a/test/benchmark/mandelbrot/README.md
+++ b/test/benchmark/mandelbrot/README.md
@@ -1,5 +1,7 @@
 # mandelbrot
 
+Based on https://github.com/josch/mandelbrot
+
 ### Results
 
 ```log
diff --git a/test/benchmark/mandelbrot/image.png b/test/benchmark/mandelbrot/image.png
new file mode 100644
index 0000000..4fb0c4c
Binary files /dev/null and b/test/benchmark/mandelbrot/image.png differ
diff --git a/test/benchmark/mandelbrot/mandel_dd.c b/test/benchmark/mandelbrot/mandel_dd.c
index 10872c7..c12f87d 100644
--- a/test/benchmark/mandelbrot/mandel_dd.c
+++ b/test/benchmark/mandelbrot/mandel_dd.c
@@ -38,6 +38,7 @@ int main(int argc, char **argv) {
     double bailout = 128; // with a smaller value there are lines on magn=1
     double logLogBailout = log(log(bailout));
     int foundperiods = 0;
+    long maxiter = 50000;
     /*// maxiter = width * sqrt(magn);
     temp1 = dd_sqrt(magn);
     unsigned long maxiter = width * dd_get_ui(temp1);*/
@@ -83,8 +84,7 @@ int main(int argc, char **argv) {
             int whenupdate = 10;
             hx = 0;
             hy = 0;
-            //for (i = 1; i <= maxiter; i++) {
-            for (i = 1; i <= 50000; i++) {
+            for (i = 1; i <= maxiter; i++) {
                 //xx = zx * zx;
                 xx = dd_sqr(zx);
                 //yy = zy * zy;
diff --git a/test/benchmark/smallpt/image.jpg b/test/benchmark/smallpt/image.jpg
new file mode 100644
index 0000000..2461c7d
Binary files /dev/null and b/test/benchmark/smallpt/image.jpg differ
diff --git a/test/benchmark/smallpt/smallpt-ex.cpp b/test/benchmark/smallpt/smallpt-ex.cpp
new file mode 100644
index 0000000..6051fdf
--- /dev/null
+++ b/test/benchmark/smallpt/smallpt-ex.cpp
@@ -0,0 +1,128 @@
+#include <math.h>   // smallpt, a Path Tracer by Kevin Beason, 2009
+#include <stdlib.h> // Make : g++ -O3 -fopenmp explicit.cpp -o explicit
+#include <stdio.h>  //        Remove "-fopenmp" for g++ version < 4.2
+#include <time.h>
+double get_time() {  // milliseconds from a monotonic clock, for interval timing
+  struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); // unaffected by clock resets
+  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+struct Vec {        // Usage: time ./explicit 16 && xv image.ppm
+  double x, y, z;                  // position, also color (r,g,b)
+  Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
+  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
+  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
+  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); } // scale by b
+  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); } // per-component
+  Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); } // in place
+  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
+  Vec operator%(const Vec&b) const {return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);} // const: accepts const/temporary operands
+};
+struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} }; // o: origin, d: direction
+enum Refl_t { DIFF, SPEC, REFR };  // material types (diffuse, specular, refractive), used in radiance()
+struct Sphere {
+  double rad;       // sphere radius
+  Vec p, e, c;      // centre position, emission, surface colour
+  Refl_t refl;      // material kind (DIFFuse, SPECular, REFRactive)
+  Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) : rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
+  double intersect(const Ray &r) const { // hit distance along r, 0 on miss
+    const Vec op = p-r.o;  // solves t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
+    const double eps=1e-4, b=op.dot(r.d), disc=b*b-op.dot(op)+rad*rad;
+    if (disc<0) return 0;                  // no real root: the ray misses
+    const double root=sqrt(disc), t_near=b-root, t_far=b+root;
+    return t_near>eps ? t_near : (t_far>eps ? t_far : 0); // nearest root beyond eps
+  }
+};
+Sphere spheres[] = {//Scene: radius, position, emission, color, material
+  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
+  Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Right
+  Sphere(1e5, Vec(50,40.8, 1e5),     Vec(),Vec(.75,.75,.75),DIFF),//Back
+  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(),           DIFF),//Front
+  Sphere(1e5, Vec(50, 1e5, 81.6),    Vec(),Vec(.75,.75,.75),DIFF),//Bottom
+  Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
+  Sphere(16.5,Vec(27,16.5,47),       Vec(),Vec(1,1,1)*.6,   SPEC),//Mirror
+  Sphere(16.5,Vec(73,16.5,78),       Vec(),Vec(.75,1.,.95), REFR),//Glass
+  Sphere(4.0, Vec(50,81.6-16.5,81.6),Vec(4,4,4)*12,  Vec(), DIFF),//Light: the only entry with non-zero emission
+};
+int numSpheres = sizeof(spheres)/sizeof(Sphere); // number of entries in spheres[]
+inline double clamp(double x){ if (x<0) return 0; if (x>1) return 1; return x; } // limit to [0,1]
+inline int toInt(double x){ double g=pow(clamp(x),1/2.2); return int(g*255+.5); } // 1/2.2 power, then 0..255
+inline bool intersect(const Ray &r, double &t, int &id){ // nearest hit: sets t and id, false on miss
+  const double inf=1e20; t=inf; const int count=sizeof(spheres)/sizeof(Sphere);
+  for(int i=count-1;i>=0;i--){double d=spheres[i].intersect(r);if(d!=0&&d<t){t=d;id=i;}}
+  return t<inf;
+}
+Vec radiance(const Ray &r, int depth, unsigned short *Xi,int E=1){ // E scales obj.e: 1 keeps it, 0 drops it
+  double t;                               // distance to intersection
+  int id=0;                               // id of intersected object
+  if (!intersect(r, t, id)) return Vec(); // if miss, return black
+  const Sphere &obj = spheres[id];        // the hit object
+  Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c; // hit point, normal, normal facing the ray, colour
+  double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
+  if (++depth>5||!p) if (erand48(Xi)<p) f=f*(1/p); else return obj.e*E; // R.R.; the else pairs with the inner if
+  if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
+    double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
+    Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u; // w,u,v: axes built from nl via cross products
+    Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm(); // random direction expressed in that frame
+
+    // Loop over any lights
+    Vec e;                                // accumulated direct-light contribution
+    for (int i=0; i<numSpheres; i++){
+      const Sphere &s = spheres[i];
+      if (s.e.x<=0 && s.e.y<=0 && s.e.z<=0) continue; // skip non-lights
+      
+      Vec sw=s.p-x, su=((fabs(sw.x)>.1?Vec(0,1):Vec(1))%sw).norm(), sv=sw%su; // frame around the direction to the light centre
+      double cos_a_max = sqrt(1-s.rad*s.rad/(x-s.p).dot(x-s.p)); // from radius^2 / squared distance to the centre
+      double eps1 = erand48(Xi), eps2 = erand48(Xi);
+      double cos_a = 1-eps1+eps1*cos_a_max;  // lies between cos_a_max and 1
+      double sin_a = sqrt(1-cos_a*cos_a);
+      double phi = 2*M_PI*eps2;
+      Vec l = su*cos(phi)*sin_a + sv*sin(phi)*sin_a + sw*cos_a;
+      l.norm();                              // normalizes l in place
+      if (intersect(Ray(x,l), t, id) && id==i){  // shadow ray
+        double omega = 2*M_PI*(1-cos_a_max);
+        e = e + f.mult(s.e*l.dot(nl)*omega)*M_1_PI;  // 1/pi for brdf
+      }
+    }
+    
+    return obj.e*E+e+f.mult(radiance(Ray(x,d),depth,Xi,0)); // nested call passes E=0
+  } else if (obj.refl == SPEC)            // Ideal SPECULAR reflection
+    return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
+  Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
+  bool into = n.dot(nl)>0;                // Ray from outside going in?
+  double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
+  if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
+    return obj.e + f.mult(radiance(reflRay,depth,Xi));
+  Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm(); // direction of the transmitted ray
+  double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
+  double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P); // Re/Tr: reflected/transmitted weights, P: reflection pick probability
+  return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ?   // Russian roulette
+    radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
+    radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr); // depth<=2: trace both rays
+}
+int main(int argc, char *argv[]){ // renders the scene; PPM (P3) on stdout, progress and timing on stderr
+  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 2; if (samps<1) samps=1; // # samples; an argument below 4 used to give 0 and a black image
+  double tbeg = get_time();
+  Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
+  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h]; // c: w*h pixel buffer, never freed (process exits)
+#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP
+  for (int y=0; y<h; y++){                       // Loop over image rows
+    fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
+    for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++)   // Loop cols; Xi is the per-row erand48 state
+      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++)     // 2x2 subpixel rows
+        for (int sx=0; sx<2; sx++, r=Vec()){        // 2x2 subpixel cols
+          for (int s=0; s<samps; s++){
+            double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
+            double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
+            Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
+                    cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
+            r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
+          } // Camera rays are pushed ^^^^^ forward to start in interior
+          c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25; // each subpixel contributes a quarter
+        }
+  }
+  double tend = get_time();
+  fprintf(stderr, "\nElapsed: %5.1f ms\n", tend - tbeg);
+  fprintf(stdout, "P3\n%d %d\n%d\n", w, h, 255);
+  for (int i=0; i<w*h; i++)
+    fprintf(stdout,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
+}
diff --git a/test/benchmark/smallpt/smallpt.cpp b/test/benchmark/smallpt/smallpt.cpp
new file mode 100644
index 0000000..ef71440
--- /dev/null
+++ b/test/benchmark/smallpt/smallpt.cpp
@@ -0,0 +1,106 @@
+#include <math.h>   // smallpt, a Path Tracer by Kevin Beason, 2008
+#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
+#include <stdio.h>  //        Remove "-fopenmp" for g++ version < 4.2
+#include <time.h>
+double get_time() {  // milliseconds from a monotonic clock, for interval timing
+  struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); // unaffected by clock resets
+  return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+struct Vec {        // Usage: time ./smallpt 5000 && xv image.ppm
+  double x, y, z;                  // position, also color (r,g,b)
+  Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
+  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
+  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
+  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); } // scale by b
+  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); } // per-component
+  Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); } // in place
+  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
+  Vec operator%(const Vec&b) const {return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);} // const: accepts const/temporary operands
+};
+struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} }; // o: origin, d: direction
+enum Refl_t { DIFF, SPEC, REFR };  // material types (diffuse, specular, refractive), used in radiance()
+struct Sphere {
+  double rad;       // sphere radius
+  Vec p, e, c;      // centre position, emission, surface colour
+  Refl_t refl;      // material kind (DIFFuse, SPECular, REFRactive)
+  Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) : rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
+  double intersect(const Ray &r) const { // hit distance along r, 0 on miss
+    const Vec op = p-r.o;  // solves t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
+    const double eps=1e-4, b=op.dot(r.d), disc=b*b-op.dot(op)+rad*rad;
+    if (disc<0) return 0;                  // no real root: the ray misses
+    const double root=sqrt(disc), t_near=b-root, t_far=b+root;
+    return t_near>eps ? t_near : (t_far>eps ? t_far : 0); // nearest root beyond eps
+  }
+};
+Sphere spheres[] = {//Scene: radius, position, emission, color, material
+  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
+  Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Right
+  Sphere(1e5, Vec(50,40.8, 1e5),     Vec(),Vec(.75,.75,.75),DIFF),//Back
+  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(),           DIFF),//Front
+  Sphere(1e5, Vec(50, 1e5, 81.6),    Vec(),Vec(.75,.75,.75),DIFF),//Bottom
+  Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
+  Sphere(16.5,Vec(27,16.5,47),       Vec(),Vec(1,1,1)*.6,   SPEC),//Mirror
+  Sphere(16.5,Vec(73,16.5,78),       Vec(),Vec(.75,1.,.95), REFR),//Glass
+  Sphere(4.0, Vec(50,81.6-16.5,81.6),Vec(4,4,4)*12,  Vec(), DIFF),//Light: the only entry with non-zero emission
+};
+inline double clamp(double x){ if (x<0) return 0; if (x>1) return 1; return x; } // limit to [0,1]
+inline int toInt(double x){ double g=pow(clamp(x),1/2.2); return int(g*255+.5); } // 1/2.2 power, then 0..255
+inline bool intersect(const Ray &r, double &t, int &id){ // nearest hit: sets t and id, false on miss
+  const double inf=1e20; t=inf; const int count=sizeof(spheres)/sizeof(Sphere);
+  for(int i=count-1;i>=0;i--){double d=spheres[i].intersect(r);if(d!=0&&d<t){t=d;id=i;}}
+  return t<inf;
+}
+Vec radiance(const Ray &r, int depth, unsigned short *Xi){ // Xi: erand48 state, advanced by every random draw
+  double t;                               // distance to intersection
+  int id=0;                               // id of intersected object
+  if (!intersect(r, t, id)) return Vec(); // if miss, return black
+  const Sphere &obj = spheres[id];        // the hit object
+  Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c; // hit point, normal, normal facing the ray, colour
+  double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
+  if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj.e; //R.R.; the else pairs with the inner if
+  if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
+    double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
+    Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u; // w,u,v: axes built from nl via cross products
+    Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm(); // random direction expressed in that frame
+    return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
+  } else if (obj.refl == SPEC)            // Ideal SPECULAR reflection
+    return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
+  Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
+  bool into = n.dot(nl)>0;                // Ray from outside going in?
+  double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
+  if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
+    return obj.e + f.mult(radiance(reflRay,depth,Xi));
+  Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm(); // direction of the transmitted ray
+  double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
+  double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P); // Re/Tr: reflected/transmitted weights, P: reflection pick probability
+  return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ?   // Russian roulette
+    radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
+    radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr); // depth<=2: trace both rays
+}
+int main(int argc, char *argv[]){ // renders the scene; PPM (P3) on stdout, progress and timing on stderr
+  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 2; if (samps<1) samps=1; // # samples; an argument below 4 used to give 0 and a black image
+  double tbeg = get_time();
+  Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
+  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h]; // c: w*h pixel buffer, never freed (process exits)
+#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP
+  for (int y=0; y<h; y++){                       // Loop over image rows
+    fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
+    for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++)   // Loop cols; Xi is the per-row erand48 state
+      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++)     // 2x2 subpixel rows
+        for (int sx=0; sx<2; sx++, r=Vec()){        // 2x2 subpixel cols
+          for (int s=0; s<samps; s++){
+            double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
+            double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
+            Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
+                    cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
+            r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
+          } // Camera rays are pushed ^^^^^ forward to start in interior
+          c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25; // each subpixel contributes a quarter
+        }
+  }
+  double tend = get_time();
+  fprintf(stderr, "\nElapsed: %5.1f ms\n", tend - tbeg);
+  fprintf(stdout, "P3\n%d %d\n%d\n", w, h, 255);
+  for (int i=0; i<w*h; i++)
+    fprintf(stdout,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
+}
diff --git a/test/benchmark/stream/stream.c b/test/benchmark/stream/stream.c
new file mode 100644
index 0000000..b9a2cee
--- /dev/null
+++ b/test/benchmark/stream/stream.c
@@ -0,0 +1,585 @@
+/*-----------------------------------------------------------------------*/
+/* Program: STREAM                                                       */
+/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
+/* Original code developed by John D. McCalpin                           */
+/* Programmers: John D. McCalpin                                         */
+/*              Joe R. Zagar                                             */
+/*                                                                       */
+/* This program measures memory transfer rates in MB/s for simple        */
+/* computational kernels coded in C.                                     */
+/*-----------------------------------------------------------------------*/
+/* Copyright 1991-2013: John D. McCalpin                                 */
+/*-----------------------------------------------------------------------*/
+/* License:                                                              */
+/*  1. You are free to use this program and/or to redistribute           */
+/*     this program.                                                     */
+/*  2. You are free to modify this program for your own use,             */
+/*     including commercial use, subject to the publication              */
+/*     restrictions in item 3.                                           */
+/*  3. You are free to publish results obtained from running this        */
+/*     program, or from works that you derive from this program,         */
+/*     with the following limitations:                                   */
+/*     3a. In order to be referred to as "STREAM benchmark results",     */
+/*         published results must be in conformance to the STREAM        */
+/*         Run Rules, (briefly reviewed below) published at              */
+/*         http://www.cs.virginia.edu/stream/ref.html                    */
+/*         and incorporated herein by reference.                         */
+/*         As the copyright holder, John McCalpin retains the            */
+/*         right to determine conformity with the Run Rules.             */
+/*     3b. Results based on modified source code or on runs not in       */
+/*         accordance with the STREAM Run Rules must be clearly          */
+/*         labelled whenever they are published.  Examples of            */
+/*         proper labelling include:                                     */
+/*           "tuned STREAM benchmark results"                            */
+/*           "based on a variant of the STREAM benchmark code"           */
+/*         Other comparable, clear, and reasonable labelling is          */
+/*         acceptable.                                                   */
+/*     3c. Submission of results to the STREAM benchmark web site        */
+/*         is encouraged, but not required.                              */
+/*  4. Use of this program or creation of derived works based on this    */
+/*     program constitutes acceptance of these licensing restrictions.   */
+/*  5. Absolutely no warranty is expressed or implied.                   */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.  
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.  
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.  
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means that each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *          elements to 10 million elements in response to the increasing
+ *          size of L3 caches.  The new default size is large enough for caches
+ *          up to 20 MB. 
+ *      Version 5.10 changes the loop index variables from "register int"
+ *          to "ssize_t", which allows array indices >2^32 (4 billion)
+ *          on properly configured 64-bit systems.  Additional compiler options
+ *          (such as "-mcmodel=medium") may be required for large memory runs.
+ *
+ *      Array size can be set at compile time without modifying the source
+ *          code for the (many) compilers that support preprocessor definitions
+ *          on the compile line.  E.g.,
+ *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
+ *          will override the default size of 10M with a new size of 100M elements
+ *          per array.
+ */
+#ifndef STREAM_ARRAY_SIZE
+#   define STREAM_ARRAY_SIZE	10000000
+#endif
+
+/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
+ *         for any iteration after the first, therefore the minimum value
+ *         for NTIMES is 2.
+ *      There are no rules on maximum allowable values for NTIMES, but
+ *         values larger than the default are unlikely to noticeably
+ *         increase the reported performance.
+ *      NTIMES can also be set on the compile line without changing the source
+ *         code using, for example, "-DNTIMES=7".
+ */
+/* An NTIMES of 1 or less is invalid: discard it so the default applies. */
+#if defined(NTIMES) && NTIMES<=1
+#   undef NTIMES	/* undefine first: redefining a macro with a different value is not permitted */
+#endif
+/* Default repetition count when NTIMES was not supplied or was discarded. */
+#ifndef NTIMES
+#   define NTIMES	10
+#endif
+
+/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
+ *         relative alignment of the arrays (though compilers may change the 
+ *         effective offset by making the arrays non-contiguous on some systems). 
+ *      Use of non-zero values for OFFSET can be especially helpful if the
+ *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
+ *      OFFSET can also be set on the compile line without changing the source
+ *         code using, for example, "-DOFFSET=56".
+ */
+#ifndef OFFSET
+#   define OFFSET	0
+#endif
+
+/*
+ *	3) Compile the code with optimization.  Many compilers generate
+ *       unreasonably bad code before the optimizer tightens things up.  
+ *     If the results are unreasonably good, on the other hand, the
+ *       optimizer might be too smart for me!
+ *
+ *     For a simple single-core version, try compiling with:
+ *            cc -O stream.c -o stream
+ *     This is known to work on many, many systems....
+ *
+ *     To use multiple cores, you need to tell the compiler to obey the OpenMP
+ *       directives in the code.  This varies by compiler, but a common example is
+ *            gcc -O -fopenmp stream.c -o stream_omp
+ *       The environment variable OMP_NUM_THREADS allows runtime control of the 
+ *         number of threads/cores used when the resulting "stream_omp" program
+ *         is executed.
+ *
+ *     To run with single-precision variables and arithmetic, simply add
+ *         -DSTREAM_TYPE=float
+ *     to the compile line.
+ *     Note that this changes the minimum array sizes required --- see (1) above.
+ *
+ *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
+ *       code to call separate functions to execute each kernel.  Trivial versions
+ *       of these functions are provided, but they are *not* tuned -- they just 
+ *       provide predefined interfaces to be replaced with tuned code.
+ *
+ *
+ *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
+ *	   Be sure to include info that will help me understand:
+ *		a) the computer hardware configuration (e.g., processor model, memory type)
+ *		b) the compiler name/version and compilation flags
+ *      c) any run-time information (such as OMP_NUM_THREADS)
+ *		d) all of the output from the test case.
+ *
+ * Thanks!
+ *
+ *-----------------------------------------------------------------------*/
+
+# define HLINE "-------------------------------------------------------------\n" /* separator printed between report sections */
+
+# ifndef MIN
+# define MIN(x,y) ((x)<(y)?(x):(y))	/* note: the selected argument is evaluated twice */
+# endif
+# ifndef MAX
+# define MAX(x,y) ((x)>(y)?(x):(y))	/* note: the selected argument is evaluated twice */
+# endif
+
+#ifndef STREAM_TYPE
+#define STREAM_TYPE double	/* element type of the a, b and c arrays; overridable on the compile line */
+#endif
+
+static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],	/* the three work arrays used by the kernels */
+			b[STREAM_ARRAY_SIZE+OFFSET],
+			c[STREAM_ARRAY_SIZE+OFFSET];
+
+static double	avgtime[4] = {0}, maxtime[4] = {0},	/* per-kernel statistics, filled in by main() */
+		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};	/* starts high so the first MIN() replaces it */
+
+static char	*label[4] = {"Copy:      ", "Scale:     ",	/* row labels of the result table, in kernel order */
+    "Add:       ", "Triad:     "};
+
+static double	bytes[4] = {	/* bytes moved per kernel pass */
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,	/* Copy:  reads a, writes c */
+    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,	/* Scale: reads c, writes b */
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,	/* Add:   reads a and b, writes c */
+    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE	/* Triad: reads b and c, writes a */
+    };
+
+extern double mysecond();	/* current time in seconds, defined later in this file */
+extern void checkSTREAMresults();	/* called by main() after the result table is printed */
+#ifdef TUNED
+extern void tuned_STREAM_Copy();	/* with TUNED defined, main() calls these instead of its inline loops */
+extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
+extern void tuned_STREAM_Add();
+extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
+#endif
+#ifdef _OPENMP
+extern int omp_get_num_threads();
+#endif
+int
+main()	/* prints the configuration, times the four kernels NTIMES times, reports rates, checks results */
+    {
+    int			quantum, checktick();	/* quantum: clock granularity in microseconds */
+    int			BytesPerWord;
+    int			k;
+    ssize_t		j;
+    STREAM_TYPE		scalar;
+    double		t, times[4][NTIMES];	/* times[kernel][iteration], in seconds */
+
+    /* --- SETUP --- determine precision and check timing --- */
+
+    printf(HLINE);
+    printf("STREAM version $Revision: 5.10 $\n");
+    printf(HLINE);
+    BytesPerWord = sizeof(STREAM_TYPE);
+    printf("This system uses %d bytes per array element.\n",
+	BytesPerWord);
+
+    printf(HLINE);
+#ifdef N
+    printf("*****  WARNING: ******\n");
+    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
+    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
+    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
+    printf("*****  WARNING: ******\n");
+#endif
+
+    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
+    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
+	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
+    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
+	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
+    printf("Each kernel will be executed %d times.\n", NTIMES);
+    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
+    printf(" will be used to compute the reported bandwidth.\n");
+
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel 
+    {
+#pragma omp master
+	{
+	    k = omp_get_num_threads();
+	    printf ("Number of Threads requested = %i\n",k);
+        }
+    }
+#endif
+
+#ifdef _OPENMP
+	k = 0;	/* each thread of the parallel region below increments k once */
+#pragma omp parallel
+#pragma omp atomic 
+		k++;
+    printf ("Number of Threads counted = %i\n",k);
+#endif
+
+    /* Initialise the three arrays before any timing is done. */
+#pragma omp parallel for
+    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+	    a[j] = 1.0;
+	    b[j] = 2.0;
+	    c[j] = 0.0;
+	}
+
+    printf(HLINE);
+
+    if  ( (quantum = checktick()) >= 1) 
+	printf("Your clock granularity/precision appears to be "
+	    "%d microseconds.\n", quantum);
+    else {
+	printf("Your clock granularity appears to be "
+	    "less than one microsecond.\n");
+	quantum = 1;	/* avoids dividing by zero in the clock-tick estimate below */
+    }
+
+    t = mysecond();	/* time one pass that doubles a[] as a rough per-test estimate */
+#pragma omp parallel for
+    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
+		a[j] = 2.0E0 * a[j];
+    t = 1.0E6 * (mysecond() - t);	/* elapsed time in microseconds */
+
+    printf("Each test below will take on the order"
+	" of %d microseconds.\n", (int) t  );
+    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
+    printf("Increase the size of the arrays if this shows that\n");
+    printf("you are not getting at least 20 clock ticks per test.\n");
+
+    printf(HLINE);
+
+    printf("WARNING -- The above is only a rough guideline.\n");
+    printf("For best results, please be sure you know the\n");
+    printf("precision of your system timer.\n");
+    printf(HLINE);
+    
+    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
+
+    scalar = 3.0;
+    for (k=0; k<NTIMES; k++)
+	{
+	times[0][k] = mysecond();	/* Copy: c = a */
+#ifdef TUNED
+        tuned_STREAM_Copy();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j];
+#endif
+	times[0][k] = mysecond() - times[0][k];
+	
+	times[1][k] = mysecond();	/* Scale: b = scalar*c */
+#ifdef TUNED
+        tuned_STREAM_Scale(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    b[j] = scalar*c[j];
+#endif
+	times[1][k] = mysecond() - times[1][k];
+	
+	times[2][k] = mysecond();	/* Add: c = a+b */
+#ifdef TUNED
+        tuned_STREAM_Add();
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    c[j] = a[j]+b[j];
+#endif
+	times[2][k] = mysecond() - times[2][k];
+	
+	times[3][k] = mysecond();	/* Triad: a = b+scalar*c */
+#ifdef TUNED
+        tuned_STREAM_Triad(scalar);
+#else
+#pragma omp parallel for
+	for (j=0; j<STREAM_ARRAY_SIZE; j++)
+	    a[j] = b[j]+scalar*c[j];
+#endif
+	times[3][k] = mysecond() - times[3][k];
+	}
+
+    /*	--- SUMMARY --- */
+
+    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
+	{
+	for (j=0; j<4; j++)
+	    {
+	    avgtime[j] = avgtime[j] + times[j][k];	/* running sum; divided by NTIMES-1 below */
+	    mintime[j] = MIN(mintime[j], times[j][k]);
+	    maxtime[j] = MAX(maxtime[j], times[j][k]);
+	    }
+	}
+    
+    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
+    for (j=0; j<4; j++) {
+		avgtime[j] = avgtime[j]/(double)(NTIMES-1);	/* NTIMES-1 because the first iteration was skipped */
+
+		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
+	       1.0E-06 * bytes[j]/mintime[j],	/* best rate: bytes moved over the minimum time, scaled by 1e-6 */
+	       avgtime[j],
+	       mintime[j],
+	       maxtime[j]);
+    }
+    printf(HLINE);
+
+    /* --- Check Results --- */
+    checkSTREAMresults();
+    printf(HLINE);
+
+    return 0;
+}
+
+# define	M	20
+
+/* Estimate the timer granularity in whole microseconds: the smallest gap
+ * between M successive readings of mysecond(), starting from 1000000. */
+int
+checktick()
+{
+    double	stamp[M], before, after;
+    int		n, gap, finest;
+
+    /* Take M readings, spinning until each is >= 1 us past the previous read. */
+    n = 0;
+    while (n < M) {
+	before = mysecond();
+	do {
+	    after = mysecond();
+	} while ((after - before) < 1.0E-6);
+	stamp[n] = after;
+	n++;
+    }
+
+    /* Track the smallest neighbour difference, passing each through MAX(gap, 0). */
+    finest = 1000000;
+    for (n = 1; n < M; n++) {
+	gap = (int)( 1.0E6 * (stamp[n] - stamp[n-1]) );
+	gap = MAX(gap, 0);
+	finest = MIN(finest, gap);
+    }
+
+    return finest;
+}
+
+
+
+/* A gettimeofday routine to give access to the wall
+   clock timer on most UNIX-like systems.  */
+
+#include <sys/time.h>
+
+/* Wall-clock time in seconds (microsecond resolution); 0.0 if the clock cannot be read. */
+double mysecond()
+{
+        struct timeval tp;
+        /* POSIX leaves a non-null timezone argument unspecified, so pass NULL. */
+        if (gettimeofday(&tp, NULL) != 0)
+                return 0.0;	/* tp was not filled in; do not read it */
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+
+#ifndef abs
+#define abs(a) ((a) >= 0 ? (a) : -(a))
+#endif
+/*
+ * Check a[], b[], c[] against the values predicted by replaying the kernels on
+ * scalars and print a pass/fail report; epsilon is chosen from sizeof(STREAM_TYPE).
+ * Arguments for %lu / %ld are cast, since size_t / ssize_t need not be those types.
+ */
+void checkSTREAMresults ()
+{
+	STREAM_TYPE aj,bj,cj,scalar;
+	STREAM_TYPE aSumErr,bSumErr,cSumErr,aAvgErr,bAvgErr,cAvgErr;
+	double epsilon;
+	ssize_t	j;
+	int	k,ierr,err;
+
+    /* reproduce initialization */
+	aj = 1.0;
+	bj = 2.0;
+	cj = 0.0;
+    /* a[] is modified during timing check */
+	aj = 2.0E0 * aj;
+    /* now execute timing loop */
+	scalar = 3.0;
+	for (k=0; k<NTIMES; k++) {
+		cj = aj;
+		bj = scalar*cj;
+		cj = aj+bj;
+		aj = bj+scalar*cj;
+	}
+
+    /* accumulate deltas between observed and expected results */
+	aSumErr = bSumErr = cSumErr = 0.0;
+	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+		aSumErr += abs(a[j] - aj);
+		bSumErr += abs(b[j] - bj);
+		cSumErr += abs(c[j] - cj);
+	}
+	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
+
+	if (sizeof(STREAM_TYPE) == 4) {
+		epsilon = 1.e-6;
+	}
+	else if (sizeof(STREAM_TYPE) == 8) {
+		epsilon = 1.e-13;
+	}
+	else {
+		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",(unsigned long) sizeof(STREAM_TYPE));
+		epsilon = 1.e-6;
+	}
+
+	err = 0;
+	if (abs(aAvgErr/aj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(a[j]/aj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,aj,a[j],abs((aj-a[j])/aj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array a[], %d errors were found.\n",ierr);
+	}
+	if (abs(bAvgErr/bj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(b[j]/bj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,bj,b[j],abs((bj-b[j])/bj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array b[], %d errors were found.\n",ierr);
+	}
+	if (abs(cAvgErr/cj) > epsilon) {
+		err++;
+		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
+		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
+		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+		ierr = 0;
+		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
+			if (abs(c[j]/cj-1.0) > epsilon) {
+				ierr++;
+#ifdef VERBOSE
+				if (ierr < 10) {
+					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
+						(long) j,cj,c[j],abs((cj-c[j])/cj));	/* relative to the expected value */
+				}
+#endif
+			}
+		}
+		printf("     For array c[], %d errors were found.\n",ierr);
+	}
+	if (err == 0) {
+		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
+	}
+#ifdef VERBOSE
+	printf ("Results Validation Verbose Results: \n");
+	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
+	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
+	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
+#endif
+}
+
+#ifdef TUNED
+/* stubs for "tuned" versions of the kernels */
+/* TUNED stub for the Copy kernel: c[] = a[], element by element. */
+void tuned_STREAM_Copy()
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx) { c[idx] = a[idx]; }
+}
+
+/* TUNED stub for the Scale kernel: b[] = scalar * c[], element by element. */
+void tuned_STREAM_Scale(STREAM_TYPE scalar)
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx) { b[idx] = scalar*c[idx]; }
+}
+
+/* TUNED stub for the Add kernel: c[] = a[] + b[], element by element. */
+void tuned_STREAM_Add()
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx) { c[idx] = a[idx]+b[idx]; }
+}
+
+/* TUNED stub for the Triad kernel: a[] = b[] + scalar * c[], element by element. */
+void tuned_STREAM_Triad(STREAM_TYPE scalar)
+{
+	ssize_t idx;
+#pragma omp parallel for
+	for (idx = 0; idx < STREAM_ARRAY_SIZE; ++idx) { a[idx] = b[idx]+scalar*c[idx]; }
+}
+/* end of stubs for the "tuned" versions of the kernels */
+#endif
diff --git a/test/benchmark/stream/stream.wasm b/test/benchmark/stream/stream.wasm
new file mode 100755
index 0000000..fc6d268
Binary files /dev/null and b/test/benchmark/stream/stream.wasm differ
diff --git a/test/benchmark/wasm/crc.wasm b/test/benchmark/wasm/crc.wasm
deleted file mode 100644
index 63923fd..0000000
Binary files a/test/benchmark/wasm/crc.wasm and /dev/null differ
diff --git a/test/benchmark/wasm/mandelbrot.wasm b/test/benchmark/wasm/mandelbrot.wasm
deleted file mode 100644
index cb585b6..0000000
Binary files a/test/benchmark/wasm/mandelbrot.wasm and /dev/null differ