diff --git a/bench.c b/bench.c
new file mode 100644
index 0000000..cb1b50f
--- /dev/null
+++ b/bench.c
@@ -0,0 +1,97 @@
+/// Author: asakhar
+/// Description: 
+///  I basically changed the order of iteration
+///  in matrix multiplication to make it more
+///  cache friendly. Here are the benchmarks and test.
+///  
+///  GCC with -O3 gave me 7x improvement (~0.021s -> ~0.003s)
+///  MSVC with /O2 has a little bit less difference but never the less: 
+///    2x improved (~0.021s -> ~0.01s)
+
+#define NN_IMPLEMENTATION
+#include "nn.h"
+#include <time.h>
+#include <stdio.h>
+
+#define WARM_UP_TIME 3
+#define ITERS 500
+
+void mat_dot_old(Mat dst, Mat a, Mat b);
+
+typedef void (*DotFunc)(Mat, Mat, Mat);
+void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name);
+void test_against(Mat a, Mat b, DotFunc reference, DotFunc to_test);
+
+int main(void)
+{
+  // setup
+  size_t R = 300;
+  size_t K = 200;
+  size_t C = 400;
+  Mat a = mat_alloc(R, K);
+  Mat b = mat_alloc(K, C);
+  Mat dst = mat_alloc(R, C);
+  mat_rand(a, 0, 1);
+  mat_rand(b, 0, 1);
+
+  // actual benches
+  bench(dst, a, b, mat_dot_old, "old");
+  bench(dst, a, b, mat_dot, "new");
+  
+  // testing
+  test_against(a, b, mat_dot_old, mat_dot);
+}
+
+void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name)
+{
+  double start = (double)clock() / CLOCKS_PER_SEC;
+  double end = start;
+  printf("Warming up for %d seconds...\n", WARM_UP_TIME);
+  while (end-start < WARM_UP_TIME)
+  {
+    func(dst, a, b);
+    end = (double)clock() / CLOCKS_PER_SEC;
+  }
+  printf("Running bench %s...\n", name);
+  double total_time = 0;
+  for (size_t i = 0; i < ITERS; ++i)
+  {
+    start = (double)clock() / CLOCKS_PER_SEC;
+    func(dst, a, b);
+    end = (double)clock() / CLOCKS_PER_SEC;
+    total_time += end-start;
+  }
+  printf("%s solution took: %fs to process in average among %d iterations\n", name, total_time/(double)ITERS, ITERS);
+}
+
+void mat_dot_old(Mat dst, Mat a, Mat b)
+{
+    NN_ASSERT(a.cols == b.rows);
+    size_t n = a.cols;
+    NN_ASSERT(dst.rows == a.rows);
+    NN_ASSERT(dst.cols == b.cols);
+
+    for (size_t i = 0; i < dst.rows; ++i) {
+        for (size_t j = 0; j < dst.cols; ++j) {
+            MAT_AT(dst, i, j) = 0;
+            for (size_t k = 0; k < n; ++k) {
+                MAT_AT(dst, i, j) += MAT_AT(a, i, k) * MAT_AT(b, k, j);
+            }
+        }
+    }
+}
+
+void test_against(Mat a, Mat b, DotFunc reference, DotFunc to_test) {
+  Mat reference_res = mat_alloc(a.rows, b.cols);
+  Mat test_res = mat_alloc(a.rows, b.cols);
+  reference(reference_res, a, b);
+  to_test(test_res, a, b);
+  size_t total = reference_res.rows * reference_res.cols;
+  for(size_t i = 0; i < total; ++i) {
+    if(reference_res.es[i] != test_res.es[i]) {
+      fputs("Matrices did not match", stderr);
+      return;
+    }
+  }
+  puts("Matrices are equal");
+}
\ No newline at end of file
diff --git a/build.sh b/build.sh
index b3bbed6..a62f49b 100755
--- a/build.sh
+++ b/build.sh
@@ -5,6 +5,8 @@ set -xe
 CFLAGS="-O3 -Wall -Wextra -I./thirdparty/ `pkg-config --cflags raylib`"
 LIBS="-lm `pkg-config --libs raylib` -lglfw -ldl -lpthread"
 
-clang $CFLAGS -o adder adder.c $LIBS
-clang $CFLAGS -o xor xor.c $LIBS
-clang $CFLAGS -o img2nn img2nn.c $LIBS
+clang $CFLAGS -o adder_gen adder_gen.c $LIBS
+clang $CFLAGS `pkg-config --cflags raylib` -o xor xor.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread
+clang $CFLAGS `pkg-config --cflags raylib` -o gym gym.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread
+clang $CFLAGS `pkg-config --cflags raylib` -o img2nn img2nn.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread
+clang $CFLAGS -o bench bench.c $LIBS
\ No newline at end of file
diff --git a/nn.h b/nn.h
index 8bc6e8e..1cf7f32 100644
--- a/nn.h
+++ b/nn.h
@@ -168,11 +168,11 @@ void mat_dot(Mat dst, Mat a, Mat b)
     size_t n = a.cols;
     NN_ASSERT(dst.rows == a.rows);
     NN_ASSERT(dst.cols == b.cols);
+    mat_fill(dst, 0);
 
     for (size_t i = 0; i < dst.rows; ++i) {
-        for (size_t j = 0; j < dst.cols; ++j) {
-            MAT_AT(dst, i, j) = 0;
-            for (size_t k = 0; k < n; ++k) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < dst.cols; ++j) {
                 MAT_AT(dst, i, j) += MAT_AT(a, i, k) * MAT_AT(b, k, j);
             }
         }