From 8251a26c4ec5a9e9b0910aedc2cd0bd05d42262e Mon Sep 17 00:00:00 2001 From: asakhar Date: Fri, 19 May 2023 21:07:31 +0300 Subject: [PATCH 1/3] optimized mat_dot & benchmarked it --- bench.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ build.sh | 1 + nn.h | 6 +++--- 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 bench.c diff --git a/bench.c b/bench.c new file mode 100644 index 0000000..057910e --- /dev/null +++ b/bench.c @@ -0,0 +1,65 @@ +#define NN_IMPLEMENTATION +#include "nn.h" +#include +#include + +#define WARM_UP_TIME 3 +#define ITERS 500 + +void mat_dot_old(Mat dst, Mat a, Mat b); + +typedef void (*DotFunc)(Mat, Mat, Mat); +void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name); + +int main(void) +{ + /// setup + size_t R = 300; + size_t K = 200; + size_t C = 400; + Mat a = mat_alloc(R, K); + Mat b = mat_alloc(K, C); + Mat dst = mat_alloc(R, C); + mat_rand(a, 0, 1); + mat_rand(b, 0, 1); + + bench(dst, a, b, mat_dot_old, "old"); + bench(dst, a, b, mat_dot, "new"); +} + +void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name) +{ + double start = (double)clock() / CLOCKS_PER_SEC; + double end = start; + printf("Warming up for %d seconds...\n", WARM_UP_TIME); + while (end-start < WARM_UP_TIME) + { + func(dst, a, b); + end = (double)clock() / CLOCKS_PER_SEC; + } + printf("Running bench %s...\n", name); + start = (double)clock() / CLOCKS_PER_SEC; + for (size_t i = 0; i < ITERS; ++i) + { + func(dst, a, b); + } + end = (double)clock() / CLOCKS_PER_SEC; + printf("%s solution took: %fs to process\n", name, end - start); +} + +void mat_dot_old(Mat dst, Mat a, Mat b) +{ + NN_ASSERT(a.cols == b.rows); + size_t n = a.cols; + NN_ASSERT(dst.rows == a.rows); + NN_ASSERT(dst.cols == b.cols); + + for (size_t i = 0; i < dst.rows; ++i) { + for (size_t j = 0; j < dst.cols; ++j) { + MAT_AT(dst, i, j) = 0; + for (size_t k = 0; k < n; ++k) { + MAT_AT(dst, i, j) += MAT_AT(a, i, k) * MAT_AT(b, k, j); + } + } + } +} diff --git 
a/build.sh b/build.sh index 1e9716a..b238559 100755 --- a/build.sh +++ b/build.sh @@ -8,3 +8,4 @@ LIBS="`pkg-config --libs raylib` -lm -lglfw -ldl -lpthread" clang $CFLAGS -o adder_gen adder_gen.c $LIBS clang $CFLAGS -o xor_gen xor_gen.c $LIBS clang $CFLAGS -o gym gym.c $LIBS +clang $CFLAGS -o bench bench.c diff --git a/nn.h b/nn.h index 5e50dee..e56db42 100644 --- a/nn.h +++ b/nn.h @@ -130,11 +130,11 @@ void mat_dot(Mat dst, Mat a, Mat b) size_t n = a.cols; NN_ASSERT(dst.rows == a.rows); NN_ASSERT(dst.cols == b.cols); + mat_fill(dst, 0); for (size_t i = 0; i < dst.rows; ++i) { - for (size_t j = 0; j < dst.cols; ++j) { - MAT_AT(dst, i, j) = 0; - for (size_t k = 0; k < n; ++k) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < dst.cols; ++j) { MAT_AT(dst, i, j) += MAT_AT(a, i, k) * MAT_AT(b, k, j); } } From abf3d86781c659187bde7bb0b652386e7e78d123 Mon Sep 17 00:00:00 2001 From: asakhar Date: Fri, 19 May 2023 21:27:19 +0300 Subject: [PATCH 2/3] implemented testing & moved compiler to CC var in build script --- bench.c | 17 +++++++++++++++++ build.sh | 11 ++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/bench.c b/bench.c index 057910e..fa841d6 100644 --- a/bench.c +++ b/bench.c @@ -10,6 +10,7 @@ void mat_dot_old(Mat dst, Mat a, Mat b); typedef void (*DotFunc)(Mat, Mat, Mat); void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name); +void test_against(Mat a, Mat b, DotFunc reference, DotFunc to_test); int main(void) { @@ -25,6 +26,7 @@ int main(void) bench(dst, a, b, mat_dot_old, "old"); bench(dst, a, b, mat_dot, "new"); + test_against(a, b, mat_dot_old, mat_dot); } void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name) @@ -63,3 +65,18 @@ void mat_dot_old(Mat dst, Mat a, Mat b) } } } + +void test_against(Mat a, Mat b, DotFunc reference, DotFunc to_test) { + Mat reference_res = mat_alloc(a.rows, b.cols); + Mat test_res = mat_alloc(a.rows, b.cols); + reference(reference_res, a, b); + to_test(test_res, a, b); + 
size_t total = reference_res.rows * reference_res.cols; + for(size_t i = 0; i < total; ++i) { + if(reference_res.es[i] != test_res.es[i]) { + fputs("Matrices did not match", stderr); + return; + } + } + puts("Matrices are equal"); +} \ No newline at end of file diff --git a/build.sh b/build.sh index 2ac6358..8923d53 100755 --- a/build.sh +++ b/build.sh @@ -4,9 +4,10 @@ set -xe CFLAGS="-O3 -Wall -Wextra" LIBS="-lm" +CC=clang -clang $CFLAGS -o adder_gen adder_gen.c $LIBS -clang $CFLAGS -o xor_gen xor_gen.c $LIBS -clang $CFLAGS `pkg-config --cflags raylib` -o gym gym.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread -clang $CFLAGS `pkg-config --cflags raylib` -o img2mat img2mat.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread -clang $CFLAGS -o bench bench.c +$CC $CFLAGS -o adder_gen adder_gen.c $LIBS +$CC $CFLAGS -o xor_gen xor_gen.c $LIBS +$CC $CFLAGS `pkg-config --cflags raylib` -o gym gym.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread +$CC $CFLAGS `pkg-config --cflags raylib` -o img2mat img2mat.c $LIBS `pkg-config --libs raylib` -lglfw -ldl -lpthread +$CC $CFLAGS -o bench bench.c $LIBS From 37ddd71e8c11635ee4643ccc5e827b86c52f7112 Mon Sep 17 00:00:00 2001 From: asakhar Date: Fri, 19 May 2023 21:45:16 +0300 Subject: [PATCH 3/3] added description --- bench.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/bench.c b/bench.c index fa841d6..cb1b50f 100644 --- a/bench.c +++ b/bench.c @@ -1,3 +1,13 @@ +/// Author: asakhar +/// Description: +/// I basically changed the order of iteration +/// in matrix multiplication to make it more +/// cache friendly. Here are the benchmarks and test. 
+/// +/// GCC with -O3 gave me 7x improvement (~0.021s -> ~0.003s) +/// MSVC with /O2 shows a somewhat smaller difference, but nevertheless +/// a 2x improvement (~0.021s -> ~0.01s) + #define NN_IMPLEMENTATION #include "nn.h" #include @@ -14,7 +24,7 @@ void test_against(Mat a, Mat b, DotFunc reference, DotFunc to_test); int main(void) { - /// setup + // setup size_t R = 300; size_t K = 200; size_t C = 400; @@ -24,8 +34,11 @@ int main(void) mat_rand(a, 0, 1); mat_rand(b, 0, 1); + // actual benches bench(dst, a, b, mat_dot_old, "old"); bench(dst, a, b, mat_dot, "new"); + + // testing test_against(a, b, mat_dot_old, mat_dot); } @@ -40,13 +53,15 @@ void bench(Mat dst, Mat a, Mat b, DotFunc func, char const *name) end = (double)clock() / CLOCKS_PER_SEC; } printf("Running bench %s...\n", name); - start = (double)clock() / CLOCKS_PER_SEC; + double total_time = 0; for (size_t i = 0; i < ITERS; ++i) { + start = (double)clock() / CLOCKS_PER_SEC; func(dst, a, b); + end = (double)clock() / CLOCKS_PER_SEC; + total_time += end-start; } - end = (double)clock() / CLOCKS_PER_SEC; - printf("%s solution took: %fs to process\n", name, end - start); + printf("%s solution took: %fs to process in average among %d iterations\n", name, total_time/(double)ITERS, ITERS); } void mat_dot_old(Mat dst, Mat a, Mat b)