From e80f2666aba0f93da9270d33264711a06d145465 Mon Sep 17 00:00:00 2001
From: sgui <sheng.gui@intel.com>
Date: Fri, 28 Jun 2024 00:41:58 +0000
Subject: [PATCH 1/4] Add env param KV_CACHE_LOCATION to control kv cache
 memory NUMA node location

Usage:
Before starting an instance, set the environment variable:
export KV_CACHE_LOCATION=<id_of_the_memory_NUMA_node_to_use_for_the_kv_cache>

By default, the KV cache location is the same as that of the other parts of the instance.
---
 src/common/kvcache_mgr.h       |  5 ++++-
 src/common/kvcache_tensor.h    | 36 ++++++++++++++++++++++------------
 src/models/kvcache_manager.cpp | 14 +++++++------
 src/models/kvcache_manager.h   |  1 +
 4 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h
index c2fa271b..8f91f799 100644
--- a/src/common/kvcache_mgr.h
+++ b/src/common/kvcache_mgr.h
@@ -41,6 +41,8 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
         this->headNum_ = headNum;
         this->headSize_ = headSize;
         this->layers_ = layers;
+        // The KV Cache location configured in "KV_CACHE_LOCATION"
+        this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1;
     }
 
     ~KVCacheMgrImpl() {
@@ -89,7 +91,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
         // User specified maxSeqLen needs to be <= model's configured maxSeqLen
         auto maxLen = maxSeqLen > 0 ? std::min(maxSeqLen, maxSeqLen_) : maxSeqLen_;
         for (int i = 0; i < 2 * layers_; ++i) {
-            cache[i].resize(maxLen, 1, headNum_, headSize_);
+            cache[i].resize(maxLen, 1, headNum_, headSize_, this->allocNode);
         }
 
         sequenceCaches.insert({seqID, cache});
@@ -186,6 +188,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
     int headNum_;
     int headSize_;
     int layers_;
+    int allocNode;
 };
 
 class KVCacheMgr {
diff --git a/src/common/kvcache_tensor.h b/src/common/kvcache_tensor.h
index 438bd1f9..a37d7ade 100644
--- a/src/common/kvcache_tensor.h
+++ b/src/common/kvcache_tensor.h
@@ -24,6 +24,7 @@
 #include "allocator.h"
 #include "bfloat16.h"
 #include "float16.h"
+#include "numa_allocator.h"
 
 extern bool kvTrans();
 
@@ -67,23 +68,23 @@ template <typename T>
 class KVCacheTensor {
 public:
     KVCacheTensor()
-        : maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr) {}
+        : maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr), scalesAllocSize(0) {}
 
     ~KVCacheTensor() {
-        if (this->data) { free(this->data); }
-        if (this->scales) { free(this->scales); }
+        if (this->data) { xft_numa_free(this->data, allocSize); }
+        if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
     }
 
-    void resize(int maxSeqLen, int batchSize, int headNum, int headSize) {
+    void resize(int maxSeqLen, int batchSize, int headNum, int headSize, int allocNode) {
         this->maxSeqLen = maxSeqLen;
         this->batchSize = batchSize;
         this->headNum = headNum;
         this->headSize = headSize;
 
-        uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize;
+        uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize * sizeof(T);
         if (requiredSize > allocSize) {
-            if (this->data) { free(this->data); }
-            this->data = (T *)xft::alloc(requiredSize * sizeof(T));
+            if (this->data) { xft_numa_free(this->data, allocSize); }
+            this->data = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
             if (!this->data) {
                 printf("Failed to alloc mem for KV Cache [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
                 exit(-1);
@@ -91,8 +92,16 @@ class KVCacheTensor {
             allocSize = requiredSize;
         }
 
-        if (this->scales) { free(this->scales); }
-        this->scales = (float *)xft::alloc((uint64_t)maxSeqLen * batchSize * headNum * sizeof(float));
+        requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * sizeof(float);
+        if (requiredSize > scalesAllocSize) {
+            if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); }
+            this->scales = (float *)xft_numa_alloc_onnode(requiredSize, allocNode);
+            if (!this->scales) {
+                printf("Failed to alloc mem for KV Cache scales [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize);
+                exit(-1);
+            }
+            scalesAllocSize = requiredSize;
+        }
     }
 
     int getBatchSize() const { return batchSize; }
@@ -188,15 +197,15 @@ class KVCacheTensor {
      * initSeqLen: initial sequence length, which is the prompt token size
      * accSeqLen: accumulated sequence length
     */
-    void reorder(int *idx, int size, int initSeqLen, int accSeqLen) {
+    void reorder(int *idx, int size, int initSeqLen, int accSeqLen, int allocNode) {
         const int cols = this->getHeadNum() * this->getHeadSize();
         const int batchSize = this->getBatchSize();
 
         T *pdata = this->data + initSeqLen * batchSize * cols;
 
         // Temporary buffer used for reorder
-        T *extraKeyBuf = (T *)xft::alloc((batchSize - 1) * cols * sizeof(T));
-
+        uint64_t requiredSize = (uint64_t)(batchSize - 1) * cols * sizeof(T);
+        T *extraKeyBuf = (T *)xft_numa_alloc_onnode(requiredSize, allocNode);
         for (int seq = initSeqLen; seq < accSeqLen; ++seq) { // Reorder is not needed for the first few lines
             int extraBufIdx = 0;
             int remapped[batchSize];
@@ -260,7 +269,7 @@ class KVCacheTensor {
             pdata += batchSize * cols;
         }
 
-        free(extraKeyBuf);
+        xft_numa_free(extraKeyBuf, requiredSize);
     }
 
 private:
@@ -327,4 +336,5 @@ class KVCacheTensor {
 
     // The scale factor for each head (if T is int8)
     float *scales;
+    uint64_t scalesAllocSize;
 };
diff --git a/src/models/kvcache_manager.cpp b/src/models/kvcache_manager.cpp
index 13ccec92..de324c03 100644
--- a/src/models/kvcache_manager.cpp
+++ b/src/models/kvcache_manager.cpp
@@ -23,17 +23,19 @@
 
 template <typename KVCacheT>
 void KVCacheManager<KVCacheT>::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) {
+    // The KV Cache location configured in "KV_CACHE_LOCATION"
+    this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1;
     if (prefix && this->cachedPrefixKeys == nullptr) {
         this->cachedPrefixKeys = new KVCacheTensor<KVCacheT>[layers];
         this->cachedPrefixValues = new KVCacheTensor<KVCacheT>[layers];
     }
     for (int i = 0; i < this->layers; ++i) {
         if (prefix) {
-            this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize);
-            this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize);
+            this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
+            this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode);
         } else {
-            this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize);
-            this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize);
+            this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
+            this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode);
         }
     }
 }
@@ -100,10 +102,10 @@ void KVCacheManager<KVCacheT>::reorderCache(int *idx, int size, int initSeqLen,
         int layer = i / 2;
         if (i % 2 == 0) {
             KVCacheTensor<KVCacheT> &keyTensor = this->getKey(layer);
-            keyTensor.reorder(idx, size, initSeqLen, accSeqLen);
+            keyTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
         } else {
             KVCacheTensor<KVCacheT> &valueTensor = this->getValue(layer);
-            valueTensor.reorder(idx, size, initSeqLen, accSeqLen);
+            valueTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode);
         }
     }
 }
diff --git a/src/models/kvcache_manager.h b/src/models/kvcache_manager.h
index 6f593029..430e88a5 100644
--- a/src/models/kvcache_manager.h
+++ b/src/models/kvcache_manager.h
@@ -69,6 +69,7 @@ class KVCacheManager {
     void reorderCache(int *idx, int size, int initSeqLen, int accSeqLen);
 
 private:
+    int allocNode;
     int layers; // how many layers
     KVCacheTensor<KVCacheT> *cachedKeys; // all accumulated keys
     KVCacheTensor<KVCacheT> *cachedValues; // all accumulated values

From c720606b5176ad4db16aeb8df552570d51a0de53 Mon Sep 17 00:00:00 2001
From: sgui <sheng.gui@intel.com>
Date: Tue, 2 Jul 2024 13:46:33 +0000
Subject: [PATCH 2/4] move env KV_CACHE_LOCATION init & get into environment.h

---
 src/common/kvcache_mgr.h       |  7 ++++---
 src/models/kvcache_manager.cpp |  5 +++--
 src/utils/environment.h        | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h
index 8f91f799..ed617348 100644
--- a/src/common/kvcache_mgr.h
+++ b/src/common/kvcache_mgr.h
@@ -13,10 +13,11 @@
 // limitations under the License.
 // ============================================================================
 #pragma once
-
+#include <unordered_map>
 #include <vector>
+
+#include "environment.h"
 #include "kvcache_tensor.h"
-#include <unordered_map>
 
 namespace xft {
 
@@ -42,7 +43,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
         this->headSize_ = headSize;
         this->layers_ = layers;
         // The KV Cache location configured in "KV_CACHE_LOCATION"
-        this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1;
+        this->allocNode = Env::getInstance().getPrimitiveCacheM();
     }
 
     ~KVCacheMgrImpl() {
diff --git a/src/models/kvcache_manager.cpp b/src/models/kvcache_manager.cpp
index de324c03..eafd5dba 100644
--- a/src/models/kvcache_manager.cpp
+++ b/src/models/kvcache_manager.cpp
@@ -12,19 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // ============================================================================
-#include "kvcache_manager.h"
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include "allocator.h"
 #include "bfloat16.h"
+#include "environment.h"
 #include "float16.h"
+#include "kvcache_manager.h"
 
 template <typename KVCacheT>
 void KVCacheManager<KVCacheT>::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) {
     // The KV Cache location configured in "KV_CACHE_LOCATION"
-    this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1;
+    this->allocNode = Env::getInstance().getPrimitiveCacheM();
     if (prefix && this->cachedPrefixKeys == nullptr) {
         this->cachedPrefixKeys = new KVCacheTensor<KVCacheT>[layers];
         this->cachedPrefixValues = new KVCacheTensor<KVCacheT>[layers];
diff --git a/src/utils/environment.h b/src/utils/environment.h
index ddcb2df4..446b85a2 100644
--- a/src/utils/environment.h
+++ b/src/utils/environment.h
@@ -71,6 +71,9 @@ class Env {
     // get Primitive Cache M
     int getPrimitiveCacheM() { return primitiveCacheM; }
 
+    // get KV Cache Location
+    int getKVCacheLocation() { return primitiveCacheM; }
+
 private:
     Env() {
         // init Verbose
@@ -281,4 +284,16 @@ class Env {
             primitiveCacheM = 256;
         }
     }
+
+    // KV_CACHE_LOCATION
+    int kvCacheLocation = -1;
+    void initKVCacheLocation() {
+        // The KV Cache location configured in "KV_CACHE_LOCATION"
+        char *xft_kvcache_location_value = getenv("KV_CACHE_LOCATION");
+        if (xft_kvcache_location_value != NULL) {
+            int value = atoi(xft_kvcache_location_value);
+            if (value >= 0)
+                kvCacheLocation = value;
+        }
+    }
 };
\ No newline at end of file

From 2b7d50fcd44afc74db6b434b0e1d016d7e257e88 Mon Sep 17 00:00:00 2001
From: sgui <sheng.gui@intel.com>
Date: Wed, 3 Jul 2024 02:42:51 +0000
Subject: [PATCH 3/4] Fix getKVCacheLocation() typo and call initKVCacheLocation()

---
 src/utils/environment.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/utils/environment.h b/src/utils/environment.h
index 446b85a2..483d791a 100644
--- a/src/utils/environment.h
+++ b/src/utils/environment.h
@@ -72,7 +72,7 @@ class Env {
     int getPrimitiveCacheM() { return primitiveCacheM; }
 
     // get KV Cache Location
-    int getKVCacheLocation() { return primitiveCacheM; }
+    int getKVCacheLocation() { return kvCacheLocation; }
 
 private:
     Env() {
@@ -114,6 +114,9 @@ class Env {
 
         // init Primitive Cache M
         initPrimitiveCacheM();
+
+        // init KV Cache Location
+        initKVCacheLocation();
     }
 
     // Verbose

From f0236e603b499833d32c189dfbdacc2b6b19cc03 Mon Sep 17 00:00:00 2001
From: sgui <sheng.gui@intel.com>
Date: Wed, 3 Jul 2024 10:23:00 +0000
Subject: [PATCH 4/4] Read the KV cache alloc node via getKVCacheLocation()

---
 src/common/kvcache_mgr.h       | 2 +-
 src/models/kvcache_manager.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h
index ed617348..ace2919d 100644
--- a/src/common/kvcache_mgr.h
+++ b/src/common/kvcache_mgr.h
@@ -43,7 +43,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase {
         this->headSize_ = headSize;
         this->layers_ = layers;
         // The KV Cache location configured in "KV_CACHE_LOCATION"
-        this->allocNode = Env::getInstance().getPrimitiveCacheM();
+        this->allocNode = Env::getInstance().getKVCacheLocation();
     }
 
     ~KVCacheMgrImpl() {
diff --git a/src/models/kvcache_manager.cpp b/src/models/kvcache_manager.cpp
index eafd5dba..c1ad1ced 100644
--- a/src/models/kvcache_manager.cpp
+++ b/src/models/kvcache_manager.cpp
@@ -25,7 +25,7 @@
 template <typename KVCacheT>
 void KVCacheManager<KVCacheT>::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) {
     // The KV Cache location configured in "KV_CACHE_LOCATION"
-    this->allocNode = Env::getInstance().getPrimitiveCacheM();
+    this->allocNode = Env::getInstance().getKVCacheLocation();
     if (prefix && this->cachedPrefixKeys == nullptr) {
         this->cachedPrefixKeys = new KVCacheTensor<KVCacheT>[layers];
         this->cachedPrefixValues = new KVCacheTensor<KVCacheT>[layers];