From e80f2666aba0f93da9270d33264711a06d145465 Mon Sep 17 00:00:00 2001 From: sgui Date: Fri, 28 Jun 2024 00:41:58 +0000 Subject: [PATCH 1/4] Add env param KV_CACHE_LOCATION to control kv cache memory NUMA node location Usage: before you run the instance, export KV_CACHE_LOCATION=#memory_numa_node_id_you_want_to_use_for_kv_cache By default, the kv_cache location is the same as the other parts of the instance. --- src/common/kvcache_mgr.h | 5 ++++- src/common/kvcache_tensor.h | 36 ++++++++++++++++++++++------------ src/models/kvcache_manager.cpp | 14 +++++++------ src/models/kvcache_manager.h | 1 + 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h index c2fa271b..8f91f799 100644 --- a/src/common/kvcache_mgr.h +++ b/src/common/kvcache_mgr.h @@ -41,6 +41,8 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase { this->headNum_ = headNum; this->headSize_ = headSize; this->layers_ = layers; + // The KV Cache location configured in "KV_CACHE_LOCATION" + this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1; } ~KVCacheMgrImpl() { @@ -89,7 +91,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase { // User specified maxSeqLen needs to be <= model's configured maxSeqLen auto maxLen = maxSeqLen > 0 ? 
std::min(maxSeqLen, maxSeqLen_) : maxSeqLen_; for (int i = 0; i < 2 * layers_; ++i) { - cache[i].resize(maxLen, 1, headNum_, headSize_); + cache[i].resize(maxLen, 1, headNum_, headSize_, this->allocNode); } sequenceCaches.insert({seqID, cache}); @@ -186,6 +188,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase { int headNum_; int headSize_; int layers_; + int allocNode; }; class KVCacheMgr { diff --git a/src/common/kvcache_tensor.h b/src/common/kvcache_tensor.h index 438bd1f9..a37d7ade 100644 --- a/src/common/kvcache_tensor.h +++ b/src/common/kvcache_tensor.h @@ -24,6 +24,7 @@ #include "allocator.h" #include "bfloat16.h" #include "float16.h" +#include "numa_allocator.h" extern bool kvTrans(); @@ -67,23 +68,23 @@ template class KVCacheTensor { public: KVCacheTensor() - : maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr) {} + : maxSeqLen(0), batchSize(0), headNum(0), headSize(0), data(nullptr), allocSize(0), scales(nullptr), scalesAllocSize(0) {} ~KVCacheTensor() { - if (this->data) { free(this->data); } - if (this->scales) { free(this->scales); } + if (this->data) { xft_numa_free(this->data, allocSize); } + if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); } } - void resize(int maxSeqLen, int batchSize, int headNum, int headSize) { + void resize(int maxSeqLen, int batchSize, int headNum, int headSize, int allocNode) { this->maxSeqLen = maxSeqLen; this->batchSize = batchSize; this->headNum = headNum; this->headSize = headSize; - uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize; + uint64_t requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * headSize * sizeof(T); if (requiredSize > allocSize) { - if (this->data) { free(this->data); } - this->data = (T *)xft::alloc(requiredSize * sizeof(T)); + if (this->data) { xft_numa_free(this->data, allocSize); } + this->data = (T *)xft_numa_alloc_onnode(requiredSize, allocNode); if (!this->data) { printf("Failed to alloc mem for KV 
Cache [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize); exit(-1); @@ -91,8 +92,16 @@ class KVCacheTensor { allocSize = requiredSize; } - if (this->scales) { free(this->scales); } - this->scales = (float *)xft::alloc((uint64_t)maxSeqLen * batchSize * headNum * sizeof(float)); + requiredSize = (uint64_t)maxSeqLen * batchSize * headNum * sizeof(float); + if (requiredSize > scalesAllocSize) { + if (this->scales) { xft_numa_free(this->scales, scalesAllocSize); } + this->scales = (float *)xft_numa_alloc_onnode(requiredSize, allocNode); + if (!this->scales) { + printf("Failed to alloc mem for KV Cache scales [%d][%d][%d][%d].\n", maxSeqLen, batchSize, headNum, headSize); + exit(-1); + } + scalesAllocSize = requiredSize; + } } int getBatchSize() const { return batchSize; } @@ -188,15 +197,15 @@ class KVCacheTensor { * initSeqLen: initial sequence length, which is the prompt token size * accSeqLen: accumulated sequence length */ - void reorder(int *idx, int size, int initSeqLen, int accSeqLen) { + void reorder(int *idx, int size, int initSeqLen, int accSeqLen, int allocNode) { const int cols = this->getHeadNum() * this->getHeadSize(); const int batchSize = this->getBatchSize(); T *pdata = this->data + initSeqLen * batchSize * cols; // Temporary buffer used for reorder - T *extraKeyBuf = (T *)xft::alloc((batchSize - 1) * cols * sizeof(T)); - + uint64_t requiredSize = (uint64_t)(batchSize - 1) * cols * sizeof(T); + T *extraKeyBuf = (T *)xft_numa_alloc_onnode(requiredSize, allocNode); for (int seq = initSeqLen; seq < accSeqLen; ++seq) { // Reorder is not needed for the first few lines int extraBufIdx = 0; int remapped[batchSize]; @@ -260,7 +269,7 @@ class KVCacheTensor { pdata += batchSize * cols; } - free(extraKeyBuf); + xft_numa_free(extraKeyBuf, requiredSize); } private: @@ -327,4 +336,5 @@ class KVCacheTensor { // The scale factor for each head (if T is int8) float *scales; + uint64_t scalesAllocSize; }; diff --git a/src/models/kvcache_manager.cpp 
b/src/models/kvcache_manager.cpp index 13ccec92..de324c03 100644 --- a/src/models/kvcache_manager.cpp +++ b/src/models/kvcache_manager.cpp @@ -23,17 +23,19 @@ template void KVCacheManager::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) { + // The KV Cache location configured in "KV_CACHE_LOCATION" + this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1; if (prefix && this->cachedPrefixKeys == nullptr) { this->cachedPrefixKeys = new KVCacheTensor[layers]; this->cachedPrefixValues = new KVCacheTensor[layers]; } for (int i = 0; i < this->layers; ++i) { if (prefix) { - this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize); - this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize); + this->cachedPrefixKeys[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode); + this->cachedPrefixValues[i].resize(maxSeqLen, 1, headsPerSplit, headSize, this->allocNode); } else { - this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize); - this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize); + this->cachedKeys[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode); + this->cachedValues[i].resize(maxSeqLen, batchSize, headsPerSplit, headSize, this->allocNode); } } } @@ -100,10 +102,10 @@ void KVCacheManager::reorderCache(int *idx, int size, int initSeqLen, int layer = i / 2; if (i % 2 == 0) { KVCacheTensor &keyTensor = this->getKey(layer); - keyTensor.reorder(idx, size, initSeqLen, accSeqLen); + keyTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode); } else { KVCacheTensor &valueTensor = this->getValue(layer); - valueTensor.reorder(idx, size, initSeqLen, accSeqLen); + valueTensor.reorder(idx, size, initSeqLen, accSeqLen, this->allocNode); } } } diff --git a/src/models/kvcache_manager.h b/src/models/kvcache_manager.h index 6f593029..430e88a5 100644 --- a/src/models/kvcache_manager.h +++ 
b/src/models/kvcache_manager.h @@ -69,6 +69,7 @@ class KVCacheManager { void reorderCache(int *idx, int size, int initSeqLen, int accSeqLen); private: + int allocNode; int layers; // how many layers KVCacheTensor *cachedKeys; // all accumulated keys KVCacheTensor *cachedValues; // all accumulated values From c720606b5176ad4db16aeb8df552570d51a0de53 Mon Sep 17 00:00:00 2001 From: sgui Date: Tue, 2 Jul 2024 13:46:33 +0000 Subject: [PATCH 2/4] move env KV_CACHE_LOCATION init & get into environment.h --- src/common/kvcache_mgr.h | 7 ++++--- src/models/kvcache_manager.cpp | 5 +++-- src/utils/environment.h | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h index 8f91f799..ed617348 100644 --- a/src/common/kvcache_mgr.h +++ b/src/common/kvcache_mgr.h @@ -13,10 +13,11 @@ // limitations under the License. // ============================================================================ #pragma once - +#include #include + +#include "environment.h" #include "kvcache_tensor.h" -#include namespace xft { @@ -42,7 +43,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase { this->headSize_ = headSize; this->layers_ = layers; // The KV Cache location configured in "KV_CACHE_LOCATION" - this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1; + this->allocNode = Env::getInstance().getPrimitiveCacheM(); } ~KVCacheMgrImpl() { diff --git a/src/models/kvcache_manager.cpp b/src/models/kvcache_manager.cpp index de324c03..eafd5dba 100644 --- a/src/models/kvcache_manager.cpp +++ b/src/models/kvcache_manager.cpp @@ -12,19 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================ -#include "kvcache_manager.h" #include #include #include #include #include "allocator.h" #include "bfloat16.h" +#include "environment.h" #include "float16.h" +#include "kvcache_manager.h" template void KVCacheManager::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) { // The KV Cache location configured in "KV_CACHE_LOCATION" - this->allocNode = getenv("KV_CACHE_LOCATION") ? atoi(getenv("KV_CACHE_LOCATION")) : -1; + this->allocNode = Env::getInstance().getPrimitiveCacheM(); if (prefix && this->cachedPrefixKeys == nullptr) { this->cachedPrefixKeys = new KVCacheTensor[layers]; this->cachedPrefixValues = new KVCacheTensor[layers]; diff --git a/src/utils/environment.h b/src/utils/environment.h index ddcb2df4..446b85a2 100644 --- a/src/utils/environment.h +++ b/src/utils/environment.h @@ -71,6 +71,9 @@ class Env { // get Primitive Cache M int getPrimitiveCacheM() { return primitiveCacheM; } + // get KV Cache Location + int getKVCacheLocation() { return primitiveCacheM; } + private: Env() { // init Verbose @@ -281,4 +284,16 @@ class Env { primitiveCacheM = 256; } } + + // KV_CACHE_LOCATION + int kvCacheLocation = -1; + void initKVCacheLocation() { + // The KV Cache location configured in "KV_CACHE_LOCATION" + char *xft_kvcache_location_value = getenv("KV_CACHE_LOCATION"); + if (xft_kvcache_location_value != NULL) { + int value = atoi(xft_kvcache_location_value); + if (value >= 0) + kvCacheLocation = value; + } + } }; \ No newline at end of file From 2b7d50fcd44afc74db6b434b0e1d016d7e257e88 Mon Sep 17 00:00:00 2001 From: sgui Date: Wed, 3 Jul 2024 02:42:51 +0000 Subject: [PATCH 3/4] Fix typo --- src/utils/environment.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils/environment.h b/src/utils/environment.h index 446b85a2..483d791a 100644 --- a/src/utils/environment.h +++ b/src/utils/environment.h @@ -72,7 +72,7 @@ class Env { int 
getPrimitiveCacheM() { return primitiveCacheM; } // get KV Cache Location - int getKVCacheLocation() { return primitiveCacheM; } + int getKVCacheLocation() { return kvCacheLocation; } private: Env() { @@ -114,6 +114,9 @@ class Env { // init Primitive Cache M initPrimitiveCacheM(); + + // init KV Cache Location + initKVCacheLocation(); } // Verbose From f0236e603b499833d32c189dfbdacc2b6b19cc03 Mon Sep 17 00:00:00 2001 From: sgui Date: Wed, 3 Jul 2024 10:23:00 +0000 Subject: [PATCH 4/4] Recheck the code --- src/common/kvcache_mgr.h | 2 +- src/models/kvcache_manager.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/kvcache_mgr.h b/src/common/kvcache_mgr.h index ed617348..ace2919d 100644 --- a/src/common/kvcache_mgr.h +++ b/src/common/kvcache_mgr.h @@ -43,7 +43,7 @@ class KVCacheMgrImpl : public KVCacheMgrImplBase { this->headSize_ = headSize; this->layers_ = layers; // The KV Cache location configured in "KV_CACHE_LOCATION" - this->allocNode = Env::getInstance().getPrimitiveCacheM(); + this->allocNode = Env::getInstance().getKVCacheLocation(); } ~KVCacheMgrImpl() { diff --git a/src/models/kvcache_manager.cpp b/src/models/kvcache_manager.cpp index eafd5dba..c1ad1ced 100644 --- a/src/models/kvcache_manager.cpp +++ b/src/models/kvcache_manager.cpp @@ -25,7 +25,7 @@ template void KVCacheManager::resize(int maxSeqLen, int batchSize, int headsPerSplit, int headSize, bool prefix) { // The KV Cache location configured in "KV_CACHE_LOCATION" - this->allocNode = Env::getInstance().getPrimitiveCacheM(); + this->allocNode = Env::getInstance().getKVCacheLocation(); if (prefix && this->cachedPrefixKeys == nullptr) { this->cachedPrefixKeys = new KVCacheTensor[layers]; this->cachedPrefixValues = new KVCacheTensor[layers];