Skip to content

Commit 8ce4186

Browse files
fixes and suggestions
- add hash function - propagate hash to sub-functions - fix access block bug (missing padding) - update tests - optimize bit functions - optimize wrapping loop - avoid static functions for cleanup - fix looping over pages in an access block
1 parent 1a94bac commit 8ce4186

File tree

7 files changed

+213
-104
lines changed

7 files changed

+213
-104
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,6 @@ source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources})
8787
target_compile_features(tests PRIVATE cxx_std_17)
8888
target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
8989
target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain)
90+
# emulate old behaviour to pass tests
91+
target_compile_definitions(tests PRIVATE OLD_BIT_SEARCH=1)
92+
target_compile_definitions(tests PRIVATE WAIST_FACTOR=1u)

src/include/mallocMC/creationPolicies/Scatter.hpp

Lines changed: 109 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf,
55
CERN
66
7-
Author(s): Julian Johannes Lenz
7+
Author(s): Julian Johannes Lenz, Rene Widera
88
99
Permission is hereby granted, free of charge, to any person obtaining a copy
1010
of this software and associated documentation files (the "Software"), to deal
@@ -30,10 +30,12 @@
3030
#include "mallocMC/auxiliary.hpp"
3131
#include "mallocMC/creationPolicies/Scatter/BitField.hpp"
3232
#include "mallocMC/creationPolicies/Scatter/DataPage.hpp"
33+
#include "mallocMC/creationPolicies/Scatter/Hash.hpp"
3334
#include "mallocMC/creationPolicies/Scatter/PageInterpretation.hpp"
3435
#include "mallocMC/mallocMC_utils.hpp"
3536

3637
#include <algorithm>
38+
#include <alpaka/alpaka.hpp>
3739
#include <alpaka/atomic/AtomicAtomicRef.hpp>
3840
#include <alpaka/core/Common.hpp>
3941
#include <alpaka/core/Positioning.hpp>
@@ -49,12 +51,11 @@
4951
#include <functional>
5052
#include <iterator>
5153
#include <numeric>
52-
#include <sys/types.h>
54+
#include <optional>
5355
#include <vector>
5456

5557
namespace mallocMC::CreationPolicies::ScatterAlloc
5658
{
57-
constexpr const uint32_t pageTableEntrySize = 4U + 4U;
5859

5960
template<size_t T_numPages>
6061
struct PageTable
@@ -69,7 +70,10 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
6970
public:
7071
ALPAKA_FN_ACC [[nodiscard]] constexpr static auto numPages() -> size_t
7172
{
72-
return T_blockSize / (T_pageSize + pageTableEntrySize);
73+
constexpr auto x = T_blockSize / (T_pageSize + sizeof(PageTable<1>));
74+
// check that the page table entries do not have any padding
75+
static_assert(sizeof(PageTable<x>) == x * sizeof(PageTable<1>));
76+
return x;
7377
}
7478

7579
ALPAKA_FN_ACC [[nodiscard]] auto getAvailableSlots(auto const& acc, uint32_t const chunkSize) -> size_t
@@ -104,8 +108,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
104108
: interpret(index, chunkSize).isValid(acc, pointer);
105109
}
106110

111+
//! @param hashValue hash used to select the page at which the search starts; it defaults to 0 so that tests do not have to pass a hash to every call
107112
template<typename TAcc>
108-
ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const numBytes) -> void*
113+
ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const numBytes, uint32_t const hashValue = 0u) -> void*
109114
{
110115
void* pointer{nullptr};
111116
if(numBytes >= multiPageThreshold())
@@ -114,7 +119,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
114119
}
115120
else
116121
{
117-
pointer = createChunk(acc, numBytes);
122+
pointer = createChunk(acc, numBytes, hashValue);
118123
}
119124
return pointer;
120125
}
@@ -123,9 +128,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
123128
ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* const pointer) -> void
124129
{
125130
auto const index = pageIndex(pointer);
126-
if(index > static_cast<ssize_t>(numPages()) || index < 0)
131+
if(index >= static_cast<ssize_t>(numPages()) || index < 0)
127132
{
128-
#ifndef NDEBUG
133+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
129134
throw std::runtime_error{
130135
"Attempted to destroy an invalid pointer! Pointer does not point to any page."};
131136
#endif // NDEBUG
@@ -145,6 +150,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
145150
private:
146151
DataPage<T_pageSize> pages[numPages()]{};
147152
PageTable<numPages()> pageTable{};
153+
char padding[T_blockSize - sizeof(DataPage<T_pageSize>) * numPages() - sizeof(PageTable<numPages()>)];
148154

149155
ALPAKA_FN_ACC constexpr static auto multiPageThreshold() -> uint32_t
150156
{
@@ -202,15 +208,20 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
202208
return numPages();
203209
}
204210

205-
ALPAKA_FN_ACC static auto startIndex(auto const& acc)
211+
ALPAKA_FN_ACC static auto startIndex(auto const& acc, uint32_t const hashValue)
206212
{
207-
return (laneid() * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum()) % numPages();
213+
return (hashValue >> 8u) % numPages();
214+
}
215+
216+
ALPAKA_FN_ACC bool isValidPageIdx(uint32_t const index) const
217+
{
218+
return index != noFreePageFound() && index < numPages();
208219
}
209220

210221
template<typename TAcc>
211-
ALPAKA_FN_ACC auto createChunk(TAcc const& acc, uint32_t const numBytes) -> void*
222+
ALPAKA_FN_ACC auto createChunk(TAcc const& acc, uint32_t const numBytes, uint32_t const hashValue) -> void*
212223
{
213-
auto index = startIndex(acc);
224+
auto index = startIndex(acc, hashValue);
214225

215226
// Under high pressure, this loop could potentially run for a long time because the information where and
216227
// when we started our search is not maintained and/or used. This is a feature, not a bug: Given a
@@ -223,42 +234,62 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
223234
//
224235
// In the latter case, it is considered desirable to wrap around multiple times until the thread was fast
225236
// enough to acquire some memory.
226-
index = choosePage(acc, numBytes, index);
227-
void* pointer = index != noFreePageFound()
228-
? PageInterpretation<T_pageSize>{pages[index], numBytes}.create(acc)
229-
: nullptr;
230-
231-
while(index != noFreePageFound() and pointer == nullptr)
237+
void* pointer = nullptr;
238+
do
232239
{
233-
leavePage(acc, index);
234-
++index;
235-
index = choosePage(acc, numBytes, index);
236-
pointer = PageInterpretation<T_pageSize>{pages[index], numBytes}.create(acc);
237-
}
238-
240+
index = (index + 1) % numPages();
241+
uint32_t chunkSize = numBytes;
242+
index = choosePage(acc, numBytes, chunkSize, index);
243+
if(isValidPageIdx(index))
244+
{
245+
pointer = PageInterpretation<T_pageSize>{pages[index], chunkSize}.create(acc, hashValue);
246+
if(pointer == nullptr)
247+
leavePage(acc, index);
248+
}
249+
} while(isValidPageIdx(index) and pointer == nullptr);
239250
return pointer;
240251
}
241252

242253
template<typename TAcc>
243-
ALPAKA_FN_ACC auto choosePage(TAcc const& acc, uint32_t const numBytes, size_t const startIndex = 0) -> size_t
254+
ALPAKA_FN_ACC auto choosePage(
255+
TAcc const& acc,
256+
uint32_t const numBytes,
257+
uint32_t& chunkSize,
258+
size_t const startIndex = 0) -> size_t
244259
{
245260
return wrappingLoop(
246261
acc,
247262
startIndex,
248263
numPages(),
249264
noFreePageFound(),
250-
[this, numBytes](auto const& localAcc, auto const index)
251-
{ return thisPageIsAppropriate(localAcc, index, numBytes) ? index : noFreePageFound(); });
265+
[this, numBytes, &chunkSize](auto const& localAcc, auto const index)
266+
{ return thisPageIsAppropriate(localAcc, index, numBytes, chunkSize) ? index : noFreePageFound(); });
252267
}
253268

269+
270+
#ifndef WAIST_FACTOR
271+
# define WAIST_FACTOR 2u
272+
#endif
254273
template<typename TAcc>
255-
ALPAKA_FN_ACC auto thisPageIsAppropriate(TAcc const& acc, size_t const index, uint32_t const numBytes) -> bool
274+
ALPAKA_FN_ACC auto thisPageIsAppropriate(
275+
TAcc const& acc,
276+
size_t const index,
277+
uint32_t const numBytes,
278+
uint32_t& chunkSize) -> bool
256279
{
257280
bool appropriate = false;
258281
if(enterPage(acc, index, numBytes))
259282
{
260283
auto oldChunkSize = atomicCAS(acc, pageTable._chunkSizes[index], 0U, numBytes);
284+
#if 0
261285
appropriate = (oldChunkSize == 0U || oldChunkSize == numBytes);
286+
chunkSize = std::max(oldChunkSize, numBytes);
287+
#else
288+
constexpr uint32_t waistFactor = WAIST_FACTOR;
289+
appropriate
290+
= (oldChunkSize == 0U || (oldChunkSize >= numBytes && oldChunkSize <= numBytes * waistFactor));
291+
chunkSize = std::max(oldChunkSize, numBytes);
292+
#endif
262293
}
263294
if(not appropriate)
264295
{
@@ -276,13 +307,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
276307
// Using T_pageSize/2 as chunkSize when entering the page means that the page reports to have at most one
277308
// chunk available.
278309
auto dummyChunkSize = T_pageSize / 2;
310+
uint32_t chunkSize = numBytes;
279311
for(size_t firstIndex = 0; firstIndex < numPages() - (numPagesNeeded - 1) and result == nullptr;
280312
++firstIndex)
281313
{
282314
size_t numPagesAcquired{};
283315
for(numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
284316
{
285-
if(not thisPageIsAppropriate(acc, firstIndex + numPagesAcquired, dummyChunkSize))
317+
if(not thisPageIsAppropriate(acc, firstIndex + numPagesAcquired, dummyChunkSize, chunkSize))
286318
{
287319
for(size_t cleanupIndex = numPagesAcquired; cleanupIndex > 0; --cleanupIndex)
288320
{
@@ -297,15 +329,15 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
297329
// still have to replace the dummy chunkSize with the real one.
298330
for(numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
299331
{
300-
#ifndef NDEBUG
332+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
301333
auto oldChunkSize =
302334
#endif
303335
atomicCAS(
304336
acc,
305337
pageTable._chunkSizes[firstIndex + numPagesAcquired],
306338
T_pageSize / 2,
307339
numBytes);
308-
#ifndef NDEBUG
340+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
309341
if(oldChunkSize != dummyChunkSize)
310342
{
311343
throw std::runtime_error{"Unexpected intermediate chunkSize in multi-page allocation."};
@@ -364,7 +396,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
364396
// Furthermore, chunkSize cannot have changed because we maintain the invariant that the
365397
// filling level is always considered first, so no other thread can have passed that barrier to
366398
// reset it.
367-
PageInterpretation<T_pageSize>{pages[pageIndex], chunkSize}.cleanup();
399+
PageInterpretation<T_pageSize>{pages[pageIndex], 1u}.resetBitfields();
368400
alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
369401

370402
// It is important to keep this after the clean-up line above: Otherwise another thread with a
@@ -398,9 +430,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
398430
{
399431
size_t heapSize{};
400432
AccessBlock<T_blockSize, T_pageSize>* accessBlocks{};
433+
volatile uint32_t block = 0u;
401434

402435
ALPAKA_FN_ACC [[nodiscard]] auto numBlocks() const -> size_t
403436
{
437+
// Guarantee that each access block start address is aligned.
438+
static_assert(
439+
T_blockSize == sizeof(AccessBlock<T_blockSize, T_pageSize>),
440+
"accessblock should equal to the use given block size to have a guaranteed alignment for pointers.");
404441
return heapSize / T_blockSize;
405442
}
406443

@@ -409,27 +446,59 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
409446
return numBlocks();
410447
}
411448

412-
ALPAKA_FN_ACC auto startIndex(auto const& acc, uint32_t const numBytes) const
449+
ALPAKA_FN_ACC auto hash(auto const& acc, uint32_t const numBytes) const
450+
{
451+
constexpr uint32_t hashingK = 38183u;
452+
constexpr uint32_t hashingDistMP = 17497u;
453+
constexpr uint32_t hashingDistWP = 1u;
454+
constexpr uint32_t hashingDistWPRel = 1u;
455+
456+
const uint32_t numpages = AccessBlock<T_blockSize, T_pageSize>::numPages();
457+
const uint32_t pagesperblock = numpages / numBlocks();
458+
const uint32_t reloff = warpSize * numBytes / T_pageSize;
459+
const uint32_t hash
460+
= (numBytes * hashingK + hashingDistMP * smid()
461+
+ (hashingDistWP + hashingDistWPRel * reloff) * warpid());
462+
return hash;
463+
}
464+
465+
ALPAKA_FN_ACC auto startIndex(auto const&, uint32_t const blockValue, uint32_t const hashValue)
413466
{
414-
return (numBytes * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum()) % numBlocks();
467+
#if 1
468+
constexpr uint32_t blockStride = 4;
469+
return ((hashValue % blockStride) + (blockValue * blockStride)) % numBlocks();
470+
#else
471+
return (block + hashValue) % numBlocks();
472+
#endif
415473
}
416474

417475
template<typename AlignmentPolicy, typename AlpakaAcc>
418-
ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32_t bytes) -> void*
476+
ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32_t const bytes) -> void*
419477
{
478+
auto blockValue = block;
479+
auto hashValue = hash(acc, bytes);
480+
auto startIdx = startIndex(acc, blockValue, hashValue);
420481
return wrappingLoop(
421482
acc,
422-
startIndex(acc, bytes),
483+
startIdx,
423484
numBlocks(),
424485
static_cast<void*>(nullptr),
425-
[this, bytes](auto const& localAcc, auto const index)
426-
{ return accessBlocks[index].create(localAcc, bytes); });
486+
[this, bytes, &acc, startIdx, &hashValue, blockValue](auto const& localAcc, auto const index) mutable
487+
{
488+
auto ptr = accessBlocks[index].create(localAcc, bytes, hashValue);
489+
if(!ptr && index == startIdx)
490+
if(blockValue == block)
491+
block = blockValue + 1;
492+
return ptr;
493+
});
427494
}
428495

429496
template<typename AlpakaAcc>
430497
ALPAKA_FN_ACC auto destroy(const AlpakaAcc& acc, void* pointer) -> void
431498
{
432-
auto blockIndex = indexOf(pointer, accessBlocks, T_blockSize);
499+
// indexOf requires the access block size instead of T_blockSize in case the reinterpreted AccessBlock
500+
// object is smaller than T_blockSize.
501+
auto blockIndex = indexOf(pointer, accessBlocks, sizeof(AccessBlock<T_blockSize, T_pageSize>));
433502
accessBlocks[blockIndex].destroy(acc, pointer);
434503
}
435504
};
@@ -454,7 +523,7 @@ struct InitKernel
454523
namespace mallocMC::CreationPolicies
455524
{
456525

457-
template<typename T_HeapConfig, typename T_HashConfig>
526+
template<typename T_HeapConfig, typename T_HashConfig = void>
458527
struct Scatter : public ScatterAlloc::Heap<T_HeapConfig::accessblocksize, T_HeapConfig::pagesize>
459528
{
460529
static_assert(T_HeapConfig::resetfreedpages, "resetfreedpages = false is no longer implemented.");

0 commit comments

Comments
 (0)