Skip to content

Commit 889d507

Browse files
fixes and suggestions
- add hash function - propagate hash to sub-functions - fix access block bug (missing padding) - update tests - optimize bit functions - optimize wrapping loop - avoid static functions for cleanup - fix looping over pages in a accessblock
1 parent 1a94bac commit 889d507

File tree

7 files changed

+210
-104
lines changed

7 files changed

+210
-104
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,6 @@ source_group(TREE "${CMAKE_CURRENT_LIST_DIR}/tests" FILES ${testSources})
8787
target_compile_features(tests PRIVATE cxx_std_17)
8888
target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
8989
target_link_libraries(tests PRIVATE alpaka::alpaka Catch2::Catch2WithMain)
90+
# emulate old behaviour to pass tests
91+
target_compile_definitions(tests PRIVATE OLD_BIT_SEARCH=1)
92+
target_compile_definitions(tests PRIVATE WAIST_FACTOR=1u)

src/include/mallocMC/creationPolicies/Scatter.hpp

Lines changed: 108 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf,
55
CERN
66
7-
Author(s): Julian Johannes Lenz
7+
Author(s): Julian Johannes Lenz, Rene Widera
88
99
Permission is hereby granted, free of charge, to any person obtaining a copy
1010
of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,7 @@
3434
#include "mallocMC/mallocMC_utils.hpp"
3535

3636
#include <algorithm>
37+
#include <alpaka/alpaka.hpp>
3738
#include <alpaka/atomic/AtomicAtomicRef.hpp>
3839
#include <alpaka/core/Common.hpp>
3940
#include <alpaka/core/Positioning.hpp>
@@ -49,12 +50,11 @@
4950
#include <functional>
5051
#include <iterator>
5152
#include <numeric>
52-
#include <sys/types.h>
53+
#include <optional>
5354
#include <vector>
5455

5556
namespace mallocMC::CreationPolicies::ScatterAlloc
5657
{
57-
constexpr const uint32_t pageTableEntrySize = 4U + 4U;
5858

5959
template<size_t T_numPages>
6060
struct PageTable
@@ -69,7 +69,10 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
6969
public:
7070
ALPAKA_FN_ACC [[nodiscard]] constexpr static auto numPages() -> size_t
7171
{
72-
return T_blockSize / (T_pageSize + pageTableEntrySize);
72+
constexpr auto x = T_blockSize / (T_pageSize + sizeof(PageTable<1>));
73+
// check that the page table entries does not have a padding
74+
static_assert(sizeof(PageTable<x>) == x * sizeof(PageTable<1>));
75+
return x;
7376
}
7477

7578
ALPAKA_FN_ACC [[nodiscard]] auto getAvailableSlots(auto const& acc, uint32_t const chunkSize) -> size_t
@@ -104,8 +107,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
104107
: interpret(index, chunkSize).isValid(acc, pointer);
105108
}
106109

110+
//! @param hashValue the default makes testing easier because we can avoid adding the hash to each call^^
107111
template<typename TAcc>
108-
ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const numBytes) -> void*
112+
ALPAKA_FN_ACC auto create(TAcc const& acc, uint32_t const numBytes, uint32_t const hashValue = 0u) -> void*
109113
{
110114
void* pointer{nullptr};
111115
if(numBytes >= multiPageThreshold())
@@ -114,7 +118,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
114118
}
115119
else
116120
{
117-
pointer = createChunk(acc, numBytes);
121+
pointer = createChunk(acc, numBytes, hashValue);
118122
}
119123
return pointer;
120124
}
@@ -123,9 +127,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
123127
ALPAKA_FN_ACC auto destroy(TAcc const& acc, void* const pointer) -> void
124128
{
125129
auto const index = pageIndex(pointer);
126-
if(index > static_cast<ssize_t>(numPages()) || index < 0)
130+
if(index >= static_cast<ssize_t>(numPages()) || index < 0)
127131
{
128-
#ifndef NDEBUG
132+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
129133
throw std::runtime_error{
130134
"Attempted to destroy an invalid pointer! Pointer does not point to any page."};
131135
#endif // NDEBUG
@@ -145,6 +149,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
145149
private:
146150
DataPage<T_pageSize> pages[numPages()]{};
147151
PageTable<numPages()> pageTable{};
152+
char padding[T_blockSize - sizeof(DataPage<T_pageSize>) * numPages() - sizeof(PageTable<numPages()>)];
148153

149154
ALPAKA_FN_ACC constexpr static auto multiPageThreshold() -> uint32_t
150155
{
@@ -202,15 +207,20 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
202207
return numPages();
203208
}
204209

205-
ALPAKA_FN_ACC static auto startIndex(auto const& acc)
210+
ALPAKA_FN_ACC static auto startIndex(auto const& acc, uint32_t const hashValue)
206211
{
207-
return (laneid() * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum()) % numPages();
212+
return (hashValue >> 8u) % numPages();
213+
}
214+
215+
ALPAKA_FN_ACC bool isValidPageIdx(uint32_t const index) const
216+
{
217+
return index != noFreePageFound() && index < numPages();
208218
}
209219

210220
template<typename TAcc>
211-
ALPAKA_FN_ACC auto createChunk(TAcc const& acc, uint32_t const numBytes) -> void*
221+
ALPAKA_FN_ACC auto createChunk(TAcc const& acc, uint32_t const numBytes, uint32_t const hashValue) -> void*
212222
{
213-
auto index = startIndex(acc);
223+
auto index = startIndex(acc, hashValue);
214224

215225
// Under high pressure, this loop could potentially run for a long time because the information where and
216226
// when we started our search is not maintained and/or used. This is a feature, not a bug: Given a
@@ -223,42 +233,62 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
223233
//
224234
// In the latter case, it is considered desirable to wrap around multiple times until the thread was fast
225235
// enough to acquire some memory.
226-
index = choosePage(acc, numBytes, index);
227-
void* pointer = index != noFreePageFound()
228-
? PageInterpretation<T_pageSize>{pages[index], numBytes}.create(acc)
229-
: nullptr;
230-
231-
while(index != noFreePageFound() and pointer == nullptr)
236+
void* pointer = nullptr;
237+
do
232238
{
233-
leavePage(acc, index);
234-
++index;
235-
index = choosePage(acc, numBytes, index);
236-
pointer = PageInterpretation<T_pageSize>{pages[index], numBytes}.create(acc);
237-
}
238-
239+
index = (index + 1) % numPages();
240+
uint32_t chunkSize = numBytes;
241+
index = choosePage(acc, numBytes, chunkSize, index);
242+
if(isValidPageIdx(index))
243+
{
244+
pointer = PageInterpretation<T_pageSize>{pages[index], chunkSize}.create(acc, hashValue);
245+
if(pointer == nullptr)
246+
leavePage(acc, index);
247+
}
248+
} while(isValidPageIdx(index) and pointer == nullptr);
239249
return pointer;
240250
}
241251

242252
template<typename TAcc>
243-
ALPAKA_FN_ACC auto choosePage(TAcc const& acc, uint32_t const numBytes, size_t const startIndex = 0) -> size_t
253+
ALPAKA_FN_ACC auto choosePage(
254+
TAcc const& acc,
255+
uint32_t const numBytes,
256+
uint32_t& chunkSize,
257+
size_t const startIndex = 0) -> size_t
244258
{
245259
return wrappingLoop(
246260
acc,
247261
startIndex,
248262
numPages(),
249263
noFreePageFound(),
250-
[this, numBytes](auto const& localAcc, auto const index)
251-
{ return thisPageIsAppropriate(localAcc, index, numBytes) ? index : noFreePageFound(); });
264+
[this, numBytes, &chunkSize](auto const& localAcc, auto const index)
265+
{ return thisPageIsAppropriate(localAcc, index, numBytes, chunkSize) ? index : noFreePageFound(); });
252266
}
253267

268+
269+
#ifndef WAIST_FACTOR
270+
# define WAIST_FACTOR 2u
271+
#endif
254272
template<typename TAcc>
255-
ALPAKA_FN_ACC auto thisPageIsAppropriate(TAcc const& acc, size_t const index, uint32_t const numBytes) -> bool
273+
ALPAKA_FN_ACC auto thisPageIsAppropriate(
274+
TAcc const& acc,
275+
size_t const index,
276+
uint32_t const numBytes,
277+
uint32_t& chunkSize) -> bool
256278
{
257279
bool appropriate = false;
258280
if(enterPage(acc, index, numBytes))
259281
{
260282
auto oldChunkSize = atomicCAS(acc, pageTable._chunkSizes[index], 0U, numBytes);
283+
#if 0
261284
appropriate = (oldChunkSize == 0U || oldChunkSize == numBytes);
285+
chunkSize = std::max(oldChunkSize, numBytes);
286+
#else
287+
constexpr uint32_t waistFactor = WAIST_FACTOR;
288+
appropriate
289+
= (oldChunkSize == 0U || (oldChunkSize >= numBytes && oldChunkSize <= numBytes * waistFactor));
290+
chunkSize = std::max(oldChunkSize, numBytes);
291+
#endif
262292
}
263293
if(not appropriate)
264294
{
@@ -276,13 +306,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
276306
// Using T_pageSize/2 as chunkSize when entering the page means that the page reports to have at most one
277307
// chunk available.
278308
auto dummyChunkSize = T_pageSize / 2;
309+
uint32_t chunkSize = numBytes;
279310
for(size_t firstIndex = 0; firstIndex < numPages() - (numPagesNeeded - 1) and result == nullptr;
280311
++firstIndex)
281312
{
282313
size_t numPagesAcquired{};
283314
for(numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
284315
{
285-
if(not thisPageIsAppropriate(acc, firstIndex + numPagesAcquired, dummyChunkSize))
316+
if(not thisPageIsAppropriate(acc, firstIndex + numPagesAcquired, dummyChunkSize, chunkSize))
286317
{
287318
for(size_t cleanupIndex = numPagesAcquired; cleanupIndex > 0; --cleanupIndex)
288319
{
@@ -297,15 +328,15 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
297328
// still have to replace the dummy chunkSize with the real one.
298329
for(numPagesAcquired = 0U; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
299330
{
300-
#ifndef NDEBUG
331+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
301332
auto oldChunkSize =
302333
#endif
303334
atomicCAS(
304335
acc,
305336
pageTable._chunkSizes[firstIndex + numPagesAcquired],
306337
T_pageSize / 2,
307338
numBytes);
308-
#ifndef NDEBUG
339+
#if(!defined(NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
309340
if(oldChunkSize != dummyChunkSize)
310341
{
311342
throw std::runtime_error{"Unexpected intermediate chunkSize in multi-page allocation."};
@@ -364,7 +395,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
364395
// Furthermore, chunkSize cannot have changed because we maintain the invariant that the
365396
// filling level is always considered first, so no other thread can have passed that barrier to
366397
// reset it.
367-
PageInterpretation<T_pageSize>{pages[pageIndex], chunkSize}.cleanup();
398+
PageInterpretation<T_pageSize>{pages[pageIndex], 1u}.resetBitfields();
368399
alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
369400

370401
// It is important to keep this after the clean-up line above: Otherwise another thread with a
@@ -398,9 +429,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
398429
{
399430
size_t heapSize{};
400431
AccessBlock<T_blockSize, T_pageSize>* accessBlocks{};
432+
volatile uint32_t block = 0u;
401433

402434
ALPAKA_FN_ACC [[nodiscard]] auto numBlocks() const -> size_t
403435
{
436+
// Guarantee that each access block start address is aligned.
437+
static_assert(
438+
T_blockSize == sizeof(AccessBlock<T_blockSize, T_pageSize>),
439+
"accessblock should equal to the use given block size to have a guaranteed alignment for pointers.");
404440
return heapSize / T_blockSize;
405441
}
406442

@@ -409,27 +445,59 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
409445
return numBlocks();
410446
}
411447

412-
ALPAKA_FN_ACC auto startIndex(auto const& acc, uint32_t const numBytes) const
448+
ALPAKA_FN_ACC auto hash(auto const& acc, uint32_t const numBytes) const
449+
{
450+
constexpr uint32_t hashingK = 38183u;
451+
constexpr uint32_t hashingDistMP = 17497u;
452+
constexpr uint32_t hashingDistWP = 1u;
453+
constexpr uint32_t hashingDistWPRel = 1u;
454+
455+
const uint32_t numpages = AccessBlock<T_blockSize, T_pageSize>::numPages();
456+
const uint32_t pagesperblock = numpages / numBlocks();
457+
const uint32_t reloff = warpSize * numBytes / T_pageSize;
458+
const uint32_t hash
459+
= (numBytes * hashingK + hashingDistMP * smid()
460+
+ (hashingDistWP + hashingDistWPRel * reloff) * warpid());
461+
return hash;
462+
}
463+
464+
ALPAKA_FN_ACC auto startIndex(auto const&, uint32_t const blockValue, uint32_t const hashValue)
413465
{
414-
return (numBytes * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum()) % numBlocks();
466+
#if 1
467+
constexpr uint32_t blockStride = 4;
468+
return ((hashValue % blockStride) + (blockValue * blockStride)) % numBlocks();
469+
#else
470+
return (block + hashValue) % numBlocks();
471+
#endif
415472
}
416473

417474
template<typename AlignmentPolicy, typename AlpakaAcc>
418-
ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32_t bytes) -> void*
475+
ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32_t const bytes) -> void*
419476
{
477+
auto blockValue = block;
478+
auto hashValue = hash(acc, bytes);
479+
auto startIdx = startIndex(acc, blockValue, hashValue);
420480
return wrappingLoop(
421481
acc,
422-
startIndex(acc, bytes),
482+
startIdx,
423483
numBlocks(),
424484
static_cast<void*>(nullptr),
425-
[this, bytes](auto const& localAcc, auto const index)
426-
{ return accessBlocks[index].create(localAcc, bytes); });
485+
[this, bytes, &acc, startIdx, &hashValue, blockValue](auto const& localAcc, auto const index) mutable
486+
{
487+
auto ptr = accessBlocks[index].create(localAcc, bytes, hashValue);
488+
if(!ptr && index == startIdx)
489+
if(blockValue == block)
490+
block = blockValue + 1;
491+
return ptr;
492+
});
427493
}
428494

429495
template<typename AlpakaAcc>
430496
ALPAKA_FN_ACC auto destroy(const AlpakaAcc& acc, void* pointer) -> void
431497
{
432-
auto blockIndex = indexOf(pointer, accessBlocks, T_blockSize);
498+
// indexOf requires the access block size instead of T_blockSize in case the reinterpreted AccessBlock
499+
// object is smaller than T_blockSize.
500+
auto blockIndex = indexOf(pointer, accessBlocks, sizeof(AccessBlock<T_blockSize, T_pageSize>));
433501
accessBlocks[blockIndex].destroy(acc, pointer);
434502
}
435503
};
@@ -454,7 +522,7 @@ struct InitKernel
454522
namespace mallocMC::CreationPolicies
455523
{
456524

457-
template<typename T_HeapConfig, typename T_HashConfig>
525+
template<typename T_HeapConfig, typename T_HashConfig = void>
458526
struct Scatter : public ScatterAlloc::Heap<T_HeapConfig::accessblocksize, T_HeapConfig::pagesize>
459527
{
460528
static_assert(T_HeapConfig::resetfreedpages, "resetfreedpages = false is no longer implemented.");

0 commit comments

Comments
 (0)