4
4
Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf,
5
5
CERN
6
6
7
- Author(s): Julian Johannes Lenz
7
+ Author(s): Julian Johannes Lenz, Rene Widera
8
8
9
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
10
of this software and associated documentation files (the "Software"), to deal
30
30
#include " mallocMC/auxiliary.hpp"
31
31
#include " mallocMC/creationPolicies/Scatter/BitField.hpp"
32
32
#include " mallocMC/creationPolicies/Scatter/DataPage.hpp"
33
+ #include " mallocMC/creationPolicies/Scatter/Hash.hpp"
33
34
#include " mallocMC/creationPolicies/Scatter/PageInterpretation.hpp"
34
35
#include " mallocMC/mallocMC_utils.hpp"
35
36
36
37
#include < algorithm>
38
+ #include < alpaka/alpaka.hpp>
37
39
#include < alpaka/atomic/AtomicAtomicRef.hpp>
38
40
#include < alpaka/core/Common.hpp>
39
41
#include < alpaka/core/Positioning.hpp>
49
51
#include < functional>
50
52
#include < iterator>
51
53
#include < numeric>
52
- #include < sys/types.h >
54
+ #include < optional >
53
55
#include < vector>
54
56
55
57
namespace mallocMC ::CreationPolicies::ScatterAlloc
56
58
{
57
- constexpr const uint32_t pageTableEntrySize = 4U + 4U ;
58
59
59
60
template <size_t T_numPages>
60
61
struct PageTable
@@ -69,7 +70,10 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
69
70
public:
70
71
/// Number of data pages that fit into one access block together with their page table entries.
ALPAKA_FN_ACC [[nodiscard]] constexpr static auto numPages() -> size_t
{
    // Every page costs its payload plus the size of a single-entry page table.
    constexpr auto bytesPerPage = T_pageSize + sizeof(PageTable<1>);
    constexpr auto pageCount = T_blockSize / bytesPerPage;
    // The estimate above is only valid if the page table grows linearly with the number of pages,
    // i.e. if PageTable<N> contains no padding.
    static_assert(sizeof(PageTable<pageCount>) == pageCount * sizeof(PageTable<1>));
    return pageCount;
}
74
78
75
79
ALPAKA_FN_ACC [[nodiscard]] auto getAvailableSlots (auto const & acc, uint32_t const chunkSize) -> size_t
@@ -104,8 +108,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
104
108
: interpret (index, chunkSize).isValid (acc, pointer);
105
109
}
106
110
111
+ //! @param hashValue Forwarded to createChunk(); it defaults to 0 so that tests can omit it.
107
112
template <typename TAcc>
108
- ALPAKA_FN_ACC auto create (TAcc const & acc, uint32_t const numBytes) -> void*
113
+ ALPAKA_FN_ACC auto create (TAcc const & acc, uint32_t const numBytes, uint32_t const hashValue = 0u ) -> void*
109
114
{
110
115
void * pointer{nullptr };
111
116
if (numBytes >= multiPageThreshold ())
@@ -114,7 +119,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
114
119
}
115
120
else
116
121
{
117
- pointer = createChunk (acc, numBytes);
122
+ pointer = createChunk (acc, numBytes, hashValue );
118
123
}
119
124
return pointer;
120
125
}
@@ -123,9 +128,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
123
128
ALPAKA_FN_ACC auto destroy (TAcc const & acc, void * const pointer) -> void
124
129
{
125
130
auto const index = pageIndex (pointer);
126
- if (index > static_cast <ssize_t >(numPages ()) || index < 0 )
131
+ if (index >= static_cast <ssize_t >(numPages ()) || index < 0 )
127
132
{
128
- #ifndef NDEBUG
133
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
129
134
throw std::runtime_error{
130
135
" Attempted to destroy an invalid pointer! Pointer does not point to any page." };
131
136
#endif // NDEBUG
@@ -145,6 +150,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
145
150
private:
146
151
DataPage<T_pageSize> pages[numPages()]{};
147
152
PageTable<numPages()> pageTable{};
153
+ char padding[T_blockSize - sizeof (DataPage<T_pageSize>) * numPages () - sizeof(PageTable<numPages()>)];
148
154
149
155
ALPAKA_FN_ACC constexpr static auto multiPageThreshold () -> uint32_t
150
156
{
@@ -202,15 +208,20 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
202
208
return numPages ();
203
209
}
204
210
205
- ALPAKA_FN_ACC static auto startIndex (auto const & acc)
211
+ ALPAKA_FN_ACC static auto startIndex (auto const & acc, uint32_t const hashValue )
206
212
{
207
- return (laneid () * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum ()) % numPages ();
213
+ return (hashValue >> 8u ) % numPages ();
214
+ }
215
+
216
+ ALPAKA_FN_ACC bool isValidPageIdx (uint32_t const index) const
217
+ {
218
+ return index != noFreePageFound () && index < numPages ();
208
219
}
209
220
210
221
template <typename TAcc>
211
- ALPAKA_FN_ACC auto createChunk (TAcc const & acc, uint32_t const numBytes) -> void*
222
+ ALPAKA_FN_ACC auto createChunk (TAcc const & acc, uint32_t const numBytes, uint32_t const hashValue ) -> void*
212
223
{
213
- auto index = startIndex (acc);
224
+ auto index = startIndex (acc, hashValue );
214
225
215
226
// Under high pressure, this loop could potentially run for a long time because the information where and
216
227
// when we started our search is not maintained and/or used. This is a feature, not a bug: Given a
@@ -223,42 +234,62 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
223
234
//
224
235
// In the latter case, it is considered desirable to wrap around multiple times until the thread was fast
225
236
// enough to acquire some memory.
226
- index = choosePage (acc, numBytes, index);
227
- void * pointer = index != noFreePageFound ()
228
- ? PageInterpretation<T_pageSize>{pages[index], numBytes}.create (acc)
229
- : nullptr ;
230
-
231
- while (index != noFreePageFound () and pointer == nullptr )
237
+ void * pointer = nullptr ;
238
+ do
232
239
{
233
- leavePage (acc, index);
234
- ++index;
235
- index = choosePage (acc, numBytes, index);
236
- pointer = PageInterpretation<T_pageSize>{pages[index], numBytes}.create (acc);
237
- }
238
-
240
+ index = (index + 1 ) % numPages ();
241
+ uint32_t chunkSize = numBytes;
242
+ index = choosePage (acc, numBytes, chunkSize, index);
243
+ if (isValidPageIdx (index))
244
+ {
245
+ pointer = PageInterpretation<T_pageSize>{pages[index], chunkSize}.create (acc, hashValue);
246
+ if (pointer == nullptr )
247
+ leavePage (acc, index);
248
+ }
249
+ } while (isValidPageIdx (index) and pointer == nullptr );
239
250
return pointer;
240
251
}
241
252
242
253
template <typename TAcc>
243
- ALPAKA_FN_ACC auto choosePage (TAcc const & acc, uint32_t const numBytes, size_t const startIndex = 0 ) -> size_t
254
+ ALPAKA_FN_ACC auto choosePage (
255
+ TAcc const & acc,
256
+ uint32_t const numBytes,
257
+ uint32_t & chunkSize,
258
+ size_t const startIndex = 0 ) -> size_t
244
259
{
245
260
return wrappingLoop (
246
261
acc,
247
262
startIndex,
248
263
numPages (),
249
264
noFreePageFound (),
250
- [this , numBytes](auto const & localAcc, auto const index)
251
- { return thisPageIsAppropriate (localAcc, index, numBytes) ? index : noFreePageFound (); });
265
+ [this , numBytes, &chunkSize ](auto const & localAcc, auto const index)
266
+ { return thisPageIsAppropriate (localAcc, index, numBytes, chunkSize ) ? index : noFreePageFound (); });
252
267
}
253
268
269
+
270
// WAIST_FACTOR limits how much larger than the requested size a page's existing chunk size may be
// for thisPageIsAppropriate() to still accept that page
// (oldChunkSize >= numBytes && oldChunkSize <= numBytes * WAIST_FACTOR).
// It can be overridden by defining the macro before this point; the default is 2.
// NOTE(review): the name presumably means "waste factor" -- confirm before renaming.
+ #ifndef WAIST_FACTOR
271
+ # define WAIST_FACTOR 2u
272
+ #endif
254
273
template <typename TAcc>
255
- ALPAKA_FN_ACC auto thisPageIsAppropriate (TAcc const & acc, size_t const index, uint32_t const numBytes) -> bool
274
+ ALPAKA_FN_ACC auto thisPageIsAppropriate (
275
+ TAcc const & acc,
276
+ size_t const index,
277
+ uint32_t const numBytes,
278
+ uint32_t & chunkSize) -> bool
256
279
{
257
280
bool appropriate = false ;
258
281
if (enterPage (acc, index, numBytes))
259
282
{
260
283
auto oldChunkSize = atomicCAS (acc, pageTable._chunkSizes [index], 0U , numBytes);
284
+ #if 0
261
285
appropriate = (oldChunkSize == 0U || oldChunkSize == numBytes);
286
+ chunkSize = std::max(oldChunkSize, numBytes);
287
+ #else
288
+ constexpr uint32_t waistFactor = WAIST_FACTOR;
289
+ appropriate
290
+ = (oldChunkSize == 0U || (oldChunkSize >= numBytes && oldChunkSize <= numBytes * waistFactor));
291
+ chunkSize = std::max (oldChunkSize, numBytes);
292
+ #endif
262
293
}
263
294
if (not appropriate)
264
295
{
@@ -276,13 +307,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
276
307
// Using T_pageSize/2 as chunkSize when entering the page means that the page reports to have at most one
277
308
// chunk available.
278
309
auto dummyChunkSize = T_pageSize / 2 ;
310
+ uint32_t chunkSize = numBytes;
279
311
for (size_t firstIndex = 0 ; firstIndex < numPages () - (numPagesNeeded - 1 ) and result == nullptr ;
280
312
++firstIndex)
281
313
{
282
314
size_t numPagesAcquired{};
283
315
for (numPagesAcquired = 0U ; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
284
316
{
285
- if (not thisPageIsAppropriate (acc, firstIndex + numPagesAcquired, dummyChunkSize))
317
+ if (not thisPageIsAppropriate (acc, firstIndex + numPagesAcquired, dummyChunkSize, chunkSize ))
286
318
{
287
319
for (size_t cleanupIndex = numPagesAcquired; cleanupIndex > 0 ; --cleanupIndex)
288
320
{
@@ -297,15 +329,15 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
297
329
// still have to replace the dummy chunkSize with the real one.
298
330
for (numPagesAcquired = 0U ; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
299
331
{
300
- #ifndef NDEBUG
332
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
301
333
auto oldChunkSize =
302
334
#endif
303
335
atomicCAS (
304
336
acc,
305
337
pageTable._chunkSizes [firstIndex + numPagesAcquired],
306
338
T_pageSize / 2 ,
307
339
numBytes);
308
- #ifndef NDEBUG
340
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
309
341
if (oldChunkSize != dummyChunkSize)
310
342
{
311
343
throw std::runtime_error{" Unexpected intermediate chunkSize in multi-page allocation." };
@@ -364,7 +396,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
364
396
// Furthermore, chunkSize cannot have changed because we maintain the invariant that the
365
397
// filling level is always considered first, so no other thread can have passed that barrier to
366
398
// reset it.
367
- PageInterpretation<T_pageSize>{pages[pageIndex], chunkSize}. cleanup ();
399
+ PageInterpretation<T_pageSize>{pages[pageIndex], 1u }. resetBitfields ();
368
400
alpaka::mem_fence (acc, alpaka::memory_scope::Device{});
369
401
370
402
// It is important to keep this after the clean-up line above: Otherwise another thread with a
@@ -398,9 +430,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
398
430
{
399
431
size_t heapSize{};
400
432
AccessBlock<T_blockSize, T_pageSize>* accessBlocks{};
433
+ volatile uint32_t block = 0u ;
401
434
402
435
/// Number of complete access blocks that fit into the heap.
ALPAKA_FN_ACC [[nodiscard]] auto numBlocks() const -> size_t
{
    using Block = AccessBlock<T_blockSize, T_pageSize>;
    // The heap is indexed in units of T_blockSize, so an AccessBlock object must occupy exactly
    // that many bytes; this keeps the start address of every access block aligned.
    static_assert(
        sizeof(Block) == T_blockSize,
        " accessblock should equal to the use given block size to have a guaranteed alignment for pointers.");
    return heapSize / T_blockSize;
}
406
443
@@ -409,27 +446,59 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
409
446
return numBlocks ();
410
447
}
411
448
412
- ALPAKA_FN_ACC auto startIndex (auto const & acc, uint32_t const numBytes) const
449
+ ALPAKA_FN_ACC auto hash (auto const & acc, uint32_t const numBytes) const
450
+ {
451
+ constexpr uint32_t hashingK = 38183u ;
452
+ constexpr uint32_t hashingDistMP = 17497u ;
453
+ constexpr uint32_t hashingDistWP = 1u ;
454
+ constexpr uint32_t hashingDistWPRel = 1u ;
455
+
456
+ const uint32_t numpages = AccessBlock<T_blockSize, T_pageSize>::numPages ();
457
+ const uint32_t pagesperblock = numpages / numBlocks ();
458
+ const uint32_t reloff = warpSize * numBytes / T_pageSize;
459
+ const uint32_t hash
460
+ = (numBytes * hashingK + hashingDistMP * smid ()
461
+ + (hashingDistWP + hashingDistWPRel * reloff) * warpid ());
462
+ return hash;
463
+ }
464
+
465
+ ALPAKA_FN_ACC auto startIndex (auto const &, uint32_t const blockValue, uint32_t const hashValue)
413
466
{
414
- return (numBytes * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum ()) % numBlocks ();
467
+ #if 1
468
+ constexpr uint32_t blockStride = 4 ;
469
+ return ((hashValue % blockStride) + (blockValue * blockStride)) % numBlocks ();
470
+ #else
471
+ return (block + hashValue) % numBlocks();
472
+ #endif
415
473
}
416
474
417
475
template <typename AlignmentPolicy, typename AlpakaAcc>
418
- ALPAKA_FN_ACC auto create (const AlpakaAcc& acc, uint32_t bytes) -> void*
476
+ ALPAKA_FN_ACC auto create (const AlpakaAcc& acc, uint32_t const bytes) -> void*
419
477
{
478
+ auto blockValue = block;
479
+ auto hashValue = hash (acc, bytes);
480
+ auto startIdx = startIndex (acc, blockValue, hashValue);
420
481
return wrappingLoop (
421
482
acc,
422
- startIndex (acc, bytes) ,
483
+ startIdx ,
423
484
numBlocks (),
424
485
static_cast <void *>(nullptr ),
425
- [this , bytes](auto const & localAcc, auto const index)
426
- { return accessBlocks[index].create (localAcc, bytes); });
486
+ [this , bytes, &acc, startIdx, &hashValue, blockValue](auto const & localAcc, auto const index) mutable
487
+ {
488
+ auto ptr = accessBlocks[index].create (localAcc, bytes, hashValue);
489
+ if (!ptr && index == startIdx)
490
+ if (blockValue == block)
491
+ block = blockValue + 1 ;
492
+ return ptr;
493
+ });
427
494
}
428
495
429
496
template <typename AlpakaAcc>
430
497
ALPAKA_FN_ACC auto destroy (const AlpakaAcc& acc, void * pointer) -> void
431
498
{
432
- auto blockIndex = indexOf (pointer, accessBlocks, T_blockSize);
499
+ // indexOf requires the access block size instead of T_blockSize in case the reinterpreted AccessBlock
500
+ // object is smaller than T_blockSize.
501
+ auto blockIndex = indexOf (pointer, accessBlocks, sizeof (AccessBlock<T_blockSize, T_pageSize>));
433
502
accessBlocks[blockIndex].destroy (acc, pointer);
434
503
}
435
504
};
@@ -454,7 +523,7 @@ struct InitKernel
454
523
namespace mallocMC ::CreationPolicies
455
524
{
456
525
457
- template <typename T_HeapConfig, typename T_HashConfig>
526
+ template <typename T_HeapConfig, typename T_HashConfig = void >
458
527
struct Scatter : public ScatterAlloc ::Heap<T_HeapConfig::accessblocksize, T_HeapConfig::pagesize>
459
528
{
460
529
static_assert (T_HeapConfig::resetfreedpages, " resetfreedpages = false is no longer implemented." );
0 commit comments