4
4
Copyright 2024 Helmholtz-Zentrum Dresden - Rossendorf,
5
5
CERN
6
6
7
- Author(s): Julian Johannes Lenz
7
+ Author(s): Julian Johannes Lenz, Rene Widera
8
8
9
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
10
of this software and associated documentation files (the "Software"), to deal
34
34
#include " mallocMC/mallocMC_utils.hpp"
35
35
36
36
#include < algorithm>
37
+ #include < alpaka/alpaka.hpp>
37
38
#include < alpaka/atomic/AtomicAtomicRef.hpp>
38
39
#include < alpaka/core/Common.hpp>
39
40
#include < alpaka/core/Positioning.hpp>
49
50
#include < functional>
50
51
#include < iterator>
51
52
#include < numeric>
52
- #include < sys/types.h >
53
+ #include < optional >
53
54
#include < vector>
54
55
55
56
namespace mallocMC ::CreationPolicies::ScatterAlloc
56
57
{
57
- constexpr const uint32_t pageTableEntrySize = 4U + 4U ;
58
58
59
59
template <size_t T_numPages>
60
60
struct PageTable
@@ -69,7 +69,10 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
69
69
public:
70
70
ALPAKA_FN_ACC [[nodiscard]] constexpr static auto numPages () -> size_t
71
71
{
72
- return T_blockSize / (T_pageSize + pageTableEntrySize);
72
+ constexpr auto x = T_blockSize / (T_pageSize + sizeof (PageTable<1 >));
73
+ // Check that the page table entries do not contain any padding.
74
+ static_assert (sizeof (PageTable<x>) == x * sizeof (PageTable<1 >));
75
+ return x;
73
76
}
74
77
75
78
ALPAKA_FN_ACC [[nodiscard]] auto getAvailableSlots (auto const & acc, uint32_t const chunkSize) -> size_t
@@ -104,8 +107,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
104
107
: interpret (index, chunkSize).isValid (acc, pointer);
105
108
}
106
109
110
+ //! @param hashValue The default value makes testing easier because tests can avoid passing the hash to each call.
107
111
template <typename TAcc>
108
- ALPAKA_FN_ACC auto create (TAcc const & acc, uint32_t const numBytes) -> void*
112
+ ALPAKA_FN_ACC auto create (TAcc const & acc, uint32_t const numBytes, uint32_t const hashValue = 0u ) -> void*
109
113
{
110
114
void * pointer{nullptr };
111
115
if (numBytes >= multiPageThreshold ())
@@ -114,7 +118,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
114
118
}
115
119
else
116
120
{
117
- pointer = createChunk (acc, numBytes);
121
+ pointer = createChunk (acc, numBytes, hashValue );
118
122
}
119
123
return pointer;
120
124
}
@@ -123,9 +127,9 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
123
127
ALPAKA_FN_ACC auto destroy (TAcc const & acc, void * const pointer) -> void
124
128
{
125
129
auto const index = pageIndex (pointer);
126
- if (index > static_cast <ssize_t >(numPages ()) || index < 0 )
130
+ if (index >= static_cast <ssize_t >(numPages ()) || index < 0 )
127
131
{
128
- #ifndef NDEBUG
132
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
129
133
throw std::runtime_error{
130
134
" Attempted to destroy an invalid pointer! Pointer does not point to any page." };
131
135
#endif // NDEBUG
@@ -145,6 +149,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
145
149
private:
146
150
DataPage<T_pageSize> pages[numPages()]{};
147
151
PageTable<numPages()> pageTable{};
152
+ char padding[T_blockSize - sizeof (DataPage<T_pageSize>) * numPages () - sizeof(PageTable<numPages()>)];
148
153
149
154
ALPAKA_FN_ACC constexpr static auto multiPageThreshold () -> uint32_t
150
155
{
@@ -202,15 +207,20 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
202
207
return numPages ();
203
208
}
204
209
205
- ALPAKA_FN_ACC static auto startIndex (auto const & acc)
210
+ ALPAKA_FN_ACC static auto startIndex (auto const & acc, uint32_t const hashValue )
206
211
{
207
- return (laneid () * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum ()) % numPages ();
212
+ return (hashValue >> 8u ) % numPages ();
213
+ }
214
+
215
+ ALPAKA_FN_ACC bool isValidPageIdx (uint32_t const index) const
216
+ {
217
+ return index != noFreePageFound () && index < numPages ();
208
218
}
209
219
210
220
template <typename TAcc>
211
- ALPAKA_FN_ACC auto createChunk (TAcc const & acc, uint32_t const numBytes) -> void*
221
+ ALPAKA_FN_ACC auto createChunk (TAcc const & acc, uint32_t const numBytes, uint32_t const hashValue ) -> void*
212
222
{
213
- auto index = startIndex (acc);
223
+ auto index = startIndex (acc, hashValue );
214
224
215
225
// Under high pressure, this loop could potentially run for a long time because the information where and
216
226
// when we started our search is not maintained and/or used. This is a feature, not a bug: Given a
@@ -223,42 +233,62 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
223
233
//
224
234
// In the latter case, it is considered desirable to wrap around multiple times until the thread was fast
225
235
// enough to acquire some memory.
226
- index = choosePage (acc, numBytes, index);
227
- void * pointer = index != noFreePageFound ()
228
- ? PageInterpretation<T_pageSize>{pages[index], numBytes}.create (acc)
229
- : nullptr ;
230
-
231
- while (index != noFreePageFound () and pointer == nullptr )
236
+ void * pointer = nullptr ;
237
+ do
232
238
{
233
- leavePage (acc, index);
234
- ++index;
235
- index = choosePage (acc, numBytes, index);
236
- pointer = PageInterpretation<T_pageSize>{pages[index], numBytes}.create (acc);
237
- }
238
-
239
+ index = (index + 1 ) % numPages ();
240
+ uint32_t chunkSize = numBytes;
241
+ index = choosePage (acc, numBytes, chunkSize, index);
242
+ if (isValidPageIdx (index))
243
+ {
244
+ pointer = PageInterpretation<T_pageSize>{pages[index], chunkSize}.create (acc, hashValue);
245
+ if (pointer == nullptr )
246
+ leavePage (acc, index);
247
+ }
248
+ } while (isValidPageIdx (index) and pointer == nullptr );
239
249
return pointer;
240
250
}
241
251
242
252
template <typename TAcc>
243
- ALPAKA_FN_ACC auto choosePage (TAcc const & acc, uint32_t const numBytes, size_t const startIndex = 0 ) -> size_t
253
+ ALPAKA_FN_ACC auto choosePage (
254
+ TAcc const & acc,
255
+ uint32_t const numBytes,
256
+ uint32_t & chunkSize,
257
+ size_t const startIndex = 0 ) -> size_t
244
258
{
245
259
return wrappingLoop (
246
260
acc,
247
261
startIndex,
248
262
numPages (),
249
263
noFreePageFound (),
250
- [this , numBytes](auto const & localAcc, auto const index)
251
- { return thisPageIsAppropriate (localAcc, index, numBytes) ? index : noFreePageFound (); });
264
+ [this , numBytes, &chunkSize ](auto const & localAcc, auto const index)
265
+ { return thisPageIsAppropriate (localAcc, index, numBytes, chunkSize ) ? index : noFreePageFound (); });
252
266
}
253
267
268
+
269
+ #ifndef WAIST_FACTOR
270
+ # define WAIST_FACTOR 2u
271
+ #endif
254
272
template <typename TAcc>
255
- ALPAKA_FN_ACC auto thisPageIsAppropriate (TAcc const & acc, size_t const index, uint32_t const numBytes) -> bool
273
+ ALPAKA_FN_ACC auto thisPageIsAppropriate (
274
+ TAcc const & acc,
275
+ size_t const index,
276
+ uint32_t const numBytes,
277
+ uint32_t & chunkSize) -> bool
256
278
{
257
279
bool appropriate = false ;
258
280
if (enterPage (acc, index, numBytes))
259
281
{
260
282
auto oldChunkSize = atomicCAS (acc, pageTable._chunkSizes [index], 0U , numBytes);
283
+ #if 0
261
284
appropriate = (oldChunkSize == 0U || oldChunkSize == numBytes);
285
+ chunkSize = std::max(oldChunkSize, numBytes);
286
+ #else
287
+ constexpr uint32_t waistFactor = WAIST_FACTOR;
288
+ appropriate
289
+ = (oldChunkSize == 0U || (oldChunkSize >= numBytes && oldChunkSize <= numBytes * waistFactor));
290
+ chunkSize = std::max (oldChunkSize, numBytes);
291
+ #endif
262
292
}
263
293
if (not appropriate)
264
294
{
@@ -276,13 +306,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
276
306
// Using T_pageSize/2 as chunkSize when entering the page means that the page reports to have at most one
277
307
// chunk available.
278
308
auto dummyChunkSize = T_pageSize / 2 ;
309
+ uint32_t chunkSize = numBytes;
279
310
for (size_t firstIndex = 0 ; firstIndex < numPages () - (numPagesNeeded - 1 ) and result == nullptr ;
280
311
++firstIndex)
281
312
{
282
313
size_t numPagesAcquired{};
283
314
for (numPagesAcquired = 0U ; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
284
315
{
285
- if (not thisPageIsAppropriate (acc, firstIndex + numPagesAcquired, dummyChunkSize))
316
+ if (not thisPageIsAppropriate (acc, firstIndex + numPagesAcquired, dummyChunkSize, chunkSize ))
286
317
{
287
318
for (size_t cleanupIndex = numPagesAcquired; cleanupIndex > 0 ; --cleanupIndex)
288
319
{
@@ -297,15 +328,15 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
297
328
// still have to replace the dummy chunkSize with the real one.
298
329
for (numPagesAcquired = 0U ; numPagesAcquired < numPagesNeeded; ++numPagesAcquired)
299
330
{
300
- #ifndef NDEBUG
331
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
301
332
auto oldChunkSize =
302
333
#endif
303
334
atomicCAS (
304
335
acc,
305
336
pageTable._chunkSizes [firstIndex + numPagesAcquired],
306
337
T_pageSize / 2 ,
307
338
numBytes);
308
- #ifndef NDEBUG
339
+ #if (!defined( NDEBUG) && !BOOST_LANG_CUDA && !BOOST_LANG_HIP)
309
340
if (oldChunkSize != dummyChunkSize)
310
341
{
311
342
throw std::runtime_error{" Unexpected intermediate chunkSize in multi-page allocation." };
@@ -364,7 +395,7 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
364
395
// Furthermore, chunkSize cannot have changed because we maintain the invariant that the
365
396
// filling level is always considered first, so no other thread can have passed that barrier to
366
397
// reset it.
367
- PageInterpretation<T_pageSize>{pages[pageIndex], chunkSize}. cleanup ();
398
+ PageInterpretation<T_pageSize>{pages[pageIndex], 1u }. resetBitfields ();
368
399
alpaka::mem_fence (acc, alpaka::memory_scope::Device{});
369
400
370
401
// It is important to keep this after the clean-up line above: Otherwise another thread with a
@@ -398,9 +429,14 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
398
429
{
399
430
size_t heapSize{};
400
431
AccessBlock<T_blockSize, T_pageSize>* accessBlocks{};
432
+ volatile uint32_t block = 0u ;
401
433
402
434
ALPAKA_FN_ACC [[nodiscard]] auto numBlocks () const -> size_t
403
435
{
436
+ // Guarantee that each access block start address is aligned.
437
+ static_assert (
438
+ T_blockSize == sizeof (AccessBlock<T_blockSize, T_pageSize>),
439
+ " accessblock should equal to the use given block size to have a guaranteed alignment for pointers." );
404
440
return heapSize / T_blockSize;
405
441
}
406
442
@@ -409,27 +445,59 @@ namespace mallocMC::CreationPolicies::ScatterAlloc
409
445
return numBlocks ();
410
446
}
411
447
412
- ALPAKA_FN_ACC auto startIndex (auto const & acc, uint32_t const numBytes) const
448
+ ALPAKA_FN_ACC auto hash (auto const & acc, uint32_t const numBytes) const
449
+ {
450
+ constexpr uint32_t hashingK = 38183u ;
451
+ constexpr uint32_t hashingDistMP = 17497u ;
452
+ constexpr uint32_t hashingDistWP = 1u ;
453
+ constexpr uint32_t hashingDistWPRel = 1u ;
454
+
455
+ const uint32_t numpages = AccessBlock<T_blockSize, T_pageSize>::numPages ();
456
+ const uint32_t pagesperblock = numpages / numBlocks ();
457
+ const uint32_t reloff = warpSize * numBytes / T_pageSize;
458
+ const uint32_t hash
459
+ = (numBytes * hashingK + hashingDistMP * smid ()
460
+ + (hashingDistWP + hashingDistWPRel * reloff) * warpid ());
461
+ return hash;
462
+ }
463
+
464
+ ALPAKA_FN_ACC auto startIndex (auto const &, uint32_t const blockValue, uint32_t const hashValue)
413
465
{
414
- return (numBytes * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc).sum ()) % numBlocks ();
466
+ #if 1
467
+ constexpr uint32_t blockStride = 4 ;
468
+ return ((hashValue % blockStride) + (blockValue * blockStride)) % numBlocks ();
469
+ #else
470
+ return (block + hashValue) % numBlocks();
471
+ #endif
415
472
}
416
473
417
474
template <typename AlignmentPolicy, typename AlpakaAcc>
418
- ALPAKA_FN_ACC auto create (const AlpakaAcc& acc, uint32_t bytes) -> void*
475
+ ALPAKA_FN_ACC auto create (const AlpakaAcc& acc, uint32_t const bytes) -> void*
419
476
{
477
+ auto blockValue = block;
478
+ auto hashValue = hash (acc, bytes);
479
+ auto startIdx = startIndex (acc, blockValue, hashValue);
420
480
return wrappingLoop (
421
481
acc,
422
- startIndex (acc, bytes) ,
482
+ startIdx ,
423
483
numBlocks (),
424
484
static_cast <void *>(nullptr ),
425
- [this , bytes](auto const & localAcc, auto const index)
426
- { return accessBlocks[index].create (localAcc, bytes); });
485
+ [this , bytes, &acc, startIdx, &hashValue, blockValue](auto const & localAcc, auto const index) mutable
486
+ {
487
+ auto ptr = accessBlocks[index].create (localAcc, bytes, hashValue);
488
+ if (!ptr && index == startIdx)
489
+ if (blockValue == block)
490
+ block = blockValue + 1 ;
491
+ return ptr;
492
+ });
427
493
}
428
494
429
495
template <typename AlpakaAcc>
430
496
ALPAKA_FN_ACC auto destroy (const AlpakaAcc& acc, void * pointer) -> void
431
497
{
432
- auto blockIndex = indexOf (pointer, accessBlocks, T_blockSize);
498
+ // indexOf requires the access block size instead of T_blockSize in case the reinterpreted AccessBlock
499
+ // object is smaller than T_blockSize.
500
+ auto blockIndex = indexOf (pointer, accessBlocks, sizeof (AccessBlock<T_blockSize, T_pageSize>));
433
501
accessBlocks[blockIndex].destroy (acc, pointer);
434
502
}
435
503
};
@@ -454,7 +522,7 @@ struct InitKernel
454
522
namespace mallocMC ::CreationPolicies
455
523
{
456
524
457
- template <typename T_HeapConfig, typename T_HashConfig>
525
+ template <typename T_HeapConfig, typename T_HashConfig = void >
458
526
struct Scatter : public ScatterAlloc ::Heap<T_HeapConfig::accessblocksize, T_HeapConfig::pagesize>
459
527
{
460
528
static_assert (T_HeapConfig::resetfreedpages, " resetfreedpages = false is no longer implemented." );
0 commit comments