Force niofs for fdt tmp file read access when flushing stored fields (#129538)

martijnvg · web-flow · commit 41f69810dfe4 · 2025-06-23T07:46:00.000+02:00
Because of how stored fields get flushed when index sorting is active, we can encounter significant page cache faults when memory is scarce. To mitigate some of the resulting slowness, we plan to no longer mmap the fdt temp file. This is initially behind a feature flag, so that we can check for unforeseen side effects.

Typically, always using the mmap directory is better than using the niofs directory, provided there is sufficient memory available to the OS for filesystem caching. However, when that isn't the case, indexing performance can vary a lot (and is often very slow). This is especially true for the tmp files that stored fields create during flushing. These files exist for only a brief moment, to sort stored fields in the order of the configured index sorting, and are then removed. If these tmp files are mmapped, there is a risk of thrashing the file system cache.

This change only avoids using mmap for the fdt tmp file. This is the file that actually contains the data and can be large compared to the other files that get flushed. The fdm (metadata) and fdx (stored field index) tmp files remain mmapped.
diff --git a/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java b/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java
@@ -46,6 +46,7 @@ public class FsDirectoryFactory implements IndexStorePlugin.DirectoryFactory {
 
     private static final Logger Log = LogManager.getLogger(FsDirectoryFactory.class);
     private static final FeatureFlag MADV_RANDOM_FEATURE_FLAG = new FeatureFlag("madv_random");
+    private static final FeatureFlag TMP_FDT_NO_MMAP_FEATURE_FLAG = new FeatureFlag("tmp_fdt_no_mmap");
 
     public static final Setting<LockFactory> INDEX_LOCK_FACTOR_SETTING = new Setting<>("index.store.fs.fs_lock", "native", (s) -> {
         return switch (s) {
@@ -222,7 +223,7 @@ static boolean useDelegate(String name, IOContext ioContext) {
             }
 
             final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(getExtension(name));
-            if (extension == null || extension.shouldMmap() == false) {
+            if (extension == null || extension.shouldMmap() == false || avoidDelegateForFdtTempFiles(name, extension)) {
                 // Other files are either less performance-sensitive (e.g. stored field index, norms metadata)
                 // or are large and have a random access pattern and mmap leads to page cache trashing
                 // (e.g. stored fields and term vectors).
@@ -231,6 +232,39 @@ static boolean useDelegate(String name, IOContext ioContext) {
             return true;
         }
 
+        /**
+         * Decides whether mmap must be avoided because the file is a tmp fdt file.
+         * The tmp fdt file only gets created when flushing stored
+         * fields to disk while index sorting is active.
+         * <p>
+         * In Lucene, the <code>SortingStoredFieldsConsumer</code> first
+         * flushes stored fields to disk in tmp files, in unsorted order and
+         * uncompressed format. The tmp file then gets a full integrity check,
+         * after which the stored values are read back in the order of the
+         * index sorting of the segment; from the perspective of the tmp fdt
+         * file that read order is random. After that, the tmp files are
+         * removed.
+         * <p>
+         * If the machine Elasticsearch runs on has sufficient memory, the i/o pattern
+         * of <code>SortingStoredFieldsConsumer</code> actually benefits from using mmap.
+         * However, when memory is scarce, this pattern can cause frequent page faults,
+         * doing more harm than not using mmap.
+         * <p>
+         * As part of flushing stored fields when index sorting is active,
+         * three tmp files are created: fdm (metadata), fdx (index) and
+         * fdt (contains stored field data). The first two files are small and
+         * mmap-ing them should still be ok even if memory is scarce.
+         * The fdt file is large and tends to cause more page faults when memory is scarce.
+         *
+         * @param name      The name of the file in the Lucene index
+         * @param extension The extension of the file in the Lucene index
+         * @return {@code true} only when the feature flag is enabled, extension is TMP and "fdt" occurs anywhere in name
+         */
+        static boolean avoidDelegateForFdtTempFiles(String name, LuceneFilesExtensions extension) {
+            // NOTE, for now gated behind feature flag to observe impact of this change in benchmarks only:
+            return TMP_FDT_NO_MMAP_FEATURE_FLAG.isEnabled() && extension == LuceneFilesExtensions.TMP && name.contains("fdt");
+        }
+
         MMapDirectory getDelegate() {
             return delegate;
         }
diff --git a/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java b/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java
@@ -69,7 +69,10 @@ public void testPreload() throws IOException {
             assertTrue(FsDirectoryFactory.HybridDirectory.useDelegate("foo.kdi", newIOContext(random())));
             assertFalse(FsDirectoryFactory.HybridDirectory.useDelegate("foo.kdi", Store.READONCE_CHECKSUM));
             assertTrue(FsDirectoryFactory.HybridDirectory.useDelegate("foo.tmp", newIOContext(random())));
-            assertTrue(FsDirectoryFactory.HybridDirectory.useDelegate("foo.fdt__0.tmp", newIOContext(random())));
+            assertFalse(FsDirectoryFactory.HybridDirectory.useDelegate("foo.fdt__0.tmp", newIOContext(random())));
+            assertFalse(FsDirectoryFactory.HybridDirectory.useDelegate("_0.fdt__1.tmp", newIOContext(random())));
+            assertTrue(FsDirectoryFactory.HybridDirectory.useDelegate("_0.fdm__0.tmp", newIOContext(random())));
+            assertTrue(FsDirectoryFactory.HybridDirectory.useDelegate("_0.fdx__4.tmp", newIOContext(random())));
             MMapDirectory delegate = hybridDirectory.getDelegate();
             assertThat(delegate, Matchers.instanceOf(MMapDirectory.class));
             var func = fsDirectoryFactory.preLoadFuncMap.get(delegate);