Block Database #4027

**Open** · wants to merge 11 commits into `master`
29 changes: 27 additions & 2 deletions cache/lru/cache.go
```diff
@@ -20,6 +20,17 @@ type Cache[K comparable, V any] struct {
 	lock     sync.Mutex
 	elements *linked.Hashmap[K, V]
 	size     int
+
+	// onEvict is called with the key and value of an evicted entry, if set.
+	onEvict func(K, V)
 }
+
+// SetOnEvict sets a callback to be called with the key and value of an evicted entry.
+// The callback is called synchronously while holding the cache lock.
+func (c *Cache[K, V]) SetOnEvict(cb func(K, V)) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	c.onEvict = cb
+}
 
 func NewCache[K comparable, V any](size int) *Cache[K, V] {
@@ -34,8 +45,11 @@ func (c *Cache[K, V]) Put(key K, value V) {
 	defer c.lock.Unlock()
 
 	if c.elements.Len() == c.size {
-		oldestKey, _, _ := c.elements.Oldest()
+		oldestKey, oldestValue, _ := c.elements.Oldest()
 		c.elements.Delete(oldestKey)
+		if c.onEvict != nil {
+			c.onEvict(oldestKey, oldestValue)
+		}
 	}
 	c.elements.Put(key, value)
 }
@@ -55,14 +69,25 @@ func (c *Cache[K, V]) Get(key K) (V, bool) {
 func (c *Cache[K, _]) Evict(key K) {
 	c.lock.Lock()
 	defer c.lock.Unlock()
+
+	value, _ := c.elements.Get(key)
 	c.elements.Delete(key)
+	if c.onEvict != nil {
+		c.onEvict(key, value)
+	}
 }
 
 func (c *Cache[_, _]) Flush() {
 	c.lock.Lock()
 	defer c.lock.Unlock()
+
+	// Call onEvict for each element before clearing
+	if c.onEvict != nil {
+		iter := c.elements.NewIterator()
+		for iter.Next() {
+			c.onEvict(iter.Key(), iter.Value())
+		}
+	}
+
 	c.elements.Clear()
 }
```
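
For reference, a caller might use the new hook like this (an illustrative sketch, not part of the diff; it assumes only the `lru` package API shown above):

```go
package main

import (
	"fmt"

	"github.com/ava-labs/avalanchego/cache/lru"
)

func main() {
	c := lru.NewCache[string, int](2)
	// The callback runs synchronously while the cache lock is held,
	// so it should stay cheap and must not call back into the cache.
	c.SetOnEvict(func(key string, value int) {
		fmt.Printf("evicted %s=%d\n", key, value)
	})

	c.Put("a", 1)
	c.Put("b", 2)
	c.Put("c", 3) // evicts "a", the oldest entry
	c.Flush()     // evicts "b" and "c"
}
```
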
16 changes: 16 additions & 0 deletions cache/lru/cache_test.go
```diff
@@ -6,6 +6,8 @@ package lru
 import (
 	"testing"
 
+	"github.com/stretchr/testify/require"
+
 	"github.com/ava-labs/avalanchego/cache/cachetest"
 	"github.com/ava-labs/avalanchego/ids"
 )
@@ -19,3 +21,17 @@ func TestCacheEviction(t *testing.T) {
 	c := NewCache[ids.ID, int64](2)
 	cachetest.Eviction(t, c)
 }
+
+func TestCacheFlushWithOnEvict(t *testing.T) {
+	c := NewCache[ids.ID, int64](2)
+
+	// Track which elements were evicted
+	evicted := make(map[ids.ID]int64)
+	c.SetOnEvict(func(key ids.ID, value int64) {
+		evicted[key] = value
+	})
+
+	cachetest.Eviction(t, c)
+	require.Zero(t, c.Len())
+	require.Len(t, evicted, 3)
+}
```
2 changes: 1 addition & 1 deletion go.mod
```diff
@@ -17,6 +17,7 @@ require (
 	github.com/ava-labs/ledger-avalanche/go v0.0.0-20241009183145-e6f90a8a1a60
 	github.com/ava-labs/libevm v1.13.14-0.3.0.rc.1
 	github.com/btcsuite/btcd/btcutil v1.1.3
+	github.com/cespare/xxhash/v2 v2.3.0
 	github.com/cockroachdb/pebble v0.0.0-20230928194634-aa077af62593
 	github.com/compose-spec/compose-go v1.20.2
 	github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0
@@ -89,7 +90,6 @@ require (
 	github.com/bits-and-blooms/bitset v1.10.0 // indirect
 	github.com/btcsuite/btcd/btcec/v2 v2.3.2 // indirect
 	github.com/cenkalti/backoff/v4 v4.2.1 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cockroachdb/errors v1.9.1 // indirect
 	github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect
 	github.com/cockroachdb/redact v1.1.3 // indirect
```
190 changes: 190 additions & 0 deletions x/blockdb/README.md
# BlockDB

BlockDB is a specialized database optimized for blockchain blocks.

## Key Functionalities

- **O(1) Performance**: Both reads and writes complete in constant time
- **Parallel Operations**: Multiple threads can read and write blocks concurrently without blocking
- **Flexible Write Ordering**: Supports out-of-order block writes for bootstrapping
- **Configurable Durability**: Optional `syncToDisk` mode guarantees immediate recoverability
- **Automatic Recovery**: Detects and recovers unindexed blocks after unclean shutdowns

## Design

BlockDB uses a single index file and multiple data files. The index file maps block heights to locations in the data files, while data files store the actual block content. Data storage can be split across multiple data files based on the maximum data file size.

```
┌─────────────────┐         ┌─────────────────┐
│   Index File    │         │   Data File 1   │
│     (.idx)      │         │     (.dat)      │
├─────────────────┤         ├─────────────────┤
│ Header          │         │ Block 0         │
│ - Version       │  ┌─────>│ - Header        │
│ - Min Height    │  │      │ - Data          │
│ - Max Height    │  │      ├─────────────────┤
│ - Data Size     │  │      │ Block 1         │
│ - ...           │  │  ┌──>│ - Header        │
├─────────────────┤  │  │   │ - Data          │
│ Entry[0]        │  │  │   ├─────────────────┤
│ - Offset ───────┼──┘  │   │       ...       │
│ - Size          │     │   └─────────────────┘
│ - Header Size   │     │
├─────────────────┤     │
│ Entry[1]        │     │
│ - Offset ───────┼─────┘
│ - Size          │
│ - Header Size   │
├─────────────────┤
│       ...       │
└─────────────────┘
```

### File Formats

#### Index File Structure

The index file consists of a fixed-size header followed by fixed-size entries:

```
Index File Header (80 bytes):
┌────────────────────────────────┬─────────┐
│ Field                          │ Size    │
├────────────────────────────────┼─────────┤
│ Version                        │ 8 bytes │
│ Max Data File Size             │ 8 bytes │
│ Min Block Height               │ 8 bytes │
│ Max Contiguous Height          │ 8 bytes │
│ Max Block Height               │ 8 bytes │
│ Next Write Offset              │ 8 bytes │
│ Reserved                       │ 32 bytes│
└────────────────────────────────┴─────────┘

Index Entry (16 bytes):
┌────────────────────────────────┬─────────┐
│ Field                          │ Size    │
├────────────────────────────────┼─────────┤
│ Data File Offset               │ 8 bytes │
│ Block Data Size                │ 4 bytes │
│ Header Size                    │ 4 bytes │
└────────────────────────────────┴─────────┘
```

#### Data File Structure

Each block in the data file is stored with a header followed by the raw block data:

```
Block Header (24 bytes):
┌────────────────────────────────┬─────────┐
│ Field                          │ Size    │
├────────────────────────────────┼─────────┤
│ Height                         │ 8 bytes │
│ Checksum                       │ 8 bytes │
│ Size                           │ 4 bytes │
│ Header Size                    │ 4 bytes │
└────────────────────────────────┴─────────┘
```

> **Reviewer comment (Contributor):** Would it not make sense to add a version field? Imagine that in the future we'll need to change the structure of the data file entry. A node on the old version will want to migrate its data, but it won't be clear where the header of the block entry ends and where the data (block header and body) starts. Maybe spare 2 bytes for a version? Alternatively, we could use 2 bytes to record where the header of the block entry ends.

> **Reviewer comment (Contributor):** On "Block Header (24 bytes)": block entry header?
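
As a rough illustration, the 24-byte block entry header could be modeled in Go as below. This is a sketch only; the field names are assumptions, and the checksum algorithm is a guess based on `xxhash` being promoted to a direct requirement in `go.mod`:

```go
// blockEntryHeader is a hypothetical in-memory view of the 24-byte
// header that precedes each block in a data file.
type blockEntryHeader struct {
	Height     uint64 // height of the block stored in this entry
	Checksum   uint64 // checksum of the block data (possibly xxhash64)
	Size       uint32 // total size of the block data in bytes
	HeaderSize uint32 // prefix of Size occupied by the block header
}
```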

### Block Overwrites

BlockDB allows overwriting blocks at existing heights. When a block is overwritten, the new block is appended to the data file and the index entry is updated to point to the new location, leaving the old block data as unreferenced "dead" space. However, since blocks are immutable and rarely overwritten (e.g., during reorgs), this trade-off should have minimal impact in practice.

### Fixed-Size Index Entries

Each index entry is exactly 16 bytes on disk, containing the offset, size, and header size. This fixed size enables direct calculation of where each block's index entry is located, providing O(1) lookups. The index remains efficient even for blockchains with high block heights: at height 1 billion, the index file would only be ~16GB.
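
The lookup arithmetic is simple enough to show directly. A minimal sketch, assuming entries begin immediately after the 80-byte index file header and are positioned relative to the minimum block height:

```go
const (
	indexFileHeaderSize = 80 // fixed-size header, per the format above
	indexEntrySize      = 16 // fixed-size entry, per the format above
)

// indexEntryOffset returns the byte offset of the index entry for height.
// No search is needed because every entry has the same size: O(1).
func indexEntryOffset(height, minHeight uint64) uint64 {
	return indexFileHeaderSize + (height-minHeight)*indexEntrySize
}
```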

### Durability and Fsync Behavior

BlockDB provides configurable durability through the `syncToDisk` parameter:

**Data File Behavior:**

- **When `syncToDisk=true`**: The data file is fsync'd after every block write, guaranteeing durability against both process failures and kernel/machine failures.
- **When `syncToDisk=false`**: Data file writes are buffered, providing durability against process failures but not against kernel or machine failures.

**Index File Behavior:**

- **When `syncToDisk=true`**: The index file is fsync'd every `CheckpointInterval` blocks (when the header is written).
- **When `syncToDisk=false`**: The index file relies on OS buffering and is not explicitly fsync'd.
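
To make the two modes concrete, here is a minimal sketch of the data-file write path under this policy (illustrative only; the `database` struct and its fields stand in for whatever the real implementation uses):

```go
import "os"

type database struct {
	dataFile   *os.File
	syncToDisk bool
}

func (db *database) appendBlock(serialized []byte) error {
	if _, err := db.dataFile.Write(serialized); err != nil {
		return err
	}
	if db.syncToDisk {
		// fsync after every block write: durable against process,
		// kernel, and machine failures.
		return db.dataFile.Sync()
	}
	// Buffered write: survives a process crash (the OS page cache
	// holds the data) but not a kernel panic or power loss.
	return nil
}
```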

### Recovery Mechanism

On startup, BlockDB checks for signs of an unclean shutdown by comparing the data file size on disk with the indexed data size stored in the index file header. If the data files are larger than what the index claims, it indicates that blocks were written but the index wasn't properly updated before shutdown.

**Recovery Process:**

1. Starts scanning from where the index left off (`NextWriteOffset`)
2. For each unindexed block found:
- Validates the block header and checksum
- Writes the corresponding index entry
3. Updates the max contiguous height and max block height
4. Persists the updated index header
> **Reviewer comment (Contributor):** On "writes the corresponding index entry" vs. "persists the updated index header": this isn't clear. Do you mean that the former adds an index entry to the index file, while the latter updates the index file's header? Can you clarify?

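A minimal sketch of the recovery scan described above. Every helper name here is hypothetical, not the actual API; it only mirrors the four steps listed:

```go
const entryHeaderSize = 24 // block entry header size, per the data file format

// recoverUnindexed scans the data file from NextWriteOffset, indexing any
// valid blocks found past the point the index knew about.
func recoverUnindexed(idx *indexFile, data *os.File) error {
	info, err := data.Stat()
	if err != nil {
		return err
	}
	end := uint64(info.Size())
	offset := idx.header.NextWriteOffset
	for offset < end {
		eh, err := readEntryHeader(data, offset) // height, checksum, sizes
		if err != nil || !checksumOK(data, eh, offset) {
			break // stop at the first entry that fails validation
		}
		idx.putEntry(eh.Height, offset, eh.Size, eh.HeaderSize)
		offset += entryHeaderSize + uint64(eh.Size)
	}
	idx.updateMaxHeights() // max contiguous height and max block height
	return idx.persistHeader()
}
```
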

## Usage

### Creating a Database

```go
import "github.com/ava-labs/avalanchego/x/blockdb"

config := blockdb.DefaultDatabaseConfig()
db, err := blockdb.New(
	"/path/to/index", // Index directory
	"/path/to/data",  // Data directory
	config,
	logger,
)
if err != nil {
	fmt.Println("Error creating database:", err)
	return
}
defer db.Close()
```

> **Reviewer comment (Contributor):** I think two parameters of the same type is usually confusing. Why not merge them into the config struct? Fewer parameters that way, and less chance of conflating the order of the index and data directories.

### Writing and Reading Blocks

```go
// Write a block with header size
height := uint64(100)
blockData := []byte("header:block data")
headerSize := uint32(7) // First 7 bytes are the header
err := db.WriteBlock(height, blockData, headerSize)
if err != nil {
	fmt.Println("Error writing block:", err)
	return
}

// Read a block
blockData, err = db.ReadBlock(height)
if err != nil {
	fmt.Println("Error reading block:", err)
	return
}
if blockData == nil {
	// Block doesn't exist at this height
	return
}

// Read block components separately
headerData, err := db.ReadHeader(height)
if err != nil {
	fmt.Println("Error reading header:", err)
	return
}
bodyData, err := db.ReadBody(height)
if err != nil {
	fmt.Println("Error reading body:", err)
	return
}
```

## TODO

- [ ] Compress data files to reduce storage size
- [ ] Split data across multiple files when `MaxDataFileSize` is reached
- [ ] Implement a block cache for recently accessed blocks
- [ ] Use a buffered pool to avoid allocations on reads and writes
- [ ] Add tests for core functionality
- [ ] Add performance benchmarks
- [ ] Consider supporting missing data files (currently we error if any data files are missing)