Allow producer config alterations on reload (#710)

mensfeld · web-flow · commit 503939bedb1a · 2025-10-13T15:14:11.000+02:00
* remarks

* do not include lock for integrations

* allow reload alterations

* bump version

* remarks

* remove reloaded
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
 # WaterDrop changelog
 
-## Unreleased
+## 2.8.13 (Unreleased)
 - [Enhancement] Make `fenced` error skip-reload behavior configurable via new `non_reloadable_errors` setting (defaults to `[:fenced]` for backward compatibility).
+- [Enhancement] Add `producer.reload` event allowing config modification before reload to escape fencing loops (#706).
+- [Enhancement] Do not eagerly initialize the new client instance on reload.
 
 ## 2.8.12 (2025-10-10)
 - [Enhancement] Introduce `reload_on_idempotent_fatal_error` to automatically reload librdkafka producer after fatal errors on idempotent (non-transactional) producers.
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    waterdrop (2.8.12)
+    waterdrop (2.8.13)
       karafka-core (>= 2.4.9, < 3.0.0)
       karafka-rdkafka (>= 0.20.0)
       zeitwerk (~> 2.3)
diff --git a/lib/waterdrop/instrumentation/notifications.rb b/lib/waterdrop/instrumentation/notifications.rb
@@ -10,6 +10,7 @@ class Notifications < ::Karafka::Core::Monitoring::Notifications
         producer.connected
         producer.closing
         producer.closed
+        producer.reload
         producer.reloaded
         producer.disconnecting
         producer.disconnected
diff --git a/lib/waterdrop/producer.rb b/lib/waterdrop/producer.rb
@@ -502,7 +502,7 @@ def produce(message)
         )
 
         # Attempt to reload the producer
-        idempotent_reload_client_on_fatal_error(@idempotent_fatal_error_attempts)
+        idempotent_reload_client_on_fatal_error(@idempotent_fatal_error_attempts, e)
 
         # Wait before retrying to avoid rapid reload loops
         sleep(@config.wait_backoff_on_idempotent_fatal_error / 1_000.0)
@@ -559,5 +559,15 @@ def produce(message)
     ensure
       @operations_in_progress.decrement
     end
+
+    # Reloads the client: flushes, purges and closes it, drops it and marks us as configured
+    # @note Internal use only. Callers are expected to already hold the appropriate mutex
+    def reload!
+      @client.flush(current_variant.max_wait_timeout)
+      purge
+      @client.close
+      @client = nil
+      @status.configured!
+    end
   end
 end
diff --git a/lib/waterdrop/producer/idempotence.rb b/lib/waterdrop/producer/idempotence.rb
@@ -50,21 +50,34 @@ def idempotent_retryable?
       # old client, and create a new client instance to continue operations.
       #
       # @param attempt [Integer] the current reload attempt number
+      # @param error [Rdkafka::RdkafkaError] the error that triggered the reload
       #
       # @note This is only called for idempotent, non-transactional producers when
       #   `reload_on_idempotent_fatal_error` is enabled
       # @note After reload, the producer will automatically retry the failed operation
-      def idempotent_reload_client_on_fatal_error(attempt)
+      def idempotent_reload_client_on_fatal_error(attempt, error)
         @operating_mutex.synchronize do
+          # Emit producer.reload event before reload
+          # Users can subscribe to this event and modify event[:caller].config.kafka to change
+          # producer config
+          @monitor.instrument(
+            'producer.reload',
+            producer_id: id,
+            error: error,
+            attempt: attempt,
+            caller: self
+          )
+
+          # Clear cached state that depends on config
+          # We always clear @idempotent as the config might have been modified via the event
+          @idempotent = nil
+
           @monitor.instrument(
             'producer.reloaded',
             producer_id: id,
             attempt: attempt
           ) do
-            @client.flush(current_variant.max_wait_timeout)
-            purge
-            @client.close
-            @client = Builder.new.call(self, @config)
+            reload!
           end
         end
       end
diff --git a/lib/waterdrop/producer/transactions.rb b/lib/waterdrop/producer/transactions.rb
@@ -285,20 +285,37 @@ def transactional_reload_client_if_needed(error)
 
         # Check if we've exceeded max reload attempts
         return unless transactional_retryable?
+        # Transactional errors bubble up, so when fencing is not configured as non-reloadable,
+        # two layers of error handling may each attempt to reload the client, causing a double
+        # reload. We halt here if we are already in the configured state, as it means that we
+        # have already reloaded and are not even connected yet
+        return if @status.configured?
 
         # Increment attempts before reload
         @transaction_fatal_error_attempts += 1
 
         @operating_mutex.synchronize do
+          # Emit producer.reload event before reload
+          # Users can subscribe to this event and modify event[:caller].config.kafka to change
+          # producer config. This is useful for escaping fencing loops by changing transactional.id
+          @monitor.instrument(
+            'producer.reload',
+            producer_id: id,
+            error: rd_error,
+            attempt: @transaction_fatal_error_attempts,
+            caller: self
+          )
+
+          # Clear cached state that depends on config
+          # We always clear @transactional as the config might have been modified via the event
+          @transactional = nil
+
           @monitor.instrument(
             'producer.reloaded',
             producer_id: id,
             attempt: @transaction_fatal_error_attempts
           ) do
-            @client.flush(current_variant.max_wait_timeout)
-            purge
-            @client.close
-            @client = Builder.new.call(self, @config)
+            reload!
           end
         end
 
diff --git a/lib/waterdrop/version.rb b/lib/waterdrop/version.rb
@@ -3,5 +3,5 @@
 # WaterDrop library
 module WaterDrop
   # Current WaterDrop version
-  VERSION = '2.8.12'
+  VERSION = '2.8.13'
 end
diff --git a/spec/integrations/fatal_error_recovery/README.md b/spec/integrations/fatal_error_recovery/README.md
diff --git a/spec/integrations/fatal_error_recovery/fencing_escape_with_reload_event_spec.rb b/spec/integrations/fatal_error_recovery/fencing_escape_with_reload_event_spec.rb
@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+
+# Integration test demonstrating how to escape producer fencing by using the producer.reload
+# event to modify transactional.id. This test shows the recommended pattern for handling fencing
+# in production environments.
+#
+# When a producer gets fenced, simply reloading with the same transactional.id results in a
+# fenced -> reload -> fenced loop. By subscribing to the producer.reload event and changing the
+# transactional.id, the producer can escape fencing and continue operating with a new identity.
+
+require 'waterdrop'
+require 'logger'
+require 'securerandom'
+
+BOOTSTRAP_SERVERS = ENV.fetch('BOOTSTRAP_SERVERS', '127.0.0.1:9092')
+# Both producers initially share this transactional.id
+TRANSACTIONAL_ID = "fence-escape-test-#{SecureRandom.uuid}".freeze
+
+# Track instrumentation events
+reload_events = []
+error_events = []
+
+# Create first producer with reload enabled
+producer1 = WaterDrop::Producer.new do |config|
+  config.kafka = {
+    'bootstrap.servers': BOOTSTRAP_SERVERS,
+    'transactional.id': TRANSACTIONAL_ID,
+    'transaction.timeout.ms': 30_000,
+    'message.timeout.ms': 30_000
+  }
+  config.max_wait_timeout = 5_000
+  config.logger = Logger.new($stdout, level: Logger::INFO)
+  config.reload_on_transaction_fatal_error = true
+  # IMPORTANT: Remove :fenced from non_reloadable_errors to allow reload attempts
+  config.non_reloadable_errors = []
+  config.max_attempts_on_transaction_fatal_error = 5
+  config.wait_backoff_on_transaction_fatal_error = 100
+end
+
+# Subscribe to producer.reload event and modify transactional.id to escape fencing
+producer1.monitor.subscribe('producer.reload') do |event|
+  config = event[:caller].config
+  config.kafka[:'transactional.id'] = "#{TRANSACTIONAL_ID}-recovered-#{Time.now.to_i}"
+end
+
+producer1.monitor.subscribe('producer.reloaded') { |event| reload_events << event }
+producer1.monitor.subscribe('error.occurred') { |event| error_events << event }
+
+topic_name = "it-fence-escape-#{SecureRandom.hex(6)}"
+
+# First transaction with producer1
+producer1.transaction do
+  producer1.produce_sync(topic: topic_name, payload: 'message1')
+end
+
+# Create second producer with same ID to cause fencing
+producer2 = WaterDrop::Producer.new do |config|
+  config.kafka = {
+    'bootstrap.servers': BOOTSTRAP_SERVERS,
+    'transactional.id': TRANSACTIONAL_ID,
+    'transaction.timeout.ms': 30_000,
+    'message.timeout.ms': 30_000
+  }
+  config.max_wait_timeout = 5_000
+  config.logger = Logger.new($stdout, level: Logger::INFO)
+end
+
+# This transaction will fence producer1
+producer2.transaction do
+  producer2.produce_sync(topic: topic_name, payload: 'message2')
+end
+
+# This should trigger a reload with a changed transactional.id; fencing may still be re-raised
+begin
+  producer1.transaction do
+    producer1.produce_sync(topic: topic_name, payload: 'message3-recovered')
+  end
+rescue Rdkafka::RdkafkaError => e
+  # This is expected: the user needs to retry the transaction themselves if desired.
+  # Reloading does not prevent the fencing error from being re-raised in transactional mode.
+  exit(1) unless e.code == :fenced
+end
+
+10.times do
+  producer1.transaction do
+    producer1.produce_sync(topic: topic_name, payload: 'message3-recovered')
+  end
+
+  producer2.transaction do
+    producer2.produce_sync(topic: topic_name, payload: 'message2')
+  end
+end
+
+producer1.close
+producer2.close
+
+# Verify results
+# Expect exactly one reload (not multiple, as in the reload-loop case)
+success = reload_events.size == 1 && reload_events.first[:attempt] == 1
+
+exit(success ? 0 : 1)
diff --git a/spec/integrations/fatal_error_recovery/fencing_with_reload_loop_spec.rb b/spec/integrations/fatal_error_recovery/fencing_with_reload_loop_spec.rb
@@ -66,10 +66,14 @@ def configure_producer
     producer2.produce_sync(topic: topic_name, payload: 'message2')
   end
 
-  # This will trigger reload loop: fenced -> reload -> fenced -> reload...
   producer1.transaction do
     producer1.produce_sync(topic: topic_name, payload: 'message3')
   end
+
+  # This will trigger reload loop: fenced -> reload -> fenced -> reload...
+  producer2.transaction do
+    producer2.produce_sync(topic: topic_name, payload: 'message2')
+  end
 rescue Rdkafka::RdkafkaError => e
   exit(1) unless e.code == :fenced
 
diff --git a/spec/lib/waterdrop/producer/idempotence_spec.rb b/spec/lib/waterdrop/producer/idempotence_spec.rb