Fix autoscaler returning fatal error on GOAWAY if no scaling seen #973

Merged: merged 3 commits on Aug 15, 2025

Changes from all commits

2 changes: 2 additions & 0 deletions client/src/lib.rs

@@ -89,6 +89,8 @@ static TEMPORAL_NAMESPACE_HEADER_KEY: &str = "temporal-namespace";
 
 /// Key used to communicate when a GRPC message is too large
 pub static MESSAGE_TOO_LARGE_KEY: &str = "message-too-large";
+/// Key used to indicate an error was returned by the retryer because of the short-circuit predicate
+pub static ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT: &str = "short-circuit";
 
 /// The server times out polls after 60 seconds. Set our timeout to be slightly beyond that.
 const LONG_POLL_TIMEOUT: Duration = Duration::from_secs(70);
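
For orientation (illustrative, not part of the diff): downstream code can treat the new key as a marker on the returned tonic::Status. A minimal sketch, assuming err is the Status handed back by the retrying client:

    if err.metadata().contains_key(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT) {
        // The retryer forwarded this error early because the short-circuit
        // predicate matched, not because the call itself was fatal.
    }
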
15 changes: 12 additions & 3 deletions client/src/retry.rs

@@ -1,6 +1,6 @@
 use crate::{
-    Client, IsWorkerTaskLongPoll, MESSAGE_TOO_LARGE_KEY, NamespacedClient, NoRetryOnMatching,
-    Result, RetryConfig, raw::IsUserLongPoll,
+    Client, ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT, IsWorkerTaskLongPoll, MESSAGE_TOO_LARGE_KEY,
+    NamespacedClient, NoRetryOnMatching, Result, RetryConfig, raw::IsUserLongPoll,
 };
 use backoff::{Clock, SystemClock, backoff::Backoff, exponential::ExponentialBackoff};
 use futures_retry::{ErrorHandler, FutureRetry, RetryPolicy};
@@ -214,6 +214,10 @@ where
         if let Some(sc) = self.retry_short_circuit.as_ref()
             && (sc.predicate)(&e)
         {
+            e.metadata_mut().insert(
+                ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT,
+                tonic::metadata::MetadataValue::from(0),
+            );
             return RetryPolicy::ForwardError(e);
         }
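
For context, a sketch of how such a short-circuit might be installed; the exact shape of NoRetryOnMatching is assumed here from its usage above, where (sc.predicate)(&e) is applied to a &tonic::Status:

    // Hypothetical setup: surface ResourceExhausted to the caller immediately
    // (e.g., so a poller autoscaler can react to it) rather than retrying it.
    let short_circuit = NoRetryOnMatching {
        predicate: |s: &tonic::Status| s.code() == tonic::Code::ResourceExhausted,
    };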

@@ -441,7 +445,12 @@ mod tests {
            FixedClock(Instant::now()),
        );
        let result = err_handler.handle(1, Status::new(Code::ResourceExhausted, "leave me alone"));
-       assert_matches!(result, RetryPolicy::ForwardError(_))
+       let e = assert_matches!(result, RetryPolicy::ForwardError(e) => e);
+       assert!(
+           e.metadata()
+               .get(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT)
+               .is_some()
+       );
    }

    #[tokio::test]
77 changes: 63 additions & 14 deletions core/src/pollers/poll_buffer.rs

@@ -18,7 +18,7 @@ use std::{
     },
     time::Duration,
 };
-use temporal_client::NoRetryOnMatching;
+use temporal_client::{ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT, NoRetryOnMatching};
 use temporal_sdk_core_api::worker::{
     ActivitySlotKind, NexusSlotKind, PollerBehavior, SlotKind, WorkflowSlotKind,
 };
@@ -538,20 +538,27 @@ impl PollScalerReportHandle {
                 }
             }
             Err(e) => {
-                // We should only see (and react to) errors in autoscaling mode
-                if matches!(self.behavior, PollerBehavior::Autoscaling { .. })
-                    && self.ever_saw_scaling_decision.load(Ordering::Relaxed)
-                {
-                    debug!("Got error from server while polling: {:?}", e);
-                    if e.code() == Code::ResourceExhausted {
-                        // Scale down significantly for resource exhaustion
-                        self.change_target(usize::saturating_div, 2);
-                    } else {
-                        // Other codes that would normally have made us back off briefly can
-                        // reclaim this poller
-                        self.change_target(usize::saturating_sub, 1);
-                    }
-                    return false;
+                if matches!(self.behavior, PollerBehavior::Autoscaling { .. }) {
+                    // We should only react to errors in autoscaling mode if we saw a scaling
+                    // decision
+                    if self.ever_saw_scaling_decision.load(Ordering::Relaxed) {
+                        debug!("Got error from server while polling: {:?}", e);
+                        if e.code() == Code::ResourceExhausted {
+                            // Scale down significantly for resource exhaustion
+                            self.change_target(usize::saturating_div, 2);
+                        } else {
+                            // Other codes that would normally have made us back off briefly can
+                            // reclaim this poller
+                            self.change_target(usize::saturating_sub, 1);
+                        }
+                    }
+                    // Only propagate errors out if they weren't because of the short-circuiting
+                    // logic. IE: We don't want to fail callers because we said we wanted to know
+                    // about ResourceExhausted errors, but we haven't seen a scaling decision yet,
+                    // so we're not reacting to errors, only propagating them.
+                    return !e
+                        .metadata()
+                        .contains_key(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT);
Comment on lines +555 to +561

Member Author: This is the fix.

Member: Can I get some background here? I expect Tonic to hide go-away and implicitly handle reconnects. We send a go-away every 5m on average. Is there a situation where a regular non-worker client user may see some go-away?

Member Author: There's a bug in hyper or tonic that was worked around here: #811

But the short-circuit that the autoscaler turns on, so it can scale better on otherwise non-fatal errors, makes this come through as fatal - so it gets ignored specifically here.

Member (@cretz, Aug 14, 2025): (Marking approved anyway, just trying to understand some strangeness here.)

So IIUC, the server by default sends a GoAway after 5m (i.e., a soft connection close telling you to stop making more calls once whatever is in flight finishes) and then hard-closes the TCP connection after 7m (because 2m between soft and hard close is enough time for a properly behaving client to never hit this).

So somehow we're sending RPC calls even after the soft close and therefore hitting the 7m limit? If that is the case, there can be data loss in a rare/racy way if the hard TCP death occurs during a gRPC call (in our case it'd be a task timeout, because the server might send us a task and then close the connection). Or maybe Tonic is the one that's eagerly returning a Code::Cancelled w/ "connection closed" before it ever even makes the call? Obviously not important if it all works today, but it is a bit confusing to me.

Member Author: I think the latter is the explanation, but yeah, I agree it's confusing.

                 }
             }
         }
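
To make the failure shape concrete (an assumption based on the review discussion above, not code from this PR): the GOAWAY-era error reportedly surfaces client-side as Code::Cancelled, which the retryer now stamps before forwarding, and the new unit test below simulates exactly this:

    // Sketch: a GOAWAY-style error as the poll buffer would now see it after
    // the retryer stamped it; with the fix, this is swallowed rather than
    // failing the caller. (The "connection closed" message is assumed.)
    let mut st = tonic::Status::cancelled("connection closed");
    st.metadata_mut()
        .insert(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT, 1.into());
    assert!(st.metadata().contains_key(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT));
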
@@ -748,4 +755,46 @@ mod tests {
         pb.poll().await.unwrap().unwrap();
         pb.shutdown().await;
     }
+
+    #[tokio::test]
+    async fn autoscale_wont_fail_caller_on_short_circuited_error() {
+        let mut mock_client = mock_manual_worker_client();
+        mock_client
+            .expect_poll_workflow_task()
+            .times(1)
+            .returning(move |_, _| {
+                async {
+                    let mut st = tonic::Status::cancelled("whatever");
+                    st.metadata_mut()
+                        .insert(ERROR_RETURNED_DUE_TO_SHORT_CIRCUIT, 1.into());
+                    Err(st)
+                }
+                .boxed()
+            });
+        mock_client
+            .expect_poll_workflow_task()
+            .times(1)
+            .returning(move |_, _| async { Ok(Default::default()) }.boxed());
+
+        let pb = LongPollBuffer::new_workflow_task(
+            Arc::new(mock_client),
+            "sometq".to_string(),
+            None,
+            PollerBehavior::Autoscaling {
+                minimum: 1,
+                maximum: 1,
+                initial: 1,
+            },
+            fixed_size_permit_dealer(1),
+            CancellationToken::new(),
+            None::<fn(usize)>,
+            WorkflowTaskOptions {
+                wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))),
+            },
+        );
+
+        // Should not see error, unwraps should get empty response
+        pb.poll().await.unwrap().unwrap();
+        pb.shutdown().await;
+    }
 }
24 changes: 14 additions & 10 deletions tests/integ_tests/client_tests.rs

@@ -110,7 +110,7 @@ struct GenericService<F> {
 }
 impl<F> Service<tonic::codegen::http::Request<Body>> for GenericService<F>
 where
-    F: FnMut() -> Response<Body>,
+    F: FnMut() -> BoxFuture<'static, Response<Body>>,
 {
     type Response = Response<Body>;
     type Error = Infallible;
@@ -133,7 +133,7 @@ where
         )
         .unwrap();
         let r = (self.response_maker)();
-        async move { Ok(r) }.boxed()
+        async move { Ok(r.await) }.boxed()
     }
 }
 impl<F> NamedService for GenericService<F> {
@@ -144,12 +144,12 @@ struct FakeServer {
     addr: std::net::SocketAddr,
     shutdown_tx: oneshot::Sender<()>,
     header_rx: tokio::sync::mpsc::UnboundedReceiver<String>,
-    server_handle: tokio::task::JoinHandle<()>,
+    pub server_handle: tokio::task::JoinHandle<()>,
 }
 
 async fn fake_server<F>(response_maker: F) -> FakeServer
 where
-    F: FnMut() -> Response<Body> + Clone + Send + Sync + 'static,
+    F: FnMut() -> BoxFuture<'static, Response<Body>> + Clone + Send + Sync + 'static,
Member Author: I made this change while trying to get the fake server to create the goaway. Couldn't do it, but it's probably useful for the fake responses to be async anyway.

 {
     let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
     let (header_tx, header_rx) = tokio::sync::mpsc::unbounded_channel();
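
One side benefit of this signature change (illustrative usage, not from the diff): response makers can now do async work before replying, e.g., injecting artificial latency into the fake server:

    // Hypothetical: delay each fake response by 50ms before answering.
    let fs = fake_server(|| {
        async {
            tokio::time::sleep(std::time::Duration::from_millis(50)).await;
            Response::new(Body::empty())
        }
        .boxed()
    })
    .await;
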
@@ -191,7 +191,7 @@ impl FakeServer {
 
 #[tokio::test]
 async fn timeouts_respected_one_call_fake_server() {
-    let mut fs = fake_server(|| Response::new(Body::empty())).await;
+    let mut fs = fake_server(|| async { Response::new(Body::empty()) }.boxed()).await;
     let header_rx = &mut fs.header_rx;
 
     let mut opts = get_integ_server_options();
@@ -260,7 +260,11 @@ async fn non_retryable_errors() {
         Code::Unauthenticated,
         Code::Unimplemented,
     ] {
-        let mut fs = fake_server(move || Status::new(code, "bla").into_http()).await;
+        let mut fs = fake_server(move || {
+            let s = Status::new(code, "bla").into_http();
+            async { s }.boxed()
+        })
+        .await;
 
         let mut opts = get_integ_server_options();
         let uri = format!("http://localhost:{}", fs.addr.port())
@@ -292,13 +296,13 @@ async fn retryable_errors() {
     {
         let count = Arc::new(AtomicUsize::new(0));
         let mut fs = fake_server(move || {
-            dbg!("Making resp");
             let prev = count.fetch_add(1, Ordering::Relaxed);
-            if prev < 3 {
+            let r = if prev < 3 {
                 Status::new(code, "bla").into_http()
             } else {
                 make_ok_response(RespondActivityTaskCanceledResponse::default())
-            }
+            };
+            async { r }.boxed()
         })
         .await;
 
@@ -335,7 +339,7 @@ async fn namespace_header_attached_to_relevant_calls() {
         .add_service(GenericService {
             header_to_parse: "Temporal-Namespace",
             header_tx,
-            response_maker: || Response::new(Body::empty()),
+            response_maker: || async { Response::new(Body::empty()) }.boxed(),
         })
         .serve_with_incoming_shutdown(
             tokio_stream::wrappers::TcpListenerStream::new(listener),
13 changes: 13 additions & 0 deletions tests/integ_tests/workflow_tests/activities.rs

@@ -11,6 +11,7 @@ use temporal_sdk::{
     ActContext, ActExitValue, ActivityError, ActivityOptions, CancellableFuture, WfContext,
     WfExitValue, WorkflowResult,
 };
+use temporal_sdk_core_api::worker::PollerBehavior;
 use temporal_sdk_core_protos::{
     DEFAULT_ACTIVITY_TYPE, TaskToken,
     coresdk::{
@@ -1066,11 +1067,23 @@ async fn activity_can_be_cancelled_by_local_timeout() {
 
 #[tokio::test]
 #[ignore] // Runs forever, used to manually attempt to repro spurious activity completion rpc errs
+// Unfortunately there is no way to unit test this as tonic doesn't publicly expose the necessary
+// machinery to construct the right kind of error.
 async fn long_activity_timeout_repro() {
     let wf_name = "long_activity_timeout_repro";
     let mut starter = CoreWfStarter::new(wf_name);
     starter
         .worker_config
+        .workflow_task_poller_behavior(PollerBehavior::Autoscaling {
+            minimum: 1,
+            maximum: 10,
+            initial: 5,
+        })
+        .activity_task_poller_behavior(PollerBehavior::Autoscaling {
+            minimum: 1,
+            maximum: 10,
+            initial: 5,
+        })
         .local_timeout_buffer_for_activities(Duration::from_secs(0));
     let mut worker = starter.worker().await;
     worker.register_wf(wf_name.to_owned(), |ctx: WfContext| async move {