ServiceNow · AlexPiche · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025
diff --git a/conf/rewards/tir_pure_success.yaml b/conf/rewards/tir_pure_success.yaml
@@ -0,0 +1,9 @@
+correct_answer: 1.0
+wrong_answer: -0.15
+no_answer: -0.25
+unparsable: -0.25
+execution_failure: -0.05
+successful_code_execution: 0.0
+timeout_penalty: -0.25
+buffer_tokens: 0
+iteration_penalty: -0.005
diff --git a/conf/tir.yaml b/conf/tir.yaml
@@ -6,97 +6,110 @@ save_tapes: true
 
 llm:
   parameters:
-    max_tokens: 3000
+    max_tokens: 512  # Reduced from 3000 to help stay within context limits
     temperature: 0.2
+    stop:
+      - "```output"
+
+test_llm:
+  parameters:
+    max_tokens: 512  # Reduced from 1024 to help stay within context limits
+    temperature: 0.2
+    stop:
+      - "```output"
 
 actor:
   rollout_policy: pipelinerl.domains.tir.rollouts.generate_tir_rollout
-  # TIR mode: 'fast' (single candidate + iterative reasoning) or 'sc_tir' (multiple candidates + majority voting)
-  mode: fast  # Default to fast mode
-  # SC-TIR parameters (only used when mode=sc_tir)
-  num_candidates: 4  # Width: number of solution candidates to generate  
-  max_reasoning_steps: 8  # Depth: max reasoning iterations per candidate
-  system_prompt: |-
-    You are a mathematical problem-solving assistant. Use Python code to solve problems step by step.
-
-    Instructions:
-    1. Read the problem carefully
-    2. Write Python code to solve it, showing your work
-    3. Execute the code and analyze the output
-    4. If needed, write more code to refine your solution
-    5. Once you have the final answer, present it in \boxed{} format
-
-    Code format:
-    ```python
-    # Your Python code here
-    ```
-
-    The code will be executed and you'll see the output like:
-    ```output
-    # Execution results will appear here
-    ```
-
-    You can write multiple code blocks if needed. Use libraries like sympy, numpy, math as needed.
+  max_reasoning_steps: 6  # Reduced from 8 to encourage concise reasoning
+  system_prompt: ""
+  # TODO: remove debug code
+  llm_max_rollouts: 1
+
+  # system_prompt: |-
+  #   You are a mathematical problem-solving assistant. Use Python code to solve problems step by step.
+
+  #   Instructions:
+  #   1. Read the problem carefully
+  #   2. Write Python code to solve it, showing your work
+  #   3. Execute the code and analyze the output
+  #   4. If needed, write more code to refine your solution
+  #   5. Once you have the final answer, present it in \boxed{} format
+
+  #   Code format:
+  #   ```python
+  #   # Your Python code here
+  #   ```
+
+  #   The code will be executed and you'll see the output like:
+  #   ```output
+  #   # Execution results will appear here
+  #   ```
+
+  #   You can write multiple code blocks if needed. Use libraries like sympy, numpy, math as needed.
 
-    Always end with your final answer in the format: \boxed{answer}
-
-    Examples:
-
-    Problem: What is 2 + 3?
-    ```python
-    result = 2 + 3
-    print(f"2 + 3 = {result}")
-    ```
-    ```output
-    2 + 3 = 5
-    ```
-    \boxed{5}
-
-    Problem: Solve x^2 - 5x + 6 = 0
-    ```python
-    from sympy import symbols, solve, expand
-    x = symbols('x')
-    equation = x**2 - 5*x + 6
-    solutions = solve(equation, x)
-    print(f"Solutions: {solutions}")
+  #   Always end with your final answer in the format: \boxed{answer}
+
+  #   Examples:
+
+  #   Problem: What is 2 + 3?
+  #   ```python
+  #   result = 2 + 3
+  #   print(f"2 + 3 = {result}")
+  #   ```
+  #   ```output
+  #   2 + 3 = 5
+  #   ```
+  #   \boxed{5}
+
+  #   Problem: Solve x^2 - 5x + 6 = 0
+  #   ```python
+  #   from sympy import symbols, solve, expand
+  #   x = symbols('x')
+  #   equation = x**2 - 5*x + 6
+  #   solutions = solve(equation, x)
+  #   print(f"Solutions: {solutions}")
 
-    # Verify by substitution
-    for sol in solutions:
-        check = sol**2 - 5*sol + 6
-        print(f"x = {sol}: {sol}^2 - 5*{sol} + 6 = {check}")
-    ```
-    ```output
-    Solutions: [2, 3]
-    x = 2: 2^2 - 5*2 + 6 = 0
-    x = 3: 3^2 - 5*3 + 6 = 0
-    ```
-    The solutions are x = 2 and x = 3.
-    \boxed{2, 3}
-  task_template: |-
-    {task}
+  #   # Verify by substitution
+  #   for sol in solutions:
+  #       check = sol**2 - 5*sol + 6
+  #       print(f"x = {sol}: {sol}^2 - 5*{sol} + 6 = {check}")
+  #   ```
+  #   ```output
+  #   Solutions: [2, 3]
+  #   x = 2: 2^2 - 5*2 + 6 = 0
+  #   x = 3: 3^2 - 5*3 + 6 = 0
+  #   ```
+  #   The solutions are x = 2 and x = 3.
+  #   \boxed{2, 3}
+  # task_template: |-
+  #   {task}
 
 model_path: /mnt/llmd/base_models/AI-MO-NuminaMath-7B-TIR
 
-output_dir: results/tir/${now:%Y-%m-%d}/${now:%H-%M-%S}
+output_dir: /mnt/llmd/results/exps/rafa/tir/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 world:
   env_replicas: 1
 
-max_loops: 10  # extra iterations
+max_loops: 25  # let the agent handle its own termination
 
 environment:
   _target_: pipelinerl.domains.tir.environment.MCPPythonEnvironment
 
 agent:
   _target_: pipelinerl.domains.tir.agent.TIRMathAgent
   system_prompt: ${actor.system_prompt}
+  max_reasoning_pairs: 3  # Number of recent code-execution pairs to keep in context
+  max_code_chars: 800     # Maximum characters of code per step
+  max_output_chars: 2000  # Maximum characters of execution output per step
 
 dataset_loader: pipelinerl.domains.tir.datasets.load_datasets
 
 train_dataset_names:
-  - math_train
+  - open_reasoner_zero_57k
+  - open_reasoner_zero_extended_72k 
 
 test_dataset_names:
-  # - math_test
   - aime_2024
-  # - amc_2023
+  - amc_2023
+  - math_500
diff --git a/conf/tir_sc.yaml b/conf/tir_sc.yaml
diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py
@@ -15,14 +15,14 @@
 import aiohttp
 import hydra
 import uvloop
-from omegaconf import DictConfig
+from omegaconf import DictConfig, OmegaConf
 from pydantic import BaseModel, Field
 from tapeagents.llms import TrainableLLM
 from typing import Dict, List
 
 import wandb
 from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb
-from pipelinerl.rollouts import RolloutResult
+from pipelinerl.rollouts import RolloutResult, BaseMetrics
 from pipelinerl.shared_memory_array import SharedMemoryQueue
 from pipelinerl.state import TrainerState
 from pipelinerl.streams import (
@@ -257,6 +257,8 @@ def rollout_maker_entrypoint(
 
 
 def random_iter(problems: list):
+    if not problems:  # evaluated lazily: a generator body only runs on the first next()
+        raise ValueError("Cannot iterate over empty problems list. No data was loaded.")
     while True:
         yield random.sample(problems, 1)[0]
 
@@ -348,6 +350,7 @@ def update_stats(self, rollout_results: List[RolloutResult]):
             self.latency_list.append(result.latency)
             self.model_versions_list.append(result.model_version)
             domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result) 
+            assert isinstance(result.metrics, BaseMetrics), "Metrics should be an instance of BaseMetrics"
             all_metrics = result.metrics.model_dump() | domain_agnostic_metrics
             for k, v in all_metrics.items():
                 if isinstance(v, list):
@@ -528,6 +531,24 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
                 for agg, sub_stats in calculate_stats(list_of_stats_per_metric_and_dataset).items():
                     stats[f"{dataset_name}/{metric_name}_{agg}"] = sub_stats
 
+        # Add clean dataset-specific pass rates for test evaluation
+        if not self.is_training and "success" in self.stats:
+            dataset_pass_rates = self._calculate_dataset_pass_rates()
+            stats.update(dataset_pass_rates)
+
+            # Log clean pass rates to console for easy viewing
+            if dataset_pass_rates:
+                logger.info("Dataset Pass Rates:")
+                for key, value in dataset_pass_rates.items():
+                    if key.startswith("pass_rate/"):
+                        dataset_name = key.replace("pass_rate/", "")
+                        logger.info(f"  {dataset_name}: {value:.1f}%")
+
+                # Debug: log all metrics being sent to wandb
+                logger.info("All dataset metrics being logged to wandb:")
+                for key, value in dataset_pass_rates.items():
+                    logger.info(f"  actor/{key}: {value}")
+
         stats |= (
             {
                 f"{split_name}{k}": v
@@ -549,7 +570,49 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
         if self.cfg.wandb.use_wandb:
             wandb.log({f"actor/{k}": v for k, v in stats.items()})
         stats_writer.write(stats)
-        self.init_stats()  # Reset stats for the next iteration
+
+        # Only reset stats for training (not test evaluation)
+        if self.is_training:
+            self.init_stats()  # Reset stats for the next iteration
+
+    def _calculate_dataset_pass_rates(self) -> Dict[str, float]:
+        """Return pass rate (%), solved and total counts per dataset from self.stats["success"]."""
+        pass_rates = {}
+
+        # Display names for known datasets; unmapped datasets keep their raw name.
+        dataset_name_mapping = {
+            "gsm8k_test": "GSM8k",
+            "gsm8k_train": "GSM8k",
+            "math_test": "MATH",
+            "math_train": "MATH",
+            "aime_2024": "AIME 2024",
+            "aime_2023": "AIME 2023",
+            "aime_2022": "AIME 2022",
+            "amc_2023": "AMC 2023",
+            "amc_2022": "AMC 2022",
+        }
+
+        success_stats = self.stats.get("success", {})
+
+        for dataset_name, group_results in success_stats.items():
+            # Flatten the per-group success lists; the group ids are not needed here.
+            all_successes = [
+                success for success_list in group_results.values() for success in success_list
+            ]
+
+            if all_successes:
+                # sum() counts successes, assuming values are 0/1 or bool -- TODO confirm
+                num_solved = sum(all_successes)
+
+                # NOTE(review): gsm8k/math train and test share a display name; the later one wins.
+                clean_name = dataset_name_mapping.get(dataset_name, dataset_name)
+                pass_rates[f"pass_rate/{clean_name}"] = (num_solved / len(all_successes)) * 100
+
+                # Raw counts behind the percentage, stored under the same display name.
+                pass_rates[f"problems_solved/{clean_name}"] = num_solved
+                pass_rates[f"problems_total/{clean_name}"] = len(all_successes)
+
+        return pass_rates
 
 
 def run_actor_loop(cfg: DictConfig):
@@ -592,7 +655,7 @@ def run_actor_loop(cfg: DictConfig):
             base_url=url,
             model_name=str(actor_model_path),
             tokenizer_name=str(actor_model_path),
-            parameters=cfg.llm.parameters,
+            parameters=OmegaConf.to_container(cfg.llm.parameters, resolve=True),
             use_cache=False,
             collect_logprobs=True,
             observe_llm_calls=False,
@@ -604,7 +667,7 @@ def run_actor_loop(cfg: DictConfig):
             base_url=url,
             model_name=str(actor_model_path),
             tokenizer_name=str(actor_model_path),
-            parameters=cfg.test_llm.parameters,
+            parameters=OmegaConf.to_container(cfg.test_llm.parameters, resolve=True),
             use_cache=False,
             collect_logprobs=True,
             observe_llm_calls=False,
@@ -648,13 +711,22 @@ def run_actor_loop(cfg: DictConfig):
             if last_regular_eval == -1
             else last_regular_eval + cfg.eval_every_n_versions
         )
-        if (
+
+        # In eval debug mode, run test evaluation immediately and only once
+        should_run_test_eval = False
+        if cfg.debug.mode == "eval" and test_dataset and test_loop_run is None and last_regular_eval == -1:
+            should_run_test_eval = True
+            logger.info("Eval debug mode: Running test evaluation immediately")
+        elif (
             cfg.eval_every_n_versions
             and not cfg.debug.mode
             and trainer_state.propagated_weight_version >= next_regular_eval
             and test_dataset
             and test_loop_run is None
         ):
+            should_run_test_eval = True
+
+        if should_run_test_eval:
             logger.info("Create test loop")
             test_loop_run = test_loop.run(
                 dataset=test_dataset,
@@ -672,6 +744,11 @@ def run_actor_loop(cfg: DictConfig):
                 last_regular_eval = current_eval
                 train_loop.is_scheduling_paused = False
                 logger.info("Test loop finished")
+
+                # In eval debug mode, exit after test evaluation completes
+                if cfg.debug.mode == "eval":
+                    logger.info("Eval debug mode: Test evaluation completed, exiting")
+                    break
 
         # 3. Keep running the training loop
         _ = next(train_loop_run)
diff --git a/pipelinerl/domains/math/load_datasets.py b/pipelinerl/domains/math/load_datasets.py
@@ -216,7 +216,7 @@ def load_datasets(dataset_names: List[str] | str | None) -> List[Tuple[str, Dict
 
     if "math_train" in dataset_names:
         # math_dataset = load_math("train")
-        dataset = load_dataset("hendrycks/competition_math", split="train", trust_remote_code=True)
+        dataset = load_dataset("hendrycks/competition_math", "default", split="train", trust_remote_code=True)
         samples = [s for s in process_math(dataset, "math_train") if s is not None]
         logger.info(f"Loading math train dataset: {len(samples)} samples")
         datasets += add_ids(samples)
@@ -260,7 +260,7 @@ def load_datasets(dataset_names: List[str] | str | None) -> List[Tuple[str, Dict
 
     if "math_test" in dataset_names:
         # math_dataset = load_math("test")
-        dataset = load_dataset("hendrycks/competition_math", split="test", trust_remote_code=True)
+        dataset = load_dataset("hendrycks/competition_math", "default", split="test", trust_remote_code=True)
         samples = [s for s in process_math(dataset, "math_test") if s is not None]
         logger.info(f"Loading math test dataset: {len(samples)} samples")
         datasets += add_ids(samples)

diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py
@@ -7,7 +7,7 @@
 from pipelinerl.rollouts import RolloutResult, BaseMetrics
 from pipelinerl.world import Job
 from tapeagents.core import Prompt
-from tapeagents.llms.trainable import TrainableLLM
+from tapeagents.llms import TrainableLLM
 
 from pipelinerl.async_llm import llm_async_generate, make_training_text
 from .verifier_api import verify_answer_rpc