Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions conf/rewards/tir_pure_success.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
correct_answer: 1.0
wrong_answer: -0.15
no_answer: -0.25
unparsable: -0.25
execution_failure: -0.05
successful_code_execution: 0.0
timeout_penalty: -0.25
buffer_tokens: 0
iteration_penalty: -0.005
147 changes: 80 additions & 67 deletions conf/tir.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,97 +6,110 @@ save_tapes: true

llm:
parameters:
max_tokens: 3000
max_tokens: 512 # Reduced from 3000 to help stay within context limits
temperature: 0.2
stop:
- "```output"

test_llm:
parameters:
max_tokens: 512 # Reduced from 1024 to help stay within context limits
temperature: 0.2
stop:
- "```output"

actor:
rollout_policy: pipelinerl.domains.tir.rollouts.generate_tir_rollout
# TIR mode: 'fast' (single candidate + iterative reasoning) or 'sc_tir' (multiple candidates + majority voting)
mode: fast # Default to fast mode
# SC-TIR parameters (only used when mode=sc_tir)
num_candidates: 4 # Width: number of solution candidates to generate
max_reasoning_steps: 8 # Depth: max reasoning iterations per candidate
system_prompt: |-
You are a mathematical problem-solving assistant. Use Python code to solve problems step by step.

Instructions:
1. Read the problem carefully
2. Write Python code to solve it, showing your work
3. Execute the code and analyze the output
4. If needed, write more code to refine your solution
5. Once you have the final answer, present it in \boxed{} format

Code format:
```python
# Your Python code here
```

The code will be executed and you'll see the output like:
```output
# Execution results will appear here
```

You can write multiple code blocks if needed. Use libraries like sympy, numpy, math as needed.
max_reasoning_steps: 6 # Reduced from 8 to encourage concise reasoning
system_prompt: ""
# TODO: remove debug code
llm_max_rollouts: 1

# system_prompt: |-
# You are a mathematical problem-solving assistant. Use Python code to solve problems step by step.

# Instructions:
# 1. Read the problem carefully
# 2. Write Python code to solve it, showing your work
# 3. Execute the code and analyze the output
# 4. If needed, write more code to refine your solution
# 5. Once you have the final answer, present it in \boxed{} format

# Code format:
# ```python
# # Your Python code here
# ```

# The code will be executed and you'll see the output like:
# ```output
# # Execution results will appear here
# ```

# You can write multiple code blocks if needed. Use libraries like sympy, numpy, math as needed.

Always end with your final answer in the format: \boxed{answer}

Examples:

Problem: What is 2 + 3?
```python
result = 2 + 3
print(f"2 + 3 = {result}")
```
```output
2 + 3 = 5
```
\boxed{5}

Problem: Solve x^2 - 5x + 6 = 0
```python
from sympy import symbols, solve, expand
x = symbols('x')
equation = x**2 - 5*x + 6
solutions = solve(equation, x)
print(f"Solutions: {solutions}")
# Always end with your final answer in the format: \boxed{answer}

# Examples:

# Problem: What is 2 + 3?
# ```python
# result = 2 + 3
# print(f"2 + 3 = {result}")
# ```
# ```output
# 2 + 3 = 5
# ```
# \boxed{5}

# Problem: Solve x^2 - 5x + 6 = 0
# ```python
# from sympy import symbols, solve, expand
# x = symbols('x')
# equation = x**2 - 5*x + 6
# solutions = solve(equation, x)
# print(f"Solutions: {solutions}")

# Verify by substitution
for sol in solutions:
check = sol**2 - 5*sol + 6
print(f"x = {sol}: {sol}^2 - 5*{sol} + 6 = {check}")
```
```output
Solutions: [2, 3]
x = 2: 2^2 - 5*2 + 6 = 0
x = 3: 3^2 - 5*3 + 6 = 0
```
The solutions are x = 2 and x = 3.
\boxed{2, 3}
task_template: |-
{task}
# # Verify by substitution
# for sol in solutions:
# check = sol**2 - 5*sol + 6
# print(f"x = {sol}: {sol}^2 - 5*{sol} + 6 = {check}")
# ```
# ```output
# Solutions: [2, 3]
# x = 2: 2^2 - 5*2 + 6 = 0
# x = 3: 3^2 - 5*3 + 6 = 0
# ```
# The solutions are x = 2 and x = 3.
# \boxed{2, 3}
# task_template: |-
# {task}

model_path: /mnt/llmd/base_models/AI-MO-NuminaMath-7B-TIR

output_dir: results/tir/${now:%Y-%m-%d}/${now:%H-%M-%S}
output_dir: /mnt/llmd/results/exps/rafa/tir/${now:%Y-%m-%d}/${now:%H-%M-%S}

world:
env_replicas: 1

max_loops: 10 # extra iterations
max_loops: 25 # let the agent handle its own termination

environment:
_target_: pipelinerl.domains.tir.environment.MCPPythonEnvironment

agent:
_target_: pipelinerl.domains.tir.agent.TIRMathAgent
system_prompt: ${actor.system_prompt}
max_reasoning_pairs: 3 # Number of recent code-execution pairs to keep in context
max_code_chars: 800 # Maximum characters of code per step
max_output_chars: 2000 # Maximum characters of execution output per step

dataset_loader: pipelinerl.domains.tir.datasets.load_datasets

train_dataset_names:
- math_train
- open_reasoner_zero_57k
- open_reasoner_zero_extended_72k

test_dataset_names:
# - math_test
- aime_2024
# - amc_2023
- amc_2023
- math_500
12 changes: 0 additions & 12 deletions conf/tir_sc.yaml

This file was deleted.

89 changes: 83 additions & 6 deletions pipelinerl/actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
import aiohttp
import hydra
import uvloop
from omegaconf import DictConfig
from omegaconf import DictConfig, OmegaConf
from pydantic import BaseModel, Field
from tapeagents.llms import TrainableLLM
from typing import Dict, List

import wandb
from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb
from pipelinerl.rollouts import RolloutResult
from pipelinerl.rollouts import RolloutResult, BaseMetrics
from pipelinerl.shared_memory_array import SharedMemoryQueue
from pipelinerl.state import TrainerState
from pipelinerl.streams import (
Expand Down Expand Up @@ -257,6 +257,8 @@ def rollout_maker_entrypoint(


def random_iter(problems: list):
    """Return an endless iterator of uniformly random picks from ``problems``.

    The emptiness check runs when this function is called, not when the
    returned iterator is first advanced, so a missing dataset is reported at
    the call site instead of deep inside the consuming loop.

    Args:
        problems: Non-empty list of items to sample from (with replacement).

    Returns:
        An infinite generator yielding one randomly chosen element per step.

    Raises:
        ValueError: If ``problems`` is empty.
    """
    if not problems:
        raise ValueError("Cannot iterate over empty problems list. No data was loaded.")

    def _endless():
        # Re-read ``problems`` on every draw so later in-place changes to the
        # list are still observed, as with the original generator.
        while True:
            yield random.choice(problems)

    return _endless()

Expand Down Expand Up @@ -348,6 +350,7 @@ def update_stats(self, rollout_results: List[RolloutResult]):
self.latency_list.append(result.latency)
self.model_versions_list.append(result.model_version)
domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result)
assert isinstance(result.metrics, BaseMetrics), "Metrics should be an instance of BaseMetrics"
all_metrics = result.metrics.model_dump() | domain_agnostic_metrics
for k, v in all_metrics.items():
if isinstance(v, list):
Expand Down Expand Up @@ -528,6 +531,24 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
for agg, sub_stats in calculate_stats(list_of_stats_per_metric_and_dataset).items():
stats[f"{dataset_name}/{metric_name}_{agg}"] = sub_stats

# Add clean dataset-specific pass rates for test evaluation
if not self.is_training and "success" in self.stats:
dataset_pass_rates = self._calculate_dataset_pass_rates()
stats.update(dataset_pass_rates)

# Log clean pass rates to console for easy viewing
if dataset_pass_rates:
logger.info("Dataset Pass Rates:")
for key, value in dataset_pass_rates.items():
if key.startswith("pass_rate/"):
dataset_name = key.replace("pass_rate/", "")
logger.info(f" {dataset_name}: {value:.1f}%")

# Debug: log all metrics being sent to wandb
logger.info("All dataset metrics being logged to wandb:")
for key, value in dataset_pass_rates.items():
logger.info(f" actor/{key}: {value}")

stats |= (
{
f"{split_name}{k}": v
Expand All @@ -549,7 +570,49 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
if self.cfg.wandb.use_wandb:
wandb.log({f"actor/{k}": v for k, v in stats.items()})
stats_writer.write(stats)
self.init_stats() # Reset stats for the next iteration

# Only reset stats for training (not test evaluation)
if self.is_training:
self.init_stats() # Reset stats for the next iteration

def _calculate_dataset_pass_rates(self) -> Dict[str, float]:
    """Compute per-dataset pass rates from the recorded ``success`` stats.

    Reads ``self.stats["success"]``, which is iterated here as
    ``{dataset_name: {group_id: [success, ...]}}``. For every dataset with at
    least one recorded outcome, three entries are produced, keyed by a
    display name:

    * ``pass_rate/<name>``       - percentage (0-100) of successful outcomes
    * ``problems_solved/<name>`` - sum of the success values
    * ``problems_total/<name>``  - number of recorded outcomes

    Several raw dataset names can share one display name (for example
    ``math_train`` and ``math_test`` both become ``MATH``). Their outcomes
    are pooled before the rate is computed, so one dataset no longer
    silently overwrites the other's metrics.

    Returns:
        Mapping of metric key to value. Empty when no successes were recorded.
    """
    # Raw dataset name -> human-friendly label used in the metric keys.
    dataset_name_mapping = {
        "gsm8k_test": "GSM8k",
        "gsm8k_train": "GSM8k",
        "math_test": "MATH",
        "math_train": "MATH",
        "aime_2024": "AIME 2024",
        "aime_2023": "AIME 2023",
        "aime_2022": "AIME 2022",
        "amc_2023": "AMC 2023",
        "amc_2022": "AMC 2022",
    }

    # Pool outcomes by display name first so colliding labels are merged.
    pooled: Dict[str, list] = {}
    for dataset_name, group_results in self.stats.get("success", {}).items():
        clean_name = dataset_name_mapping.get(dataset_name, dataset_name)
        bucket = pooled.setdefault(clean_name, [])
        # Group ids are irrelevant here; only the outcomes matter.
        for success_list in group_results.values():
            bucket.extend(success_list)

    pass_rates: Dict[str, float] = {}
    for clean_name, outcomes in pooled.items():
        if not outcomes:
            # Nothing recorded for this dataset: skip to avoid dividing by zero.
            continue
        solved = sum(outcomes)
        total = len(outcomes)
        pass_rates[f"pass_rate/{clean_name}"] = (solved / total) * 100
        pass_rates[f"problems_solved/{clean_name}"] = solved
        pass_rates[f"problems_total/{clean_name}"] = total

    return pass_rates


def run_actor_loop(cfg: DictConfig):
Expand Down Expand Up @@ -592,7 +655,7 @@ def run_actor_loop(cfg: DictConfig):
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
parameters=cfg.llm.parameters,
parameters=OmegaConf.to_container(cfg.llm.parameters, resolve=True),
use_cache=False,
collect_logprobs=True,
observe_llm_calls=False,
Expand All @@ -604,7 +667,7 @@ def run_actor_loop(cfg: DictConfig):
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
parameters=cfg.test_llm.parameters,
parameters=OmegaConf.to_container(cfg.test_llm.parameters, resolve=True),
use_cache=False,
collect_logprobs=True,
observe_llm_calls=False,
Expand Down Expand Up @@ -648,13 +711,22 @@ def run_actor_loop(cfg: DictConfig):
if last_regular_eval == -1
else last_regular_eval + cfg.eval_every_n_versions
)
if (

# In eval debug mode, run test evaluation immediately and only once
should_run_test_eval = False
if cfg.debug.mode == "eval" and test_dataset and test_loop_run is None and last_regular_eval == -1:
should_run_test_eval = True
logger.info("Eval debug mode: Running test evaluation immediately")
elif (
cfg.eval_every_n_versions
and not cfg.debug.mode
and trainer_state.propagated_weight_version >= next_regular_eval
and test_dataset
and test_loop_run is None
):
should_run_test_eval = True

if should_run_test_eval:
logger.info("Create test loop")
test_loop_run = test_loop.run(
dataset=test_dataset,
Expand All @@ -672,6 +744,11 @@ def run_actor_loop(cfg: DictConfig):
last_regular_eval = current_eval
train_loop.is_scheduling_paused = False
logger.info("Test loop finished")

# In eval debug mode, exit after test evaluation completes
if cfg.debug.mode == "eval":
logger.info("Eval debug mode: Test evaluation completed, exiting")
break

# 3. Keep running the training loop
_ = next(train_loop_run)
4 changes: 2 additions & 2 deletions pipelinerl/domains/math/load_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def load_datasets(dataset_names: List[str] | str | None) -> List[Tuple[str, Dict

if "math_train" in dataset_names:
# math_dataset = load_math("train")
dataset = load_dataset("hendrycks/competition_math", split="train", trust_remote_code=True)
dataset = load_dataset("hendrycks/competition_math", "default", split="train", trust_remote_code=True)
samples = [s for s in process_math(dataset, "math_train") if s is not None]
logger.info(f"Loading math train dataset: {len(samples)} samples")
datasets += add_ids(samples)
Expand Down Expand Up @@ -260,7 +260,7 @@ def load_datasets(dataset_names: List[str] | str | None) -> List[Tuple[str, Dict

if "math_test" in dataset_names:
# math_dataset = load_math("test")
dataset = load_dataset("hendrycks/competition_math", split="test", trust_remote_code=True)
dataset = load_dataset("hendrycks/competition_math", "default", split="test", trust_remote_code=True)
samples = [s for s in process_math(dataset, "math_test") if s is not None]
logger.info(f"Loading math test dataset: {len(samples)} samples")
datasets += add_ids(samples)
Expand Down
2 changes: 1 addition & 1 deletion pipelinerl/domains/math/rollouts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pipelinerl.rollouts import RolloutResult, BaseMetrics
from pipelinerl.world import Job
from tapeagents.core import Prompt
from tapeagents.llms.trainable import TrainableLLM
from tapeagents.llms import TrainableLLM

from pipelinerl.async_llm import llm_async_generate, make_training_text
from .verifier_api import verify_answer_rpc
Expand Down
Loading