love2code (3B) recipe #7

Open · wants to merge 59 commits into main from love2code

Commits (59)

e98a0b6
love2code recipe
Feb 14, 2025
156a51d
Updated recipes to start love2code
Feb 20, 2025
5f56020
Turning off pre-empt because we want results yesterday
Feb 20, 2025
7f5f6fc
Changed workspace
Feb 20, 2025
889b53b
Oops, jupiter 2
Feb 20, 2025
a867ad9
Fix model identifier in 1b config
undfined Feb 21, 2025
1916ebd
love2code sample
Feb 21, 2025
0281713
Updated sample
Feb 22, 2025
5979d95
Added split paths to 1b5xc
Feb 22, 2025
32dc2bc
Temp solution don't merge this
undfined Feb 22, 2025
de8a541
Added python only config
Feb 25, 2025
c2f3025
Added olmo2 1b5xc config
Feb 25, 2025
09e2818
fix typo in 1b5xc olmo2 config
Feb 25, 2025
c548813
Added starcoder1 config
Feb 25, 2025
406a111
Merged tm's long config reader
Feb 25, 2025
d151faa
fixed olmo2 mix config
Feb 25, 2025
3d9c35c
Fixed weka path in starcoder1 config
Feb 25, 2025
0e39f9d
Fixed weka paths in python config
Feb 25, 2025
c14b8d3
Try without MixtureBuilder
undfined Feb 25, 2025
6ba21e7
Use main if not using MixtureBuilder
undfined Feb 25, 2025
7ddc48f
Weka globs
undfined Feb 25, 2025
d224f2d
Fixed up starcoder1 config to have 15pct codeprose
Feb 25, 2025
bf2f085
Merge branch 'love2code' of github.com:allenai/olmo-cookbook into lov…
Feb 25, 2025
92029ae
Try MixtureBuilder again
undfined Feb 25, 2025
43b1f87
Try with main and revert a sampling change
undfined Feb 25, 2025
e4ebeaa
Added dclm4pct yaml
Mar 3, 2025
7ca87c6
Merged
Mar 4, 2025
3aa108c
atually merged
Mar 4, 2025
fcd92c0
merged fr fr
Mar 4, 2025
e0beaa8
Changed model name for some reason
Mar 4, 2025
406f103
added olmo4pct
Mar 5, 2025
eb3b640
Actually pass the group_id override
undfined Mar 5, 2025
85362f5
fixed merge 1
Mar 5, 2025
be6763a
Merge branch 'main' into love2code
Mar 5, 2025
4b3375d
bumped gpus here
Mar 5, 2025
5515893
downsizing nodes on 4pct
Mar 5, 2025
0ddd0e2
Trying santacoder-ish data
Mar 7, 2025
8b4c620
trying starcoder2-3b model
Mar 8, 2025
6134547
Some minor fixes + 3b attempt
Mar 10, 2025
89c070b
oops
Mar 10, 2025
cd05c7d
wow im dumb
Mar 10, 2025
3e615fd
debugging starcoder2_3b
Mar 10, 2025
40bf29d
Oookay, OOM error, bumping gpus
Mar 10, 2025
6d0394f
Added smaller rmbsz
Mar 11, 2025
045088d
bumped rmbsz
Mar 11, 2025
32fa132
Merge branch 'main' of github.com:allenai/olmo-cookbook
Mar 11, 2025
63572b2
Trying gs setup
Mar 11, 2025
0684fc3
Trying sample run
Mar 11, 2025
b76071a
Added checks on the experiment group?
Mar 11, 2025
6272b8b
Merged
Mar 11, 2025
f1b59a6
Added cookbook 3B-5xC
Mar 11, 2025
3272fd1
Fixed token files in 3b5xc
Mar 11, 2025
f8c2a36
Added support for sc2-3b
Mar 11, 2025
a1ae5b9
Bumped nodes
Mar 11, 2025
fd81fb9
Changed checkpointer to id leak
Mar 12, 2025
f0bd921
Pull back for ephemeral saves
Mar 12, 2025
13d5f95
Merge branch 'main' of github.com:allenai/olmo-cookbook
Mar 12, 2025
95193ba
merged
Mar 12, 2025
c182ab2
fixed ref to dpconfig
Mar 12, 2025
Files changed

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ wandb = [
"wandb",
]
all = [
"ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@revert-147-epwalsh/mixture-fix",
"ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@learn2code",
"beaker-py",
"GitPython>=3.0,<4.0",
"wandb",
10 changes: 2 additions & 8 deletions src/cookbook/cli/cli.py
@@ -7,19 +7,14 @@
import yaml
from beaker import Beaker
from beaker.services.job import JobClient
from olmo_core.utils import generate_uuid, prepare_cli_environment
from tqdm import tqdm
from yaspin import yaspin

from cookbook.aliases import ExperimentConfig, LaunchGroup, validate_sources
from cookbook.cli.eval import convert, evaluate
from cookbook.utils.config import (
build_train_config,
config_from_path,
mk_experiment_group,
mk_launch_configs,
)
from cookbook.utils.config import build_train_config, config_from_path, mk_experiment_group, mk_launch_configs
from cookbook.utils.data import get_token_counts_and_ratios
from olmo_core.utils import generate_uuid, prepare_cli_environment

logger = logging.getLogger(__name__)

@@ -57,7 +52,6 @@ def cli():
)
def launch(config: Path, dry_run: bool, no_cache: bool, group_id: Optional[str] = None):
"""Launch an experiment."""

with open(config, "r") as f:
data = yaml.safe_load(f)

4 changes: 2 additions & 2 deletions src/cookbook/cli/utils.py
@@ -361,8 +361,8 @@ def install_olmo_core(commit_hash: str | None, env: PythonEnv | None = None) ->

def make_destination_dir(input_dir: str, suffix: str, output_dir: str | None = None) -> str:
if output_dir is None:
input_base, input_fn = os.path.split(input_dir)
output_dir = os.path.join(input_base, f"{input_fn.rstrip('/')}-{suffix}")
input_base, input_fn = os.path.split(input_dir.rstrip("/"))
output_dir = os.path.join(input_base, f"{input_fn}-{suffix}")

os.makedirs(output_dir, exist_ok=True)
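
A quick illustration of the behavior this change fixes, as a standalone sketch with hypothetical inputs (directory creation is omitted so the snippet has no side effects):

```python
import os

# Old behavior: splitting before stripping the trailing slash yields an
# empty basename, so the suffix lands in the wrong place.
def make_destination_dir_old(input_dir: str, suffix: str) -> str:
    input_base, input_fn = os.path.split(input_dir)
    return os.path.join(input_base, f"{input_fn.rstrip('/')}-{suffix}")

# New behavior: strip the trailing slash first, then split.
def make_destination_dir_new(input_dir: str, suffix: str) -> str:
    input_base, input_fn = os.path.split(input_dir.rstrip("/"))
    return os.path.join(input_base, f"{input_fn}-{suffix}")

print(make_destination_dir_old("data/tokens/", "npy"))  # data/tokens/-npy (broken)
print(make_destination_dir_new("data/tokens/", "npy"))  # data/tokens-npy
```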

7 changes: 7 additions & 0 deletions src/cookbook/constants.py
@@ -176,6 +176,12 @@
"bigcodebench_hard::none",
]

ALL_1B_TASKS = [
"hellaswag",
"piqa",
] + MMLU_CATEGORIES


STARCODER_CODEX_TASKS = [
"codex_humaneval::starcoder_pass@1",
"codex_humaneval::starcoder_pass@10",
@@ -200,6 +206,7 @@
"starcoder": STARCODER_CODEX_TASKS,
"starcoder::pass@1": STARCODER_PASS_AT_1_TASKS,
"code-no-bcb": [task for task in ALL_CODEX_TASKS if "bigcodebench" not in task],
"1b-evals": ALL_1B_TASKS,
}

OE_EVAL_GIT_URL = "[email protected]:allenai/oe-eval-internal.git"
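
For orientation, the new "1b-evals" group simply expands to hellaswag, piqa, and the MMLU categories. A self-contained sketch of that composition; the MMLU category names and the registry dict name below are placeholders, since only ALL_1B_TASKS and the "1b-evals" key appear in this hunk:

```python
# Placeholder values for illustration only; see src/cookbook/constants.py for
# the real MMLU_CATEGORIES list and the task-group registry this hunk extends.
MMLU_CATEGORIES = ["mmlu_stem::olmes", "mmlu_humanities::olmes"]

ALL_1B_TASKS = [
    "hellaswag",
    "piqa",
] + MMLU_CATEGORIES

TASK_GROUPS = {"1b-evals": ALL_1B_TASKS}

print(TASK_GROUPS["1b-evals"])
# ['hellaswag', 'piqa', 'mmlu_stem::olmes', 'mmlu_humanities::olmes']
```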
111 changes: 111 additions & 0 deletions src/cookbook/model/aliases.py
@@ -0,0 +1,111 @@
from dataclasses import dataclass
from enum import Enum

from olmo_core.config import Config
from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig
from olmo_core.distributed.parallel import DataParallelType
from olmo_core.nn.transformer import TransformerBlockType, TransformerConfig
from olmo_core.optim import AdamWConfig
from olmo_core.train import TrainerConfig


@dataclass
class ModelTrainConfig(Config):
model: TransformerConfig
optim: AdamWConfig
dataset: NumpyDatasetConfig
data_loader: NumpyDataLoaderConfig
trainer: TrainerConfig
init_seed: int = 12536


@dataclass
class ModelConfig:
compile: bool
d_model: int
n_heads: int
n_layers: int
rope_theta: int
flash_attention: bool
max_sequence_length: int
layer_norm_eps: float = 1e-6
save_interval: int = 1000
eval_interval: int = 200
device_batch_size: int = 8
batch_divisor: int = 32
eps: float = 1e-8
betas: tuple = (0.9, 0.95)
weight_decay: float = 0.1
max_grad_norm: float = 1.0
decay_embeddings: bool = False
qk_norm: bool = True
dp_type: DataParallelType = DataParallelType.fsdp
block_type: TransformerBlockType = TransformerBlockType.reordered_norm

@classmethod
def olmo_30m(cls) -> "ModelConfig":
return ModelConfig(
compile=True,
d_model=256,
n_heads=8,
n_layers=4,
rope_theta=500_000,
flash_attention=True,
max_sequence_length=4096,
)

@classmethod
def olmo_190m(cls) -> "ModelConfig":
return ModelConfig(
compile=True,
d_model=768,
n_heads=12,
n_layers=12,
rope_theta=500_000,
flash_attention=True,
max_sequence_length=4096,
)

@classmethod
def olmo_1b(cls) -> "ModelConfig":
"""
OLMo-1b (1_336_035_328 parameters)
(1_131_841_536 nonembed params)
"""
return ModelConfig(
compile=True,
d_model=2048,
n_heads=16,
n_layers=18,
rope_theta=500_000,
flash_attention=True,
max_sequence_length=4096,
)

@classmethod
def love2code_3b(cls) -> "ModelConfig":
Collaborator:
Technically, this is a generic OLMo-3b config, right? Or are there code-specific HPs here?

"""
num params should be : 3607267840
num non_embed parmams should be: 3481438720
"""
return ModelConfig(
compile=True,
d_model=2560,
n_heads=32,
n_layers=32,
rope_theta=500_000,
flash_attention=True,
max_sequence_length=2048,
)


class SupportedModels(Enum):
olmo_190m = ModelConfig.olmo_190m()
olmo_30m = ModelConfig.olmo_30m()
olmo_1b = ModelConfig.olmo_1b()
starcoder2_3b = ModelConfig.starcoder_3b()


class SupportedTokenizers(Enum):
dolma2 = TokenizerConfig.dolma2()
gpt_neox = TokenizerConfig.gpt_neox_olmo_dolma_v1_5()
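
For context on how these presets are consumed, here is a small self-contained sketch of the dataclass-plus-enum pattern used above, trimmed to a few fields; `resolve_model` is a hypothetical helper and not part of this PR:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelConfig:
    d_model: int
    n_heads: int
    n_layers: int
    max_sequence_length: int

    @classmethod
    def love2code_3b(cls) -> "ModelConfig":
        # Field values copied from the preset above.
        return cls(d_model=2560, n_heads=32, n_layers=32, max_sequence_length=2048)

class SupportedModels(Enum):
    love2code_3b = ModelConfig.love2code_3b()

def resolve_model(name: str) -> ModelConfig:
    """Hypothetical lookup-by-name helper built on the enum."""
    try:
        return SupportedModels[name].value
    except KeyError:
        raise ValueError(f"Unsupported model: {name}") from None

print(resolve_model("love2code_3b").d_model)  # 2560
```

The enum lookup is by member name, which is how a string identifier from an experiment YAML would map onto a preset.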
48 changes: 26 additions & 22 deletions src/cookbook/model/builder.py
@@ -2,13 +2,18 @@
from dataclasses import dataclass
from typing import Dict, List, Optional

from olmo_core.data import (
DataMix,
NumpyDataLoaderConfig,
NumpyDatasetConfig,
NumpyDatasetType,
TokenizerConfig,
from cookbook.aliases import SourceInstance, WandbConfig
from cookbook.data.dataset import MixtureBuilder
from cookbook.model.config import (
MODEL_TO_LR_MAP,
DefaultOptimizerProperties,
ModelTrainConfig,
SupportedTokenizers,
WrappedTransformerConfig,
)
from cookbook.model.evaluators import DownstreamEvaluators
from cookbook.model.schedulers import WSD
from olmo_core.data import DataMix, NumpyDataLoaderConfig, NumpyDatasetConfig, NumpyDatasetType, TokenizerConfig
from olmo_core.data.types import NumpyDatasetDType
from olmo_core.nn.transformer import TransformerConfig
from olmo_core.optim import AdamWConfig, CosWithWarmup, OptimGroupOverride, Scheduler
@@ -28,18 +33,6 @@
)
from olmo_core.train.common import LoadStrategy

from cookbook.aliases import SourceInstance, WandbConfig
from cookbook.data.dataset import MixtureBuilder
from cookbook.model.config import (
MODEL_TO_LR_MAP,
DefaultOptimizerProperties,
ModelTrainConfig,
SupportedTokenizers,
WrappedTransformerConfig,
)
from cookbook.model.evaluators import DownstreamEvaluators
from cookbook.model.schedulers import WSD

logger = logging.getLogger(__name__)


@@ -199,8 +192,15 @@ def __init__(
if any(substring in cluster for substring in ["jupiter", "saturn"]) and weka:
self.root_dir = f"/weka/oe-training-default/ai2-llm"
logger.info(f"Using Weka bucket as root dir: {self.root_dir}")
self.checkpoint_dir = f"{self.root_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}"

elif "augusta" in cluster:
try:
assert not weka
except AssertionError as e:
logger.info("Can't be on Augusta and weka!")
raise e
self.data_dir = self.root_dir = "gs://ai2-llm"

self.checkpoint_dir = f"{self.root_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}"
self.dataset_cache = f"{self.root_dir}/{self.beaker_user.lower()}/{self.run_name}/dataset-cache"

def get_tokenizer_config(self, tokenizer) -> TokenizerConfig:
@@ -269,7 +269,7 @@ def build_callbacks(self, model: TransformerConfig) -> Dict[str, Callback]:
"profiler": ProfilerCallback(enabled=self.profile),
"checkpointer": CheckpointerCallback(
save_interval=self.save_interval,
ephemeral_save_interval=100,
ephemeral_save_interval=20,
save_async=True,
),
"wandb": WandBCallback(
@@ -327,6 +327,10 @@ def build_dataset_config(self) -> NumpyDatasetConfig:
for source in self.sources:
source_paths.extend(source.paths)

# source_paths = []
# for source in self.sources:
# source_paths.extend(source.paths)

dataset_config = NumpyDatasetConfig(
paths=source_paths,
source_mixture_config=mixture_config,
@@ -387,14 +391,14 @@ def build(self) -> ModelTrainConfig:
load_path=load_path,
load_strategy=load_strategy,
save_folder=self.checkpoint_dir,
max_duration=Duration.tokens(self.max_tokens),
work_dir=self.dataset_cache,
rank_microbatch_size=rank_microbatch_size,
save_overwrite=True,
metrics_collect_interval=10,
cancel_check_interval=5,
compile_loss=True,
z_loss_multiplier=1e-5,
max_duration=Duration.tokens(self.max_tokens),
)

for callback_name, callback in self.build_callbacks(self.transformer_config).items():
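
To make the new cluster handling in `__init__` easier to follow, here is a compact sketch of the storage-root selection it implements; the cluster strings are hypothetical inputs and the final fallback is an assumption, since this hunk does not show the default branch:

```python
def resolve_root_dir(cluster: str, weka: bool) -> str:
    """Sketch of the cluster-aware root-dir logic above (not the PR's code)."""
    if any(s in cluster for s in ("jupiter", "saturn")) and weka:
        # Weka-backed clusters mount the training bucket locally.
        return "/weka/oe-training-default/ai2-llm"
    if "augusta" in cluster:
        if weka:
            # Mirrors the assert in the hunk: Augusta has no Weka mount.
            raise ValueError("Can't be on Augusta and weka!")
        # On Augusta, data and checkpoints both come from GCS.
        return "gs://ai2-llm"
    # Assumed fallback; the diff does not show the default root dir.
    return "s3://ai2-llm"

print(resolve_root_dir("jupiter-cirrascale", weka=True))  # /weka/oe-training-default/ai2-llm
print(resolve_root_dir("augusta-google-1", weka=False))   # gs://ai2-llm
```

Whichever branch is taken, `checkpoint_dir` and `dataset_cache` are then derived under that root, as the hunk shows.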
28 changes: 19 additions & 9 deletions src/cookbook/model/config.py
@@ -4,12 +4,8 @@

from olmo_core.config import Config, DType
from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig
from olmo_core.distributed.parallel import DataParallelType
from olmo_core.nn.transformer import (
TransformerBlockType,
TransformerConfig,
TransformerDataParallelConfig,
)
from olmo_core.distributed.parallel import DataParallelConfig, DataParallelType
from olmo_core.nn.transformer import TransformerBlockType, TransformerConfig
from olmo_core.optim import AdamWConfig
from olmo_core.train import TrainerConfig

@@ -58,7 +54,7 @@ def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig:
layer_norm_eps=DefaultTransformerProperties.layer_norm_eps,
qk_norm=DefaultTransformerProperties.qk_norm,
block_name=DefaultTransformerProperties.block_type,
dp_config=TransformerDataParallelConfig(
dp_config=DataParallelConfig(
name=DefaultTransformerProperties.dp_type,
param_dtype=DType.bfloat16,
reduce_dtype=DType.float32,
@@ -70,7 +66,7 @@ def olmo2_core_190M(cls, dp_type: Optional[DataParallelType] = None) -> TransformerConfig:
return getattr(TransformerConfig, "olmo2_190M")(
vocab_size=TokenizerConfig.dolma2().padded_vocab_size(),
compile=True,
dp_config=TransformerDataParallelConfig(
dp_config=DataParallelConfig(
name=dp_type if dp_type else DefaultTransformerProperties.dp_type,
param_dtype=DType.bfloat16,
reduce_dtype=DType.float32,
@@ -85,7 +81,7 @@ def olmo2_core_1B(cls, dp_type: Optional[DataParallelType] = None) -> TransformerConfig:
return getattr(TransformerConfig, "olmo2_1B")(
vocab_size=TokenizerConfig.dolma2().padded_vocab_size(),
compile=True,
dp_config=TransformerDataParallelConfig(
dp_config=DataParallelConfig(
name=dp_type if dp_type else DefaultTransformerProperties.dp_type,
param_dtype=DType.bfloat16,
reduce_dtype=DType.float32,
),
)

@classmethod
def starcoder2_3B(cls, dp_type: Optional[DataParallelType] = None) -> TransformerConfig:
return getattr(TransformerConfig, "starcoder2_3b")(
vocab_size=TokenizerConfig.dolma2().padded_vocab_size(),
compile=True,
dp_config=DataParallelConfig(
name=dp_type if dp_type else DefaultTransformerProperties.dp_type,
param_dtype=DType.bfloat16,
reduce_dtype=DType.float32,
@@ -100,6 +108,8 @@ def from_model_identifier(cls, model_identifier: str) -> TransformerConfig:
return cls.olmo2_core_190M()
elif model_identifier == "olmo2_1B":
return cls.olmo2_core_1B()
elif model_identifier == "starcoder2_3b":
return cls.starcoder2_3B()
else:
raise ValueError(f"Model identifier {model_identifier} is not supported.")
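
Downstream, the new identifier is resolved through this dispatcher; a usage sketch, assuming the classmethods above live on `WrappedTransformerConfig` (the name `builder.py` imports from this module):

```python
from cookbook.model.config import WrappedTransformerConfig

# "starcoder2_3b" is the identifier added in this hunk; any other string
# falls through to the ValueError branch above.
transformer_config = WrappedTransformerConfig.from_model_identifier("starcoder2_3b")
```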
