diff --git a/pyproject.toml b/pyproject.toml index 3f0f6215..4322a363 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ wandb = [ "wandb", ] all = [ - "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@revert-147-epwalsh/mixture-fix", + "ai2-olmo-core @ git+https://github.com/allenai/OLMo-core.git@learn2code", "beaker-py", "GitPython>=3.0,<4.0", "wandb", diff --git a/src/cookbook/cli/cli.py b/src/cookbook/cli/cli.py index 83c714de..d39113c3 100644 --- a/src/cookbook/cli/cli.py +++ b/src/cookbook/cli/cli.py @@ -7,19 +7,14 @@ import yaml from beaker import Beaker from beaker.services.job import JobClient -from olmo_core.utils import generate_uuid, prepare_cli_environment from tqdm import tqdm from yaspin import yaspin from cookbook.aliases import ExperimentConfig, LaunchGroup, validate_sources from cookbook.cli.eval import convert, evaluate -from cookbook.utils.config import ( - build_train_config, - config_from_path, - mk_experiment_group, - mk_launch_configs, -) +from cookbook.utils.config import build_train_config, config_from_path, mk_experiment_group, mk_launch_configs from cookbook.utils.data import get_token_counts_and_ratios +from olmo_core.utils import generate_uuid, prepare_cli_environment logger = logging.getLogger(__name__) @@ -57,7 +52,6 @@ def cli(): ) def launch(config: Path, dry_run: bool, no_cache: bool, group_id: Optional[str] = None): """Launch an experiment.""" - with open(config, "r") as f: data = yaml.safe_load(f) diff --git a/src/cookbook/cli/utils.py b/src/cookbook/cli/utils.py index 7111d955..21e24734 100644 --- a/src/cookbook/cli/utils.py +++ b/src/cookbook/cli/utils.py @@ -361,8 +361,8 @@ def install_olmo_core(commit_hash: str | None, env: PythonEnv | None = None) -> def make_destination_dir(input_dir: str, suffix: str, output_dir: str | None = None) -> str: if output_dir is None: - input_base, input_fn = os.path.split(input_dir) - output_dir = os.path.join(input_base, f"{input_fn.rstrip('/')}-{suffix}") + 
input_base, input_fn = os.path.split(input_dir.rstrip("/")) + output_dir = os.path.join(input_base, f"{input_fn}-{suffix}") os.makedirs(output_dir, exist_ok=True) diff --git a/src/cookbook/constants.py b/src/cookbook/constants.py index 4a0d27ad..f814cf35 100644 --- a/src/cookbook/constants.py +++ b/src/cookbook/constants.py @@ -176,6 +176,12 @@ "bigcodebench_hard::none", ] +ALL_1B_TASKS = [ + "hellaswag", + "piqa", +] + MMLU_CATEGORIES + + STARCODER_CODEX_TASKS = [ "codex_humaneval::starcoder_pass@1", "codex_humaneval::starcoder_pass@10", @@ -200,6 +206,7 @@ "starcoder": STARCODER_CODEX_TASKS, "starcoder::pass@1": STARCODER_PASS_AT_1_TASKS, "code-no-bcb": [task for task in ALL_CODEX_TASKS if "bigcodebench" not in task], + "1b-evals": ALL_1B_TASKS, } OE_EVAL_GIT_URL = "git@github.com:allenai/oe-eval-internal.git" diff --git a/src/cookbook/model/aliases.py b/src/cookbook/model/aliases.py new file mode 100644 index 00000000..5b05f510 --- /dev/null +++ b/src/cookbook/model/aliases.py @@ -0,0 +1,111 @@ +from dataclasses import dataclass +from enum import Enum + +from olmo_core.config import Config +from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig +from olmo_core.distributed.parallel import DataParallelType +from olmo_core.nn.transformer import TransformerBlockType, TransformerConfig +from olmo_core.optim import AdamWConfig +from olmo_core.train import TrainerConfig + + +@dataclass +class ModelTrainConfig(Config): + model: TransformerConfig + optim: AdamWConfig + dataset: NumpyDatasetConfig + data_loader: NumpyDataLoaderConfig + trainer: TrainerConfig + init_seed: int = 12536 + + +@dataclass +class ModelConfig: + compile: bool + d_model: int + n_heads: int + n_layers: int + rope_theta: int + flash_attention: bool + max_sequence_length: int + layer_norm_eps: float = 1e-6 + save_interval: int = 1000 + eval_interval: int = 200 + device_batch_size: int = 8 + batch_divisor: int = 32 + eps: float = 1e-8 + betas: tuple = (0.9, 0.95) + 
weight_decay: float = 0.1 + max_grad_norm: float = 1.0 + decay_embeddings: bool = False + qk_norm: bool = True + dp_type: DataParallelType = DataParallelType.fsdp + block_type: TransformerBlockType = TransformerBlockType.reordered_norm + + @classmethod + def olmo_30m(cls) -> "ModelConfig": + return ModelConfig( + compile=True, + d_model=256, + n_heads=8, + n_layers=4, + rope_theta=500_000, + flash_attention=True, + max_sequence_length=4096, + ) + + @classmethod + def olmo_190m(cls) -> "ModelConfig": + return ModelConfig( + compile=True, + d_model=768, + n_heads=12, + n_layers=12, + rope_theta=500_000, + flash_attention=True, + max_sequence_length=4096, + ) + + @classmethod + def olmo_1b(cls) -> "ModelConfig": + """ + OLMo-1b (1_336_035_328 parameters) + (1_131_841_536 nonembed params) + """ + return ModelConfig( + compile=True, + d_model=2048, + n_heads=16, + n_layers=18, + rope_theta=500_000, + flash_attention=True, + max_sequence_length=4096, + ) + + @classmethod + def love2code_3b(cls) -> "ModelConfig": + """ + num params should be : 3607267840 + num non_embed params should be: 3481438720 + """ + return ModelConfig( + compile=True, + d_model=2560, + n_heads=32, + n_layers=32, + rope_theta=500_000, + flash_attention=True, + max_sequence_length=2048, + ) + + +class SupportedModels(Enum): + olmo_190m = ModelConfig.olmo_190m() + olmo_30m = ModelConfig.olmo_30m() + olmo_1b = ModelConfig.olmo_1b() + starcoder2_3b = ModelConfig.love2code_3b() + + +class SupportedTokenizers(Enum): + dolma2 = TokenizerConfig.dolma2() + gpt_neox = TokenizerConfig.gpt_neox_olmo_dolma_v1_5() diff --git a/src/cookbook/model/builder.py b/src/cookbook/model/builder.py index 19154023..088f6611 100644 --- a/src/cookbook/model/builder.py +++ b/src/cookbook/model/builder.py @@ -2,13 +2,18 @@ from dataclasses import dataclass from typing import Dict, List, Optional -from olmo_core.data import ( - DataMix, - NumpyDataLoaderConfig, - NumpyDatasetConfig, - NumpyDatasetType, - TokenizerConfig, +from 
cookbook.aliases import SourceInstance, WandbConfig +from cookbook.data.dataset import MixtureBuilder +from cookbook.model.config import ( + MODEL_TO_LR_MAP, + DefaultOptimizerProperties, + ModelTrainConfig, + SupportedTokenizers, + WrappedTransformerConfig, ) +from cookbook.model.evaluators import DownstreamEvaluators +from cookbook.model.schedulers import WSD +from olmo_core.data import DataMix, NumpyDataLoaderConfig, NumpyDatasetConfig, NumpyDatasetType, TokenizerConfig from olmo_core.data.types import NumpyDatasetDType from olmo_core.nn.transformer import TransformerConfig from olmo_core.optim import AdamWConfig, CosWithWarmup, OptimGroupOverride, Scheduler @@ -28,18 +33,6 @@ ) from olmo_core.train.common import LoadStrategy -from cookbook.aliases import SourceInstance, WandbConfig -from cookbook.data.dataset import MixtureBuilder -from cookbook.model.config import ( - MODEL_TO_LR_MAP, - DefaultOptimizerProperties, - ModelTrainConfig, - SupportedTokenizers, - WrappedTransformerConfig, -) -from cookbook.model.evaluators import DownstreamEvaluators -from cookbook.model.schedulers import WSD - logger = logging.getLogger(__name__) @@ -199,8 +192,15 @@ def __init__( if any(substring in cluster for substring in ["jupiter", "saturn"]) and weka: self.root_dir = f"/weka/oe-training-default/ai2-llm" logger.info(f"Using Weka bucket as root dir: {self.root_dir}") - self.checkpoint_dir = f"{self.root_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}" - + elif "augusta" in cluster: + try: + assert not weka + except AssertionError as e: + logger.info("Can't be on Augusta and weka!") + raise e + self.data_dir = self.root_dir = "gs://ai2-llm" + + self.checkpoint_dir = f"{self.root_dir}/checkpoints/{self.beaker_user.lower()}/{self.run_name}" self.dataset_cache = f"{self.root_dir}/{self.beaker_user.lower()}/{self.run_name}/dataset-cache" def get_tokenizer_config(self, tokenizer) -> TokenizerConfig: @@ -269,7 +269,7 @@ def build_callbacks(self, model: TransformerConfig) 
-> Dict[str, Callback]: "profiler": ProfilerCallback(enabled=self.profile), "checkpointer": CheckpointerCallback( save_interval=self.save_interval, - ephemeral_save_interval=100, + ephemeral_save_interval=20, save_async=True, ), "wandb": WandBCallback( @@ -327,6 +327,10 @@ def build_dataset_config(self) -> NumpyDatasetConfig: for source in self.sources: source_paths.extend(source.paths) + # source_paths = [] + # for source in self.sources: + # source_paths.extend(source.paths) + dataset_config = NumpyDatasetConfig( paths=source_paths, source_mixture_config=mixture_config, @@ -387,6 +391,7 @@ def build(self) -> ModelTrainConfig: load_path=load_path, load_strategy=load_strategy, save_folder=self.checkpoint_dir, + max_duration=Duration.tokens(self.max_tokens), work_dir=self.dataset_cache, rank_microbatch_size=rank_microbatch_size, save_overwrite=True, @@ -394,7 +399,6 @@ def build(self) -> ModelTrainConfig: cancel_check_interval=5, compile_loss=True, z_loss_multiplier=1e-5, - max_duration=Duration.tokens(self.max_tokens), ) for callback_name, callback in self.build_callbacks(self.transformer_config).items(): diff --git a/src/cookbook/model/config.py b/src/cookbook/model/config.py index 350407c0..a7904fde 100644 --- a/src/cookbook/model/config.py +++ b/src/cookbook/model/config.py @@ -4,12 +4,8 @@ from olmo_core.config import Config, DType from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig, TokenizerConfig -from olmo_core.distributed.parallel import DataParallelType -from olmo_core.nn.transformer import ( - TransformerBlockType, - TransformerConfig, - TransformerDataParallelConfig, -) +from olmo_core.distributed.parallel import DataParallelConfig, DataParallelType +from olmo_core.nn.transformer import TransformerBlockType, TransformerConfig from olmo_core.optim import AdamWConfig from olmo_core.train import TrainerConfig @@ -58,7 +54,7 @@ def olmo_30m(cls, tokenizer: TokenizerConfig) -> TransformerConfig: 
layer_norm_eps=DefaultTransformerProperties.layer_norm_eps, qk_norm=DefaultTransformerProperties.qk_norm, block_name=DefaultTransformerProperties.block_type, - dp_config=TransformerDataParallelConfig( + dp_config=DataParallelConfig( name=DefaultTransformerProperties.dp_type, param_dtype=DType.bfloat16, reduce_dtype=DType.float32, @@ -70,7 +66,7 @@ def olmo2_core_190M(cls, dp_type: Optional[DataParallelType] = None) -> Transfor return getattr(TransformerConfig, "olmo2_190M")( vocab_size=TokenizerConfig.dolma2().padded_vocab_size(), compile=True, - dp_config=TransformerDataParallelConfig( + dp_config=DataParallelConfig( name=dp_type if dp_type else DefaultTransformerProperties.dp_type, param_dtype=DType.bfloat16, reduce_dtype=DType.float32, @@ -85,7 +81,19 @@ def olmo2_core_1B(cls, dp_type: Optional[DataParallelType] = None) -> Transforme return getattr(TransformerConfig, "olmo2_1B")( vocab_size=TokenizerConfig.dolma2().padded_vocab_size(), compile=True, - dp_config=TransformerDataParallelConfig( + dp_config=DataParallelConfig( + name=dp_type if dp_type else DefaultTransformerProperties.dp_type, + param_dtype=DType.bfloat16, + reduce_dtype=DType.float32, + ), + ) + + @classmethod + def starcoder2_3B(cls, dp_type: Optional[DataParallelType] = None) -> TransformerConfig: + return getattr(TransformerConfig, "starcoder2_3b")( + vocab_size=TokenizerConfig.dolma2().padded_vocab_size(), + compile=True, + dp_config=DataParallelConfig( name=dp_type if dp_type else DefaultTransformerProperties.dp_type, param_dtype=DType.bfloat16, reduce_dtype=DType.float32, @@ -100,6 +108,8 @@ def from_model_identifier(cls, model_identifier: str) -> TransformerConfig: return cls.olmo2_core_190M() elif model_identifier == "olmo2_1B": return cls.olmo2_core_1B() + elif model_identifier == "starcoder2_3b": + return cls.starcoder2_3B() else: raise ValueError(f"Model identifier {model_identifier} is not supported.") diff --git 
a/src/cookbook/recipes/love2code/train-1b-5xC-dclm_subsample_4pct-weka.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-dclm_subsample_4pct-weka.yaml new file mode 100644 index 00000000..e130a955 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-1b-5xC-dclm_subsample_4pct-weka.yaml @@ -0,0 +1,475 @@ +name: "olmo-cookbook-1b-5xC-dclm_subsample_4pct" +description: "4pct subsample of dclm-baseline" +budget: "ai2/oe-training" +workspace: "ai2/learn2code" +nodes: 8 +gpus: 8 +preemptible: false +max_tokens: 113_184_153_600 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "olmo2_1B" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: dclm_4pct + target_ratio: 1.0 + paths: + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00003.npy + diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-starcoder1-weka.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-starcoder1-weka.yaml index 598ab930..ce2434f8 100644 --- a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-starcoder1-weka.yaml +++ b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-starcoder1-weka.yaml @@ -8,7 +8,7 @@ preemptible: false max_tokens: 113_184_153_600 # 5xC multiplier sequence_length: 2048 seed: 1337 -model: "olmo2_1B" +model: "olmo_1b" tokenizer: "dolma2" priority: high cluster: ai2/jupiter-cirrascale-2 diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-python.yaml 
b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-python.yaml index 28374eea..3ad8b15c 100644 --- a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-python.yaml +++ b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka-python.yaml @@ -17,6 +17,7 @@ dataset: sources: - name: the-stack-v2-ai2v0 target_ratio: 0.85 + paths: - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-000-00000.npy - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-000-00001.npy @@ -1159,7 +1160,11 @@ dataset: - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-119-00006.npy - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-119-00007.npy - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-119-00008.npy +<<<<<<< HEAD + - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-119-00009.npy +======= - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-119-00009.npy +>>>>>>> main - name: dclm-codeprose-v0 target_ratio: 0.15 paths: @@ -1255,3 +1260,7 @@ dataset: - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-89-00000.npy - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-90-00000.npy - weka://oe-training-default/ai2-llm/preprocessed/love2code/dclm_codeprose/dolma2-tokenizer/part-91-00000.npy +<<<<<<< HEAD + +======= +>>>>>>> main diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml index 1eca7493..28b296fb 100644 --- a/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml +++ b/src/cookbook/recipes/love2code/train-1b-5xC-love2code-weka.yaml @@ -8,7 +8,7 @@ preemptible: false max_tokens: 113_184_153_600 # 5xC multiplier sequence_length: 2048 seed: 1337 -model: "olmo2_1B" +model: "olmo_1b" tokenizer: "dolma2" 
priority: urgent cluster: ai2/jupiter-cirrascale-2 diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-love2code.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-love2code.yaml new file mode 100644 index 00000000..e63140b1 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-1b-5xC-love2code.yaml @@ -0,0 +1,26 @@ +name: "olmo-cookbook-1b-5xC-love2code" +description: "Love2Code model, first stab at a config" +budget: "ai2/oe-training" +workspace: "ai2/oe-data" +nodes: 8 +gpus: 8 +preemptible: false +max_tokens: 113_184_153_600 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "olmo_1b" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: the-stack-v2-ai2v0 + target_ratio: 0.85 + paths: # TODO: ADD ACTUAL PATHS + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens_split/*.npy + - name: dclm-codeprose-v0 + target_ratio: 0.15 + paths: + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/dclm_codeprose_split/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-love2code_SAMPLE.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-love2code_SAMPLE.yaml new file mode 100644 index 00000000..168a726e --- /dev/null +++ b/src/cookbook/recipes/love2code/train-1b-5xC-love2code_SAMPLE.yaml @@ -0,0 +1,26 @@ +name: "olmo-cookbook-1b-5xC-love2code_SAMPLE" +description: "Just a quick check to see if things are working properly" +budget: "ai2/oe-training" +workspace: "ai2/oe-data" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 1_000_000_000 # SIMPLE CHECK AMOUNT +sequence_length: 2048 +seed: 1337 +model: "olmo_1b" +tokenizer: "dolma2" +priority: urgent +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: the-stack-v2-ai2v0 + target_ratio: 0.85 + paths: # TODO: ADD ACTUAL PATHS + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens_split/*.npy + - name: dclm-codeprose-v0 + target_ratio: 0.15 + paths: + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/dclm_codeprose_split/dolma2-tokenizer/*.npy + diff --git a/src/cookbook/recipes/love2code/train-1b-5xC-olmo2mix-weka.yaml b/src/cookbook/recipes/love2code/train-1b-5xC-olmo2mix-weka.yaml new file mode 100644 index 00000000..af8b8894 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-1b-5xC-olmo2mix-weka.yaml @@ -0,0 +1,1142 @@ +name: "olmo-cookbook-1b-5xC-olmo2" +description: "Love2Code model, but olmo2 mix" +budget: "ai2/oe-training" +workspace: "ai2/oe-data" +nodes: 4 +gpus: 8 +preemptible: false +max_tokens: 113_184_153_600 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "olmo_1b" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: olmo2 + target_ratio: 1.0 + paths: + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-00-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-01-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-02-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-03-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-04-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-05-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-06-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-07-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-08-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-09-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-10-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-11-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-12-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-13-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-14-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/algebraic-stack/train/allenai/dolma2-tokenizer/part-15-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-00-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-01-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-02-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-03-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-04-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-05-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-06-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-07-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-08-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-09-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-10-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-11-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-12-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-13-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-14-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-15-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-16-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-17-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-18-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/arxiv/train/allenai/dolma2-tokenizer/part-19-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-00-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-01-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-02-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-03-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-04-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-05-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-06-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-07-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-08-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-09-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-10-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-11-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/proof-pile-2/v0_decontaminated/open-web-math/train/allenai/dolma2-tokenizer/part-12-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-00-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-01-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-02-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-03-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-04-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-05-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-06-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-07-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-08-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-09-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-11-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-12-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-13-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-14-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-15-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-16-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-17-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-18-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-19-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-20-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-21-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-22-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-23-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-24-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-25-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-000-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-001-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-002-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-003-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-004-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-005-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-006-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-007-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-008-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-009-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-010-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-011-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-012-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-013-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-014-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-015-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-016-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-017-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-018-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-019-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-020-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-021-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-022-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-023-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-024-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-025-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-026-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-027-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-028-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-029-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-030-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-031-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-032-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-033-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-034-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-035-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-036-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-037-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-038-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-039-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-040-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-041-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-042-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-043-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-044-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-045-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-046-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-047-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-048-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-049-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-050-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-051-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-052-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-053-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-054-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-055-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-056-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-057-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-058-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-059-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-060-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-061-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-062-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-063-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-064-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-065-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-066-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-067-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-068-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-069-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-070-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-071-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-072-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-073-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-074-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-075-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-076-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-077-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-078-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-079-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-080-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-081-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-082-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-083-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-084-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-085-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-086-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-087-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-088-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-089-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-090-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-091-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-092-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-093-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-094-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-095-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-096-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-097-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-098-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/starcoder/v1-decon-100_to_20k-2star-top_token_030/allenai/dolma2-tokenizer/part-099-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-000-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-001-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-002-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-003-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-004-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-005-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-006-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-007-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-008-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-009-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-010-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-011-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-012-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-013-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-014-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-015-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-016-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-017-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-018-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-019-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-020-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-021-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-022-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-023-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-024-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-025-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-026-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-027-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-028-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-029-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-030-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-031-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-032-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-033-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-034-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-035-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-036-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-037-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-038-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-039-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-040-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-041-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-042-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-043-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-044-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-045-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-046-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-047-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-048-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-049-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-050-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-051-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-052-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-053-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-054-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-055-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-056-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-057-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-058-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-059-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-060-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-061-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-062-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-063-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-064-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-065-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-066-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-067-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-068-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-069-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-070-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-071-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-072-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-073-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-074-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-075-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-076-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-077-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-078-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-079-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-080-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-081-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-082-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-083-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-084-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-085-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-086-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-087-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-088-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-089-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-090-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-091-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-092-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-093-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-094-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-095-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-096-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-097-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-098-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-099-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-100-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-101-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-102-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-103-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-104-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-105-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-106-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-107-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-108-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-109-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-110-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-111-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-112-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-113-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-114-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-115-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-116-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-117-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-118-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-119-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-120-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-121-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-122-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-123-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-124-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-125-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-126-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-127-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-128-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-129-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-130-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-131-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-132-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-133-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-134-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-135-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-136-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-137-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-138-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-139-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-140-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-141-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-142-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-143-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-144-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-145-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-146-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-147-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-148-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-149-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-150-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-151-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-152-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-153-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-154-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-155-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-156-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-157-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-158-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-159-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-160-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-161-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-162-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-163-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-164-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-165-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-166-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-167-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-168-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-169-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-170-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-171-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-172-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-173-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-174-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-175-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-176-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-177-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-178-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-179-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-180-00004.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-181-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-182-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-183-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-184-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-185-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-186-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/text_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train/allenai/dolma2-tokenizer/part-187-00004.npy + - weka://oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy \ No 
newline at end of file diff --git a/src/cookbook/recipes/love2code/train-3b-5xC-love2code-python-augusta.yaml b/src/cookbook/recipes/love2code/train-3b-5xC-love2code-python-augusta.yaml new file mode 100644 index 00000000..57036d29 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-3b-5xC-love2code-python-augusta.yaml @@ -0,0 +1,1163 @@ +name: "olmo-cookbook-3b-5xC-love2code-python-augusta" +description: "Starcoder2-3B(ish) 5xC | Love2code python only (~3 epochs)" +budget: "ai2/oe-training" +workspace: "ai2/learn2code" +nodes: 1 +gpus: 8 +preemptible: true +max_tokens: 318_651_801_600 # 5xC multiplier +sequence_length: 4096 +seed: 1337 +model: "starcoder2_3b" +tokenizer: "dolma2" +priority: high +learning_rate: 7.0e-4 +cluster: ai2/augusta-google-1 +weka: false +dataset: + sources: + - name: the-stack-v2-ai2v0-python + target_ratio: 1.0 + paths: + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-001-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-002-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00005.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-003-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-004-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-005-00008.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00011.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00012.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-006-00013.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-007-00008.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-008-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-009-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00004.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-010-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-011-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-012-00009.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-013-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-014-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-015-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-016-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00007.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-017-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-018-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-019-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-020-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-021-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-022-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-023-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-024-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-025-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-026-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-027-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-028-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-029-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-030-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-031-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00005.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-032-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-033-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-034-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-035-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-036-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00004.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-037-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-038-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-039-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-040-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-041-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00005.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-042-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-043-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-044-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-045-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-046-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00005.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-047-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-048-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-049-00009.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-050-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-051-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00004.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-052-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-053-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00008.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-054-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-055-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-056-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00003.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-057-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-058-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-059-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-060-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-061-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-062-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-063-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-064-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-065-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-066-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-067-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-068-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00004.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-069-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-070-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-071-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-072-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-073-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00003.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-074-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-075-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00007.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-076-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-077-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-078-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-079-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-080-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-081-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-082-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-083-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-084-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-085-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-086-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-087-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-088-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-089-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-090-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-091-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-092-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00003.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-093-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-094-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00007.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-095-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-096-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-097-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-098-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-099-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00006.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-100-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-101-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-102-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00001.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-103-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-104-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00003.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-105-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-106-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00008.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-107-00011.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-108-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-109-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00000.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-110-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-111-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00004.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-112-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-113-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-114-00008.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-115-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00009.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-116-00010.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00002.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-117-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00007.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-118-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00000.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00001.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00002.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00003.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00004.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00005.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00006.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00007.npy + - 
gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00008.npy + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-119-00009.npy diff --git a/src/cookbook/recipes/love2code/train-3b-5xC-love2code-santa_repro0_noprose-s3.yaml b/src/cookbook/recipes/love2code/train-3b-5xC-love2code-santa_repro0_noprose-s3.yaml new file mode 100644 index 00000000..97f00c84 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-3b-5xC-love2code-santa_repro0_noprose-s3.yaml @@ -0,0 +1,22 @@ +name: "olmo-cookbook-starcoder2-3b-5xC-santaRepro_trueSC" +description: "Starcoder2 3B model with a dummy data to make sure things are configured well| TRUE SC" +budget: "ai2/oe-training" +workspace: "ai2/learn2code" +nodes: 1 +gpus: 8 +preemptible: false +max_tokens: 300_000_000 # ~5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "starcoder2_3b" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: santacoder_repro + target_ratio: 1.0 + paths: + - weka://oe-training-default/ai2-llm/preprocessed/love2code/python_only/part-000-00000.npy + diff --git a/src/cookbook/recipes/love2code/train-3b-5xC-love2code.yaml b/src/cookbook/recipes/love2code/train-3b-5xC-love2code.yaml new file mode 100644 index 00000000..418db1f4 --- /dev/null +++ b/src/cookbook/recipes/love2code/train-3b-5xC-love2code.yaml @@ -0,0 +1,225 @@ +name: "olmo-cookbook-3b-5xC-love2code" +description: "Love2Code model, first stab at a config" +budget: "ai2/oe-data" +workspace: "ai2/dolma2" +nodes: 8 +gpus: 8 +preemptible: true +max_tokens: 348_143_872_000 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "love2code_3b" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale +weka: true +dataset: + sources: + - name: the-stack-v2-ai2v0 + target_ratio: 0.85 + paths: # TODO: ADD ACTUAL PATHS + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-001-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-004-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-006-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-007-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-008-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-008-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-008-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-009-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-010-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-011-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-012-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-014-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-015-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-016-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-017-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-018-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-020-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-020-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-022-00003.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-024-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-024-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-025-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-027-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-028-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-028-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-028-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-028-00003.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-029-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-030-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-031-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-032-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-034-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-035-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-037-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-038-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-040-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-042-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-042-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-042-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-043-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-045-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-046-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-047-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-048-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-048-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-050-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-050-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-050-00003.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-051-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-051-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-051-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-051-00003.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-052-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-053-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-055-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-055-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-058-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-058-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-060-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-060-00001.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-060-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-067-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-068-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-070-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-071-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-073-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-074-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-077-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-077-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-078-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-081-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-082-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-083-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-086-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-088-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-090-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-092-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-092-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-093-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-093-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-095-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-096-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-097-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-099-00000.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-099-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-099-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-100-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-101-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-101-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-103-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-105-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-106-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-108-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-109-00002.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-110-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-110-00002.npy + - 
s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-111-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-111-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-112-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-115-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-116-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-117-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-118-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-119-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-121-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-122-00000.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-122-00001.npy + - s3://ai2-llm/pretraining-data/sources/learn2code/minhash_metadata/love2code-baseline-filtered-20250218_004_minhashed/scratch/trunc_tokens/part-123-00000.npy + - name: dclm-codeprose-v0 + target_ratio: 0.15 + paths: + - 
s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-00-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-01-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-02-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-03-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-04-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-05-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-06-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-07-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-08-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-09-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-10-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-11-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-12-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-13-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-14-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-15-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-16-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-17-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-18-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-19-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-20-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-21-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-22-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-23-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-24-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-25-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-26-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-27-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-28-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-29-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-30-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-31-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-32-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-33-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-34-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-35-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-36-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-37-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-38-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-39-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-40-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-41-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-42-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-43-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-44-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-45-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-46-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-47-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-48-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-49-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-50-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-51-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-52-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-53-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-54-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-55-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-56-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-57-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-58-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-59-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-60-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-61-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-62-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-63-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-64-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-65-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-66-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-67-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-68-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-69-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-70-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-71-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-72-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-73-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-74-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-75-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-76-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-77-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-78-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-79-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-80-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-81-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-82-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-83-00000.npy + - 
s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-84-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-85-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-86-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-87-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-88-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-89-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-90-00000.npy + - s3://ai2-llm/preprocessed/dclm/raw_love2code_codeprose/dolma2-tokenizer/part-91-00000.npy + + diff --git a/src/cookbook/recipes/love2code/train-SAMPLE-augusta.yaml b/src/cookbook/recipes/love2code/train-SAMPLE-augusta.yaml new file mode 100644 index 00000000..d4e6ca2a --- /dev/null +++ b/src/cookbook/recipes/love2code/train-SAMPLE-augusta.yaml @@ -0,0 +1,21 @@ +name: "train-SAMPLE-augusta" +description: "Just trying out cookbook + augusta" +budget: "ai2/oe-training" +workspace: "ai2/learn2code" +nodes: 1 +gpus: 8 +preemptible: true +max_tokens: 113_184_153_600 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "olmo2_1B" +tokenizer: "dolma2" +priority: high +cluster: ai2/augusta-google-1 +weka: false +dataset: + sources: + - name: sample + target_ratio: 1.0 + paths: + - gs://ai2-llm/preprocessed/love2code/python_only/python_tokens/part-000-00000.npy \ No newline at end of file diff --git a/src/cookbook/recipes/olmo-cookbook-1b-5xC-dclm_subsample_4pct.yaml b/src/cookbook/recipes/olmo-cookbook-1b-5xC-dclm_subsample_4pct.yaml new file mode 100644 index 00000000..d05d7066 --- /dev/null +++ b/src/cookbook/recipes/olmo-cookbook-1b-5xC-dclm_subsample_4pct.yaml @@ -0,0 +1,475 @@ +name: "olmo-cookbook-1b-5xC-dclm_subsample_4pct" +description: "4pct subsample of dclm-baseline" +budget: "ai2/oe-training" +workspace: 
"ai2/learn2code" +nodes: 4 +gpus: 8 +preemptible: false +max_tokens: 113_184_153_600 # 5xC multiplier +sequence_length: 2048 +seed: 1337 +model: "olmo2_1B" +tokenizer: "dolma2" +priority: high +cluster: ai2/jupiter-cirrascale-2 +weka: true +dataset: + sources: + - name: dclm_4pct + target_ratio: 1.0 + paths: + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-000-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-001-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-002-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00000.npy 
+ - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-003-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-004-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-005-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-006-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-007-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-008-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-009-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-010-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-011-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-012-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-013-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-014-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-015-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-016-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-017-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-018-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-019-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-020-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-021-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-022-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-023-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-024-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-025-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-026-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-027-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-028-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-029-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-030-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-031-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-032-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-033-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-034-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-035-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-036-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-037-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-038-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-039-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-040-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-041-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-042-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-043-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-044-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-045-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-046-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-047-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-048-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-049-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-050-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-051-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-052-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-053-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-054-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-055-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-056-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-057-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-058-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-059-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-060-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-061-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-062-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-063-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-064-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-065-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-066-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-067-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-068-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-069-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-070-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-071-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-072-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-073-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-074-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-075-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-076-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-077-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-078-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-079-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-080-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-081-00003.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-082-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-083-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-084-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-085-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-086-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-087-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-088-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-089-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-090-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-091-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-092-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-093-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-094-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-095-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-096-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-097-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00002.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-098-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-099-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-100-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-101-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-102-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-103-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-104-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-105-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-106-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-107-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-108-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-109-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-110-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-111-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-112-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-113-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00000.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-114-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-115-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-116-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-117-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00001.npy + - 
weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-118-00003.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00000.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00001.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00002.npy + - weka://oe-training-default/ai2-llm/preprocessed/dclm/dclm_baseline_4pct_subsample/allenai/dolma2-tokenizer/part-119-00003.npy + diff --git a/src/cookbook/utils/config.py b/src/cookbook/utils/config.py index f92a1dd3..a55e0bfa 100644 --- a/src/cookbook/utils/config.py +++ b/src/cookbook/utils/config.py @@ -3,23 +3,13 @@ from typing import List, Tuple, cast import yaml -from olmo_core.launch.beaker import ( - BeakerEnvSecret, - BeakerLaunchConfig, - BeakerWekaBucket, -) -from olmo_core.train.callbacks import ConfigSaverCallback, WandBCallback -from olmo_core.utils import get_default_device, seed_all -from cookbook.aliases import ( - ExperimentConfig, - ExperimentGroup, - ExperimentInstance, - SourceConfig, - SourceInstance, -) +from cookbook.aliases import ExperimentConfig, ExperimentGroup, ExperimentInstance, SourceConfig, SourceInstance from cookbook.model.builder import TransformerConfigBuilder from cookbook.utils.data import normalize_source_paths +from olmo_core.launch.beaker import BeakerEnvSecret, BeakerLaunchConfig, BeakerWekaBucket +from olmo_core.train.callbacks import ConfigSaverCallback, WandBCallback +from olmo_core.utils import get_default_device, seed_all logger = logging.getLogger(__name__) @@ -162,9 +152,32 @@ def build_train_config(config_path: Path, run_name: str, group_id: str, beaker_u return trainer +def 
validate_experiment_group(group: ExperimentGroup) -> bool: + """Stack all the checks for validity of the ExperimentGroup here. + Return True if everything is A-okay + """ + checks_passing = True + msgs = [] + # Check: if running on augusta => not using weka for anything + if "augusta" in group.config.cluster and group.config.weka: + checks_passing &= False + msgs.append("Cannot be on Augusta and use weka!") + + return checks_passing, msgs + + def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLaunchConfig]: """Build a beaker launch config from an experiment group.""" + try: + exp_group_valid, validity_msgs = validate_experiment_group(group) + assert exp_group_valid + except AssertionError as e: + logger.info("Exp group not valid!") + for msg in validity_msgs: + logger.info("\t" + msg) + raise e + weka_buckets: List[BeakerWekaBucket] = [] if group.config.weka: weka_buckets.append(BeakerWekaBucket("oe-training-default", "/weka/oe-training-default")) @@ -193,6 +206,8 @@ def mk_launch_configs(group: ExperimentGroup, beaker_user: str) -> list[BeakerLa BeakerEnvSecret(name="AWS_CREDENTIALS", secret=f"{beaker_user}_AWS_CREDENTIALS"), BeakerEnvSecret(name="R2_ENDPOINT_URL", secret="R2_ENDPOINT_URL"), BeakerEnvSecret(name="WEKA_ENDPOINT_URL", secret="WEKA_ENDPOINT_URL"), + BeakerEnvSecret(name="GS_INTEROP_KEY", secret="GS_INTEROP_KEY"), + BeakerEnvSecret(name="GS_INTEROP_SECRET", secret="GS_INTEROP_SECRET"), ], setup_steps=[ 'git clone "$REPO_URL"', diff --git a/src/cookbook/utils/data.py b/src/cookbook/utils/data.py index 1d6d290a..1f56ca60 100644 --- a/src/cookbook/utils/data.py +++ b/src/cookbook/utils/data.py @@ -1,4 +1,6 @@ import concurrent.futures +import hashlib +import json import logging import os import pathlib @@ -7,22 +9,18 @@ from urllib.parse import urlparse import s3fs +from tqdm import tqdm + +from cookbook.aliases import SourceConfig from olmo_core.aliases import PathOrStr from olmo_core.data.types import NumpyDatasetDType from 
olmo_core.io import get_file_size, is_url, normalize_path from olmo_core.utils import OLMoEnvironmentError -from tqdm import tqdm logger = logging.getLogger(__name__) logging.getLogger("botocore").setLevel(logging.WARNING) -import hashlib -import json - -from cookbook.aliases import SourceConfig - - def _bytes_to_tokens(num_bytes: int, dtype: NumpyDatasetDType) -> int: """ Convert bytes to tokens based on the dtype. @@ -65,9 +63,12 @@ def get_token_counts_and_ratios( parsed = urlparse(path) if parsed.scheme == "s3": continue - if parsed.scheme == "weka": + elif parsed.scheme == "weka": client_kwargs["endpoint_url"] = os.environ.get("WEKA_ENDPOINT_URL") - + elif parsed.scheme == "gs": + client_kwargs["endpoint_url"] = "https://storage.googleapis.com" + client_kwargs["key"] = os.environ.get("GS_INTEROP_KEY") + client_kwargs["secret"] = os.environ.get("GS_INTEROP_SECRET") fs = s3fs.S3FileSystem(client_kwargs={**client_kwargs}) with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor: @@ -109,7 +110,6 @@ def get_token_counts_and_ratios( def expand_globs(s3: s3fs.S3FileSystem, sources: List[str]) -> Any: results = [] - for source in sources: if is_url(source): logger.info(f"Expanding remote glob '{source}'...") @@ -141,12 +141,10 @@ def _expand_remote(pattern: str, fs: s3fs.S3FileSystem) -> List[str]: """ parsed = urlparse(pattern) - if parsed.scheme == "s3": + if parsed.scheme in ["s3", "weka"]: return [f"s3://{obj}" for obj in fs.glob(pattern)] elif parsed.scheme == "r2": raise NotImplementedError("'r2' types are not currently supported") - elif parsed.scheme == "weka": - return [f"weka://{obj}" for obj in fs.glob(pattern)] elif parsed.scheme == "gs": raise NotImplementedError("'gs' types are not currently supported") elif parsed.scheme in ("http", "https"):