diff --git a/src/cookbook/cli/pmr.py b/src/cookbook/cli/pmr.py
index a6520d75..57341ca7 100644
--- a/src/cookbook/cli/pmr.py
+++ b/src/cookbook/cli/pmr.py
@@ -440,7 +440,9 @@ def describe_instances(
         statuses = statuses or InstanceStatus.active()
 
         # Use provided client or create a new one with the specified region
-        client = client or boto3.client("ec2", region_name=region or cls.region)
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=region or cls.region)
 
         filters = []
 
@@ -507,7 +509,10 @@ def describe_instance(
         Returns:
             InstanceInfo object containing the instance details
         """
-        client = client or boto3.client("ec2", region_name=region or cls.region)
+        # Use provided client or create a new one with the specified region
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=region or cls.region)
         assert client, "EC2 client is required"
 
         response = client.describe_instances(InstanceIds=[instance_id])
@@ -524,7 +529,10 @@ def pause(self, client: Union["EC2Client", None] = None, wait_for_completion: bo
         Returns:
             True if pause was successful, False otherwise
         """
-        client = client or boto3.client("ec2", region_name=self.region)
+        # Use provided client or create a new one with the specified region
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=self.region)
         assert client, "EC2 client is required"
 
         # check if the instance is already paused
@@ -561,7 +569,10 @@ def resume(self, client: Union["EC2Client", None] = None, wait_for_completion: b
         Returns:
             True if resume was successful, False otherwise
         """
-        client = client or boto3.client("ec2", region_name=self.region)
+        # Use provided client or create a new one with the specified region
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=self.region)
         assert client, "EC2 client is required"
 
         # check if the instance is already running
@@ -598,7 +609,10 @@ def terminate(self, client: Union["EC2Client", None] = None, wait_for_terminatio
         Returns:
             True if termination was successful, False otherwise
         """
-        client = client or boto3.client("ec2", region_name=self.region)
+        # Use provided client or create a new one with the specified region
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=self.region)
         assert client, "EC2 client is required"
 
         try:
@@ -637,7 +651,10 @@ def get_latest_ami_id(cls, instance_type: str, client: Union["SSMClient", None]
         else:
             image_id = "/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64"
 
-        client = client or boto3.client("ssm")
+        # Use provided client or create a new one with the specified region
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ssm", region_name=os.getenv("AWS_REGION", "us-east-1"))
         assert client, "SSM client is required"
 
         parameter = client.get_parameter(Name=image_id, WithDecryption=False)
@@ -678,9 +695,15 @@ def create_instance(
             InstanceInfo object representing the newly created EC2 instance
         """
         # Initialize the EC2 client with the specified region
-        client = client or boto3.client("ec2", region_name=region)
+        if not client:
+            session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+            client = session.client("ec2", region_name=region or cls.region)
         assert client, "EC2 client is required"
 
+        vpcs = client.describe_vpcs()["Vpcs"]
+        vpc_id = vpcs[0]["VpcId"]
+        print(f"Using VPC ID: {vpc_id}")
+
         # If AMI ID is not provided, use a default Amazon Linux 2023 AMI (x86_64 or arm64 based on instance type)
         ami_id = ami_id or cls.get_latest_ami_id(instance_type)
@@ -901,8 +924,8 @@ def import_ssh_key_to_ec2(key_name: str, region: str, private_key_path: str) ->
     Returns:
         The key pair ID if the import was successful.
     """
-    # Initialize the EC2 client with the specified region
-    ec2_client = boto3.client("ec2", region_name=region)
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    ec2_client = session.client("ec2", region_name=region or os.getenv("AWS_REGION", "us-east-1"))
 
     # Use default SSH private key path if not specified
     if not private_key_path:
@@ -1210,7 +1233,8 @@ def create_instances(
         logger.info("No existing instances found. Starting with index 0")
 
     # Initialize the EC2 client with the specified region
-    ec2_client = boto3.client("ec2", region_name=region)
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    ec2_client = session.client("ec2", region_name=region)
     logger.debug(f"Initialized EC2 client for region {region}")
 
     instances = []
@@ -1257,7 +1281,8 @@ def list_instances(
     """
     logger.info(f"Listing instances with project={name} in region {region}")
 
-    client = boto3.client("ec2", region_name=region)
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    client = session.client("ec2", region_name=region)
 
     # Retrieve matching instances
     instances = InstanceInfo.describe_instances(
@@ -1306,7 +1331,8 @@ def terminate_instances(
     """
    logger.info(f"Terminating instances with project={name} in region {region}")
 
-    client = boto3.client("ec2", region_name=region)
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    client = session.client("ec2", region_name=region)
 
     # Retrieve instances matching the project and owner tags
     instances = InstanceInfo.describe_instances(
@@ -1354,7 +1380,8 @@ def pause_instances(
     """
     logger.info(f"Pausing instances with project={name} in region {region}")
 
-    client = boto3.client("ec2", region_name=region)
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    client = session.client("ec2", region_name=region)
 
     # Retrieve instances matching the project and owner tags
     instances = InstanceInfo.describe_instances(
@@ -1398,7 +1425,9 @@ def resume_instances(
         instance_id: Optional list of specific instance IDs to resume
         detach: Whether to return immediately without waiting for resume to complete
     """
-    client = boto3.client("ec2", region_name=region)
+
+    session = boto3.Session(profile_name=os.getenv("AWS_PROFILE", "default"))
+    client = session.client("ec2", region_name=region)
 
     logger.info(f"Resuming instances with project={name} in region {region}")
 
diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/build_tasks.py b/src/cookbook/recipes/pmr/alldressed-tagging/build_tasks.py
new file mode 100755
index 00000000..16ef6594
--- /dev/null
+++ b/src/cookbook/recipes/pmr/alldressed-tagging/build_tasks.py
@@ -0,0 +1,101 @@
+import os
+import argparse
+
+template = r"""#!/bin/bash
+
+set -e
+
+# Check if /mnt/raid0/models is empty and download artifacts if needed
+if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then
+    echo "Models directory is empty, downloading artifacts..."
+ mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=XXX + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." 
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" +""" + + +parser = argparse.ArgumentParser() +parser.add_argument("--output-dir", required=True, help="Directory to write shell scripts to") +args = parser.parse_args() + +os.makedirs(args.output_dir, exist_ok=True) + +for i in range(32): + output_path = os.path.join(args.output_dir, "part_%02d.sh" % i) + with open(output_path, "w") as f: + f.write(template.replace("XXX", "%02d" % i)) diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_00.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_00.sh new file mode 100755 index 00000000..c14695b9 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_00.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=00 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." 
+OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_01.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_01.sh new file mode 100755 index 00000000..870be403 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_01.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=01 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." 
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_02.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_02.sh new file mode 100755 index 00000000..d544b751 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_02.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=02 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." 
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_03.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_03.sh new file mode 100755 index 00000000..d9245c3d --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_03.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=03 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." 
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_04.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_04.sh new file mode 100755 index 00000000..d733fffb --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_04.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=04 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." 
+rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_05.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_05.sh new file mode 100755 index 00000000..d8b58bfc --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_05.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." 
+fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=05 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_06.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_06.sh new file mode 100755 index 00000000..b914d869 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_06.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." 
+ mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=06 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_07.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_07.sh new file mode 100755 index 00000000..8a24e03b --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_07.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! 
-d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=07 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." 
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_08.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_08.sh new file mode 100755 index 00000000..a06ac658 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_08.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=08 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." 
+OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_09.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_09.sh new file mode 100755 index 00000000..964e6245 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_09.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=09 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." 
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_10.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_10.sh new file mode 100755 index 00000000..281d2af7 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_10.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=10 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." 
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_11.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_11.sh new file mode 100755 index 00000000..6283eb45 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_11.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=11 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." 
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_12.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_12.sh new file mode 100755 index 00000000..e630316e --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_12.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=12 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." 
+rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_13.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_13.sh new file mode 100755 index 00000000..6f462c04 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_13.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." 
+fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=13 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_14.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_14.sh new file mode 100755 index 00000000..e50a882e --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_14.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." 
+ mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=14 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_15.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_15.sh new file mode 100755 index 00000000..afd745ec --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_15.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! 
-d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=15 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." 
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_16.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_16.sh new file mode 100755 index 00000000..d3b42475 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_16.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=16 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." 
+OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_17.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_17.sh new file mode 100755 index 00000000..16908de2 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_17.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=17 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." 
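+# The partition pass reads the tagged documents from the map step's step_final/ output
+# and splits them into per-label chunk files (regrouped in Step 4 below); stdout is
+# captured in a per-chunk log.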
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_18.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_18.sh new file mode 100755 index 00000000..0f6b3976 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_18.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=18 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." 
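+# The map pass annotates documents according to examples/tag_alldressed/tag-docs.yaml and
+# writes the result under /mnt/raid0/input/annotated/. Assumption (not verified here): the
+# fastText models staged in /mnt/raid0/models are the ones referenced by that config.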
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_19.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_19.sh new file mode 100755 index 00000000..32b1a70f --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_19.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=19 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." 
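+# Pull this chunk's filtered shard from S3 onto the local RAID volume so the tag and
+# partition passes below read local files instead of streaming from S3.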
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_20.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_20.sh new file mode 100755 index 00000000..29ec8f86 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_20.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=20 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." 
+rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_21.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_21.sh new file mode 100755 index 00000000..77d7962a --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_21.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." 
+fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=21 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_22.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_22.sh new file mode 100755 index 00000000..d2221552 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_22.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." 
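+ # Stage the two fastText classifiers used by the tagging config: the WebOrganizer topic
+ # model and what looks like a DCLM-style quality classifier (inferred from the S3 paths,
+ # not verified here).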
+ mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=22 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_23.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_23.sh new file mode 100755 index 00000000..51172d7b --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_23.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! 
-d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=23 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." 
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_24.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_24.sh new file mode 100755 index 00000000..bffba3fe --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_24.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=24 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." 
+OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_25.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_25.sh new file mode 100755 index 00000000..5a92de20 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_25.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=25 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." 
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_26.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_26.sh new file mode 100755 index 00000000..ba796f9c --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_26.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=26 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." 
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_27.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_27.sh new file mode 100755 index 00000000..01f7d857 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_27.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=27 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." 
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_28.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_28.sh new file mode 100755 index 00000000..8bed16e8 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_28.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." +fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=28 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." 
+rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_29.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_29.sh new file mode 100755 index 00000000..ffc66cd9 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_29.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." + mkdir -p "/mnt/raid0/models" + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/ + s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/ +else + echo "Models directory already contains files, skipping download..." 
+fi + +SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered" +DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned" + +# Store the input argument +X=29 + +# Step 0: Prepare runtime and local storage +echo "Preparing runtime..." +rm -rf "/mnt/raid0/input" +rm -rf "/mnt/raid0/output" +rm -rf "/mnt/raid0/logs" + +mkdir -p "/mnt/raid0/input" +mkdir -p "/mnt/raid0/output" +mkdir -p "/mnt/raid0/logs" + +cd ~/datamap-rs +git checkout undfined/tag-alldressed; git pull + + +# Step 1: Copy from S3 to local storage +echo "Copying data from S3 to local storage..." +s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/" + + +# Step 2: Run the tag operation +echo "Running tag operation..." +cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log" + + +# Step 3: Run the partition operation +echo "Running partition operation..." +cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log" + + +# Step 4: Relocate partitioned files under category directories +echo "Relocating partitioned files..." +OUTPUT_DIR="/mnt/raid0/output/partitioned" +mkdir -p "$OUTPUT_DIR" + +# Create directories and move files based on labels +echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst" +for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do + # Extract the label from the filename + label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/') + + # Fix typo in label electronics_and_hardare + label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Extract the new filename (remove chunk___*__ prefix) + new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//') + + # Fix typo in new filename + new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g') + + # Create directory if it doesn't exist + mkdir -p "$OUTPUT_DIR/$label" + + # Move the file + mv "$file" "$OUTPUT_DIR/$label/$new_filename" + + echo "Moved $file to $OUTPUT_DIR/$label/$new_filename" +done + + +# Step 5: Copy partitioned files to S3 +echo "Copying output to S3..." +s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/" +s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/" + +echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_30.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_30.sh new file mode 100755 index 00000000..2a308d52 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_30.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +# Check if /mnt/raid0/models is empty and download artifacts if needed +if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then + echo "Models directory is empty, downloading artifacts..." 
+    mkdir -p "/mnt/raid0/models"
+    s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/
+    s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/
+else
+    echo "Models directory already contains files, skipping download..."
+fi
+
+SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered"
+DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned"
+
+# Store the input argument
+X=30
+
+# Step 0: Prepare runtime and local storage
+echo "Preparing runtime..."
+rm -rf "/mnt/raid0/input"
+rm -rf "/mnt/raid0/output"
+rm -rf "/mnt/raid0/logs"
+
+mkdir -p "/mnt/raid0/input"
+mkdir -p "/mnt/raid0/output"
+mkdir -p "/mnt/raid0/logs"
+
+cd ~/datamap-rs
+git checkout undfined/tag-alldressed; git pull
+
+
+# Step 1: Copy from S3 to local storage
+echo "Copying data from S3 to local storage..."
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/"
+
+
+# Step 2: Run the tag operation
+echo "Running tag operation..."
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log"
+
+
+# Step 3: Run the partition operation
+echo "Running partition operation..."
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log"
+
+
+# Step 4: Relocate partitioned files under category directories
+echo "Relocating partitioned files..."
+OUTPUT_DIR="/mnt/raid0/output/partitioned"
+mkdir -p "$OUTPUT_DIR"
+
+# Create directories and move files based on labels
+echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst"
+for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do
+    # Extract the label from the filename
+    label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/')
+
+    # Fix typo in label electronics_and_hardare
+    label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g')
+
+    # Extract the new filename (remove chunk___*__ prefix)
+    new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//')
+
+    # Fix typo in new filename
+    new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g')
+
+    # Create directory if it doesn't exist
+    mkdir -p "$OUTPUT_DIR/$label"
+
+    # Move the file
+    mv "$file" "$OUTPUT_DIR/$label/$new_filename"
+
+    echo "Moved $file to $OUTPUT_DIR/$label/$new_filename"
+done
+
+
+# Step 5: Copy partitioned files to S3
+echo "Copying output to S3..."
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/"
+s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/"
+
+echo "Processing complete for chunk $X" diff --git a/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_31.sh b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_31.sh new file mode 100755 index 00000000..63452f77 --- /dev/null +++ b/src/cookbook/recipes/pmr/alldressed-tagging/tasks/part_31.sh @@ -0,0 +1,85 @@ +#!/bin/bash
+
+set -e
+
+# Check if /mnt/raid0/models is empty and download artifacts if needed
+if [ ! -d "/mnt/raid0/models" ] || [ -z "$(ls -A /mnt/raid0/models)" ]; then
+    echo "Models directory is empty, downloading artifacts..."
+    mkdir -p "/mnt/raid0/models"
+    s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/WebOrganizer/fasttext/models/Topic/may31_lr05_ng3_n3M6_ova_combined-v3.bin /mnt/raid0/models/
+    s5cmd cp -sp s3://ai2-llm/pretraining-data/sources/dclm/refinedweb/dolma_reformat/pools/fasttext_models/oh_uc_wc_eli5_fasttext_model_bigram_200k.bin /mnt/raid0/models/
+else
+    echo "Models directory already contains files, skipping download..."
+fi
+
+SRC_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered"
+DST_S3_PREFIX="s3://ai2-llm/pretraining-data/sources/cc_all_dressed/all_dressed_v3/sa_minlen500/filtered_may31_lr05_ng3_n3M6_ova_combined-v3-partitioned"
+
+# Store the input argument
+X=31
+
+# Step 0: Prepare runtime and local storage
+echo "Preparing runtime..."
+rm -rf "/mnt/raid0/input"
+rm -rf "/mnt/raid0/output"
+rm -rf "/mnt/raid0/logs"
+
+mkdir -p "/mnt/raid0/input"
+mkdir -p "/mnt/raid0/output"
+mkdir -p "/mnt/raid0/logs"
+
+cd ~/datamap-rs
+git checkout undfined/tag-alldressed; git pull
+
+
+# Step 1: Copy from S3 to local storage
+echo "Copying data from S3 to local storage..."
+s5cmd cp -sp "$SRC_S3_PREFIX/${X}/*" "/mnt/raid0/input/"
+
+
+# Step 2: Run the tag operation
+echo "Running tag operation..."
+cargo run --release -- map --input-dir /mnt/raid0/input --output-dir /mnt/raid0/input/annotated/ --config examples/tag_alldressed/tag-docs.yaml > "/mnt/raid0/logs/tag-docs-${X}.log"
+
+
+# Step 3: Run the partition operation
+echo "Running partition operation..."
+cargo run --release -- partition --input-dir /mnt/raid0/input/annotated/step_final/ --output-dir /mnt/raid0/input/partitioned/ --config examples/tag_alldressed/partition-docs.yaml > "/mnt/raid0/logs/partition-docs-${X}.log"
+
+
+# Step 4: Relocate partitioned files under category directories
+echo "Relocating partitioned files..."
+OUTPUT_DIR="/mnt/raid0/output/partitioned"
+mkdir -p "$OUTPUT_DIR"
+
+# Create directories and move files based on labels
+echo "Looking for files matching pattern: /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst"
+for file in /mnt/raid0/input/partitioned/chunk___*__*.jsonl.zst; do
+    # Extract the label from the filename
+    label=$(basename "$file" | sed 's/.*__\([^.]*\)\..*/\1/')
+
+    # Fix typo in label electronics_and_hardare
+    label=$(echo "$label" | sed 's/electronics_and_hardare/electronics_and_hardware/g')
+
+    # Extract the new filename (remove chunk___*__ prefix)
+    new_filename=$(basename "$file" | sed 's/chunk___[^_]*__//')
+
+    # Fix typo in new filename
+    new_filename=$(echo "$new_filename" | sed 's/electronics_and_hardare/electronics_and_hardware/g')
+
+    # Create directory if it doesn't exist
+    mkdir -p "$OUTPUT_DIR/$label"
+
+    # Move the file
+    mv "$file" "$OUTPUT_DIR/$label/$new_filename"
+
+    echo "Moved $file to $OUTPUT_DIR/$label/$new_filename"
+done
+
+
+# Step 5: Copy partitioned files to S3
+echo "Copying output to S3..."
+s5cmd cp -sp "/mnt/raid0/output/partitioned/*" "$DST_S3_PREFIX/${X}/"
+s5cmd cp -sp "/mnt/raid0/logs/*.log" "$DST_S3_PREFIX/logs/"
+
+echo "Processing complete for chunk $X"