Commit 9ee1b25

Merge branch 'master' into onnx_parser

2 parents: 3b50b6f + 6af4b02

Note: GitHub hides some content in large commits by default; the names of several new files below are not shown in this view.

60 files changed (+1924, -1045 lines)

`.github/workflows/benchmark.yml` (1 addition & 1 deletion)

```diff
@@ -617,7 +617,7 @@ jobs:
       run: |
         # generate quantized weights
         ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
-        ln -s /data/home/tiny/tinygrad/testsig-0x858d6c15.so .
+        ln -s /data/home/tiny/tinygrad/testsig-*.so .
         PYTHONPATH=. CC=clang-19 CPU=1 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
         # benchmark on DSP with NOOPT=1, the devectorizer has issues
         PYTHONPATH=. CC=clang-19 DSP=1 DONT_REALIZE_EXPAND=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
```
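The new glob keeps this CI step working when the testsig hash changes. As a rough illustration, here is what the shell wildcard expansion does, sketched in Python (the directory path is the one hard-coded in the workflow):

```python
import os
from pathlib import Path

src = Path("/data/home/tiny/tinygrad")
for sig in src.glob("testsig-*.so"):   # matches any hash, not just 0x858d6c15
    link = Path(sig.name)
    if not link.exists():
        os.symlink(sig, link)          # equivalent of `ln -s .../testsig-*.so .`
```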

`.github/workflows/mlperf.yml` (1 addition & 1 deletion)

```diff
@@ -22,4 +22,4 @@ jobs:
         ln -s /raid/datasets/imagenet extra/datasets/imagenet
     - name: Run resnet
       run: |
-        BENCHMARK_LOG=mlpert_train_resnet LOGMLPERF=0 examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
+        BENCHMARK_LOG=mlpert_train_resnet LOGMLPERF=0 examples/mlperf/training_submission_v5.1/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
```

`autogen_stubs.sh` (1 addition & 1 deletion)

```diff
@@ -225,7 +225,7 @@ generate_libc() {

   sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libc.py
   sed -i "s\FIXME_STUB\libc\g" $BASE/libc.py
-  sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path)\g" $BASE/libc.py
+  sed -i "s\FunctionFactoryStub()\None if (libc_path := ctypes.util.find_library('c')) is None else ctypes.CDLL(libc_path, use_errno=True)\g" $BASE/libc.py

   fixup $BASE/libc.py
 }
```
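Passing `use_errno=True` is what makes `ctypes.get_errno()` meaningful after a failing libc call. A minimal standalone sketch of the behavior the generated `libc.py` now gets (illustrative, not code from this commit):

```python
import ctypes, ctypes.util, os

# Load libc the same way the generated stub does, with errno tracking enabled.
libc_path = ctypes.util.find_library("c")
libc = None if libc_path is None else ctypes.CDLL(libc_path, use_errno=True)

if libc is not None and libc.close(-1) == -1:   # close(-1) fails with EBADF
    err = ctypes.get_errno()                    # captured because use_errno=True
    print(os.strerror(err))                     # e.g. "Bad file descriptor"
```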

`docs/abstractions2.py` (3 additions & 3 deletions)

```diff
@@ -51,11 +51,11 @@
 # describe the computation
 buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
 buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
-ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop()))
-ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop()))
+ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
+ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),))
 alu = ld_1 + ld_2
 output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
-st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu))
+st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.view(ShapeTracker.from_shape((1,))), alu))
 s = UOp(Ops.SINK, dtypes.void, (st_0,))

 # convert the computation to a "linearized" format (print the format)
```
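This doc change tracks a tinygrad-internal API shift: the ShapeTracker no longer travels as a separate `.to_uop()` source of LOAD/STORE; it is attached to the buffer itself with `.view(...)`. Side by side, using the names from `docs/abstractions2.py` (import paths vary by tinygrad version):

```python
# before: the ShapeTracker was serialized to its own UOp and passed as a second source
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop()))

# after: the view rides on the buffer UOp, so LOAD takes a single source
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
```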
New file, name hidden (15 additions): a BERT dev script for a single AMD GPU.

```bash
#!/bin/bash

export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128

export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
# export BEAM_LOG_SURPASS_MAX=1
# export BASEDIR="/raid/datasets/wiki"

export RESET_STEP=1
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2

python3 examples/mlperf/model_train.py
```
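The `BEAM*` and `DEBUG` knobs here are environment variables that tinygrad reads as ContextVars, so the same settings can also be scoped inside Python via `tinygrad.helpers.Context` instead of being exported process-wide. A sketch, assuming the standard ContextVar mechanism:

```python
from tinygrad import Tensor
from tinygrad.helpers import Context

# Scope BEAM kernel search (and verbose scheduling) to one region,
# mirroring `export BEAM=3 ... DEBUG=2` in the script above.
with Context(BEAM=3, DEBUG=2):
    x = Tensor.rand(128, 128)
    y = (x @ x).realize()   # kernels compiled here go through BEAM search
```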
New file, name hidden (69 additions): the BERT benchmark README.

````markdown
# 1. Problem

This problem uses BERT for NLP.

## Requirements

Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the mlperf extra in setup.py):
```
git clone https://github.com/tinygrad/tinygrad.git
python3 -m pip install -e ".[mlperf]"
```
Also install gdown (for the dataset), numpy, tqdm, and tensorflow:
```
pip install gdown numpy tqdm tensorflow
```

### tinybox_green
Install the p2p driver per the [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
This is the default on production tinybox green.

# 2. Directions

## Steps to download and verify data

### 1. Download raw data

```
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
```

### 2. Preprocess train and validation data

Note: the number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.

#### Training:
```
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
```

To generate a specific topic (between 0 and 499):
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
```

#### Validation:
```
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
```

## Running

### tinybox_green

#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
```

### tinybox_red

#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
```

### tinybox_8xMI300X

#### Steps to run benchmark
```
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
```
````
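The per-topic preprocessing interface in the README also makes selective regeneration easy to script. A small driver-loop sketch (the path and CLI come from the README; the loop itself is illustrative):

```python
import os, subprocess

env = dict(os.environ, BASEDIR="/raid/datasets/wiki")
for topic in range(500):  # topics 0..499, per the README
    subprocess.run(
        ["python3", "extra/datasets/wikipedia.py", "pre-train", str(topic)],
        env=env, check=True,
    )
```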
New file, name hidden (14 additions): an 8-GPU AMD BERT dev script.

```bash
#!/bin/bash

export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1

export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"

export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2

python3 examples/mlperf/model_train.py
```
New file, name hidden (17 additions): an 8-GPU AMD BERT training run script.

```bash
#!/bin/bash

export PYTHONPATH="." AMD=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024

# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900

export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"

export WANDB=1 PARALLEL=0

RUNMLPERF=1 python3 examples/mlperf/model_train.py
```
New file, name hidden (29 additions): likely the tinybox_8xMI300X `run_and_time.sh`, given its SUBMISSION_PLATFORM.

```bash
#!/bin/bash
set -e          # Exit on any error
set -o pipefail # Make pipeline fail if any command fails

export PYTHONPATH="." AMD=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_8xMI300X"
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024

# similar to https://github.com/mlcommons/training_results_v3.1/blob/d06288b2bd675a9d88e0e6181f5bb5626b71ec19/Quanta_Cloud_Technology/results/D54U-3U/bert/result_1.txt#L54
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
export TRAIN_STEPS=3900

export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
export BASEDIR="/raid/datasets/wiki"

# pip install -e ".[mlperf]"
export LOGMLPERF=1

export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_8xMI300x_${DATETIME}_${SEED}.log"

# init # TODO: without DEBUG=2 it hangs
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 DEBUG=2 python3 examples/mlperf/model_train.py | tee $LOGFILE

# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
```
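The script runs two phases against one log file: an init pass (`BENCHMARK`/`INITMLPERF`, which the surrounding flags suggest is where BEAM compilation happens on a truncated `BERT_LAYERS=2` model) and then the timed `RUNMLPERF` pass appended to the same log. The same orchestration can be sketched from Python; a hedged equivalent that keeps the single-logfile behavior but drops the `tee` to stdout:

```python
import os, subprocess, datetime, random

seed = str(random.randrange(2**15))   # stands in for bash's $RANDOM (0..32767)
logfile = f"bert_8xMI300x_{datetime.datetime.now():%m%d%H%M}_{seed}.log"
base = dict(os.environ, SEED=seed)

with open(logfile, "ab") as log:
    # init pass: compile/benchmark only
    subprocess.run(["python3", "examples/mlperf/model_train.py"],
                   env=dict(base, BENCHMARK="10", INITMLPERF="1",
                            BERT_LAYERS="2", DEBUG="2"),
                   stdout=log, check=True)
    # run pass: the timed MLPerf training
    subprocess.run(["python3", "examples/mlperf/model_train.py"],
                   env=dict(base, PARALLEL="0", RUNMLPERF="1"),
                   stdout=log, check=True)
```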
New file, name hidden (69 additions): a second copy of the BERT benchmark README, identical in content to the one shown above.
New file, name hidden (16 additions): a 6-GPU NVIDIA BERT dev script.

```bash
#!/bin/bash

export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96

export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0

export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"

export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2

python3 examples/mlperf/model_train.py
```
New file, name hidden (15 additions): a 6-GPU NVIDIA BERT training run script.

```bash
#!/bin/bash

export PYTHONPATH="." NV=1
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96

export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0

export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"

export WANDB=1 PARALLEL=0

RUNMLPERF=1 python3 examples/mlperf/model_train.py
```
New file, name hidden (27 additions): likely the tinybox_green `run_and_time.sh`, given its SUBMISSION_PLATFORM.

```bash
#!/bin/bash
set -e          # Exit on any error
set -o pipefail # Make pipeline fail if any command fails

export PYTHONPATH="." NV=1
export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_green"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96

export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0

export BEAM=8 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"

# pip install -e ".[mlperf]"
export LOGMLPERF=1

export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_green_${DATETIME}_${SEED}.log"

# init
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE

# run
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
```
New file, name hidden (69 additions & 0 deletions): a third copy of the BERT benchmark README, identical in content to the one shown above.
