# tinygrad is a tensor library, and as a tensor library it has multiple parts
# 1. a "runtime". this allows buffer management, compilation, and running programs
# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all backends
# 3. a "UOp" that fuses the compute into kernels, using memory only when needed
# 4. a "Tensor" that provides an easy-to-use frontend with autograd ".backward()"


print("******** first, the runtime ***********")

from tinygrad.runtime.ops_cpu import ClangJITCompiler, MallocAllocator, CPUProgram

# allocate some buffers
out = MallocAllocator.alloc(4)
a = MallocAllocator.alloc(4)
b = MallocAllocator.alloc(4)

# load in some values (little endian)
MallocAllocator._copyin(a, memoryview(bytearray([2,0,0,0])))
MallocAllocator._copyin(b, memoryview(bytearray([3,0,0,0])))
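
# a minimal sketch of why those bytes are "(little endian)": struct.pack with the
# "<" (little-endian) format prefix produces the same byte layout
import struct
assert struct.pack("<i", 2) == bytes([2,0,0,0])
assert struct.pack("<i", 3) == bytes([3,0,0,0])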

# compile a program to a binary
lib = ClangJITCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")

# create a runtime for the program
fxn = CPUProgram("add", lib)

# run the program
fxn(out, a, b)

# check the data out
print(val := MallocAllocator._as_buffer(out).cast("I").tolist()[0])
assert val == 5
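
# the same three runtime pieces compose for any C function; a minimal sketch
# reusing the buffers above with a hypothetical "mul" kernel
mul_lib = ClangJITCompiler().compile("void mul(int *out, int *a, int *b) { out[0] = a[0] * b[0]; }")
mul_fxn = CPUProgram("mul", mul_lib)
mul_fxn(out, a, b)
assert MallocAllocator._as_buffer(out).cast("I").tolist()[0] == 6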


print("******** second, the Device ***********")

DEVICE = "CPU" # NOTE: you can change this!

import struct
from tinygrad.dtype import dtypes
from tinygrad.device import Buffer, Device
from tinygrad.uop.ops import UOp, Ops
from tinygrad.shape.shapetracker import ShapeTracker

# allocate some buffers + load in values
out = Buffer(DEVICE, 1, dtypes.int32).allocate()
a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
# NOTE: a._buf is the same as the return from MallocAllocator.alloc
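
# a quick sanity check, a minimal sketch using the same as_buffer call this
# section ends with: the Buffers round-trip the bytes copied in
assert a.as_buffer().cast("I")[0] == 2 and b.as_buffer().cast("I")[0] == 3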

# describe the computation
buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1.view(ShapeTracker.from_shape((1,))),))
ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2.view(ShapeTracker.from_shape((1,))),))
alu = ld_1 + ld_2
output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
st_0 = UOp(Ops.STORE, dtypes.void, (output_buf.view(ShapeTracker.from_shape((1,))), alu))
s = UOp(Ops.SINK, dtypes.void, (st_0,))
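
# the UOps above form a small DAG; a minimal sketch walking it as a tree with
# plain Python (this assumes nothing beyond the .op and .src fields used
# throughout this file)
def print_tree(u:UOp, depth=0):
  print("  "*depth + str(u.op))
  for src in u.src: print_tree(src, depth+1)
print_tree(s)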

# convert the computation to a "linearized" format
from tinygrad.engine.realize import get_kernel, CompiledRunner
kernel = get_kernel(Device[DEVICE].renderer, s).linearize()

# compile a program (and print the source)
fxn = CompiledRunner(kernel.to_program())
print(fxn.p.src)
# NOTE: fxn.clprg is the CPUProgram

# run the program
fxn.exec([out, a, b])

# check the data out
assert out.as_buffer().cast('I')[0] == 5


print("******** third, the UOp ***********")

from tinygrad.engine.realize import run_schedule
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.engine.grouper import get_kernelize_map

# allocate some buffers + load in values
a = UOp.new_buffer(DEVICE, 1, dtypes.int32)
b = UOp.new_buffer(DEVICE, 1, dtypes.int32)
a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))

# describe the computation
out = a + b
s = UOp(Ops.SINK, dtypes.void, (out,))

# group the computation into kernels
becomes_map = get_kernelize_map(s)

# the compute maps to an assign (UOps are interned, so a+b here is the same UOp as out)
assign = becomes_map[a+b]

# the first source is the output buffer (data)
assert assign.src[0].op is Ops.BUFFER
# the second source is the kernel (compute)
assert assign.src[1].op is Ops.KERNEL
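
# DEBUGGING: like any UOp, the KERNEL in the graph can simply be printed to see
# what was grouped (a minimal sketch; the repr is an implementation detail)
print(assign.src[1])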

# schedule the kernel graph in a linear list
s = UOp(Ops.SINK, dtypes.void, (assign,))
sched, _ = create_schedule_with_vars(s)
assert len(sched) == 1

# DEBUGGING: print the compute ast
print(sched[-1].ast)
# NOTE: sched[-1].ast is the same compute as the SINK we built in the Device section

# the output will be stored in a new buffer
out = assign.buf_uop
assert out.op is Ops.BUFFER and not out.buffer.is_allocated()
print(out)

# run that schedule
run_schedule(sched)

# check the data out
assert out.is_realized and out.buffer.as_buffer().cast('I')[0] == 5
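
# a small check tying back to the header: the UOp layer "uses memory only when
# needed", so the output buffer only became backed by real memory when the
# schedule actually ran (is_allocated was asserted False above, before run_schedule)
assert out.buffer.is_allocated()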


print("******** fourth, the Tensor ***********")

from tinygrad import Tensor

a = Tensor([2], dtype=dtypes.int32, device=DEVICE)
b = Tensor([3], dtype=dtypes.int32, device=DEVICE)
out = a + b

# check the data out
print(val := out.item())
assert val == 5
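
# bonus: the header promises autograd with ".backward()"; a minimal sketch,
# differentiating y = x*x at x = 3.0 (dy/dx = 2*x, so the gradient should be 6)
x = Tensor([3.0], requires_grad=True)
y = (x*x).sum()
y.backward()
print(grad := x.grad.item())
assert grad == 6.0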