diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..2d56d71c0 Binary files /dev/null and b/.DS_Store differ diff --git a/.github/.DS_Store b/.github/.DS_Store new file mode 100644 index 000000000..71a545d9d Binary files /dev/null and b/.github/.DS_Store differ diff --git a/.github/workflows/doc_build.yml b/.github/workflows/doc_build.yml new file mode 100644 index 000000000..9080712d2 --- /dev/null +++ b/.github/workflows/doc_build.yml @@ -0,0 +1,57 @@ +name: Docs + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + architecture: x64 + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install Dependencies + run: | + + set -eux + pip install --upgrade pip + pip install -r build-requirements.txt + pip install --no-build-isolation . + + pip install -r docs/requirements.txt + + - name: Build Sphinx Docs + working-directory: docs + run: | + set -eux + + make html + - name: Upload static files as artifact + id: deployment + uses: actions/upload-pages-artifact@v3 + with: + path: docs/build/html/ + + deploy: + runs-on: ubuntu-latest + needs: build + if: ${{ github.ref == 'refs/heads/main' }} + permissions: + pages: write # to deploy to Pages + id-token: write # to verify the deployment originates from an appropriate source + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index fab347461..33252d739 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,14 @@ Cargo.lock # mdbook output books/hyperactor-book/book/** +# macOS files +.DS_Store +docs/source/.DS_Store + +# Sphinx build files +build/** +docs/_build/** +docs/build/** +docs/generated/** + CLAUDE.md diff --git a/books/hyperactor-book/README.md 
b/books/hyperactor-book/README.md index 45c2046a7..517e94f14 100644 --- a/books/hyperactor-book/README.md +++ b/books/hyperactor-book/README.md @@ -1,7 +1,14 @@ -# hyperactor Documentation Book +# Hyperactor Documentation Book This is the development documentation for the hyperactor system, built using [`mdBook`](https://rust-lang.github.io/mdBook/). +```{toctree} +:maxdepth: 2 +:caption: Contents + +./src/introduction +``` + ## Running the Book ### On the **Server** diff --git a/books/hyperactor-book/src/introduction.md b/books/hyperactor-book/src/introduction.md index 7f712e7de..3fd4df2ee 100644 --- a/books/hyperactor-book/src/introduction.md +++ b/books/hyperactor-book/src/introduction.md @@ -5,3 +5,14 @@ This book describes the design and implementation of the hyperactor runtime. The goal is to provide a clear, structured explanation of how actors communicate safely and efficiently across distributed systems using hyperactor’s abstractions. We hope this becomes the book we wish we had when we started working with Monarch. Work in progress. + +```{toctree} +:maxdepth: 2 +:caption: Contents + +refrences +mailbox +macros +actors +summary +``` diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/docs/.DS_Store differ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..f7f9d66b6 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,35 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +# Generate example documentation from Python files +generate-examples: + @echo "Generating example documentation..." 
+ @cd "$(SOURCEDIR)" && python GenerateExamples.py + +# Override html target to run generate-examples first and include books +html: + @echo "Building HTML documentation with books directory..." + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile html + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + @echo "Cleaning up build directory..." + @rm -rf "$(BUILDDIR)" + @rm -rf "$(SOURCEDIR)/generated" diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..747ffb7b3 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..a24770958 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,13 @@ +sphinx==7.2.6 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 +sphinxcontrib.katex==0.9.10 +#breathe==4.34.0 # only if generating C++ +exhale==0.2.3 # only if generating C++ docs +docutils>=0.18.1,<0.21 +sphinx-design==0.6.1 +sphinxcontrib-mermaid==1.0.0 +myst-parser #==0.18.1 # if want to contribute in markdown +sphinx-gallery==0.14.0 # only if hosting interactive tutorials +sphinx-sitemap==2.7.1 +sphinxext-opengraph +nbsphinx \ No newline at end of file diff --git a/docs/source/.DS_Store b/docs/source/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/docs/source/.DS_Store differ diff --git a/docs/source/books/books.md b/docs/source/books/books.md new file mode 100644 index 000000000..db97a6575 --- /dev/null +++ b/docs/source/books/books.md @@ -0,0 +1,16 @@ +# Books + +This section contains books and comprehensive guides for Monarch. + +```{toctree} +:maxdepth: 2 +:caption: Available Books +:hidden: + +./hyperactor-book/src/introduction +``` + + +## [HyperActor Book](./hyperactor-book/src/introduction) + +The HyperActor Book provides a comprehensive guide to using the actor system in Monarch. 
diff --git a/docs/source/books/hyperactor-book/.gitignore b/docs/source/books/hyperactor-book/.gitignore new file mode 100644 index 000000000..3006b271d --- /dev/null +++ b/docs/source/books/hyperactor-book/.gitignore @@ -0,0 +1 @@ +book/ diff --git a/docs/source/books/hyperactor-book/README.md b/docs/source/books/hyperactor-book/README.md new file mode 100644 index 000000000..01be21768 --- /dev/null +++ b/docs/source/books/hyperactor-book/README.md @@ -0,0 +1,75 @@ +# Hyperactor Documentation Book + +This is the development documentation for the hyperactor system, built using [`mdBook`](https://rust-lang.github.io/mdBook/). + +```{toctree} +:maxdepth: 2 +:caption: Contents + +./src/introduction +./src/refrences +mailbox +macros +actors +summary +``` + +## Running the Book + +### On the **Server** + +To run the book on a remote server (e.g., `devgpu004`): + +```bash +x2ssh devgpu004.rva5.facebook.com +tmux new -s mdbook +cd ~/fbsource/fbcode/monarch/books/hyperactor-book +mdbook serve +``` +Then detach with Ctrl+b, then d. + +### On the **Client** + +To access the remote book from your local browser: +```bash +autossh -M 0 -N -L 3000:localhost:3000 devgpu004.rva5.facebook.com +``` +Then open http://localhost:3000 in your browser. + +**Note**: If you don’t have autossh installed, you can install it with: +```bash +brew install autossh +``` + +### Notes + +- The source is located in src/, with structure defined in SUMMARY.md. +- The book will auto-reload in the browser on edits. 
+ +## Cleaning Up + +To shut down the book server: + +### Option 1: Reattach and stop + +```bash +x2ssh devgpu004.rva5.facebook.com +tmux attach -t mdbook +``` +Inside the session: +- Press Ctrl+C to stop mdbook serve +- Then type exit to close the shell and terminate the tmux session + +### Option 2: Kill the session directly + +If you don’t want to reattach, you can kill the session from a new shell: +```bash +x2ssh devgpu004.rva5.facebook.com +tmux kill-session -t mdbook +``` + +### Optional: View active tmux sessions +```bash +tmux ls +``` +Use this to check whether the mdbook session is still running. diff --git a/docs/source/books/hyperactor-book/book.toml b/docs/source/books/hyperactor-book/book.toml new file mode 100644 index 000000000..cea7f9d89 --- /dev/null +++ b/docs/source/books/hyperactor-book/book.toml @@ -0,0 +1,9 @@ +[book] +authors = ["Shayne Fletcher"] +language = "en" +src = "src" +title = "Hyperactor Book" + +[output.html] +git-repository-url = "https://github.com/pytorch-labs/monarch" +edit-url-template = "https://github.com/pytorch-labs/monarch/edit/main/books/hyperactor-book/src/{path}" diff --git a/docs/source/books/hyperactor-book/src/SUMMARY.md b/docs/source/books/hyperactor-book/src/SUMMARY.md new file mode 100644 index 000000000..555653504 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/SUMMARY.md @@ -0,0 +1,39 @@ +# Summary + +- [Introduction](./introduction.md) +- [References](references/index.md) + - [Syntax](references/syntax.md) + - [WorldId](references/world_id.md) + - [ProcId](references/proc_id.md) + - [ActorId](references/actor_id.md) + - [PortId](references/port_id.md) + - [GangId](references/gang_id.md) + - [Reference](references/reference.md) + - [Typed References](references/typed_refs.md) +- [Mailboxes and Routers](mailboxes/index.md) + - [Ports](mailboxes/ports.md) + - [MailboxSender](mailboxes/mailbox_sender.md) + - [Reconfigurable Senders](mailboxes/reconfigurable_sender.md) + - 
[MailboxServer](mailboxes/mailbox_server.md) + - [MailboxClient](mailboxes/mailbox_client.md) + - [Mailbox](mailboxes/mailbox.md) + - [Delivery Semantics](mailboxes/delivery.md) + - [Multiplexers](mailboxes/multiplexer.md) + - [Routers](mailboxes/routers.md) +- [Actors](actors/index.md) + - [Actor](actors/actor.md) + - [Handler](actors/handler.md) + - [RemotableActor](actors/remotable_actor.md) + - [Checkpointable](actors/checkpointable.md) + - [RemoteActor](actors/remote_actor.md) + - [Binds](actors/binds.md) + - [RemoteHandles](actors/remote_handles.md) + - [ActorHandle](actors/actor_handle.md) + - [Actor Lifecycle](actors/actor_lifecycle.md) +- [Macros](macros/index.md) + - [`#[derive(Handler)]`](macros/handler.md) + - [`#[derive(HandleClient)]`](macros/handle_client.md) + - [`#[derive(RefClient)]`](macros/ref_client.md) + - [`#[derive(Named)]`](macros/named.md) + - [`#[export]`](macros/export.md) + - [`#[forward]`](macros/forward.md) diff --git a/docs/source/books/hyperactor-book/src/actors/actor.md b/docs/source/books/hyperactor-book/src/actors/actor.md new file mode 100644 index 000000000..9d7ef93c2 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/actor.md @@ -0,0 +1,169 @@ +# The `Actor` Trait + +The `Actor` trait defines the core behavior of all actors in the hyperactor runtime. + +Every actor type must implement this trait to participate in the system. It defines how an actor is constructed, initialized, and supervised. 
+ +```rust +#[async_trait] +pub trait Actor: Sized + Send + Debug + 'static { + type Params: Send + 'static; + + async fn new(params: Self::Params) -> Result; + + async fn init(&mut self, _this: &Instance) -> Result<(), anyhow::Error> { + Ok(()) + } + + async fn spawn( + cap: &impl cap::CanSpawn, + params: Self::Params, + ) -> anyhow::Result> { + cap.spawn(params).await + } + + async fn spawn_detached(params: Self::Params) -> Result, anyhow::Error> { + Proc::local().spawn("anon", params).await + } + + fn spawn_server_task(future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + tokio::spawn(future) + } + + async fn handle_supervision_event( + &mut self, + _this: &Instance, + _event: &ActorSupervisionEvent, + ) -> Result { + Ok(false) + } + + async fn handle_undeliverable_message( + &mut self, + this: &Instance, + Undeliverable(envelope): Undeliverable, + ) -> Result<(), anyhow::Error> { + assert_eq!(envelope.sender(), this.self_id()); + + anyhow::bail!(UndeliverableMessageError::delivery_failure(&envelope)); + } +} +``` + +## Construction: `Params` and `new` + +Each actor must define a `Params` type: + +```rust +type Params: Send + 'static; +``` + +This associated type defines the data required to instantiate the actor. + +The actor is constructed by the runtime using: +```rust +async fn new(params: Self::Params) -> Result; +``` + +This method returns the actor's internal state. At this point, the actor has not yet been connected to the runtime; it has no mailbox and cannot yet send or receive messages. `new` is typically used to construct the actor's fields from its input parameters. + +## Initialization: `init` + +```rust +async fn init(&mut self, this: &Instance) -> Result<(), anyhow::Error> +``` + +The `init` method is called after the actor has been constructed with `new` and registered with the runtime. 
It is passed a reference to the actor's `Instance`, allowing access to runtime services such as: +- The actor’s ID and status +- The mailbox and port system +- Capabilities for spawning or sending messages + +The default implementation does nothing and returns `Ok(())`. + +If `init` returns an error, the actor is considered failed and will not proceed to handle any messages. + +Use `init` to perform startup logic that depends on the actor being fully integrated into the system. + +## Spawning: `spawn` + +The `spawn` method provides a default implementation for creating a new actor from an existing one: + +```rust +async fn spawn( + cap: &impl cap::CanSpawn, + params: Self::Params, +) -> anyhow::Result> { + cap.spawn(params).await +} +``` + +In practice, `CanSpawn` is only implemented for `Instance`, which represents a running actor. As a result, `Actor::spawn(...)` always constructs a child actor: the new actor receives a child ID and is linked to its parent through the runtime. + +## Detached Spawning: `spawn_detached` + +```rust +async fn spawn_detached(params: Self::Params) -> Result, anyhow::Error> { + Proc::local().spawn("anon", params).await +} +``` +This method creates a root actor on a fresh, isolated proc. +- The proc is local-only and cannot forward messages externally. +- The actor receives a unique root `ActorId` with no parent. +- No supervision or linkage is established. +- The actor is named `"anon"`. + +## Background Tasks: `spawn_server_task` + +```rust +fn spawn_server_task(future: F) -> JoinHandle +where + F: Future + Send + 'static, + F::Output: Send + 'static, +{ + tokio::spawn(future) +} +``` + +This method provides a hook point for customizing how the runtime spawns background tasks. + +By default, it simply calls `tokio::spawn(...)` to run the given future on the Tokio executor. 
+ +## Supervision Events: `handle_supervision_event` + +```rust +async fn handle_supervision_event( + &mut self, + _this: &Instance, + _event: &ActorSupervisionEvent, +) -> Result { + Ok(false) +} +``` +This method is invoked when the runtime delivers an `ActorSupervisionEvent` to the actor — for example, when a child crashes or exits. + +By default, it returns `Ok(false)`, which indicates that the event was not handled by the actor. This allows the runtime to fall back on default behavior (e.g., escalation). + +Actors may override this to implement custom supervision logic. + +## Undeliverables: `handle_undeliverable_message` + +```rust +async fn handle_undeliverable_message( + &mut self, + this: &Instance, + Undeliverable(envelope): Undeliverable, +) -> Result<(), anyhow::Error> { + assert_eq!(envelope.sender(), this.self_id()); + + anyhow::bail!(UndeliverableMessageError::delivery_failure(&envelope)); +} +``` +This method is called when a message sent by this actor fails to be delivered. +- It asserts that the message was indeed sent by this actor. +- Then it returns an error: `Err(UndeliverableMessageError::DeliveryFailure(...))` + +This signals that the actor considers this delivery failure to be a fatal error. You may override this method to suppress the failure or to implement custom fallback behavior. diff --git a/docs/source/books/hyperactor-book/src/actors/actor_handle.md b/docs/source/books/hyperactor-book/src/actors/actor_handle.md new file mode 100644 index 000000000..3befb189c --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/actor_handle.md @@ -0,0 +1,174 @@ +# `ActorHandle` + +An `ActorHandle` is a reference to a **local, running actor** of type `A`. It provides access to the actor's messaging ports, lifecycle status, and control methods (such as stop signals). + +Unlike remote references (e.g. 
`ActorRef`), which may refer to actors on other `Proc`s, an `ActorHandle` only exists within the same `Proc` and can be sent messages without requiring serialization. + +## Definition + +```rust +pub struct ActorHandle { + cell: InstanceCell, + ports: Arc>, +} +``` +An `ActorHandle` contains: +- `cell` is the actor’s internal runtime state, including identity and lifecycle metadata. +- `ports` is a shared dictionary of all typed message ports available to the actor. + +This handle is cloneable, sendable across tasks, and allows interaction with the actor via messaging, status observation, and controlled shutdown. + +## Methods + +### `new` (internal) + +Constructs a new `ActorHandle` from its backing `InstanceCell` and `Ports`. This is called by the runtime when spawning a new actor. +```rust +pub(crate) fn new(cell: InstanceCell, ports: Arc>) -> Self { + Self { cell, ports } +} +``` + +### `cell` (internal) + +Returns the underlying `InstanceCell` backing the actor. +```rust +pub(crate) fn cell(&self) -> &InstanceCell { + &self.cell +} +``` + +### `actor_id` + +Returns the `ActorId` of the actor represented by this handle. +```rust +pub fn actor_id(&self) -> &ActorId { + self.cell.actor_id() +} +``` + +### `drain_and_stop` + +Signals the actor to drain any pending messages and then stop. This enables a graceful shutdown procedure. +```rust +pub fn drain_and_stop(&self) -> Result<(), ActorError> { + self.cell.signal(Signal::DrainAndStop) +} +``` + +### `status` + +Returns a watch channel that can be used to observe the actor's lifecycle status (e.g., running, stopped, crashed). +```rust +pub fn status(&self) -> watch::Receiver { + self.cell.status().clone() +} +``` + +### `send` + +Sends a message of type `M` to the actor. The actor must implement `Handler` for this to compile. + +Messages sent via an `ActorHandle` are always delivered in-process and do not require serialization. 
+```rust +pub fn send(&self, message: M) -> Result<(), MailboxSenderError> +where + A: Handler, +{ + self.ports.get().send(message) +} +``` + +### `port` + +Returns a reusable port handle for the given message type. +```rust +pub fn port(&self) -> PortHandle +where + A: Handler, +{ + self.ports.get() +} +``` + +### `bind` + +Creates a remote reference (`ActorRef`) by applying a `Binds` implementation. +```rust +pub fn bind>(&self) -> ActorRef { + self.cell.bind(self.ports.as_ref()) +} +``` + +### Binding and ActorRefs + +The `bind()` method on `ActorHandle` creates an `ActorRef` for a given remote-facing reference type `R`. This is the bridge between a local actor instance and its externally visible interface. +```rust +pub fn bind>(&self) -> ActorRef +``` +This method requires that `R` implements the `Binds` trait. The `Binds` trait specifies how to associate a remote-facing reference type with the concrete ports handled by the actor: +```rust +pub trait Binds: RemoteActor { + fn bind(ports: &Ports); +} +``` +In practice, `A` and `R` are usually the same type; this is the pattern produced by the `#[export]` macro. But `R` can also be a trait object or wrapper that abstracts over multiple implementations. + +### Binding internals + +Calling `bind()` on the `ActorHandle`: +1. Invokes the `Binds::bind()` implementation for `R`, registering the actor's message handlers into the `Ports` dictionary. +2. Always binds the `Signal` type (used for draining, stopping, and supervision). +3. Records the bound message types into `InstanceState::exported_named_ports`, enabling routing and diagnostics. +4. Constructs the final `ActorRef` using `ActorRef::attest(...)`, which assumes the type-level correspondence between `R` and the bound ports. + +The result is a typed, routable reference that can be shared across `Proc`s. + +## `IntoFuture for ActorHandle` + +### Overview + +An `ActorHandle` can be awaited directly thanks to its `IntoFuture` implementation. 
Awaiting the `handle` waits for the actor to shut down. + +### Purpose + +This allows you to write: +```rust +let status = actor_handle.await; +``` +Instead of: +```rust +let mut status = actor_handle.status(); +status.wait_for(ActorStatus::is_terminal).await; +``` + +### Behavior + +When awaited, the handle: +- Subscribes to the actor’s status channel, +- Waits for a terminal status (`Stopped`, `Crashed`, etc.), +- Returns the final status, +- Returns `ActorStatus::Unknown` if the channel closes unexpectedly. + +### Implementation +```rust +impl IntoFuture for ActorHandle { + type Output = ActorStatus; + type IntoFuture = BoxFuture<'static, Self::Output>; + + fn into_future(self) -> Self::IntoFuture { + let future = async move { + let mut status_receiver = self.cell.status().clone(); + let result = status_receiver.wait_for(ActorStatus::is_terminal).await; + match result { + Err(_) => ActorStatus::Unknown, + Ok(status) => status.passthrough(), + } + }; + future.boxed() + } +} +``` +### Summary + +This feature is primarily ergonomic. It provides a natural way to synchronize with the termination of an actor by simply awaiting its handle. diff --git a/docs/source/books/hyperactor-book/src/actors/actor_lifecycle.md b/docs/source/books/hyperactor-book/src/actors/actor_lifecycle.md new file mode 100644 index 000000000..0746b5d36 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/actor_lifecycle.md @@ -0,0 +1,112 @@ +# Actor Lifecycle Types + +This page documents auxiliary types used in actor startup, shutdown, and supervision logic. + +## `ActorStatus` + +`ActorStatus` describes the current runtime state of an actor. It is used to monitor progress, coordinate shutdown, and detect failure conditions. 
+```rust +pub enum ActorStatus { + Unknown, + Created, + Initializing, + Client, + Idle, + Processing(SystemTime, Option<(String, Option)>), + Saving(SystemTime), + Loading(SystemTime), + Stopping, + Stopped, + Failed(String), +} +``` + +### States +- `Unknown`: The status is unknown (e.g. not yet initialized). +- `Created`: The actor has been constructed but not yet started. +- `Initializing`: The actor is running its init lifecycle hook and is not yet receiving messages. +- `Client`: The actor is operating in “client” mode; its ports are being managed manually. +- `Idle`: The actor is ready to process messages but is currently idle. +- `Processing`: The actor is handling a message. Contains a timestamp and optionally the handler/arm label. +- `Saving`: The actor is saving its state as part of a checkpoint. Includes the time the operation began. +- `Loading`: The actor is loading a previously saved state. +- `Stopping`: The actor is in shutdown mode and draining its mailbox. +- `Stopped`: The actor has exited and will no longer process messages. +- `Failed`: The actor terminated abnormally. Contains an error description. + +### Methods +- `is_terminal(&self) -> bool`: Returns true if the actor has either stopped or failed. +- `is_failed(&self) -> bool`: Returns true if the actor is in the Failed state. +- `passthrough(&self) -> ActorStatus`: Returns a clone of the status. Used internally during joins. +- `span_string(&self) -> &'static str`: Returns the active handler/arm name if available. Used for tracing. + +## `Signal` + +`Signal` is used to control actor lifecycle transitions externally. These messages are sent internally by the runtime (or explicitly by users) to initiate operations like shutdown. +```rust +pub enum Signal { + Stop, + DrainAndStop, + Save, + Load, +} +``` +### Variants +- `Stop`: Immediately halts the actor, even if messages remain in its mailbox. +- `DrainAndStop`: Gracefully stops the actor by first draining all queued messages. 
+- `Save`: Triggers a state snapshot using the actor’s Checkpointable::save method. +- `Load`: Requests state restoration via Checkpointable::load. + +These signals are routed like any other message, typically sent using `ActorHandle::send` or by the runtime during supervision and recovery procedures. + +## `ActorError` + +`ActorError` represents a failure encountered while serving an actor. It includes the actor's identity and the underlying cause. +```rust +pub struct ActorError { + actor_id: ActorId, + kind: ActorErrorKind, +} +``` +This error type is returned in various actor lifecycle operations such as initialization, message handling, checkpointing, and shutdown. It is structured and extensible, allowing the runtime to distinguish between different classes of failure. + +### Associated Methods +```rust +impl ActorError { + /// Constructs a new `ActorError` with the given ID and kind. + pub(crate) fn new(actor_id: ActorId, kind: ActorErrorKind) -> Self + + /// Returns a cloneable version of this error, discarding error structure + /// and retaining only the formatted string. + fn passthrough(&self) -> Self +} +``` + +## `ActorErrorKind` + +```rust +pub enum ActorErrorKind { + Processing(anyhow::Error), + Panic(anyhow::Error), + Init(anyhow::Error), + Mailbox(MailboxError), + MailboxSender(MailboxSenderError), + Checkpoint(CheckpointError), + MessageLog(MessageLogError), + IndeterminateState, + Passthrough(anyhow::Error), +} +``` +### Variants + +- `Processing`: The actor's `handle()` method returned an error. +- `Panic`: A panic occurred during message handling or actor logic. +- `Init`: Actor initialization failed. +- `Mailbox`: A lower-level mailbox error occurred. +- `MailboxSender`: A lower-level sender error occurred. +- `Checkpoint`: Error during save/load of actor state. +- `MessageLog`: Failure in the underlying message log. +- `IndeterminateState`: The actor reached an invalid or unknown internal state. 
+- `Passthrough`: A generic error, preserving only the error message. + +`Passthrough` is used when a structured error needs to be simplified for cloning or propagation across boundaries. diff --git a/docs/source/books/hyperactor-book/src/actors/binds.md b/docs/source/books/hyperactor-book/src/actors/binds.md new file mode 100644 index 000000000..2409308af --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/binds.md @@ -0,0 +1,31 @@ +# Binds + +The `Binds` trait defines how an actor's ports are associated with the message types it can receive remotely. +```rust +pub trait Binds: RemoteActor { + fn bind(ports: &Ports); +} +``` +Implementing `Binds` allows the system to determine which messages can be routed to an actor instance of type `A`. + +## Code Generation + +In most cases, you do not implement this trait manually. Instead, the `#[export]` macro generates the appropriate `Binds` implementation by registering the actor's supported message types. + +For example: +```rust +#[hyperactor::export( + spawn = true, + handlers = [ShoppingList], +)] +struct ShoppingListActor; +``` +Expands to: +```rust +impl Binds for ShoppingListActor { + fn bind(ports: &Ports) { + ports.bind::(); + } +} +``` +This ensures that the actor is correctly wired to handle messages of type `ShoppingList` when used in a remote messaging context. diff --git a/docs/source/books/hyperactor-book/src/actors/checkpointable.md b/docs/source/books/hyperactor-book/src/actors/checkpointable.md new file mode 100644 index 000000000..dcc5ca55b --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/checkpointable.md @@ -0,0 +1,58 @@ +# Checkpointable + +The `Checkpointable` trait enables an actor to define how its internal state can be saved and restored. This allows actors to participate in checkpointing and recovery mechanisms when supported by the surrounding system. 
+ +## Trait definition +```rust +#[async_trait] +pub trait Checkpointable: Send + Sync + Sized { + type State: RemoteMessage; + + async fn save(&self) -> Result; + async fn load(state: Self::State) -> Result; +} +``` + +## Associated Type + +- `type State`: A serializable type representing the object's saved state. This must implement `RemoteMessage` so it can be serialized and transmitted. + +## `save` + +Persists the current state of the component. Returns a `Self::State` value. If the operation fails, returns `CheckpointError::Save`. + +## `load` + +Reconstructs a new instance from a previously saved `Self::State`. If deserialization or reconstruction fails, returns `CheckpointError::Load`. + +## `CheckpointError` + +Errors returned by save and load operations: +```rust +pub enum CheckpointError { + Save(anyhow::Error), + Load(SeqId, anyhow::Error), +} +``` + +## Blanket Implementation + +Any type `T` that implements `RemoteMessage` and `Clone` automatically satisfies `Checkpointable`: +```rust +#[async_trait] +impl Checkpointable for T +where + T: RemoteMessage + Clone, +{ + type State = T; + + async fn save(&self) -> Result { + Ok(self.clone()) + } + + async fn load(state: Self::State) -> Result { + Ok(state) + } +} +``` +This implementation uses `clone()` to produce a checkpoint and simply returns the cloned state in load. diff --git a/docs/source/books/hyperactor-book/src/actors/handler.md b/docs/source/books/hyperactor-book/src/actors/handler.md new file mode 100644 index 000000000..39cd49c52 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/handler.md @@ -0,0 +1,70 @@ +# The `Handler` Trait + +The `Handler` trait defines how an actor receives and responds to messages of a specific type. + +Each message type that an actor can handle must be declared by implementing this trait. The runtime invokes the `handle` method when such a message is delivered. 
+ +```rust +#[async_trait] +pub trait Handler: Actor { + async fn handle(&mut self, cx: &Context, message: M) -> Result<(), anyhow::Error>; +} +``` + +## Message Dispatch: `handle` + +The `handle` method is invoked by the runtime whenever a message of type `M` arrives at a matching port on the actor. +- `message` is the received payload. +- `cx` gives access to the actor's runtime context, including its identity, mailbox, and any capabilities exposed by the `Instance` type (such as spawning or reference resolution). +- The return value indicates whether the message was handled successfully. + +An actor may implement `Handler` multiple times — once for each message type `M` it supports. + +## Built-in Handlers + +The runtime provides implementations of `Handler` for a few internal message types: + +### `Handler` + +This is a marker implementation indicating that all actors can receive `Signal`. The handler is not expected to be invoked directly — its real behavior is implemented inside the runtime. +```rust +#[async_trait] +impl Handler for A { + async fn handle( + &mut self, + _cx: &Context, + _message: Signal, + ) -> Result<(), anyhow::Error> { + unimplemented!("signal handler should not be called directly") + } +} +``` + +### `Handler>` + +```rust +#[async_trait] +impl Handler> for A +where + A: Handler, + M: Castable, +{ + async fn handle( + &mut self, + cx: &Context, + msg: IndexedErasedUnbound, + ) -> anyhow::Result<()> { + let message = msg.downcast()?.bind()?; + Handler::handle(self, this, message).await + } +} +``` +This implementation allows an actor to transparently handle erased, rebound messages of type `M`, provided it already implements `Handler`. + +This construct is used in the implementation of **accumulation**, a communication pattern where a message is multicast to multiple recipients and their replies are gathered—possibly through intermediate actors—before being sent back to the original sender. 
+ +To enable this, messages are unbound at the sender: reply ports (`PortRef`s) are extracted into a `Bindings` object, allowing intermediate nodes to rewrite those ports to point back to themselves. This ensures that replies from downstream actors are routed through the intermediate, enabling reply collection and reduction. + +Once a message reaches its destination, it is rebound by merging the updated bindings back into the message. The `Handler<IndexedErasedUnbound<M>>` implementation automates this by recovering the typed message `M` and dispatching it to the actor's existing `Handler<M>` implementation. + +This allows actors to remain unaware of accumulation mechanics—they can just implement `Handler<M>` as usual. diff --git a/docs/source/books/hyperactor-book/src/actors/index.md b/docs/source/books/hyperactor-book/src/actors/index.md new file mode 100644 index 000000000..676645622 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/index.md @@ -0,0 +1,19 @@ +# Actors + +Hyperactor programs are structured around actors: isolated state machines that process messages asynchronously. + +Each actor runs in isolation and maintains private internal state. Actors interact with the outside world through typed message ports and follow strict lifecycle semantics managed by the runtime. + +This chapter introduces the actor system in hyperactor.
We'll cover: + +- The [`Actor`](./actor.md) trait and its lifecycle hooks +- The [`Handler`](./handler.md) trait for defining message-handling behavior +- The [`RemotableActor`](./remotable_actor.md) trait for enabling remote spawning +- The [`Checkpointable`](./checkpointable.md) trait for supporting actor persistence and recovery +- The [`RemoteActor`](./remote_actor.md) marker trait for remotely referenceable types +- The [`Binds`](./binds.md) trait for wiring exported ports to reference types +- The [`RemoteHandles`](./remote_handles.md) trait for associating message types with a reference +- The [`ActorHandle`](./actor_handle.md) type for referencing and communicating with running actors +- [Actor Lifecycle](./lifecycle.md), including `Signal` and `ActorStatus` + +Actors are instantiated with parameters and bound to mailboxes, enabling reliable message-passing. The runtime builds upon this foundation to support supervision, checkpointing, and remote interaction via typed references. diff --git a/docs/source/books/hyperactor-book/src/actors/remotable_actor.md b/docs/source/books/hyperactor-book/src/actors/remotable_actor.md new file mode 100644 index 000000000..02fde16ea --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/remotable_actor.md @@ -0,0 +1,79 @@ +# The `RemotableActor` Trait + +```rust +pub trait RemotableActor: Actor +where + Self::Params: RemoteMessage, +{ + fn gspawn( + proc: &Proc, + name: &str, + serialized_params: Data, + ) -> Pin<Box<dyn Future<Output = Result<ActorId, anyhow::Error>> + Send>>; + + fn get_type_id() -> TypeId { + TypeId::of::<Self>() + } +} +``` +The `RemotableActor` trait marks an actor type as spawnable across process boundaries. It enables hyperactor's remote spawning and registration system, allowing actors to be created from serialized parameters in a different `Proc`. + +## Requirements +- The actor type must also implement `Actor`. +- Its `Params` type (used in `Actor::new`) must implement `RemoteMessage`, so it can be serialized and transmitted over the network.
+ +## `gspawn` +```rust +fn gspawn( + proc: &Proc, + name: &str, + serialized_params: Data, +) -> Pin<Box<dyn Future<Output = Result<ActorId, anyhow::Error>> + Send>>; +``` +This is the core entry point for remote actor spawning. It takes: +- a target `Proc` where the actor should be created, +- a string name to assign to the actor, +- and a `Data` payload representing serialized parameters. + +The method deserializes the parameters, creates the actor, and returns its `ActorId`. + +This is used internally by hyperactor's remote actor registry and `spawn` services. Ordinary users generally don't call this directly. + +> **Note:** This is not an `async fn` because `RemotableActor` must be object-safe. + +## `get_type_id` + +Returns a stable `TypeId` for the actor type. Used to identify actor types at runtime—e.g., in registration tables or type-based routing logic. + +## Blanket Implementation + +The `RemotableActor` trait is automatically implemented for any actor type `A` that: +- implements `Actor` and `RemoteActor`, +- and whose `Params` type implements `RemoteMessage`. + +This allows `A` to be remotely registered and instantiated from serialized data, typically via the runtime's registration mechanism. + +```rust +impl<A> RemotableActor for A +where + A: Actor + RemoteActor, + A: Binds<A>, + A::Params: RemoteMessage, +{ + fn gspawn( + proc: &Proc, + name: &str, + serialized_params: Data, + ) -> Pin<Box<dyn Future<Output = Result<ActorId, anyhow::Error>> + Send>> { + let proc = proc.clone(); + let name = name.to_string(); + Box::pin(async move { + let handle = proc + .spawn::<A>(&name, bincode::deserialize(&serialized_params)?) + .await?; + Ok(handle.bind::<A>().actor_id) + }) + } +} +``` +Note the `Binds<A>` bound: this trait specifies how an actor's ports are wired, determining which message types the actor can receive remotely. The resulting `ActorId` corresponds to a port-bound, remotely callable version of the actor.
diff --git a/docs/source/books/hyperactor-book/src/actors/remote_actor.md b/docs/source/books/hyperactor-book/src/actors/remote_actor.md new file mode 100644 index 000000000..9b7555b36 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/remote_actor.md @@ -0,0 +1,10 @@ +# RemoteActor + +```rust +pub trait RemoteActor: Named + Send + Sync {} +``` +This is a marker trait indicating that a type is eligible to serve as a reference to a remote actor (i.e., an actor that may reside on a different proc). + +It requires: +- `Named`: the type must provide a static name. +- `Send + Sync`: the type must be safely transferable and shareable across threads. diff --git a/docs/source/books/hyperactor-book/src/actors/remote_handles.md b/docs/source/books/hyperactor-book/src/actors/remote_handles.md new file mode 100644 index 000000000..a141fea9b --- /dev/null +++ b/docs/source/books/hyperactor-book/src/actors/remote_handles.md @@ -0,0 +1,14 @@ +# RemoteHandles + +The `RemoteHandles` trait is a marker used to declare that a given `RemoteActor` type can handle messages of type `M`. +```rust +pub trait RemoteHandles: RemoteActor {} +``` + +An implementation like: +```rust +impl RemoteHandles for ShoppingListActor {} +``` +means that `ShoppingListActor` is known to handle the `ShoppingList` message type. + +These implementations are typically generated by the `#[export(handlers = [...])]` macro, and are not written by hand. diff --git a/docs/source/books/hyperactor-book/src/introduction.md b/docs/source/books/hyperactor-book/src/introduction.md new file mode 100644 index 000000000..0730903f3 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/introduction.md @@ -0,0 +1,19 @@ +# Hyperactor + +This book describes the design and implementation of the hyperactor runtime. + +The goal is to provide a clear, structured explanation of how actors communicate safely and efficiently across distributed systems using hyperactor’s abstractions. 
+ +We hope this becomes the book we wish we had when we started working with Monarch. Work in progress. + +```{toctree} +:maxdepth: 2 +:caption: Chapters + +./actors/index +./macros/index +./mailboxes/index +./references/index +./SUMMARY.md + +``` diff --git a/docs/source/books/hyperactor-book/src/macros/export.md b/docs/source/books/hyperactor-book/src/macros/export.md new file mode 100644 index 000000000..04c1abb65 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/export.md @@ -0,0 +1,70 @@ +# `#[export]` + +The `#[hyperactor::export]` macro turns a regular `Actor` implementation into a remotely spawnable actor, registering its type information, `spawn` function, and supported message handlers for discovery and use across processes or runtimes. + +## What It Adds + +When applied to an actor type like this: + +```rust +#[hyperactor::export( + spawn = true, + handlers = [ShoppingList], +)] +struct ShoppingListActor(HashSet); +``` +The macro expands to include: + - A `Named` implementation for the actor + - A `Binds` implementation that registers supported message types + - Implementations of `RemoteHandles` for each type in the `handlers = [...]` list + - A `RemoteActor` marker implementation + - If `spawn = true`, a `RemotableActor` implementation and an inventory registration of the `spawn` function. 
+ +This enables the actor to be: + - Spawned dynamically by name + - Routed to via typed messages + - Reflected on at runtime (for diagnostics, tools, and orchestration) + +## Generated Implementations (simplified) +```rust +impl RemoteActor for ShoppingListActor {} + +impl RemoteHandles for ShoppingListActor {} +impl RemoteHandles for ShoppingListActor {} + +impl Binds for ShoppingListActor { + fn bind(ports: &Ports) { + ports.bind::(); + } +} + +impl Named for ShoppingListActor { + fn typename() -> &'static str { + "my_crate::ShoppingListActor" + } +} +``` +If `spawn = true`, the macro also emits: +```rust +impl RemotableActor for ShoppingListActor {} +``` +This enables remote spawning via the default `gspawn` provided by a blanket implementation. + +It also registers the actor into inventory: +``` +inventory::submit!(SpawnableActor { + name: ..., + gspawn: ..., + get_type_id: ..., +}); +``` +This allows the actor to be discovered and spawned by name at runtime. + +## Summary + +The `#[export]` macro makes an actor remotely visible, spawnable, and routable by declaring: + - What messages it handles + - What messages it handles + - How to bind those messages + - What its globally unique name is + - (Optionally) how to spawn it dynamically diff --git a/docs/source/books/hyperactor-book/src/macros/forward.md b/docs/source/books/hyperactor-book/src/macros/forward.md new file mode 100644 index 000000000..a342fc3a9 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/forward.md @@ -0,0 +1,28 @@ +# `#[forward]` + +The `#[hyperactor::forward]` macro connects a user-defined handler trait implementation (like `ShoppingListHandler`) to the core `Handler` trait required by the runtime. + +In short, it generates the boilerplate needed to route incoming messages of type `T` to your high-level trait implementation. 
+ +## What it generates + +The macro expands to: +```rust +#[async_trait] +impl Handler<ShoppingList> for ShoppingListActor { + async fn handle(&mut self, ctx: &Context, message: ShoppingList) -> Result<(), Error> { + <Self as ShoppingListHandler>::handle(self, ctx, message).await + } +} +``` +This avoids having to manually match on enum variants or duplicate message logic. + +## When to use it + +Use `#[forward(MessageType)]` when: + +- You’ve defined a custom trait (e.g., `ShoppingListHandler`) +- You’re handling a message enum (like `ShoppingList`) +- You want the runtime to route messages to your trait automatically. + +This is most often used alongside `#[derive(Handler)]`, which generates the corresponding handler and client traits for a user-defined message enum. diff --git a/docs/source/books/hyperactor-book/src/macros/handle_client.md b/docs/source/books/hyperactor-book/src/macros/handle_client.md new file mode 100644 index 000000000..3070b317f --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/handle_client.md @@ -0,0 +1,65 @@ +# `#[derive(HandleClient)]` + +`#[derive(Handler)]` generates both the server-side handler trait (`ShoppingListHandler`) and the client-side trait definition (`ShoppingListClient`). However, it does not implement the client trait for any specific type. + +This is where `#[derive(HandleClient)]` comes in. + +## What It Adds + +`#[derive(HandleClient)]` generates the following implementation: + +```rust +impl<T> ShoppingListClient for ActorHandle<T> +where + T: ShoppingListHandler + Send + Sync + 'static +``` + +This means you can call methods like `.add(...)` or `.list(...)` directly on an `ActorHandle<T>` without needing to manually implement the `ShoppingListClient` trait. + +In other words, `HandleClient` connects the generated `ShoppingListClient` interface (from `Handler`) to the concrete type `ActorHandle<T>`.
+ +## Generated Implementation (simplified) +```rust +use async_trait::async_trait; +use hyperactor::{ + ActorHandle, + anyhow::Error, + cap::{CanSend, CanOpenPort}, + mailbox::open_once_port, + metrics, + Message, +}; + +#[async_trait] +impl<T> ShoppingListClient for ActorHandle<T> +where + T: ShoppingListHandler + Send + Sync + 'static, +{ + async fn add(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { + self.send(caps, ShoppingList::Add(item)).await + } + + async fn remove(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { + self.send(caps, ShoppingList::Remove(item)).await + } + + async fn exists( + &self, + caps: &impl CanSend + CanOpenPort, + item: String, + ) -> Result<bool, Error> { + let (reply_to, recv) = open_once_port(caps)?; + self.send(caps, ShoppingList::Exists(item, reply_to)).await?; + Ok(recv.await?) + } + + async fn list( + &self, + caps: &impl CanSend + CanOpenPort, + ) -> Result<Vec<String>, Error> { + let (reply_to, recv) = open_once_port(caps)?; + self.send(caps, ShoppingList::List(reply_to)).await?; + Ok(recv.await?) + } +} +``` diff --git a/docs/source/books/hyperactor-book/src/macros/handler.md b/docs/source/books/hyperactor-book/src/macros/handler.md new file mode 100644 index 000000000..300130cef --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/handler.md @@ -0,0 +1,96 @@ +# `#[derive(Handler)]` + +The `#[derive(Handler)]` macro generates the infrastructure for sending and receiving typed messages in hyperactor. When applied to an enum like this: +```rust +#[derive(Handler)] +enum ShoppingList { + // Fire-and-forget messages + Add(String), + Remove(String), + + // Request-response messages + Exists(String, #[reply] OncePortRef<bool>), + List(#[reply] OncePortRef<Vec<String>>), +} +``` +... it generates **two key things**: + +### 1.
`ShoppingListHandler` trait +This trait defines a method for each variant, and a `handle` method to route incoming messages: +```rust +use async_trait::async_trait; +use hyperactor::anyhow::Error; + +#[async_trait] +pub trait ShoppingListHandler: hyperactor::Actor + Send + Sync { + async fn add(&mut self, ctx: &Context, item: String) -> Result<(), Error>; + async fn remove(&mut self, ctx: &Context, item: String) -> Result<(), Error>; + async fn exists(&mut self, ctx: &Context, item: String) -> Result; + async fn list(&mut self, ctx: &Context) -> Result, Error>; + + async fn handle(&mut self, ctx: &Context, msg: ShoppingList) -> Result<(), Error> { + match msg { + ShoppingList::Add(item) => { + self.add(ctx, item).await + } + ShoppingList::Remove(item) => { + self.remove(ctx, item).await + } + ShoppingList::Exists(item, reply_to) => { + let result = self.exists(ctx, item).await?; + reply_to.send(ctx, result)?; + Ok(()) + } + ShoppingList::List(reply_to) => { + let result = self.list(ctx).await?; + reply_to.send(ctx, result)?; + Ok(()) + } + } + } +} +``` +Note: + - `Add` and `Remove` are **oneway**: no reply port + - `Exists` and `List` are **call-style**: they take a `#[reply] OncePortRef` and expect a response to be sent back. + +### 2. `ShoppingListClient` trait + +Alongside the handler, the `#[derive(Handler)]` macro also generates a client-side trait named `ShoppingListClient`. This trait provides a convenient and type-safe interface for sending messages to an actor. + +Each method in the trait corresponds to a variant of the message enum. 
For example: +```rust +use async_trait::async_trait; +use hyperactor::anyhow::Error; +use hyperactor::cap::{CanSend, CanOpenPort}; + +#[async_trait] +pub trait ShoppingListClient: Send + Sync { + async fn add(&self, caps: &impl CanSend, item: String) -> Result<(), Error>; + async fn remove(&self, caps: &impl CanSend, item: String) -> Result<(), Error>; + async fn exists(&self, caps: &impl CanSend + CanOpenPort, item: String) -> Result; + async fn list(&self, caps: &impl CanSend + CanOpenPort) -> Result, Error>; +} +``` + +#### Capability Parameter +Each method takes a caps argument that provides the runtime capabilities required to send the message: +- All methods require `CanSend`. +- Methods with `#[reply]` arguments additionally require `CanOpenPort`. + +In typical usage, `caps` is a `Mailbox`. + +#### Example Usage +```rust +let mut proc = Proc::local(); +let actor = proc.spawn::("shopping", ()).await?; +let client = proc.attach("client").unwrap(); + +// Fire-and-forget +actor.add(&client, "milk".into()).await?; + +// With reply +let found = actor.exists(&client, "milk".into()).await?; +println!("got milk? {found}"); +``` +Here, actor is an `ActorHandle` that implements `ShoppingListClient`, and `client` is a `Mailbox` that provides the necessary capabilities. diff --git a/docs/source/books/hyperactor-book/src/macros/index.md b/docs/source/books/hyperactor-book/src/macros/index.md new file mode 100644 index 000000000..44a1479e9 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/index.md @@ -0,0 +1,32 @@ +# Macros + +This section documents the macros provided by hyperactor for actor and message integration. + +These macros support a complete message-passing workflow: from defining message enums and generating client APIs, to routing messages and exporting actors for dynamic or remote use. 
+ +- [`#[derive(Handler)]`](handler.md) — generate message handling and client traits for actor enums +- [`#[derive(HandleClient)]`](handle_client.md) — implement the generated client trait for `ActorHandle` +- [`#[derive(RefClient)]`](ref_client.md) — implement the generated client trait for `ActorRef` +- [`#[derive(Named)]`](named.md) — give a type a globally unique name and port for routing and reflection +- [`#[export]`](export.md) — make an actor remotely spawnable and routable by registering its type and handlers, and optionally making it spawnable from outside the current runtime +- [`#[forward]`](forward.md) — route messages to a user-defined handler trait implementation + +## Macro Summary + +- **`#[derive(Handler)]`** + Generates handler and client traits for a message enum. + +- **`#[derive(HandleClient)]`** + Implements the client trait for `ActorHandle`. + +- **`#[derive(RefClient)]`** + Implements the client trait for `ActorRef`. + +- **`#[derive(Named)]`** + Registers the type with a globally unique name and port. + +- **`#[export]`** + Makes an actor spawnable and routable via inventory. + +- **`#[forward]`** + Forwards messages to a user-defined handler trait implementation. diff --git a/docs/source/books/hyperactor-book/src/macros/named.md b/docs/source/books/hyperactor-book/src/macros/named.md new file mode 100644 index 000000000..7ca9f30fe --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/named.md @@ -0,0 +1,77 @@ +# `#[derive(Named)]` + +The `#[derive(Named)]` macro implements the `hyperactor::Named` trait for a type, making it identifiable at runtime through a globally unique string and stable hash. + +## The `Named` trait + +The `hyperactor::data::Named` trait is the foundation of type identification in hyperactor. It gives each type a globally unique identity, derived from its name, that is used in routing. +```rust +pub trait Named: Sized + 'static { + fn typename() -> &'static str; + fn typehash() -> u64 { ... } + fn typeid() -> TypeId { ...
} + fn port() -> u64 { ... } + fn arm(&self) -> Option<&'static str> { ... } + unsafe fn arm_unchecked(self_: *const ()) -> Option<&'static str> { ... } +} +``` + +### Trait Methods + +#### `typename() -> &'static str` + +Returns the globally unique, fully-qualified type name for the type. This should typically look like: +```rust +"foo::bar::Corge" +``` + +#### `typehash() -> u64` + +Returns a stable hash derived from `typename()`. This value is used for message port derivation. +```rust +cityhasher::hash(Self::typename()) +``` + +#### `typeid() -> TypeId` + +Returns the Rust `TypeId` for the type (which is only unique within a single binary). + +#### `port() -> u64` + +Returns a globally unique port number for the type: +```rust +Self::typehash() | (1 << 63) +``` +Typed ports are reserved in the range 2^63 .. 2^64 - 1. + +#### `arm(&self) -> Option<&'static str>` + +For enum types, this returns the name of the current variant, e.g., "Add" or "Remove". + +#### `unsafe fn arm_unchecked(self_: *const ()) -> Option<&'static str>` + +The type-erased version of `arm()`. Casts `self_` back to `&Self` and calls `arm()`. + +Useful for dynamic reflection when the concrete type isn’t statically known. + +### Runtime Registration + +In addition to implementing the `Named` trait, the macro registers the type’s metadata at startup using the `inventory` crate: +```rust +const _: () = { + static __INVENTORY: ::inventory::Node = ::inventory::Node { + value: &TypeInfo { ... }, + ... + }; + // Registers the type info before main() runs + #[link_section = ".init_array"] + static __CTOR: unsafe extern "C" fn() = __ctor; +}; +``` +This allows the type to be discovered at runtime, enabling: +- Message dispatch from erased or serialized inputs +- Introspection and diagnostics +- Dynamic spawning or reflection +- Tooling support + +Types registered this way appear in the global `inventory::iter` set, which is how the hyperactor runtime locates known message types.
diff --git a/docs/source/books/hyperactor-book/src/macros/ref_client.md b/docs/source/books/hyperactor-book/src/macros/ref_client.md new file mode 100644 index 000000000..7ccf746b4 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/macros/ref_client.md @@ -0,0 +1,62 @@ +# `#[derive(RefClient)]` + +While `#[derive(HandleClient)]` enables calling the generated client trait on `ActorHandle`, there are cases where you don’t have a handle, only a reference to an actor (`ActorRef`). This is where `#[derive(RefClient)]` comes in. + +## What It Adds + +`#[derive(RefClient)]` generates the following implementation: +```rust +impl ShoppingListClient for ActorRef +where + T: ShoppingListHandler + Send + Sync + 'static +``` +This allows you to invoke methods like `.add(...)` or `.list(...)` directly on an `ActorRef`. + +In other words, `RefClient` connects the generated `ShoppingListClient` interface (from `Handler`) to the `ActorRef` type, which refers to a remote actor. + +## Generated Implementation (simplified) + +```rust +use async_trait::async_trait; +use hyperactor::{ + ActorRef, + anyhow::Error, + cap::{CanSend, CanOpenPort}, + mailbox::open_once_port, + metrics, + Message, +}; + +#[async_trait] +impl ShoppingListClient for ActorRef +where + T: ShoppingListHandler + Send + Sync + 'static, +{ + async fn add(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { + self.send(caps, ShoppingList::Add(item)).await + } + + async fn remove(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { + self.send(caps, ShoppingList::Remove(item)).await + } + + async fn exists( + &self, + caps: &impl CanSend + CanOpenPort, + item: String, + ) -> Result { + let (reply_to, recv) = open_once_port(caps)?; + self.send(caps, ShoppingList::Exists(item, reply_to)).await?; + Ok(recv.await?) 
+ } + + async fn list( + &self, + caps: &impl CanSend + CanOpenPort, + ) -> Result, Error> { + let (reply_to, recv) = open_once_port(caps)?; + self.send(caps, ShoppingList::List(reply_to)).await?; + Ok(recv.await?) + } +} +``` diff --git a/docs/source/books/hyperactor-book/src/mailboxes/delivery.md b/docs/source/books/hyperactor-book/src/mailboxes/delivery.md new file mode 100644 index 000000000..eef1117ee --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/delivery.md @@ -0,0 +1,226 @@ +# Delivery Semantics + +This section defines the mechanics of message delivery and failure in the mailbox system. + +Key components: + +- `MessageEnvelope`: encapsulates a message, sender, and destination +- `DeliveryError`: enumerates failure modes (unroutable, broken link, etc.) +- Undeliverable handling: how messages are returned on failure +- Serialization and deserialization support + +These types form the foundation for how messages are transmitted, routed, and failed in a structured way. + +An envelope carries a message destined to a remote actor. The envelope contains a serialized message along with its destination and sender: +```rust +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Named)] +pub struct MessageEnvelope { + /// The sender of this message. + sender: ActorId, + + /// The destination of the message. + dest: PortId, + + /// The serialized message. + data: Serialized, + + /// Error contains a delivery error when message delivery failed. + error: Option, + + /// Additional context for this message. + headers: Attrs, +} +``` + +`MessageEnvelope::new` creates a message envelope: +```rust +impl MessageEnvelope { + fn new(sender: ActorId, dest: PortId, data: Serialized, headers: Attrs) -> Self { ... 
} +} +``` +`MessageEnvelope::new_unknown` creates a new envelope when we don't know who the sender is: +```rust +impl MessageEnvelope { + fn new_unknown(dest: PortId, data: Serialized) -> Self { + Self::new(id!(unknown[0].unknown), dest, data) + } +} +``` +If a type `T` implements `Serialize` and `Named`, an envelope can be constructed while serializing the message data: +```rust +impl MessageEnvelope { + fn serialize( + source: ActorId, dest: PortId, value: &T, headers: Attrs) -> Result { + Ok(Self { + data: Serialized::serialize(value)?, + sender: source, + dest, + error: None, + }) + } +} +``` +We can use the fact that `T` implements `DeserializeOwned` to provide a function to deserialize the message data in an envelope: +```rust +impl MessageEnvelope { + fn deserialized(&self) -> Result { + self.data.deserialized() + } +} +``` +This function stamps an envelope with a delivery error: +```rust +impl MessageEnvelope { + fn error(&mut self, error: DeliveryError) { + self.error = Some(error); + } +} +``` +The `undeliverable` function on a `MessageEnvelope` can be called when a message has been determined to be undeliverable due to the provided error. It marks the envelope with the error and attempts to return it to the sender. +```rust +impl MessageEnvelope { + pub fn undeliverable( + mut self, + error: DeliveryError, + return_handle: PortHandle>, + ) { + self.try_set_error(error); + undeliverable::return_undeliverable(return_handle, self); + } +} +``` + +### Delivery errors + +Delivery errors can occur during message posting: +```rust +#[derive(thiserror::Error, ...)] +pub enum DeliveryError { + /// The destination address is not reachable. + #[error("address not routable: {0}")] + Unroutable(String), + + /// A broken link indicates that a link in the message + /// delivery path has failed. + #[error("broken link: {0}")] + BrokenLink(String), + + /// A (local) mailbox delivery error. 
+ #[error("mailbox error: {0}")] + Mailbox(String), +} +``` + +### Mailbox Errors + +Errors can occur during mailbox operations. Each error is associated with the mailbox's actor ID: +```rust +pub struct MailboxError { + actor_id: ActorId, + kind: MailboxErrorKind, +} + +#[non_exhaustive] +pub enum MailboxErrorKind { + /// An operation was attempted on a closed mailbox. + #[error("mailbox closed")] + Closed, + + /// The port associated with an operation was invalid. + #[error("invalid port: {0}")] + InvalidPort(PortId), + + /// There was no sender associated with the port. + #[error("no sender for port: {0}")] + NoSenderForPort(PortId), + + /// There was no local sender associated with the port. + /// Returned by operations that require a local port. + #[error("no local sender for port: {0}")] + NoLocalSenderForPort(PortId), + + /// The port was closed. + #[error("{0}: port closed")] + PortClosed(PortId), + + /// An error occurred during a send operation. + #[error("send {0}: {1}")] + Send(PortId, #[source] anyhow::Error), + + /// An error occurred during a receive operation. + #[error("recv {0}: {1}")] + Recv(PortId, #[source] anyhow::Error), + + /// There was a serialization failure. + #[error("serialize: {0}")] + Serialize(#[source] anyhow::Error), + + /// There was a deserialization failure. + #[error("deserialize {0}: {1}")] + Deserialize(&'static str, anyhow::Error), + + #[error(transparent)] + Channel(#[from] ChannelError), +} +``` + +`PortLocation` describes the location of a port. It provides a uniform data type for ports that may or may not be bound. +```rust +#[derive(Debug, Clone)] +pub enum PortLocation { + /// The port was bound: the location is its underlying bound ID. + Bound(PortId), + /// The port was not bound: we provide the actor ID and the message type. + Unbound(ActorId, &'static str), +} +``` + +One place `PortLocation` is used is in the type `MailboxSenderError`, which is specifically for errors that occur during mailbox send operations.
Each error is associated with the port ID of the operation: +```rust +#[derive(Debug)] +pub struct MailboxSenderError { + location: PortLocation, + kind: MailboxSenderErrorKind, +} + +/// The kind of mailbox sending errors. +#[derive(thiserror::Error, Debug)] +pub enum MailboxSenderErrorKind { + /// Error during serialization. + #[error("serialization error: {0}")] + Serialize(anyhow::Error), + + /// Error during deserialization. + #[error("deserialization error for type {0}: {1}")] + Deserialize(&'static str, anyhow::Error), + + /// A send to an invalid port. + #[error("invalid port")] + Invalid, + + /// A send to a closed port. + #[error("port closed")] + Closed, + + // The following pass through underlying errors: + /// An underlying mailbox error. + #[error(transparent)] + Mailbox(#[from] MailboxError), + + /// An underlying channel error. + #[error(transparent)] + Channel(#[from] ChannelError), + + /// An underlying message log error. + #[error(transparent)] + MessageLog(#[from] MessageLogError), + + /// An other, uncategorized error. + #[error("send error: {0}")] + Other(#[from] anyhow::Error), + + /// The destination was unreachable. + #[error("unreachable: {0}")] + Unreachable(anyhow::Error), +} +``` diff --git a/docs/source/books/hyperactor-book/src/mailboxes/index.md b/docs/source/books/hyperactor-book/src/mailboxes/index.md new file mode 100644 index 000000000..8897f6c60 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/index.md @@ -0,0 +1,15 @@ +# Mailboxes and Routers + +Mailboxes are the foundation of message delivery in hyperactor. They coordinate typed ports, routing logic, forwarding, and delivery infrastructure for distributed actors. 
+ +This chapter introduces the components of the mailbox subsystem: + +- [Ports](ports.md): typed channels for local message delivery +- [MailboxSender](mailbox_sender.md): trait-based abstraction for message posting +- [Reconfigurable Senders](reconfigurable_sender.md): deferred wiring and dynamic configuration +- [MailboxServer](mailbox_server.md): bridging incoming message streams into mailboxes +- [MailboxClient](mailbox_client.md): buffering, forwarding, and failure reporting +- [Mailbox](mailbox.md): port registration, binding, and routing +- [Delivery Semantics](delivery.md): envelopes, delivery errors, and failure handling +- [Multiplexers](multiplexer.md): port-level dispatch to local mailboxes +- [Routers](routers.md): prefix-based routing to local or remote destinations diff --git a/docs/source/books/hyperactor-book/src/mailboxes/mailbox.md b/docs/source/books/hyperactor-book/src/mailboxes/mailbox.md new file mode 100644 index 000000000..9f0f07410 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/mailbox.md @@ -0,0 +1,354 @@ +# Mailbox + +A `Mailbox` represents an actor's in-process inbox. It owns and manages all of the actor's ports, provides APIs to open and bind them, and routes messages based on their destination `PortId`. + +A mailbox routes local messages directly to its ports. If a message is addressed to a different actor, the mailbox uses its configured forwarder to relay the message. If the message cannot be delivered—for instance, if the destination port is unbound—the mailbox wraps it as undeliverable and returns it via the supplied handle. + +This section covers: + +- Opening ports of various kinds +- Port binding and registration +- Internal mailbox state and delivery logic + +The `State` holds all delivery infrastructure: active ports, the actor's ID, a port allocator, and a forwarding mechanism.
Multiple clones of a `Mailbox` share access to the same state: +```rust +pub struct Mailbox { + state: Arc<State>, +} +``` +The `new` function creates a mailbox with the provided actor ID and forwarder for external destinations: +```rust +impl Mailbox { + pub fn new(actor_id: ActorId, forwarder: BoxedMailboxSender) -> Self { + Self { + state: Arc::new(State::new(actor_id, forwarder)), + } + } +} +``` +`new_detached` mailboxes are not connected to an external message forwarder and can only deliver to their own ports: +```rust +impl Mailbox { + pub fn new_detached(actor_id: ActorId) -> Self { + Self { + state: Arc::new(State::new(actor_id, BOXED_PANICKING_MAILBOX_SENDER.clone())), + } + } +``` + +A mailbox can open ports, each identified by a unique `PortId` within the owning actor. The most common form is `open_port`, which creates a fresh, unbounded channel for message delivery: +```rust +impl Mailbox { + pub fn open_port(&self) -> (PortHandle, PortReceiver) { + let port_index = self.state.allocate_port(); + let (sender, receiver) = mpsc::unbounded_channel::(); + let port_id = PortId(self.state.actor_id.clone(), port_index); + ( + PortHandle::new(self.clone(), port_index, UnboundedPortSender::Mpsc(sender)), + PortReceiver::new( + receiver, + port_id, + /*coalesce=*/ false, + self.state.clone(), + ), + ) + } +} +``` +This allocates a new port index and sets up a pair of endpoints: a `PortHandle` for sending messages into the port, and a `PortReceiver` for asynchronously consuming them. Internally, these are two ends of an `mpsc::unbounded_channel`, meaning messages are buffered in memory and processed in order without backpressure.
+ +In contrast to `open_port`, which uses a channel-backed buffer, `open_enqueue_port` constructs a port backed directly by a user-supplied enqueue function: +```rust +impl Mailbox { + pub(crate) fn open_enqueue_port( + &self, + enqueue: impl Fn(M) -> Result<(), anyhow::Error> + Send + Sync + 'static, + ) -> PortHandle { + PortHandle { + mailbox: self.clone(), + port_index: self.state.allocate_port(), + sender: UnboundedPortSender::Func(Arc::new(enqueue)), + bound: Arc::new(OnceLock::new()), + reducer_typehash: None, + } + } +} +``` +Unlike `open_port`, which yields both sender and receiver ends of an internal channel, `open_enqueue_port` returns only a `PortHandle`. Instead of buffering messages, this port invokes a user-supplied function directly on each message it receives. + +Another variant, `open_accum_port`, builds on the same principle as `open_enqueue_port`, but pairs the port with an accumulator that maintains state across messages. We'll return to this specialized port type later. We'll also encounter `open_once_port`, which is analogous to `open_port` but sets up a one-shot message channel—useful for rendezvous-style communication—using the associated `OncePortHandle` and `OncePort` types. + +## Port Binding + +Binding is only required when a port must be referred to externally—for example, when sending it across the network or including it in a message. Binding a port produces a `PortRef`, which globally names the port and requires that the associated message type implements `RemoteMessage` (i.e., is serializable). All messages sent via a `PortRef` are serialized. + +By contrast, `PortHandle` can be used locally to send any type implementing `Message`, including non-serializable types, and behaves like a typed in-memory queue. + +Once a port is opened with `open_port`, it must be bound before it can receive messages routed through the mailbox.
Binding installs the port into the mailbox's internal routing table and produces a `PortRef`-a lightweight, serializable reference that remote actors can use to send messages to the port. + +Port binding is performed by calling `.bind()` on a `PortHandle`: +```rust +impl PortHandle { + pub fn bind(&self) -> PortRef { + PortRef::attest_reducible( + self.bound + .get_or_init(|| self.mailbox.bind(self).port_id().clone()) + .clone(), + self.reducer_typehash.clone(), + ) + } +} +``` +This delegates to `Mailbox::bind(&self, handle)`, which performs the actual installation into the mailbox's internal `State`. If the port is already bound, this is a no-op. + +The mailbox checks that the port handle belongs to it, computes the `PortId`, and then inserts the sender into the internal ports map if it hasn't been bound already: +```rust +impl Mailbox { + fn bind(&self, handle: &PortHandle) -> PortRef { + assert_eq!( + handle.mailbox.actor_id(), + self.actor_id(), + "port does not belong to mailbox" + ); + let port_id = self.actor_id().port_id(handle.port_index); + match self.state.ports.entry(handle.port_index) { + Entry::Vacant(entry) => { + entry.insert(Box::new(UnboundedSender::new( + handle.sender.clone(), + port_id.clone(), + ))); + } + Entry::Occupied(_entry) => {} + } + + PortRef::attest(port_id) + } +} +``` +The result is a `PortRef` that can be sent across the network to deliver messages to this bound port. 
+ +## Binding to a Specific Index + +There is also a lower-level variant, `bind_to`, used internally by actor binding mechanisms (e.g., when installing well-known ports at known indices): +```rust +impl Mailbox { + fn bind_to(&self, handle: &PortHandle, port_index: u64) { + assert_eq!( + handle.mailbox.actor_id(), + self.actor_id(), + "port does not belong to mailbox" + ); + + let port_id = self.actor_id().port_id(port_index); + match self.state.ports.entry(port_index) { + Entry::Vacant(entry) => { + entry.insert(Box::new(UnboundedSender::new( + handle.sender.clone(), + port_id, + ))); + } + Entry::Occupied(_) => panic!("port {} already bound", port_id), + } + } +} +``` + +## Message Delivery via MailboxSender + +The mailbox also handles message delivery. It does this by implementing the `MailboxSender` trait, which defines how messages—wrapped in `MessageEnvelope`—are routed, deserialized, and delivered to bound ports or forwarded to remote destinations. +```rust +impl MailboxSender for Mailbox { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + if envelope.dest().actor_id() != &self.state.actor_id { + return self.state.forwarder.post(envelope, return_handle); + } + + match self.state.ports.entry(envelope.dest().index()) { + Entry::Vacant(_) => envelope.undeliverable( + DeliveryError::Unroutable("port not bound in mailbox".to_string()), + return_handle, + ), + Entry::Occupied(entry) => { + let (metadata, data) = envelope.open(); + let MessageMetadata {headers, sender, dest, error: metadata_error } = metadata; + match entry.get().send_serialized(headers, data) { + Ok(false) => { + entry.remove(); + } + Ok(true) => (), + Err(SerializedSenderError { + data, + error, + headers, + }) => MessageEnvelope::seal( + MessageMetadata { headers, sender, dest, error: metadata_error }, + data, + ) + .undeliverable(DeliveryError::Mailbox(format!("{}", error)), return_handle), + } + } + } + } +} +``` + +### Breakdown of Delivery Logic + +This
implementation of `MailboxSender::post` defines how a mailbox handles message delivery: +1. Actor ID routing +```rust +if envelope.dest().actor_id() != &self.state.actor_id +``` +If the message is not addressed to this actor, it's forwarded using the forwarder defined in the mailbox's state. This allows for transparent routing across process or network boundaries. + +2. Port Lookup and Binding Check +```rust +match self.state.ports.entry(envelope.dest().index()) +``` +The mailbox uses the destination `PortId` to locate the bound port in its internal routing table. If the port hasn't been bound, the message is returned to the sender as undeliverable. + +3. Deserialization and Delivery Attempt +```rust +match entry.get().send_serialized(headers, data) +``` +If the port is found, the message is unsealed and passed to the corresponding `SerializedSender` (e.g., the `UnboundedSender` inserted during binding). This may succeed or fail: + - `Ok(true)`: Message was delivered. + - `Ok(false)`: Port is closed; remove it from the routing table. + - `Err(...)`: Deserialization failed or other error; wrap the message and return it to the sender as undeliverable. + +### Relationship to Bound Ports + +Only ports that have been bound via `PortHandle::bind()` appear in the ports map and are eligible to receive messages via this `post` path. The entry in this map is a type-erased boxed `SerializedSender`, which, when invoked, attempts to deserialize the raw message payload into the expected concrete type and forward it to the associated `PortReceiver` or handler. + +The mailbox's routing and delivery logic ultimately relies on the internal `State`, which stores port mappings, forwarding configuration, and allocation state. 
+ +## State +Each `Mailbox` instance wraps an internal `State` struct that contains all shared delivery infrastructure: +```rust +struct State { + actor_id: ActorId, + ports: DashMap>, + next_port: AtomicU64, + forwarder: BoxedMailboxSender, +} +``` +This structure is reference-counted via `Arc` and is cloned across all components that need access to the mailbox's internal state. Each field plays a central role: +- **`actor_id`**: Identifies the actor that owns this mailbox. All ports in the mailbox are scoped under this actor ID and used to construct `PortId`s during binding and routing. +- **`ports`**: A concurrent map from port indices (`u64`) to type-erased `SerializedSenders`. Each entry corresponds to a bound port and provides the ability to deserialize and deliver raw messages to the correct `PortReceiver`. Only serializable ports are registered here. +- **`next_port`**: Tracks the next available user port index. Actor-assigned ports occupy indices 0..1024, and user-allocated ports begin from a constant offset (`USER_PORT_OFFSET`). +- **`forwarder`**: A boxed `MailboxSender` used for forwarding messages to other actors. If a message's destination is not owned by this mailbox, it will be passed to this sender. + +### State: Internal Structure of a Mailbox + +The `State` struct holds all the internal data needed for a functioning `Mailbox`. It's not exposed directly—rather, it's wrapped in `Arc` and shared between `Mailbox`, `PortHandle`, and `PortReceiver`: +```rust +impl State { + fn new(actor_id: ActorId, forwarder: BoxedMailboxSender) -> Self { + Self { + actor_id, + ports: DashMap::new(), + next_port: AtomicU64::new(USER_PORT_OFFSET), + forwarder, + } + } + + fn allocate_port(&self) -> u64 { + self.next_port.fetch_add(1, Ordering::SeqCst) + } +} +``` +**Notes**: +- The `actor_id` allows every `Mailbox` to know which actor it belongs to, which is essential for routing decisions (`post` checks this). 
+- The ports field holds the routing table: it maps each port index to a type-erased sink (`SerializedSender`) capable of deserializing and dispatching messages to the right receiver. +- `next_port` enables safe concurrent dynamic port allocation by atomically assigning unique port indices. +- The forwarder is used to send messages not destined for this actor-e.g., remote delivery. + +## Sending and Receiving Messages + +There are two distinct pathways by which a message can arrive at a `PortReceiver`. Both ultimately push a message into an `mpsc` channel (or functionally equivalent handler), but they differ in intent and routing mechanism. + +### Local Sends via PortHandle + +When you call `.send(msg)` on a `PortHandle`, the message bypasses the `Mailbox` entirely and goes directly into the associated channel: +```text +PortHandle::send(msg) +→ UnboundedPortSender::send(Attrs::new(), msg) +→ underlying channel (mpsc::UnboundedSender) +→ PortReceiver::recv().await +``` + +### Routed Sends via Mailbox + +When a message is wrapped in a `MessageEnvelope` and posted via `Mailbox::post`, routing logic takes over: +```text +Mailbox::post(envelope, return_handle) +→ lookup State::ports[port_index] +→ SerializedSender::send_serialized(headers, bytes) +→ UnboundedSender::send(headers, M) // after deserialization +→ mpsc channel +→ PortReceiver::recv().await +``` +This is the delivery path for remote messages or any message routed by a `PortRef`. A `PortHandle` must first be **bound** to participate in this. + +## Capabilities + +Capabilities are lightweight traits that control access to mailbox-related operations. They act as permissions: a type that implements a capability trait is allowed to perform the corresponding action, such as sending messages or opening ports. + +These traits are sealed, meaning they can only be implemented inside the crate. This ensures that capability boundaries are enforced and cannot be circumvented by downstream code. 
+ +### Overview + +| Capability | Description | +|----------------|-----------------------------------------------------| +| `CanSend` | Allows sending messages to ports | +| `CanOpenPort` | Allows creating new ports for receiving messages | +| `CanSplitPort` | Allows splitting existing ports with reducers | +| `CanSpawn` | Allows spawning new child actors | + +Each public trait (e.g., `CanSend`) is implemented for any type that implements the corresponding private `sealed::CanSend` trait. This gives the crate full control over capability delegation and encapsulation. + +### Example: CanSend +```rust +pub trait CanSend: sealed::CanSend {} +impl CanSend for T {} +``` + +The sealed version defines the core method: +```rust +pub trait sealed::CanSend: Send + Sync { + fn post(&self, dest: PortId, headers: Attrs, data: Serialized); +} +``` +Only internal types (e.g., `Mailbox`) implement this sealed trait, meaning only trusted components can obtain `CanSend`: +```rust +impl cap::sealed::CanSend for Mailbox { + fn post(&self, dest: PortId, headers: Attrs, data: Serialized) { + let return_handle = self + .lookup_sender::>() + .map_or_else( + || { + let actor_id = self.actor_id(); + if CAN_SEND_WARNED_MAILBOXES + .get_or_init(DashSet::new) + .insert(actor_id.clone()) { + let bt = std::backtrace::Backtrace::capture(); + tracing::warn!( + actor_id = ?actor_id, + backtrace = ?bt, + "mailbox attempted to post a message without binding Undeliverable" + ); + } + monitored_return_handle() + }, + |sender| PortHandle::new(self.clone(), self.state.allocate_port(), sender), + ); + let envelope = MessageEnvelope::new(self.actor_id().clone(), dest, data, headers); + MailboxSender::post(self, envelope, return_handle); + } +} +``` +This implementation prefers that the mailbox has already bound a port capable of receiving undeliverable messages (of type `Undeliverable`). This port acts as a return address for failed message deliveries. 
If the port is not bound, message sending will warn with a backtrace indicating a logic error in system setup and fall back on a `monitored_return_handle` (ideally we'd `panic!` but backwards compatibility prevents this). This ensures that all messages have a well-defined failure path and avoids silent message loss. diff --git a/docs/source/books/hyperactor-book/src/mailboxes/mailbox_client.md b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_client.md new file mode 100644 index 000000000..b2021fb3d --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_client.md @@ -0,0 +1,201 @@ +# MailboxClient + +A `MailboxClient` is the sending counterpart to a `MailboxServer`. It owns a buffer of outgoing messages and transmits them over a `channel::Tx` interface to a remote server. + +The client handles undeliverable returns, maintains a background task for monitoring channel health, and implements `MailboxSender` for compatibility. + +Topics in this section: + +- The `MailboxClient` struct and its `new` constructor +- The use of `Buffer` for decoupled delivery +- Delivery error handling and monitoring + +## Internal Buffering + +`MailboxClient` uses a `Buffer` internally to decouple message submission from actual transmission. This buffer ensures ordered, asynchronous delivery while preserving undeliverable routing guarantees. + +This is a foundational buffer abstraction used in several types in the remainder of the program. It's a concurrency-safe buffered message processor, parameterized on the message type `T`.
+ +The buffer: +- accepts messages of type `T` +- spawns an internal background task to process messages asynchronously +- tracks how many messages have been processed via a `watch` channel + `AtomicUsize`: +```rust +struct Buffer { + queue: mpsc::UnboundedSender<(T, PortHandle>)>, + processed: watch::Receiver, + seq: AtomicUsize, +} +``` +For functions of type `Fn(T) -> impl Future`, a new `Buffer` can be constructed: +```rust +impl Buffer { + fn new( + process: impl Fn(T, PortHandle>) -> Fut + Send + Sync + 'static, + ) -> Self + where + Fut: Future + Send + 'static, + { + let (queue, mut next) = mpsc::unbounded_channel(); + let (last_processed, processed) = watch::channel(0); + crate::init::RUNTIME.spawn(async move { + let mut seq = 0; + while let Some((msg, return_handle)) = next.recv().await { + process(msg, return_handle).await; + seq += 1; + let _ = last_processed.send(seq); + } + }); + Self { + queue, + processed, + seq: AtomicUsize::new(0), + } + } +} +``` +The `Buffer` type is constructed by providing a user-supplied asynchronous processing function. This function accepts incoming messages of type `T` together with a return handle for undeliverable messages. Each time a message is enqueued into the buffer, it is delivered to the processing function in the order received. + +Internally, the buffer maintains an unbounded channel for queued messages and spawns a background task responsible for processing messages sequentially. As each message is handled, the buffer advances an internal sequence counter and updates a watch channel, allowing external components to monitor processing progress if needed. The processing function is fully asynchronous: the buffer awaits its completion before proceeding to the next message, ensuring that processing remains ordered and that no work is dropped or skipped. 
+ +This design decouples message submission from processing, allowing producers to enqueue messages immediately while processing occurs concurrently in the background. + +We can write a `send` function for `Buffer`. It is not `async` since it just enqueues the incoming `T` for processing: +```rust +impl Buffer { + fn send( + &self, + item: (T, PortHandle>), + ) -> Result<(), mpsc::error::SendError<(T, PortHandle>)>> { + self.seq.fetch_add(1, Ordering::SeqCst); + self.queue.send(item)?; + Ok(()) + } +} +``` +The buffer maintains two separate counters: one tracking the number of messages submitted for processing, and another tracking the number of messages fully processed. The submission counter (`seq`) is updated atomically each time a message is enqueued. This allows external components to observe the current backlog of unprocessed messages by comparing the two counters. + +The `flush` operation, however, is `async`: +```rust +impl Buffer { + async fn flush(&mut self) -> Result<(), watch::error::RecvError> { + let seq = self.seq.load(Ordering::SeqCst); + while *self.processed.borrow_and_update() < seq { + self.processed.changed().await?; + } + Ok(()) + } +} +``` +This function allows callers to await the completion of all previously submitted messages. When invoked, the current submission sequence number is read to capture the total number of messages that have been enqueued at that point. The function then asynchronously waits until the processing counter reaches or exceeds this value, indicating that all submitted messages have been fully processed. + +Internally, `flush()` uses the buffer’s watch channel to observe updates as message processing advances. Each time a message completes processing, the background task updates the watch channel, allowing `flush()` to efficiently wait without busy-waiting or polling. + +## Role and Behavior of `MailboxClient` + +The `MailboxServer` listens for incoming messages on a channel and delivers them to the system.
The `MailboxClient` acts as the sender, enqueueing messages for transmission to the server. + +A `MailboxClient` is the **dual** of a `MailboxServer`. It: +- owns a `Buffer` that decouples senders from actual delivery; +- transmits messages asynchronously over a `channel::Tx`; +- reports undeliverable messages via a `PortHandle>`; +- monitors the transmission channel for health and shuts down appropriately. + +`MailboxServer` is a trait defining the receiving side of a message channel; `MailboxClient` is a concrete sender that buffers and transmits messages to it: +```rust +pub struct MailboxClient { + buffer: Buffer, + _tx_monitoring: CancellationToken, +} +``` + +The `MailboxClient::new` constructor creates a buffered client capable of sending `MessageEnvelope`s over a `channel::Tx`. This channel represents the transmission path to a remote `MailboxServer`. +```rust +impl MailboxClient { + pub fn new(tx: impl channel::Tx + Send + Sync + 'static) -> Self { + let addr = tx.addr(); + let tx = Arc::new(tx); + let tx_status = tx.status().clone(); + let tx_monitoring = CancellationToken::new(); + let buffer = Buffer::new(move |envelope, return_handle| { + let tx = Arc::clone(&tx); + let (return_channel, return_receiver) = oneshot::channel(); + // Set up for delivery failure. + let return_handle_0 = return_handle.clone(); + tokio::spawn(async move { + let result = return_receiver.await; + if let Ok(message) = result { + let _ = return_handle_0.send(Undeliverable(message)); + } else { + // Sender dropped, this task can end. + } + }); + // Send the message for transmission. + let return_handle_1 = return_handle.clone(); + async move { + if let Err(SendError(_, envelope)) = tx.try_post(envelope, return_channel) { + // Failed to enqueue.
+ envelope.undeliverable( + DeliveryError::BrokenLink("failed to enqueue in MailboxClient".to_string()), + return_handle_1.clone(), + ); + } + } + }); + let this = Self { + buffer, + _tx_monitoring: tx_monitoring.clone(), + }; + Self::monitor_tx_health(tx_status, tx_monitoring, addr); + this + } + +``` + +Constructing a `MailboxClient` sets up a buffer that attempts to transmit messages over a `channel::Tx`, returning them to the sender via the return handle if delivery fails. + +The client internally maintains a `Buffer` that decouples the enqueueing of messages from their actual delivery. This allows producers to send messages immediately without blocking on network or delivery latency. + +To construct the client: +- The provided `tx` (a `channel::Tx`) is wrapped in an `Arc` so it can be shared safely across tasks. +- A `CancellationToken` is created to coordinate shutdown or monitoring cancellation. +- A new `Buffer` is initialized, with a closure defining how each buffered message should be processed. + + This closure is passed `(envelope, return_handle)`: + 1. A fresh one-shot channel is created for each message, to support delivery-failure return paths. + 2. A background task is spawned that awaits the outcome of the one-shot channel. + - If the `channel::Tx` reports delivery failure by sending the message back on the one-shot channel, the task uses the return handle to report it as undeliverable. + 3. The closure returns an `async move` block that attempts to send the envelope using `tx.try_post(...)`. + - If the send fails (e.g., due to a broken channel), the envelope is marked as undeliverable and returned via the return handle. + +- Finally, the constructor installs a monitoring task using `monitor_tx_health`, allowing the client to detect when the transmission channel becomes unhealthy. + +The resulting `MailboxClient` consists of the constructed `Buffer` and the cancellation token used to coordinate monitoring.
+ +### `MailboxClient` implements `MailboxSender` + +`MailboxClient` itself implements the `MailboxSender` trait. This is made possible by delegating its `post` method to the underlying `Buffer` (by calling `send` on it). As a result, any component expecting a `MailboxSender` can use a `MailboxClient` transparently: +```rust +impl MailboxSender for MailboxClient { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + tracing::trace!(name = "post", "posting message to {}", envelope.dest); + if let Err(mpsc::error::SendError((envelope, return_handle))) = + self.buffer.send((envelope, return_handle)) + { + // Failed to enqueue. + envelope.undeliverable( + DeliveryError::BrokenLink("failed to enqueue in MailboxClient".to_string()), + return_handle, + ); + } + } +} +``` + +Although `MailboxClient` and `MailboxServer` play dual roles (one sends, the other receives) both implement the `MailboxSender` trait. + +In the client’s case, implementing `MailboxSender` allows it to participate in code paths that post messages, by enqueueing them into its internal buffer. For the server, `MailboxSender` reflects its ability to post directly into the system after receiving a message from a channel. diff --git a/docs/source/books/hyperactor-book/src/mailboxes/mailbox_sender.md b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_sender.md new file mode 100644 index 000000000..fd1a78d94 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_sender.md @@ -0,0 +1,133 @@ +# MailboxSender + +`MailboxSender` is a trait that abstracts the ability to deliver `MessageEnvelope`s. Anything that implements `MailboxSender` can act as a message sink—whether it's a local `Mailbox`, a forwarding proxy, or a buffered client. 
+ +This section introduces: + +- The `MailboxSender` trait +- The `PortSender` extension +- Standard implementations: `BoxedMailboxSender`, `PanickingMailboxSender`, `UndeliverableMailboxSender` + +`MailboxSender`s can send messages through ports to mailboxes. +```rust +pub trait MailboxSender: Send + Sync + Debug + Any { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ); +} +``` + +`PortSender` is an extension trait. The function `serialize_and_send` will serialize a message, install it in an envelope with an unknown sender and `post` it to the provided port. `serialize_and_send_once` `post`s to a one-shot port, consuming the provided port, which is not reusable. +```rust +pub trait PortSender: MailboxSender { + fn serialize_and_send( + &self, + port: &PortRef, + message: M, + return_handle: PortHandle>, + ) -> Result<(), MailboxSenderError> { + let serialized = Serialized::serialize(&message).map_err(|err| { + MailboxSenderError::new_bound( + port.port_id().clone(), + MailboxSenderErrorKind::Serialize(err.into()), + ) + })?; + self.post( + MessageEnvelope::new_unknown(port.port_id().clone(), serialized), + return_handle, + ); + Ok(()) + } + + fn serialize_and_send_once( + &self, + once_port: OncePortRef, + message: M, + return_handle: PortHandle>, + ) -> Result<(), MailboxSenderError> { + let serialized = Serialized::serialize(&message).map_err(|err| { + MailboxSenderError::new_bound( + once_port.port_id().clone(), + MailboxSenderErrorKind::Serialize(err.into()), + ) + })?; + self.post( + MessageEnvelope::new_unknown(once_port.port_id().clone(), serialized), + return_handle, + ); + Ok(()) + } +} +``` +All `MailboxSender`s are `PortSender`s too: +```rust +impl PortSender for T {} +``` +`PanickingMailboxSender` is a perpetually closed mailbox sender. It panics if any messages are posted on it. Some uses can be found for it here and there.
+```rust +#[derive(Debug, Clone)] +pub struct PanickingMailboxSender; + +impl MailboxSender for PanickingMailboxSender { + fn post( + &self, + envelope: MessageEnvelope, + _return_handle: PortHandle>, + ) { + panic!("panic! in the mailbox! attempted post: {}", envelope) + } +} +``` +`UndeliverableMailboxSender` is a mailbox sender of last resort for undeliverable messages that writes to the error log: +```rust +#[derive(Debug)] +pub struct UndeliverableMailboxSender; + +impl MailboxSender for UndeliverableMailboxSender { + fn post( + &self, + envelope: MessageEnvelope, + _return_handle: PortHandle>, + ) { + tracing::error!("message not delivered: {}", envelope); + } +} +``` + +`BoxedMailboxSender` is a type-erased, thread-safe, reference-counted mailbox sender: +```rust +struct BoxedMailboxSender(Arc); + +impl MailboxSender for BoxedMailboxSender { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + hyperactor_telemetry::declare_static_counter!(MAILBOX_POSTS, "mailbox.posts"); + MAILBOX_POSTS.add( + 1, + hyperactor_telemetry::kv_pairs!( + "actor_id" => envelope.sender.to_string(), + "dest_actor_id" => envelope.dest.0.to_string(), + ), + ); + self.0.post(envelope, return_handle) + } +} +``` +hyperactor internally makes use of a global boxed panicking mailbox sender: +```rust +static BOXED_PANICKING_MAILBOX_SENDER: LazyLock = + LazyLock::new(|| BoxedMailboxSender::new(PanickingMailboxSender)); +``` + +`Mailbox` is a concrete type representing an actor’s local inbox. Internally, it holds a map of ports and routes incoming messages to their respective receivers. + +Meanwhile, `MailboxSender` is an abstraction: a trait that represents anything capable of delivering a `MessageEnvelope` to a mailbox. + +Every `Mailbox` implements `MailboxSender`. When you invoke `post` on a `Mailbox`, it performs local delivery by looking up the port and forwarding the message.
+ +Other types - such as `MailboxServer`, `BoxedMailboxSender`, or adapters that forward to remote systems also implement `MailboxSender`. diff --git a/docs/source/books/hyperactor-book/src/mailboxes/mailbox_server.md b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_server.md new file mode 100644 index 000000000..caaa564dd --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/mailbox_server.md @@ -0,0 +1,104 @@ +# MailboxServer + +A `MailboxServer` listens for incoming `MessageEnvelope`s from an external source and posts them into a mailbox using the `MailboxSender` trait. + +This section describes: + +- The `MailboxServer` trait and its `serve` method +- The `MailboxServerHandle` used for graceful shutdown +- The internal `tokio::select!` loop for serving messages + +A `MailboxServer` is any `MailboxSender` that supports being connected to a channel from which it receives `MessageEnvelope`s. It defines a single function, `serve`, which spawns a background task that listens for messages on the channel and posts them into the system via its `post` method. + +A `MailboxServerHandle` represents a running mailbox server. The handle composes a tokio `JoinHandle` and may be joined in the same manner (`MailboxServerHandle` implements `Future`): +```rust +#[derive(Debug)] +pub struct MailboxServerHandle { + join_handle: JoinHandle>, + stopped_tx: watch::Sender, +} +``` + +A mailbox server can be turned down using the `watch::Sender`: +```rust +impl MailboxServerHandle { + pub fn stop(&self) { + self.stopped_tx.send(true).expect("stop called twice"); + } +} +``` + +There is an error type associated with "mailbox serving": +```rust +#[derive(thiserror::Error, Debug)] +pub enum MailboxServerError { + /// An underlying channel error. + #[error(transparent)] + Channel(#[from] ChannelError), + + /// An underlying mailbox sender error. 
+ #[error(transparent)] + MailboxSender(#[from] MailboxSenderError), +} +``` + +A `MailboxServer` is any `MailboxSender` that supports being connected to a channel from which it receives `MessageEnvelope`s. It runs a background task that listens for messages on the channel and posts them into the system via its `post` method: +```rust +pub trait MailboxServer: MailboxSender + Sized + 'static { + fn serve( + self, + mut rx: impl channel::Rx + Send + 'static, + return_handle: PortHandle>, + ) -> MailboxServerHandle { + let (stopped_tx, mut stopped_rx) = watch::channel(false); + let join_handle = tokio::spawn(async move { + let mut detached = false; + + loop { + if *stopped_rx.borrow_and_update() { + break Ok(()); + } + + tokio::select! { + message = rx.recv() => { + match message { + // Relay the message to the port directly. + Ok(envelope) => self.post(envelope, return_handle.clone()), + + // Closed is a "graceful" error in this case. + // We simply stop serving. + Err(ChannelError::Closed) => break Ok(()), + Err(channel_err) => break Err(MailboxServerError::from(channel_err)), + } + } + result = stopped_rx.changed(), if !detached => { + tracing::debug!( + "the mailbox server is stopped" + ); + detached = result.is_err(); + } + } + } + }); + + MailboxServerHandle { + join_handle, + stopped_tx, + } + } +} +``` +The use of `detached` above is clever - there is no point in waiting for `stopped_rx.changed()` any more if the sender has been dropped. + +This provides a general mechanism for bridging external message sources such as remote transport with local mailbox delivery. + +The `serve` function spawns this background task and returns a handle that can be used to signal shutdown or await termination.
+ +This blanket impl declares that **any type `T`** which: +- implements the `MailboxSender` trait +- is `Sized`, `Sync`, `Send` and `'static` + +will **automatically implement `MailboxServer`** by inheriting the default `serve` function provided in the trait definition: +```rust +impl<T: MailboxSender + Sized + Sync + Send + 'static> MailboxServer for T {} +``` diff --git a/docs/source/books/hyperactor-book/src/mailboxes/multiplexer.md b/docs/source/books/hyperactor-book/src/mailboxes/multiplexer.md new file mode 100644 index 000000000..6a0a3dfdb --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/multiplexer.md @@ -0,0 +1,73 @@ +# Multiplexers + +**Muxers** (short for multiplexers) form the first level of indirection in the mailbox subsystem. While a `Mailbox` delivers messages to typed ports within a single actor, a `MailboxMuxer` delivers messages to the correct mailbox instance given an `ActorId`. + +It acts as a dynamic registry, allowing multiple mailboxes to be addressed through a single posting interface. + +This page introduces the `MailboxMuxer` and its role in: +- Aggregating multiple mailbox instances +- Dispatching incoming messages to the appropriate `MailboxSender` +- Supporting dynamic binding and unbinding of mailboxes + +Let's begin by looking at the core structure of `MailboxMuxer`: +```rust +pub struct MailboxMuxer { + mailboxes: Arc>>, +} +``` +The `MailboxMuxer` maintains a thread-safe, concurrent map from `ActorId` to `MailboxSender` trait objects. Each entry represents a live binding to a mailbox capable of receiving messages for a specific actor. This allows the muxer to act as a single dispatch point for delivering messages to any number of registered actors, abstracting over the details of how and where each mailbox is implemented.
+ +To register a mailbox with the muxer, callers use the `bind` method: +```rust +impl MailboxMuxer { + pub fn bind(&self, actor_id: ActorId, sender: impl MailboxSender + 'static) -> bool { + match self.mailboxes.entry(actor_id) { + Entry::Occupied(_) => false, + Entry::Vacant(entry) => { + entry.insert(Box::new(sender)); + true + } + } + } + +} +``` +This function installs a new mapping from the given `ActorId` to a boxed `MailboxSender`. If the `ActorId` is already registered, the bind fails (returns `false`), and the existing sender is left unchanged. This ensures that actors cannot be accidentally rebound without first explicitly unbinding them—enforcing a clear handoff protocol. To rebind, the caller must invoke `unbind` first. + +It's crucial to recall that `Mailbox` itself implements the `MailboxSender` trait. This is what allows it to be registered directly into a `MailboxMuxer`. The `post` method of a `Mailbox` inspects the incoming `MessageEnvelope` to determine whether it is the intended recipient. If the `ActorId` in the envelope matches the mailbox's own ID, the mailbox delivers the message locally: it looks up the appropriate port by index and invokes `send_serialized` on the matching channel. If the `ActorId` does *not* match, the mailbox delegates the message to its internal forwarder by calling `self.state.forwarder.post(envelope)`. 
+ +With this behavior in mind, we can now define a convenience method for registering a full `Mailbox`: + +```rust +impl MailboxMuxer { + fn bind_mailbox(&self, mailbox: Mailbox) -> bool { + self.bind(mailbox.actor_id().clone(), mailbox) + } +} +``` +To support rebinding or teardown, the muxer also provides a symmetric `unbind` function, which removes the sender associated with a given `ActorId`: +```rust + pub(crate) fn unbind(&self, actor_id: &ActorId) { + self.mailboxes.remove(actor_id); + } +``` +And of course, we can implement `MailboxSender` for `MailboxMuxer` itself—allowing it to act as a unified dispatcher for all registered mailboxes: +```rust +impl MailboxSender for MailboxMuxer { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + let dest_actor_id = envelope.dest().actor_id(); + match self.mailboxes.get(envelope.dest().actor_id()) { + None => { + let err = format!("no mailbox for actor {} registered in muxer", dest_actor_id); + envelope.undeliverable(DeliveryError::Unroutable(err), return_handle) + } + Some(sender) => sender.post(envelope, return_handle), + } + } +} +``` +This makes `MailboxMuxer` composable: it can be nested within other routers, shared across components, or substituted for a standalone mailbox in generic code. If the destination `ActorId` is found in the internal map, the message is forwarded to the corresponding sender. Otherwise, it is marked as undeliverable with an appropriate `DeliveryError`. diff --git a/docs/source/books/hyperactor-book/src/mailboxes/ports.md b/docs/source/books/hyperactor-book/src/mailboxes/ports.md new file mode 100644 index 000000000..519117c7b --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/ports.md @@ -0,0 +1,454 @@ +# Ports + +Ports are the basic units of typed message delivery within a mailbox. 
Each port is a pair consisting of: + +- a `PortHandle`, used to send messages of type `M`, and +- a `PortReceiver`, used to receive those messages asynchronously. + +Variants like `OncePortHandle` and `OncePortReceiver` support one-shot communication. All ports are associated with a unique `PortId` within an actor, and may be wrapped in a [`PortRef`](#portref) for safe external use. + +This section defines the structure, behavior, and usage of ports. + +## Delivery Backends + +Each typed port uses an internal delivery mechanism to enqueue messages. This is abstracted by the `UnboundedPortSender` enum: +```rust +enum UnboundedPortSender { + Mpsc(mpsc::UnboundedSender), + Func(Arc Result<(), anyhow::Error> + Send + Sync>), +} +``` +- **`Mpsc`**: Sends messages into a tokio unbounded channel +- **`Func`**: Custom logic, often used to enqueue messages onto actor work queues. + +Messages are sent via the `.send(headers, message)` method, which forwards to either the internal channel or the configured function. + +## `PortHandle` + +A `PortHandle` is used to send `M`-typed messages to a mailbox port. It represents the sending half of a typed port: +```rust +pub struct PortHandle { + mailbox: Mailbox, + port_index: u64, + sender: UnboundedPortSender, + bound: Arc>, + reducer_typehash: Option, +} +``` +### Fields + +- **`mailbox`**: The `Mailbox` this port was created from. Stored so the handle can access the actor ID and bind itself into the mailbox’s internal port map. +- **`port_index`**: The local index of the port within the mailbox. Used as the key in the mailbox's port map. +- **`sender`**: The internal message delivery mechanism (e.g., MPSC channel). This determines how messages of type `M` are actually enqueued. +- **`bound`**: A lazily initialized `PortId` stored in a `OnceLock`. This is populated when the port is formally bound into the mailbox, enabling external references via `PortRef`.
+- **`reducer_typehash`**: An optional type hash representing a reducer function for accumulating messages. Used in specialized delivery modes (e.g., incremental updates). + +### Construction and Use + +```rust +impl PortHandle { + fn new(mailbox: Mailbox, port_index: u64, sender: UnboundedPortSender) -> Self { + Self { + mailbox, + port_index, + sender, + bound: Arc::new(OnceLock::new()), + reducer_typehash: None, + } + } + + fn location(&self) -> PortLocation { + match self.bound.get() { + Some(port_id) => PortLocation::Bound(port_id.clone()), + None => PortLocation::new_unbound::(self.mailbox.actor_id().clone()), + } + } + + pub fn send(&self, message: M) -> Result<(), MailboxSenderError> { + self.sender.send(message).map_err(|err| { + MailboxSenderError::new_unbound::( + self.mailbox.actor_id().clone(), + MailboxSenderErrorKind::Other(err), + ) + }) + } +} +``` +- `new` constructs a port handle with the mailbox, port index and delivery backend +- `location` reports whether the port is currently bound +- `send` enqueues a message via the internal sender, wrapping errors as needed. + +### Binding + +To make a port externally addressable (e.g. for use in remote delivery), it must be **bound**: +```rust +impl PortHandle { + pub fn bind(&self) -> PortRef; +} +``` +This registers the port in the owning `Mailbox` and returns a `PortRef`. Binding is lazy and idempotent. For a detailed explanation of port binding, see [Mailbox](./mailbox.md#port-binding). + +## PortLocation + +`PortLocation` describes the logical address of a port. It is used in error messages and has two cases to represent whether or not a port is bound. +```rust +pub enum PortLocation { + /// The port was bound: the location is its underlying bound ID. + Bound(PortId), + /// The port was not bound: we provide the actor ID and the message type. + Unbound(ActorId, &'static str), +} +``` + +## `OncePortHandle` + +A `OncePortHandle` is a one-shot sender for `M`-typed messages.
Unlike `PortHandle`, which supports unbounded delivery, this variant enqueues a single message using a one-time `oneshot::Sender`: +```rust +pub struct OncePortHandle { + mailbox: Mailbox, + port_index: u64, + port_id: PortId, + sender: oneshot::Sender, +} +``` +### Fields +- **`mailbox`**: The `Mailbox` this port was created from. Stored so the handle can access the actor ID and register itself in the mailbox’s port map. +- **`port_index`**: The local index of the port within the mailbox. Used as the key in the mailbox’s port map. +- **`port_id`**: The globally unique identifier for this port. Assigned eagerly, since one-shot ports are always bound at creation. +- **`sender`**: The one-shot message delivery channel. Used to transmit a single M-typed message. + +Compared to [`PortHandle`](#porthandlem), a `OncePortHandle` is: + +- **bound eagerly** at creation (it always has a `PortId`), +- **non-reusable** (it delivers at most one message), +- and uses a **one-shot channel** instead of an unbounded queue. + +### Binding + +Unlike `PortHandle`, `OncePortHandle` is already bound at creation. However, calling `bind()` produces a `OncePortRef` that can be shared with remote actors: +```rust +impl OncePortHandle { + pub fn bind(self) -> OncePortRef; +} +``` + +## `PortRef` and `OncePortRef` + +A `PortRef` is a cloneable, sendable reference to a bound typed port. These are used to send messages to an actor from outside its mailbox, typically after calling `.bind()` on a `PortHandle`: +```rust +pub struct PortRef { + port_id: PortId, + reducer_typehash: Option, + phantom: PhantomData, +} +``` +### Fields + +- **`port_id`**: The globally unique identifier for this port. Used during message routing to locate the destination mailbox. +- **`reducer_typehash`**: Optional hash of the reducer type, used to validate compatibility when delivering messages to reducer-style ports. +- **`phantom`**: Phantom data to retain the `M` type parameter. 
This enforces compile-time type safety without storing a value of type `M`. + +A `OncePortRef` is a reference to a one-shot port. Unlike `PortRef`, it allows exactly one message to be sent. These are created by binding a `OncePortHandle`. +```rust +pub struct OncePortRef { + port_id: PortId, + phantom: PhantomData, +} +``` + +### Fields + +- **`port_id`**: The globally unique identifier for this port. Used during message routing to locate the destination mailbox. +- **`phantom`**: Phantom data to retain the `M` type parameter. This enforces compile-time type safety without storing a value of type `M`. + +## `PortReceiver` + +A `PortReceiver` is used to asynchronously receive `M`-typed messages from a port. It is the receiving half of a typed port pair: +```rust +pub struct PortReceiver { + receiver: mpsc::UnboundedReceiver, + port_id: PortId, + coalesce: bool, + state: Arc, +} +``` +### Fields + +- **`receiver`**: The unbounded MPSC channel receiver used to retrieve messages sent to this port. +- **`port_id`**: The unique identifier for the port associated with this receiver. Used to deregister the port when the receiver is dropped. +- **`coalesce`**: If `true`, enables coalescing behavior — only the most recent message is retained when multiple are queued, and earlier ones are discarded. +- **`state`**: Shared internal mailbox state. Used to cleanly deregister the port from the mailbox when the receiver is dropped. + +### Usage + +A `PortReceiver` is returned when calling `.open_port::()` on a `Mailbox`. The actor can `await` messages on the receiver using `.recv().await`, which yields `Option`: +```rust +let (port, mut receiver) = mailbox.open_port::(); +// ... +if let Some(msg) = receiver.recv().await { + handle(msg); +} +``` + +### Construction and Use + +A `PortReceiver` is created when calling `.open_port::()` on a `Mailbox`. 
`new` just constructs a `PortReceiver` by wrapping the provided channel (`receiver`): +```rust +impl PortReceiver { + fn new( + receiver: mpsc::UnboundedReceiver, + port_id: PortId, + coalesce: bool, + state: Arc + ) -> Self { + Self { + receiver, + port_id, + coalesce, + state, + } + } +} +``` + +Dropping the `PortReceiver` automatically deregisters the associated port, preventing further message delivery. +```rust +impl Drop for PortReceiver { + fn drop(&mut self) { + self.state.ports.remove(&self.port()); + } +} +``` + +### `try_recv` + +Attempts to receive a message from the port without blocking. + +This method polls the underlying channel and returns immediately: + +- `Ok(Some(msg))` if a message is available, +- `Ok(None)` if the queue is currently empty, +- `Err(MailboxError)` if the port is closed or disconnected. + +If the port was created with `coalesce = true`, `try_recv()` drains the queue and returns only the most recent message, discarding earlier ones: +```rust +impl PortReceiver { + pub fn try_recv(&mut self) -> Result, MailboxError> { + let mut next = self.receiver.try_recv(); + // To coalesce, drain the mpsc queue and only keep the last one. + if self.coalesce { + if let Some(latest) = self.drain().pop() { + next = Ok(latest); + } + } + match next { + Ok(msg) => Ok(Some(msg)), + Err(mpsc::error::TryRecvError::Empty) => Ok(None), + Err(mpsc::error::TryRecvError::Disconnected) => Err(MailboxError::new( + self.actor_id().clone(), + MailboxErrorKind::Closed, + )), + } + } +} +``` + +### `recv` + +Receives the next message from the port, waiting if necessary. + +This is the asynchronous counterpart to `try_recv`. It awaits a message and returns it once available. If the port has been closed, it returns a `MailboxError`. + +When `coalesce = true`, this method awaits one message, then drains the queue and returns only the most recent one.
+```rust +impl PortReceiver { + pub async fn recv(&mut self) -> Result { + let mut next = self.receiver.recv().await; + // To coalesce, get the last message from the queue if there are + // more on the mspc queue. + if self.coalesce { + if let Some(latest) = self.drain().pop() { + next = Some(latest); + } + } + next.ok_or(MailboxError::new( + self.actor_id().clone(), + MailboxErrorKind::Closed, + )) + } +} +``` + +### `drain` + +Drains all available messages from the port without blocking. + +This method is used internally by `recv` and `try_recv` when `coalesce = true`, but can also be used directly to consume multiple messages in a batch. + +If `coalesce` is enabled, all but the most recent message are discarded during the drain. +```rust +impl PortReceiver { + pub fn drain(&mut self) -> Vec { + let mut drained: Vec = Vec::new(); + while let Ok(msg) = self.receiver.try_recv() { + // To coalesce, discard the old message if there is any. + if self.coalesce { + drained.pop(); + } + drained.push(msg); + } + drained + } +} +``` + +## `OncePortReceiver` + +A `OncePortReceiver` is the receiving half of a one-shot port. It is returned when calling `.open_once_port::()` on a `Mailbox`. Unlike `PortReceiver`, this variant: +- Receives exactly one message, +- Consumes itself on receive (i.e., recv takes self by value), +- Internally wraps a `oneshot::Receiver` instead of an unbounded channel. +```rust +pub struct OncePortReceiver { + receiver: Option>, + port_id: PortId, + state: Arc, +} +``` + +### Receiving + +`recv()` consumes the `OncePortReceiver` and awaits a single message. If the port is closed before the message is sent, it returns a `MailboxError`. 
+```rust +impl OncePortReceiver { + pub async fn recv(mut self) -> Result { + std::mem::take(&mut self.receiver) + .unwrap() + .await + .map_err(|err| { + MailboxError::new( + self.actor_id().clone(), + MailboxErrorKind::Recv(self.port_id.clone(), err.into()), + ) + }) + } +} +``` +The `recv` method moves out the internal `oneshot::Receiver` using `std::mem::take()` and awaits it. Any error (e.g., if the sender was dropped) is converted into a `MailboxError`. + +### Lifecycle and Deregistration + +Like `PortReceiver`, dropping a `OncePortReceiver` deregisters the port from the mailbox’s state: +```rust +impl Drop for OncePortReceiver { + fn drop(&mut self) { + self.state.ports.remove(&self.port()); + } +} +``` +This ensures the port becomes unreachable and no further message delivery occurs once the receiver is dropped. + +## Sending Messages + +### `UnboundedSender` and `OnceSender` +Every open port is backed by a sender, responsible for delivering messages to the corresponding receiver. For unbounded ports, this sender is: +```rust +enum UnboundedPortSender { + Mpsc(mpsc::UnboundedSender), + Func(Arc Result<(), anyhow::Error> + Send + Sync>), +} +``` +These are wrapped in: +```rust +struct UnboundedSender { + sender: UnboundedPortSender, + port_id: PortId, +} +``` +The `send` method forwards messages and wraps errors in a `MailboxSenderError`: +```rust +impl UnboundedSender { + fn send(&self, message: M) -> Result<(), MailboxError> { ... } +} +``` +`OnceSender` is similar, but uses a `oneshot::Sender` under the hood: +```rust +struct OnceSender { + sender: Arc>>>, + port_id: PortId, +} +``` +Calling `.send_once(message)` on a `OnceSender` consumes the channel, and fails if the message has already been sent or the receiver is dropped.
+ +### Type-Erased Delivery: `SerializedSender` + +To enable uniform message routing, both `UnboundedSender` and `OnceSender` implement the `SerializedSender` trait: +```rust +trait SerializedSender: Send + Sync { + fn as_any(&self) -> &dyn Any; + fn send_serialized( + &self, + headers: Attrs, + serialized: Serialized, + ) -> Result; + +``` +This trait lets the mailbox deliver a `Serialized` message (a type-erased, encoded payload) by: +1. Deserializing the payload into a concrete `M` using the `RemoteMessage` trait, +2. Sending it to the appropriate port, via its registered sender. + +All active ports in a mailbox are internally tracked in a type-erased form: +``` +ports: DashMap>, +``` +This enables the mailbox to deliver messages to any known port regardless of its specific message type, provided deserialization succeeds. + +If deserialization fails, or the underlying port is closed, an appropriate `MailboxSenderError` is returned via a `SerializedSenderError`. + +See the (Mailbox) [`State`](./mailbox.md#state) section for details on how the mailbox owns and manages this ports map.
+ +#### Example: `SerializedSender` for `UnboundedSender` + +Below is the canonical implementation of `SerializedSender` for `UnboundedSender`: +```rust +impl SerializedSender for UnboundedSender { + fn send_serialized( + &self, + headers: Attrs, + serialized: Serialized, + ) -> Result { + match serialized.deserialized() { + Ok(message) => { + self.sender.send(headers.clone(), message).map_err(|err| { + SerializedSenderError { + data: serialized, + error: MailboxSenderError::new_bound( + self.port_id.clone(), + MailboxSenderErrorKind::Other(err), + ), + headers, + } + })?; + + Ok(true) + } + Err(err) => Err(SerializedSenderError { + data: serialized, + error: MailboxSenderError::new_bound( + self.port_id.clone(), + MailboxSenderErrorKind::Deserialize(M::typename(), err), + ), + headers, + }), + } + } +} +``` +This implementation: +- Attempts to decode the payload into a concrete `M`, +- Sends the decoded message via the associated port, +- Returns `Ok(true)` on success, or wraps any failure into a `SerializedSenderError`. + +#### `OnceSender` + +`OnceSender` implements `SerializedSender` similarly, deserializing the payload and forwarding it via a one-shot channel. It differs mainly in that the underlying port may only be used once and returns `Ok(false)` when consumed. diff --git a/docs/source/books/hyperactor-book/src/mailboxes/reconfigurable_sender.md b/docs/source/books/hyperactor-book/src/mailboxes/reconfigurable_sender.md new file mode 100644 index 000000000..f4c62fa52 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/reconfigurable_sender.md @@ -0,0 +1,63 @@ +# Reconfigurable Senders + +Some actors are constructed before the full messaging graph is available. +For example, the `ReconfigurableMailboxSender` is used during `MeshAgent::bootstrap` to allow early creation of the `Proc` and agent before outbound routing is available. +The `.configure(...)` method installs the actual router later, once mesh wiring is complete. 
+ +## Motivation + +Actors like `mesh_agent` are created before remote routing infrastructure is established. These actors need to send messages during setup, but the concrete `MailboxSender` they will use hasn't been determined yet. + +To solve this, `ReconfigurableMailboxSender` implements [`MailboxSender`] and supports **deferred configuration**: it starts by queueing messages in memory, then later transitions to forwarding once a real sender is available. + +## Internal Structure + +The sender wraps a state machine: + +```rust +struct ReconfigurableMailboxSender { + state: Arc>, +} +``` +There are two possible states: +```rust +type Post = (MessageEnvelope, PortHandle>); + +enum ReconfigurableMailboxSenderState { + Queueing(Mutex>), + Configured(BoxedMailboxSender), +} +``` +- In the `Queueing` state, messages are buffered. +- When `.configure(...)` is called, the queue is flushed into the new sender, and the state is replaced with `Configured(...)`. + +### Configuration + +The `.configure(...)` method installs the actual sender. If called while in the `Queueing` state, it: + - Drains all buffered messages to the given sender + - Transitions to the `Configured` state + - Returns `true` if this was the first successful configuration + +Subsequent calls are ignored and return `false`. +```rust +fn configure(&self, sender: BoxedMailboxSender) -> bool { + let mut state = self.state.write().unwrap(); + if state.is_configured() { + return false; + } + + let queued = std::mem::replace( + &mut *state, + ReconfigurableMailboxSenderState::Configured(sender.clone()), + ); + + for (envelope, return_handle) in queued.into_queueing().unwrap().into_inner().unwrap() { + sender.post(envelope, return_handle); + } + + *state = ReconfigurableMailboxSenderState::Configured(sender); + true +} +``` + +This guarantees that messages posted before configuration are not dropped - they are delivered in-order once the sender becomes available.
diff --git a/docs/source/books/hyperactor-book/src/mailboxes/routers.md b/docs/source/books/hyperactor-book/src/mailboxes/routers.md new file mode 100644 index 000000000..e77923bd0 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/mailboxes/routers.md @@ -0,0 +1,286 @@ +# Routers + +While a muxer dispatches messages to exact `ActorId` matches, a **router** generalizes this by routing messages to the *nearest matching prefix*. This enables hierarchical, prefix-based routing across clusters of actors—spanning local and remote processes. + +Routers extend the ideas of the muxer with: +- Longest-prefix matching on structured `Reference` identifiers +- Dynamic routing to local and remote mailboxes +- Optional serialization and remote connection via `DialMailboxRouter` +- Fallback logic via `WeakMailboxRouter` + +This page introduces: +- `MailboxRouter`: prefix-routing within a shared process +- `DialMailboxRouter`: remote routing with connection management +- `WeakMailboxRouter`: downgradeable reference for ephemeral routing + +To support routing, hyperactor defines a universal reference type for hierarchical identifiers: +```rust +pub enum Reference { + World(WorldId), + Proc(ProcId), + Actor(ActorId), + Port(PortId), +} +``` +A `Reference` encodes a path through the logical structure of the system-spanning from broad scopes like worlds and procs to fine-grained targets like actors or ports. It has a concrete string syntax (e.g., `world[0].actor[42]`) and can be parsed from user input or configuration via `FromStr`. 
+ +## Total Ordering and Prefix Routing + +`Reference` implements a total order via a lexicographic comparison of its internal components: +```rust +impl PartialOrd for Reference { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Reference { + fn cmp(&self, other: &Self) -> Ordering { + ( + self.world_id(), + self.rank(), + self.actor_name(), + self.pid(), + self.port(), + ) + .cmp(&( + other.world_id(), + other.rank(), + other.actor_name(), + other.pid(), + other.port(), + )) + } +} +``` +This means that references are ordered by their position in the system hierarchy-starting with world, then rank (within the world), then actor name, PID, and finally port. For example: +```text +world[0] < world[0].actor["trainer"] < world[0].actor["trainer"][5] +``` + +Semantically, a `Reference` like `Proc(p)` is considered a prefix of any `Actor` or `Port` reference that shares the same world and process. + +Because this order is total and consistent with prefix semantics, it enables efficient prefix-based routing using `BTreeMap`. When routing a message, the destination `ActorId` is converted into a `Reference`, and the router performs a longest-prefix match by locating the nearest entry that is a prefix of the destination. + +### `MailboxRouter` + +With this structure in place, we can now define the core router: +```rust +pub struct MailboxRouter { + entries: Arc>>>, +} +``` + +A `MailboxRouter` maintains a thread-safe mapping from `Reference` prefixes to corresponding `MailboxSender`s. These entries form the routing table: each entry declares that messages targeting a reference in that subtree should be forwarded to the given sender. + +When a message is routed, its destination `ActorId` is converted into a `Reference`. The router performs a longest-prefix match against the table to find the nearest registered handler. 
+ +### Binding and Downgrading + +To register a new routing entry, the router provides a `bind` method: + +```rust +impl MailboxRouter { + pub fn bind(&self, dest: Reference, sender: impl MailboxSender + 'static) { + let mut w = self.entries.write().unwrap(); + w.insert(dest, Arc::new(sender)); + } +} +``` +Each call to `bind` inserts a new `Reference` → `MailboxSender` entry into the routing table. These entries act as prefixes: once inserted, they serve as candidates during longest-prefix matching at message delivery time. + +In some cases, you may want to share or store a weak reference to the router-especially when integrating with structures that should not keep the routing table alive indefinitely. To support this, `MailboxRouter` can be downgraded to a `WeakMailboxRouter`: +```rust +impl MailboxRouter { + pub fn downgrade(&self) -> WeakMailboxRouter { + WeakMailboxRouter(Arc::downgrade(&self.entries)) + } +} +``` +This enables ephemeral or optional routing logic—useful for circular dependencies, test scaffolding, or weakly held topology graphs. + +The `WeakMailboxRouter` is a lightweight wrapper around a weak reference to the router’s internal state: +```rust +pub struct WeakMailboxRouter( + Weak>>>, +); +``` +A `WeakMailboxRouter` can be upgraded back into a strong `MailboxRouter` (if the underlying state is still alive) or used to fail gracefully when routing is unavailable. 
+ +### Routing via `MailboxSender` + +To participate in message delivery, `MailboxRouter` implements the `MailboxSender` trait: +```rust +impl MailboxSender for MailboxRouter { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + let sender = { + let actor_id = envelope.dest().actor_id(); + match self + .entries + .read() + .unwrap() + .lower_bound(Excluded(&actor_id.clone().into())) + .prev() + { + None => None, + Some((key, sender)) if key.is_prefix_of(&actor_id.clone().into()) => { + Some(sender.clone()) + } + Some(_) => None, + } + }; + + match sender { + None => envelope.undeliverable( + DeliveryError::Unroutable( + "no destination found for actor in routing table".to_string(), + ), + return_handle, + ), + Some(sender) => sender.post(envelope, return_handle), + } + } +} +``` +This implementation performs a longest-prefix match using the total order on `Reference`: +1. It converts the destination `ActorId` into a `Reference`. +2. It performs a descending prefix search using: +```rust + entries.lower_bound(Excluded(&reference)).prev() +``` +This locates the greatest key in the routing table that is less than or equal to the destination: `lower_bound(Excluded(..))` positions the cursor just before the first key greater than the destination, and `prev()` steps back over the key immediately preceding that position, so an exact match is also a candidate. + +3. It checks whether that key is a semantic prefix of the destination (via `is_prefix_of`). +4. If a match is found, the message is forwarded to the corresponding `MailboxSender`. +5. If no match is found, the message is marked as undeliverable, and returned using the provided `return_handle`. + +### `WeakMailboxRouter` + +A `WeakMailboxRouter` is a downgradeable, non-owning reference to a router's internal state. It allows optional or ephemeral routing participation-for example, when holding a fallback route without keeping the full routing table alive.
+```rust +pub struct WeakMailboxRouter( + Weak>>>, +); +``` +To integrate into the routing system, `WeakMailboxRouter` also implements `MailboxSender`: +```rust +impl MailboxSender for WeakMailboxRouter { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle>, + ) { + match self.upgrade() { + Some(router) => router.post(envelope, return_handle), + None => envelope.undeliverable( + DeliveryError::BrokenLink("failed to upgrade WeakMailboxRouter".to_string()), + return_handle, + ), + } + } +} +``` +If the router has already been dropped, `post` fails gracefully by returning the message to the sender with a `BrokenLink` error. This makes `WeakMailboxRouter` useful in dynamic topologies or teardown-sensitive control paths, where a full routing table may not be guaranteed to exist at the time of message delivery. + +### `DialMailboxRouter`: Remote and Serializable Routing + +While `MailboxRouter` supports prefix-based routing, it relies on explicitly registered `MailboxSender`s. In contrast, `DialMailboxRouter` enables **remote routing** through a dynamic address book and connection cache. It can forward messages to remote actors by establishing outbound connections on demand. +```rust +pub struct DialMailboxRouter { + address_book: Arc>>, + sender_cache: Arc>>, + default: BoxedMailboxSender, +} +``` +#### Address Book + +The `address_book` maps `Reference` prefixes to `ChannelAddr`s representing remote destinations. + +#### Sender Cache + +The `sender_cache` holds active `MailboxClient` connections keyed by address. When a message arrives, the router looks up the target in the address book and either reuses an existing sender or dials a new one. + +#### Default Route + +If no matching reference is found, the message is forwarded to a `default` sender—useful as a catch-all route or failover handler. + +This structure enables adaptive, connection-aware routing across distributed systems. 
Next, we’ll walk through constructing and populating a `DialMailboxRouter` using `new()` and `bind()`. + +### Managing Routes: `bind` and `unbind` + +To populate the router, use `bind` to associate a `Reference` with a `ChannelAddr`. This replaces any existing mapping for the same reference and evicts any cached sender tied to the old address: +```rust +router.bind(reference, remote_addr); +``` +To remove entries, use `unbind`. It removes all mappings with the given prefix-effectively deleting a subtree of the address book. Corresponding cached senders are also evicted to prevent reuse of stale connections: +```rust +router.unbind(&reference_prefix); +``` +This allows the router to adapt dynamically to process exits, topology changes, or application-level reconfiguration. The use of `is_prefix_of` during unbinding ensures that hierarchical references can be removed in bulk-e.g., removing a `Proc`-level entry will also remove all associated `Actor` routes. + +### Lookup and Dialing + +Once the router has been populated using `bind`, message delivery proceeds in two phases: **lookup** followed by **dialing**, if needed. + +#### Address Lookup + +When a message arrives, the router first attempts to locate a destination using `lookup_addr`. This method: + +- Converts the message’s `ActorId` into a `Reference` +- Performs a longest-prefix search using `lower_bound(...).prev()` on the address book +- Applies `is_prefix_of` to confirm that the matched reference is semantically valid + +This allows the router to resolve addresses at varying levels of granularity-e.g., by world, process, or actor. + +#### Dialing + +If a matching address is found and no cached connection exists, the router attempts to establish one using `channel::dial`. The resulting connection is wrapped in a `MailboxClient` and inserted into the sender cache for future use. + +If the address is already cached, the router reuses the existing sender to avoid redundant connections. 
+ +The result of this lookup-and-dial process is an `Arc<MailboxClient>`—a runtime-capable `MailboxSender` for remote delivery. + +We'll now see how this machinery is tied together in the `MailboxSender` implementation for `DialMailboxRouter`. + +### Integration with `MailboxSender` + +`DialMailboxRouter` implements the `MailboxSender` trait, enabling it to forward messages to remote actors by resolving and caching connections dynamically: + +```rust +impl MailboxSender for DialMailboxRouter { + fn post( + &self, + envelope: MessageEnvelope, + return_handle: PortHandle<Undeliverable<MessageEnvelope>>, + ) { + let Some(addr) = self.lookup_addr(envelope.dest().actor_id()) else { + self.default.post(envelope, return_handle); + return; + }; + + match self.dial(&addr, envelope.dest().actor_id()) { + Err(err) => envelope.undeliverable( + DeliveryError::Unroutable(format!("cannot dial destination: {err}")), + return_handle, + ), + Ok(sender) => sender.post(envelope, return_handle), + } + } +} +``` +Here’s what happens step by step: +1. Address lookup: + - The destination `ActorId` is converted into a `Reference`. + - The router searches for the nearest matching prefix in the address book. + - If no match is found, the message is forwarded to the configured `default` sender. +2. Connection resolution: + - If an address is found, the router attempts to `dial` or reuse a cached `MailboxClient`. + - On error (e.g., failed dial), the message is returned to the sender with a `DeliveryError::Unroutable`. +3. Message forwarding: + - If dialing succeeds, the resulting `sender` is used to post the message. + +`DialMailboxRouter` performs prefix-based routing by resolving addresses at runtime and forwarding messages over dialed or cached connections, with fallback to a default sender.
diff --git a/docs/source/books/hyperactor-book/src/references/actor_id.md b/docs/source/books/hyperactor-book/src/references/actor_id.md new file mode 100644 index 000000000..de64fca2f --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/actor_id.md @@ -0,0 +1,87 @@ +# `ActorId` + +An `ActorId` uniquely identifies an actor within a proc. It combines the proc the actor lives on, a string name, and a numeric pid (process-local instance index). + +```rust +#[derive( + Debug, + Serialize, + Deserialize, + Clone, + PartialEq, + Eq, + PartialOrd, + Hash, + Ord, + Named +)] +pub struct ActorId(pub ProcId, pub String, pub usize); +``` +- The first field is the actor's `ProcId`. +- The second is the actor's name (used for grouping and logging). +- The third is the pid, which distinguishes multiple instances with the same name. + +### Construction + +Construct an actor ID directly: +```rust +use hyperactor::reference::{ActorId, ProcId, WorldId}; + +let proc = ProcId(WorldId("training".into()), 0); +let actor = ActorId(proc, "worker".into(), 1); +``` + +Or with the `id!` macro: +```rust +use hyperactor::id; + +let actor = id!(training[0].worker[1]); +// Equivalent to ActorId(ProcId(WorldId("training".into()), 0), "worker".into(), 1) +``` +To refer to the root actor (the canonical instance), use: +```rust +let root = ActorId::root(proc, "worker".into()); +// Equivalent to ActorId(proc, "worker".into(), 0) +``` + +### Methods + +```rust +impl ActorId { + pub fn proc_id(&self) -> &ProcId; + pub fn name(&self) -> &str; + pub fn pid(&self) -> usize; + pub fn world_name(&self) -> &str; + pub fn rank(&self) -> usize; + pub fn child_id(&self, pid: usize) -> Self; + pub fn port_id(&self, port: u64) -> PortId; + pub fn root(proc: ProcId, name: String) -> Self; +} +``` + +- `.proc_id()` returns the ProcId that owns this actor. +- `.name()` returns the logical name of the actor (e.g., "worker"). +- `.pid()` returns the actor's instance ID. 
+- `.world_name()` returns the name of the actor’s world. +- `.rank()` returns the proc rank (i.e., index) this actor runs on. +- `.child_id(pid)` creates a new `ActorId` with the same name and proc but a different pid. +- `.port_id(port)` returns a `PortId` representing a port on this actor. +- `.root(proc, name)` constructs a new root actor (`pid = 0`) in the given proc. + +### Traits + +`ActorId` implements: + +- `Display` — formats as `world[rank].name[pid]` +- `FromStr` — parses strings like `"training[0].logger[1]"` +- `Clone`, `Eq`, `Ord`, `Hash` — useful in maps, sets, and registries +- `Named` — enables type-based routing, port lookup, and reflection + +## Semantics + +- The `name` groups actors logically within a proc (e.g., `"worker"`, `"trainer"`). +- The `pid` distinguishes physical instances: + - `pid = 0` represents the **root** actor instance. + - `pid > 0` typically corresponds to **child actors** spawned by the root. +- Most routing and API surfaces operate on root actors by default. +- Port creation is always rooted in an `ActorId`, via `.port_id(...)`. diff --git a/docs/source/books/hyperactor-book/src/references/bindings.md b/docs/source/books/hyperactor-book/src/references/bindings.md new file mode 100644 index 000000000..1201c3bce --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/bindings.md @@ -0,0 +1 @@ +# Bindings diff --git a/docs/source/books/hyperactor-book/src/references/gang_id.md b/docs/source/books/hyperactor-book/src/references/gang_id.md new file mode 100644 index 000000000..e89591e1e --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/gang_id.md @@ -0,0 +1,67 @@ +# `GangId` + +A `GangId` identifies a logical group of actors with the same name across all procs in a world. It serves as a convenient shorthand for referring to all root instances of a given actor name. 
+```rust +#[derive( + Debug, + Serialize, + Deserialize, + Clone, + PartialEq, + Eq, + PartialOrd, + Hash, + Ord, + Named +)] +pub struct GangId(pub WorldId, pub String); +``` +- The first field is the WorldId. +- The second field is the shared actor name. + +A `GangId` is conceptually like saying: “the actor named X on every proc in world W.” + +## Construction + +```rust +use hyperactor::reference::{GangId, WorldId}; + +let gang = GangId(WorldId("training".into()), "logger".into()); +``` + +Or using the id! macro: +```rust +use hyperactor::id; + +let gang = id!(training.logger); +// Equivalent to GangId(WorldId("training".into()), "logger".into()) +``` + +## Methods + +```rust +impl GangId { + pub fn world_id(&self) -> &WorldId; + pub fn name(&self) -> &str; + pub fn actor_id(&self, rank: usize) -> ActorId; + pub fn expand(&self, world_size: usize) -> impl Iterator + '_; +} +``` +- `.world_id()` returns the world this gang is defined in. +- `.name()` returns the shared actor name (e.g., "logger"). +- `.actor_id(rank)` returns the root actor on that proc. +- `.expand(world_size)` yields all root ActorIds from rank `0..world_size`. + +## Semantics + +- Gangs are always composed of root actors (`pid = 0`) with a common name. +- Gang references are useful for broadcasting, coordination, or actor discovery. +- They are lightweight and purely name-based; no state is attached to a `GangId`. 
+ +## Traits + +`GangId` implements: +- `Display` — formatted as world.actor +- `FromStr` — parses from strings like "training.logger" +- `Ord`, `Eq`, `Hash` — usable in maps, registries, and routing +- `Named` — enables type registration and metadata lookup diff --git a/docs/source/books/hyperactor-book/src/references/gangs.md b/docs/source/books/hyperactor-book/src/references/gangs.md new file mode 100644 index 000000000..d51be4373 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/gangs.md @@ -0,0 +1 @@ +# Gangs diff --git a/docs/source/books/hyperactor-book/src/references/index.md b/docs/source/books/hyperactor-book/src/references/index.md new file mode 100644 index 000000000..48e79f037 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/index.md @@ -0,0 +1,24 @@ +# References + +This section documents the reference system used throughout hyperactor to identify and communicate with distributed entities. + +References are lightweight, serializable identifiers for **worlds**, **procs**, **actors**, **ports**, and **gangs**. They are the backbone of addressing and routing in the runtime. Whether you're sending a message, spawning an actor, or broadcasting to a group, references are how you name things. + +The reference system is: + +- **Uniform**: All references follow a shared syntax and structure. +- **Parsable**: References can be round-tripped from strings and manipulated programmatically. +- **Typed**: While the `Reference` enum is typeless and dynamic, typed references like `ActorRef` and `PortRef` allow safe interaction in APIs. +- **Orderable**: References implement a total order, enabling prefix-based routing and sorted maps.
+ +In this section, we’ll cover: + +- The [syntax](syntax.md) and string format of references +- The core reference types: + - [`WorldId`](world_id.md) + - [`ProcId`](proc_id.md) + - [`ActorId`](actor_id.md) + - [`PortId`](port_id.md) + - [`GangId`](gang_id.md) +- The [Reference](reference.md), which unifies all reference variants + - [Typed references](typed_refs.md) used in APIs: `ActorRef`, `PortRef`, and `OncePortRef` diff --git a/docs/source/books/hyperactor-book/src/references/port_id.md b/docs/source/books/hyperactor-book/src/references/port_id.md new file mode 100644 index 000000000..2c93edd9a --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/port_id.md @@ -0,0 +1,61 @@ +# `PortId` + +A `PortId` identifies a specific port on a particular actor. Ports are the entry points through which messages are delivered to an actor, and each `PortId` is globally unique. + +```rust +#[derive( + Debug, + Serialize, + Deserialize, + Clone, + PartialEq, + Eq, + PartialOrd, + Hash, + Ord, + Named +)] +pub struct PortId(pub ActorId, pub u64); +``` +- The first field is the owning `ActorId`. +- The second field is the port number (`u64`), typically derived from the message type’s registered port. + +## Construction + +```rust +use hyperactor::reference::{PortId, ActorId}; + +let port = PortId(actor, 42); +``` +Or via the `id!` macro: +```rust +use hyperactor::id; + +let port = id!(training[0].logger[1][42]); +// Equivalent to PortId(ActorId(...), 42) +``` +You can also construct a PortId from an `ActorId` using `.port_id(...)`: +```rust +let port = actor.port_id(42); +``` + +## Methods + +```rust +impl PortId { + pub fn actor_id(&self) -> &ActorId; + pub fn index(&self) -> u64; + pub fn into_actor_id(self) -> ActorId; +} +``` +- `.actor_id()` returns the owning actor. +- `.index()` returns the port number. +- `.into_actor_id()` discards the port index and yields the owning actor ID. 
+ +## Traits + +`PortId` implements: +- `Display` — formatted as `world[rank].actor[pid][port]` +- `FromStr` — parses from strings like `"training[0].logger[1][42]"` +- `Ord`, `Eq`, `Hash` — usable as map keys or for dispatch +- `Named` — supports reflection and typed messaging diff --git a/docs/source/books/hyperactor-book/src/references/proc_id.md b/docs/source/books/hyperactor-book/src/references/proc_id.md new file mode 100644 index 000000000..5899044d2 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/proc_id.md @@ -0,0 +1,59 @@ +# `ProcId` + +A `ProcId` identifies a single runtime instance within a world. All actors exist within a proc, and message routing between actors is scoped by the proc’s identity. +```rust +#[derive( + Debug, + Serialize, + Deserialize, + Clone, + PartialEq, + Eq, + PartialOrd, + Hash, + Ord, + Named +)] +pub struct ProcId(pub WorldId, pub usize); +``` + +## Construction + +You can construct a `ProcId` directly: +```rust +use hyperactor::reference::{WorldId, ProcId}; + +let proc = ProcId(WorldId("training".into()), 0); +``` +Or statically using the `id!` macro: +```rust +use hyperactor::id; + +let proc = id!(training[0]); // Equivalent to ProcId(WorldId("training".into()), 0) +``` + +## Methods + +```rust +impl ProcId { + pub fn world_id(&self) -> &WorldId; + pub fn world_name(&self) -> &str; + pub fn rank(&self) -> usize; + pub fn actor_id(&self, name: impl Into<String>, pid: usize) -> ActorId; +} +``` +- `.world_id()` gives the `WorldId` this proc belongs to. +- `.rank()` returns the proc’s index. +- `.actor_id(name, pid)` constructs an `ActorId` for an actor hosted on this proc. + +## Notes + +Ranks greater than or equal to `1 << (usize::BITS - 1)` are considered user-space procs. These are typically created with `WorldId::random_user_proc()` and are not assigned by the system.
+ +## Traits + +ProcId implements: +- `Display` — formatted as `world[rank]` +- `FromStr` — parses from strings like "training[0]" +- `Ord`, `Eq`, `Hash` — usable in maps and sorted structures +- `Named` — enables port lookup and type reflection diff --git a/docs/source/books/hyperactor-book/src/references/reference.md b/docs/source/books/hyperactor-book/src/references/reference.md new file mode 100644 index 000000000..1a5ea8abb --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/reference.md @@ -0,0 +1,83 @@ +# `Reference` + +The `Reference` enum is a type-erased, unified representation of all addressable entities in hyperactor. It provides a common format for parsing, logging, routing, and transport. + +```rust +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash, Named)] +pub enum Reference { + World(WorldId), + Proc(ProcId), + Actor(ActorId), + Port(PortId), + Gang(GangId), +} +``` +Each variant wraps one of the concrete identifier types: +- [`WorldId`](world_id.md) +- [`ProcId`](proc_id.md) +- [`ActorId`](actor_id.md) +- [`PortId`](port_id.md) +- [`GangId`](gang_id.md) + +## Use Cases + +- Used to represent references in a uniform way (e.g., CLI args, config, logs). +- Returned by `.parse::()` when parsing from string. +- Enables prefix-based comparisons for routing or scoping. +- Can be converted `to`/`from` the concrete types via `From`. + +## Construction + +From concrete types: +```rust +use hyperactor::reference::{Reference, ActorId}; + +let actor_id = ...; +let reference: Reference = actor_id.into(); +``` +From a string: +```rust +let reference: Reference = "training[0].logger[1][42]".parse().unwrap(); +``` +You can match on the reference to access the underlying type: +```rust +match reference { + Reference::Actor(actor_id) => { /* ... */ } + Reference::Port(port_id) => { /* ... 
*/ } + _ => {} +} +``` + +## Methods + +```rust +impl Reference { + pub fn is_prefix_of(&self, other: &Reference) -> bool; + pub fn world_id(&self) -> &WorldId; + pub fn proc_id(&self) -> Option<&ProcId>; + pub fn actor_id(&self) -> Option<&ActorId>; + pub fn actor_name(&self) -> Option<&str>; + pub fn pid(&self) -> Option; + pub fn port(&self) -> Option; +} +``` +- `.is_prefix_of(other)` checks whether one reference is a prefix of another (e.g., `WorldId` -> `ProcId` -> `ActorId`). +- `.world_id()` returns the reference’s associated world. +- `.proc_id()` and `.actor_id()` return their corresponding IDs if applicable. +- `.actor_name()`, `.pid()`, and `.port()` expose reference components across variants. + +## Ordering + +Reference implements a total order across all variants. Ordering is defined lexicographically: +```rust +(world_id, rank, actor_name, pid, port) +``` +This allows references to be used in sorted maps or for prefix-based routing schemes. + +## Traits + +Reference implements: +- `Display` — formats to the same syntax accepted by `FromStr` +- `FromStr` — parses strings like `"world[1].actor[2][port]"` +- `Ord`, `Eq`, `Hash` — useful in sorted/routed contexts +- `Named` — used for port assignment, reflection, and runtime dispatch diff --git a/docs/source/books/hyperactor-book/src/references/syntax.md b/docs/source/books/hyperactor-book/src/references/syntax.md new file mode 100644 index 000000000..c6cf57a95 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/syntax.md @@ -0,0 +1,55 @@ +# Syntax + +References in Hyperactor follow a uniform concrete syntax that can be written as strings, parsed at runtime, or constructed statically using the `id!` macro. 
+ +## String Form + +The canonical string syntax supports hierarchical references, from worlds down to ports: +```text +world +world[rank] +world[rank].actor // actor[0] +world[rank].actor[pid] +world[rank].actor[pid][port] +world.actor // gang reference +``` + +These forms can be used wherever a reference is accepted as a string, such as command-line arguments, config files, and logs. + +Examples: + +- `training` — world ID +- `training[0]` — proc 0 in world `training` +- `training[0].logger[1]` — actor named `logger`, pid 1 +- `training[0].logger[1][42]` — port 42 of that actor +- `training.logger` — gang reference + +The parser is robust and fails clearly on invalid syntax. + +## Runtime Parsing + +The `Reference` type implements `FromStr`, so you can parse strings into references: + +```rust +use hyperactor::reference::Reference; + +let r: Reference = "training[2].worker[0]".parse().unwrap(); +``` + +It returns a strongly typed enum: `Reference::Actor`, `Reference::Port`, etc. + +## Static Construction with `id!` + +You can also construct references statically using the `id!` macro. This macro uses the same concrete syntax: +```rust +use hyperactor::id; +use hyperactor::reference::{WorldId, ProcId, ActorId, PortId, GangId}; + +let w: WorldId = id!(training); +let p: ProcId = id!(training[0]); +let a: ActorId = id!(training[0].logger[1]); +let port: PortId = id!(training[0].logger[1][42]); +let g: GangId = id!(training.logger); +``` + +The macro expands to correct type constructors and ensures compile-time validity. The `id!()` macro does not produce a `Reference` enum-it constructs the corresponding concrete type directly (e.g., `WorldId`, `ProcId`, `ActorId`). This contrasts with parsing, which always yields a `Reference`. 
diff --git a/docs/source/books/hyperactor-book/src/references/typed_refs.md b/docs/source/books/hyperactor-book/src/references/typed_refs.md new file mode 100644 index 000000000..d96501bc5 --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/typed_refs.md @@ -0,0 +1,106 @@ +# Typed References + +Typed references are strongly typed wrappers over raw identifiers like `ActorId` and `PortId`. These types are used throughout hyperactor’s APIs; as parameters in messages, return values from `bind()` methods, and elements in routing decisions. They make distributed communication safe, expressive, and statically checked. + +## Overview + +There are three main typed reference types: + +- [`ActorRef`](#actorrefa): A typed reference to an actor implementing the `RemoteActor` trait. +- [`PortRef`](#portrefm): A reference to a reusable mailbox port for messages of type `M` implementing the `RemoteMessage` trait. +- [`OncePortRef`](#onceportrefm): A reference to a one-shot port for receiving a single response of type `M` implementing the `RemoteMessage` trait. + +These types are used as parameters in messages, return values from bindings, and components of the routing system. + +--- + +## `ActorRef` + +`ActorRef` is a typed reference to an actor of type `A`. It provides a way to identify and address remote actors that implement `RemoteActor`. + +```rust +let actor_ref: ActorRef = ActorRef::attest(actor_id); +``` + +> **Note**: While `ActorRef::attest` can be used to construct a reference from an `ActorId`, it should generally be avoided. Instead, prefer using the `ActorRef` returned from `ActorHandle::bind()`, which guarantees that the actor is actually running and bound to a mailbox. `attest` is unsafe in the sense that it bypasses that guarantee. + +Unlike `ActorHandle`, an `ActorRef` is just a reference — it doesn’t guarantee that the actor is currently running. It's primarily used for routing and type-safe messaging across `Proc`s. 
+ +### Definition +```rust +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct ActorRef { + actor_id: ActorId, + phantom: PhantomData, +} +``` +This type is a thin wrapper around an `ActorId`, with a phantom type `A` to track which actor it refers to. It ensures you can only send messages supported by the actor's declared `RemoteHandles`. + +## `PortRef` + +`PortRef` refers to a mailbox port for messages of type `M`. +```rust +let (port, mut receiver) = actor.open_port::(); +let port_ref: PortRef = port.bind(); +``` + +This allows the port to be sent across the network or passed into other messages. On the receiving end, `PortRef` can be used to deliver messages of the expected type. + +### Definition + +```rust +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct PortRef { + port_id: PortId, + reducer_spec: Option, + phantom: PhantomData, +} +``` +As with `ActorRef`, this is a typed wrapper around a raw identifier (`PortId`), carrying a phantom type for safety. It ensures that only messages of type `M` can be sent through this reference. + +## `OncePortRef` + +A `OncePortRef` is like a `PortRef`, but designed for exactly one response. Once used, it cannot be reused or cloned. +```rust +let (once_port, receiver) = actor.open_once_port::(); +let once_ref = once_port.bind(); +``` +These are commonly used for request/response interactions, where a single reply is expected. + +### Definition + +```rust +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct OncePortRef { + port_id: PortId, + phantom: PhantomData, +} +``` +Just like `PortRef`, this wraps a `PortId` with a phantom message type `M` for type safety. Internally, the system enforces one-time delivery semantics, ensuring the port is closed after receiving a single message. 
+ +## `GangRef` + +A `GangRef` is a typed reference to a gang of actors, all of which implement the same `RemoteActor` type `A`. +```rust +let gang_ref: GangRef = GangId::new(...).into(); +``` +You can extract an `ActorRef` for a specific rank in the gang: +```rust +let actor = gang_ref.rank(0); // ActorRef +``` +This allows you to route messages to specific members of a replicated actor group, or to iterate over the gang for broadcasting, synchronization, or indexing. + +### Definition +```rust +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Hash, Ord)] +pub struct GangRef { + gang_id: GangId, + phantom: PhantomData, +} +``` + +#### Methods +- `gang_ref.rank(rank: usize) -> ActorRef` +Returns a typed actor reference for the actor at the given rank in the gang. The method doesn’t validate the rank, so correctness is up to the caller. +- `gang_ref.gang_id() -> &GangId` +Returns the underlying, untyped gang identifier. diff --git a/docs/source/books/hyperactor-book/src/references/world_id.md b/docs/source/books/hyperactor-book/src/references/world_id.md new file mode 100644 index 000000000..0b238de7f --- /dev/null +++ b/docs/source/books/hyperactor-book/src/references/world_id.md @@ -0,0 +1,54 @@ +# `WorldId` + +A `WorldId` defines the top-level namespace for procs and actors. All procs, actors, ports, and gangs exist within a world. 
+```rust +#[derive( + Debug, + Serialize, + Deserialize, + Clone, + PartialEq, + Eq, + PartialOrd, + Hash, + Ord, + Named +)] +pub struct WorldId(pub String); +``` + +## Construction + +A `WorldId` wraps a string and can be created directly: +```rust +use hyperactor::reference::WorldId; + +let world = WorldId("training".into()); +``` +Or statically using the `id!` macro: +```rust +use hyperactor::id; + +let world = id!(training); // Equivalent to WorldId("training".into()) +``` + +## Methods + +```rust +impl WorldId { + pub fn name(&self) -> &str; + pub fn proc_id(&self, index: usize) -> ProcId; + pub fn random_user_proc(&self) -> ProcId; +} +``` +- `.name()` returns the world name string. +- `.proc_id(index)` constructs a `ProcId` rooted in this world. +- `.random_user_proc()` generates a `ProcId` with the high bit set, marking it as a user-space proc ID. + +## Traits + +`WorldId` implements: +- `Display` — string form is just the world name +- `FromStr` — parses from "training" into WorldId("training") +- `Ord`, `Eq`, `Hash` — suitable for use as map/set keys +- `Named` — used for type reflection and message dispatch diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..1a5d156c7 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,243 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "Monarch" +copyright = "2025" +author = "" +release = "" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx_design", + "sphinx_sitemap", + "sphinxcontrib.mermaid", + "pytorch_sphinx_theme2", + "sphinxext.opengraph", + "myst_parser", + # "nbsphinx", + "sphinx_gallery.gen_gallery", + # "myst_nb", + #"sphinx_rust", + +] + +sphinx_gallery_conf = { + "examples_dirs": [ + "./examples", + ], # path to your example scripts + "gallery_dirs": "./generated/examples", # path to where to save gallery generated output + "filename_pattern": ".*\.py$", # Include all Python files + "ignore_pattern": "__init__\.py", # Exclude __init__.py files + "plot_gallery": "False", # Don't run the examples + # "thumbnail_size": (400, 400), # Thumbnail size + # "download_all_examples": True, # Allow downloading all examples + # "line_numbers": True, # Show line numbers in code blocks + # "remove_config_comments": True, # Remove configuration comments + # "show_memory": False, # Don't show memory usage + # "show_signature": True, # Show function signatures +} + + + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +import os +import sys + +# Add the repository root to the path so Sphinx can find the notebook files +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("../..")) +import pytorch_sphinx_theme2 + +html_theme = 
"pytorch_sphinx_theme2" +html_theme_path = [pytorch_sphinx_theme2.get_html_theme_path()] + +ogp_site_url = "http://pytorch.org/monarch" +ogp_image = "https://pytorch.org/assets/images/social-share.jpg" + +html_theme_options = { + "navigation_with_keys": False, + "analytics_id": "GTM-T8XT4PS", + "logo": { + "text": "", + }, + "icon_links": [ + { + "name": "X", + "url": "https://x.com/PyTorch", + "icon": "fa-brands fa-x-twitter", + }, + { + "name": "GitHub", + "url": "https://github.com/pytorch-labs/monarch", + "icon": "fa-brands fa-github", + }, + { + "name": "Discourse", + "url": "https://dev-discuss.pytorch.org/", + "icon": "fa-brands fa-discourse", + }, + { + "name": "PyPi", + "url": "https://pypi.org/project/monarch/", + "icon": "fa-brands fa-python", + }, + ], + "use_edit_page_button": True, + "navbar_center": "navbar-nav", +} + +theme_variables = pytorch_sphinx_theme2.get_theme_variables() +templates_path = [ + "_templates", + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), +] + +html_context = { + "theme_variables": theme_variables, + "display_github": True, + "github_url": "https://github.com", + "github_user": "pytorch-labs", + "github_repo": "monarch", + "feedback_url": "https://github.com/pytorch-labs/monarch", + "github_version": "main", + "doc_path": "docs/source", + "library_links": theme_variables.get("library_links", []), + "community_links": theme_variables.get("community_links", []), + "language_bindings_links": html_theme_options.get("language_bindings_links", []), +} + +# Enable MyST extensions for markdown files including those in books +myst_enable_extensions = [ + "colon_fence", + "deflist", + "html_image", +] + +# Add the books directory to the Sphinx path +books_path = os.path.abspath("../../books") + +# Configure Sphinx to include the books directory +exclude_patterns = [] # Reset exclude_patterns to ensure books are included + +# Add the books directory to the Sphinx path +import sphinx.util.matching 
+html_extra_path = [] +html_static_path = ["_static"] + +# Configure MyST-Parser to find markdown files in the books directory +myst_update_mathjax = False +myst_heading_anchors = 3 + +# Add the books directory to the Sphinx search path +import os +books_path = os.path.abspath("../../books") + + +# The suffix(es) of source filenames. +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +# Configure MyST-Parser to properly handle relative links in the books directory +myst_url_schemes = ["http", "https", "mailto"] + +# Add the books directory to the Sphinx extensions path +import os +import sys +sys.path.insert(0, os.path.abspath("../../books")) + +# Allow errors in notebook execution +nbsphinx_allow_errors = True + + +def truncate_index_file_at_raw_html(file_path): + """ + Truncate the Sphinx-Gallery index file at the first occurrence of the + raw HTML div with class 'sphx-glr-thumbnails'. + + Parameters: + - file_path (str): The path to the index file to be truncated. + """ + try: + with open(file_path, "r") as file: + lines = file.readlines() + + # Find the index of the first occurrence of the target lines + target_lines = [ + ".. raw:: html\n", + "\n", + '
\n', + ] + + # Search for the sequence in the lines + truncate_index = None + for i in range(len(lines) - len(target_lines) + 1): + if lines[i : i + len(target_lines)] == target_lines: + truncate_index = i + break + + if truncate_index is not None: + truncated_lines = lines[:truncate_index] + with open(file_path, "w") as file: + file.writelines(truncated_lines) + print(f"File {file_path} truncated at line {truncate_index}.") + else: + print( + f"Target raw HTML block not found in {file_path}. No truncation done." + ) + + except Exception as e: + print(f"An error occurred while truncating the file: {e}") + + +# Truncate the Sphinx-Gallery index file at the first occurrence of raw HTML +def truncate_gallery_index_file(app): + """ + This function runs at the beginning of the build process to truncate the index.rst file. + It first checks if the file exists, and if not, it runs sphinx-gallery to generate it. + """ + # Use the source directory path + index_file = os.path.join(app.srcdir, "examples", "index.rst") + + # Check if the file exists + if os.path.exists(index_file): + # Truncate the file + truncate_index_file_at_raw_html(index_file) + print(f"Truncated existing file: {index_file}") + else: + print( + f"File {index_file} does not exist yet. It will be generated during the build process." 
+ ) + + +def setup(app): + # Connect to the builder-inited event, which runs at the beginning of the build process + app.connect("builder-inited", truncate_gallery_index_file) + + # Also connect to the build-finished event as a backup + app.connect( + "build-finished", + lambda app, exception: ( + truncate_index_file_at_raw_html( + os.path.join(app.srcdir, "examples", "index.rst") + ) + if exception is None + and os.path.exists(os.path.join(app.srcdir, "examples", "index.rst")) + else None + ), + ) diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst new file mode 100644 index 000000000..066e39e89 --- /dev/null +++ b/docs/source/examples/README.rst @@ -0,0 +1,22 @@ +Monarch Examples +================ + +This directory contains examples demonstrating how to use Monarch for distributed computing. + +Python Script Examples +---------------------- + +These examples are formatted for sphinx-gallery and will be automatically converted to HTML documentation: + +- ``ping_pong.py``: Demonstrates the basics of Monarch's Actor/endpoint API with a ping-pong communication example +- ``spmd_ddp.py``: Shows how to run PyTorch's Distributed Data Parallel (DDP) within Monarch actors +- ``grpo_actor.py``: Implements a distributed PPO-like reinforcement learning algorithm using the Monarch actor framework + +Running Examples +---------------- + +To run any example: + +.. code-block:: bash + + python examples/example_name.py diff --git a/docs/source/examples/__init__.py b/docs/source/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/docs/source/examples/grpo_actor.py b/docs/source/examples/grpo_actor.py new file mode 100644 index 000000000..5b8f4e260 --- /dev/null +++ b/docs/source/examples/grpo_actor.py @@ -0,0 +1,531 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved.
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Distributed PPO-like Reinforcement Learning with Monarch Actors +=======================================================>>>>>>> REPLACE + +This example demonstrates implementing a distributed PPO-like reinforcement learning +algorithm using the Monarch actor framework. The implementation features: + +- Distributed actor architecture with Generator, Scorer, and Learner components +- Asynchronous communication via queues +- RDMA-based weight synchronization +- Event-driven architecture for efficient processing + +The example shows how to: +- Set up distributed actors on separate GPU meshes +- Implement policy gradient methods in a distributed setting +- Use RDMA buffers for efficient parameter sharing +- Create an asynchronous training loop with multiple components +""" + +import asyncio +import copy +import random +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.optim as optim + +from monarch.actor import Actor, endpoint, proc_mesh +from monarch.rdma import RDMABuffer +from torch.distributions import Categorical, kl_divergence + +""" +Online reinforcement learning (RL) training loop using the Monarch actor framework. + +This example implements a distributed PPO-like algorithm with three main components: +1. Generator: Produces actions using the current policy and sends them for scoring +2. Scorer: Evaluates actions and assigns rewards +3. Learner: Updates policy based on collected experiences + +Key features demonstrated: +- Distributed actors on separate GPU meshes +- Asynchronous communication via queues +- RDMA-based weight synchronization +- Event-driven architecture +""" +G = 8 # group size +STATE_DIM = 4 +ACTION_DIM = 4 # vocab size + + +@dataclass +class TrajectorySlice: + """Single trajectory from one generator call. 
+ + Attributes: + policy_version: Version of policy that produced this slice + state: Input state tensor [STATE_DIM] + actions: Generated actions [G] + old_logps: Log probabilities of actions under generation policy [G] + rewards: Rewards for each action (initially zeros, filled by Scorer) [G] + """ + + policy_version: int + state: torch.Tensor + actions: torch.Tensor + old_logps: torch.Tensor + rewards: torch.Tensor + + +@dataclass +class TrainingBatch: + """Batch of trajectories for training. + + Attributes: + states: Batched states [batch_size, STATE_DIM] + actions: Batched actions [batch_size * G] + old_logps: Batched log probabilities [batch_size * G] + rewards: Batched rewards [batch_size * G] + policy_versions: List of policy versions for each slice + """ + + states: torch.Tensor + actions: torch.Tensor + old_logps: torch.Tensor + rewards: torch.Tensor + policy_versions: List[int] + + +class TrajectoryQueue(Actor): + """Queue for trajectory slices between Generator and Scorer.""" + + def __init__(self): + """Initialize an empty queue.""" + self.queue: asyncio.Queue[TrajectorySlice] = asyncio.Queue() + + @endpoint + async def put(self, slice: TrajectorySlice) -> None: + """Add a trajectory slice to the queue. + + Args: + slice: The trajectory slice to add + """ + await self.queue.put(slice) + + @endpoint + async def get(self) -> TrajectorySlice: + """Remove and return a trajectory slice from the queue. + + Returns: + The next trajectory slice in the queue + """ + return await self.queue.get() + + +class ReplayBuffer(Actor): + """Storage for scored trajectory slices with weighted sampling.""" + + def __init__(self): + """Initialize an empty buffer.""" + self.storage: List[Tuple[int, TrajectorySlice]] = [] # (version, slice) + self.storage_event = asyncio.Event() + + @endpoint + async def put(self, slice: TrajectorySlice) -> None: + """Add a trajectory slice to the buffer. 
+ + Args: + slice: The trajectory slice to add + """ + self.storage.append((slice.policy_version, slice)) + self.storage_event.set() + + async def _wait_for_storage(self): + if not self.storage: + await self.storage_event.wait() + + @endpoint + async def sample_from(self, k: int) -> List[TrajectorySlice]: + """Sample k trajectory slices using weighted sampling. + + Items from newer policy versions have higher probability of being selected. + If the buffer is empty, waits for it to be populated with a timeout. + + Args: + k: Number of slices to sample + + Returns: + List of sampled trajectory slices + + Raises: + RuntimeError: If buffer is empty after timeout + """ + try: + await asyncio.wait_for(self._wait_for_storage(), timeout=10.0) + except asyncio.TimeoutError: + raise RuntimeError("Timeout waiting for ReplayBuffer to be populated") + + # Extract policy versions and add 1 to ensure all weights are positive + policy_versions = [version + 1 for version, _ in self.storage] + + # Use policy versions as weights for sampling + total = sum(policy_versions) + probs = [v / total for v in policy_versions] + + # Sample indices based on policy version weights + indices = list(range(len(self.storage))) + chosen_indices = random.choices(indices, weights=probs, k=k) + + return [self.storage[i][1] for i in chosen_indices] + + +class Scorer(Actor): + """Evaluates actions and assigns rewards to trajectory slices.""" + + def __init__(self, trajectory_queue: Any, replay_buffer: Any): + """Initialize the scorer. + + Args: + trajectory_queue: Queue to pull trajectory slices from + replay_buffer: Buffer to store scored slices in + """ + self.trajectory_queue = trajectory_queue + self.replay_buffer = replay_buffer + self.net = nn.Sequential( + nn.Linear(STATE_DIM + 1, 8), + nn.Tanh(), + nn.Linear(8, 1), + ).to("cuda") + self.running = False + + async def _score_slice(self, slice: TrajectorySlice) -> None: + """Score a trajectory slice and store it in the replay buffer. 
+ + Args: + slice: The trajectory slice to score + """ + s = slice.state.to("cuda").unsqueeze(0).repeat(G, 1) + a = slice.actions.to("cuda").float().unsqueeze(-1) + rewards = self.net(torch.cat([s, a], dim=-1)).squeeze(-1).cpu() + + scored = TrajectorySlice( + policy_version=slice.policy_version, + state=slice.state, + actions=slice.actions, + old_logps=slice.old_logps, + rewards=rewards, + ) + await self.replay_buffer.put.call(scored) + + @endpoint + async def run(self) -> None: + """Start the scoring event loop. + + Continuously pulls slices from the queue, scores them, + and puts them in the replay buffer until stopped. + """ + if self.running: + return + + self.running = True + try: + while self.running: + try: + slice_ = await asyncio.wait_for( + self.trajectory_queue.get.call_one(), + timeout=1.0, + ) + await self._score_slice(slice_) + except asyncio.TimeoutError: + continue + except Exception as e: + print(f"Scorer event loop error: {e}") + finally: + self.running = False + + @endpoint + async def stop(self) -> None: + """Stop the scoring event loop.""" + self.running = False + + +class Learner(Actor): + """Updates policy based on collected experiences using PPO algorithm.""" + + def __init__(self, replay_buffer: Any): + """Initialize the learner. 
+ + Args: + replay_buffer: Buffer to sample experiences from + """ + # Policy network and reference network for KL divergence + self.model = nn.Sequential( + nn.Linear(STATE_DIM, 16), nn.Tanh(), nn.Linear(16, ACTION_DIM) + ).to("cuda") + self.ref_model = copy.deepcopy(self.model) + for p in self.ref_model.parameters(): + p.requires_grad = False + self.ref_model.eval() + + # Optimization parameters + self.optim = optim.Adam(self.model.parameters(), lr=1e-3, eps=1e-5) + self.eps = 0.2 # PPO clipping parameter + self.kl_coeff = 0.1 # KL divergence coefficient + self.policy_version = 0 + self.replay_buffer = replay_buffer + self.batch_size = 2 + self.generators: Optional[Any] = None + + @endpoint + async def init_generators(self, generators: Any) -> None: + """Set the generators service for weight updates. + + Args: + generators: Service to notify of policy updates + """ + self.generators = generators + + @endpoint + async def weights_handle(self) -> Dict[str, RDMABuffer]: + """Create RDMA buffers for model weights. + + Returns: + Dictionary mapping parameter names to RDMA buffers + """ + return { + k: RDMABuffer(v.view(torch.uint8).flatten()) + for k, v in self.model.state_dict().items() + } + + def _compute_advantages(self, rewards: torch.Tensor) -> torch.Tensor: + """Compute advantages from rewards. + + In PPO, advantages represent how much better an action is compared to the average. + Here we compute advantages by subtracting a baseline (mean reward) from the rewards + and then normalizing to stabilize training. 
+ + Args: + rewards: Raw rewards tensor [batch_size * G] + + Returns: + Advantages tensor [batch_size * G] + """ + # First, reshape rewards to [batch_size, G] to compute per-state baseline + batch_size = rewards.shape[0] // G + rewards_reshaped = rewards.view(batch_size, G) + + # Compute baseline (mean reward) for each state + baselines = rewards_reshaped.mean(dim=1, keepdim=True) # [batch_size, 1] + + # Subtract baseline from rewards to get advantages + advantages = rewards_reshaped - baselines # [batch_size, G] + + # Reshape back to original shape + advantages = advantages.reshape(-1) # [batch_size * G] + + # Normalize advantages for training stability + if advantages.numel() > 1: # Check if we have more than one element + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + return advantages + + def _apply_policy_update( + self, + states: torch.Tensor, + actions: torch.Tensor, + old_logps: torch.Tensor, + advantages: torch.Tensor, + ) -> torch.Tensor: + """Apply PPO update to policy network. 
+ + Args: + states: Batch of states + actions: Batch of actions + old_logps: Log probabilities from old policy + advantages: Normalized advantages + + Returns: + Loss value + """ + # Compute new policy distribution and log probabilities + dist_new = Categorical(logits=self.model(states)) + new_logps = dist_new.log_prob(actions) + + # PPO clipped objective + ratio = (new_logps - old_logps).exp() + unclipped = ratio * advantages + clipped = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantages + ppo_loss = -torch.min(unclipped, clipped).mean() + + # KL penalty to prevent large policy updates + with torch.no_grad(): + ref_logits = self.ref_model(states) + kl = kl_divergence(Categorical(logits=ref_logits), dist_new).mean() + + # Update policy + loss = ppo_loss + self.kl_coeff * kl + self.optim.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) + self.optim.step() + self.policy_version += 1 + + return loss.detach() + + @endpoint + async def step(self) -> torch.Tensor: + """Perform one training step. 
+ + Returns: + Loss value from the update + """ + # Notify generators of current policy version + if self.generators: + await self.generators.update.call(self.policy_version) + + # Sample and process trajectory slices + slices = await self.replay_buffer.sample_from.call_one(self.batch_size) + raw_states = torch.stack([s.state for s in slices]) + actions = torch.cat([s.actions for s in slices]) + old_logps = torch.cat([s.old_logps for s in slices]) + rewards = torch.cat([s.rewards for s in slices]) + + # Prepare tensors for update + states = raw_states.repeat_interleave(G, 0).to("cuda") + actions, old_logps, rewards = [ + x.to("cuda") for x in (actions, old_logps, rewards) + ] + + # Compute advantages and update policy + advs = self._compute_advantages(rewards) + return self._apply_policy_update(states, actions, old_logps, advs) + + +class GeneratorState: + """States for the Generator's state machine.""" + + READY_TO_GENERATE = "READY_TO_GENERATE" + READY_TO_UPDATE = "READY_TO_UPDATE" + + +class Generator(Actor): + """Generates actions using the current policy. + + Maintains a copy of the policy network that is synchronized with the Learner + via RDMA buffers. Generates actions for given states and sends them to the + trajectory queue for scoring. + """ + + def __init__(self, weight_buffers, trajectory_queue): + """Initialize the generator. + + Args: + weight_buffers: RDMA buffers for policy weights + trajectory_queue: Queue to put generated trajectories in + """ + self.model = nn.Sequential( + nn.Linear(STATE_DIM, 16), nn.Tanh(), nn.Linear(16, ACTION_DIM) + ).to("cuda") + self.weight_buffers = weight_buffers + self.trajectory_queue = trajectory_queue + self.state = GeneratorState.READY_TO_GENERATE + self.cond = asyncio.Condition() + self.policy_version = 0 + + @endpoint + async def generate(self, state: torch.Tensor) -> None: + """Generate actions for a given state. 
+ + Args: + state: Input state tensor [STATE_DIM] + """ + async with self.cond: + # Wait until ready to generate + await self.cond.wait_for( + lambda: self.state == GeneratorState.READY_TO_GENERATE + ) + + # Generate actions using current policy + x = state.to("cuda").unsqueeze(0).repeat(G, 1) + dist = Categorical(logits=self.model(x)) + acts = dist.sample() + logps = dist.log_prob(acts) + + # Create trajectory slice + slice_ = TrajectorySlice( + self.policy_version, + state, + acts, + logps, + torch.zeros(G), + ) + + # Send to trajectory queue for scoring + await self.trajectory_queue.put.call(slice_) + + async with self.cond: + # Signal ready for update + self.state = GeneratorState.READY_TO_UPDATE + self.cond.notify_all() + + @endpoint + async def update(self, version: int) -> None: + """Update policy weights from RDMA buffers. + + Args: + version: New policy version number + """ + async with self.cond: + # Copy weights from RDMA buffers + sd = self.model.state_dict() + for n, b in self.weight_buffers.items(): + await b.read_into(sd[n].view(torch.uint8).flatten()) + self.model.load_state_dict(sd) + + # Update version and state + self.policy_version = version + self.state = GeneratorState.READY_TO_GENERATE + self.cond.notify_all() + + +async def main(): + """Run the distributed reinforcement learning training loop.""" + # Create process meshes for different components + learner_mesh = await proc_mesh(gpus=1) + gen_mesh = await proc_mesh(gpus=2) + + # Spawn actors on the learner mesh + traj_q = await learner_mesh.spawn("traj", TrajectoryQueue) + replay_buf = await learner_mesh.spawn("rb", ReplayBuffer) + learner = await learner_mesh.spawn("learner", Learner, replay_buf) + scorer = await learner_mesh.spawn("scorer", Scorer, traj_q, replay_buf) + + # Get weight buffers and spawn generators on the generator mesh + wb = await learner.weights_handle.call_one() + generators = await gen_mesh.spawn( + "generator", + Generator, + wb, + traj_q, + ) + await 
learner.init_generators.call(generators) + + # Start the scorer event loop in the background + scorer_run_future = scorer.run.call_one() + + # Training loop + for step in range(5): + state = torch.randn(STATE_DIM) + # Generate actions and update policy in parallel + _, loss = await asyncio.gather( + generators.generate.call(state), + learner.step.call_one(), + ) + print(f"[Step {step:02d}] loss={loss:.3f}") + + # Clean up + await scorer.stop.call_one() + await scorer_run_future + print("✅ done") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/source/examples/ping_pong.py b/docs/source/examples/ping_pong.py new file mode 100644 index 000000000..9fcb179d3 --- /dev/null +++ b/docs/source/examples/ping_pong.py @@ -0,0 +1,133 @@ +""" +Monarch Actor API: Ping Pong Example +==================================== + +This example demonstrates the basics of Monarch's Actor/endpoint API, which provides +a generic interface for distributed computing. We'll cover: + +- Creating and spawning actors in process meshes +- Calling endpoints on actors +- Actor-to-actor communication with a ping-pong example +""" + +# %% +# Hello World +# ----------- +# Actors are spawned in Process meshes via the `monarch.actor` API. For those familiar with +# distributed systems, it can be helpful to think of each Actor as a server with endpoints +# that can be called. 
+ +import asyncio + +from monarch.actor import proc_mesh, ProcMesh +from monarch.actor import Actor, endpoint, current_rank + +NUM_ACTORS = 4 + +class ToyActor(Actor): + def __init__(self): + self.rank = current_rank().rank + + @endpoint + async def hello_world(self, msg): + print(f"Identity: {self.rank}, {msg=}") + +# Note: Meshes can be also be created on different nodes, but we're ignoring that in this example +async def create_toy_actors(): + local_proc_mesh = await proc_mesh(gpus=NUM_ACTORS) + # This spawns 4 instances of 'ToyActor' + toy_actor = await local_proc_mesh.spawn("toy_actor", ToyActor) + return toy_actor, local_proc_mesh + +# %% +# Once actors are spawned, we can call all of them simultaneously with `Actor.endpoint.call` +async def call_all_actors(toy_actor): + await toy_actor.hello_world.call("hey there, from script!!") + +# %% +# We can also specify a single actor using the 'slice' API +async def call_specific_actors(toy_actor): + futures = [] + for idx in range(NUM_ACTORS): + actor_instance = toy_actor.slice(gpus=idx) + futures.append(actor_instance.hello_world.call_one(f"Here's an arbitrary unique value: {idx}")) + + # conveniently, we can still schedule & gather them in parallel using asyncio + await asyncio.gather(*futures) + +# %% +# Ping Pong +# --------- +# Not only is it possible to call endpoints from a 'main' function, but actors have +# the useful property of being able to communicate with one another. 
+ +class ExampleActor(Actor): + def __init__(self, actor_name): + self.actor_name = actor_name + + @endpoint + async def init(self, other_actor): + self.other_actor = other_actor + self.other_actor_pair = other_actor.slice(**current_rank()) + self.identity = current_rank().rank + + @endpoint + async def send(self, msg): + await self.other_actor_pair.recv.call(f"Sender ({self.actor_name}:{self.identity}) {msg=}") + + @endpoint + async def recv(self, msg): + print(f"Pong!, Receiver ({self.actor_name}:{self.identity}) received msg {msg}") + +async def create_ping_pong_actors(): + # Spawn two different Actors in different meshes, with two instances each + local_mesh_0 = await proc_mesh(gpus=2) + actor_0 = await local_mesh_0.spawn( + "actor_0", + ExampleActor, + "actor_0" # this arg is passed to ExampleActor.__init__ + ) + + local_mesh_1 = await proc_mesh(gpus=2) + actor_1 = await local_mesh_1.spawn( + "actor_1", + ExampleActor, + "actor_1" # this arg is passed to ExampleActor.__init__ + ) + + return actor_0, actor_1, local_mesh_0, local_mesh_1 + +# %% +# Initialize each actor with references to each other +async def init_ping_pong(actor_0, actor_1): + await asyncio.gather( + actor_0.init.call(actor_1), + actor_1.init.call(actor_0), + ) + +# %% +# Send messages between actors +async def send_ping_pong(actor_0, actor_1): + # Actor 0 sends to Actor 1 + await actor_0.send.call("Ping") + + # Actor 1 sends to Actor 0 + await actor_1.send.call("Ping") + +# %% +# Main function to run the example +async def main(): + # Hello World example + toy_actor, toy_mesh = await create_toy_actors() + await call_all_actors(toy_actor) + await call_specific_actors(toy_actor) + + # Ping Pong example + actor_0, actor_1, mesh_0, mesh_1 = await create_ping_pong_actors() + await init_ping_pong(actor_0, actor_1) + await send_ping_pong(actor_0, actor_1) + + print("Example completed successfully!") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/source/examples/spmd_ddp.py 
b/docs/source/examples/spmd_ddp.py new file mode 100644 index 000000000..20ee522e0 --- /dev/null +++ b/docs/source/examples/spmd_ddp.py @@ -0,0 +1,158 @@ +""" +DDP Examples Using Classic SPMD / torch.distributed +================================================== + +This example demonstrates how to run PyTorch's Distributed Data Parallel (DDP) +within Monarch actors. We'll adapt the basic DDP example from PyTorch's +documentation and wrap it in Monarch's actor framework. + +This example shows: +- How to initialize torch.distributed within Monarch actors +- How to create and use DDP models in a distributed setting +- How to properly clean up distributed resources +""" + +# %% +# First, we'll import the necessary libraries and define our model and actor classes + +import os +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim + +from monarch.actor import proc_mesh +from monarch.actor import Actor, current_rank, endpoint + +from torch.nn.parallel import DistributedDataParallel as DDP + + +WORLD_SIZE = 4 + + +class ToyModel(nn.Module): + """A simple toy model for demonstration purposes.""" + + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +class DDPActor(Actor): + """This Actor wraps the basic functionality from Torch's DDP example. + + Conveniently, all of the methods we need are already laid out for us, + so we can just wrap them in the usual Actor endpoint semantic with some + light modifications. 
+ + Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case + """ + + def __init__(self): + self.rank = current_rank().rank + + def _rprint(self, msg): + """Helper method to print with rank information.""" + print(f"{self.rank=} {msg}") + + @endpoint + async def setup(self): + """Initialize the PyTorch distributed process group.""" + self._rprint("Initializing torch distributed") + + # initialize the process group + dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE) + self._rprint("Finished initializing torch distributed") + + @endpoint + async def cleanup(self): + """Clean up the PyTorch distributed process group.""" + self._rprint("Cleaning up torch distributed") + dist.destroy_process_group() + + @endpoint + async def demo_basic(self): + """Run a basic DDP training example.""" + self._rprint("Running basic DDP example") + + # create model and move it to GPU with id rank + model = ToyModel().to(self.rank) + ddp_model = DDP(model, device_ids=[self.rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(self.rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + print(f"{self.rank=} Finished running basic DDP example") + + +# %% +# Now we'll define functions to create and run our DDP example + +async def create_ddp_actors(): + """Create the process mesh and spawn DDP actors.""" + # Spawn a process mesh + local_proc_mesh = await proc_mesh( + gpus=WORLD_SIZE, + env={ + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12355", + }, + ) + # Spawn our actor mesh on top of the process mesh + ddp_actor = await local_proc_mesh.spawn("ddp_actor", DDPActor) + return ddp_actor, local_proc_mesh + + +async def setup_distributed(ddp_actor): + """Initialize the distributed environment.""" + # Setup torch Distributed + await ddp_actor.setup.call() + + +async def 
run_ddp_example(ddp_actor): + """Run the DDP training example.""" + # Run the demo + await ddp_actor.demo_basic.call() + + +async def cleanup_distributed(ddp_actor): + """Clean up distributed resources.""" + # Clean up + await ddp_actor.cleanup.call() + + +# %% +# Main function to run the complete example +async def main(): + """Main function to run the DDP example.""" + # Create actors + ddp_actor, proc_mesh = await create_ddp_actors() + + # Setup distributed environment + await setup_distributed(ddp_actor) + + # Run DDP example + await run_ddp_example(ddp_actor) + + # Clean up + await cleanup_distributed(ddp_actor) + + print("DDP example completed successfully!") + + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) diff --git a/docs/source/get_started.md b/docs/source/get_started.md new file mode 100644 index 000000000..c001056a3 --- /dev/null +++ b/docs/source/get_started.md @@ -0,0 +1,192 @@ +# Getting Started with Monarch + +Welcome to Monarch! This guide will help you get up and running with Monarch, a distributed execution engine for PyTorch that delivers high-quality user experience at cluster scale. + +## What is Monarch? + +Monarch is designed to extend PyTorch's capabilities to efficiently run on distributed systems. It maintains the familiar PyTorch API while handling the complexities of distributed execution, making it easier to scale your deep learning workloads across multiple GPUs and nodes. 
+ +## Prerequisites + +Before installing Monarch, ensure you have: + +- A Linux system (Monarch is currently only supported on Linux) +- Python 3.10 or later +- CUDA-compatible GPU(s) +- Basic familiarity with PyTorch + +## Installation + +### Quick Installation + +The simplest way to install Monarch is via pip: + +```bash +pip install torchmonarch-nightly +``` + +### Manual Installation + +For more control or development purposes, you can install Monarch manually: + +```bash +# Create and activate the conda environment +conda create -n monarchenv python=3.10 -y +conda activate monarchenv + +# Install nightly rust toolchain +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +rustup toolchain install nightly +rustup default nightly + +# Install non-python dependencies +conda install libunwind -y + +# Install the correct cuda and cuda-toolkit versions for your machine +sudo dnf install cuda-toolkit-12-0 cuda-12-0 + +# Install clang-dev and nccl-dev +sudo dnf install clang-devel libnccl-devel +# Or, in some environments, the following may be necessary instead +conda install -c conda-forge clangdev nccl +conda update -n monarchenv --all -c conda-forge -y + +# Install build dependencies +pip install -r build-requirements.txt +# Install test dependencies +pip install -r python/tests/requirements.txt + +# Build and install Monarch +pip install --no-build-isolation . +# or setup for development +pip install --no-build-isolation -e . 
+``` + +## Verifying Your Installation + +After installation, you can verify that Monarch is working correctly by running the unit tests: + +```bash +pytest python/tests/ -v -m "not oss_skip" +``` + +## Basic Usage + +Here's a simple example to get you started with Monarch: + +```python +import torch +import monarch as mon + +# Initialize Monarch +mon.init() + +# Create a simple model +model = torch.nn.Linear(10, 5) + +# Distribute the model using Monarch +distributed_model = mon.distribute(model) + +# Create some input data +input_data = torch.randn(8, 10) + +# Run a forward pass +output = distributed_model(input_data) + +# Clean up +mon.shutdown() +``` + +## Example: Ping Pong + +One of the simplest examples of using Monarch is the "ping pong" example, which demonstrates basic communication between processes: + +```python +import monarch as mon +import torch + +# Initialize Monarch +mon.init() + +# Get the current process rank and world size +rank = mon.get_rank() +world_size = mon.get_world_size() + +# Create a tensor to send +send_tensor = torch.tensor([rank], dtype=torch.float32) + +# Determine the destination rank +dst_rank = (rank + 1) % world_size + +# Send the tensor to the destination rank +mon.send(send_tensor, dst_rank) + +# Receive a tensor from the source rank +src_rank = (rank - 1) % world_size +recv_tensor = torch.zeros(1, dtype=torch.float32) +mon.recv(recv_tensor, src_rank) + +print(f"Rank {rank} received {recv_tensor.item()} from rank {src_rank}") + +# Clean up +mon.shutdown() +``` + +## Distributed Data Parallel Training + +Monarch makes it easy to implement distributed data parallel training: + +```python +import monarch as mon +import torch +import torch.nn as nn +import torch.optim as optim + +# Initialize Monarch +mon.init() + +# Create a simple model +model = nn.Linear(10, 5) +model = mon.distribute(model) + +# Create optimizer +optimizer = optim.SGD(model.parameters(), lr=0.01) + +# Create loss function +criterion = nn.MSELoss() + +# 
Training loop +for epoch in range(10): + # Assume data_loader is your distributed data loader + for data, target in data_loader: + # Forward pass + output = model(data) + loss = criterion(output, target) + + # Backward pass and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + +# Clean up +mon.shutdown() +``` + +## Next Steps + +Now that you've got the basics, you can: + +1. Check out the [examples](./examples/index) directory for more detailed demonstrations +2. Explore the [API documentation](api) for a complete reference +3. Learn about advanced features in the [How-to guides](howtos) + +## Troubleshooting + +If you encounter issues: + +- Make sure your CUDA environment is properly set up +- Check that you're using a compatible version of PyTorch +- Verify that all dependencies are installed correctly +- Consult the [GitHub repository](https://github.com/pytorch-labs/monarch) for known issues + +Remember that Monarch is currently in an experimental stage, so you may encounter bugs or incomplete features. Contributions and bug reports are welcome! diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 000000000..c98e4291b --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,60 @@ +# Monarch 🦋 + +**Monarch** is a distributed execution engine for PyTorch. Our overall goal is +to deliver the high-quality user experience that people get from single-GPU +PyTorch, but at cluster scale. + +> ⚠️ **Early Development Warning** Monarch is currently in an experimental +> stage. You should expect bugs, incomplete features, and APIs that may change +> in future versions. The project welcomes bugfixes, but to make sure things are +> well coordinated you should discuss any significant change before starting the +> work. It's recommended that you signal your intention to contribute in the +> issue tracker, either by filing a new issue or by claiming an existing one. + +## What is Monarch? 
+ +Monarch extends PyTorch's capabilities to efficiently run on distributed systems. It maintains the familiar PyTorch API while handling the complexities of distributed execution, making it easier to scale your deep learning workloads across multiple GPUs and nodes. + +Key features: +- **Familiar PyTorch API** - Use the same PyTorch code you're already familiar with +- **Efficient Distribution** - Scale your models across multiple GPUs and nodes +- **Simplified Communication** - Built-in primitives for distributed communication +- **Performance Optimized** - Designed for high performance at scale + +**Note:** Monarch is currently only supported on Linux systems. + +## Getting Started + +Here are some suggested steps to get started with Monarch: + +1. **Learn the Basics**: Check out the [Getting Started](get_started) guide to learn the basics of Monarch +2. **Explore Examples**: Review the [Examples](./examples/index) to see Monarch in action +3. **Dive Deeper**: Explore the [API Documentation](api) for more detailed information + +## Documentation Contents + +```{toctree} +:maxdepth: 2 +:caption: Contents +:hidden: + +get_started +./generated/examples/index +books/books +api +howtos +glossary +``` + +## License + +Monarch is BSD-3 licensed, as found in the [LICENSE](https://github.com/pytorch-labs/monarch/blob/main/LICENSE) file. + +## Community + +We welcome contributions from the community! If you're interested in contributing, please: + +1. Check the [GitHub repository](https://github.com/pytorch-labs/monarch) +2. Review existing issues or create a new one +3. Discuss your proposed changes before starting work +4. Submit a pull request with your changes diff --git a/examples/README.rst b/examples/README.rst new file mode 100644 index 000000000..d4eac8c0a --- /dev/null +++ b/examples/README.rst @@ -0,0 +1,31 @@ +Monarch Examples +================ + +This directory contains examples demonstrating how to use Monarch for distributed computing. 
+ +Python Script Examples +--------------------- + +These examples are formatted for sphinx-gallery and will be automatically converted to HTML documentation: + +- ``ping_pong.py``: Demonstrates the basics of Monarch's Actor/endpoint API with a ping-pong communication example +- ``spmd_ddp.py``: Shows how to run PyTorch's Distributed Data Parallel (DDP) within Monarch actors +- ``grpo_actor.py``: Implements a distributed PPO-like reinforcement learning algorithm using the Monarch actor framework + +Running Examples +--------------- + +To run any example: + +.. code-block:: bash + + python examples/example_name.py + +.. toctree:: + :maxdepth: 2 + :caption: Contents + :hidden: + + ping_pong + spmd_ddp + grpo_actor diff --git a/examples/grpo_actor.py b/examples/grpo_actor.py index 9595f02e0..5b8f4e260 100644 --- a/examples/grpo_actor.py +++ b/examples/grpo_actor.py @@ -4,6 +4,25 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +""" +Distributed PPO-like Reinforcement Learning with Monarch Actors +=======================================================>>>>>>> REPLACE + +This example demonstrates implementing a distributed PPO-like reinforcement learning +algorithm using the Monarch actor framework. 
The implementation features: + +- Distributed actor architecture with Generator, Scorer, and Learner components +- Asynchronous communication via queues +- RDMA-based weight synchronization +- Event-driven architecture for efficient processing + +The example shows how to: +- Set up distributed actors on separate GPU meshes +- Implement policy gradient methods in a distributed setting +- Use RDMA buffers for efficient parameter sharing +- Create an asynchronous training loop with multiple components +""" + import asyncio import copy import random diff --git a/examples/notebooks/README.md b/examples/notebooks/README.md deleted file mode 100644 index 78872f2b7..000000000 --- a/examples/notebooks/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Example Notebooks - -This folder contains some basic examples of using the Monarch API in jupyter notebooks. - -## Setup -1. Follow the instructions outlined in ../../monarch/README.md to setup Monarch -2. Pip install jupyter: - `pip install jupyter notebook` -3. Run your jupyter notebook: `jupyter notebook` -4. (optiona) In remote settings (as in a devserver), you can also port forward your jupyter notebook to your local machine. e.g. - ``` - # devserver - jupyter notebook --no-browser --port=8098 - - #local - ssh -N -L 8098:localhost:8098 -```` -5. Open localhost:8098 in your browser to see the jupyter notebook - - -## Manifest -* ping_pong.ipynb - Simple hello world with Actor API + Inter Actor Communication diff --git a/examples/notebooks/ping_pong.ipynb b/examples/notebooks/ping_pong.ipynb deleted file mode 100644 index 6675587e9..000000000 --- a/examples/notebooks/ping_pong.ipynb +++ /dev/null @@ -1,314 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "eed8a43f-7b0d-4d44-81f0-f96bb6648d6f", - "metadata": {}, - "source": [ - "# Monarch\n", - "\n", - "One of Monarch's more powerful features is its Actor/endpoint API, which provides a generic interface for distributed computing. 
In this notebook, we introduce some of the basics" - ] - }, - { - "cell_type": "markdown", - "id": "40edef57-5d29-46e1-9bf0-7d0fec90a543", - "metadata": {}, - "source": [ - "## Hello World\n", - "Actors are spawned in Process meshes via the `monarch.actor` API. For those familiar with distributed systems, it can be helpful to think of each Actor as a server with endpoints that can be called." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "057bdd93-f218-4192-b71c-16045ad77ffe", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I0529 17:04:38.195836 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _128tEcETi5FK[0] rank 0: created\n", - "I0529 17:04:38.196293 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _128tEcETi5FK[1] rank 1: created\n", - "I0529 17:04:38.196709 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _128tEcETi5FK[2] rank 2: created\n", - "I0529 17:04:38.197093 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _128tEcETi5FK[3] rank 3: created\n", - "I0529 17:04:39.233129 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _128tEcETi5FK[1] rank 1: running at addr:unix!@choXYEwnmQspRu9iZUgKvF6Q mesh_agent:_128tEcETi5FK[1].mesh[0]\n", - "I0529 17:04:39.234619 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _128tEcETi5FK[3] rank 3: running at addr:unix!@aztljBHYamc5fQB8oKyUptrm mesh_agent:_128tEcETi5FK[3].mesh[0]\n", - "I0529 17:04:39.238444 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _128tEcETi5FK[2] rank 2: running at addr:unix!@8qAmI2Ex2gj3sUNjfm4RkaL3 mesh_agent:_128tEcETi5FK[2].mesh[0]\n", - "I0529 17:04:39.238497 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _128tEcETi5FK[0] rank 0: running at addr:unix!@yXDQEYcGLQnPmb8Un5FX8rpY mesh_agent:_128tEcETi5FK[0].mesh[0]\n" - ] - } - ], - "source": [ - "import asyncio\n", - "\n", - "from monarch.actor import proc_mesh, ProcMesh\n", - "from monarch.actor import Actor, endpoint, 
current_rank\n", - "\n", - "NUM_ACTORS=4\n", - "\n", - "class ToyActor(Actor):\n", - " def __init__(self):\n", - " self.rank = current_rank().rank\n", - " \n", - " @endpoint\n", - " async def hello_world(self, msg):\n", - " print(f\"Identity: {self.rank}, {msg=}\") \n", - "\n", - "# Note: Meshes can be also be created on different nodes, but we're ignoring that in this example\n", - "local_proc_mesh = await proc_mesh(gpus=NUM_ACTORS)\n", - "# This spawns 4 instances of 'ToyActor'\n", - "toy_actor = await local_proc_mesh.spawn(\"toy_actor\", ToyActor) " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "53bfe107-b4d9-4e33-80b2-dbea14cc4df2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Identity: 3, msg='hey there, from jupyter!!'\n", - "Identity: 2, msg='hey there, from jupyter!!'\n", - "Identity: 0, msg='hey there, from jupyter!!'\n", - "Identity: 1, msg='hey there, from jupyter!!'\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Once actors are spawned, we can call all of them simultaneously with `Actor.endpoint.call` as below\n", - "await toy_actor.hello_world.call(\"hey there, from jupyter!!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a1327cdd-45c6-423b-a05c-e0565555109a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Identity: 0, msg=\"Here's an arbitrary unique value: 0\"\n", - "Identity: 0, msg=\"Here's an arbitrary unique value: 1\"\n", - "Identity: 0, msg=\"Here's an arbitrary unique value: 2\"\n", - "Identity: 0, msg=\"Here's an arbitrary unique value: 3\"\n" - ] - }, - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can also specify a single actor using the 
'slice' API\n", - "futures = []\n", - "for idx in range(NUM_ACTORS):\n", - " actor_instance = toy_actor.slice(gpus=idx)\n", - " futures.append(actor_instance.hello_world.call_one(f\"Here's an arbitrary unique value: {idx}\"))\n", - "\n", - "# conveniently, we can still schedule & gather them in parallel using asyncio\n", - "await asyncio.gather(*futures)" - ] - }, - { - "cell_type": "markdown", - "id": "b1c727fe-1330-4a12-adeb-82a126397e8e", - "metadata": {}, - "source": [ - "## Ping Pong\n", - "Not only is it possible to call endpoints froma 'main' fuction, but actors have the useful property of being able to communicate with one another. " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "2c9d37c7-e6d9-452d-bde0-589b8757c9e8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I0529 17:05:37.904476 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _14wENoic4WQM[0] rank 0: created\n", - "I0529 17:05:37.904983 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _14wENoic4WQM[1] rank 1: created\n", - "I0529 17:05:39.065343 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _14wENoic4WQM[0] rank 0: running at addr:unix!@tjxHFPUc2tv5w8RYZDOjVTFN mesh_agent:_14wENoic4WQM[0].mesh[0]\n", - "I0529 17:05:39.070276 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _14wENoic4WQM[1] rank 1: running at addr:unix!@yHbPlBV7H43hbCq73DsCoPTQ mesh_agent:_14wENoic4WQM[1].mesh[0]\n", - "I0529 17:05:39.289522 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _1xtVcyVB4hVH[0] rank 0: created\n", - "I0529 17:05:39.290061 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:114] proc _1xtVcyVB4hVH[1] rank 1: created\n", - "I0529 17:05:40.331259 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _1xtVcyVB4hVH[1] rank 1: running at addr:unix!@I3py42AbzUiSqiEEijRbDzjC mesh_agent:_1xtVcyVB4hVH[1].mesh[0]\n", - "I0529 17:05:40.339197 2830017 hyperactor_mesh/src/proc_mesh/mod.rs:133] proc _1xtVcyVB4hVH[0] 
rank 0: running at addr:unix!@xXjtJso3xmY5X2gXQ64Ey0fZ mesh_agent:_1xtVcyVB4hVH[0].mesh[0]\n" - ] - } - ], - "source": [ - "import asyncio\n", - "\n", - "from monarch.actor import proc_mesh, ProcMesh\n", - "from monarch.actor import Actor, endpoint, current_rank\n", - "\n", - "class ExampleActor(Actor):\n", - " def __init__(self, actor_name):\n", - " self.actor_name=actor_name\n", - " \n", - " @endpoint\n", - " async def init(self, other_actor):\n", - " self.other_actor = other_actor\n", - " self.other_actor_pair = other_actor.slice(**current_rank())\n", - " self.identity = current_rank().rank\n", - " \n", - " @endpoint\n", - " async def send(self, msg):\n", - " await self.other_actor_pair.recv.call(f\"Sender ({self.actor_name}:{self.identity}) {msg=}\")\n", - " \n", - " @endpoint\n", - " async def recv(self, msg):\n", - " print(f\"Pong!, Receiver ({self.actor_name}:{self.identity}) received msg {msg}\")\n", - "\n", - "# Spawn two different Actors in different meshes, with two instances each\n", - "local_mesh_0 = await proc_mesh(gpus=2)\n", - "actor_0 = await local_mesh_0.spawn(\n", - " \"actor_0\",\n", - " ExampleActor,\n", - " \"actor_0\" # this arg is passed to ExampleActor.__init__\n", - ") \n", - "\n", - "local_mesh_1 = await proc_mesh(gpus=2)\n", - "actor_1 = await local_mesh_1.spawn(\n", - " \"actor_1\",\n", - " ExampleActor,\n", - " \"actor_1\" # this arg is passed to ExampleActor.__init__\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "9b1fcb84-83b2-4465-9ff4-141e5b0c4a16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Initialize each actor with references to each other\n", - "await asyncio.gather(\n", - " actor_0.init.call(actor_1),\n", - " actor_1.init.call(actor_0),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": 
"1f44a9fb-9ba4-44cd-af29-e32e3a4c7c65", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pong!, Receiver (actor_1:0) received msg Sender (actor_0:0) msg='Ping'\n", - "Pong!, Receiver (actor_1:1) received msg Sender (actor_0:1) msg='Ping'\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await actor_0.send.call(\"Ping\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "282b36b6-9348-48a0-92f5-f29d3401009d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pong!, Receiver (actor_0:0) received msg Sender (actor_1:0) msg='Ping'\n", - "Pong!, Receiver (actor_0:1) received msg Sender (actor_1:1) msg='Ping'\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "await actor_1.send.call(\"Ping\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/notebooks/spmd_ddp.ipynb b/examples/notebooks/spmd_ddp.ipynb deleted file mode 100644 index 6036bb42f..000000000 --- a/examples/notebooks/spmd_ddp.ipynb +++ /dev/null @@ -1,171 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c8184767-4307-482c-850f-c19cbe49f16a", - "metadata": {}, - "source": [ - "# DDP Examples Using Classic SPMD / torch.distributed\n", - "Let's see if we can run torch's basic [ddp example](https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html), 
but wrapped in Monarch actors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc7ef2eb-e3ad-4caf-b5f7-f6352970bc2c", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "import torch\n", - "import torch.distributed as dist\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "from monarch.actor import proc_mesh\n", - "from monarch.actor import Actor, current_rank, endpoint\n", - "\n", - "from torch.nn.parallel import DistributedDataParallel as DDP\n", - "\n", - "\n", - "WORLD_SIZE = 4\n", - "\n", - "\n", - "class ToyModel(nn.Module):\n", - " def __init__(self):\n", - " super(ToyModel, self).__init__()\n", - " self.net1 = nn.Linear(10, 10)\n", - " self.relu = nn.ReLU()\n", - " self.net2 = nn.Linear(10, 5)\n", - "\n", - " def forward(self, x):\n", - " return self.net2(self.relu(self.net1(x)))\n", - "\n", - "\n", - "class DDPActor(Actor):\n", - " \"\"\"This Actor wraps the basic functionality from Torch's DDP example. 
Conveniently, all of the\n", - " methods we need are already laid out for us, so we can just wrap them in the usual Actor endpoint semantic with some light modifications\n", - "\n", - " # copy pasta from https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case\n", - " \"\"\"\n", - "\n", - " def __init__(self):\n", - " self.rank = current_rank().rank\n", - "\n", - " def _rprint(self, msg):\n", - " print(f\"{self.rank=} {msg}\")\n", - "\n", - " @endpoint\n", - " async def setup(self):\n", - " self._rprint(\"Initializing torch distributed\")\n", - "\n", - " # initialize the process group\n", - " dist.init_process_group(\"gloo\", rank=self.rank, world_size=WORLD_SIZE)\n", - " self._rprint(\"Finished initializing torch distributed\")\n", - "\n", - " @endpoint\n", - " async def cleanup(self):\n", - " self._rprint(\"Cleaning up torch distributed\")\n", - " dist.destroy_process_group()\n", - "\n", - " @endpoint\n", - " async def demo_basic(self):\n", - " self._rprint(\"Running basic DDP example\")\n", - " # setup(rank, world_size)\n", - "\n", - " # create model and move it to GPU with id rank\n", - " model = ToyModel().to(self.rank)\n", - " ddp_model = DDP(model, device_ids=[self.rank])\n", - "\n", - " loss_fn = nn.MSELoss()\n", - " optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)\n", - "\n", - " optimizer.zero_grad()\n", - " outputs = ddp_model(torch.randn(20, 10))\n", - " labels = torch.randn(20, 5).to(self.rank)\n", - " loss_fn(outputs, labels).backward()\n", - " optimizer.step()\n", - "\n", - " # cleanup()\n", - " print(f\"{self.rank=} Finished running basic DDP example\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5cc47e45-cde8-4376-9d47-1d629871a9e1", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Spawn a process mesh\n", - "local_proc_mesh = await proc_mesh(\n", - " gpus=WORLD_SIZE,\n", - " env={\n", - " \"MASTER_ADDR\": \"localhost\",\n", - " \"MASTER_PORT\": \"12355\",\n", - 
" },\n", - ")\n", - "# Spawn our actor mesh on top of the process mesh\n", - "ddp_actor = await local_proc_mesh.spawn(\"ddp_actor\", DDPActor)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87a77cb6-57fe-40eb-9f09-bfb51b64b5c1", - "metadata": {}, - "outputs": [], - "source": [ - "# Setup torch Distributed\n", - "await ddp_actor.setup.call()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "408a1ef6-6fab-41cc-816e-95c115577a51", - "metadata": {}, - "outputs": [], - "source": [ - "# Run the demo\n", - "await ddp_actor.demo_basic.call()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3beeb159-8b8f-4ec7-ae5a-59f041726fb0", - "metadata": {}, - "outputs": [], - "source": [ - "# Run the demo\n", - "await ddp_actor.cleanup.call()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/ping_pong.py b/examples/ping_pong.py new file mode 100644 index 000000000..9fcb179d3 --- /dev/null +++ b/examples/ping_pong.py @@ -0,0 +1,133 @@ +""" +Monarch Actor API: Ping Pong Example +==================================== + +This example demonstrates the basics of Monarch's Actor/endpoint API, which provides +a generic interface for distributed computing. We'll cover: + +- Creating and spawning actors in process meshes +- Calling endpoints on actors +- Actor-to-actor communication with a ping-pong example +""" + +# %% +# Hello World +# ----------- +# Actors are spawned in Process meshes via the `monarch.actor` API. 
For those familiar with +# distributed systems, it can be helpful to think of each Actor as a server with endpoints +# that can be called. + +import asyncio + +from monarch.actor import proc_mesh, ProcMesh +from monarch.actor import Actor, endpoint, current_rank + +NUM_ACTORS = 4 + +class ToyActor(Actor): + def __init__(self): + self.rank = current_rank().rank + + @endpoint + async def hello_world(self, msg): + print(f"Identity: {self.rank}, {msg=}") + +# Note: Meshes can also be created on different nodes, but we're ignoring that in this example +async def create_toy_actors(): + local_proc_mesh = await proc_mesh(gpus=NUM_ACTORS) + # This spawns 4 instances of 'ToyActor' + toy_actor = await local_proc_mesh.spawn("toy_actor", ToyActor) + return toy_actor, local_proc_mesh + +# %% +# Once actors are spawned, we can call all of them simultaneously with `Actor.endpoint.call` +async def call_all_actors(toy_actor): + await toy_actor.hello_world.call("hey there, from script!!") + +# %% +# We can also specify a single actor using the 'slice' API +async def call_specific_actors(toy_actor): + futures = [] + for idx in range(NUM_ACTORS): + actor_instance = toy_actor.slice(gpus=idx) + futures.append(actor_instance.hello_world.call_one(f"Here's an arbitrary unique value: {idx}")) + + # conveniently, we can still schedule & gather them in parallel using asyncio + await asyncio.gather(*futures) + +# %% +# Ping Pong +# --------- +# Not only is it possible to call endpoints from a 'main' function, but actors have +# the useful property of being able to communicate with one another.
+ +class ExampleActor(Actor): + def __init__(self, actor_name): + self.actor_name = actor_name + + @endpoint + async def init(self, other_actor): + self.other_actor = other_actor + self.other_actor_pair = other_actor.slice(**current_rank()) + self.identity = current_rank().rank + + @endpoint + async def send(self, msg): + await self.other_actor_pair.recv.call(f"Sender ({self.actor_name}:{self.identity}) {msg=}") + + @endpoint + async def recv(self, msg): + print(f"Pong!, Receiver ({self.actor_name}:{self.identity}) received msg {msg}") + +async def create_ping_pong_actors(): + # Spawn two different Actors in different meshes, with two instances each + local_mesh_0 = await proc_mesh(gpus=2) + actor_0 = await local_mesh_0.spawn( + "actor_0", + ExampleActor, + "actor_0" # this arg is passed to ExampleActor.__init__ + ) + + local_mesh_1 = await proc_mesh(gpus=2) + actor_1 = await local_mesh_1.spawn( + "actor_1", + ExampleActor, + "actor_1" # this arg is passed to ExampleActor.__init__ + ) + + return actor_0, actor_1, local_mesh_0, local_mesh_1 + +# %% +# Initialize each actor with references to each other +async def init_ping_pong(actor_0, actor_1): + await asyncio.gather( + actor_0.init.call(actor_1), + actor_1.init.call(actor_0), + ) + +# %% +# Send messages between actors +async def send_ping_pong(actor_0, actor_1): + # Actor 0 sends to Actor 1 + await actor_0.send.call("Ping") + + # Actor 1 sends to Actor 0 + await actor_1.send.call("Ping") + +# %% +# Main function to run the example +async def main(): + # Hello World example + toy_actor, toy_mesh = await create_toy_actors() + await call_all_actors(toy_actor) + await call_specific_actors(toy_actor) + + # Ping Pong example + actor_0, actor_1, mesh_0, mesh_1 = await create_ping_pong_actors() + await init_ping_pong(actor_0, actor_1) + await send_ping_pong(actor_0, actor_1) + + print("Example completed successfully!") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/spmd_ddp.py 
b/examples/spmd_ddp.py new file mode 100644 index 000000000..20ee522e0 --- /dev/null +++ b/examples/spmd_ddp.py @@ -0,0 +1,158 @@ +""" +DDP Examples Using Classic SPMD / torch.distributed +=================================================== + +This example demonstrates how to run PyTorch's Distributed Data Parallel (DDP) +within Monarch actors. We'll adapt the basic DDP example from PyTorch's +documentation and wrap it in Monarch's actor framework. + +This example shows: +- How to initialize torch.distributed within Monarch actors +- How to create and use DDP models in a distributed setting +- How to properly clean up distributed resources +""" + +# %% +# First, we'll import the necessary libraries and define our model and actor classes + +import os +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim + +from monarch.actor import proc_mesh +from monarch.actor import Actor, current_rank, endpoint + +from torch.nn.parallel import DistributedDataParallel as DDP + + +WORLD_SIZE = 4 + + +class ToyModel(nn.Module): + """A simple toy model for demonstration purposes.""" + + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +class DDPActor(Actor): + """This Actor wraps the basic functionality from Torch's DDP example. + + Conveniently, all of the methods we need are already laid out for us, + so we can just wrap them in the usual Actor endpoint semantic with some + light modifications.
+ + Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case + """ + + def __init__(self): + self.rank = current_rank().rank + + def _rprint(self, msg): + """Helper method to print with rank information.""" + print(f"{self.rank=} {msg}") + + @endpoint + async def setup(self): + """Initialize the PyTorch distributed process group.""" + self._rprint("Initializing torch distributed") + + # initialize the process group + dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE) + self._rprint("Finished initializing torch distributed") + + @endpoint + async def cleanup(self): + """Clean up the PyTorch distributed process group.""" + self._rprint("Cleaning up torch distributed") + dist.destroy_process_group() + + @endpoint + async def demo_basic(self): + """Run a basic DDP training example.""" + self._rprint("Running basic DDP example") + + # create model and move it to GPU with id rank + model = ToyModel().to(self.rank) + ddp_model = DDP(model, device_ids=[self.rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 5).to(self.rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + print(f"{self.rank=} Finished running basic DDP example") + + +# %% +# Now we'll define functions to create and run our DDP example + +async def create_ddp_actors(): + """Create the process mesh and spawn DDP actors.""" + # Spawn a process mesh + local_proc_mesh = await proc_mesh( + gpus=WORLD_SIZE, + env={ + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12355", + }, + ) + # Spawn our actor mesh on top of the process mesh + ddp_actor = await local_proc_mesh.spawn("ddp_actor", DDPActor) + return ddp_actor, local_proc_mesh + + +async def setup_distributed(ddp_actor): + """Initialize the distributed environment.""" + # Setup torch Distributed + await ddp_actor.setup.call() + + +async def 
run_ddp_example(ddp_actor): + """Run the DDP training example.""" + # Run the demo + await ddp_actor.demo_basic.call() + + +async def cleanup_distributed(ddp_actor): + """Clean up distributed resources.""" + # Clean up + await ddp_actor.cleanup.call() + + +# %% +# Main function to run the complete example +async def main(): + """Main function to run the DDP example.""" + # Create actors + ddp_actor, proc_mesh = await create_ddp_actors() + + # Setup distributed environment + await setup_distributed(ddp_actor) + + # Run DDP example + await run_ddp_example(ddp_actor) + + # Clean up + await cleanup_distributed(ddp_actor) + + print("DDP example completed successfully!") + + +if __name__ == "__main__": + import asyncio + asyncio.run(main())