Skip to content

Commit cf7434a

Browse files
actor: move more port receiver supervision to rust (#578)
Summary: Pull Request resolved: #578

This diff moves supervision logic from Python into Rust, aligning with the goal of eliminating complex supervision wiring in Python. The essential change is that:

```
class ActorEndpoint(...):
    def _port(self, once: bool = False) -> "PortTuple[R]":
        monitor = (
            None
            if self._actor_mesh._actor_mesh is None
            else self._actor_mesh._actor_mesh.monitor()
        )
        return PortTuple.create(self._mailbox, monitor, once)
```

becomes:

```
class ActorEndpoint(...):
    def _port(self, once: bool = False) -> PortTuple[R]:
        p, r = PortTuple.create(self._mailbox, once)
        return PortTuple(p, PortReceiver(self._mailbox, self._supervise(r._receiver)))
```

`_supervise(...)` dispatches to new Rust helpers:

```python
mesh.supervise_port(...) and mesh.supervise_once_port(...)
```

which wrap the receivers with supervision logic (including selection between message arrival and supervision events), completely eliminating the need for Python-side constructs like `ActorMeshMonitor`.

Most of the Python complexity introduced in D77434080 is removed. The only meaningful addition is `_supervise(...)`, a small overrideable hook that defaults to a no-op and cleanly delegates to Rust when supervision is desired.

- The creation and wiring of the monitor stream is now fully in Rust.
- The responsibility of wrapping receivers with supervision is now fully in Rust.
- Python no longer constructs or passes supervision monitors; Rust now owns the full wiring, and Python receives already-wrapped receivers with supervision behavior embedded.

This is a strict improvement: lower complexity, cleaner override points, and supervision is entirely managed in Rust.

Differential Revision: D78528860
1 parent 4d93ed0 commit cf7434a

File tree

6 files changed

+112
-173
lines changed

6 files changed

+112
-173
lines changed

monarch_hyperactor/src/actor_mesh.rs

Lines changed: 59 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -176,14 +176,32 @@ impl PythonActorMesh {
176176
.map(PyActorId::from))
177177
}
178178

179-
// Start monitoring the actor mesh by subscribing to its supervision events. For each supervision
180-
// event, it is consumed by PythonActorMesh first, then gets sent to the monitor for user to consume.
181-
fn monitor<'py>(&self, py: Python<'py>) -> PyResult<PyObject> {
182-
let receiver = self.user_monitor_sender.subscribe();
183-
let monitor_instance = PyActorMeshMonitor {
184-
receiver: SharedCell::from(Mutex::new(receiver)),
179+
fn supervise_port<'py>(
180+
&self,
181+
py: Python<'py>,
182+
receiver: &PythonPortReceiver,
183+
) -> PyResult<PyObject> {
184+
let rx = MonitoredPythonPortReceiver {
185+
inner: receiver.inner(),
186+
monitor: ActorMeshMonitor {
187+
receiver: SharedCell::from(Mutex::new(self.user_monitor_sender.subscribe())),
188+
},
185189
};
186-
Ok(monitor_instance.into_py(py))
190+
rx.into_py_any(py)
191+
}
192+
193+
fn supervise_once_port<'py>(
194+
&self,
195+
py: Python<'py>,
196+
receiver: &PythonOncePortReceiver,
197+
) -> PyResult<PyObject> {
198+
let rx = MonitoredPythonOncePortReceiver {
199+
inner: receiver.inner(),
200+
monitor: ActorMeshMonitor {
201+
receiver: SharedCell::from(Mutex::new(self.user_monitor_sender.subscribe())),
202+
},
203+
};
204+
rx.into_py_any(py)
187205
}
188206

189207
#[pyo3(signature = (**kwargs))]
@@ -335,83 +353,46 @@ impl Drop for PythonActorMesh {
335353
}
336354
}
337355

338-
#[pyclass(
339-
name = "ActorMeshMonitor",
340-
module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh"
341-
)]
342-
pub struct PyActorMeshMonitor {
356+
#[derive(Debug, Clone)]
357+
struct ActorMeshMonitor {
343358
receiver: SharedCell<Mutex<tokio::sync::broadcast::Receiver<Option<ActorSupervisionEvent>>>>,
344359
}
345360

346-
#[pymethods]
347-
impl PyActorMeshMonitor {
348-
fn __aiter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
349-
slf
350-
}
351-
352-
pub fn __anext__(&self, py: Python<'_>) -> PyResult<PyObject> {
361+
impl ActorMeshMonitor {
362+
pub async fn next(&self) -> PyActorSupervisionEvent {
353363
let receiver = self.receiver.clone();
354-
Ok(pyo3_async_runtimes::tokio::future_into_py(py, get_next(receiver))?.into())
355-
}
356-
}
357-
358-
impl PyActorMeshMonitor {
359-
pub async fn next(&self) -> PyResult<PyObject> {
360-
get_next(self.receiver.clone()).await
361-
}
362-
}
363-
364-
impl Clone for PyActorMeshMonitor {
365-
fn clone(&self) -> Self {
366-
Self {
367-
receiver: self.receiver.clone(),
364+
let receiver = receiver
365+
.borrow()
366+
.expect("`Actor mesh receiver` is shutdown");
367+
let mut receiver = receiver.lock().await;
368+
let event = receiver.recv().await.unwrap();
369+
match event {
370+
None => PyActorSupervisionEvent {
371+
// Dummy actor as place holder to indicate the whole mesh is stopped
372+
// TODO(albertli): remove this when pushing all supervision logic to rust.
373+
actor_id: id!(default[0].actor[0]).into(),
374+
actor_status: "actor mesh is stopped due to proc mesh shutdown".to_string(),
375+
},
376+
Some(event) => PyActorSupervisionEvent::from(event.clone()),
368377
}
369378
}
370379
}
371380

372-
async fn get_next(
373-
receiver: SharedCell<Mutex<tokio::sync::broadcast::Receiver<Option<ActorSupervisionEvent>>>>,
374-
) -> PyResult<PyObject> {
375-
let receiver = receiver.clone();
376-
377-
let receiver = receiver
378-
.borrow()
379-
.expect("`Actor mesh receiver` is shutdown");
380-
let mut receiver = receiver.lock().await;
381-
let event = receiver.recv().await.unwrap();
382-
383-
let supervision_event = match event {
384-
None => PyActorSupervisionEvent {
385-
// Dummy actor as place holder to indicate the whole mesh is stopped
386-
// TODO(albertli): remove this when pushing all supervision logic to rust.
387-
actor_id: id!(default[0].actor[0]).into(),
388-
actor_status: "actor mesh is stopped due to proc mesh shutdown".to_string(),
389-
},
390-
Some(event) => PyActorSupervisionEvent::from(event.clone()),
391-
};
392-
393-
Python::with_gil(|py| supervision_event.into_py_any(py))
394-
}
395-
396-
// TODO(albertli): this is temporary remove this when pushing all supervision logic to rust.
381+
// Values of this (private) type can only be created by calling
382+
// `PythonActorMesh::supervise_port()`.
397383
#[pyclass(
398384
name = "MonitoredPortReceiver",
399385
module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh"
400386
)]
401-
pub(super) struct MonitoredPythonPortReceiver {
387+
struct MonitoredPythonPortReceiver {
402388
inner: Arc<tokio::sync::Mutex<PortReceiver<PythonMessage>>>,
403-
monitor: PyActorMeshMonitor,
389+
monitor: ActorMeshMonitor,
404390
}
405391

406392
#[pymethods]
407393
impl MonitoredPythonPortReceiver {
408-
#[new]
409-
fn new(receiver: &PythonPortReceiver, monitor: &PyActorMeshMonitor) -> Self {
410-
let inner = receiver.inner();
411-
MonitoredPythonPortReceiver {
412-
inner,
413-
monitor: monitor.clone(),
414-
}
394+
fn __repr__(&self) -> &'static str {
395+
"<MonitoredPortReceiver>"
415396
}
416397

417398
fn recv<'py>(&mut self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
@@ -424,7 +405,7 @@ impl MonitoredPythonPortReceiver {
424405
result.map_err(|err| PyErr::new::<PyEOFError, _>(format!("port closed: {}", err)))
425406
}
426407
event = monitor.next() => {
427-
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event.unwrap())))
408+
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event)))
428409
}
429410
}
430411
})
@@ -440,31 +421,28 @@ impl MonitoredPythonPortReceiver {
440421
result.map_err(|err| PyErr::new::<PyEOFError, _>(format!("port closed: {}", err)))
441422
}
442423
event = monitor.next() => {
443-
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event.unwrap())))
424+
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event)))
444425
}
445426
}
446427
})?
447428
}
448429
}
449430

431+
// Values of this (private) type can only be created by calling
432+
// `PythonActorMesh::supervise_once_port()`.
450433
#[pyclass(
451434
name = "MonitoredOncePortReceiver",
452435
module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh"
453436
)]
454-
pub(super) struct MonitoredPythonOncePortReceiver {
437+
struct MonitoredPythonOncePortReceiver {
455438
inner: Arc<std::sync::Mutex<Option<OncePortReceiver<PythonMessage>>>>,
456-
monitor: PyActorMeshMonitor,
439+
monitor: ActorMeshMonitor,
457440
}
458441

459442
#[pymethods]
460443
impl MonitoredPythonOncePortReceiver {
461-
#[new]
462-
fn new(receiver: &PythonOncePortReceiver, monitor: &PyActorMeshMonitor) -> Self {
463-
let inner = receiver.inner();
464-
MonitoredPythonOncePortReceiver {
465-
inner,
466-
monitor: monitor.clone(),
467-
}
444+
fn __repr__(&self) -> &'static str {
445+
"<MonitoredOncePortReceiver>"
468446
}
469447

470448
fn recv<'py>(&mut self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
@@ -478,7 +456,7 @@ impl MonitoredPythonOncePortReceiver {
478456
result.map_err(|err| PyErr::new::<PyEOFError, _>(format!("port closed: {}", err)))
479457
}
480458
event = monitor.next() => {
481-
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event.unwrap())))
459+
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event)))
482460
}
483461
}
484462
})
@@ -495,7 +473,7 @@ impl MonitoredPythonOncePortReceiver {
495473
result.map_err(|err| PyErr::new::<PyEOFError, _>(format!("port closed: {}", err)))
496474
}
497475
event = monitor.next() => {
498-
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event.unwrap())))
476+
Err(PyErr::new::<SupervisionError, _>(format!("supervision error: {:?}", event)))
499477
}
500478
}
501479
})?
@@ -506,6 +484,7 @@ impl MonitoredPythonOncePortReceiver {
506484
name = "ActorSupervisionEvent",
507485
module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh"
508486
)]
487+
#[derive(Debug)]
509488
pub struct PyActorSupervisionEvent {
510489
/// Actor ID of the actor where supervision event originates from.
511490
#[pyo3(get)]
@@ -538,7 +517,6 @@ impl From<ActorSupervisionEvent> for PyActorSupervisionEvent {
538517
pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> {
539518
hyperactor_mod.add_class::<PythonActorMesh>()?;
540519
hyperactor_mod.add_class::<PythonActorMeshRef>()?;
541-
hyperactor_mod.add_class::<PyActorMeshMonitor>()?;
542520
hyperactor_mod.add_class::<MonitoredPythonPortReceiver>()?;
543521
hyperactor_mod.add_class::<MonitoredPythonOncePortReceiver>()?;
544522
hyperactor_mod.add_class::<PyActorSupervisionEvent>()?;

python/monarch/_rust_bindings/monarch_hyperactor/actor_mesh.pyi

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,15 @@ class PythonActorMesh:
8888
"""
8989
...
9090

91-
# TODO(albertli): remove this when pushing all supervision logic to Rust
92-
def monitor(self) -> ActorMeshMonitor:
91+
def supervise_port(self, r: PortReceiver) -> MonitoredPortReceiver:
9392
"""
94-
Returns a supervision monitor for this mesh.
93+
Return a monitored port receiver.
94+
"""
95+
...
96+
97+
def supervise_once_port(self, r: OncePortReceiver) -> MonitoredOncePortReceiver:
98+
"""
99+
Return a monitored once port receiver.
95100
"""
96101
...
97102

@@ -113,31 +118,11 @@ class PythonActorMesh:
113118
"""
114119
...
115120

116-
@final
117-
class ActorMeshMonitor:
118-
def __aiter__(self) -> AsyncIterator["ActorSupervisionEvent"]:
119-
"""
120-
Returns an async iterator for this monitor.
121-
"""
122-
...
123-
124-
async def __anext__(self) -> "ActorSupervisionEvent":
125-
"""
126-
Returns the next proc event in the proc mesh.
127-
"""
128-
...
129-
130121
@final
131122
class MonitoredPortReceiver:
123+
"""A monitored receiver to which PythonMessages are sent. Values
124+
of this type cannot be constructed directly in Python.
132125
"""
133-
A monitored receiver to which PythonMessages are sent.
134-
"""
135-
136-
def __init__(self, receiver: PortReceiver, monitor: ActorMeshMonitor) -> None:
137-
"""
138-
Create a new monitored receiver from a PortReceiver.
139-
"""
140-
...
141126

142127
async def recv(self) -> PythonMessage:
143128
"""Receive a PythonMessage from the port's sender."""
@@ -148,15 +133,9 @@ class MonitoredPortReceiver:
148133

149134
@final
150135
class MonitoredOncePortReceiver:
136+
"""A monitored once receiver to which PythonMessages are sent.
137+
Values of this type cannot be constructed directly in Python.
151138
"""
152-
A variant of monitored PortReceiver that can only receive a single message.
153-
"""
154-
155-
def __init__(self, receiver: OncePortReceiver, monitor: ActorMeshMonitor) -> None:
156-
"""
157-
Create a new monitored receiver from a PortReceiver.
158-
"""
159-
...
160139

161140
async def recv(self) -> PythonMessage:
162141
"""Receive a single PythonMessage from the port's sender."""

0 commit comments

Comments
 (0)