Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4b0e884
First round of changes for Polaris. Needs debugging and testing.
rickybalin Oct 28, 2022
012ecf0
Fixed bugs. Clustered and co-located DB tests run on Polaris.
rickybalin Oct 28, 2022
b0ecaca
Added cpu binding options with mpiexec
rickybalin Nov 2, 2022
b91c550
Merge remote-tracking branch 'upstream/develop' into develop
rickybalin Dec 21, 2022
5d2f82d
Correction to flag setting number of tasks for PalsMpiexecSettings
rickybalin Dec 21, 2022
dd67114
Removed mpiexecStep.py, no longer needed after merge with upstream Sm…
rickybalin Dec 21, 2022
a11464b
Merged with SmartSim upstream branch
rickybalin May 3, 2023
52ff300
Added option to specify affinity script to PALS mpiexec settings. Nee…
rickybalin May 3, 2023
f024ec4
Modified affinity script setting to include optional arguments
rickybalin Aug 30, 2023
5f90163
Merge branch 'develop' into develop
rickybalin Oct 12, 2023
f0fcf5c
Updated affinity script changes to have type defs and hints
rickybalin Oct 16, 2023
315009d
Added test for Pals affinity script option
rickybalin Oct 18, 2023
9f81b95
Merged with SmartSim develop official
Feb 21, 2024
8df7ead
Modified buildenv.py to take my fork of RedisAI which updates to C++ …
Feb 21, 2024
11bfb3d
Merge branch 'CrayLabs:develop' into develop
rickybalin Apr 26, 2024
6e56a70
Synced with SmartSim develop branch
rickybalin Jun 10, 2024
896a805
Merge pull request #1 from rickybalin/develop_full_sync
rickybalin Jun 10, 2024
6217a26
Added feature to pals settings to add any mpiexec argument
Oct 21, 2024
69d2ef2
Add a monitor flag to experiment start so can select which jobs to mo…
Feb 28, 2025
d981269
Clean up
Jul 2, 2025
f477030
Update docstrings and add test for set_launcher_args() in PALS settings
Jul 2, 2025
6327338
Fix type
Jul 2, 2025
7426dbc
Fix typo
Jul 2, 2025
058d0aa
Fix line length error
Jul 2, 2025
93d79d9
Formatting changes from make style
Jul 2, 2025
683d733
Make style
al-rigazzi Jul 4, 2025
930ca5a
Merge remote-tracking branch 'origin/develop' into pr/rickybalin/788
al-rigazzi Jul 4, 2025
b01cc78
Merge branch 'CrayLabs:develop' into feature/monitor_model
rickybalin Oct 9, 2025
55a02c0
Update changelog.md
rickybalin Oct 9, 2025
04a78c6
Add the new monitor parameter to the docstring of experiment.start()
rickybalin Oct 9, 2025
292a529
Fix format
rickybalin Oct 9, 2025
9826d08
Add monitor argument to start_wo_job_manager()
rickybalin Oct 9, 2025
c2c645b
Add monitor argument to launch_step_nop
rickybalin Oct 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ Description

Detailed Notes

- Enable control over monitoring of Models launched with `experiment.start()` by
adding an optional boolean argument determining whether to monitor the particular
model or not. The argument is set to True by default, so no changes are needed for
the default behavior of monitoring all Models launched.
([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788))
- Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files
including Python source files, configuration files, documentation, tests, Docker files,
shell scripts, and other supporting files to reflect the new year.
Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/_cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_install(

@contextlib.contextmanager
def _env_vars_set_to(
evars: t.Mapping[str, t.Optional[str]]
evars: t.Mapping[str, t.Optional[str]],
) -> t.Generator[None, None, None]:
envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items())
for var, _, tmpval in envvars:
Expand Down
14 changes: 9 additions & 5 deletions smartsim/_core/control/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def start(
manifest: Manifest,
block: bool = True,
kill_on_interrupt: bool = True,
monitor: bool = True,
) -> None:
"""Start the passed SmartSim entities

Expand All @@ -134,7 +135,7 @@ def start(
SignalInterceptionStack.get(signal.SIGINT).push_unique(
self._jobs.signal_interrupt
)
launched = self._launch(exp_name, exp_path, manifest)
launched = self._launch(exp_name, exp_path, manifest, monitor)

# start the job manager thread if not already started
if not self._jobs.actively_monitoring:
Expand Down Expand Up @@ -172,7 +173,7 @@ def poll(
:param kill_on_interrupt: flag for killing jobs when SIGINT is received
"""
self._jobs.kill_on_interrupt = kill_on_interrupt
to_monitor = self._jobs.jobs
to_monitor = self._jobs.monitor_jobs
while len(to_monitor) > 0:
time.sleep(interval)

Expand Down Expand Up @@ -388,7 +389,7 @@ def symlink_output_files(
)

def _launch(
self, exp_name: str, exp_path: str, manifest: Manifest
self, exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True
) -> LaunchedManifest[t.Tuple[str, Step]]:
"""Main launching function of the controller

Expand All @@ -398,6 +399,7 @@ def _launch(
:param exp_name: The name of the launching experiment
:param exp_path: path to location of ``Experiment`` directory if generated
:param manifest: Manifest of deployables to launch
:param monitor: boolean to signal whether to monitor deployables
"""

manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]](
Expand Down Expand Up @@ -479,7 +481,7 @@ def _launch(

# launch and symlink steps
for step, entity in steps:
self._launch_step(step, entity)
self._launch_step(step, entity, monitor)
self.symlink_output_files(step, entity)

# symlink substeps to maintain directory structure
Expand Down Expand Up @@ -570,11 +572,13 @@ def _launch_step(
self,
job_step: Step,
entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]],
monitor: bool = True,
) -> None:
"""Use the launcher to launch a job step

:param job_step: a job step instance
:param entity: entity instance
:param monitor: boolean determining whether to monitor job
:raises SmartSimError: if launch fails
"""
# attempt to retrieve entity name in JobManager.completed
Expand Down Expand Up @@ -622,7 +626,7 @@ def _launch_step(
self._jobs.restart_job(job_step.name, job_id, entity.name, is_task)
else:
logger.debug(f"Launching {entity.name}")
self._jobs.add_job(job_step.name, job_id, entity, is_task)
self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor)

def _create_batch_job_step(
self,
Expand Down
7 changes: 7 additions & 0 deletions smartsim/_core/control/jobmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None:

# active jobs
self.jobs: t.Dict[str, Job] = {}
self.monitor_jobs: t.Dict[str, Job] = {}
self.db_jobs: t.Dict[str, Job] = {}

# completed jobs
Expand Down Expand Up @@ -133,6 +134,8 @@ def move_to_completed(self, job: Job) -> None:
del self.db_jobs[job.ename]
elif job.ename in self.jobs:
del self.jobs[job.ename]
if job.ename in self.monitor_jobs:
del self.monitor_jobs[job.ename]

def __getitem__(self, entity_name: str) -> Job:
"""Return the job associated with the name of the entity
Expand Down Expand Up @@ -166,12 +169,14 @@ def add_job(
job_id: t.Optional[str],
entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity],
is_task: bool = True,
monitor: bool = True,
) -> None:
"""Add a job to the job manager which holds specific jobs by type.

:param job_name: name of the job step
:param job_id: job step id created by launcher
:param entity: entity that was launched on job step
:param monitor: if True, also register the job in ``monitor_jobs``
:param is_task: process monitored by TaskManager (True) or the WLM (False)
"""
launcher = str(self._launcher)
Expand All @@ -183,6 +188,8 @@ def add_job(
self.db_jobs[entity.name] = job
else:
self.jobs[entity.name] = job
if monitor:
self.monitor_jobs[entity.name] = job

def is_finished(self, entity: SmartSimEntity) -> bool:
"""Detect if a job has completed
Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/control/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def finalize(self) -> LaunchedManifest[_T]:


def _format_exp_telemetry_path(
exp_path: t.Union[str, "os.PathLike[str]"]
exp_path: t.Union[str, "os.PathLike[str]"],
) -> pathlib.Path:
return pathlib.Path(exp_path, CONFIG.telemetry_subdir)

Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/entrypoints/telemetrymonitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@


def register_signal_handlers(
handle_signal: t.Callable[[int, t.Optional[FrameType]], None]
handle_signal: t.Callable[[int, t.Optional[FrameType]], None],
) -> None:
"""Register a signal handling function for all termination events

Expand Down
7 changes: 7 additions & 0 deletions smartsim/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def start(
block: bool = True,
summary: bool = False,
kill_on_interrupt: bool = True,
monitor: bool = True,
) -> None:
"""Start passed instances using Experiment launcher

Expand Down Expand Up @@ -229,11 +230,16 @@ def start(
that all jobs launched by this experiment will be killed, and the
zombie processes will need to be manually killed.

If `monitor=True`, all the jobs being started will be monitored
by the Controller. If `monitor=False`, the jobs will not be
monitored, meaning that their status will not be reported.

:param block: block execution until all non-database
jobs are finished
:param summary: print a launch summary prior to launch
:param kill_on_interrupt: flag for killing jobs when ^C (SIGINT)
signal is received.
:param monitor: monitor the jobs being started
"""
start_manifest = Manifest(*args)
self._create_entity_dir(start_manifest)
Expand All @@ -246,6 +252,7 @@ def start(
manifest=start_manifest,
block=block,
kill_on_interrupt=kill_on_interrupt,
monitor=monitor,
)
except SmartSimError as e:
logger.error(e)
Expand Down
10 changes: 10 additions & 0 deletions smartsim/settings/palsSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,16 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
)
self.run_args["transfer"] = None

def set_launcher_args(
    self, arguments: t.Dict[str, t.Union[int, str, float, None]]
) -> None:
    """Set arbitrary launcher arguments that have no dedicated setter

    Every name/value pair in ``arguments`` is stored in ``self.run_args``.
    An entry already present under the same name is overwritten; entries
    not named in ``arguments`` are left untouched.

    Example: ``settings.set_launcher_args({"mem-bind": "none"})``

    :param arguments: dictionary mapping launcher argument names to values
    """
    # NOTE(review): assumes ``run_args`` is a plain mutable mapping (dict),
    # as the item assignment in the previous implementation implied.
    self.run_args.update(arguments)

def set_walltime(self, walltime: str) -> None:
"""Set the maximum number of seconds that a job will run

Expand Down
10 changes: 8 additions & 2 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,18 @@ def _monkeypatch_exp_controller(exp):
entity_steps = []

def start_wo_job_manager(
self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True
self,
exp_name,
exp_path,
manifest,
block=True,
kill_on_interrupt=True,
monitor=True,
):
self._launch(exp_name, exp_path, manifest)
return LaunchedManifestBuilder("name", "path", "launcher").finalize()

def launch_step_nop(self, step, entity):
def launch_step_nop(self, step, entity, monitor):
entity_steps.append((step, entity))

monkeypatch.setattr(
Expand Down
6 changes: 6 additions & 0 deletions tests/test_pals_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ def turn_off_telemetry_indirect(monkeypatch):
# func(None)


def test_set_launcher_args():
    """Arguments passed to ``set_launcher_args`` show up in the formatted run args"""
    pals_settings = PalsMpiexecSettings(default_exe, **default_kwargs)
    extra_args = {"mem-bind": "none", "line-buffer": ""}
    pals_settings.set_launcher_args(extra_args)

    # An empty-string value is expected to render as a bare flag
    expected = ["--mem-bind", "none", "--line-buffer"]
    assert pals_settings.format_run_args() == expected


def test_affinity_script():
settings = PalsMpiexecSettings(default_exe, **default_kwargs)
settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)
Expand Down
Loading