CrayLabs · rickybalin · Oct 28, 2022 · Oct 28, 2022 · Nov 2, 2022 · Dec 21, 2022
diff --git a/doc/changelog.md b/doc/changelog.md
@@ -23,6 +23,11 @@ Description
 
 Detailed Notes
 
+- Enable control over monitoring of Models launched with `experiment.start()` by
+  adding an optional boolean `monitor` argument that determines whether the launched
+  Models are monitored. The argument defaults to True, so no changes are needed to
+  keep the default behavior of monitoring all Models launched.
+  ([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788))
 - Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files
   including Python source files, configuration files, documentation, tests, Docker files,
   shell scripts, and other supporting files to reflect the new year.

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
@@ -171,7 +171,7 @@ def test_install(
 
 @contextlib.contextmanager
 def _env_vars_set_to(
-    evars: t.Mapping[str, t.Optional[str]]
+    evars: t.Mapping[str, t.Optional[str]],
 ) -> t.Generator[None, None, None]:
     envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items())
     for var, _, tmpval in envvars:

diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py
@@ -115,6 +115,7 @@ def start(
         manifest: Manifest,
         block: bool = True,
         kill_on_interrupt: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Start the passed SmartSim entities
 
@@ -134,7 +135,7 @@ def start(
         SignalInterceptionStack.get(signal.SIGINT).push_unique(
             self._jobs.signal_interrupt
         )
-        launched = self._launch(exp_name, exp_path, manifest)
+        launched = self._launch(exp_name, exp_path, manifest, monitor)
 
         # start the job manager thread if not already started
         if not self._jobs.actively_monitoring:
@@ -172,7 +173,7 @@ def poll(
         :param kill_on_interrupt: flag for killing jobs when SIGINT is received
         """
         self._jobs.kill_on_interrupt = kill_on_interrupt
-        to_monitor = self._jobs.jobs
+        to_monitor = self._jobs.monitor_jobs
         while len(to_monitor) > 0:
             time.sleep(interval)
 
@@ -388,7 +389,7 @@ def symlink_output_files(
             )
 
     def _launch(
-        self, exp_name: str, exp_path: str, manifest: Manifest
+        self, exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True
     ) -> LaunchedManifest[t.Tuple[str, Step]]:
         """Main launching function of the controller
 
@@ -398,6 +399,7 @@ def _launch(
         :param exp_name: The name of the launching experiment
         :param exp_path: path to location of ``Experiment`` directory if generated
         :param manifest: Manifest of deployables to launch
+        :param monitor: boolean to signal whether to monitor deployables
         """
 
         manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]](
@@ -479,7 +481,7 @@ def _launch(
 
         # launch and symlink steps
         for step, entity in steps:
-            self._launch_step(step, entity)
+            self._launch_step(step, entity, monitor)
             self.symlink_output_files(step, entity)
 
         # symlink substeps to maintain directory structure
@@ -570,11 +572,13 @@ def _launch_step(
         self,
         job_step: Step,
         entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]],
+        monitor: bool = True,
     ) -> None:
         """Use the launcher to launch a job step
 
         :param job_step: a job step instance
         :param entity: entity instance
+        :param monitor: boolean determining whether to monitor job
         :raises SmartSimError: if launch fails
         """
         # attempt to retrieve entity name in JobManager.completed
@@ -622,7 +626,7 @@ def _launch_step(
             self._jobs.restart_job(job_step.name, job_id, entity.name, is_task)
         else:
             logger.debug(f"Launching {entity.name}")
-            self._jobs.add_job(job_step.name, job_id, entity, is_task)
+            self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor)
 
     def _create_batch_job_step(
         self,

diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py
@@ -66,6 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None:
 
         # active jobs
         self.jobs: t.Dict[str, Job] = {}
+        self.monitor_jobs: t.Dict[str, Job] = {}
         self.db_jobs: t.Dict[str, Job] = {}
 
         # completed jobs
@@ -133,6 +134,8 @@ def move_to_completed(self, job: Job) -> None:
                 del self.db_jobs[job.ename]
             elif job.ename in self.jobs:
                 del self.jobs[job.ename]
+                if job.ename in self.monitor_jobs:
+                    del self.monitor_jobs[job.ename]
 
     def __getitem__(self, entity_name: str) -> Job:
         """Return the job associated with the name of the entity
@@ -166,12 +169,14 @@ def add_job(
         job_id: t.Optional[str],
         entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity],
         is_task: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Add a job to the job manager which holds specific jobs by type.
 
         :param job_name: name of the job step
         :param job_id: job step id created by launcher
         :param entity: entity that was launched on job step
+        :param monitor: whether to also track the job in ``monitor_jobs``
         :param is_task: process monitored by TaskManager (True) or the WLM (True)
         """
         launcher = str(self._launcher)
@@ -183,6 +188,8 @@ def add_job(
             self.db_jobs[entity.name] = job
         else:
             self.jobs[entity.name] = job
+            if monitor:
+                self.monitor_jobs[entity.name] = job
 
     def is_finished(self, entity: SmartSimEntity) -> bool:
         """Detect if a job has completed

diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py
@@ -310,7 +310,7 @@ def finalize(self) -> LaunchedManifest[_T]:
 
 
 def _format_exp_telemetry_path(
-    exp_path: t.Union[str, "os.PathLike[str]"]
+    exp_path: t.Union[str, "os.PathLike[str]"],
 ) -> pathlib.Path:
     return pathlib.Path(exp_path, CONFIG.telemetry_subdir)
 

diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py
@@ -49,7 +49,7 @@
 
 
 def register_signal_handlers(
-    handle_signal: t.Callable[[int, t.Optional[FrameType]], None]
+    handle_signal: t.Callable[[int, t.Optional[FrameType]], None],
 ) -> None:
     """Register a signal handling function for all termination events
 

diff --git a/smartsim/experiment.py b/smartsim/experiment.py
@@ -189,6 +189,7 @@ def start(
         block: bool = True,
         summary: bool = False,
         kill_on_interrupt: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Start passed instances using Experiment launcher
 
@@ -229,11 +230,16 @@ def start(
         that all jobs launched by this experiment will be killed, and the
         zombie processes will need to be manually killed.
 
+        If `monitor=True`, all the jobs being started will be monitored
+        by the Controller. If `monitor=False`, the jobs will not be
+        monitored, meaning that their status will not be reported.
+
         :param block: block execution until all non-database
                        jobs are finished
         :param summary: print a launch summary prior to launch
         :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT)
                                   signal is received.
+        :param monitor: monitor the jobs being started
         """
         start_manifest = Manifest(*args)
         self._create_entity_dir(start_manifest)
@@ -246,6 +252,7 @@ def start(
                 manifest=start_manifest,
                 block=block,
                 kill_on_interrupt=kill_on_interrupt,
+                monitor=monitor,
             )
         except SmartSimError as e:
             logger.error(e)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
@@ -158,6 +158,16 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
             )
         self.run_args["transfer"] = None
 
+    def set_launcher_args(
+        self, arguments: t.Dict[str, t.Union[int, str, float, None]]
+    ) -> None:
+        """Set any other task launcher argument
+
+        :param arguments: dictionary with string name and value
+        """
+        for name, value in arguments.items():
+            self.run_args[name] = value
+
     def set_walltime(self, walltime: str) -> None:
         """Set the maximum number of seconds that a job will run
 

diff --git a/tests/test_model.py b/tests/test_model.py
@@ -94,12 +94,18 @@ def _monkeypatch_exp_controller(exp):
         entity_steps = []
 
         def start_wo_job_manager(
-            self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True
+            self,
+            exp_name,
+            exp_path,
+            manifest,
+            block=True,
+            kill_on_interrupt=True,
+            monitor=True,
         ):
             self._launch(exp_name, exp_path, manifest)
             return LaunchedManifestBuilder("name", "path", "launcher").finalize()
 
-        def launch_step_nop(self, step, entity):
+        def launch_step_nop(self, step, entity, monitor):
             entity_steps.append((step, entity))
 
         monkeypatch.setattr(

diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
@@ -67,6 +67,12 @@ def turn_off_telemetry_indirect(monkeypatch):
 #        func(None)
 
 
+def test_set_launcher_args():
+    settings = PalsMpiexecSettings(default_exe, **default_kwargs)
+    settings.set_launcher_args({"mem-bind": "none", "line-buffer": ""})
+    assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"]
+
+
 def test_affinity_script():
     settings = PalsMpiexecSettings(default_exe, **default_kwargs)
     settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)