Skip to content

Commit a745394

Browse files
authored
[PROD-1968] Add monitor_once functionality to library (#116)
* Add monitor_once functionality to library * Track in progress cluster object
1 parent 7c71815 commit a745394

File tree

3 files changed

+67
-10
lines changed

3 files changed

+67
-10
lines changed

sync/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""Library for leveraging the power of Sync"""
22

3-
__version__ = "1.7.0"
3+
__version__ = "1.8.0"
44

55
TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

sync/awsdatabricks.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
"get_cluster_report",
6565
"get_all_cluster_events",
6666
"monitor_cluster",
67+
"monitor_once",
6768
"create_cluster",
6869
"get_cluster",
6970
"handle_successful_job_run",
@@ -85,7 +86,6 @@
8586
"apply_project_recommendation",
8687
]
8788

88-
8989
logger = logging.getLogger(__name__)
9090

9191

@@ -268,7 +268,6 @@ def _create_cluster_report(
268268

269269

270270
def _load_aws_cluster_info(cluster: dict) -> Tuple[Response[dict], Response[dict]]:
271-
272271
cluster_info = None
273272
cluster_id = None
274273
cluster_log_dest = _cluster_log_destination(cluster)
@@ -312,7 +311,6 @@ def _load_aws_cluster_info(cluster: dict) -> Tuple[Response[dict], Response[dict
312311

313312

314313
def _get_aws_cluster_info(cluster: dict) -> Tuple[Response[dict], Response[dict], Response[dict]]:
315-
316314
aws_region_name = DB_CONFIG.aws_region_name
317315

318316
cluster_info, cluster_id = _load_aws_cluster_info(cluster)
@@ -394,7 +392,6 @@ def _monitor_cluster(
394392
kill_on_termination: bool = False,
395393
write_function=None,
396394
) -> None:
397-
398395
(log_url, filesystem, bucket, base_prefix) = cluster_log_destination
399396
# If the event log destination is just a *bucket* without any sub-path, then we don't want to include
400397
# a leading `/` in our Prefix (which will make it so that we never actually find the event log), so
@@ -458,6 +455,42 @@ def _monitor_cluster(
458455
sleep(polling_period)
459456

460457

458+
def monitor_once(cluster_id: str, in_progress_cluster: dict = None) -> dict:
    """Take one monitoring snapshot of the EC2 instances backing a Databricks cluster.

    Merges the current EC2 view of the cluster into the state accumulated by
    previous calls, so callers can invoke this repeatedly (e.g. on a timer)
    instead of using the blocking monitor loop.

    :param cluster_id: Databricks cluster ID; instances are located via their
        ``Vendor``/``ClusterId`` EC2 tags (see ``_get_ec2_instances``).
    :param in_progress_cluster: state dict returned by a previous call, with
        optional keys ``all_inst_by_id``, ``active_timelines_by_id``,
        ``retired_timelines`` and ``recorded_volumes_by_id``. Defaults to a
        fresh, empty state.
    :return: updated state dict with the same four keys, suitable to be passed
        back in on the next call.
    """
    # NOTE: use a None sentinel rather than a mutable `{}` default argument,
    # which would be shared across calls.
    if in_progress_cluster is None:
        in_progress_cluster = {}

    # `or` (not a plain default) so that explicit None values in the incoming
    # state are also replaced with fresh containers.
    all_inst_by_id = in_progress_cluster.get("all_inst_by_id") or {}
    active_timelines_by_id = in_progress_cluster.get("active_timelines_by_id") or {}
    retired_timelines = in_progress_cluster.get("retired_timelines") or []
    recorded_volumes_by_id = in_progress_cluster.get("recorded_volumes_by_id") or {}

    aws_region_name = DB_CONFIG.aws_region_name
    ec2 = boto.client("ec2", region_name=aws_region_name)

    current_insts = _get_ec2_instances(cluster_id, ec2)
    recorded_volumes_by_id.update(
        {v["VolumeId"]: v for v in _get_ebs_volumes_for_instances(current_insts, ec2)}
    )

    # Record new (or overwrite) existing instances.
    # Separately record the ids of those that are in the "running" state.
    running_inst_ids = set()
    for inst in current_insts:
        all_inst_by_id[inst["InstanceId"]] = inst
        if inst["State"]["Name"] == "running":
            running_inst_ids.add(inst["InstanceId"])

    # Close out timelines for instances that are no longer running and keep
    # timelines for those that still are.
    active_timelines_by_id, new_retired_timelines = _update_monitored_timelines(
        running_inst_ids, active_timelines_by_id
    )
    retired_timelines.extend(new_retired_timelines)

    return {
        "all_inst_by_id": all_inst_by_id,
        "active_timelines_by_id": active_timelines_by_id,
        "retired_timelines": retired_timelines,
        "recorded_volumes_by_id": recorded_volumes_by_id,
    }
492+
493+
461494
def _define_write_file(file_key, filesystem, bucket, write_function):
462495
if filesystem == "lambda":
463496

@@ -499,7 +532,6 @@ def write_file(body: bytes):
499532

500533

501534
def _get_ec2_instances(cluster_id: str, ec2_client: "botocore.client.ec2") -> List[dict]:
502-
503535
filters = [
504536
{"Name": "tag:Vendor", "Values": ["Databricks"]},
505537
{"Name": "tag:ClusterId", "Values": [cluster_id]},

sync/azuredatabricks.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
"get_access_report",
6464
"run_and_record_job",
6565
"monitor_cluster",
66+
"monitor_once",
6667
"create_cluster",
6768
"get_cluster",
6869
"create_submission_for_run",
@@ -89,7 +90,6 @@
8990
"apply_project_recommendation",
9091
]
9192

92-
9393
logger = logging.getLogger(__name__)
9494

9595

@@ -287,7 +287,6 @@ def _get_cluster_instances(cluster: dict) -> Response[dict]:
287287
# If this cluster does not have the "Sync agent" configured, attempt a best-effort snapshot of the instances that
288288
# are associated with this cluster
289289
if not cluster_instances:
290-
291290
resource_group_name = _get_databricks_resource_group_name()
292291

293292
compute = _get_azure_client(ComputeManagementClient)
@@ -422,6 +421,34 @@ def _monitor_cluster(
422421
sleep(polling_period)
423422

424423

424+
def monitor_once(cluster_id: str, in_progress_cluster: dict = None) -> dict:
    """Take one monitoring snapshot of the Azure VMs backing a Databricks cluster.

    Merges the currently running VMs for the cluster into the state accumulated
    by previous calls, so callers can invoke this repeatedly instead of using
    the blocking monitor loop.

    :param cluster_id: Databricks cluster ID used to locate the cluster's VMs.
    :param in_progress_cluster: state dict returned by a previous call, with
        optional keys ``all_vms_by_id``, ``active_timelines_by_id`` and
        ``retired_timelines``. Defaults to a fresh, empty state.
    :return: updated state dict with the same three keys, suitable to be passed
        back in on the next call.
    """
    # NOTE: use a None sentinel rather than a mutable `{}` default argument,
    # which would be shared across calls.
    if in_progress_cluster is None:
        in_progress_cluster = {}

    # `or` (not a plain default) so that explicit None values in the incoming
    # state are also replaced with fresh containers.
    all_vms_by_id = in_progress_cluster.get("all_vms_by_id") or {}
    active_timelines_by_id = in_progress_cluster.get("active_timelines_by_id") or {}
    retired_timelines = in_progress_cluster.get("retired_timelines") or []

    resource_group_name = _get_databricks_resource_group_name()
    if not resource_group_name:
        # Best-effort: _get_running_vms_by_id accepts an Optional resource
        # group, so we warn and continue rather than fail the snapshot.
        logger.warning("Failed to find Databricks managed resource group")

    compute = _get_azure_client(ComputeManagementClient)

    running_vms_by_id = _get_running_vms_by_id(compute, resource_group_name, cluster_id)

    # Record new (or overwrite) existing VMs, keyed by VM name.
    for vm in running_vms_by_id.values():
        all_vms_by_id[vm["name"]] = vm

    # Close out timelines for VMs that are no longer running and keep
    # timelines for those that still are.
    active_timelines_by_id, new_retired_timelines = _update_monitored_timelines(
        set(running_vms_by_id.keys()), active_timelines_by_id
    )
    retired_timelines.extend(new_retired_timelines)

    return {
        "all_vms_by_id": all_vms_by_id,
        "active_timelines_by_id": active_timelines_by_id,
        "retired_timelines": retired_timelines,
    }
450+
451+
425452
def _define_write_file(file_key, filesystem, write_function):
426453
if filesystem == "lambda":
427454

@@ -469,7 +496,6 @@ def _get_databricks_resource_group_name() -> str:
469496
_azure_credential = None
470497
_azure_subscription_id = None
471498

472-
473499
AzureClient = TypeVar("AzureClient")
474500

475501

@@ -519,7 +545,6 @@ def _get_azure_subscription_id():
519545
def _get_running_vms_by_id(
520546
compute: AzureClient, resource_group_name: Optional[str], cluster_id: str
521547
) -> Dict[str, dict]:
522-
523548
if resource_group_name:
524549
vms = compute.virtual_machines.list(resource_group_name=resource_group_name)
525550
else:

0 commit comments

Comments
 (0)