@@ -2,7 +2,7 @@
import logging
from pathlib import Path
from time import sleep
- from typing import Generator, List, Tuple
+ from typing import Dict, Generator, List, Optional, Tuple
from urllib.parse import urlparse

import boto3 as boto
@@ -84,6 +84,7 @@
    "wait_for_run_and_cluster",
    "terminate_cluster",
    "apply_project_recommendation",
+     "save_cluster_report",
]

logger = logging.getLogger(__name__)
@@ -348,6 +349,62 @@ def _get_aws_cluster_info_from_s3(bucket: str, file_key: str, cluster_id):
        logger.warning(f"Failed to retrieve cluster info from S3 with key, '{file_key}': {err}")


+ def save_cluster_report(
+     cluster_id: str,
+     instance_timelines: List[dict],
+     cluster_log_destination: Optional[Tuple[str, ...]] = None,
+     cluster_report_destination_override: Optional[Dict[str, str]] = None,
+     write_function=None,
+ ) -> bool:
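+     """Write a point-in-time report of this cluster's EC2 instances, EBS volumes, and the
+     supplied instance timelines as JSON to the cluster's log destination (or the override
+     destination). Returns False if the cluster has no Spark context or no destination can
+     be resolved, and True once the report has been written."""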
+     cluster = get_default_client().get_cluster(cluster_id)
+     spark_context_id = cluster.get("spark_context_id")
+
+     if not spark_context_id:
+         return False
+
+     cluster_log_destination = cluster_log_destination or _cluster_log_destination(cluster)
+     (log_url, filesystem, bucket, base_prefix) = cluster_log_destination
+
+     if cluster_report_destination_override:
+         filesystem = cluster_report_destination_override.get("filesystem", filesystem)
+         base_prefix = cluster_report_destination_override.get("base_prefix", base_prefix)
+
+     if not log_url and not cluster_report_destination_override:
+         logger.warning(
+             "Unable to save cluster report due to missing cluster log destination - exiting"
+         )
+         return False
+
+     # If the event log destination is just a *bucket* without any sub-path, then we don't want to include
+     # a leading `/` in our Prefix (which will make it so that we never actually find the event log), so
+     # we make sure to re-strip our final Prefix
+     file_key = f"{base_prefix}/sync_data/{spark_context_id}/aws_cluster_info.json".strip("/")
+
+     aws_region_name = DB_CONFIG.aws_region_name
+     ec2 = boto.client("ec2", region_name=aws_region_name)
+
+     current_insts = _get_ec2_instances(cluster_id, ec2)
+     ebs_volumes = _get_ebs_volumes_for_instances(current_insts, ec2)
+
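+     # Resolve the write callable for the destination (file_key on the given filesystem/bucket,
+     # or the caller-supplied write_function)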
+     write_file = _define_write_file(file_key, filesystem, bucket, write_function)
+
+     write_file(
+         bytes(
+             json.dumps(
+                 {
+                     "instances": current_insts,
+                     "instance_timelines": instance_timelines,
+                     "volumes": ebs_volumes,
+                 },
+                 cls=DefaultDateTimeEncoder,
+             ),
+             "utf-8",
+         )
+     )
+
+     return True
+
+
def monitor_cluster(
    cluster_id: str,
    polling_period: int = 20,
@@ -375,7 +432,6 @@ def monitor_cluster(
        _monitor_cluster(
            (log_url, filesystem, bucket, base_prefix),
            cluster_id,
-             spark_context_id,
            polling_period,
            kill_on_termination,
            write_function,
@@ -387,40 +443,26 @@ def monitor_cluster(
def _monitor_cluster(
    cluster_log_destination,
    cluster_id: str,
-     spark_context_id: int,
    polling_period: int,
    kill_on_termination: bool = False,
    write_function=None,
) -> None:
-     (log_url, filesystem, bucket, base_prefix) = cluster_log_destination
-     # If the event log destination is just a *bucket* without any sub-path, then we don't want to include
-     # a leading `/` in our Prefix (which will make it so that we never actually find the event log), so
-     # we make sure to re-strip our final Prefix
-     file_key = f"{base_prefix}/sync_data/{spark_context_id}/aws_cluster_info.json".strip("/")
-
    aws_region_name = DB_CONFIG.aws_region_name
    ec2 = boto.client("ec2", region_name=aws_region_name)

-     write_file = _define_write_file(file_key, filesystem, bucket, write_function)
-
-     all_inst_by_id = {}
    active_timelines_by_id = {}
    retired_timelines = []
-     recorded_volumes_by_id = {}

    while_condition = True
+
    while while_condition:
        try:
            current_insts = _get_ec2_instances(cluster_id, ec2)
-             recorded_volumes_by_id.update(
-                 {v["VolumeId"]: v for v in _get_ebs_volumes_for_instances(current_insts, ec2)}
-             )

            # Record new (or overwrite) existing instances.
            # Separately record the ids of those that are in the "running" state.
            running_inst_ids = set({})
            for inst in current_insts:
-                 all_inst_by_id[inst["InstanceId"]] = inst
                if inst["State"]["Name"] == "running":
                    running_inst_ids.add(inst["InstanceId"])

@@ -431,24 +473,19 @@ def _monitor_cluster(
            retired_timelines.extend(new_retired_timelines)
            all_timelines = retired_timelines + list(active_timelines_by_id.values())

-             write_file(
-                 bytes(
-                     json.dumps(
-                         {
-                             "instances": list(all_inst_by_id.values()),
-                             "instance_timelines": all_timelines,
-                             "volumes": list(recorded_volumes_by_id.values()),
-                         },
-                         cls=DefaultDateTimeEncoder,
-                     ),
-                     "utf-8",
-                 )
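+             # Persist a fresh report on every polling cycle; save_cluster_report re-queries the
+             # instances and volumes itself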
+             save_cluster_report(
+                 cluster_id,
+                 all_timelines,
+                 cluster_log_destination=cluster_log_destination,
+                 write_function=write_function,
            )

            if kill_on_termination:
                cluster_state = get_default_client().get_cluster(cluster_id).get("state")
+
                if cluster_state == "TERMINATED":
                    while_condition = False
+
        except Exception as e:
            logger.error(f"Exception encountered while polling cluster: {e}")
