
Commit b8475c0

[PROD-1044] Refactor and CLI update for easier Airflow integration (#32)
1 parent: 60803c3

3 files changed: 122 additions & 56 deletions

sync/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 """Library for leveraging the power of Sync"""
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 
 TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

sync/awsdatabricks.py

Lines changed: 82 additions & 33 deletions
@@ -482,33 +482,19 @@ def get_prediction_job(
     :return: job object with prediction applied to it
     :rtype: Response[dict]
     """
-    prediction_response = get_prediction(prediction_id)
-    prediction = prediction_response.result
-    if prediction:
-        job = get_default_client().get_job(job_id)
-        if "error_code" in job:
-            return Response(error=DatabricksAPIError(**job))
-
-        job_settings = job["settings"]
-        tasks = job_settings.get("tasks", [])
-        if tasks:
-            cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", []))
-            cluster = cluster_response.result
-            if cluster:
-                # num_workers/autoscale are mutually exclusive settings, and we are relying on our Prediction
-                # Recommendations to set these appropriately. Since we may recommend a Static cluster (i.e. a cluster
-                # with `num_workers`) for a cluster that was originally autoscaled, we want to make sure to remove this
-                # prior configuration
-                if "num_workers" in cluster:
-                    del cluster["num_workers"]
-
-                if "autoscale" in cluster:
-                    del cluster["autoscale"]
-
-                prediction_cluster = _deep_update(
-                    cluster, prediction["solutions"][preference]["configuration"]
-                )
+    job = get_default_client().get_job(job_id)
+    if "error_code" in job:
+        return Response(error=DatabricksAPIError(**job))
 
+    job_settings = job["settings"]
+    tasks = job_settings.get("tasks", [])
+    if tasks:
+        cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", []))
+        cluster = cluster_response.result
+        if cluster:
+            prediction_cluster_response = get_prediction_cluster(cluster, prediction_id, preference)
+            prediction_cluster = prediction_cluster_response.result
+            if prediction_cluster:
                 cluster_key = tasks[0].get("job_cluster_key")
                 if cluster_key:
                     job_settings["job_clusters"] = [
@@ -517,10 +503,51 @@ def get_prediction_job(
                         if j.get("job_cluster_key") != cluster_key
                     ] + [{"job_cluster_key": cluster_key, "new_cluster": prediction_cluster}]
                 else:
+                    # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name,
+                    # and will reject any run submissions where the `cluster_name` is pre-populated
+                    if "cluster_name" in prediction_cluster:
+                        del prediction_cluster["cluster_name"]
                     tasks[0]["new_cluster"] = prediction_cluster
                 return Response(result=job)
-        return cluster_response
-    return Response(error=DatabricksError(message="No task found in job"))
+            return prediction_cluster_response
+        return cluster_response
+    return Response(error=DatabricksError(message="No task found in job"))
+
+
+def get_prediction_cluster(
+    cluster: dict, prediction_id: str, preference: str = CONFIG.default_prediction_preference.value
+) -> Response[dict]:
+    """Apply the prediction to the provided cluster.
+
+    The cluster is updated with configuration from the prediction and returned in the result.
+
+    :param cluster: Databricks cluster object
+    :type cluster: dict
+    :param prediction_id: prediction ID
+    :type prediction_id: str
+    :param preference: preferred prediction solution, defaults to local configuration
+    :type preference: str, optional
+    :return: cluster object with prediction applied to it
+    :rtype: Response[dict]
+    """
+    prediction_response = get_prediction(prediction_id)
+    prediction = prediction_response.result
+    if prediction:
+        # num_workers/autoscale are mutually exclusive settings, and we are relying on our Prediction
+        # Recommendations to set these appropriately. Since we may recommend a Static cluster (i.e. a cluster
+        # with `num_workers`) for a cluster that was originally autoscaled, we want to make sure to remove this
+        # prior configuration
+        if "num_workers" in cluster:
+            del cluster["num_workers"]
+
+        if "autoscale" in cluster:
+            del cluster["autoscale"]
+
+        prediction_cluster = _deep_update(
+            cluster, prediction["solutions"][preference]["configuration"]
+        )
+
+        return Response(result=prediction_cluster)
     return prediction_response
 
 
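Pulling this logic out into `get_prediction_cluster` means a caller that already has a cluster definition, e.g. an Airflow task building a one-off run submission, can apply a prediction without looking up a Databricks job first. A minimal usage sketch, with a placeholder cluster definition and a hypothetical prediction ID:

from sync.awsdatabricks import get_prediction_cluster

cluster = {
    "spark_version": "11.3.x-scala2.12",  # placeholder cluster definition
    "node_type_id": "i3.xlarge",
    "autoscale": {"min_workers": 2, "max_workers": 8},
}

# Returns Response[dict]: `result` is the cluster with the recommended configuration
# merged in (any prior num_workers/autoscale removed first); `error` otherwise.
response = get_prediction_cluster(cluster, "<prediction-id>")
if response.result:
    print(response.result)
else:
    print(response.error)
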
@@ -550,10 +577,9 @@ def get_project_job(job_id: str, project_id: str, region_name: str = None) -> Re
         cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", []))
         cluster = cluster_response.result
         if cluster:
-            project_settings_response = get_project_cluster_settings(project_id, region_name)
-            project_cluster_settings = project_settings_response.result
-            if project_cluster_settings:
-                project_cluster = _deep_update(cluster, project_cluster_settings)
+            project_cluster_response = get_project_cluster(cluster, project_id, region_name)
+            project_cluster = project_cluster_response.result
+            if project_cluster:
                 cluster_key = tasks[0].get("job_cluster_key")
                 if cluster_key:
                     job_settings["job_clusters"] = [
@@ -565,11 +591,34 @@ def get_project_job(job_id: str, project_id: str, region_name: str = None) -> Re
                     tasks[0]["new_cluster"] = project_cluster
 
                 return Response(result=job)
-            return project_settings_response
+            return project_cluster_response
         return cluster_response
     return Response(error=DatabricksError(message="No task found in job"))
 
 
+def get_project_cluster(cluster: dict, project_id: str, region_name: str = None) -> Response[dict]:
+    """Apply project configuration to a cluster.
+
+    The cluster is updated with tags and a log configuration to facilitate project continuity.
+
+    :param cluster: Databricks cluster object
+    :type cluster: dict
+    :param project_id: Sync project ID
+    :type project_id: str
+    :param region_name: region name, defaults to AWS configuration
+    :type region_name: str, optional
+    :return: cluster object with project configuration applied
+    :rtype: Response[dict]
+    """
+    project_settings_response = get_project_cluster_settings(project_id, region_name)
+    project_cluster_settings = project_settings_response.result
+    if project_cluster_settings:
+        project_cluster = _deep_update(cluster, project_cluster_settings)
+
+        return Response(result=project_cluster)
+    return project_settings_response
+
+
 def get_project_cluster_settings(project_id: str, region_name: str = None) -> Response[dict]:
     """Gets cluster configuration for a project.
 
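`get_project_cluster` is split out the same way, so project tags and log configuration can be applied to a standalone cluster definition. A sketch under the same assumptions (hypothetical project ID; `region_name` falls back to the AWS configuration when omitted):

from sync.awsdatabricks import get_project_cluster

cluster = {"node_type_id": "i3.xlarge", "num_workers": 4}  # placeholder definition

response = get_project_cluster(cluster, "<project-id>")
if response.result:
    cluster = response.result  # updated with the project's tags and log configuration
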
sync/cli/__init__.py

Lines changed: 39 additions & 22 deletions
@@ -33,40 +33,57 @@ def main(debug: bool):
 
 
 @main.command
-def configure():
+@click.option("--api-key-id")
+@click.option("--api-key-secret")
+@click.option("--prediction-preference")
+@click.option("--databricks-host")
+@click.option("--databricks-token")
+@click.option("--databricks-region")
+def configure(
+    api_key_id: str = None,
+    api_key_secret: str = None,
+    prediction_preference: str = None,
+    databricks_host: str = None,
+    databricks_token: str = None,
+    databricks_region: str = None,
+):
     """Configure Sync Library"""
-    api_key_id = click.prompt("Sync API key ID", default=API_KEY.id if API_KEY else None)
-    api_key_secret = click.prompt(
+    api_key_id = api_key_id or click.prompt(
+        "Sync API key ID", default=API_KEY.id if API_KEY else None
+    )
+    api_key_secret = api_key_secret or click.prompt(
         "Sync API key secret",
         default=API_KEY.secret if API_KEY else None,
         hide_input=True,
         show_default=False,
     )
 
-    prediction_preference = click.prompt(
+    prediction_preference = prediction_preference or click.prompt(
         "Default prediction preference",
         type=click.Choice([p.value for p in Preference]),
         default=(CONFIG.default_prediction_preference or Preference.ECONOMY).value,
     )
 
-    dbx_host = OPTIONAL_DEFAULT
-    dbx_token = OPTIONAL_DEFAULT
-    dbx_region = OPTIONAL_DEFAULT
-    if click.confirm("Would you like to configure a Databricks workspace?"):
-        dbx_host = click.prompt(
-            "Databricks host (prefix with https://)",
-            default=DB_CONFIG.host if DB_CONFIG else OPTIONAL_DEFAULT,
-        )
-        dbx_token = click.prompt(
-            "Databricks token",
-            default=DB_CONFIG.token if DB_CONFIG else OPTIONAL_DEFAULT,
-            hide_input=True,
-            show_default=False,
-        )
-        dbx_region = click.prompt(
-            "Databricks AWS region name",
-            default=DB_CONFIG.aws_region_name if DB_CONFIG else OPTIONAL_DEFAULT,
-        )
+    dbx_host = databricks_host or OPTIONAL_DEFAULT
+    dbx_token = databricks_token or OPTIONAL_DEFAULT
+    dbx_region = databricks_region or OPTIONAL_DEFAULT
+    # Prompt only if any value is missing, since all are required to initialize the configuration below
+    if any(param == OPTIONAL_DEFAULT for param in (dbx_host, dbx_token, dbx_region)):
+        if click.confirm("Would you like to configure a Databricks workspace?"):
+            dbx_host = click.prompt(
+                "Databricks host (prefix with https://)",
+                default=DB_CONFIG.host if DB_CONFIG else OPTIONAL_DEFAULT,
+            )
+            dbx_token = click.prompt(
+                "Databricks token",
+                default=DB_CONFIG.token if DB_CONFIG else OPTIONAL_DEFAULT,
+                hide_input=True,
+                show_default=False,
+            )
+            dbx_region = click.prompt(
+                "Databricks AWS region name",
+                default=DB_CONFIG.aws_region_name if DB_CONFIG else OPTIONAL_DEFAULT,
+            )
 
     init(
         APIKey(api_key_id=api_key_id, api_key_secret=api_key_secret),
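With every option supplied on the command line, `configure` now runs without prompting, which is what a non-interactive environment like an Airflow container needs. A minimal sketch using click's test runner; the credential values are placeholders, and `economy` is assumed to be a valid `Preference` value:

from click.testing import CliRunner

from sync.cli import configure

runner = CliRunner()
result = runner.invoke(
    configure,
    [
        "--api-key-id", "<key-id>",
        "--api-key-secret", "<key-secret>",
        "--prediction-preference", "economy",
        "--databricks-host", "https://dbc-example.cloud.databricks.com",
        "--databricks-token", "<token>",
        "--databricks-region", "us-east-1",
    ],
)
assert result.exit_code == 0  # every prompt is skipped because every value was provided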
