Skip to content

Commit ff5fa60

Browse files
authored
[PROD-1291] Apply prediction to existing job (#49)
1 parent ed81ffe commit ff5fa60

File tree

10 files changed

+295
-26
lines changed

10 files changed

+295
-26
lines changed

sync/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Library for leveraging the power of Sync"""
2-
__version__ = "0.4.5"
2+
__version__ = "0.4.6"
33

44
TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

sync/_databricks.py

Lines changed: 210 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414

1515
import boto3 as boto
1616

17-
from sync.api.predictions import create_prediction_with_eventlog_bytes, get_prediction
17+
from sync.api.predictions import (
18+
create_prediction_with_eventlog_bytes,
19+
get_prediction,
20+
get_predictions,
21+
)
1822
from sync.api.projects import (
1923
create_project_submission_with_eventlog_bytes,
2024
get_project,
@@ -153,22 +157,22 @@ def create_prediction_for_run(
153157

154158
project_cluster_tasks = _get_project_cluster_tasks(run, exclude_tasks)
155159

156-
cluster_id = None
160+
cluster_tasks = None
157161
if project_id:
158-
if project_id in project_cluster_tasks:
159-
cluster_id, tasks = project_cluster_tasks.get(project_id)
162+
cluster_tasks = project_cluster_tasks.get(project_id)
160163

161-
if not cluster_id:
162-
if None in project_cluster_tasks and len(project_cluster_tasks) == 1:
163-
# If there's only 1 cluster and it's not tagged with project ID assume that's the one for the project
164-
cluster_id, tasks = project_cluster_tasks.get(None)
165-
166-
if not cluster_id:
167-
return Response(
168-
error=DatabricksError(
169-
message=f"No cluster found in run {run_id} for project {project_id}"
164+
if not cluster_tasks:
165+
if len(project_cluster_tasks) == 1:
166+
# If there's only 1 cluster assume that's the one for the project
167+
cluster_tasks = next(iter(project_cluster_tasks.values()))
168+
else:
169+
return Response(
170+
error=DatabricksError(
171+
message=f"No cluster found in run {run_id} for project {project_id}"
172+
)
170173
)
171-
)
174+
175+
cluster_id, tasks = cluster_tasks
172176

173177
return _create_prediction(
174178
cluster_id, tasks, plan_type, compute_type, project_id, allow_incomplete_cluster_report
@@ -294,11 +298,19 @@ def create_submission_for_run(
294298

295299
project_cluster_tasks = _get_project_cluster_tasks(run, exclude_tasks)
296300

297-
if project_id in project_cluster_tasks:
298-
cluster_id, tasks = project_cluster_tasks.get(project_id)
299-
elif None in project_cluster_tasks and len(project_cluster_tasks) == 1:
300-
# If there's only 1 cluster and it's not tagged with project ID assume that's the one for the project
301-
cluster_id, tasks = project_cluster_tasks.get(None)
301+
cluster_tasks = project_cluster_tasks.get(project_id)
302+
if not cluster_tasks:
303+
if len(project_cluster_tasks) == 1:
304+
# If there's only 1 cluster assume that's the one for the project
305+
cluster_tasks = next(iter(project_cluster_tasks.values()))
306+
else:
307+
return Response(
308+
error=DatabricksError(
309+
message=f"Unable to locate cluster in run {run_id} for project {project_id}"
310+
)
311+
)
312+
313+
cluster_id, tasks = cluster_tasks
302314

303315
run_information_response = _get_run_information(
304316
cluster_id,
@@ -386,6 +398,7 @@ def get_cluster_report(
386398
return Response(error=DatabricksAPIError(**run))
387399

388400
project_cluster_tasks = _get_project_cluster_tasks(run, exclude_tasks)
401+
389402
cluster_tasks = project_cluster_tasks.get(project_id)
390403
if not cluster_tasks:
391404
return Response(
@@ -455,9 +468,11 @@ def record_run(
455468
if project_id:
456469
if project_id in project_cluster_tasks:
457470
filtered_project_cluster_tasks = {project_id: project_cluster_tasks.get(project_id)}
458-
elif None in project_cluster_tasks and len(project_cluster_tasks) == 1:
459-
# If there's only 1 cluster and it's not tagged with project ID assume that's the one for the project
460-
filtered_project_cluster_tasks = {project_id: project_cluster_tasks.get(None)}
471+
elif len(project_cluster_tasks) == 1:
472+
# If there's only 1 cluster assume that's the one for the project
473+
filtered_project_cluster_tasks = {
474+
project_id: next(iter(project_cluster_tasks.values()))
475+
}
461476
else:
462477
filtered_project_cluster_tasks = {
463478
cluster_project_id: cluster_tasks
@@ -499,6 +514,83 @@ def record_run(
499514
)
500515

501516

517+
def apply_prediction(
518+
job_id: str, project_id: str, prediction_id: str = None, preference: str = None
519+
):
520+
"""Updates jobs with prediction configuration
521+
522+
:param job_id: ID of job to apply prediction to
523+
:type job_id: str
524+
:param project_id: Sync project ID
525+
:type project_id: str
526+
:param prediction_id: Sync prediction ID, defaults to latest in project
527+
:type prediction_id: str, optional
528+
:param preference: Prediction preference, defaults to "recommended" then "economy"
529+
:type preference: str, optional
530+
:return: ID of applied prediction
531+
:rtype: Response[str]
532+
"""
533+
if prediction_id:
534+
prediction_response = get_prediction(prediction_id, preference)
535+
else:
536+
predictions_response = get_predictions(project_id=project_id)
537+
if predictions_response.error:
538+
return predictions_response
539+
prediction_id = predictions_response.result[0]["prediction_id"]
540+
prediction_response = get_prediction(prediction_id, preference)
541+
542+
if prediction_response.error:
543+
return prediction_response
544+
545+
prediction = prediction_response.result
546+
547+
databricks_client = get_default_client()
548+
549+
job = databricks_client.get_job(job_id)
550+
job_clusters = _get_project_job_clusters(job)
551+
552+
project_cluster = job_clusters.get(project_id)
553+
if not project_cluster:
554+
if len(job_clusters) == 1:
555+
project_cluster = next(iter(job_clusters.values()))
556+
else:
557+
return Response(
558+
error=DatabricksError(
559+
message=f"Unable to locate cluster in job {job_id} for project {project_id}"
560+
)
561+
)
562+
563+
project_cluster_path, _ = project_cluster
564+
565+
if preference:
566+
prediction_cluster = prediction["solutions"][preference]["configuration"]
567+
else:
568+
prediction_cluster = prediction["solutions"].get(
569+
"recommended", prediction["solutions"]["economy"]
570+
)["configuration"]
571+
572+
if "cluster_name" in prediction_cluster:
573+
del prediction_cluster["cluster_name"]
574+
575+
if project_cluster_path[0] == "job_clusters":
576+
new_settings = {
577+
"job_clusters": [
578+
{"job_cluster_key": project_cluster_path[1], "new_cluster": prediction_cluster}
579+
]
580+
}
581+
else:
582+
new_settings = {
583+
"tasks": [{"task_key": project_cluster_path[1], "new_cluster": prediction_cluster}]
584+
}
585+
586+
response = databricks_client.update_job(job_id, new_settings)
587+
588+
if "error_code" in response:
589+
return Response(error=DatabricksAPIError(**response))
590+
591+
return Response(result=prediction_id)
592+
593+
502594
def get_prediction_job(
503595
job_id: str, prediction_id: str, preference: str = CONFIG.default_prediction_preference.value
504596
) -> Response[dict]:
@@ -586,6 +678,62 @@ def get_prediction_cluster(
586678
return prediction_response
587679

588680

681+
def apply_project_recommendation(job_id: str, project_id: str, recommendation_id: str):
682+
"""Updates jobs with project recommendation
683+
684+
:param job_id: ID of job to apply prediction to
685+
:type job_id: str
686+
:param project_id: Sync project ID
687+
:type project_id: str
688+
:param recommendation_id: Sync project recommendation ID
689+
:type recommendation_id: str
690+
:return: ID of applied recommendation
691+
:rtype: Response[str]
692+
"""
693+
databricks_client = get_default_client()
694+
695+
job = databricks_client.get_job(job_id)
696+
job_clusters = _get_project_job_clusters(job)
697+
698+
project_cluster = job_clusters.get(project_id)
699+
if not project_cluster:
700+
if len(job_clusters) == 1:
701+
project_cluster = next(iter(job_clusters.values()))
702+
else:
703+
return Response(
704+
error=DatabricksError(
705+
message=f"Unable to locate cluster in job {job_id} for project {project_id}"
706+
)
707+
)
708+
709+
project_cluster_path, project_cluster_def = project_cluster
710+
711+
new_cluster_def_response = get_recommendation_cluster(
712+
project_cluster_def, project_id, recommendation_id
713+
)
714+
if new_cluster_def_response.error:
715+
return new_cluster_def_response
716+
new_cluster_def = new_cluster_def_response.result
717+
718+
if project_cluster_path[0] == "job_clusters":
719+
new_settings = {
720+
"job_clusters": [
721+
{"job_cluster_key": project_cluster_path[1], "new_cluster": new_cluster_def}
722+
]
723+
}
724+
else:
725+
new_settings = {
726+
"tasks": [{"task_key": project_cluster_path[1], "new_cluster": new_cluster_def}]
727+
}
728+
729+
response = databricks_client.update_job(job_id, new_settings)
730+
731+
if "error_code" in response:
732+
return Response(error=DatabricksAPIError(**response))
733+
734+
return Response(result=recommendation_id)
735+
736+
589737
def get_recommendation_job(job_id: str, project_id: str, recommendation_id: str) -> Response[dict]:
590738
"""Apply the recommendation to the specified job.
591739
@@ -1222,6 +1370,46 @@ def _get_job_cluster(tasks: List[dict], job_clusters: list) -> Response[dict]:
12221370
return Response(error=DatabricksError(message="Not all tasks use the same cluster"))
12231371

12241372

1373+
def _get_project_job_clusters(
1374+
job: dict,
1375+
exclude_tasks: Union[Collection[str], None] = None,
1376+
) -> Dict[str, Tuple[Tuple[str], dict]]:
1377+
"""Returns a mapping of project IDs to cluster paths and clusters.
1378+
1379+
Cluster paths are tuples that can be used to locate clusters in a job object, e.g.
1380+
1381+
("tasks", <task_key>) or ("job_clusters", <job_cluster_key>)
1382+
1383+
Items for project IDs with more than 1 associated cluster are omitted"""
1384+
job_clusters = {
1385+
c["job_cluster_key"]: c["new_cluster"] for c in job["settings"].get("job_clusters", [])
1386+
}
1387+
all_project_clusters = defaultdict(list)
1388+
1389+
for task in job["settings"]["tasks"]:
1390+
if not exclude_tasks or task["task_key"] not in exclude_tasks:
1391+
task_cluster = task.get("new_cluster")
1392+
if task_cluster:
1393+
task_cluster_path = ("tasks", task["task_key"])
1394+
1395+
if not task_cluster:
1396+
task_cluster = job_clusters.get(task.get("job_cluster_key"))
1397+
task_cluster_path = ("job_clusters", task.get("job_cluster_key"))
1398+
1399+
if task_cluster:
1400+
cluster_project_id = task_cluster.get("custom_tags", {}).get("sync:project-id")
1401+
all_project_clusters[cluster_project_id].append((task_cluster_path, task_cluster))
1402+
1403+
filtered_project_clusters = {}
1404+
for project_id, clusters in all_project_clusters.items():
1405+
if len(clusters) > 1:
1406+
logger.warning(f"More than 1 cluster found for project ID {project_id}")
1407+
else:
1408+
filtered_project_clusters[project_id] = clusters[0]
1409+
1410+
return filtered_project_clusters
1411+
1412+
12251413
def _get_project_cluster_tasks(
12261414
run: dict,
12271415
exclude_tasks: Union[Collection[str], None] = None,

sync/api/projects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
SubmissionError,
2020
)
2121

22-
logger = logging.getLogger()
22+
logger = logging.getLogger(__name__)
2323

2424

2525
def get_prediction(project_id: str, preference: Preference = None) -> Response[dict]:

sync/awsdatabricks.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
_get_all_cluster_events,
1515
_get_cluster_instances_from_dbfs,
1616
_wait_for_cluster_termination,
17+
apply_prediction,
18+
apply_project_recommendation,
1719
create_and_record_run,
1820
create_and_wait_for_run,
1921
create_cluster,
@@ -83,6 +85,8 @@
8385
"wait_for_run_and_cluster",
8486
"terminate_cluster",
8587
"event_log_poll_duration_seconds",
88+
"apply_prediction",
89+
"apply_project_recommendation",
8690
]
8791

8892

sync/azuredatabricks.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
_get_all_cluster_events,
1919
_get_cluster_instances_from_dbfs,
2020
_wait_for_cluster_termination,
21+
apply_prediction,
22+
apply_project_recommendation,
2123
create_and_record_run,
2224
create_and_wait_for_run,
2325
create_cluster,
@@ -87,6 +89,8 @@
8789
"wait_for_run_and_cluster",
8890
"terminate_cluster",
8991
"event_log_poll_duration_seconds",
92+
"apply_prediction",
93+
"apply_project_recommendation",
9094
]
9195

9296

0 commit comments

Comments (0)