Merge pull request #18 from synccomputingcode/graham/dbx-error-message

gseyffert-sync · web-flow · commit 4cad8b359e34 · 2023-05-09T14:18:49.000-07:00
PROD-958 Update handling of incomplete cluster_report data for Databricks jobs
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
   "httpx~=0.23.0",
   "orjson~=3.8.0",
   "click~=8.1.0",
+  "tenacity==8.2.2"
 ]
 dynamic = ["version", "description"]
 
diff --git a/sync/__init__.py b/sync/__init__.py
@@ -1,4 +1,4 @@
 """Library for leveraging the power of Sync"""
-__version__ = "0.0.7"
+__version__ = "0.0.8"
 
 TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
diff --git a/sync/awsdatabricks.py b/sync/awsdatabricks.py
@@ -112,7 +112,11 @@ def get_cluster(cluster_id: str) -> Response[dict]:
 
 # TODO - Databricks configuration documentation
 def create_prediction_for_run(
-    run_id: str, plan_type: str, compute_type: str, project_id: str = None
+    run_id: str,
+    plan_type: str,
+    compute_type: str,
+    project_id: str = None,
+    allow_incomplete_cluster_report: bool = False,
 ) -> Response[str]:
     """Create a prediction for the specified Databricks run.
 
@@ -124,6 +128,8 @@ def create_prediction_for_run(
     :type compute_type: str
     :param project_id: Sync project ID, defaults to None
     :type project_id: str, optional
+    :param allow_incomplete_cluster_report: Whether creating a prediction with incomplete cluster report data should be allowable
+    :type allow_incomplete_cluster_report: bool, optional, defaults to False
     :return: prediction ID
     :rtype: Response[str]
     """
@@ -138,7 +144,9 @@ def create_prediction_for_run(
     if cluster_id := cluster_id_response.result:
         # Making these calls prior to fetching the event log allows Databricks a little extra time to finish
         #  uploading all the event log data before we start checking for it
-        cluster_report_response = _get_cluster_report(cluster_id, plan_type, compute_type)
+        cluster_report_response = _get_cluster_report(
+            cluster_id, plan_type, compute_type, allow_incomplete_cluster_report
+        )
         if cluster_report := cluster_report_response.result:
 
             cluster = cluster_report.cluster
@@ -161,7 +169,7 @@ def create_prediction_for_run(
 
 
 def get_cluster_report(
-    run_id: str, plan_type: str, compute_type: str
+    run_id: str, plan_type: str, compute_type: str, allow_incomplete: bool = False
 ) -> Response[DatabricksClusterReport]:
     """Fetches the cluster information required to create a Sync prediction
 
@@ -171,6 +179,8 @@ def get_cluster_report(
     :type plan_type: str
     :param compute_type: Cluster compute type, e.g. "Jobs Compute"
     :type compute_type: str
+    :param allow_incomplete: Whether creating a cluster report with incomplete data should be allowable
+    :type allow_incomplete: bool, optional, defaults to False
     :return: cluster report
     :rtype: Response[DatabricksClusterReport]
     """
@@ -183,13 +193,13 @@ def get_cluster_report(
 
     cluster_id_response = _get_run_cluster_id(run["tasks"])
     if cluster_id := cluster_id_response.result:
-        return _get_cluster_report(cluster_id, plan_type, compute_type)
+        return _get_cluster_report(cluster_id, plan_type, compute_type, allow_incomplete)
 
     return cluster_id_response
 
 
 def _get_cluster_report(
-    cluster_id: str, plan_type: str, compute_type: str
+    cluster_id: str, plan_type: str, compute_type: str, allow_incomplete: bool
 ) -> Response[DatabricksClusterReport]:
     cluster = get_default_client().get_cluster(cluster_id)
     if "error_code" in cluster:
@@ -209,11 +219,15 @@ def _get_cluster_report(
         ]
     )
     if not instances["Reservations"]:
-        return Response(
-            error=DatabricksError(
-                message=f"Unable to find any active or recently terminated instances for cluster `{cluster_id}` in `{aws_region_name}`"
-            )
+        no_instances_message = (
+            f"Unable to find any active or recently terminated instances for cluster `{cluster_id}` in `{aws_region_name}`. "
+            + "Please refer to the following documentation for options on how to address this - "
+            + "https://synccomputingcode.github.io/syncsparkpy/reference/awsdatabricks.html"
         )
+        if allow_incomplete:
+            logger.warning(no_instances_message)
+        else:
+            return Response(error=DatabricksError(message=no_instances_message))
 
     return Response(
         result=DatabricksClusterReport(
@@ -281,7 +295,9 @@ def get_prediction_job(
                 if "autoscale" in cluster:
                     del cluster["autoscale"]
 
-                prediction_cluster = _deep_update(cluster, prediction["solutions"][preference]["configuration"])
+                prediction_cluster = _deep_update(
+                    cluster, prediction["solutions"][preference]["configuration"]
+                )
 
                 if cluster_key := tasks[0].get("job_cluster_key"):
                     job_settings["job_clusters"] = [
diff --git a/sync/cli/__init__.py b/sync/cli/__init__.py
@@ -23,7 +23,7 @@ def main(debug: bool):
     if debug:
         logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
     else:
-        logging.basicConfig(level=logging.CRITICAL, format=LOG_FORMAT)
+        logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)
 
 
 main.add_command(predictions.predictions)
diff --git a/sync/cli/awsdatabricks.py b/sync/cli/awsdatabricks.py
@@ -60,12 +60,22 @@ def run_job(
     default=DatabricksComputeType.JOBS_COMPUTE,
 )
 @click.option("--project", callback=validate_project)
+@click.option(
+    "--allow-incomplete",
+    is_flag=True,
+    default=False,
+    help="Force creation of a prediction even with incomplete cluster data.",
+)
 def create_prediction(
-    run_id: str, plan: DatabricksPlanType, compute: DatabricksComputeType, project: dict = None
+    run_id: str,
+    plan: DatabricksPlanType,
+    compute: DatabricksComputeType,
+    project: dict = None,
+    allow_incomplete: bool = False,
 ):
     """Create a prediction for a job run"""
     prediction_response = awsdatabricks.create_prediction_for_run(
-        run_id, plan, compute, project["id"]
+        run_id, plan, compute, project["id"], allow_incomplete
     )
     if prediction := prediction_response.result:
         click.echo(f"Prediction ID: {prediction}")
@@ -81,9 +91,20 @@ def create_prediction(
     type=click.Choice(DatabricksComputeType),
     default=DatabricksComputeType.JOBS_COMPUTE,
 )
-def get_cluster_report(run_id: str, plan: DatabricksPlanType, compute: DatabricksComputeType):
+@click.option(
+    "--allow-incomplete",
+    is_flag=True,
+    default=False,
+    help="Force creation of a cluster report even if some data is missing.",
+)
+def get_cluster_report(
+    run_id: str,
+    plan: DatabricksPlanType,
+    compute: DatabricksComputeType,
+    allow_incomplete: bool = False,
+):
     """Get a cluster report"""
-    config_response = awsdatabricks.get_cluster_report(run_id, plan, compute)
+    config_response = awsdatabricks.get_cluster_report(run_id, plan, compute, allow_incomplete)
     if config := config_response.result:
         click.echo(
             orjson.dumps(
diff --git a/sync/clients/__init__.py b/sync/clients/__init__.py
@@ -1,4 +1,6 @@
+import httpx
 import orjson
+from tenacity import Retrying, TryAgain, stop_after_attempt, wait_exponential_jitter
 
 from sync import __version__
 
@@ -15,3 +17,87 @@ def encode_json(obj: dict) -> tuple[dict, str]:
         "Content-Length": str(len(json)),
         "Content-Type": "application/json",
     }, json
+
+
+class RetryableHTTPClient:
+    """
+    Small wrapper around httpx.Client/AsyncClient to contain retrying logic that httpx does not
+    offer natively.
+
+    A request is attempted at most 3 times, waiting with jittered exponential backoff in between.
+    An attempt is retried when the response status is one of `_DEFAULT_RETRYABLE_STATUS_CODES`,
+    or when sending raises (tenacity's default retry predicate, as no `retry=` is configured).
+    If the last attempt raises, that exception propagates to the caller (`reraise=True`).
+    """
+
+    # Response statuses that trigger another attempt
+    _DEFAULT_RETRYABLE_STATUS_CODES: set[httpx.codes] = {
+        httpx.codes.REQUEST_TIMEOUT,
+        httpx.codes.TOO_EARLY,
+        httpx.codes.TOO_MANY_REQUESTS,
+        httpx.codes.INTERNAL_SERVER_ERROR,
+        httpx.codes.BAD_GATEWAY,
+        httpx.codes.SERVICE_UNAVAILABLE,
+        httpx.codes.GATEWAY_TIMEOUT,
+    }
+
+    def __init__(self, client: httpx.Client | httpx.AsyncClient):
+        """
+        :param client: httpx client that requests are sent with
+        :type client: httpx.Client | httpx.AsyncClient
+        """
+        self._client: httpx.Client | httpx.AsyncClient = client
+
+    def _send_request(self, request: httpx.Request) -> httpx.Response:
+        """Send a request with the wrapped synchronous client, retrying transient failures.
+
+        :param request: request to send
+        :type request: httpx.Request
+        :return: the first non-retryable response, or the last retryable one if attempts run out
+        :rtype: httpx.Response
+        """
+        try:
+            for attempt in Retrying(
+                stop=stop_after_attempt(3),
+                wait=wait_exponential_jitter(initial=2, max=10, jitter=2),
+                reraise=True,
+            ):
+                with attempt:
+                    response = self._client.send(request)
+                    if response.status_code in self._DEFAULT_RETRYABLE_STATUS_CODES:
+                        raise TryAgain()
+        except TryAgain:
+            # If we max out on retries, then return the bad response back to the caller to handle as appropriate
+            pass
+
+        return response
+
+    async def _send_request_async(self, request: httpx.Request) -> httpx.Response:
+        """Send a request with the wrapped asynchronous client, retrying transient failures.
+
+        :param request: request to send
+        :type request: httpx.Request
+        :return: the first non-retryable response, or the last retryable one if attempts run out
+        :rtype: httpx.Response
+        """
+        # Imported locally so this method does not depend on the module-level tenacity import
+        from tenacity import AsyncRetrying
+
+        try:
+            # AsyncRetrying awaits asyncio.sleep between attempts; the synchronous Retrying would
+            # call time.sleep and block the event loop for the whole backoff period
+            async for attempt in AsyncRetrying(
+                stop=stop_after_attempt(3),
+                wait=wait_exponential_jitter(initial=2, max=10, jitter=2),
+                reraise=True,
+            ):
+                with attempt:
+                    response = await self._client.send(request)
+                    if response.status_code in self._DEFAULT_RETRYABLE_STATUS_CODES:
+                        raise TryAgain()
+        except TryAgain:
+            # If we max out on retries, then return the bad response back to the caller to handle as appropriate
+            pass
+
+        return response
diff --git a/sync/clients/databricks.py b/sync/clients/databricks.py
@@ -4,7 +4,7 @@
 import httpx
 
 from ..config import DB_CONFIG
-from . import USER_AGENT, encode_json
+from . import USER_AGENT, RetryableHTTPClient, encode_json
 
 logger = logging.getLogger(__name__)
 
@@ -19,10 +19,14 @@ def auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, httpx.Re
         yield request
 
 
-class DatabricksClient:
+class DatabricksClient(RetryableHTTPClient):
     def __init__(self, base_url: str, access_token: str):
-        self._client = httpx.Client(
-            base_url=base_url, headers={"User-Agent": USER_AGENT}, auth=DatabricksAuth(access_token)
+        super().__init__(
+            client=httpx.Client(
+                base_url=base_url,
+                headers={"User-Agent": USER_AGENT},
+                auth=DatabricksAuth(access_token),
+            )
         )
 
     def create_cluster(self, config: dict) -> dict:
@@ -92,9 +96,9 @@ def get_run(self, run_id: str) -> dict:
         )
 
     def _send(self, request: httpx.Request) -> dict:
-        response = self._client.send(request)
+        response = self._send_request(request)
 
-        if response.status_code >= 200 and response.status_code < 300:
+        if 200 <= response.status_code < 300:
             return response.json()
 
         if response.headers.get("Content-Type", "").startswith("application/json"):
diff --git a/sync/clients/sync.py b/sync/clients/sync.py
@@ -4,7 +4,7 @@
 import httpx
 
 from ..config import API_KEY, CONFIG, APIKey
-from . import USER_AGENT, encode_json
+from . import USER_AGENT, RetryableHTTPClient, encode_json
 
 logger = logging.getLogger(__name__)
 
@@ -48,13 +48,15 @@ def update_access_token(self, response: httpx.Response):
             logger.error(f"{response.status_code}: Failed to authenticate")
 
 
-class SyncClient:
+class SyncClient(RetryableHTTPClient):
     def __init__(self, api_url, api_key):
-        self._client = httpx.Client(
-            base_url=api_url,
-            headers={"User-Agent": USER_AGENT},
-            auth=SyncAuth(api_url, api_key),
-            timeout=60.0,
+        super().__init__(
+            client=httpx.Client(
+                base_url=api_url,
+                headers={"User-Agent": USER_AGENT},
+                auth=SyncAuth(api_url, api_key),
+                timeout=60.0,
+            )
         )
 
     def get_products(self) -> dict:
@@ -109,9 +111,9 @@ def delete_project(self, project_id: str) -> dict:
         return self._send(self._client.build_request("DELETE", f"/v1/projects/{project_id}"))
 
     def _send(self, request: httpx.Request) -> dict:
-        response = self._client.send(request)
+        response = self._send_request(request)
 
-        if response.status_code >= 200 and response.status_code < 300:
+        if 200 <= response.status_code < 300:
             return response.json()
 
         if response.headers.get("Content-Type", "").startswith("application/json"):
@@ -126,13 +128,15 @@ def _send(self, request: httpx.Request) -> dict:
         return {"error": {"code": "Sync API Error", "message": "Transaction failure"}}
 
 
-class ASyncClient:
+class ASyncClient(RetryableHTTPClient):
     def __init__(self, api_url, api_key):
-        self._client = httpx.AsyncClient(
-            base_url=api_url,
-            headers={"User-Agent": USER_AGENT},
-            auth=SyncAuth(api_url, api_key),
-            timeout=60.0,
+        super().__init__(
+            client=httpx.AsyncClient(
+                base_url=api_url,
+                headers={"User-Agent": USER_AGENT},
+                auth=SyncAuth(api_url, api_key),
+                timeout=60.0,
+            )
         )
 
     async def create_prediction(self, prediction: dict) -> dict:
@@ -184,9 +188,9 @@ async def delete_project(self, project_id: str) -> dict:
         return await self._send(self._client.build_request("DELETE", f"/v1/projects/{project_id}"))
 
     async def _send(self, request: httpx.Request) -> dict:
-        response = await self._client.send(request)
+        response = await self._send_request_async(request)
 
-        if response.status_code >= 200 and response.status_code < 300:
+        if 200 <= response.status_code < 300:
             return response.json()
 
         if response.headers.get("Content-Type", "").startswith("application/json"):

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ dependencies = [`
`31`	`31`	`"httpx~=0.23.0",`
`32`	`32`	`"orjson~=3.8.0",`
`33`	`33`	`"click~=8.1.0",`
	`34`	`+ "tenacity==8.2.2"`
`34`	`35`	`]`
`35`	`36`	`dynamic = ["version", "description"]`
`36`	`37`