From 09af88ef2fc4f5a552a203866265541f245dc574 Mon Sep 17 00:00:00 2001 From: meher-m Date: Fri, 6 Jun 2025 22:23:18 +0000 Subject: [PATCH 01/10] multinode readme changes and remove logs --- .../api/v2/chat_completion.py | 2 +- .../model_engine_server/api/v2/completion.py | 2 +- .../use_cases/llm_model_endpoint_use_cases.py | 119 ++++++++++++++++-- .../inference/forwarding/http_forwarder.py | 6 +- 4 files changed, 112 insertions(+), 17 deletions(-) diff --git a/model-engine/model_engine_server/api/v2/chat_completion.py b/model-engine/model_engine_server/api/v2/chat_completion.py index 614f159d..6140c451 100644 --- a/model-engine/model_engine_server/api/v2/chat_completion.py +++ b/model-engine/model_engine_server/api/v2/chat_completion.py @@ -263,7 +263,7 @@ async def chat_completion( ) else: logger.info( - f"POST /v2/chat/completion ({('stream' if request.stream else 'sync')}) with {request} to endpoint {model_endpoint_name} for {auth}" + f"POST /v2/chat/completion ({('stream' if request.stream else 'sync')}) with to endpoint {model_endpoint_name} for {auth}" ) if request.stream: diff --git a/model-engine/model_engine_server/api/v2/completion.py b/model-engine/model_engine_server/api/v2/completion.py index ed529fe3..aaf59918 100644 --- a/model-engine/model_engine_server/api/v2/completion.py +++ b/model-engine/model_engine_server/api/v2/completion.py @@ -262,7 +262,7 @@ async def completion( ) else: logger.info( - f"POST /v2/completion ({('stream' if request.stream else 'sync')}) with {request} to endpoint {model_endpoint_name} for {auth}" + f"POST /v2/completion ({('stream' if request.stream else 'sync')}) with to endpoint {model_endpoint_name} for {auth}" ) if request.stream: diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 1fb8dbed..c370c250 100644 --- 
a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -376,6 +376,87 @@ def check_docker_image_exists_for_image_tag( repository=repository_name, tag=framework_image_tag, ) + + async def create_sglang_multinode_bundle( + self, + user: User, + model_name: str, + framework_image_tag: str, + endpoint_unique_name: str, + num_shards: int, + nodes_per_worker: int, + quantize: Optional[Quantization], + checkpoint_path: Optional[str], + chat_template_override: Optional[str], + additional_args: Optional[SGLangEndpointAdditionalArgs] = None, + ): + leader_command = [ + "python3", + "/root/sglang-startup-script.py", + "--model", + "deepseek-ai/DeepSeek-R1-0528", + "--nnodes", + "2", + "--node-rank", + "0" + "--worker-port", + "5005", + "--leader-port", + "5002" + ] + + worker_command = [ + "python3", + "/root/sglang-startup-script.py", + "--model", + "deepseek-ai/DeepSeek-R1-0528", + "--nnodes", + "2", + "--node-rank", + "1", + "--worker-port", + "5005", + "--leader-port", + "5002" + ] + + # NOTE: the most important env var SGLANG_HOST_IP is already established in the sglang startup script + + common_sglang_envs = { # these are for debugging + "NCCL_SOCKET_IFNAME": "eth0", + "GLOO_SOCKET_IFNAME": "eth0", + } + + # This is same as VLLM multinode bundle + create_model_bundle_v2_request = CreateModelBundleV2Request( + name=endpoint_unique_name, + schema_location="TBA", + flavor=StreamingEnhancedRunnableImageFlavor( + flavor=ModelBundleFlavorType.STREAMING_ENHANCED_RUNNABLE_IMAGE, + repository=hmi_config.sglang_repository, + tag=framework_image_tag, + command=leader_command, + streaming_command=leader_command, + protocol="http", + readiness_initial_delay_seconds=10, + healthcheck_route="/health", + predict_route="/predict", + streaming_predict_route="/stream", + extra_routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], + env=common_sglang_envs, + 
worker_command=worker_command, + worker_env=common_sglang_envs, + ), + metadata={}, + ) + + return ( + await self.create_model_bundle_use_case.execute( + user, + create_model_bundle_v2_request, + do_auth_check=False, + ) + ).model_bundle_id async def execute( self, @@ -400,7 +481,7 @@ async def execute( self.check_docker_image_exists_for_image_tag( framework_image_tag, INFERENCE_FRAMEWORK_REPOSITORY[framework] ) - if multinode and framework != LLMInferenceFramework.VLLM: + if multinode and framework not in [LLMInferenceFramework.VLLM, LLMInferenceFramework.SGLANG]: raise ObjectHasInvalidValueException( f"Multinode is not supported for framework {framework}." ) @@ -481,16 +562,30 @@ async def execute( if additional_args else None ) - bundle_id = await self.create_sglang_bundle( - user, - model_name, - framework_image_tag, - endpoint_name, - num_shards, - checkpoint_path, - chat_template_override, - additional_args=additional_sglang_args, - ) + if multinode: + bundle_id = await self.create_sglang_multinode_bundle( + user, + model_name, + framework_image_tag, + endpoint_name, + num_shards, + nodes_per_worker, + quantize, + checkpoint_path, + chat_template_override, + additional_args=additional_sglang_args, + ) + else: + bundle_id = await self.create_sglang_bundle( + user, + model_name, + framework_image_tag, + endpoint_name, + num_shards, + checkpoint_path, + chat_template_override, + additional_args=additional_sglang_args, + ) case _: assert_never(framework) raise ObjectHasInvalidValueException( @@ -1323,7 +1418,7 @@ async def execute( if ( request.nodes_per_worker > 1 - and not request.inference_framework == LLMInferenceFramework.VLLM + and not request.inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.SGLANG] ): raise ObjectHasInvalidValueException( "Multinode endpoints are only supported for VLLM models." 
diff --git a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py index 89fcb3fb..883ef4b8 100644 --- a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py +++ b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py @@ -90,7 +90,7 @@ async def predict( ) return response except Exception: - logger.error(f"Failed to decode payload from: {request}") + logger.error(f"Failed to decode payload from:") raise @@ -103,10 +103,10 @@ async def stream( try: payload = request.model_dump() except Exception: - logger.error(f"Failed to decode payload from: {request}") + logger.error(f"Failed to decode payload from:") raise else: - logger.debug(f"Received request: {payload}") + logger.debug(f"Received request: ") responses = forwarder.forward(payload) # We fetch the first response to check if upstream request was successful From 6cf54471c5517813ef9ecfad542ecca6ccbf775f Mon Sep 17 00:00:00 2001 From: Michael Choi Date: Sat, 7 Jun 2025 20:19:29 +0000 Subject: [PATCH 02/10] some fixes --- .../use_cases/llm_model_endpoint_use_cases.py | 71 ++++++++++--------- .../inference/forwarding/http_forwarder.py | 17 ++++- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index c370c250..b25aa6b3 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -376,7 +376,7 @@ def check_docker_image_exists_for_image_tag( repository=repository_name, tag=framework_image_tag, ) - + async def create_sglang_multinode_bundle( self, user: User, @@ -391,40 +391,40 @@ async def create_sglang_multinode_bundle( additional_args: Optional[SGLangEndpointAdditionalArgs] = None, ): 
leader_command = [ - "python3", - "/root/sglang-startup-script.py", - "--model", - "deepseek-ai/DeepSeek-R1-0528", - "--nnodes", - "2", - "--node-rank", - "0" - "--worker-port", - "5005", - "--leader-port", - "5002" + "python3", + "/root/sglang-startup-script.py", + "--model", + "deepseek-ai/DeepSeek-R1-0528", + "--nnodes", + "2", + "--node-rank", + "0", + "--worker-port", + "5005", + "--leader-port", + "5002", ] - + worker_command = [ - "python3", - "/root/sglang-startup-script.py", - "--model", - "deepseek-ai/DeepSeek-R1-0528", - "--nnodes", - "2", - "--node-rank", - "1", - "--worker-port", - "5005", - "--leader-port", - "5002" + "python3", + "/root/sglang-startup-script.py", + "--model", + "deepseek-ai/DeepSeek-R1-0528", + "--nnodes", + "2", + "--node-rank", + "1", + "--worker-port", + "5005", + "--leader-port", + "5002", ] # NOTE: the most important env var SGLANG_HOST_IP is already established in the sglang startup script - - common_sglang_envs = { # these are for debugging + + common_sglang_envs = { # these are for debugging "NCCL_SOCKET_IFNAME": "eth0", - "GLOO_SOCKET_IFNAME": "eth0", + "GLOO_SOCKET_IFNAME": "eth0", } # This is same as VLLM multinode bundle @@ -481,7 +481,10 @@ async def execute( self.check_docker_image_exists_for_image_tag( framework_image_tag, INFERENCE_FRAMEWORK_REPOSITORY[framework] ) - if multinode and framework not in [LLMInferenceFramework.VLLM, LLMInferenceFramework.SGLANG]: + if multinode and framework not in [ + LLMInferenceFramework.VLLM, + LLMInferenceFramework.SGLANG, + ]: raise ObjectHasInvalidValueException( f"Multinode is not supported for framework {framework}." 
) @@ -1416,10 +1419,10 @@ async def execute( request.inference_framework ) - if ( - request.nodes_per_worker > 1 - and not request.inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.SGLANG] - ): + if request.nodes_per_worker > 1 and not request.inference_framework in [ + LLMInferenceFramework.VLLM, + LLMInferenceFramework.SGLANG, + ]: raise ObjectHasInvalidValueException( "Multinode endpoints are only supported for VLLM models." ) diff --git a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py index 883ef4b8..88332cca 100644 --- a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py +++ b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py @@ -22,6 +22,8 @@ logger = make_logger(logger_name()) +LOG_SENSITIVE_DATA = False + def get_config(): overrides = os.getenv("CONFIG_OVERRIDES") @@ -90,7 +92,10 @@ async def predict( ) return response except Exception: - logger.error(f"Failed to decode payload from:") + if LOG_SENSITIVE_DATA: + logger.error(f"Failed to decode payload") + else: + logger.error(f"Failed to decode payload from: {request}") raise @@ -103,10 +108,16 @@ async def stream( try: payload = request.model_dump() except Exception: - logger.error(f"Failed to decode payload from:") + if LOG_SENSITIVE_DATA: + logger.error(f"Failed to decode payload") + else: + logger.error(f"Failed to decode payload from: {request}") raise else: - logger.debug(f"Received request: ") + if LOG_SENSITIVE_DATA: + logger.debug(f"Received request") + else: + logger.debug(f"Received request: {request}") responses = forwarder.forward(payload) # We fetch the first response to check if upstream request was successful From e129f472f3c3d1b47ca14554f5dacb08e4b405fa Mon Sep 17 00:00:00 2001 From: Michael Choi Date: Sat, 7 Jun 2025 20:21:55 +0000 Subject: [PATCH 03/10] fix --- model-engine/model_engine_server/api/v2/chat_completion.py | 2 +- 
model-engine/model_engine_server/api/v2/completion.py | 2 +- .../domain/use_cases/llm_model_endpoint_use_cases.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/model-engine/model_engine_server/api/v2/chat_completion.py b/model-engine/model_engine_server/api/v2/chat_completion.py index 6140c451..f6b4facc 100644 --- a/model-engine/model_engine_server/api/v2/chat_completion.py +++ b/model-engine/model_engine_server/api/v2/chat_completion.py @@ -263,7 +263,7 @@ async def chat_completion( ) else: logger.info( - f"POST /v2/chat/completion ({('stream' if request.stream else 'sync')}) with to endpoint {model_endpoint_name} for {auth}" + f"POST /v2/chat/completion ({('stream' if request.stream else 'sync')}) with request {request} to endpoint {model_endpoint_name} for {auth}" ) if request.stream: diff --git a/model-engine/model_engine_server/api/v2/completion.py b/model-engine/model_engine_server/api/v2/completion.py index aaf59918..eb101e02 100644 --- a/model-engine/model_engine_server/api/v2/completion.py +++ b/model-engine/model_engine_server/api/v2/completion.py @@ -262,7 +262,7 @@ async def completion( ) else: logger.info( - f"POST /v2/completion ({('stream' if request.stream else 'sync')}) with to endpoint {model_endpoint_name} for {auth}" + f"POST /v2/completion ({('stream' if request.stream else 'sync')}) with request {request} to endpoint {model_endpoint_name} for {auth}" ) if request.stream: diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index c370c250..b7a9758e 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -170,8 +170,8 @@ } -NUM_DOWNSTREAM_REQUEST_RETRIES = 80 # has to be high enough so that the retries take the 5 minutes -DOWNSTREAM_REQUEST_TIMEOUT_SECONDS = 5 * 60 # 
5 minutes +NUM_DOWNSTREAM_REQUEST_RETRIES = 80 * 12 # has to be high enough so that the retries span the full request timeout +DOWNSTREAM_REQUEST_TIMEOUT_SECONDS = 60 * 60 # 1 hour DEFAULT_BATCH_COMPLETIONS_NODES_PER_WORKER = 1 @@ -398,7 +398,7 @@ async def create_sglang_multinode_bundle( "--nnodes", "2", "--node-rank", - "0" + "0", "--worker-port", "5005", "--leader-port", From 49fa308d5d72cc2282e3f642c511a4638eccafad Mon Sep 17 00:00:00 2001 From: Michael Choi Date: Sat, 7 Jun 2025 20:55:48 +0000 Subject: [PATCH 04/10] increase timeout for http forwarder --- .../model_engine_server/inference/forwarding/forwarding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/inference/forwarding/forwarding.py b/model-engine/model_engine_server/inference/forwarding/forwarding.py index 096d48bf..33533958 100644 --- a/model-engine/model_engine_server/inference/forwarding/forwarding.py +++ b/model-engine/model_engine_server/inference/forwarding/forwarding.py @@ -430,7 +430,9 @@ async def forward(self, json_payload: Any) -> AsyncGenerator[Any, None]: # prag try: response: aiohttp.ClientResponse - async with aiohttp.ClientSession(json_serialize=_serialize_json) as aioclient: + async with aiohttp.ClientSession( + json_serialize=_serialize_json, timeout=aiohttp.ClientTimeout(total=60 * 60) + ) as aioclient: response = await aioclient.post( self.predict_endpoint, json=json_payload, From 9ca3ba6770262f309e2c42c48a0132e89a6a7185 Mon Sep 17 00:00:00 2001 From: meher-m Date: Sat, 7 Jun 2025 22:45:27 +0000 Subject: [PATCH 05/10] trying to fix timeout --- .../inference/forwarding/forwarding.py | 4 +++- .../inference/forwarding/http_forwarder.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/model-engine/model_engine_server/inference/forwarding/forwarding.py b/model-engine/model_engine_server/inference/forwarding/forwarding.py index 33533958..5518e713 100644 --- 
a/model-engine/model_engine_server/inference/forwarding/forwarding.py +++ b/model-engine/model_engine_server/inference/forwarding/forwarding.py @@ -174,7 +174,9 @@ async def forward(self, json_payload: Any) -> Any: logger.info(f"Accepted request, forwarding {json_payload_repr=}") try: - async with aiohttp.ClientSession(json_serialize=_serialize_json) as aioclient: + async with aiohttp.ClientSession( + json_serialize=_serialize_json, timeout=aiohttp.ClientTimeout(total=60 * 60) + ) as aioclient: response_raw = await aioclient.post( self.predict_endpoint, json=json_payload, diff --git a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py index 88332cca..39f38a5c 100644 --- a/model-engine/model_engine_server/inference/forwarding/http_forwarder.py +++ b/model-engine/model_engine_server/inference/forwarding/http_forwarder.py @@ -93,9 +93,9 @@ async def predict( return response except Exception: if LOG_SENSITIVE_DATA: - logger.error(f"Failed to decode payload") - else: logger.error(f"Failed to decode payload from: {request}") + else: + logger.error(f"Failed to decode payload") raise @@ -109,15 +109,15 @@ async def stream( payload = request.model_dump() except Exception: if LOG_SENSITIVE_DATA: - logger.error(f"Failed to decode payload") - else: logger.error(f"Failed to decode payload from: {request}") + else: + logger.error(f"Failed to decode payload") raise else: if LOG_SENSITIVE_DATA: - logger.debug(f"Received request") - else: logger.debug(f"Received request: {request}") + else: + logger.debug(f"Received request") responses = forwarder.forward(payload) # We fetch the first response to check if upstream request was successful From 993266f4107c6add7cfca53537f941521132d15c Mon Sep 17 00:00:00 2001 From: Michael Choi Date: Mon, 9 Jun 2025 15:54:45 +0000 Subject: [PATCH 06/10] listen to ipv4 --- .../inference/sglang/sglang-startup-script.py | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py index 157e9c30..a688be5c 100755 --- a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py +++ b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py @@ -12,7 +12,7 @@ def wait_for_dns(dns_name: str, max_retries: int = 20, sleep_seconds: int = 3): sleeping sleep_seconds between attempts. Raises RuntimeError if resolution fails repeatedly. """ - for attempt in range(1, max_retries + 1): + for attempt in range(1, max_retries + 2): try: # Use AF_UNSPEC to allow both IPv4 and IPv6 socket.getaddrinfo(dns_name, None, socket.AF_UNSPEC) @@ -107,7 +107,7 @@ def main( "--tp", str(tp), "--host", - "::", + "0.0.0.0", "--port", str(worker_port), "--dist-init-addr", From 697e2a2fc20aae6f352d751e52dcdda97cb3cf6a Mon Sep 17 00:00:00 2001 From: meher-m Date: Mon, 9 Jun 2025 16:12:58 +0000 Subject: [PATCH 07/10] trying to optimize tp --- .../model_engine_server/inference/sglang/Dockerfile.sglang | 2 +- .../inference/sglang/sglang-startup-script.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang index 61e4ae44..015da503 100644 --- a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang +++ b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang @@ -1,4 +1,4 @@ -FROM 692474966980.dkr.ecr.us-west-2.amazonaws.com/sglang:v0.4.1.post7-cu124 +FROM lmsysorg/sglang:latest # These aren't all needed but good to have for debugging purposes RUN apt-get -yq update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ diff --git a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py index 
a688be5c..24f808a9 100755 --- a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py +++ b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py @@ -119,6 +119,7 @@ def main( "--trust-remote-code", "--log-level", "debug", + "--enable-dp-attention", ] print("Running SGLang server command...") subprocess.check_call(sglang_cmd) From e57a56a79818d951608c3c6f6c99345fa8b47067 Mon Sep 17 00:00:00 2001 From: meher-m Date: Mon, 9 Jun 2025 16:26:00 +0000 Subject: [PATCH 08/10] updates --- .../model_engine_server/inference/sglang/Dockerfile.sglang | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang index 015da503..fc76d51e 100644 --- a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang +++ b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang @@ -1,4 +1,4 @@ -FROM lmsysorg/sglang:latest +FROM lmsysorg/sglang:v0.4.6.post5-cu124 # These aren't all needed but good to have for debugging purposes RUN apt-get -yq update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ @@ -35,7 +35,7 @@ RUN apt-get -yq update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ tk-dev \ libffi-dev \ liblzma-dev \ - python-openssl \ + python3-openssl \ moreutils \ libcurl4-openssl-dev \ libssl-dev \ From a10583b668fccaa46968f72db1bfd4491c9adae1 Mon Sep 17 00:00:00 2001 From: meher-m Date: Mon, 9 Jun 2025 17:07:19 +0000 Subject: [PATCH 09/10] set dp --- .../inference/sglang/sglang-startup-script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py index 24f808a9..138a4c83 100755 --- a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py +++ 
b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py @@ -120,6 +120,8 @@ def main( "--log-level", "debug", "--enable-dp-attention", + "--dp", + str(tp), ] print("Running SGLang server command...") subprocess.check_call(sglang_cmd) From a8c2bd109b9e4ea2fca9855fa05309f17c978787 Mon Sep 17 00:00:00 2001 From: meher-m Date: Wed, 11 Jun 2025 04:07:16 +0000 Subject: [PATCH 10/10] this is what's used for multinode-12-rc0 --- .../model_engine_server/inference/sglang/Dockerfile.sglang | 3 ++- .../inference/sglang/sglang-startup-script.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang index fc76d51e..cb799c30 100644 --- a/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang +++ b/model-engine/model_engine_server/inference/sglang/Dockerfile.sglang @@ -1,4 +1,5 @@ -FROM lmsysorg/sglang:v0.4.6.post5-cu124 +# FROM lmsysorg/sglang:v0.4.6.post5-cu124 -- this one didn't work +FROM lmsysorg/sglang:v0.4.5.post3-cu121 # These aren't all needed but good to have for debugging purposes RUN apt-get -yq update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ diff --git a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py index 138a4c83..a688be5c 100755 --- a/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py +++ b/model-engine/model_engine_server/inference/sglang/sglang-startup-script.py @@ -119,9 +119,6 @@ def main( "--trust-remote-code", "--log-level", "debug", - "--enable-dp-attention", - "--dp", - str(tp), ] print("Running SGLang server command...") subprocess.check_call(sglang_cmd)