diff --git a/src/art/dev/model.py b/src/art/dev/model.py
index 2ba170f3..7b2ed0b0 100644
--- a/src/art/dev/model.py
+++ b/src/art/dev/model.py
@@ -42,12 +42,12 @@ def get_model_config(
         disable_log_requests=True,
         # Multi-step processing is not supported for the Xformers attention backend
         # which is the fallback for devices with compute capability < 8.0
-        num_scheduler_steps=(
-            16
-            if config.get("torchtune_args") is None
-            and torch.cuda.get_device_capability()[0] >= 8
-            else 1
-        ),
+        # num_scheduler_steps=(
+        #     16
+        #     if config.get("torchtune_args") is None
+        #     and torch.cuda.get_device_capability()[0] >= 8
+        #     else 1
+        # ),
         enable_sleep_mode=enable_sleep_mode,
         generation_config="vllm",
     )
diff --git a/src/art/unsloth/state.py b/src/art/unsloth/state.py
index d5c6db29..5fab1975 100644
--- a/src/art/unsloth/state.py
+++ b/src/art/unsloth/state.py
@@ -38,6 +38,7 @@ class ModelState:
     """

     def __init__(self, config: InternalModelConfig) -> None:
+        from unsloth_zoo.vllm_rlhf_utils import ColocateWorkerExtension
         from vllm.engine import async_llm_engine

         # Patch MultiStepModelRunner for Unsloth compatibility
@@ -49,7 +50,7 @@ def __init__(self, config: InternalModelConfig) -> None:
         # Set effectively unlimited timeout to support engine pausing & resumption
         async_llm_engine.ENGINE_ITERATION_TIMEOUT_S = 2**31 - 1
         # Sticking with V0 engine for now
-        os.environ["VLLM_USE_V1"] = "0"
+        os.environ["VLLM_USE_V1"] = "1"
         # We can't use expandable segments with sleep mode
         enable_sleep_mode = config.get("engine_args", {}).get(
             "enable_sleep_mode", False
@@ -69,7 +70,13 @@ def _from_engine_args(
             engine_args: AsyncEngineArgs, *args: Any, **kwargs: Any
         ) -> AsyncLLMEngine:
             return from_engine_args(
-                replace(engine_args, **config.get("engine_args", {})), *args, **kwargs
+                replace(
+                    engine_args,
+                    **config.get("engine_args", {}),
+                    worker_extension_cls=f"{ColocateWorkerExtension.__module__}.{ColocateWorkerExtension.__qualname__}",
+                ),
+                *args,
+                **kwargs,
             )

         AsyncLLMEngine.from_engine_args = _from_engine_args