Skip to content

vllm.v1.engine.launch

LaunchEngineClient: A lightweight EngineClient for GPU-less online serving.

This implements the EngineClient protocol without AsyncLLM or EngineCore, enabling preprocessing (tokenization, rendering) and postprocessing (detokenization) without GPU inference.

LaunchEngineClient

Bases: EngineClient

GPU-less EngineClient that only supports preprocessing/postprocessing.

This is a Null Object at the EngineClient level, bypassing AsyncLLM entirely. It initializes renderer, io_processor, and input_processor for tokenization and rendering, but raises NotImplementedError for any inference-related operations.

Source code in vllm/v1/engine/launch.py
class LaunchEngineClient(EngineClient):
    """EngineClient that runs without a GPU, serving only pre/post-processing.

    A Null Object at the EngineClient level: AsyncLLM and EngineCore are
    bypassed entirely.  The renderer, io_processor, and input_processor are
    wired up so tokenization and rendering work, while every inference entry
    point raises NotImplementedError and every control hook is a no-op.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config

        # Preprocessing stack: the renderer feeds both the io_processor
        # and the input_processor below.
        self.renderer = renderer_from_config(vllm_config)
        self.io_processor = get_io_processor(
            vllm_config,
            self.renderer,
            self.model_config.io_processor_plugin,
        )

        # Converts TokPrompt --> EngineCoreRequest.
        self.input_processor = InputProcessor(vllm_config, self.renderer)

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
    ) -> "LaunchEngineClient":
        """Create a LaunchEngineClient from a VllmConfig without GPU."""
        return cls(vllm_config=vllm_config)

    # -- Task support --

    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Rendering is the only task this client can perform."""
        return ("render",)

    # -- Inference (not supported) --

    async def generate(
        self,
        prompt: EngineCoreRequest
        | PromptType
        | ProcessorInputs
        | AsyncGenerator[StreamingInput, None],
        sampling_params: SamplingParams,
        request_id: str,
        *,
        prompt_text: str | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Always raises: generation requires a real engine."""
        raise NotImplementedError(
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for generation requests."
        )
        # Unreachable `yield` keeps this an async generator function.
        yield  # type: ignore[misc] # pragma: no cover

    # -- Request management (no-op) --

    async def abort(
        self, request_id: str | Iterable[str], internal: bool = False
    ) -> None:
        """Nothing to abort: no requests are ever in flight."""
        return None

    # -- Generation control (no-op) --

    async def pause_generation(
        self,
        *,
        mode: PauseMode = "abort",
        wait_for_inflight_requests: bool | None = None,
        clear_cache: bool = True,
    ) -> None:
        """No generation loop exists, so pausing is a no-op."""
        return None

    async def resume_generation(self) -> None:
        """No generation loop exists, so resuming is a no-op."""
        return None

    async def is_paused(self) -> bool:
        """Never paused: there is no generation loop to pause."""
        return False

    def shutdown(self, timeout: float | None = None) -> None:
        """No background resources are held, so shutdown is a no-op."""
        return None

    async def encode(
        self,
        prompt: PromptType | ProcessorInputs,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        tokenization_kwargs: dict[str, Any] | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Always raises: encoding requires a real engine."""
        raise NotImplementedError(
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for encoding requests."
        )
        # Unreachable `yield` keeps this an async generator function.
        yield  # type: ignore[misc] # pragma: no cover

    # -- Observability (no-op / defaults) --

    async def is_tracing_enabled(self) -> bool:
        """Tracing is never enabled for this client."""
        return False

    async def do_log_stats(self) -> None:
        """No engine stats exist to log."""
        return None

    async def check_health(self) -> None:
        """Always healthy: there is no engine process to probe."""
        return None

    async def start_profile(self) -> None:
        """Profiling is a no-op without an engine."""
        return None

    async def stop_profile(self) -> None:
        """Profiling is a no-op without an engine."""
        return None

    # -- Cache management (no-op) --

    async def reset_mm_cache(self) -> None:
        """No multimodal cache exists to reset."""
        return None

    async def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        """Report success: there is no prefix cache to clear."""
        return True

    async def reset_encoder_cache(self) -> None:
        """No encoder cache exists to reset."""
        return None

    # -- Power management (no-op) --

    async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        """Sleep is a no-op without GPU state to release."""
        return None

    async def wake_up(self, tags: list[str] | None = None) -> None:
        """Wake-up is a no-op without GPU state to restore."""
        return None

    async def is_sleeping(self) -> bool:
        """Never sleeping: there is no GPU state to park."""
        return False

    # -- LoRA (not supported) --

    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """LoRA adapters cannot be loaded without an engine."""
        return False

    # -- Status properties --

    @property
    def is_running(self) -> bool:
        """Always running: the client is usable as soon as constructed."""
        return True

    @property
    def is_stopped(self) -> bool:
        """Never stopped while the object exists."""
        return False

    @property
    def errored(self) -> bool:
        """Never errored: nothing asynchronous can fail."""
        return False

    @property
    def dead_error(self) -> BaseException:
        """Error surfaced if callers treat this client as a dead engine."""
        return RuntimeError("LaunchEngineClient does not support inference")

from_vllm_config classmethod

from_vllm_config(
    vllm_config: VllmConfig,
) -> LaunchEngineClient

Create a LaunchEngineClient from a VllmConfig without GPU.

Source code in vllm/v1/engine/launch.py
@classmethod
def from_vllm_config(
    cls,
    vllm_config: VllmConfig,
) -> "LaunchEngineClient":
    """Build a GPU-free LaunchEngineClient from the given VllmConfig."""
    return cls(vllm_config=vllm_config)