vllm.entrypoints.openai.translations.serving

logger module-attribute

logger = init_logger(__name__)

EngineClient

Bases: ABC

Protocol class for Clients to Engine

Source code in vllm/engine/protocol.py
class EngineClient(ABC):
    """Protocol class for Clients to Engine"""

    vllm_config: VllmConfig
    model_config: ModelConfig
    input_processor: InputProcessor
    io_processor: IOProcessor | None

    @property
    @abstractmethod
    def renderer(self) -> BaseRenderer: ...

    @property
    @abstractmethod
    def is_running(self) -> bool: ...

    @property
    @abstractmethod
    def is_stopped(self) -> bool: ...

    @property
    @abstractmethod
    def errored(self) -> bool: ...

    @property
    @abstractmethod
    def dead_error(self) -> BaseException: ...

    @abstractmethod
    def generate(
        self,
        prompt: EngineCoreRequest
        | PromptType
        | DictPrompt
        | TokPrompt
        | AsyncGenerator[StreamingInput, None],
        sampling_params: SamplingParams,
        request_id: str,
        *,
        prompt_text: str | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Generate outputs for a request."""
        ...

    @abstractmethod
    def encode(
        self,
        prompt: PromptType | DictPrompt | TokPrompt,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Generate outputs for a request from a pooling model."""
        ...

    @abstractmethod
    async def abort(self, request_id: str | Iterable[str]) -> None:
        """Abort a request.

        Args:
            request_id: The unique id of the request,
                        or an iterable of such ids.
        """
        ...

    @abstractmethod
    async def is_tracing_enabled(self) -> bool: ...

    @abstractmethod
    async def do_log_stats(self) -> None: ...

    @abstractmethod
    async def check_health(self) -> None:
        """Raise if unhealthy"""
        ...

    @abstractmethod
    async def start_profile(self) -> None:
        """Start profiling the engine"""
        ...

    @abstractmethod
    async def stop_profile(self) -> None:
        """Stop profiling the engine"""
        ...

    @abstractmethod
    async def reset_mm_cache(self) -> None:
        """Reset the multi-modal cache"""
        ...

    @abstractmethod
    async def reset_encoder_cache(self) -> None:
        """Reset the encoder cache"""
        ...

    @abstractmethod
    async def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        """Reset the prefix cache and optionally any configured connector cache"""
        ...

    @abstractmethod
    async def sleep(self, level: int = 1) -> None:
        """Sleep the engine"""
        ...

    @abstractmethod
    async def wake_up(self, tags: list[str] | None = None) -> None:
        """Wake up the engine"""
        ...

    @abstractmethod
    async def is_sleeping(self) -> bool:
        """Check whether the engine is sleeping"""
        ...

    @abstractmethod
    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """Load a new LoRA adapter into the engine for future requests."""
        ...

    @abstractmethod
    async def pause_generation(
        self,
        *,
        mode: "PauseMode" = "abort",
        wait_for_inflight_requests: bool = False,
        clear_cache: bool = True,
    ) -> None:
        """Pause new generation/encoding requests.

        Args:
            mode: How to handle in-flight requests:
                - ``"abort"``: Abort all in-flight requests immediately
                  and return partial results with "abort" reason (default).
                - ``"wait"``: Wait for in-flight requests to complete.
                - ``"keep"``: Freeze requests in queue; they resume on
                  :meth:`resume_generation`.
            wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
            clear_cache: DEPRECATED. Whether to clear KV and prefix caches
                after draining.
        """
        ...

    @abstractmethod
    async def resume_generation(self) -> None:
        """Resume accepting generation/encoding requests."""
        ...

    @abstractmethod
    async def is_paused(self) -> bool:
        """Return whether the engine is currently paused."""
        ...

    async def scale_elastic_ep(
        self, new_data_parallel_size: int, drain_timeout: int = 300
    ) -> None:
        """Scale the engine"""
        raise NotImplementedError

    async def collective_rpc(
        self,
        method: str,
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
    ):
        """Perform a collective RPC call to the given path."""
        raise NotImplementedError

    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Get supported tasks"""
        raise NotImplementedError

    async def init_weight_transfer_engine(
        self, init_request: WeightTransferInitRequest
    ) -> None:
        """Initialize weight transfer for RL training."""
        raise NotImplementedError

    async def update_weights(self, request: WeightTransferUpdateRequest) -> None:
        """Batched weight update for RL training."""
        raise NotImplementedError
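
For orientation, here is a minimal sketch of driving an EngineClient implementation (for example vLLM's async engine) through generate(); construction of the engine object itself is assumed and not shown.

import uuid

from vllm import SamplingParams


async def run_one_request(engine) -> None:
    """Stream outputs for a single prompt via EngineClient.generate()."""
    params = SamplingParams(temperature=0.0, max_tokens=64)
    request_id = f"demo-{uuid.uuid4()}"

    # generate() yields RequestOutput objects; the final one has finished=True.
    async for output in engine.generate("Hello, my name is", params, request_id):
        if output.finished:
            print(output.outputs[0].text)

# e.g. asyncio.run(run_one_request(engine))  # 'engine' must satisfy EngineClient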

dead_error abstractmethod property

dead_error: BaseException

errored abstractmethod property

errored: bool

input_processor instance-attribute

input_processor: InputProcessor

io_processor instance-attribute

io_processor: IOProcessor | None

is_running abstractmethod property

is_running: bool

is_stopped abstractmethod property

is_stopped: bool

model_config instance-attribute

model_config: ModelConfig

renderer abstractmethod property

renderer: BaseRenderer

vllm_config instance-attribute

vllm_config: VllmConfig

abort abstractmethod async

abort(request_id: str | Iterable[str]) -> None

Abort a request.

Parameters:

Name Type Description Default
request_id str | Iterable[str]

The unique id of the request, or an iterable of such ids.

required
Source code in vllm/engine/protocol.py
@abstractmethod
async def abort(self, request_id: str | Iterable[str]) -> None:
    """Abort a request.

    Args:
        request_id: The unique id of the request,
                    or an iterable of such ids.
    """
    ...

add_lora abstractmethod async

add_lora(lora_request: LoRARequest) -> bool

Load a new LoRA adapter into the engine for future requests.

Source code in vllm/engine/protocol.py
@abstractmethod
async def add_lora(self, lora_request: LoRARequest) -> bool:
    """Load a new LoRA adapter into the engine for future requests."""
    ...

check_health abstractmethod async

check_health() -> None

Raise if unhealthy

Source code in vllm/engine/protocol.py
@abstractmethod
async def check_health(self) -> None:
    """Raise if unhealthy"""
    ...
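
As an illustration, check_health() can back a simple readiness probe; this is a hypothetical helper, not part of the API above.

import asyncio


async def wait_until_healthy(engine, retries: int = 10, delay_s: float = 1.0) -> None:
    """Poll EngineClient.check_health() until it stops raising."""
    for _ in range(retries):
        try:
            await engine.check_health()  # raises if the engine is unhealthy
            return
        except Exception:
            await asyncio.sleep(delay_s)
    raise RuntimeError("engine did not become healthy in time")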

collective_rpc async

collective_rpc(
    method: str,
    timeout: float | None = None,
    args: tuple = (),
    kwargs: dict | None = None,
)

Perform a collective RPC call to the given path.

Source code in vllm/engine/protocol.py
async def collective_rpc(
    self,
    method: str,
    timeout: float | None = None,
    args: tuple = (),
    kwargs: dict | None = None,
):
    """Perform a collective RPC call to the given path."""
    raise NotImplementedError

do_log_stats abstractmethod async

do_log_stats() -> None
Source code in vllm/engine/protocol.py
@abstractmethod
async def do_log_stats(self) -> None: ...

encode abstractmethod

encode(
    prompt: PromptType | DictPrompt | TokPrompt,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: LoRARequest | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]

Generate outputs for a request from a pooling model.

Source code in vllm/engine/protocol.py
@abstractmethod
def encode(
    self,
    prompt: PromptType | DictPrompt | TokPrompt,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: LoRARequest | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
    """Generate outputs for a request from a pooling model."""
    ...

generate abstractmethod

generate(
    prompt: EngineCoreRequest
    | PromptType
    | DictPrompt
    | TokPrompt
    | AsyncGenerator[StreamingInput, None],
    sampling_params: SamplingParams,
    request_id: str,
    *,
    prompt_text: str | None = None,
    lora_request: LoRARequest | None = None,
    tokenization_kwargs: dict[str, Any] | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    data_parallel_rank: int | None = None,
) -> AsyncGenerator[RequestOutput, None]

Generate outputs for a request.

Source code in vllm/engine/protocol.py
@abstractmethod
def generate(
    self,
    prompt: EngineCoreRequest
    | PromptType
    | DictPrompt
    | TokPrompt
    | AsyncGenerator[StreamingInput, None],
    sampling_params: SamplingParams,
    request_id: str,
    *,
    prompt_text: str | None = None,
    lora_request: LoRARequest | None = None,
    tokenization_kwargs: dict[str, Any] | None = None,
    trace_headers: Mapping[str, str] | None = None,
    priority: int = 0,
    data_parallel_rank: int | None = None,
) -> AsyncGenerator[RequestOutput, None]:
    """Generate outputs for a request."""
    ...

get_supported_tasks async

get_supported_tasks() -> tuple[SupportedTask, ...]

Get supported tasks

Source code in vllm/engine/protocol.py
async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
    """Get supported tasks"""
    raise NotImplementedError

init_weight_transfer_engine async

init_weight_transfer_engine(
    init_request: WeightTransferInitRequest,
) -> None

Initialize weight transfer for RL training.

Source code in vllm/engine/protocol.py
async def init_weight_transfer_engine(
    self, init_request: WeightTransferInitRequest
) -> None:
    """Initialize weight transfer for RL training."""
    raise NotImplementedError

is_paused abstractmethod async

is_paused() -> bool

Return whether the engine is currently paused.

Source code in vllm/engine/protocol.py
@abstractmethod
async def is_paused(self) -> bool:
    """Return whether the engine is currently paused."""
    ...

is_sleeping abstractmethod async

is_sleeping() -> bool

Check whether the engine is sleeping

Source code in vllm/engine/protocol.py
@abstractmethod
async def is_sleeping(self) -> bool:
    """Check whether the engine is sleeping"""
    ...

is_tracing_enabled abstractmethod async

is_tracing_enabled() -> bool
Source code in vllm/engine/protocol.py
@abstractmethod
async def is_tracing_enabled(self) -> bool: ...

pause_generation abstractmethod async

pause_generation(
    *,
    mode: PauseMode = "abort",
    wait_for_inflight_requests: bool = False,
    clear_cache: bool = True,
) -> None

Pause new generation/encoding requests.

Parameters:

Name Type Description Default
mode PauseMode

How to handle in-flight requests:

- "abort": Abort all in-flight requests immediately and return partial results with "abort" reason (default).
- "wait": Wait for in-flight requests to complete.
- "keep": Freeze requests in queue; they resume on resume_generation.

'abort'
wait_for_inflight_requests bool

DEPRECATED. Use mode="wait" instead.

False
clear_cache bool

DEPRECATED. Whether to clear KV and prefix caches after draining.

True
Source code in vllm/engine/protocol.py
@abstractmethod
async def pause_generation(
    self,
    *,
    mode: "PauseMode" = "abort",
    wait_for_inflight_requests: bool = False,
    clear_cache: bool = True,
) -> None:
    """Pause new generation/encoding requests.

    Args:
        mode: How to handle in-flight requests:
            - ``"abort"``: Abort all in-flight requests immediately
              and return partial results with "abort" reason (default).
            - ``"wait"``: Wait for in-flight requests to complete.
            - ``"keep"``: Freeze requests in queue; they resume on
              :meth:`resume_generation`.
        wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
        clear_cache: DEPRECATED. Whether to clear KV and prefix caches
            after draining.
    """
    ...
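
A hedged sketch of how pause_generation() and resume_generation() might be combined around a maintenance window (for example, a weight update in an RL loop); the engine object and the maintenance step are assumptions.

async def with_paused_engine(engine) -> None:
    """Drain in-flight requests, do maintenance, then resume serving."""
    # mode="wait" lets in-flight requests finish instead of aborting them.
    await engine.pause_generation(mode="wait")
    assert await engine.is_paused()

    # ... maintenance goes here, e.g. resetting caches or swapping weights ...

    await engine.resume_generation()
    assert not await engine.is_paused()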

reset_encoder_cache abstractmethod async

reset_encoder_cache() -> None

Reset the encoder cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_encoder_cache(self) -> None:
    """Reset the encoder cache"""
    ...

reset_mm_cache abstractmethod async

reset_mm_cache() -> None

Reset the multi-modal cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_mm_cache(self) -> None:
    """Reset the multi-modal cache"""
    ...

reset_prefix_cache abstractmethod async

reset_prefix_cache(
    reset_running_requests: bool = False,
    reset_connector: bool = False,
) -> bool

Reset the prefix cache and optionally any configured connector cache

Source code in vllm/engine/protocol.py
@abstractmethod
async def reset_prefix_cache(
    self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
    """Reset the prefix cache and optionally any configured connector cache"""
    ...

resume_generation abstractmethod async

resume_generation() -> None

Resume accepting generation/encoding requests.

Source code in vllm/engine/protocol.py
@abstractmethod
async def resume_generation(self) -> None:
    """Resume accepting generation/encoding requests."""
    ...

scale_elastic_ep async

scale_elastic_ep(
    new_data_parallel_size: int, drain_timeout: int = 300
) -> None

Scale the engine

Source code in vllm/engine/protocol.py
async def scale_elastic_ep(
    self, new_data_parallel_size: int, drain_timeout: int = 300
) -> None:
    """Scale the engine"""
    raise NotImplementedError

sleep abstractmethod async

sleep(level: int = 1) -> None

Sleep the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def sleep(self, level: int = 1) -> None:
    """Sleep the engine"""
    ...

start_profile abstractmethod async

start_profile() -> None

Start profiling the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def start_profile(self) -> None:
    """Start profiling the engine"""
    ...

stop_profile abstractmethod async

stop_profile() -> None

Stop profiling the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def stop_profile(self) -> None:
    """Stop profiling the engine"""
    ...

update_weights async

update_weights(
    request: WeightTransferUpdateRequest,
) -> None

Batched weight update for RL training.

Source code in vllm/engine/protocol.py
async def update_weights(self, request: WeightTransferUpdateRequest) -> None:
    """Batched weight update for RL training."""
    raise NotImplementedError

wake_up abstractmethod async

wake_up(tags: list[str] | None = None) -> None

Wake up the engine

Source code in vllm/engine/protocol.py
@abstractmethod
async def wake_up(self, tags: list[str] | None = None) -> None:
    """Wake up the engine"""
    ...
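
For context, a possible sleep/wake cycle to release accelerator state between bursts of traffic; the exact effect of each sleep level depends on the engine configuration, so treat this as a sketch.

async def idle_then_resume(engine) -> None:
    """Put the engine to sleep while idle and wake it before serving again."""
    await engine.sleep(level=1)      # higher levels release more state
    assert await engine.is_sleeping()

    # ... later, when traffic returns ...
    await engine.wake_up()           # wake_up(tags=[...]) can wake selectively
    assert not await engine.is_sleeping()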

ErrorResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/engine/protocol.py
class ErrorResponse(OpenAIBaseModel):
    error: ErrorInfo

error instance-attribute

error: ErrorInfo

OpenAIServingModels

Shared instance to hold data about the loaded base model(s) and adapters.

Handles the routes:

- /v1/models
- /v1/load_lora_adapter
- /v1/unload_lora_adapter

Source code in vllm/entrypoints/openai/models/serving.py
class OpenAIServingModels:
    """Shared instance to hold data about the loaded base model(s) and adapters.

    Handles the routes:
    - /v1/models
    - /v1/load_lora_adapter
    - /v1/unload_lora_adapter
    """

    def __init__(
        self,
        engine_client: EngineClient,
        base_model_paths: list[BaseModelPath],
        *,
        lora_modules: list[LoRAModulePath] | None = None,
    ):
        super().__init__()

        self.engine_client = engine_client
        self.base_model_paths = base_model_paths

        self.static_lora_modules = lora_modules
        self.lora_requests: dict[str, LoRARequest] = {}
        self.lora_id_counter = AtomicCounter(0)

        self.lora_resolvers: list[LoRAResolver] = []
        for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
            self.lora_resolvers.append(
                LoRAResolverRegistry.get_resolver(lora_resolver_name)
            )
        self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

        self.input_processor = self.engine_client.input_processor
        self.io_processor = self.engine_client.io_processor
        self.renderer = self.engine_client.renderer
        self.model_config = self.engine_client.model_config
        self.max_model_len = self.model_config.max_model_len

    async def init_static_loras(self):
        """Loads all static LoRA modules.
        Raises if any fail to load"""
        if self.static_lora_modules is None:
            return
        for lora in self.static_lora_modules:
            load_request = LoadLoRAAdapterRequest(
                lora_path=lora.path, lora_name=lora.name
            )
            load_result = await self.load_lora_adapter(
                request=load_request, base_model_name=lora.base_model_name
            )
            if isinstance(load_result, ErrorResponse):
                raise ValueError(load_result.error.message)

    def is_base_model(self, model_name) -> bool:
        return any(model.name == model_name for model in self.base_model_paths)

    def model_name(self, lora_request: LoRARequest | None = None) -> str:
        """Returns the appropriate model name depending on the availability
        and support of the LoRA or base model.
        Parameters:
        - lora: LoRARequest that contain a base_model_name.
        Returns:
        - str: The name of the base model or the first available model path.
        """
        if lora_request is not None:
            return lora_request.lora_name
        return self.base_model_paths[0].name

    async def show_available_models(self) -> ModelList:
        """Show available models. This includes the base model and all
        adapters"""
        model_cards = [
            ModelCard(
                id=base_model.name,
                max_model_len=self.max_model_len,
                root=base_model.model_path,
                permission=[ModelPermission()],
            )
            for base_model in self.base_model_paths
        ]
        lora_cards = [
            ModelCard(
                id=lora.lora_name,
                root=lora.path,
                parent=lora.base_model_name
                if lora.base_model_name
                else self.base_model_paths[0].name,
                permission=[ModelPermission()],
            )
            for lora in self.lora_requests.values()
        ]
        model_cards.extend(lora_cards)
        return ModelList(data=model_cards)

    async def load_lora_adapter(
        self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
    ) -> ErrorResponse | str:
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_load_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            lora_path = request.lora_path
            lora_int_id = (
                self.lora_requests[lora_name].lora_int_id
                if lora_name in self.lora_requests
                else self.lora_id_counter.inc(1)
            )
            lora_request = LoRARequest(
                lora_name=lora_name,
                lora_int_id=lora_int_id,
                lora_path=lora_path,
                load_inplace=request.load_inplace,
            )
            if base_model_name is not None and self.is_base_model(base_model_name):
                lora_request.base_model_name = base_model_name

            # Validate that the adapter can be loaded into the engine
            # This will also preload it for incoming requests
            try:
                await self.engine_client.add_lora(lora_request)
            except Exception as e:
                error_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                if "No adapter found" in str(e):
                    error_type = "NotFoundError"
                    status_code = HTTPStatus.NOT_FOUND

                return create_error_response(
                    message=str(e), err_type=error_type, status_code=status_code
                )

            self.lora_requests[lora_name] = lora_request
            logger.info(
                "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
            )
            return f"Success: LoRA adapter '{lora_name}' added successfully."

    async def unload_lora_adapter(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | str:
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_unload_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            # Safe to delete now since we hold the lock
            del self.lora_requests[lora_name]
            logger.info("Removed LoRA adapter: name '%s'", lora_name)
            return f"Success: LoRA adapter '{lora_name}' removed successfully."

    async def _check_load_lora_adapter_request(
        self, request: LoadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        # Check if both 'lora_name' and 'lora_path' are provided
        if not request.lora_name or not request.lora_path:
            return create_error_response(
                message="Both 'lora_name' and 'lora_path' must be provided.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # If not loading inplace
        # Check if the lora adapter with the given name already exists
        if not request.load_inplace and request.lora_name in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' has already been "
                "loaded. If you want to load the adapter in place, set 'load_inplace'"
                " to True.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        return None

    async def _check_unload_lora_adapter_request(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        # Check if 'lora_name' is not provided return an error
        if not request.lora_name:
            return create_error_response(
                message="'lora_name' needs to be provided to unload a LoRA adapter.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # Check if the lora adapter with the given name exists
        if request.lora_name not in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' cannot be found.",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )

        return None

    async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
        """Attempt to resolve a LoRA adapter using available resolvers.

        Args:
            lora_name: Name/identifier of the LoRA adapter

        Returns:
            LoRARequest if found and loaded successfully.
            ErrorResponse (404) if no resolver finds the adapter.
            ErrorResponse (400) if adapter(s) are found but none load.
        """
        async with self.lora_resolver_lock[lora_name]:
            # First check if this LoRA is already loaded
            if lora_name in self.lora_requests:
                return self.lora_requests[lora_name]

            base_model_name = self.model_config.model
            unique_id = self.lora_id_counter.inc(1)
            found_adapter = False

            # Try to resolve using available resolvers
            for resolver in self.lora_resolvers:
                lora_request = await resolver.resolve_lora(base_model_name, lora_name)

                if lora_request is not None:
                    found_adapter = True
                    lora_request.lora_int_id = unique_id

                    try:
                        await self.engine_client.add_lora(lora_request)
                        self.lora_requests[lora_name] = lora_request
                        logger.info(
                            "Resolved and loaded LoRA adapter '%s' using %s",
                            lora_name,
                            resolver.__class__.__name__,
                        )
                        return lora_request
                    except BaseException as e:
                        logger.warning(
                            "Failed to load LoRA '%s' resolved by %s: %s. "
                            "Trying next resolver.",
                            lora_name,
                            resolver.__class__.__name__,
                            e,
                        )
                        continue

            if found_adapter:
                # An adapter was found, but all attempts to load it failed.
                return create_error_response(
                    message=(
                        f"LoRA adapter '{lora_name}' was found but could not be loaded."
                    ),
                    err_type="BadRequestError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            else:
                # No adapter was found
                return create_error_response(
                    message=f"LoRA adapter {lora_name} does not exist",
                    err_type="NotFoundError",
                    status_code=HTTPStatus.NOT_FOUND,
                )
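
The class above only implements the server side; a client-side sketch of the two dynamic-adapter routes is shown below. The base URL, adapter name, and path are placeholders, and depending on the deployment, dynamic LoRA loading may need to be enabled explicitly.

import requests

BASE_URL = "http://localhost:8000"  # assumed vLLM OpenAI-compatible server


def load_adapter(name: str, path: str) -> str:
    """POST /v1/load_lora_adapter with the fields LoadLoRAAdapterRequest expects."""
    resp = requests.post(
        f"{BASE_URL}/v1/load_lora_adapter",
        json={"lora_name": name, "lora_path": path},
    )
    resp.raise_for_status()
    return resp.text  # "Success: LoRA adapter '<name>' added successfully."


def unload_adapter(name: str) -> str:
    """POST /v1/unload_lora_adapter to remove a previously loaded adapter."""
    resp = requests.post(
        f"{BASE_URL}/v1/unload_lora_adapter",
        json={"lora_name": name},
    )
    resp.raise_for_status()
    return resp.text


# load_adapter("sql-lora", "/path/to/sql-lora")
# unload_adapter("sql-lora")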

base_model_paths instance-attribute

base_model_paths = base_model_paths

engine_client instance-attribute

engine_client = engine_client

input_processor instance-attribute

input_processor = input_processor

io_processor instance-attribute

io_processor = io_processor

lora_id_counter instance-attribute

lora_id_counter = AtomicCounter(0)

lora_requests instance-attribute

lora_requests: dict[str, LoRARequest] = {}

lora_resolver_lock instance-attribute

lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

lora_resolvers instance-attribute

lora_resolvers: list[LoRAResolver] = []

max_model_len instance-attribute

max_model_len = max_model_len

model_config instance-attribute

model_config = model_config

renderer instance-attribute

renderer = renderer

static_lora_modules instance-attribute

static_lora_modules = lora_modules

__init__

__init__(
    engine_client: EngineClient,
    base_model_paths: list[BaseModelPath],
    *,
    lora_modules: list[LoRAModulePath] | None = None,
)
Source code in vllm/entrypoints/openai/models/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    base_model_paths: list[BaseModelPath],
    *,
    lora_modules: list[LoRAModulePath] | None = None,
):
    super().__init__()

    self.engine_client = engine_client
    self.base_model_paths = base_model_paths

    self.static_lora_modules = lora_modules
    self.lora_requests: dict[str, LoRARequest] = {}
    self.lora_id_counter = AtomicCounter(0)

    self.lora_resolvers: list[LoRAResolver] = []
    for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers():
        self.lora_resolvers.append(
            LoRAResolverRegistry.get_resolver(lora_resolver_name)
        )
    self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

    self.input_processor = self.engine_client.input_processor
    self.io_processor = self.engine_client.io_processor
    self.renderer = self.engine_client.renderer
    self.model_config = self.engine_client.model_config
    self.max_model_len = self.model_config.max_model_len

_check_load_lora_adapter_request async

_check_load_lora_adapter_request(
    request: LoadLoRAAdapterRequest,
) -> ErrorResponse | None
Source code in vllm/entrypoints/openai/models/serving.py
async def _check_load_lora_adapter_request(
    self, request: LoadLoRAAdapterRequest
) -> ErrorResponse | None:
    # Check if both 'lora_name' and 'lora_path' are provided
    if not request.lora_name or not request.lora_path:
        return create_error_response(
            message="Both 'lora_name' and 'lora_path' must be provided.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # If not loading inplace
    # Check if the lora adapter with the given name already exists
    if not request.load_inplace and request.lora_name in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{request.lora_name}' has already been "
            "loaded. If you want to load the adapter in place, set 'load_inplace'"
            " to True.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    return None

_check_unload_lora_adapter_request async

_check_unload_lora_adapter_request(
    request: UnloadLoRAAdapterRequest,
) -> ErrorResponse | None
Source code in vllm/entrypoints/openai/models/serving.py
async def _check_unload_lora_adapter_request(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | None:
    # Check if 'lora_name' is not provided return an error
    if not request.lora_name:
        return create_error_response(
            message="'lora_name' needs to be provided to unload a LoRA adapter.",
            err_type="InvalidUserInput",
            status_code=HTTPStatus.BAD_REQUEST,
        )

    # Check if the lora adapter with the given name exists
    if request.lora_name not in self.lora_requests:
        return create_error_response(
            message=f"The lora adapter '{request.lora_name}' cannot be found.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND,
        )

    return None

init_static_loras async

init_static_loras()

Loads all static LoRA modules. Raises if any fail to load.

Source code in vllm/entrypoints/openai/models/serving.py
async def init_static_loras(self):
    """Loads all static LoRA modules.
    Raises if any fail to load"""
    if self.static_lora_modules is None:
        return
    for lora in self.static_lora_modules:
        load_request = LoadLoRAAdapterRequest(
            lora_path=lora.path, lora_name=lora.name
        )
        load_result = await self.load_lora_adapter(
            request=load_request, base_model_name=lora.base_model_name
        )
        if isinstance(load_result, ErrorResponse):
            raise ValueError(load_result.error.message)

is_base_model

is_base_model(model_name) -> bool
Source code in vllm/entrypoints/openai/models/serving.py
def is_base_model(self, model_name) -> bool:
    return any(model.name == model_name for model in self.base_model_paths)

load_lora_adapter async

load_lora_adapter(
    request: LoadLoRAAdapterRequest,
    base_model_name: str | None = None,
) -> ErrorResponse | str
Source code in vllm/entrypoints/openai/models/serving.py
async def load_lora_adapter(
    self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
) -> ErrorResponse | str:
    lora_name = request.lora_name

    # Ensure atomicity based on the lora name
    async with self.lora_resolver_lock[lora_name]:
        error_check_ret = await self._check_load_lora_adapter_request(request)
        if error_check_ret is not None:
            return error_check_ret

        lora_path = request.lora_path
        lora_int_id = (
            self.lora_requests[lora_name].lora_int_id
            if lora_name in self.lora_requests
            else self.lora_id_counter.inc(1)
        )
        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_int_id=lora_int_id,
            lora_path=lora_path,
            load_inplace=request.load_inplace,
        )
        if base_model_name is not None and self.is_base_model(base_model_name):
            lora_request.base_model_name = base_model_name

        # Validate that the adapter can be loaded into the engine
        # This will also preload it for incoming requests
        try:
            await self.engine_client.add_lora(lora_request)
        except Exception as e:
            error_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            if "No adapter found" in str(e):
                error_type = "NotFoundError"
                status_code = HTTPStatus.NOT_FOUND

            return create_error_response(
                message=str(e), err_type=error_type, status_code=status_code
            )

        self.lora_requests[lora_name] = lora_request
        logger.info(
            "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
        )
        return f"Success: LoRA adapter '{lora_name}' added successfully."

model_name

model_name(lora_request: LoRARequest | None = None) -> str

Returns the model name to report for a request: the LoRA adapter name when lora_request is provided, otherwise the name of the first configured base model.

Parameters:

Name Type Description Default
lora_request LoRARequest | None

An optional LoRARequest; if provided, its lora_name is returned.

None

Returns:

Type Description
str

The LoRA adapter name, or the name of the first base model path.

Source code in vllm/entrypoints/openai/models/serving.py
def model_name(self, lora_request: LoRARequest | None = None) -> str:
    """Returns the appropriate model name depending on the availability
    and support of the LoRA or base model.
    Parameters:
    - lora: LoRARequest that contain a base_model_name.
    Returns:
    - str: The name of the base model or the first available model path.
    """
    if lora_request is not None:
        return lora_request.lora_name
    return self.base_model_paths[0].name

resolve_lora async

resolve_lora(lora_name: str) -> LoRARequest | ErrorResponse

Attempt to resolve a LoRA adapter using available resolvers.

Parameters:

Name Type Description Default
lora_name str

Name/identifier of the LoRA adapter

required

Returns:

Type Description
LoRARequest | ErrorResponse

- LoRARequest if found and loaded successfully.
- ErrorResponse (404) if no resolver finds the adapter.
- ErrorResponse (400) if adapter(s) are found but none load.

Source code in vllm/entrypoints/openai/models/serving.py
async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
    """Attempt to resolve a LoRA adapter using available resolvers.

    Args:
        lora_name: Name/identifier of the LoRA adapter

    Returns:
        LoRARequest if found and loaded successfully.
        ErrorResponse (404) if no resolver finds the adapter.
        ErrorResponse (400) if adapter(s) are found but none load.
    """
    async with self.lora_resolver_lock[lora_name]:
        # First check if this LoRA is already loaded
        if lora_name in self.lora_requests:
            return self.lora_requests[lora_name]

        base_model_name = self.model_config.model
        unique_id = self.lora_id_counter.inc(1)
        found_adapter = False

        # Try to resolve using available resolvers
        for resolver in self.lora_resolvers:
            lora_request = await resolver.resolve_lora(base_model_name, lora_name)

            if lora_request is not None:
                found_adapter = True
                lora_request.lora_int_id = unique_id

                try:
                    await self.engine_client.add_lora(lora_request)
                    self.lora_requests[lora_name] = lora_request
                    logger.info(
                        "Resolved and loaded LoRA adapter '%s' using %s",
                        lora_name,
                        resolver.__class__.__name__,
                    )
                    return lora_request
                except BaseException as e:
                    logger.warning(
                        "Failed to load LoRA '%s' resolved by %s: %s. "
                        "Trying next resolver.",
                        lora_name,
                        resolver.__class__.__name__,
                        e,
                    )
                    continue

        if found_adapter:
            # An adapter was found, but all attempts to load it failed.
            return create_error_response(
                message=(
                    f"LoRA adapter '{lora_name}' was found but could not be loaded."
                ),
                err_type="BadRequestError",
                status_code=HTTPStatus.BAD_REQUEST,
            )
        else:
            # No adapter was found
            return create_error_response(
                message=f"LoRA adapter {lora_name} does not exist",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )
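
The loop above only assumes that a resolver exposes an async resolve_lora(base_model_name, lora_name) returning a LoRARequest or None. A hypothetical filesystem-backed resolver following that shape might look like this (the directory layout and registration with LoRAResolverRegistry are assumptions and not shown here).

import os

from vllm.lora.request import LoRARequest


class LocalDirLoRAResolver:
    """Hypothetical resolver that maps lora_name to a subdirectory on disk."""

    def __init__(self, root: str = "/adapters"):
        self.root = root

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        path = os.path.join(self.root, lora_name)
        if not os.path.isdir(path):
            return None  # not found: resolve_lora() moves on to the next resolver
        # lora_int_id is overwritten by the caller before the adapter is loaded.
        return LoRARequest(lora_name=lora_name, lora_int_id=0, lora_path=path)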

show_available_models async

show_available_models() -> ModelList

Show available models. This includes the base model and all adapters.

Source code in vllm/entrypoints/openai/models/serving.py
async def show_available_models(self) -> ModelList:
    """Show available models. This includes the base model and all
    adapters"""
    model_cards = [
        ModelCard(
            id=base_model.name,
            max_model_len=self.max_model_len,
            root=base_model.model_path,
            permission=[ModelPermission()],
        )
        for base_model in self.base_model_paths
    ]
    lora_cards = [
        ModelCard(
            id=lora.lora_name,
            root=lora.path,
            parent=lora.base_model_name
            if lora.base_model_name
            else self.base_model_paths[0].name,
            permission=[ModelPermission()],
        )
        for lora in self.lora_requests.values()
    ]
    model_cards.extend(lora_cards)
    return ModelList(data=model_cards)
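
On the client side, the result of show_available_models() is what /v1/models returns; a small example with the OpenAI Python client (base URL and API key are placeholders for a local server):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Both base models and any loaded LoRA adapters appear as model entries.
for model in client.models.list().data:
    print(model.id)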

unload_lora_adapter async

unload_lora_adapter(
    request: UnloadLoRAAdapterRequest,
) -> ErrorResponse | str
Source code in vllm/entrypoints/openai/models/serving.py
async def unload_lora_adapter(
    self, request: UnloadLoRAAdapterRequest
) -> ErrorResponse | str:
    lora_name = request.lora_name

    # Ensure atomicity based on the lora name
    async with self.lora_resolver_lock[lora_name]:
        error_check_ret = await self._check_unload_lora_adapter_request(request)
        if error_check_ret is not None:
            return error_check_ret

        # Safe to delete now since we hold the lock
        del self.lora_requests[lora_name]
        logger.info("Removed LoRA adapter: name '%s'", lora_name)
        return f"Success: LoRA adapter '{lora_name}' removed successfully."

OpenAIServingTranscription

Bases: OpenAISpeechToText

Handles transcription requests.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
class OpenAIServingTranscription(OpenAISpeechToText):
    """Handles transcription requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="transcribe",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_transcription(
        self,
        audio_data: bytes,
        request: TranscriptionRequest,
        raw_request: Request | None = None,
    ) -> (
        TranscriptionResponse
        | TranscriptionResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Transcription API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranscription
        for the API specification. This API mimics the OpenAI transcription API.
        """
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=(
                TranscriptionResponseVerbose
                if request.response_format == "verbose_json"
                else TranscriptionResponse
            ),
            stream_generator_method=self.transcription_stream_generator,
        )

    async def transcription_stream_generator(
        self,
        request: TranscriptionRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        generator = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="transcription.chunk",
            response_stream_choice_class=TranscriptionResponseStreamChoice,
            stream_response_class=TranscriptionStreamResponse,
        )
        async for chunk in generator:
            yield chunk

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        task_type="transcribe",
        log_error_stack=log_error_stack,
        enable_force_include_usage=enable_force_include_usage,
    )

create_transcription async

create_transcription(
    audio_data: bytes,
    request: TranscriptionRequest,
    raw_request: Request | None = None,
) -> (
    TranscriptionResponse
    | TranscriptionResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
)

Transcription API similar to OpenAI's API.

See https://platform.openai.com/docs/api-reference/audio/createTranscription for the API specification. This API mimics the OpenAI transcription API.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def create_transcription(
    self,
    audio_data: bytes,
    request: TranscriptionRequest,
    raw_request: Request | None = None,
) -> (
    TranscriptionResponse
    | TranscriptionResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
):
    """Transcription API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/audio/createTranscription
    for the API specification. This API mimics the OpenAI transcription API.
    """
    return await self._create_speech_to_text(
        audio_data=audio_data,
        request=request,
        raw_request=raw_request,
        response_class=(
            TranscriptionResponseVerbose
            if request.response_format == "verbose_json"
            else TranscriptionResponse
        ),
        stream_generator_method=self.transcription_stream_generator,
    )

transcription_stream_generator async

transcription_stream_generator(
    request: TranscriptionRequest,
    result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def transcription_stream_generator(
    self,
    request: TranscriptionRequest,
    result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]:
    generator = self._speech_to_text_stream_generator(
        request=request,
        list_result_generator=result_generator,
        request_id=request_id,
        request_metadata=request_metadata,
        audio_duration_s=audio_duration_s,
        chunk_object_type="transcription.chunk",
        response_stream_choice_class=TranscriptionResponseStreamChoice,
        stream_response_class=TranscriptionStreamResponse,
    )
    async for chunk in generator:
        yield chunk

OpenAIServingTranslation

Bases: OpenAISpeechToText

Handles translation requests.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
class OpenAIServingTranslation(OpenAISpeechToText):
    """Handles translation requests."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            task_type="translate",
            log_error_stack=log_error_stack,
            enable_force_include_usage=enable_force_include_usage,
        )

    async def create_translation(
        self,
        audio_data: bytes,
        request: TranslationRequest,
        raw_request: Request | None = None,
    ) -> (
        TranslationResponse
        | TranslationResponseVerbose
        | AsyncGenerator[str, None]
        | ErrorResponse
    ):
        """Translation API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/audio/createTranslation
        for the API specification. This API mimics the OpenAI translation API.
        """
        return await self._create_speech_to_text(
            audio_data=audio_data,
            request=request,
            raw_request=raw_request,
            response_class=(
                TranslationResponseVerbose
                if request.response_format == "verbose_json"
                else TranslationResponse
            ),
            stream_generator_method=self.translation_stream_generator,
        )

    async def translation_stream_generator(
        self,
        request: TranslationRequest,
        result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
    ) -> AsyncGenerator[str, None]:
        generator = self._speech_to_text_stream_generator(
            request=request,
            list_result_generator=result_generator,
            request_id=request_id,
            request_metadata=request_metadata,
            audio_duration_s=audio_duration_s,
            chunk_object_type="translation.chunk",
            response_stream_choice_class=TranslationResponseStreamChoice,
            stream_response_class=TranslationStreamResponse,
        )
        async for chunk in generator:
            yield chunk

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        task_type="translate",
        log_error_stack=log_error_stack,
        enable_force_include_usage=enable_force_include_usage,
    )

create_translation async

create_translation(
    audio_data: bytes,
    request: TranslationRequest,
    raw_request: Request | None = None,
) -> (
    TranslationResponse
    | TranslationResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
)

Translation API similar to OpenAI's API.

See https://platform.openai.com/docs/api-reference/audio/createTranslation for the API specification. This API mimics the OpenAI translation API.

Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def create_translation(
    self,
    audio_data: bytes,
    request: TranslationRequest,
    raw_request: Request | None = None,
) -> (
    TranslationResponse
    | TranslationResponseVerbose
    | AsyncGenerator[str, None]
    | ErrorResponse
):
    """Translation API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/audio/createTranslation
    for the API specification. This API mimics the OpenAI translation API.
    """
    return await self._create_speech_to_text(
        audio_data=audio_data,
        request=request,
        raw_request=raw_request,
        response_class=(
            TranslationResponseVerbose
            if request.response_format == "verbose_json"
            else TranslationResponse
        ),
        stream_generator_method=self.translation_stream_generator,
    )
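
Analogously, the translation endpoint can be exercised with the OpenAI Python client (placeholders as above); the streaming path is handled by translation_stream_generator below, so only the non-streaming call is shown here.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("speech_de.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="openai/whisper-large-v3",  # placeholder model name
        file=audio_file,
    )
print(translation.text)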

translation_stream_generator async

translation_stream_generator(
    request: TranslationRequest,
    result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/serving.py
async def translation_stream_generator(
    self,
    request: TranslationRequest,
    result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
) -> AsyncGenerator[str, None]:
    generator = self._speech_to_text_stream_generator(
        request=request,
        list_result_generator=result_generator,
        request_id=request_id,
        request_metadata=request_metadata,
        audio_duration_s=audio_duration_s,
        chunk_object_type="translation.chunk",
        response_stream_choice_class=TranslationResponseStreamChoice,
        stream_response_class=TranslationStreamResponse,
    )
    async for chunk in generator:
        yield chunk

OpenAISpeechToText

Bases: OpenAIServing

Base class for speech-to-text operations like transcription and translation.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and
    translation."""

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        task_type: Literal["transcribe", "translate"] = "transcribe",
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.task_type: Final = task_type

        self.asr_config = self.model_cls.get_speech_to_text_config(
            self.model_config, task_type
        )

        self.enable_force_include_usage = enable_force_include_usage

        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
        if self.model_cls.supports_segment_timestamp:
            self.tokenizer = cast(
                PreTrainedTokenizerBase,
                get_tokenizer(
                    tokenizer_name=self.model_config.tokenizer,
                    tokenizer_mode=self.model_config.tokenizer_mode,
                ),
            )

        if self.default_sampling_params:
            logger.info(
                "Overwriting default completion sampling param with: %s",
                self.default_sampling_params,
            )

        # Warm up audio preprocessing to avoid first-request latency
        self._warmup_audio_preprocessing()
        # Warm up input processor with dummy audio
        self._warmup_input_processor()

    def _warmup_audio_preprocessing(self) -> None:
        """Warm up audio processing libraries to avoid first-request latency.

        The first call to librosa functions (load, get_duration, mel-spectrogram)
        triggers JIT compilation and library initialization which can take ~7s.
        This method warms up these operations during server initialization.
        """
        # Skip warmup if librosa is not installed (optional dependency)
        if isinstance(librosa, PlaceholderModule):
            return

        # Skip warmup if model doesn't support transcription
        if not supports_transcription(self.model_cls):
            return

        if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
            return

        try:
            warmup_start = time.perf_counter()
            logger.info("Warming up audio preprocessing libraries...")

            # Create a minimal dummy audio (1 second of silence at target sample rate)
            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

            # Warm up librosa.load by using librosa functions on the dummy data
            # This initializes FFTW, numba JIT, and other audio processing libraries
            _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)

            # Warm up mel-spectrogram computation with model-specific parameters
            from vllm.transformers_utils.processor import cached_processor_from_config

            processor = cached_processor_from_config(self.model_config)
            feature_extractor = None
            if hasattr(processor, "feature_extractor"):
                feature_extractor = processor.feature_extractor
            elif hasattr(processor, "audio_processor"):
                # For models like GraniteSpeech that use audio_processor
                audio_proc = processor.audio_processor
                if hasattr(audio_proc, "feature_extractor"):
                    feature_extractor = audio_proc.feature_extractor
                # If audio_processor doesn't have feature_extractor,
                # skip mel-spectrogram warmup for these models

            if feature_extractor is not None:
                _ = librosa.feature.melspectrogram(
                    y=dummy_audio,
                    sr=self.asr_config.sample_rate,
                    n_mels=getattr(feature_extractor, "n_mels", 128),
                    n_fft=getattr(feature_extractor, "n_fft", 400),
                    hop_length=getattr(feature_extractor, "hop_length", 160),
                )

            warmup_elapsed = time.perf_counter() - warmup_start
            logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed)
        except Exception:
            # Don't fail initialization if warmup fails - log exception and continue
            logger.exception(
                "Audio preprocessing warmup failed (non-fatal). "
                "First request may experience higher latency."
            )

    def _warmup_input_processor(self) -> None:
        """Warm up input processor with dummy audio to avoid first-request latency.

        The first call to input_processor.process_inputs() with multimodal audio
        triggers multimodal processing initialization which can take ~2.5s.
        This method processes a dummy audio request to warm up the pipeline.
        """
        # Skip warmup if model doesn't support transcription
        if not supports_transcription(self.model_cls):
            return

        # Only warm up if model supports transcription methods
        if not hasattr(self.model_cls, "get_generation_prompt"):
            return

        try:
            from vllm.sampling_params import SamplingParams

            warmup_start = time.perf_counter()
            logger.info("Warming up multimodal input processor...")

            # Create minimal dummy audio (1 second of silence)
            dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

            # Use the same method that _preprocess_speech_to_text uses
            # to create the prompt
            dummy_prompt = self.model_cls.get_generation_prompt(
                audio=dummy_audio,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language="en",
                task_type=self.task_type,
                request_prompt="",
                to_language=None,
            )

            # Create minimal sampling params
            dummy_params = SamplingParams(
                max_tokens=1,
                temperature=0.0,
                skip_clone=True,  # Internal warmup, safe to skip clone
            )

            # Process the dummy input through the input processor
            # This will trigger all the multimodal processing initialization
            _ = self.input_processor.process_inputs(
                request_id="warmup",
                prompt=dummy_prompt,
                params=dummy_params,
            )

            warmup_elapsed = time.perf_counter() - warmup_start
            logger.info("Input processor warmup completed in %.2fs", warmup_elapsed)
        except Exception:
            # Don't fail initialization if warmup fails - log exception and continue
            logger.exception(
                "Input processor warmup failed (non-fatal). "
                "First request may experience higher latency."
            )

    @cached_property
    def model_cls(self) -> type[SupportsTranscription]:
        from vllm.model_executor.model_loader import get_model_cls

        model_cls = get_model_cls(self.model_config)
        return cast(type[SupportsTranscription], model_cls)

    async def _preprocess_speech_to_text(
        self,
        request: SpeechToTextRequest,
        audio_data: bytes,
    ) -> tuple[list[PromptType], float]:
        # Validate request
        language = self.model_cls.validate_language(request.language)
        # Skip to_language validation to avoid extra logging for Whisper.
        to_language = (
            self.model_cls.validate_language(request.to_language)
            if request.to_language
            else None
        )

        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise VLLMValidationError(
                "Maximum file size exceeded",
                parameter="audio_filesize_mb",
                value=len(audio_data) / 1024**2,
            )

        with io.BytesIO(audio_data) as bytes_:
            # NOTE resample to model SR here for efficiency. This is also a
            # pre-requisite for chunking, as it assumes Whisper SR.
            y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

        duration = librosa.get_duration(y=y, sr=sr)
        do_split_audio = (
            self.asr_config.allow_audio_chunking
            and duration > self.asr_config.max_audio_clip_s
        )
        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
        prompts = []
        for chunk in chunks:
            # The model has control over the construction, as long as it
            # returns a valid PromptType.
            prompt = self.model_cls.get_generation_prompt(
                audio=chunk,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language=language,
                task_type=self.task_type,
                request_prompt=request.prompt,
                to_language=to_language,
            )
            if request.response_format == "verbose_json":
                prompt = self._preprocess_verbose_prompt(parse_enc_dec_prompt(prompt))

            prompts.append(prompt)

        return prompts, duration

    def _preprocess_verbose_prompt(self, prompt: EncoderDecoderDictPrompt):
        dec_prompt = prompt["decoder_prompt"]

        if not (isinstance(dec_prompt, dict) and "prompt" in dec_prompt):
            raise VLLMValidationError(
                "Expected decoder_prompt to contain text",
                parameter="decoder_prompt",
                value=type(dec_prompt).__name__,
            )

        dec_prompt["prompt"] = dec_prompt["prompt"].replace(
            "<|notimestamps|>", "<|0.00|>"
        )

        return prompt

    def _get_verbose_segments(
        self,
        tokens: tuple,
        log_probs: FlatLogprobs | list[dict[int, Logprob]],
        request: SpeechToTextRequest,
        segment_class: type[SpeechToTextSegment],
        start_time: float = 0,
    ) -> list[SpeechToTextSegment]:
        """
        Convert tokens to verbose segments.

        This method expects the model to produce
        timestamps as tokens (similar to Whisper).
        If the tokens do not include timestamp information,
        the segments may not be generated correctly.

        Note: No_speech_prob field is not supported
        in this implementation and will be None. See docs for details.
        """
        BASE_OFFSET = 0.02
        init_token = self.tokenizer.encode("<|0.00|>", add_special_tokens=False)[0]
        if tokens[-1] == self.tokenizer.eos_token_id:
            tokens = tokens[:-1]

        tokens_with_start = (init_token,) + tokens
        segments: list[SpeechToTextSegment] = []
        last_timestamp_start = 0

        if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
            tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
        avg_logprob = 0.0
        for idx in range(1, len(tokens_with_start)):
            # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
            # If the ordering is violated, this slicing may produce incorrect results.
            token = tokens_with_start[idx]
            if token >= init_token and tokens_with_start[idx - 1] >= init_token:
                sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
                start_timestamp = sliced_timestamp_tokens[0] - init_token
                end_timestamp = sliced_timestamp_tokens[-1] - init_token
                text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
                text_bytes = text.encode("utf-8")

                casting_segment = cast(
                    SpeechToTextSegment,
                    segment_class(
                        id=len(segments),
                        seek=start_time,
                        start=start_time + BASE_OFFSET * start_timestamp,
                        end=start_time + BASE_OFFSET * end_timestamp,
                        temperature=request.temperature,
                        text=text,
                        # The compression ratio measures
                        # how compressible the generated text is.
                        # A higher ratio indicates more repetitive content,
                        # which is a strong sign of hallucination in outputs.
                        compression_ratio=len(text_bytes)
                        / len(zlib.compress(text_bytes)),
                        tokens=sliced_timestamp_tokens[1:-1],
                        avg_logprob=avg_logprob / (idx - last_timestamp_start),
                    ),
                )
                segments.append(casting_segment)
                last_timestamp_start = idx
                avg_logprob = 0
            else:
                avg_logprob += log_probs[idx - 1][token].logprob
        return segments

    async def _create_speech_to_text(
        self,
        audio_data: bytes,
        request: SpeechToTextRequest,
        raw_request: Request,
        response_class: type[ResponseType],
        stream_generator_method: Callable[..., AsyncGenerator[str, None]],
    ) -> T | V | AsyncGenerator[str, None] | ErrorResponse:
        """Base method for speech-to-text operations like transcription and
        translation."""
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        if request.response_format not in ["text", "json", "verbose_json"]:
            return self.create_error_response(
                "Currently only support response_format: "
                "`text`, `json` or `verbose_json`"
            )

        if (
            request.response_format == "verbose_json"
            and not self.model_cls.supports_segment_timestamp
        ):
            return self.create_error_response(
                f"Currently do not support verbose_json for {request.model}"
            )

        if request.response_format == "verbose_json" and request.stream:
            return self.create_error_response(
                "verbose_json format doesn't support streaming case"
            )
        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            prompts, duration_s = await self._preprocess_speech_to_text(
                request=request,
                audio_data=audio_data,
            )

        except ValueError as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
        try:
            # Unlike most decoder-only models, whisper generation length is not
            # constrained by the size of the input audio, which is mapped to a
            # fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be
            # generated by respecting the extra completion tokens arg.
            if request.max_completion_tokens is None:
                default_max_tokens = self.model_config.max_model_len
            else:
                default_max_tokens = min(
                    self.model_config.max_model_len, request.max_completion_tokens
                )
            sampling_params = request.to_sampling_params(
                default_max_tokens, self.default_sampling_params
            )
            if request.response_format == "verbose_json":
                sampling_params.logprobs = 1

            self._log_inputs(
                request_id,
                # It will not display special tokens like <|startoftranscript|>
                request.prompt,
                params=sampling_params,
                lora_request=lora_request,
            )

            list_result_generator = [
                self.engine_client.generate(
                    prompt,
                    sampling_params,
                    f"{request_id}_{i}",
                    lora_request=lora_request,
                )
                for i, prompt in enumerate(prompts)
            ]
        except ValueError as e:
            return self.create_error_response(e)

        if request.stream:
            return stream_generator_method(
                request, list_result_generator, request_id, request_metadata, duration_s
            )
        # Non-streaming response.
        total_segments = []
        text_parts = []
        try:
            assert list_result_generator is not None
            segments_types: dict[str, type[SpeechToTextSegment]] = {
                "transcribe": TranscriptionSegment,
                "translate": TranslationSegment,
            }
            segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
            text = ""
            chunk_size_in_s = self.asr_config.max_audio_clip_s
            if chunk_size_in_s is None:
                assert len(list_result_generator) == 1, (
                    "`max_audio_clip_s` is set to None, audio cannot be chunked"
                )
            for idx, result_generator in enumerate(list_result_generator):
                start_time = (
                    float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
                )
                async for op in result_generator:
                    if request.response_format == "verbose_json":
                        assert op.outputs[0].logprobs
                        segments: list[SpeechToTextSegment] = (
                            self._get_verbose_segments(
                                tokens=tuple(op.outputs[0].token_ids),
                                segment_class=segment_class,
                                request=request,
                                start_time=start_time,
                                log_probs=op.outputs[0].logprobs,
                            )
                        )

                        total_segments.extend(segments)
                        text_parts.extend([seg.text for seg in segments])
                    else:
                        raw_text = op.outputs[0].text
                        text_parts.append(self.model_cls.post_process_output(raw_text))
            text = "".join(text_parts)
            if self.task_type == "transcribe":
                final_response: ResponseType
                # add usage in TranscriptionResponse.
                usage = {
                    "type": "duration",
                    # rounded up as per OpenAI specs
                    "seconds": int(math.ceil(duration_s)),
                }
                if request.response_format != "verbose_json":
                    final_response = cast(
                        T, TranscriptionResponse(text=text, usage=usage)
                    )
                else:
                    final_response = cast(
                        V,
                        TranscriptionResponseVerbose(
                            text=text,
                            language=request.language,
                            duration=str(duration_s),
                            segments=total_segments,
                        ),
                    )
            else:
                # no usage in response for translation task
                if request.response_format != "verbose_json":
                    final_response = cast(T, TranslationResponse(text=text))
                else:
                    final_response = cast(
                        V,
                        TranslationResponseVerbose(
                            text=text,
                            language=request.language,
                            duration=str(duration_s),
                            segments=total_segments,
                        ),
                    )
            return final_response
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            return self.create_error_response(e)

    async def _speech_to_text_stream_generator(
        self,
        request: SpeechToTextRequest,
        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
        chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
        response_stream_choice_class: type[TranscriptionResponseStreamChoice]
        | type[TranslationResponseStreamChoice],
        stream_response_class: type[TranscriptionStreamResponse]
        | type[TranslationStreamResponse],
    ) -> AsyncGenerator[str, None]:
        created_time = int(time.time())
        model_name = request.model

        completion_tokens = 0
        num_prompt_tokens = 0

        include_usage = self.enable_force_include_usage or request.stream_include_usage
        include_continuous_usage = (
            request.stream_continuous_usage_stats
            if include_usage and request.stream_continuous_usage_stats
            else False
        )

        try:
            for result_generator in list_result_generator:
                async for res in result_generator:
                    # On first result.
                    if res.prompt_token_ids is not None:
                        num_prompt_tokens = len(res.prompt_token_ids)
                        if audio_tokens := self.model_cls.get_num_audio_tokens(
                            audio_duration_s, self.asr_config, self.model_config
                        ):
                            num_prompt_tokens += audio_tokens

                    # We need to do it here, because if there are exceptions in
                    # the result_generator, it needs to be sent as the FIRST
                    # response (by the try...catch).

                    # Just one output (n=1) supported.
                    assert len(res.outputs) == 1
                    output = res.outputs[0]

                    # TODO: For models that output structured formats (e.g.,
                    # Qwen3-ASR with "language X<asr_text>" prefix), streaming
                    # would need buffering to strip the prefix properly since
                    # deltas may split the tag across chunks.
                    delta_message = DeltaMessage(content=output.text)
                    completion_tokens += len(output.token_ids)

                    if output.finish_reason is None:
                        # Still generating, send delta update.
                        choice_data = response_stream_choice_class(delta=delta_message)
                    else:
                        # Model is finished generating.
                        choice_data = response_stream_choice_class(
                            delta=delta_message,
                            finish_reason=output.finish_reason,
                            stop_reason=output.stop_reason,
                        )

                    chunk = stream_response_class(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name,
                    )

                    # handle usage stats if requested & if continuous
                    if include_continuous_usage:
                        chunk.usage = UsageInfo(
                            prompt_tokens=num_prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=num_prompt_tokens + completion_tokens,
                        )

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"

            # Once the final token is handled, if stream_options.include_usage
            # is sent, send the usage.
            if include_usage:
                final_usage = UsageInfo(
                    prompt_tokens=num_prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
                )

                final_usage_chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[],
                    model=model_name,
                    usage=final_usage,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=True, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

        except Exception as e:
            logger.exception("Error in %s stream generator.", self.task_type)
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

    def _split_audio(
        self, audio_data: np.ndarray, sample_rate: int
    ) -> list[np.ndarray]:
        assert self.asr_config.max_audio_clip_s is not None, (
            f"{self.asr_config.max_audio_clip_s=} cannot be None to"
            " split audio into chunks."
        )
        chunk_size = sample_rate * self.asr_config.max_audio_clip_s
        overlap_size = sample_rate * self.asr_config.overlap_chunk_second
        chunks = []
        i = 0
        while i < audio_data.shape[-1]:
            if i + chunk_size >= audio_data.shape[-1]:
                # handle last chunk
                chunks.append(audio_data[..., i:])
                break

            # Find the best split point in the overlap region
            search_start = i + chunk_size - overlap_size
            search_end = min(i + chunk_size, audio_data.shape[-1])
            split_point = self._find_split_point(audio_data, search_start, search_end)

            # Extract chunk up to the split point
            chunks.append(audio_data[..., i:split_point])
            i = split_point
        return chunks

    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
        """Find the best point to split audio by
        looking for silence or low amplitude.
        Args:
            wav: Audio tensor [1, T]
            start_idx: Start index of search region
            end_idx: End index of search region
        Returns:
            Index of best splitting point
        """
        segment = wav[start_idx:end_idx]

        # Calculate RMS energy in small windows
        min_energy = math.inf
        quietest_idx = 0
        min_energy_window = self.asr_config.min_energy_split_window_size
        assert min_energy_window is not None
        for i in range(0, len(segment) - min_energy_window, min_energy_window):
            window = segment[i : i + min_energy_window]
            energy = (window**2).mean() ** 0.5
            if energy < min_energy:
                quietest_idx = i + start_idx
                min_energy = energy
        return quietest_idx
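
The transcription and translation endpoints built on this base class follow the OpenAI audio API, so a standard OpenAI client can exercise the flow above end to end. A minimal client-side sketch, assuming a vLLM server is already running at localhost:8000 with an ASR-capable model (the model name, port, and file name are placeholders):

# Client-side sketch only: server URL and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:
    # Exercises the "transcribe" task_type handled by this class.
    result = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",
        file=audio_file,
        language="en",
        response_format="json",
    )

print(result.text)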

asr_config instance-attribute

asr_config = get_speech_to_text_config(
    model_config, task_type
)

default_sampling_params instance-attribute

default_sampling_params = get_diff_sampling_param()

enable_force_include_usage instance-attribute

enable_force_include_usage = enable_force_include_usage

max_audio_filesize_mb instance-attribute

max_audio_filesize_mb = VLLM_MAX_AUDIO_CLIP_FILESIZE_MB

model_cls cached property

task_type instance-attribute

task_type: Final = task_type

tokenizer instance-attribute

tokenizer = cast(
    PreTrainedTokenizerBase,
    get_tokenizer(
        tokenizer_name=tokenizer,
        tokenizer_mode=tokenizer_mode,
    ),
)

__init__

__init__(
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    task_type: Literal[
        "transcribe", "translate"
    ] = "transcribe",
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
)
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def __init__(
    self,
    engine_client: EngineClient,
    models: OpenAIServingModels,
    *,
    request_logger: RequestLogger | None,
    return_tokens_as_token_ids: bool = False,
    task_type: Literal["transcribe", "translate"] = "transcribe",
    log_error_stack: bool = False,
    enable_force_include_usage: bool = False,
):
    super().__init__(
        engine_client=engine_client,
        models=models,
        request_logger=request_logger,
        return_tokens_as_token_ids=return_tokens_as_token_ids,
        log_error_stack=log_error_stack,
    )

    self.default_sampling_params = self.model_config.get_diff_sampling_param()
    self.task_type: Final = task_type

    self.asr_config = self.model_cls.get_speech_to_text_config(
        self.model_config, task_type
    )

    self.enable_force_include_usage = enable_force_include_usage

    self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
    if self.model_cls.supports_segment_timestamp:
        self.tokenizer = cast(
            PreTrainedTokenizerBase,
            get_tokenizer(
                tokenizer_name=self.model_config.tokenizer,
                tokenizer_mode=self.model_config.tokenizer_mode,
            ),
        )

    if self.default_sampling_params:
        logger.info(
            "Overwriting default completion sampling param with: %s",
            self.default_sampling_params,
        )

    # Warm up audio preprocessing to avoid first-request latency
    self._warmup_audio_preprocessing()
    # Warm up input processor with dummy audio
    self._warmup_input_processor()

_create_speech_to_text async

_create_speech_to_text(
    audio_data: bytes,
    request: SpeechToTextRequest,
    raw_request: Request,
    response_class: type[ResponseType],
    stream_generator_method: Callable[
        ..., AsyncGenerator[str, None]
    ],
) -> T | V | AsyncGenerator[str, None] | ErrorResponse

Base method for speech-to-text operations like transcription and translation.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _create_speech_to_text(
    self,
    audio_data: bytes,
    request: SpeechToTextRequest,
    raw_request: Request,
    response_class: type[ResponseType],
    stream_generator_method: Callable[..., AsyncGenerator[str, None]],
) -> T | V | AsyncGenerator[str, None] | ErrorResponse:
    """Base method for speech-to-text operations like transcription and
    translation."""
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    # If the engine is dead, raise the engine's DEAD_ERROR.
    # This is required for the streaming case, where we return a
    # success status before we actually start generating text :).
    if self.engine_client.errored:
        raise self.engine_client.dead_error

    if request.response_format not in ["text", "json", "verbose_json"]:
        return self.create_error_response(
            "Currently only support response_format: "
            "`text`, `json` or `verbose_json`"
        )

    if (
        request.response_format == "verbose_json"
        and not self.model_cls.supports_segment_timestamp
    ):
        return self.create_error_response(
            f"Currently do not support verbose_json for {request.model}"
        )

    if request.response_format == "verbose_json" and request.stream:
        return self.create_error_response(
            "verbose_json format doesn't support streaming case"
        )
    request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

    request_metadata = RequestResponseMetadata(request_id=request_id)
    if raw_request:
        raw_request.state.request_metadata = request_metadata

    try:
        lora_request = self._maybe_get_adapters(request)

        prompts, duration_s = await self._preprocess_speech_to_text(
            request=request,
            audio_data=audio_data,
        )

    except ValueError as e:
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(e)

    list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
    try:
        # Unlike most decoder-only models, whisper generation length is not
        # constrained by the size of the input audio, which is mapped to a
        # fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be
        # generated by respecting the extra completion tokens arg.
        if request.max_completion_tokens is None:
            default_max_tokens = self.model_config.max_model_len
        else:
            default_max_tokens = min(
                self.model_config.max_model_len, request.max_completion_tokens
            )
        sampling_params = request.to_sampling_params(
            default_max_tokens, self.default_sampling_params
        )
        if request.response_format == "verbose_json":
            sampling_params.logprobs = 1

        self._log_inputs(
            request_id,
            # It will not display special tokens like <|startoftranscript|>
            request.prompt,
            params=sampling_params,
            lora_request=lora_request,
        )

        list_result_generator = [
            self.engine_client.generate(
                prompt,
                sampling_params,
                f"{request_id}_{i}",
                lora_request=lora_request,
            )
            for i, prompt in enumerate(prompts)
        ]
    except ValueError as e:
        return self.create_error_response(e)

    if request.stream:
        return stream_generator_method(
            request, list_result_generator, request_id, request_metadata, duration_s
        )
    # Non-streaming response.
    total_segments = []
    text_parts = []
    try:
        assert list_result_generator is not None
        segments_types: dict[str, type[SpeechToTextSegment]] = {
            "transcribe": TranscriptionSegment,
            "translate": TranslationSegment,
        }
        segment_class: type[SpeechToTextSegment] = segments_types[self.task_type]
        text = ""
        chunk_size_in_s = self.asr_config.max_audio_clip_s
        if chunk_size_in_s is None:
            assert len(list_result_generator) == 1, (
                "`max_audio_clip_s` is set to None, audio cannot be chunked"
            )
        for idx, result_generator in enumerate(list_result_generator):
            start_time = (
                float(idx * chunk_size_in_s) if chunk_size_in_s is not None else 0.0
            )
            async for op in result_generator:
                if request.response_format == "verbose_json":
                    assert op.outputs[0].logprobs
                    segments: list[SpeechToTextSegment] = (
                        self._get_verbose_segments(
                            tokens=tuple(op.outputs[0].token_ids),
                            segment_class=segment_class,
                            request=request,
                            start_time=start_time,
                            log_probs=op.outputs[0].logprobs,
                        )
                    )

                    total_segments.extend(segments)
                    text_parts.extend([seg.text for seg in segments])
                else:
                    raw_text = op.outputs[0].text
                    text_parts.append(self.model_cls.post_process_output(raw_text))
        text = "".join(text_parts)
        if self.task_type == "transcribe":
            final_response: ResponseType
            # add usage in TranscriptionResponse.
            usage = {
                "type": "duration",
                # rounded up as per OpenAI specs
                "seconds": int(math.ceil(duration_s)),
            }
            if request.response_format != "verbose_json":
                final_response = cast(
                    T, TranscriptionResponse(text=text, usage=usage)
                )
            else:
                final_response = cast(
                    V,
                    TranscriptionResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        else:
            # no usage in response for translation task
            if request.response_format != "verbose_json":
                final_response = cast(T, TranslationResponse(text=text))
            else:
                final_response = cast(
                    V,
                    TranslationResponseVerbose(
                        text=text,
                        language=request.language,
                        duration=str(duration_s),
                        segments=total_segments,
                    ),
                )
        return final_response
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        return self.create_error_response(e)
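
For the verbose_json branch above (allowed only when the model reports supports_segment_timestamp and rejected together with stream), a raw multipart request shows the accepted form fields. A sketch using the requests library; the server URL and model name are assumptions:

# Sketch: request segment timestamps via response_format="verbose_json".
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",  # assumed server URL
        files={"file": ("sample.wav", f, "audio/wav")},
        data={
            "model": "openai/whisper-large-v3",  # placeholder model name
            "response_format": "verbose_json",
            "language": "en",
            "temperature": "0",
        },
    )

payload = resp.json()
print(payload["text"])
for seg in payload.get("segments", []):
    print(seg["start"], seg["end"], seg["text"])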

_find_split_point

_find_split_point(
    wav: ndarray, start_idx: int, end_idx: int
) -> int

Find the best point to split audio by looking for silence or low amplitude.

Args:
    wav: Audio tensor [1, T]
    start_idx: Start index of search region
    end_idx: End index of search region

Returns:
    Index of best splitting point

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
    """Find the best point to split audio by
    looking for silence or low amplitude.
    Args:
        wav: Audio tensor [1, T]
        start_idx: Start index of search region
        end_idx: End index of search region
    Returns:
        Index of best splitting point
    """
    segment = wav[start_idx:end_idx]

    # Calculate RMS energy in small windows
    min_energy = math.inf
    quietest_idx = 0
    min_energy_window = self.asr_config.min_energy_split_window_size
    assert min_energy_window is not None
    for i in range(0, len(segment) - min_energy_window, min_energy_window):
        window = segment[i : i + min_energy_window]
        energy = (window**2).mean() ** 0.5
        if energy < min_energy:
            quietest_idx = i + start_idx
            min_energy = energy
    return quietest_idx
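
To see what the windowed-RMS scan above does, the following standalone sketch applies the same search to synthetic audio. It is illustrative only: the 1600-sample window is an assumption, not the configured min_energy_split_window_size.

# Standalone illustration of the windowed-RMS split-point search.
import numpy as np

def find_quietest_index(
    wav: np.ndarray, start_idx: int, end_idx: int, window: int = 1600
) -> int:
    segment = wav[start_idx:end_idx]
    min_energy, quietest_idx = np.inf, start_idx
    for i in range(0, len(segment) - window, window):
        energy = float(np.sqrt(np.mean(segment[i : i + window] ** 2)))
        if energy < min_energy:
            quietest_idx, min_energy = i + start_idx, energy
    return quietest_idx

# 2 s of noise with a quiet patch around 1.5 s; the search lands in that patch.
sr = 16_000
wav = np.random.randn(2 * sr).astype(np.float32)
wav[int(1.45 * sr) : int(1.55 * sr)] *= 0.01
print(find_quietest_index(wav, sr, 2 * sr))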

_get_verbose_segments

_get_verbose_segments(
    tokens: tuple,
    log_probs: FlatLogprobs | list[dict[int, Logprob]],
    request: SpeechToTextRequest,
    segment_class: type[SpeechToTextSegment],
    start_time: float = 0,
) -> list[SpeechToTextSegment]

Convert tokens to verbose segments.

This method expects the model to produce timestamps as tokens (similar to Whisper). If the tokens do not include timestamp information, the segments may not be generated correctly.

Note: No_speech_prob field is not supported in this implementation and will be None. See docs for details.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _get_verbose_segments(
    self,
    tokens: tuple,
    log_probs: FlatLogprobs | list[dict[int, Logprob]],
    request: SpeechToTextRequest,
    segment_class: type[SpeechToTextSegment],
    start_time: float = 0,
) -> list[SpeechToTextSegment]:
    """
    Convert tokens to verbose segments.

    This method expects the model to produce
    timestamps as tokens (similar to Whisper).
    If the tokens do not include timestamp information,
    the segments may not be generated correctly.

    Note: No_speech_prob field is not supported
    in this implementation and will be None. See docs for details.
    """
    BASE_OFFSET = 0.02
    init_token = self.tokenizer.encode("<|0.00|>", add_special_tokens=False)[0]
    if tokens[-1] == self.tokenizer.eos_token_id:
        tokens = tokens[:-1]

    tokens_with_start = (init_token,) + tokens
    segments: list[SpeechToTextSegment] = []
    last_timestamp_start = 0

    if tokens_with_start[-2] < init_token and tokens_with_start[-1] >= init_token:
        tokens_with_start = tokens_with_start + (tokens_with_start[-1],)
    avg_logprob = 0.0
    for idx in range(1, len(tokens_with_start)):
        # Timestamp tokens (e.g., <|0.00|>) are assumed to be sorted.
        # If the ordering is violated, this slicing may produce incorrect results.
        token = tokens_with_start[idx]
        if token >= init_token and tokens_with_start[idx - 1] >= init_token:
            sliced_timestamp_tokens = tokens_with_start[last_timestamp_start:idx]
            start_timestamp = sliced_timestamp_tokens[0] - init_token
            end_timestamp = sliced_timestamp_tokens[-1] - init_token
            text = self.tokenizer.decode(sliced_timestamp_tokens[1:-1])
            text_bytes = text.encode("utf-8")

            casting_segment = cast(
                SpeechToTextSegment,
                segment_class(
                    id=len(segments),
                    seek=start_time,
                    start=start_time + BASE_OFFSET * start_timestamp,
                    end=start_time + BASE_OFFSET * end_timestamp,
                    temperature=request.temperature,
                    text=text,
                    # The compression ratio measures
                    # how compressible the generated text is.
                    # A higher ratio indicates more repetitive content,
                    # which is a strong sign of hallucination in outputs.
                    compression_ratio=len(text_bytes)
                    / len(zlib.compress(text_bytes)),
                    tokens=sliced_timestamp_tokens[1:-1],
                    avg_logprob=avg_logprob / (idx - last_timestamp_start),
                ),
            )
            segments.append(casting_segment)
            last_timestamp_start = idx
            avg_logprob = 0
        else:
            avg_logprob += log_probs[idx - 1][token].logprob
    return segments
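
The segment boundaries above come directly from the timestamp tokens: each one encodes a multiple of BASE_OFFSET (0.02 s) relative to the id of "<|0.00|>", shifted by the chunk's start_time. A small worked sketch with hypothetical token ids (the real ids depend on the tokenizer):

# Worked example of the timestamp arithmetic; the ids are hypothetical.
BASE_OFFSET = 0.02
init_token = 50_365      # assumed id of "<|0.00|>"
start_time = 30.0        # offset of this audio chunk in seconds

timestamp_token = init_token + 125   # i.e. "<|2.50|>"
seconds = start_time + BASE_OFFSET * (timestamp_token - init_token)
print(seconds)  # 32.5 -> boundary 2.5 s into this chunk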

_preprocess_speech_to_text async

_preprocess_speech_to_text(
    request: SpeechToTextRequest, audio_data: bytes
) -> tuple[list[PromptType], float]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _preprocess_speech_to_text(
    self,
    request: SpeechToTextRequest,
    audio_data: bytes,
) -> tuple[list[PromptType], float]:
    # Validate request
    language = self.model_cls.validate_language(request.language)
    # Skip to_language validation to avoid extra logging for Whisper.
    to_language = (
        self.model_cls.validate_language(request.to_language)
        if request.to_language
        else None
    )

    if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
        raise VLLMValidationError(
            "Maximum file size exceeded",
            parameter="audio_filesize_mb",
            value=len(audio_data) / 1024**2,
        )

    with io.BytesIO(audio_data) as bytes_:
        # NOTE resample to model SR here for efficiency. This is also a
        # pre-requisite for chunking, as it assumes Whisper SR.
        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

    duration = librosa.get_duration(y=y, sr=sr)
    do_split_audio = (
        self.asr_config.allow_audio_chunking
        and duration > self.asr_config.max_audio_clip_s
    )
    chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
    prompts = []
    for chunk in chunks:
        # The model has control over the construction, as long as it
        # returns a valid PromptType.
        prompt = self.model_cls.get_generation_prompt(
            audio=chunk,
            stt_config=self.asr_config,
            model_config=self.model_config,
            language=language,
            task_type=self.task_type,
            request_prompt=request.prompt,
            to_language=to_language,
        )
        if request.response_format == "verbose_json":
            prompt = self._preprocess_verbose_prompt(parse_enc_dec_prompt(prompt))

        prompts.append(prompt)

    return prompts, duration
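
Whether the decoded audio gets chunked depends only on allow_audio_chunking and max_audio_clip_s, alongside the separate megabyte limit on the raw upload. A quick sketch of both checks with illustrative values (30 s clips and a 25 MB limit are assumptions, not guaranteed defaults):

# Illustrative pre-processing checks; the config values are assumptions.
audio_filesize_mb = 12.4
max_audio_filesize_mb = 25
assert audio_filesize_mb <= max_audio_filesize_mb, "Maximum file size exceeded"

allow_audio_chunking = True
max_audio_clip_s = 30
duration = 95.0  # seconds of decoded audio

do_split_audio = allow_audio_chunking and duration > max_audio_clip_s
print(do_split_audio)  # True -> audio is handed to _split_audio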

_preprocess_verbose_prompt

_preprocess_verbose_prompt(
    prompt: EncoderDecoderDictPrompt,
)
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _preprocess_verbose_prompt(self, prompt: EncoderDecoderDictPrompt):
    dec_prompt = prompt["decoder_prompt"]

    if not (isinstance(dec_prompt, dict) and "prompt" in dec_prompt):
        raise VLLMValidationError(
            "Expected decoder_prompt to contain text",
            parameter="decoder_prompt",
            value=type(dec_prompt).__name__,
        )

    dec_prompt["prompt"] = dec_prompt["prompt"].replace(
        "<|notimestamps|>", "<|0.00|>"
    )

    return prompt
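
The effect of the substitution above, shown on a simplified, hypothetical encoder/decoder prompt dict (the Whisper-style decoder prompt string is illustrative):

# Hypothetical prompt dict; only the decoder_prompt substitution matters here.
prompt = {
    "encoder_prompt": {"prompt": "", "multi_modal_data": {"audio": ...}},
    "decoder_prompt": {
        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    },
}

dec_prompt = prompt["decoder_prompt"]
dec_prompt["prompt"] = dec_prompt["prompt"].replace("<|notimestamps|>", "<|0.00|>")
print(dec_prompt["prompt"])
# <|startoftranscript|><|en|><|transcribe|><|0.00|>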

_speech_to_text_stream_generator async

_speech_to_text_stream_generator(
    request: SpeechToTextRequest,
    list_result_generator: list[
        AsyncGenerator[RequestOutput, None]
    ],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
    chunk_object_type: Literal[
        "translation.chunk", "transcription.chunk"
    ],
    response_stream_choice_class: type[
        TranscriptionResponseStreamChoice
    ]
    | type[TranslationResponseStreamChoice],
    stream_response_class: type[TranscriptionStreamResponse]
    | type[TranslationStreamResponse],
) -> AsyncGenerator[str, None]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
async def _speech_to_text_stream_generator(
    self,
    request: SpeechToTextRequest,
    list_result_generator: list[AsyncGenerator[RequestOutput, None]],
    request_id: str,
    request_metadata: RequestResponseMetadata,
    audio_duration_s: float,
    chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
    response_stream_choice_class: type[TranscriptionResponseStreamChoice]
    | type[TranslationResponseStreamChoice],
    stream_response_class: type[TranscriptionStreamResponse]
    | type[TranslationStreamResponse],
) -> AsyncGenerator[str, None]:
    created_time = int(time.time())
    model_name = request.model

    completion_tokens = 0
    num_prompt_tokens = 0

    include_usage = self.enable_force_include_usage or request.stream_include_usage
    include_continuous_usage = (
        request.stream_continuous_usage_stats
        if include_usage and request.stream_continuous_usage_stats
        else False
    )

    try:
        for result_generator in list_result_generator:
            async for res in result_generator:
                # On first result.
                if res.prompt_token_ids is not None:
                    num_prompt_tokens = len(res.prompt_token_ids)
                    if audio_tokens := self.model_cls.get_num_audio_tokens(
                        audio_duration_s, self.asr_config, self.model_config
                    ):
                        num_prompt_tokens += audio_tokens

                # We need to do it here, because if there are exceptions in
                # the result_generator, it needs to be sent as the FIRST
                # response (by the try...catch).

                # Just one output (n=1) supported.
                assert len(res.outputs) == 1
                output = res.outputs[0]

                # TODO: For models that output structured formats (e.g.,
                # Qwen3-ASR with "language X<asr_text>" prefix), streaming
                # would need buffering to strip the prefix properly since
                # deltas may split the tag across chunks.
                delta_message = DeltaMessage(content=output.text)
                completion_tokens += len(output.token_ids)

                if output.finish_reason is None:
                    # Still generating, send delta update.
                    choice_data = response_stream_choice_class(delta=delta_message)
                else:
                    # Model is finished generating.
                    choice_data = response_stream_choice_class(
                        delta=delta_message,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                    )

                chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[choice_data],
                    model=model_name,
                )

                # handle usage stats if requested & if continuous
                if include_continuous_usage:
                    chunk.usage = UsageInfo(
                        prompt_tokens=num_prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=num_prompt_tokens + completion_tokens,
                    )

                data = chunk.model_dump_json(exclude_unset=True)
                yield f"data: {data}\n\n"

        # Once the final token is handled, if stream_options.include_usage
        # is sent, send the usage.
        if include_usage:
            final_usage = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

            final_usage_chunk = stream_response_class(
                id=request_id,
                object=chunk_object_type,
                created=created_time,
                choices=[],
                model=model_name,
                usage=final_usage,
            )
            final_usage_data = final_usage_chunk.model_dump_json(
                exclude_unset=True, exclude_none=True
            )
            yield f"data: {final_usage_data}\n\n"

        # report to FastAPI middleware aggregate usage across all choices
        request_metadata.final_usage_info = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=num_prompt_tokens + completion_tokens,
        )

    except Exception as e:
        logger.exception("Error in %s stream generator.", self.task_type)
        data = self.create_streaming_error_response(e)
        yield f"data: {data}\n\n"
    # Send the final done message after all response.n are finished
    yield "data: [DONE]\n\n"
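
The generator above emits standard server-sent events ("data: {json}\n\n" chunks, terminated by "data: [DONE]"). A hedged client-side sketch that consumes such a stream with requests; the URL and model name are placeholders:

# Sketch: stream a transcription and print text deltas as they arrive.
import json

import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",  # assumed server URL
        files={"file": ("sample.wav", f, "audio/wav")},
        data={"model": "openai/whisper-large-v3", "stream": "true"},
        stream=True,
    )

for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    for choice in chunk.get("choices", []):
        print(choice["delta"].get("content", ""), end="", flush=True)
print()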

_split_audio

_split_audio(
    audio_data: ndarray, sample_rate: int
) -> list[ndarray]
Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _split_audio(
    self, audio_data: np.ndarray, sample_rate: int
) -> list[np.ndarray]:
    assert self.asr_config.max_audio_clip_s is not None, (
        f"{self.asr_config.max_audio_clip_s=} cannot be None to"
        " split audio into chunks."
    )
    chunk_size = sample_rate * self.asr_config.max_audio_clip_s
    overlap_size = sample_rate * self.asr_config.overlap_chunk_second
    chunks = []
    i = 0
    while i < audio_data.shape[-1]:
        if i + chunk_size >= audio_data.shape[-1]:
            # handle last chunk
            chunks.append(audio_data[..., i:])
            break

        # Find the best split point in the overlap region
        search_start = i + chunk_size - overlap_size
        search_end = min(i + chunk_size, audio_data.shape[-1])
        split_point = self._find_split_point(audio_data, search_start, search_end)

        # Extract chunk up to the split point
        chunks.append(audio_data[..., i:split_point])
        i = split_point
    return chunks
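
In samples, the chunk and overlap windows above are plain sample_rate multiples, and the split point for each chunk is searched within the trailing overlap region. A quick sketch with illustrative values (16 kHz, 30 s clips, and 1 s overlap are assumptions):

# Illustrative sizes for the overlap-aware chunking above (values assumed).
sample_rate = 16_000
max_audio_clip_s = 30
overlap_chunk_second = 1

chunk_size = sample_rate * max_audio_clip_s        # 480_000 samples
overlap_size = sample_rate * overlap_chunk_second  # 16_000 samples

i = 0  # start of the first chunk
search_start = i + chunk_size - overlap_size       # 464_000
search_end = i + chunk_size                        # 480_000
print(search_start, search_end)  # window handed to _find_split_point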

_warmup_audio_preprocessing

_warmup_audio_preprocessing() -> None

Warm up audio processing libraries to avoid first-request latency.

The first call to librosa functions (load, get_duration, mel-spectrogram) triggers JIT compilation and library initialization which can take ~7s. This method warms up these operations during server initialization.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _warmup_audio_preprocessing(self) -> None:
    """Warm up audio processing libraries to avoid first-request latency.

    The first call to librosa functions (load, get_duration, mel-spectrogram)
    triggers JIT compilation and library initialization which can take ~7s.
    This method warms up these operations during server initialization.
    """
    # Skip warmup if librosa is not installed (optional dependency)
    if isinstance(librosa, PlaceholderModule):
        return

    # Skip warmup if model doesn't support transcription
    if not supports_transcription(self.model_cls):
        return

    if getattr(self.model_cls, "skip_warmup_audio_preprocessing", False):
        return

    try:
        warmup_start = time.perf_counter()
        logger.info("Warming up audio preprocessing libraries...")

        # Create a minimal dummy audio (1 second of silence at target sample rate)
        dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

        # Warm up librosa.load by using librosa functions on the dummy data
        # This initializes FFTW, numba JIT, and other audio processing libraries
        _ = librosa.get_duration(y=dummy_audio, sr=self.asr_config.sample_rate)

        # Warm up mel-spectrogram computation with model-specific parameters
        from vllm.transformers_utils.processor import cached_processor_from_config

        processor = cached_processor_from_config(self.model_config)
        feature_extractor = None
        if hasattr(processor, "feature_extractor"):
            feature_extractor = processor.feature_extractor
        elif hasattr(processor, "audio_processor"):
            # For models like GraniteSpeech that use audio_processor
            audio_proc = processor.audio_processor
            if hasattr(audio_proc, "feature_extractor"):
                feature_extractor = audio_proc.feature_extractor
            # If audio_processor doesn't have feature_extractor,
            # skip mel-spectrogram warmup for these models

        if feature_extractor is not None:
            _ = librosa.feature.melspectrogram(
                y=dummy_audio,
                sr=self.asr_config.sample_rate,
                n_mels=getattr(feature_extractor, "n_mels", 128),
                n_fft=getattr(feature_extractor, "n_fft", 400),
                hop_length=getattr(feature_extractor, "hop_length", 160),
            )

        warmup_elapsed = time.perf_counter() - warmup_start
        logger.info("Audio preprocessing warmup completed in %.2fs", warmup_elapsed)
    except Exception:
        # Don't fail initialization if warmup fails - log exception and continue
        logger.exception(
            "Audio preprocessing warmup failed (non-fatal). "
            "First request may experience higher latency."
        )

_warmup_input_processor

_warmup_input_processor() -> None

Warm up input processor with dummy audio to avoid first-request latency.

The first call to input_processor.process_inputs() with multimodal audio triggers multimodal processing initialization which can take ~2.5s. This method processes a dummy audio request to warm up the pipeline.

Source code in vllm/entrypoints/openai/speech_to_text/speech_to_text.py
def _warmup_input_processor(self) -> None:
    """Warm up input processor with dummy audio to avoid first-request latency.

    The first call to input_processor.process_inputs() with multimodal audio
    triggers multimodal processing initialization which can take ~2.5s.
    This method processes a dummy audio request to warm up the pipeline.
    """
    # Skip warmup if model doesn't support transcription
    if not supports_transcription(self.model_cls):
        return

    # Only warm up if model supports transcription methods
    if not hasattr(self.model_cls, "get_generation_prompt"):
        return

    try:
        from vllm.sampling_params import SamplingParams

        warmup_start = time.perf_counter()
        logger.info("Warming up multimodal input processor...")

        # Create minimal dummy audio (1 second of silence)
        dummy_audio = np.zeros(int(self.asr_config.sample_rate), dtype=np.float32)

        # Use the same method that _preprocess_speech_to_text uses
        # to create the prompt
        dummy_prompt = self.model_cls.get_generation_prompt(
            audio=dummy_audio,
            stt_config=self.asr_config,
            model_config=self.model_config,
            language="en",
            task_type=self.task_type,
            request_prompt="",
            to_language=None,
        )

        # Create minimal sampling params
        dummy_params = SamplingParams(
            max_tokens=1,
            temperature=0.0,
            skip_clone=True,  # Internal warmup, safe to skip clone
        )

        # Process the dummy input through the input processor
        # This will trigger all the multimodal processing initialization
        _ = self.input_processor.process_inputs(
            request_id="warmup",
            prompt=dummy_prompt,
            params=dummy_params,
        )

        warmup_elapsed = time.perf_counter() - warmup_start
        logger.info("Input processor warmup completed in %.2fs", warmup_elapsed)
    except Exception:
        # Don't fail initialization if warmup fails - log exception and continue
        logger.exception(
            "Input processor warmup failed (non-fatal). "
            "First request may experience higher latency."
        )

RequestLogger

Source code in vllm/entrypoints/logger.py
class RequestLogger:
    def __init__(self, *, max_log_len: int | None) -> None:
        self.max_log_len = max_log_len

    def log_inputs(
        self,
        request_id: str,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_embeds: torch.Tensor | None,
        params: SamplingParams | PoolingParams | BeamSearchParams | None,
        lora_request: LoRARequest | None,
    ) -> None:
        if logger.isEnabledFor(logging.DEBUG):
            max_log_len = self.max_log_len
            if max_log_len is not None:
                if prompt is not None:
                    prompt = prompt[:max_log_len]

                if prompt_token_ids is not None:
                    prompt_token_ids = prompt_token_ids[:max_log_len]

            logger.debug(
                "Request %s details: prompt: %r, "
                "prompt_token_ids: %s, "
                "prompt_embeds shape: %s.",
                request_id,
                prompt,
                prompt_token_ids,
                prompt_embeds.shape if prompt_embeds is not None else None,
            )

        logger.info(
            "Received request %s: params: %s, lora_request: %s.",
            request_id,
            params,
            lora_request,
        )

    def log_outputs(
        self,
        request_id: str,
        outputs: str,
        output_token_ids: Sequence[int] | None,
        finish_reason: str | None = None,
        is_streaming: bool = False,
        delta: bool = False,
    ) -> None:
        max_log_len = self.max_log_len
        if max_log_len is not None:
            if outputs is not None:
                outputs = outputs[:max_log_len]

            if output_token_ids is not None:
                # Convert to list and apply truncation
                output_token_ids = list(output_token_ids)[:max_log_len]

        stream_info = ""
        if is_streaming:
            stream_info = " (streaming delta)" if delta else " (streaming complete)"

        logger.info(
            "Generated response %s%s: output: %r, "
            "output_token_ids: %s, finish_reason: %s",
            request_id,
            stream_info,
            outputs,
            output_token_ids,
            finish_reason,
        )

max_log_len instance-attribute

max_log_len = max_log_len

__init__

__init__(*, max_log_len: int | None) -> None
Source code in vllm/entrypoints/logger.py
def __init__(self, *, max_log_len: int | None) -> None:
    self.max_log_len = max_log_len

log_inputs

log_inputs(
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: Tensor | None,
    params: SamplingParams
    | PoolingParams
    | BeamSearchParams
    | None,
    lora_request: LoRARequest | None,
) -> None
Source code in vllm/entrypoints/logger.py
def log_inputs(
    self,
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: torch.Tensor | None,
    params: SamplingParams | PoolingParams | BeamSearchParams | None,
    lora_request: LoRARequest | None,
) -> None:
    if logger.isEnabledFor(logging.DEBUG):
        max_log_len = self.max_log_len
        if max_log_len is not None:
            if prompt is not None:
                prompt = prompt[:max_log_len]

            if prompt_token_ids is not None:
                prompt_token_ids = prompt_token_ids[:max_log_len]

        logger.debug(
            "Request %s details: prompt: %r, "
            "prompt_token_ids: %s, "
            "prompt_embeds shape: %s.",
            request_id,
            prompt,
            prompt_token_ids,
            prompt_embeds.shape if prompt_embeds is not None else None,
        )

    logger.info(
        "Received request %s: params: %s, lora_request: %s.",
        request_id,
        params,
        lora_request,
    )

log_outputs

log_outputs(
    request_id: str,
    outputs: str,
    output_token_ids: Sequence[int] | None,
    finish_reason: str | None = None,
    is_streaming: bool = False,
    delta: bool = False,
) -> None
Source code in vllm/entrypoints/logger.py
def log_outputs(
    self,
    request_id: str,
    outputs: str,
    output_token_ids: Sequence[int] | None,
    finish_reason: str | None = None,
    is_streaming: bool = False,
    delta: bool = False,
) -> None:
    max_log_len = self.max_log_len
    if max_log_len is not None:
        if outputs is not None:
            outputs = outputs[:max_log_len]

        if output_token_ids is not None:
            # Convert to list and apply truncation
            output_token_ids = list(output_token_ids)[:max_log_len]

    stream_info = ""
    if is_streaming:
        stream_info = " (streaming delta)" if delta else " (streaming complete)"

    logger.info(
        "Generated response %s%s: output: %r, "
        "output_token_ids: %s, finish_reason: %s",
        request_id,
        stream_info,
        outputs,
        output_token_ids,
        finish_reason,
    )
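
For reference, a minimal usage sketch of RequestLogger (not taken from the vLLM sources); the request ID, prompt, token IDs, and sampling values are placeholders.

from vllm.entrypoints.logger import RequestLogger
from vllm.sampling_params import SamplingParams

# Truncate logged prompts/outputs to at most 64 characters/tokens.
request_logger = RequestLogger(max_log_len=64)

# Log the incoming request; prompt details are only emitted at DEBUG level.
request_logger.log_inputs(
    request_id="req-123",
    prompt="Transcribe the attached audio.",
    prompt_token_ids=[101, 102, 103],
    prompt_embeds=None,
    params=SamplingParams(max_tokens=16, temperature=0.0),
    lora_request=None,
)

# Log the finished (non-streaming) response.
request_logger.log_outputs(
    request_id="req-123",
    outputs="Hello world.",
    output_token_ids=[7, 8, 9],
    finish_reason="stop",
    is_streaming=False,
)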

RequestOutput

The output data of a completion request to the LLM.

Parameters:

- request_id (str, required): The unique ID of the request.
- prompt (str | None, required): The prompt string of the request. For encoder/decoder models, this is the decoder input prompt.
- prompt_token_ids (list[int] | None, required): The token IDs of the prompt. For encoder/decoder models, these are the decoder input prompt token IDs.
- prompt_logprobs (PromptLogprobs | None, required): The log probabilities to return per prompt token.
- outputs (list[CompletionOutput], required): The output sequences of the request.
- finished (bool, required): Whether the whole request is finished.
- metrics (RequestStateStats | None, default None): Metrics associated with the request.
- lora_request (LoRARequest | None, default None): The LoRA request that was used to generate the output.
- encoder_prompt (str | None, default None): The encoder prompt string of the request. None if decoder-only.
- encoder_prompt_token_ids (list[int] | None, default None): The token IDs of the encoder prompt. None if decoder-only.
- num_cached_tokens (int | None, default None): The number of tokens with prefix cache hit.
- kv_transfer_params (dict[str, Any] | None, default None): The params for remote K/V transfer.
Source code in vllm/outputs.py
class RequestOutput:
    """The output data of a completion request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
                For encoder/decoder models, this is the
                decoder input prompt.
        prompt_token_ids: The token IDs of the prompt.
                          For encoder/decoder models, this is the
                          decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
                        None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                  None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        kv_transfer_params: The params for remote K/V transfer.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_logprobs: PromptLogprobs | None,
        outputs: list[CompletionOutput],
        finished: bool,
        metrics: RequestStateStats | None = None,
        lora_request: LoRARequest | None = None,
        encoder_prompt: str | None = None,
        encoder_prompt_token_ids: list[int] | None = None,
        num_cached_tokens: int | None = None,
        *,
        multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
        kv_transfer_params: dict[str, Any] | None = None,
        # Forward compatibility, code that uses args added in new release can
        # still run with older versions of vLLM without breaking.
        **kwargs: Any,
    ) -> None:
        if kwargs:
            logger.warning_once(
                "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
            )
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.multi_modal_placeholders = multi_modal_placeholders or {}
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens
        self.kv_transfer_params = kv_transfer_params

    def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
        """Merge subsequent RequestOutput into this one"""

        self.finished |= next_output.finished
        self.kv_transfer_params = next_output.kv_transfer_params

        for next_completion in next_output.outputs:
            for i, completion in enumerate(self.outputs):
                if completion.index == next_completion.index:
                    if aggregate:
                        # Merge outputs with same index
                        completion.text += next_completion.text
                        if not isinstance(completion.token_ids, MutableSequence):
                            completion.token_ids = list(completion.token_ids)
                        completion.token_ids.extend(next_completion.token_ids)
                        if next_completion.logprobs:
                            assert completion.logprobs is not None
                            completion.logprobs.extend(next_completion.logprobs)
                        completion.cumulative_logprob = (
                            next_completion.cumulative_logprob
                        )
                        completion.finish_reason = next_completion.finish_reason
                        completion.stop_reason = next_completion.stop_reason
                    else:
                        # Replace the output with the new one
                        self.outputs[i] = next_completion
                    break
            else:
                self.outputs.append(next_completion)

    def __repr__(self) -> str:
        return (
            f"RequestOutput(request_id={self.request_id}, "
            f"prompt={self.prompt!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"encoder_prompt={self.encoder_prompt!r}, "
            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"outputs={self.outputs}, "
            f"finished={self.finished}, "
            f"metrics={self.metrics}, "
            f"lora_request={self.lora_request}, "
            f"num_cached_tokens={self.num_cached_tokens}, "
            f"multi_modal_placeholders={self.multi_modal_placeholders})"
        )

encoder_prompt instance-attribute

encoder_prompt = encoder_prompt

encoder_prompt_token_ids instance-attribute

encoder_prompt_token_ids = encoder_prompt_token_ids

finished instance-attribute

finished = finished

kv_transfer_params instance-attribute

kv_transfer_params = kv_transfer_params

lora_request instance-attribute

lora_request = lora_request

metrics instance-attribute

metrics = metrics

multi_modal_placeholders instance-attribute

multi_modal_placeholders = multi_modal_placeholders or {}

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt instance-attribute

prompt = prompt

prompt_logprobs instance-attribute

prompt_logprobs = prompt_logprobs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestStateStats | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict
    | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    **kwargs: Any,
) -> None
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestStateStats | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    # Forward compatibility, code that uses args added in new release can
    # still run with older versions of vLLM without breaking.
    **kwargs: Any,
) -> None:
    if kwargs:
        logger.warning_once(
            "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
        )
    self.request_id = request_id
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.multi_modal_placeholders = multi_modal_placeholders or {}
    self.prompt_logprobs = prompt_logprobs
    self.outputs = outputs
    self.finished = finished
    self.metrics = metrics
    self.lora_request = lora_request
    self.encoder_prompt = encoder_prompt
    self.encoder_prompt_token_ids = encoder_prompt_token_ids
    self.num_cached_tokens = num_cached_tokens
    self.kv_transfer_params = kv_transfer_params

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (
        f"RequestOutput(request_id={self.request_id}, "
        f"prompt={self.prompt!r}, "
        f"prompt_token_ids={self.prompt_token_ids}, "
        f"encoder_prompt={self.encoder_prompt!r}, "
        f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
        f"prompt_logprobs={self.prompt_logprobs}, "
        f"outputs={self.outputs}, "
        f"finished={self.finished}, "
        f"metrics={self.metrics}, "
        f"lora_request={self.lora_request}, "
        f"num_cached_tokens={self.num_cached_tokens}, "
        f"multi_modal_placeholders={self.multi_modal_placeholders})"
    )

add

add(next_output: RequestOutput, aggregate: bool) -> None

Merge subsequent RequestOutput into this one

Source code in vllm/outputs.py
def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
    """Merge subsequent RequestOutput into this one"""

    self.finished |= next_output.finished
    self.kv_transfer_params = next_output.kv_transfer_params

    for next_completion in next_output.outputs:
        for i, completion in enumerate(self.outputs):
            if completion.index == next_completion.index:
                if aggregate:
                    # Merge outputs with same index
                    completion.text += next_completion.text
                    if not isinstance(completion.token_ids, MutableSequence):
                        completion.token_ids = list(completion.token_ids)
                    completion.token_ids.extend(next_completion.token_ids)
                    if next_completion.logprobs:
                        assert completion.logprobs is not None
                        completion.logprobs.extend(next_completion.logprobs)
                    completion.cumulative_logprob = (
                        next_completion.cumulative_logprob
                    )
                    completion.finish_reason = next_completion.finish_reason
                    completion.stop_reason = next_completion.stop_reason
                else:
                    # Replace the output with the new one
                    self.outputs[i] = next_completion
                break
        else:
            self.outputs.append(next_completion)
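
A hedged sketch of how add() can accumulate streaming deltas; delta_stream stands for any async iterator of per-step RequestOutput objects (for example, one produced with RequestOutputKind.DELTA) and is not part of the API shown above.

from vllm.outputs import RequestOutput


async def collect_final(delta_stream) -> RequestOutput:
    """Merge a stream of delta RequestOutputs into a single final output."""
    final: RequestOutput | None = None
    async for delta in delta_stream:
        if final is None:
            final = delta
        else:
            # aggregate=True appends each delta's text/token_ids onto the
            # completion with the same index; aggregate=False would replace it.
            final.add(delta, aggregate=True)
    assert final is not None, "stream yielded no outputs"
    return final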

RequestResponseMetadata

Bases: BaseModel

Source code in vllm/entrypoints/openai/engine/protocol.py
class RequestResponseMetadata(BaseModel):
    request_id: str
    final_usage_info: UsageInfo | None = None

final_usage_info class-attribute instance-attribute

final_usage_info: UsageInfo | None = None

request_id instance-attribute

request_id: str

TranscriptionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str | None = None
    """ID of the model to use.
    """

    language: str | None = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun) : Support if set to 0, certain thresholds are met !!

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[]
    )
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp granularities.
    Either or both of these options are supported: `word` and `segment`. Note:
    There is no additional latency for segment timestamps, but generating word
    timestamps incurs additional latency.
    """

    stream: bool | None = False
    """When set, it will enable output to be streamed in a similar fashion
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: bool | None = False
    stream_continuous_usage_stats: bool | None = False

    vllm_xargs: dict[str, str | int | float] | None = Field(
        default=None,
        description=(
            "Additional request parameters with string or "
            "numeric values, used by custom extensions."
        ),
    )
    # --8<-- [end:transcription-extra-params]

    to_language: str | None = None
    """The language of the output audio we transcribe to.

    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """

    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: float | None = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: int | None = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: float | None = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: float | None = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: float | None = None
    """The repetition penalty to use for sampling."""

    presence_penalty: float | None = 0.0
    """The presence penalty to use for sampling."""

    max_completion_tokens: int | None = None
    """The maximum number of tokens to generate."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_sampling_params(
        self, default_max_tokens: int, default_sampling_params: dict | None = None
    ) -> SamplingParams:
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
            )
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
            )
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
            )

        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            presence_penalty=self.presence_penalty,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            extra_args=self.vllm_xargs,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            # Find which specific stream option was set
            invalid_param = next(
                (so for so in stream_opts if data.get(so, False)),
                "stream_include_usage",
            )
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                parameter=invalid_param,
            )

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

file instance-attribute

file: UploadFile

The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

frequency_penalty class-attribute instance-attribute

frequency_penalty: float | None = 0.0

The frequency penalty to use for sampling.

language class-attribute instance-attribute

language: str | None = None

The language of the input audio.

Supplying the input language in ISO-639-1 format will improve accuracy and latency.

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

The maximum number of tokens to generate.

min_p class-attribute instance-attribute

min_p: float | None = None

Filters out tokens with a probability lower than min_p, ensuring a minimum likelihood threshold during sampling.

model class-attribute instance-attribute

model: str | None = None

ID of the model to use.

presence_penalty class-attribute instance-attribute

presence_penalty: float | None = 0.0

The presence penalty to use for sampling.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

repetition_penalty class-attribute instance-attribute

repetition_penalty: float | None = None

The repetition penalty to use for sampling.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: bool | None = False

When set, output will be streamed in a similar fashion to the Chat Completions endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: bool | None = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: bool | None = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities class-attribute instance-attribute

timestamp_granularities: list[
    Literal["word", "segment"]
] = Field(alias="timestamp_granularities[]", default=[])

The timestamp granularities to populate for this transcription.

response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word and segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.

to_language class-attribute instance-attribute

to_language: str | None = None

The target language for the transcription output.

Please note that this is not currently used by supported models; it is a placeholder for future use, matching the translation API.

top_k class-attribute instance-attribute

top_k: int | None = None

Limits sampling to the k most probable tokens at each step.

top_p class-attribute instance-attribute

top_p: float | None = None

Enables nucleus (top-p) sampling, where tokens are selected from the smallest possible set whose cumulative probability exceeds p.

vllm_xargs class-attribute instance-attribute

vllm_xargs: dict[str, str | int | float] | None = Field(
    default=None,
    description="Additional request parameters with string or numeric values, used by custom extensions.",
)

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: dict | None = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
def to_sampling_params(
    self, default_max_tokens: int, default_sampling_params: dict | None = None
) -> SamplingParams:
    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
        )
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
        )
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
        )

    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )

    return SamplingParams.from_optional(
        temperature=temperature,
        max_tokens=max_tokens,
        seed=self.seed,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        presence_penalty=self.presence_penalty,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        extra_args=self.vllm_xargs,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )
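
For illustration only: the method above resolves each sampling parameter in the order request value, then server-level default_sampling_params, then the class-level _DEFAULT_SAMPLING_PARAMS. The helper and values below are invented to demonstrate that order and are not part of vLLM.

# Hypothetical stand-in for TranscriptionRequest._DEFAULT_SAMPLING_PARAMS.
_CLASS_DEFAULTS = {"temperature": 1.0, "top_p": 1.0, "top_k": 0, "min_p": 0.0}


def resolve(name: str, request_value, server_defaults: dict | None):
    # 1. An explicit request value wins.
    if request_value is not None:
        return request_value
    # 2. Otherwise fall back to the server-level defaults, if provided.
    server_defaults = server_defaults or {}
    # 3. Otherwise use the class-level default.
    return server_defaults.get(name, _CLASS_DEFAULTS[name])


assert resolve("top_p", 0.5, {"top_p": 0.9}) == 0.5   # request value wins
assert resolve("top_p", None, {"top_p": 0.9}) == 0.9  # server default applies
assert resolve("top_k", None, None) == 0              # class default applies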

validate_transcription_request classmethod

validate_transcription_request(data)
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
@model_validator(mode="before")
@classmethod
def validate_transcription_request(cls, data):
    if isinstance(data.get("file"), str):
        raise HTTPException(
            status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
            detail="Expected 'file' to be a file-like object, not 'str'.",
        )

    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        # Find which specific stream option was set
        invalid_param = next(
            (so for so in stream_opts if data.get(so, False)),
            "stream_include_usage",
        )
        raise VLLMValidationError(
            "Stream options can only be defined when `stream=True`.",
            parameter=invalid_param,
        )

    return data
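
A client-side sketch of hitting the OpenAI-compatible /v1/audio/transcriptions endpoint backed by this request model. It assumes a locally running vLLM server with a transcription-capable model; the base URL, API key, model name, and file path are placeholders.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # must match the served model name
        file=audio_file,
        language="en",           # ISO-639-1 hint (TranscriptionRequest.language)
        response_format="json",  # json, text, srt, verbose_json, or vtt
        temperature=0.0,
    )

print(transcription.text)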

TranscriptionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""
    usage: TranscriptionUsageAudio

text instance-attribute

text: str

The transcribed text.

usage instance-attribute

usage: TranscriptionUsageAudio

TranscriptionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: str | None = None
    stop_reason: int | str | None = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

TranscriptionResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: list[TranscriptionSegment] | None = None
    """Segments of the transcribed text and their corresponding details."""

    words: list[TranscriptionWord] | None = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: list[TranscriptionSegment] | None = None

Segments of the transcribed text and their corresponding details.

text instance-attribute

text: str

The transcribed text.

words class-attribute instance-attribute

words: list[TranscriptionWord] | None = None

Extracted words and their corresponding timestamps.

TranscriptionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranscriptionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
    object: Literal["transcription.chunk"] = "transcription.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranscriptionResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)

choices instance-attribute

choices: list[TranscriptionResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time.time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsc-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["transcription.chunk"] = (
    "transcription.chunk"
)

usage class-attribute instance-attribute

usage: UsageInfo | None = Field(default=None)
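
A hedged sketch of consuming the streaming variant, assuming the endpoint emits OpenAI-style server-sent events whose data payloads are shaped like TranscriptionStreamResponse (with the text deltas carried in choices[].delta.content). The endpoint URL, model name, and file path are placeholders.

import json

import requests

with open("sample.wav", "rb") as audio_file:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("sample.wav", audio_file, "audio/wav")},
        data={"model": "openai/whisper-large-v3", "stream": "true"},
        stream=True,
        timeout=300,
    )

resp.raise_for_status()
for line in resp.iter_lines():
    # Skip keep-alive blanks and anything that is not an SSE data line.
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)  # assumed to mirror TranscriptionStreamResponse
    for choice in chunk["choices"]:
        print(choice["delta"].get("content", ""), end="", flush=True)
print()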

TranslationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranslation

    file: UploadFile
    """
    The audio file object (not file name) to translate, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str | None = None
    """ID of the model to use.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO support additional sampling parameters
    # --8<-- [start:translation-sampling-params]
    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """
    # --8<-- [end:translation-sampling-params]

    # --8<-- [start:translation-extra-params]
    language: str | None = None
    """The language of the input audio we translate from.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy.
    """

    to_language: str | None = None
    """The language of the input audio we translate to.

    Please note that this is not supported by all models; refer to the specific
    model documentation for more details.
    For instance, Whisper only supports `to_language=en`.
    """

    stream: bool | None = False
    """Custom field not present in the original OpenAI definition. When set,
    it will enable output to be streamed in a similar fashion as the Chat
    Completion endpoint.
    """
    # Flattened stream option to simplify form data.
    stream_include_usage: bool | None = False
    stream_continuous_usage_stats: bool | None = False

    max_completion_tokens: int | None = None
    """The maximum number of tokens to generate."""
    # --8<-- [end:translation-extra-params]

    # Default sampling parameters for translation requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "temperature": 0,
    }

    def to_sampling_params(
        self, default_max_tokens: int, default_sampling_params: dict | None = None
    ) -> SamplingParams:
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
            )

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            output_kind=RequestOutputKind.DELTA
            if self.stream
            else RequestOutputKind.FINAL_ONLY,
            skip_clone=True,  # Created fresh per request, safe to skip clone
        )

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            # Find which specific stream option was set
            invalid_param = next(
                (so for so in stream_opts if data.get(so, False)),
                "stream_include_usage",
            )
            raise VLLMValidationError(
                "Stream options can only be defined when `stream=True`.",
                parameter=invalid_param,
            )

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {'temperature': 0}

file instance-attribute

file: UploadFile

The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

language class-attribute instance-attribute

language: str | None = None

The language of the input audio we translate from.

Supplying the input language in ISO-639-1 format will improve accuracy.

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: int | None = None

The maximum number of tokens to generate.

model class-attribute instance-attribute

model: str | None = None

ID of the model to use.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: bool | None = False

Custom field not present in the original OpenAI definition. When set, output will be streamed in a similar fashion to the Chat Completions endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: bool | None = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: bool | None = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

to_language class-attribute instance-attribute

to_language: str | None = None

The language of the input audio we translate to.

Please note that this is not supported by all models; refer to the specific model documentation for more details. For instance, Whisper only supports to_language=en.

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: dict | None = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
def to_sampling_params(
    self, default_max_tokens: int, default_sampling_params: dict | None = None
) -> SamplingParams:
    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}
    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
        )

    return SamplingParams.from_optional(
        temperature=temperature,
        max_tokens=max_tokens,
        seed=self.seed,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        # Find which specific stream option was set
        invalid_param = next(
            (so for so in stream_opts if data.get(so, False)),
            "stream_include_usage",
        )
        raise VLLMValidationError(
            "Stream options can only be defined when `stream=True`.",
            parameter=invalid_param,
        )

    return data
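
A client-side sketch of posting a translation request as multipart form data, which is how this model's fields reach the server. It assumes a local vLLM server with a translation-capable model and that the vLLM-specific fields (language, to_language) are accepted as plain form fields; the endpoint URL, model name, and file path are placeholders.

import requests

with open("speech_fr.wav", "rb") as audio_file:
    resp = requests.post(
        "http://localhost:8000/v1/audio/translations",
        files={"file": ("speech_fr.wav", audio_file, "audio/wav")},
        data={
            "model": "openai/whisper-large-v3",  # served model name
            "language": "fr",        # source language (vLLM extra param)
            "to_language": "en",     # target language; Whisper only supports "en"
            "response_format": "json",
            "temperature": "0.0",
        },
        timeout=300,
    )

resp.raise_for_status()
print(resp.json()["text"])  # TranslationResponse.text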

TranslationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponse(OpenAIBaseModel):
    text: str
    """The translated text."""

text instance-attribute

text: str

The translated text.

TranslationResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: str | None = None
    stop_reason: int | str | None = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

TranslationResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The translated text."""

    segments: list[TranslationSegment] | None = None
    """Segments of the translated text and their corresponding details."""

    words: list[TranslationWord] | None = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: list[TranslationSegment] | None = None

Segments of the translated text and their corresponding details.

text instance-attribute

text: str

The translated text.

words class-attribute instance-attribute

words: list[TranslationWord] | None = None

Extracted words and their corresponding timestamps.

TranslationStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/speech_to_text/protocol.py
class TranslationStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
    object: Literal["translation.chunk"] = "translation.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranslationResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)

choices instance-attribute

choices: list[TranslationResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time.time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['translation.chunk'] = 'translation.chunk'

usage class-attribute instance-attribute

usage: UsageInfo | None = Field(default=None)

init_logger

init_logger(name: str) -> _VllmLogger

The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.

Source code in vllm/logger.py
def init_logger(name: str) -> _VllmLogger:
    """The main purpose of this function is to ensure that loggers are
    retrieved in such a way that we can be sure the root vllm logger has
    already been configured."""

    logger = logging.getLogger(name)

    for method_name, method in _METHODS_TO_PATCH.items():
        setattr(logger, method_name, MethodType(method, logger))

    return cast(_VllmLogger, logger)
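
A minimal usage sketch of init_logger, following the module-level pattern used throughout vLLM.

from vllm.logger import init_logger

# Retrieve a logger only after the root "vllm" logger has been configured.
logger = init_logger(__name__)
logger.info("Speech-to-text serving module initialized")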